Skip to content

Instantly share code, notes, and snippets.

@ppo
Created January 20, 2026 12:27
Show Gist options
  • Select an option

  • Save ppo/cf6c660e5e828cec611f3700728d370e to your computer and use it in GitHub Desktop.

Select an option

Save ppo/cf6c660e5e828cec611f3700728d370e to your computer and use it in GitHub Desktop.
Transcribe files (audio or video) to TXT or SRT using Whisper.
#!/usr/bin/env python
"""Transcribe files (audio or video, to TXT or SRT).
AI Models: https://github.com/openai/whisper
Requirements:
```
brew install ffmpeg
pip install openai-whisper
```
"""
import argparse
import time
from datetime import timedelta
from pathlib import Path
import whisper
# --- Argument choices --------------------------------------------------------
# Model names offered by the --model option.
# See https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
MODEL_CHOICES = ["tiny", "base", "small", "medium", "large", "turbo"]
# Output formats offered by the --format option.
FORMAT_CHOICES = ["txt", "srt"]

# --- Argument defaults -------------------------------------------------------
DEFAULT_MODEL = "large"
DEFAULT_FORMAT = "txt"
DEFAULT_LANGUAGE = "en"

# --- Terminal colors (ANSI SGR escape sequences) -----------------------------
C_HEADING = "\033[1;33m"  # bold yellow
C_SUCCESS = "\033[1;32m"  # bold green
C_ERROR = "\033[1;31m"  # bold red
C_RESET = "\033[0m"  # back to the terminal's default attributes
def heading(text):
    """Print `text` to stdout wrapped in the heading color codes."""
    colored = f"{C_HEADING}{text}{C_RESET}"
    print(colored)
def format_duration(seconds):
    """Render a duration given in seconds as `H:MM:SS`.

    Hours are not zero-padded; any fractional part of a second is dropped.
    """
    # Peel off hours first, then split what is left into minutes and seconds.
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{int(hours)}:{int(minutes):02d}:{int(secs):02d}"
def format_srt_timestamp(seconds):
    """Convert seconds to SRT timestamp format: `HH:MM:SS,mmm`.

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        The timestamp string, e.g. `01:01:01,500` for 3661.5.
    """
    # timedelta stores whole microseconds, so floor-dividing by one millisecond
    # is exact integer arithmetic. Computing `(seconds % 1) * 1000` in floating
    # point instead loses a millisecond on common inputs (1.001 -> 000,
    # 4.35 -> 349) because the product lands just below the intended integer.
    total_millis = timedelta(seconds=seconds) // timedelta(milliseconds=1)
    total_millis = max(total_millis, 0)

    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def save_as_srt(result, output_path):
    """Write the segments of `result` to `output_path` as numbered SRT cues.

    Each cue consists of its 1-based index, a `start --> end` timing line, the
    stripped segment text, and a blank separator line. Returns `output_path`.
    """
    with open(output_path, "w", encoding="utf-8") as srt_file:
        for number, segment in enumerate(result["segments"], start=1):
            begin = format_srt_timestamp(segment["start"])
            finish = format_srt_timestamp(segment["end"])
            caption = segment["text"].strip()
            # One write per cue: index, timing, text, blank line.
            srt_file.write(f"{number}\n{begin} --> {finish}\n{caption}\n\n")
    return output_path
def save_as_txt(result, output_path):
    """Save result segments as TXT, one stripped segment per line.

    Args:
        result: Mapping with a "segments" list; each segment is a mapping
            with a "text" string.
        output_path: Destination file; created or overwritten as UTF-8.

    Returns:
        `output_path`, mirroring what `save_as_srt` returns.
    """
    # result["segments"] is looked up here, before the file is opened, so a
    # result without segments does not leave an empty file behind.
    lines = (f"{segment['text'].strip()}\n" for segment in result["segments"])
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return output_path
def transcribe_file(path, model, language, format):
    """Transcribe `path` with `model` and write the result next to the input.

    With `format == "srt"` the output replaces the input's suffix with
    `.<language>.srt`; any other value produces a `.txt` file. Returns the
    path of the file that was written.
    """
    transcription = model.transcribe(
        str(path),
        language=language,
        verbose=False,
        fp16=False,  # Suppress warning
    )

    # Plain text is the fallback for every format other than SRT.
    if format != "srt":
        txt_path = path.with_suffix(".txt")
        save_as_txt(transcription, txt_path)
        return txt_path

    srt_path = path.with_suffix(f".{language}.srt")
    save_as_srt(transcription, srt_path)
    return srt_path
def parse_args(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the process arguments, as before.

    Returns:
        An `argparse.Namespace` with `files`, `model`, `format` and
        `language` attributes.
    """
    parser = argparse.ArgumentParser(description="Transcribe files (audio or video) using Whisper")
    parser.add_argument("files", nargs="+", help="Files to transcribe")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        choices=MODEL_CHOICES,
        help=f"Whisper model size (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--format",
        choices=FORMAT_CHOICES,
        default=DEFAULT_FORMAT,
        # Built from the constants so the help text cannot drift from them.
        help=f"Output format: {', '.join(FORMAT_CHOICES)} (default: {DEFAULT_FORMAT})",
    )
    parser.add_argument(
        "--lang",
        dest="language",
        default=DEFAULT_LANGUAGE,
        help=f"Audio language (default: {DEFAULT_LANGUAGE})",
    )
    return parser.parse_args(argv)
def main():
    """Transcribe every file named on the command line.

    Inputs are validated before the Whisper model is loaded, so a typo in a
    file name does not cost a model load. A file that is missing, is not a
    regular file, or fails during transcription is reported on stderr and
    skipped; the remaining files are still processed.

    Raises:
        SystemExit: With status 1 if any file could not be transcribed, in
            addition to argparse's own exit on invalid arguments.
    """
    import sys  # Function-scope import: only this entry point needs stderr/exit.

    args = parse_args()

    # Sort inputs into usable paths and reported failures up front.
    paths = []
    failures = 0
    for file in args.files:
        path = Path(file)
        if path.is_file():
            paths.append(path)
            continue
        # exists() alone would let directories through to ffmpeg.
        reason = "is not a file" if path.exists() else "not found"
        print(f"{C_ERROR}Error: {file} {reason}{C_RESET}", file=sys.stderr)
        failures += 1

    model = None
    if paths:
        print(f"Loading Whisper model '{args.model}'…")
        model = whisper.load_model(args.model)

    for path in paths:
        print()
        heading(f"Transcribing: {path.name}")
        # perf_counter is monotonic, so clock adjustments cannot skew the timing.
        start_time = time.perf_counter()
        try:
            output_path = transcribe_file(path, model, args.language, args.format)
        except (OSError, RuntimeError) as exc:
            # Decoding or write failures affect one file; keep the batch going.
            print(f"{C_ERROR}Error: {path}: {exc}{C_RESET}", file=sys.stderr)
            failures += 1
            continue
        duration_str = format_duration(time.perf_counter() - start_time)
        print(f" {C_SUCCESS}{args.format.upper()}: {output_path.name}{C_RESET}")
        print(f" Processed in {duration_str}")
    print()

    if failures:
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment