Skip to content

Instantly share code, notes, and snippets.

@leeirvinekezzico
Last active August 14, 2025 05:32
Show Gist options
  • Select an option

  • Save leeirvinekezzico/d529bbecae4629a7ebd23e58ff7ce25d to your computer and use it in GitHub Desktop.

Select an option

Save leeirvinekezzico/d529bbecae4629a7ebd23e58ff7ce25d to your computer and use it in GitHub Desktop.
Generate an SRT file from a WAV for easy transcript creation
#!/usr/bin/env python3
"""
Audio to SRT Transcript Generator
A command-line tool for transcribing audio files to SRT subtitle format using OpenAI's Whisper model.
Supports multiple audio formats and provides clean, timestamped transcriptions with GPU acceleration.
Author: Lee Irvine <lee@kezzi.co>
License: Open Source (MIT)
https://gist.github.com/leeirvinekezzico/d529bbecae4629a7ebd23e58ff7ce25d
Usage:
python transcript-from-wav.py audio.wav
python transcript-from-wav.py video.mp4 -m large -o subtitles.srt
python transcript-from-wav.py audio.mp3 --print-only
"""
import os
import sys
import argparse
from pathlib import Path
import torch
from transformers import pipeline
from transformers.utils import logging
import warnings
warnings.filterwarnings('ignore')
# Suppress transformers warnings
logging.set_verbosity_error()
def transcribe_audio(audio_path, model_size="small", language="en"):
    """
    Run Whisper speech recognition on an audio file and return the result.

    Args:
        audio_path: Path to the audio file, handed to the pipeline as-is
        model_size: Suffix of the ``openai/whisper-<size>`` checkpoint to load
            (tiny, base, small, medium, large)
        language: Language code forwarded to generation (e.g., 'en', 'es', 'fr')

    Returns:
        Whatever the pipeline returns when called with ``return_timestamps=True``.
        The SRT writers in this file read its ``'chunks'`` list, where each
        item carries a ``'timestamp'`` pair and a ``'text'`` string.
    """
    print(f"Loading Whisper-{model_size} model...")

    # Device 0 selects the first CUDA device; -1 keeps the pipeline on the CPU.
    if torch.cuda.is_available():
        device, device_name = 0, "GPU"
    else:
        device, device_name = -1, "CPU"
    print(f"Using device: {device_name}")

    checkpoint = f"openai/whisper-{model_size}"
    transcriber = pipeline(
        "automatic-speech-recognition",
        model=checkpoint,
        device=device,
    )

    print(f"Transcribing audio file: {audio_path}")
    generation_options = {"language": language}
    return transcriber(
        audio_path,
        return_timestamps=True,
        generate_kwargs=generation_options,
    )
def format_timestamp(seconds):
    """
    Convert seconds to SRT timestamp format (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the audio, in seconds (int or float).
            Negative values are clamped to zero because SRT has no
            representation for them.

    Returns:
        A string such as ``"01:01:01,500"``.
    """
    # Round once to whole milliseconds and do the rest in integer arithmetic.
    # Truncating the fractional part instead loses a millisecond to float
    # error (e.g. 1.001 % 1 is 0.000999..., which would print as ",000").
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def save_srt(transcript, output_path):
    """
    Save transcript in SRT format to file.

    Args:
        transcript: Mapping with a ``'chunks'`` list; each chunk provides a
            ``'timestamp'`` pair of (start, end) seconds and a ``'text'`` string.
        output_path: Destination file path; written as UTF-8 and overwritten
            if it already exists.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, segment in enumerate(transcript['chunks'], 1):
            start_time, end_time = segment['timestamp']
            # The ASR pipeline can report None as the end time when Whisper
            # does not predict a closing timestamp (typically the last chunk
            # of audio that is cut off). Fall back to the start time so the
            # cue is still written instead of crashing mid-file.
            if end_time is None:
                end_time = start_time
            f.write(
                f"{i}\n"
                f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n"
                f"{segment['text'].strip()}\n\n"
            )
def print_srt(transcript):
    """
    Print transcript in SRT format to console.

    Args:
        transcript: Mapping with a ``'chunks'`` list; each chunk provides a
            ``'timestamp'`` pair of (start, end) seconds and a ``'text'`` string.
    """
    print("---- SRT transcript ----")
    for i, segment in enumerate(transcript['chunks'], 1):
        start_time, end_time = segment['timestamp']
        # The ASR pipeline can report None as the end time when Whisper does
        # not predict a closing timestamp (typically the last chunk of audio
        # that is cut off). Fall back to the start time rather than crashing.
        if end_time is None:
            end_time = start_time
        print(i)
        print(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}")
        print(segment['text'].strip())
        print()
def main(argv=None):
    """
    Command-line entry point: parse arguments, transcribe, then save or print SRT.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, so existing ``main()`` callers
            are unaffected.

    Exits with status 1 when the input path is missing, is not a regular
    file, or when transcription/output raises.
    """
    parser = argparse.ArgumentParser(description='Transcribe audio to SRT subtitles using Whisper')
    parser.add_argument('audio_file', help='Path to audio file')
    parser.add_argument('-o', '--output', help='Output SRT file path (default: same name as input with .srt extension)')
    parser.add_argument('-m', '--model', choices=['tiny', 'base', 'small', 'medium', 'large'],
                        default='small', help='Whisper model size (default: small)')
    parser.add_argument('-l', '--language', default='en',
                        help='Language code for transcription (default: en for English)')
    parser.add_argument('--print-only', action='store_true',
                        help='Only print to console, don\'t save file')
    args = parser.parse_args(argv)

    # Diagnostics go to stderr so that stdout carries only the transcript
    # output produced by this script.
    audio_path = Path(args.audio_file)
    if not audio_path.exists():
        print(f"Error: File '{audio_path}' not found.", file=sys.stderr)
        sys.exit(1)
    if not audio_path.is_file():
        # A directory passes exists() but cannot be transcribed.
        print(f"Error: '{audio_path}' is not a file.", file=sys.stderr)
        sys.exit(1)

    # Validate audio file extension (advisory only; unknown types still run)
    audio_extensions = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.mp4', '.mov', '.avi'}
    if audio_path.suffix.lower() not in audio_extensions:
        print(f"Warning: '{audio_path.suffix}' may not be a supported audio format.", file=sys.stderr)

    # Determine output path
    output_path = Path(args.output) if args.output else audio_path.with_suffix('.srt')

    # Top-level boundary: report any failure as a one-line error and a
    # non-zero exit status instead of a traceback.
    try:
        transcript = transcribe_audio(str(audio_path), args.model, args.language)
        if args.print_only:
            print_srt(transcript)
        else:
            save_srt(transcript, output_path)
            print(f"SRT file saved to: {output_path}")
        print(f"Total segments: {len(transcript['chunks'])}")
    except Exception as e:
        print(f"Error during transcription: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    # Install the blanket "ignore" filter only when sys.warnoptions is empty,
    # i.e. no warning options were supplied to the interpreter; then run the CLI.
    warning_options_given = bool(sys.warnoptions)
    if not warning_options_given:
        warnings.simplefilter("ignore")
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment