Last active
August 14, 2025 05:32
-
-
Save leeirvinekezzico/d529bbecae4629a7ebd23e58ff7ce25d to your computer and use it in GitHub Desktop.
Generate an SRT file from a WAV for easy transcript creation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Audio to SRT Transcript Generator
A command-line tool for transcribing audio files to SRT subtitle format using OpenAI's Whisper model.
Supports multiple audio formats and provides clean, timestamped transcriptions with GPU acceleration.
Author: Lee Irvine <lee@kezzi.co>
License: Open Source (MIT)
https://gist.github.com/leeirvinekezzico/d529bbecae4629a7ebd23e58ff7ce25d
Usage:
    python transcript-from-wav.py audio.wav
    python transcript-from-wav.py video.mp4 -m large -o subtitles.srt
    python transcript-from-wav.py audio.mp3 --print-only
"""
import os  # NOTE(review): appears unused in this file — confirm before removing
import sys
import argparse
from pathlib import Path
import torch
from transformers import pipeline
from transformers.utils import logging
import warnings

# Silence all Python-level warnings globally (deliberate: this is an
# end-user CLI tool, not a library).
warnings.filterwarnings('ignore')
# Suppress transformers warnings (model download / generation chatter)
logging.set_verbosity_error()
def transcribe_audio(audio_path, model_size="small", language="en"):
    """
    Run Whisper speech recognition over a single audio file.

    Args:
        audio_path: Path to the audio file to transcribe.
        model_size: Whisper checkpoint to load (tiny, base, small,
            medium, or large).
        language: Language code forwarded to the decoder (e.g. 'en',
            'es', 'fr').

    Returns:
        The raw pipeline result; with return_timestamps=True it carries
        a 'chunks' list of timestamped text segments.
    """
    print(f"Loading Whisper-{model_size} model...")

    # device 0 selects the first CUDA GPU; -1 falls back to CPU.
    on_gpu = torch.cuda.is_available()
    print(f"Using device: {'GPU' if on_gpu else 'CPU'}")

    recognizer = pipeline(
        "automatic-speech-recognition",
        model=f"openai/whisper-{model_size}",
        device=0 if on_gpu else -1,
    )

    print(f"Transcribing audio file: {audio_path}")
    return recognizer(
        audio_path,
        return_timestamps=True,
        generate_kwargs={"language": language},
    )
def format_timestamp(seconds):
    """
    Convert a time offset in seconds to SRT timestamp format (HH:MM:SS,mmm).

    Args:
        seconds: Offset from the start of the audio, as an int or
            (possibly fractional) float.

    Returns:
        The offset formatted as "HH:MM:SS,mmm".
    """
    # Work in rounded integer milliseconds to avoid float truncation:
    # the old int((seconds % 1) * 1000) turned e.g. 0.29 into 289 ms
    # because 0.29 * 1000 == 289.999... in binary floating point.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
def save_srt(transcript, output_path):
    """Write the transcript's timestamped chunks to *output_path* as UTF-8 SRT."""
    with open(output_path, 'w', encoding='utf-8') as srt_file:
        for index, chunk in enumerate(transcript['chunks'], 1):
            # Each chunk carries a (start, end) pair in seconds plus its text.
            begin, finish = chunk['timestamp']
            srt_file.write(f"{index}\n")
            srt_file.write(f"{format_timestamp(begin)} --> {format_timestamp(finish)}\n")
            srt_file.write(f"{chunk['text'].strip()}\n\n")
def print_srt(transcript):
    """Echo the transcript to stdout in SRT subtitle format."""
    print("---- SRT transcript ----")
    index = 1
    for chunk in transcript['chunks']:
        begin, finish = chunk['timestamp']
        print(index)
        print(f"{format_timestamp(begin)} --> {format_timestamp(finish)}")
        print(chunk['text'].strip())
        print()
        index += 1
def main():
    """Parse CLI arguments, transcribe the input file, and emit SRT output."""
    parser = argparse.ArgumentParser(description='Transcribe audio to SRT subtitles using Whisper')
    parser.add_argument('audio_file', help='Path to audio file')
    parser.add_argument('-o', '--output', help='Output SRT file path (default: same name as input with .srt extension)')
    parser.add_argument('-m', '--model', choices=['tiny', 'base', 'small', 'medium', 'large'],
                        default='small', help='Whisper model size (default: small)')
    parser.add_argument('-l', '--language', default='en',
                        help='Language code for transcription (default: en for English)')
    parser.add_argument('--print-only', action='store_true',
                        help='Only print to console, don\'t save file')
    args = parser.parse_args()

    source = Path(args.audio_file)
    if not source.exists():
        print(f"Error: File '{source}' not found.")
        sys.exit(1)

    # Warn (but keep going) when the extension is not a known media type;
    # the decoder may still handle it.
    known_suffixes = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.mp4', '.mov', '.avi'}
    if source.suffix.lower() not in known_suffixes:
        print(f"Warning: '{source.suffix}' may not be a supported audio format.")

    # Default the output next to the input, swapping its extension for .srt.
    destination = Path(args.output) if args.output else source.with_suffix('.srt')

    try:
        transcript = transcribe_audio(str(source), args.model, args.language)
        if args.print_only:
            print_srt(transcript)
        else:
            save_srt(transcript, destination)
            print(f"SRT file saved to: {destination}")
            print(f"Total segments: {len(transcript['chunks'])}")
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero.
        print(f"Error during transcription: {e}")
        sys.exit(1)
if __name__ == "__main__":
    # Honor explicit -W flags from the user; otherwise silence warnings
    # so the CLI output stays clean.
    if not sys.warnoptions:
        warnings.simplefilter("ignore")
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment