ppo/transcribe.py

## transcribe.py
#!/usr/bin/env python
"""Transcribe files (audio or video, to TXT or SRT).

AI Models: https://github.com/openai/whisper

Requirements:
```
brew install ffmpeg
pip install openai-whisper
```
"""

import argparse
import time
from datetime import timedelta
from pathlib import Path

import whisper


# Values accepted by the --model and --format options.
# Model names: https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
MODEL_CHOICES = [
    "tiny",
    "base",
    "small",
    "medium",
    "large",
    "turbo",
]
FORMAT_CHOICES = ["txt", "srt"]

# Values used when the corresponding option is omitted.
DEFAULT_MODEL = "large"
DEFAULT_FORMAT = "txt"
DEFAULT_LANGUAGE = "en"


# ANSI SGR escape sequences used to color terminal output.
C_HEADING = "\033[1;33m"  # bold yellow
C_SUCCESS = "\033[1;32m"  # bold green
C_ERROR = "\033[1;31m"  # bold red
C_RESET = "\033[0m"  # reset all attributes


def heading(text):
    """Print `text` to stdout wrapped in the heading color codes."""
    colored = f"{C_HEADING}{text}{C_RESET}"
    print(colored)


def format_duration(seconds):
    """Render an elapsed time given in seconds as `H:MM:SS`.

    Fractional seconds are dropped. The hours field is not zero-padded and
    is not capped at 23.
    """
    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(remainder, 60)
    return f"{int(whole_hours)}:{int(whole_minutes):02d}:{int(leftover):02d}"


def format_srt_timestamp(seconds):
    """Convert seconds to SRT timestamp format: `HH:MM:SS,mmm`.

    The value is rounded to the nearest millisecond *before* being split into
    fields. Splitting the float directly truncates binary rounding error, so
    an input such as ``1.001`` (stored as 1.000999...) would render as
    ``,000`` instead of ``,001``.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT cannot express them.

    Returns:
        The timestamp string. The hours field is at least two digits wide and
        grows if the offset is 100 hours or more.
    """
    # Work in integer milliseconds from here on to avoid float artifacts.
    total_millis = max(0, int(round(seconds * 1000)))
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def save_as_srt(result, output_path):
    """Write the segments of `result` to `output_path` as SRT cues.

    Each segment becomes one cue, numbered from 1: an index line, a
    `start --> end` timing line, the stripped segment text, and a blank
    separator line. The file is written as UTF-8.

    Returns `output_path` unchanged.
    """
    with open(output_path, "w", encoding="utf-8") as srt_file:
        for number, seg in enumerate(result["segments"], start=1):
            begin = format_srt_timestamp(seg["start"])
            finish = format_srt_timestamp(seg["end"])
            caption = seg["text"].strip()
            srt_file.write(f"{number}\n{begin} --> {finish}\n{caption}\n\n")

    return output_path


def save_as_txt(result, output_path):
    """Save result segments as TXT, one stripped segment per line.

    Args:
        result: Mapping with a "segments" list; each segment has a "text" string.
        output_path: Destination file; created or overwritten, written as UTF-8.

    Returns:
        `output_path`, matching what `save_as_srt` returns so both writers
        share the same contract.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{segment['text'].strip()}\n" for segment in result["segments"])

    return output_path


def transcribe_file(path, model, language, format):
    """Run `model` on `path` and write the transcript next to the source file.

    When `format` is "srt" the output replaces the file suffix with
    `.<language>.srt`; any other value replaces it with `.txt`.

    Returns the path of the file that was written.
    """
    transcription = model.transcribe(
        str(path),
        language=language,
        verbose=False,
        fp16=False,  # Suppress warning
    )

    wants_srt = format == "srt"
    suffix = f".{language}.srt" if wants_srt else ".txt"
    writer = save_as_srt if wants_srt else save_as_txt

    output_path = path.with_suffix(suffix)
    writer(transcription, output_path)
    return output_path


def parse_args(argv=None):
    """Parse command-line options for the transcriber.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads `sys.argv[1:]`, so existing `parse_args()` calls
            behave exactly as before.

    Returns:
        Namespace with `files` (one or more paths), `model`, `format` and
        `language` (populated by `--lang`).
    """
    parser = argparse.ArgumentParser(description="Transcribe files (audio or video) using Whisper")
    parser.add_argument("files", nargs="+", help="Files to transcribe")
    parser.add_argument(
        "--model",
        choices=MODEL_CHOICES,
        default=DEFAULT_MODEL,
        help=f"Whisper model size (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--format",
        choices=FORMAT_CHOICES,
        default=DEFAULT_FORMAT,
        # Built from the constants so the help text cannot drift from them.
        help=f"Output format: {', '.join(FORMAT_CHOICES)} (default: {DEFAULT_FORMAT})",
    )
    parser.add_argument(
        "--lang",
        dest="language",
        default=DEFAULT_LANGUAGE,
        help=f"Audio language (default: {DEFAULT_LANGUAGE})",
    )

    return parser.parse_args(argv)

def main():
    """Transcribe every file named on the command line.

    Loads the requested Whisper model once, then processes the files in the
    order given. Arguments that are not existing regular files are reported
    on stderr and skipped; the remaining files are still processed.
    """
    import sys  # Local import: only needed to report errors on stderr.

    args = parse_args()

    print(f"Loading Whisper model '{args.model}'…")
    model = whisper.load_model(args.model)

    for file in args.files:
        path = Path(file)
        # is_file() also rejects directories, which pass exists() but cannot
        # be transcribed.
        if not path.is_file():
            print(f"{C_ERROR}Error: {file} not found or not a file{C_RESET}", file=sys.stderr)
            continue

        print()
        heading(f"Transcribing: {path.name}")

        # perf_counter() is monotonic, so the elapsed time cannot be skewed
        # by system clock adjustments during a long transcription.
        start_time = time.perf_counter()

        output_path = transcribe_file(path, model, args.language, args.format)

        duration_str = format_duration(time.perf_counter() - start_time)

        print(f"  {C_SUCCESS}{args.format.upper()}: {output_path.name}{C_RESET}")
        print(f"  Processed in {duration_str}")

    print()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""Transcribe files (audio or video, to TXT or SRT).

	AI Models: https://github.com/openai/whisper

	Requirements:
	```
	brew install ffmpeg
	pip install openai-whisper
	```
	"""

	import argparse
	import time
	from datetime import timedelta
	from pathlib import Path

	import whisper


	# Values accepted by the --model and --format options.
	# Model names: https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
	MODEL_CHOICES = [
	    "tiny",
	    "base",
	    "small",
	    "medium",
	    "large",
	    "turbo",
	]
	FORMAT_CHOICES = ["txt", "srt"]

	# Values used when the corresponding option is omitted.
	DEFAULT_MODEL = "large"
	DEFAULT_FORMAT = "txt"
	DEFAULT_LANGUAGE = "en"


	# ANSI SGR escape sequences used to color terminal output.
	C_HEADING = "\033[1;33m"  # bold yellow
	C_SUCCESS = "\033[1;32m"  # bold green
	C_ERROR = "\033[1;31m"  # bold red
	C_RESET = "\033[0m"  # reset all attributes


	def heading(text):
	    """Print `text` to stdout wrapped in the heading color codes."""
	    print(f"{C_HEADING}{text}{C_RESET}")


	def format_duration(seconds):
	    """Format a duration given in seconds as `H:MM:SS`.

	    Fractional seconds are dropped. The hours field is not zero-padded and
	    is not capped at 23.
	    """
	    hours, remainder = divmod(seconds, 3600)
	    minutes, secs = divmod(remainder, 60)
	    return f"{int(hours)}:{int(minutes):02d}:{int(secs):02d}"


	def format_srt_timestamp(seconds):
	    """Convert seconds to SRT timestamp format: `HH:MM:SS,mmm`.

	    The value is rounded to the nearest millisecond *before* being split
	    into fields. Splitting the float directly truncates binary rounding
	    error, so an input such as ``1.001`` would render as ``,000``.

	    Args:
	        seconds: Offset from the start of the media, in seconds. Negative
	            values are clamped to zero because SRT cannot express them.

	    Returns:
	        The timestamp string. The hours field is at least two digits wide
	        and grows if the offset is 100 hours or more.
	    """
	    # Work in integer milliseconds from here on to avoid float artifacts.
	    total_millis = max(0, int(round(seconds * 1000)))
	    total_secs, millis = divmod(total_millis, 1000)
	    total_minutes, secs = divmod(total_secs, 60)
	    hours, minutes = divmod(total_minutes, 60)
	    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


	def save_as_srt(result, output_path):
	    """Save result segments as SRT.

	    Each segment becomes one cue, numbered from 1: an index line, a
	    `start --> end` timing line, the stripped segment text, and a blank
	    separator line. The file is written as UTF-8.

	    Returns `output_path` unchanged.
	    """
	    with open(output_path, "w", encoding="utf-8") as f:
	        for i, segment in enumerate(result["segments"], start=1):
	            start = format_srt_timestamp(segment["start"])
	            end = format_srt_timestamp(segment["end"])
	            text = segment["text"].strip()

	            f.write(f"{i}\n")
	            f.write(f"{start} --> {end}\n")
	            f.write(f"{text}\n\n")

	    return output_path


	def save_as_txt(result, output_path):
	    """Save result segments as TXT, one stripped segment per line.

	    Args:
	        result: Mapping with a "segments" list; each segment has a "text" string.
	        output_path: Destination file; created or overwritten, written as UTF-8.

	    Returns:
	        `output_path`, matching what `save_as_srt` returns so both writers
	        share the same contract.
	    """
	    with open(output_path, "w", encoding="utf-8") as f:
	        for segment in result["segments"]:
	            text = segment["text"].strip()
	            f.write(f"{text}\n")

	    return output_path


	def transcribe_file(path, model, language, format):
	    """Transcribe file to the given format.

	    When `format` is "srt" the output replaces the file suffix with
	    `.<language>.srt`; any other value replaces it with `.txt`.

	    Returns the path of the file that was written.
	    """
	    result = model.transcribe(
	        str(path),
	        language=language,
	        verbose=False,
	        fp16=False,  # Suppress warning
	    )

	    if format == "srt":
	        output_path = path.with_suffix(f".{language}.srt")
	        save_as_srt(result, output_path)
	    else:
	        output_path = path.with_suffix(".txt")
	        save_as_txt(result, output_path)

	    return output_path


	def parse_args(argv=None):
	    """Parse command-line options for the transcriber.

	    Args:
	        argv: Optional list of argument strings. When None (the default),
	            argparse reads `sys.argv[1:]`, so existing `parse_args()`
	            calls behave exactly as before.

	    Returns:
	        Namespace with `files` (one or more paths), `model`, `format` and
	        `language` (populated by `--lang`).
	    """
	    parser = argparse.ArgumentParser(description="Transcribe files (audio or video) using Whisper")
	    parser.add_argument("files", nargs="+", help="Files to transcribe")
	    parser.add_argument(
	        "--model",
	        default=DEFAULT_MODEL,
	        choices=MODEL_CHOICES,
	        help=f"Whisper model size (default: {DEFAULT_MODEL})",
	    )
	    parser.add_argument(
	        "--format",
	        choices=FORMAT_CHOICES,
	        default=DEFAULT_FORMAT,
	        # Built from the constants so the help text cannot drift from them.
	        help=f"Output format: {', '.join(FORMAT_CHOICES)} (default: {DEFAULT_FORMAT})",
	    )
	    parser.add_argument(
	        "--lang",
	        dest="language",
	        default=DEFAULT_LANGUAGE,
	        help=f"Audio language (default: {DEFAULT_LANGUAGE})",
	    )

	    return parser.parse_args(argv)

	def main():
	    """Transcribe every file named on the command line.

	    Loads the requested Whisper model once, then processes the files in
	    the order given. Arguments that are not existing regular files are
	    reported on stderr and skipped; the remaining files are still
	    processed.
	    """
	    import sys  # Local import: only needed to report errors on stderr.

	    args = parse_args()

	    print(f"Loading Whisper model '{args.model}'…")
	    model = whisper.load_model(args.model)

	    for file in args.files:
	        path = Path(file)
	        # is_file() also rejects directories, which pass exists() but
	        # cannot be transcribed.
	        if not path.is_file():
	            print(f"{C_ERROR}Error: {file} not found or not a file{C_RESET}", file=sys.stderr)
	            continue

	        print()
	        heading(f"Transcribing: {path.name}")

	        # perf_counter() is monotonic, so the elapsed time cannot be
	        # skewed by system clock adjustments during a long transcription.
	        start_time = time.perf_counter()

	        output_path = transcribe_file(path, model, args.language, args.format)

	        duration = time.perf_counter() - start_time
	        duration_str = format_duration(duration)

	        print(f" {C_SUCCESS}{args.format.upper()}: {output_path.name}{C_RESET}")
	        print(f" Processed in {duration_str}")

	    print()


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()
No results found