Created
March 9, 2026 05:19
-
-
Save cedricvidal/b5f33e98afaaca09e3a6177860faf998 to your computer and use it in GitHub Desktop.
YouTube video transcription: a self-contained uv script using `youtube-transcript-api`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = [ | |
| # "rich-click", | |
| # "youtube-transcript-api", | |
| # ] | |
| # /// | |
| import json | |
| import sys | |
| from datetime import timedelta | |
| from urllib.parse import urlparse, parse_qs | |
| import rich_click as click | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript | |
# Module-level rich-click settings that apply to the CLI defined below.
# NOTE(review): presumably these switch on Rich markup and Markdown rendering
# for help text -- confirm against the rich-click docs how the two interact.
click.rich_click.USE_RICH_MARKUP = True
click.rich_click.USE_MARKDOWN = True
def extract_video_id(video: str) -> str:
    """Extract the YouTube video ID from a URL, or return the input unchanged.

    Recognised forms (``http``/``https`` only):

    * ``youtu.be/<id>`` short links (first path segment is the ID)
    * any URL carrying a ``v=<id>`` query parameter (e.g. ``/watch?v=<id>``)
    * ``youtube.com/{embed,shorts,live,v}/<id>`` path-style links, including
      sub-domains and ``youtube-nocookie.com``

    Args:
        video: A YouTube URL or a bare video ID.

    Returns:
        The extracted video ID, or ``video`` (with surrounding whitespace
        removed) when no ID can be located.
    """
    # Pasted input often carries stray whitespace or a trailing newline.
    video = video.strip()
    parsed = urlparse(video)
    if parsed.scheme not in ("http", "https"):
        return video

    host = parsed.hostname or ""
    segments = [part for part in parsed.path.split("/") if part]

    # Short links: take only the first segment so a trailing "/" is dropped.
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    query = parse_qs(parsed.query)
    if "v" in query:
        return query["v"][0]

    # Path-style links carry the ID as the second path segment.
    youtube_domains = ("youtube.com", "youtube-nocookie.com")
    on_youtube = any(
        host == domain or host.endswith("." + domain) for domain in youtube_domains
    )
    if on_youtube and len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    return video
def format_srt_time(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp string, rounded to the nearest millisecond. The hours
        field is at least two digits and grows for offsets of 100 hours or more.
    """
    # Round once to whole milliseconds, then stay in integer arithmetic.
    # Subtracting floats and truncating (e.g. int((1.001 - 1) * 1000)) loses
    # the last millisecond because 0.000999... truncates to 0.
    total_millis = max(0, round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
def to_srt(transcript) -> str:
    """Render a transcript as SRT text.

    Each entry of ``transcript`` must expose ``start``, ``duration`` and
    ``text`` attributes. Cues are numbered from 1; the end time of a cue is
    ``start + duration``.

    Args:
        transcript: Iterable of transcript entries.

    Returns:
        The SRT document, with cues separated by a blank line.
    """

    def render_cue(number, snippet) -> str:
        # One cue: sequence number, time range, then the stripped caption text.
        begin = format_srt_time(snippet.start)
        finish = format_srt_time(snippet.start + snippet.duration)
        caption = snippet.text.strip()
        return f"{number}\n{begin} --> {finish}\n{caption}\n"

    return "\n".join(
        render_cue(number, snippet)
        for number, snippet in enumerate(transcript, start=1)
    )
def to_txt(transcript) -> str:
    """Render a transcript as plain text, one entry per line.

    Args:
        transcript: Iterable of entries exposing a ``text`` attribute.

    Returns:
        The whitespace-stripped text of every entry, joined with newlines.
    """
    stripped_lines = [snippet.text.strip() for snippet in transcript]
    return "\n".join(stripped_lines)
def to_json(transcript) -> str:
    """Serialize a transcript as a pretty-printed JSON array.

    Each entry becomes an object with ``text``, ``start`` and ``duration``
    keys, copied from the attributes of the same name.

    Args:
        transcript: Iterable of entries exposing ``text``, ``start`` and
            ``duration`` attributes.

    Returns:
        JSON text indented by two spaces, with non-ASCII characters kept
        as-is rather than escaped.
    """
    exported_fields = ("text", "start", "duration")
    records = [
        {name: getattr(snippet, name) for name in exported_fields}
        for snippet in transcript
    ]
    return json.dumps(records, indent=2, ensure_ascii=False)
@click.command()
@click.argument("video")
@click.option("--output", "-o", default=None, help="Output file path (default: stdout)")
@click.option(
    "--format", "-f", "fmt",
    default="txt",
    type=click.Choice(["txt", "srt", "json"]),
    help="Output format (default: txt)",
)
@click.option("--language", "-l", default="en", help="Preferred language code (default: en)")
def cli(video: str, output: str | None, fmt: str, language: str):
    """
    Fetch and format the transcript of a YouTube video.
    VIDEO: YouTube URL or video ID
    """
    # The docstring above is the user-facing help text, so it is left as-is.
    # Progress and status messages go to stderr so stdout carries only the
    # transcript and can be piped safely.
    video_id = extract_video_id(video)
    click.echo(f"Fetching transcript for: {video_id}", err=True)

    try:
        transcript = YouTubeTranscriptApi().fetch(video_id, languages=[language])
    except (TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript) as e:
        # Report the library's own message and exit with a failure status.
        click.echo(f"❌ {e}", err=True)
        sys.exit(1)

    # Any format other than "srt" or "json" falls back to plain text.
    renderers = {"srt": to_srt, "json": to_json}
    render = renderers.get(fmt, to_txt)
    content = render(transcript)

    if not output:
        click.echo(content)
        return

    with open(output, "w", encoding="utf-8") as sink:
        sink.write(content)
    click.echo(f"✅ Saved to: {output}", err=True)
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment