Skip to content

Instantly share code, notes, and snippets.

@shdwkl
Forked from glasslion/vtt2text.py
Last active March 21, 2025 11:46
Show Gist options
  • Select an option

  • Save shdwkl/0c1b4345d358518422bfffbce3400543 to your computer and use it in GitHub Desktop.

Select an option

Save shdwkl/0c1b4345d358518422bfffbce3400543 to your computer and use it in GitHub Desktop.
This script converts YouTube subtitle files (VTT) to plain text, with duplicate lines removed.
#!/usr/bin/env python3
import re
import os
import glob
# Matches cue identifiers ("12"), bare timestamps, and cue timing lines,
# optionally followed by cue settings such as "align:start position:0%".
_VTT_TIMING_RE = re.compile(r'^[0-9:.]+(\s+-->\s+[0-9:.]+(\s.*)?)?$')
# Matches inline cue markup such as <c>, </c>, <v Speaker> and <00:00:01.500>.
_VTT_TAG_RE = re.compile(r'<[^>]*>')


def _iter_caption_lines(raw_lines):
    """Yield caption text from raw VTT lines, dropping consecutive repeats."""
    in_header = False
    previous_line = ""
    for index, raw_line in enumerate(raw_lines):
        # Strip inline markup first so a tagged line and its plain repeat
        # compare equal in the duplicate check below.
        line = _VTT_TAG_RE.sub('', raw_line).strip()
        if index == 0 and line.startswith('WEBVTT'):
            # The header block (e.g. "Kind:", "Language:") runs from the
            # signature line up to the first blank line.
            in_header = True
            continue
        if not line:
            in_header = False
            continue
        if in_header or line.startswith('WEBVTT') or _VTT_TIMING_RE.match(line):
            continue
        if line != previous_line:
            yield line
            previous_line = line


def vtt_to_txt(vtt_filepath, txt_filepath):
    """Convert a VTT subtitle file to plain text.

    The header block, cue identifiers, timing lines (with or without cue
    settings), inline cue tags and blank lines are removed. A caption line
    that repeats the line written immediately before it is dropped.

    The input is read and decoded completely before the output is opened, so
    a missing or undecodable input never leaves a partial output file behind.

    Args:
        vtt_filepath: Path of the UTF-8 encoded ``.vtt`` file to read. A
            leading byte-order mark is tolerated.
        txt_filepath: Path of the text file to create or overwrite.

    Returns:
        True if the text file was written, False if an error occurred. Errors
        are reported on standard output rather than raised.
    """
    try:
        # utf-8-sig drops a leading BOM, which would otherwise hide the
        # "WEBVTT" signature from the header check.
        with open(vtt_filepath, 'r', encoding='utf-8-sig') as vtt_file:
            caption_lines = list(_iter_caption_lines(vtt_file))
        with open(txt_filepath, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(line + '\n' for line in caption_lines)
    except FileNotFoundError as e:
        # e.filename names the path that was actually missing, which may be
        # the output location rather than the input file.
        print(f"Error: File not found: {e.filename or vtt_filepath}")
        return False
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller carry on with other files.
        print(f"An error occurred: {e}")
        return False
    return True
def main():
    """Convert every ``*.vtt`` file in the current directory to a ``.txt`` file.

    Each output is named after its input with the extension replaced by
    ``.txt``. An input whose output file already exists is skipped, so
    existing transcripts are never overwritten.

    Raises:
        SystemExit: With exit status 1 when the current directory contains no
            ``.vtt`` files.
    """
    # Sorted so files are handled in a predictable order; glob does not
    # guarantee one.
    vtt_files = sorted(glob.glob("*.vtt"))
    if not vtt_files:
        print("No .vtt files found in current directory.")
        # Raised directly instead of calling exit(): that helper is injected
        # by the site module for interactive use and is not always available.
        raise SystemExit(1)

    for vtt_file in vtt_files:
        txt_file = os.path.splitext(vtt_file)[0] + ".txt"
        if os.path.exists(txt_file):
            print(f"Skipping '{vtt_file}': Output file '{txt_file}' already exists.")
            continue
        vtt_to_txt(vtt_file, txt_file)
        # The converter reports its own errors. The output did not exist
        # before the call, so only claim success if it exists now.
        if os.path.exists(txt_file):
            print(f"Converted '{vtt_file}' to '{txt_file}'")


# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
@shdwkl
Copy link
Author

shdwkl commented Jul 12, 2023

cp vtt2text ~
sudo ln -s ~/vtt2text /usr/bin
vtt2text /path/to/vttfile
will generate a vttfile.txt within its relative path

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment