-
-
Save shdwkl/0c1b4345d358518422bfffbce3400543 to your computer and use it in GitHub Desktop.
This script converts YouTube subtitle files (.vtt) to plain text, with duplicate lines removed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import os | |
| import glob | |
# A line made up only of digits, colons and dots: a bare timestamp or a
# numeric cue identifier. Such lines are never treated as caption text.
_NUMERIC_LINE_RE = re.compile(r'[0-9:.]+')

# Inline WebVTT cue tags such as <c>, </c>, <i>, <v Name> or <00:00:01.000>.
# The first character after "<" (or "</") must be alphanumeric so that stray
# text like "a < b" is left alone.
_INLINE_TAG_RE = re.compile(r'</?[0-9A-Za-z][^<>]*>')


def _extract_caption_lines(lines):
    """Yield the caption text found in an iterable of raw VTT lines.

    Dropped: the ``WEBVTT`` header block (including metadata lines that
    follow it up to the first blank line), cue timing lines (with or without
    trailing cue settings), purely numeric lines, blank lines, and inline cue
    tags. A line identical to the previously yielded line is skipped.
    """
    in_header = False
    previous = None
    for index, raw in enumerate(lines):
        line = raw.strip()

        if index == 0 and line.startswith('WEBVTT'):
            in_header = True
            continue
        if in_header:
            # The header runs until the first blank line; a timing line also
            # ends it in case the blank separator is missing.
            if line and '-->' not in line:
                continue
            in_header = False

        if not line:
            continue
        # Any line containing "-->" is a cue timing line; testing for the
        # arrow (rather than anchoring a regex at end-of-line) also catches
        # timings followed by settings such as "align:start position:0%".
        if '-->' in line or line.startswith('WEBVTT'):
            continue
        if _NUMERIC_LINE_RE.fullmatch(line):
            continue

        # Strip tags before comparing, otherwise a tagged line and its plain
        # repeat in the next cue would not be recognised as duplicates.
        text = _INLINE_TAG_RE.sub('', line).strip()
        if not text or text == previous:
            continue
        previous = text
        yield text


def vtt_to_txt(vtt_filepath, txt_filepath):
    """Convert a VTT subtitle file to plain text.

    Timestamps, the WEBVTT header, inline cue tags, blank lines and
    consecutive duplicate lines are removed; each remaining caption line is
    written to the output followed by a newline.

    Args:
        vtt_filepath: Path of the ``.vtt`` file to read (UTF-8, an optional
            leading BOM is tolerated).
        txt_filepath: Path of the text file to create or overwrite (UTF-8).

    Returns:
        True if the output file was written, False if an error was reported.
        Errors are printed rather than raised.
    """
    try:
        # 'utf-8-sig' drops a leading BOM, which would otherwise hide the
        # "WEBVTT" signature on the first line.
        with open(vtt_filepath, 'r', encoding='utf-8-sig') as vtt_file:
            caption_lines = list(_extract_caption_lines(vtt_file))
        # The output is opened only after the input was read successfully, so
        # a failed read never leaves an empty or truncated text file behind.
        with open(txt_filepath, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(f"{text}\n" for text in caption_lines)
    except FileNotFoundError:
        print(f"Error: File not found: {vtt_filepath}")
        return False
    except (OSError, UnicodeError) as e:
        print(f"An error occurred: {e}")
        return False
    return True
def main():
    """Convert every ``.vtt`` file in the current directory to a ``.txt`` file.

    Each output file has the same base name as its input with the extension
    replaced by ``.txt``. Inputs whose output file already exists are skipped
    so existing text files are never overwritten.

    Raises:
        SystemExit: With status 1 when the current directory contains no
            ``.vtt`` files.
    """
    # Sorted so files are processed (and reported) in a reproducible order;
    # the order glob returns depends on the file system.
    vtt_files = sorted(glob.glob("*.vtt"))
    if not vtt_files:
        print("No .vtt files found in current directory.")
        # Raise SystemExit directly: the bare exit() builtin is added by the
        # site module for interactive sessions and is not always available.
        raise SystemExit(1)

    for vtt_file in vtt_files:
        txt_file = os.path.splitext(vtt_file)[0] + ".txt"
        if os.path.exists(txt_file):
            print(f"Skipping '{vtt_file}': Output file '{txt_file}' already exists.")
            continue
        vtt_to_txt(vtt_file, txt_file)
        print(f"Converted '{vtt_file}' to '{txt_file}'")
# Run the batch conversion only when this file is executed as a script, so
# importing it (e.g. to reuse vtt_to_txt) has no side effects.
if __name__ == '__main__':
    main()
Author
shdwkl
commented
Jul 12, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment