Created
September 24, 2025 11:17
-
-
Save alrocar/3f7c0c94d210a46bd787cc1294739dea to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # translate_srt_openai.py | |
| import re, sys, time, tiktoken | |
| from openai import OpenAI | |
# Shared API client. It is constructed with no arguments, so credentials and
# endpoint come from the openai library's own defaults
# (presumably the OPENAI_API_KEY environment variable -- confirm).
client = OpenAI()
model = "gpt-4o"  # Using GPT-4o which is officially supported

# Tokenizer used only to size chunks before they are sent to the model.
try:
    enc = tiktoken.encoding_for_model(model)
except KeyError:
    # The installed tiktoken does not know this model name. The encoding is
    # only used to estimate chunk sizes, so an approximate tokenizer is
    # preferable to crashing at import time.
    enc = tiktoken.get_encoding("cl100k_base")
def clean_srt(path):
    """Read an SRT subtitle file and return only its dialogue text.

    Args:
        path: Path of the ``.srt`` file to read.

    Returns:
        The dialogue lines, each stripped of surrounding whitespace, joined
        with ``"\\n"``. Blank lines are dropped.

    Note:
        Every line that consists solely of digits is removed, so a dialogue
        line that is just a number is dropped along with the cue indexes.
    """
    # "utf-8-sig" decodes UTF-8 and transparently drops a leading BOM.
    # Without that, the BOM sticks to the first cue index, the digits-only
    # pattern below fails to match it, and the index leaks into the output.
    with open(path, encoding="utf-8-sig") as fh:
        s = fh.read()

    # Cue index lines: lines made of digits only.
    s = re.sub(r'^\d+\s*$', '', s, flags=re.M)
    # Timestamp ranges ("00:00:01,000 --> 00:00:02,000") through end of line.
    s = re.sub(r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}.*', '', s)

    return '\n'.join(stripped for stripped in map(str.strip, s.splitlines()) if stripped)
def get_last_sentences(text, num_sentences=3):
    """Extract the last few sentences of ``text`` for context overlap.

    Sentences are delimited by the literal ``". "`` separator.

    Args:
        text: Text to take the tail of.
        num_sentences: How many trailing sentences to keep.

    Returns:
        ``text`` unchanged when it holds ``num_sentences`` sentences or
        fewer; otherwise the last ``num_sentences`` sentences joined with
        ``". "``. A closing period is added only when the tail does not
        already end with sentence punctuation. An empty string is returned
        when ``num_sentences`` is not positive.
    """
    if num_sentences <= 0:
        # sentences[-0:] would be the whole list, not "no sentences".
        return ''

    sentences = text.split('. ')
    if len(sentences) <= num_sentences:
        return text

    tail = '. '.join(sentences[-num_sentences:]).rstrip()
    # The final piece keeps whatever punctuation ended the text, so only
    # add a period when one is actually missing (avoids "..", "?.").
    if tail and tail[-1] not in '.!?':
        tail += '.'
    return tail
def remove_overlap_from_translation(translation, overlap_text):
    """Strip a leading copy of ``overlap_text`` from ``translation``.

    The overlap is split into sentences on ``". "``. The translation is then
    consumed from the start, one overlap sentence at a time, for as long as
    each sentence appears next in the translation and is followed by a
    period and whitespace (or by the end of the text).

    Args:
        translation: Translated text that may begin with the overlap.
        overlap_text: Text expected at the start of ``translation``.

    Returns:
        ``translation`` without the matched leading sentences. It is
        returned unchanged when either argument is empty or when the first
        overlap sentence does not match.
    """
    if not overlap_text or not translation:
        return translation

    length = len(translation)
    consumed = 0
    for sentence in overlap_text.split('. '):
        # Compare without the trailing period: the last overlap sentence
        # keeps its "." after the split while the others lose theirs.
        core = sentence.strip().rstrip('.')
        if not core:
            continue

        start = consumed
        while start < length and translation[start].isspace():
            start += 1
        if not translation.startswith(core, start):
            break

        core_end = start + len(core)
        end = core_end
        while end < length and translation[end] == '.':
            end += 1
        # Require a real sentence boundary so that "A" does not match the
        # beginning of "Abc": a period followed by whitespace, or the end
        # of the text.
        if end < length and (end == core_end or not translation[end].isspace()):
            break
        consumed = end

    if consumed == 0:
        return translation
    return translation[consumed:].lstrip()
def chunk(text, max_tokens=120000, count_tokens=None):
    """Split ``text`` into newline-joined chunks that fit a token budget.

    Lines are never split. A single line that exceeds ``max_tokens`` by
    itself becomes its own chunk. This function adds no overlap between
    chunks.

    Args:
        text: Text to split; it is processed line by line.
        max_tokens: Token budget per chunk.
        count_tokens: Optional callable returning the token count of a
            string. Defaults to counting with the module-level ``enc``
            tokenizer.

    Returns:
        List of chunk strings, in order. Empty when ``text`` has no lines.

    NOTE(review): the budget only bounds the request size. The model's
    maximum response length may be far smaller than the default -- confirm
    against the model's documented limits.
    """
    counter = count_tokens if count_tokens is not None else (lambda s: len(enc.encode(s)))

    parts, buf, buf_tokens = [], [], 0
    for line in text.splitlines():
        line_tokens = counter(line)
        # Keep a running total instead of re-encoding the whole buffer for
        # every line. The joining newline is estimated at one token.
        needed = line_tokens + (1 if buf else 0)
        if buf and buf_tokens + needed > max_tokens:
            parts.append('\n'.join(buf))
            buf, buf_tokens = [line], line_tokens
        else:
            # Also taken when buf is empty, so an oversized first line
            # never flushes an empty chunk.
            buf.append(line)
            buf_tokens += needed
    if buf:
        parts.append('\n'.join(buf))
    return parts
def main():
    """Translate the SRT file named in argv[1] and write the result to argv[2].

    The cleaned transcript is split into chunks. Each chunk is sent to the
    chat completions API, prefixed with the tail of the previous
    translation for context. That prefix is stripped from the returned
    translation before the segments are joined with blank lines.
    """
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <input.srt> <output.txt>")
    src, out = sys.argv[1], sys.argv[2]

    text = clean_srt(src)
    parts = chunk(text)
    out_segs = []
    overlap_text = None
    for p in parts:
        if not p.strip():
            # Nothing to translate; do not spend an API call on it.
            continue
        # Add overlap from previous chunk for better context. overlap_text
        # is still None for the first chunk.
        if overlap_text:
            p = overlap_text + "\n\n" + p
        msg = [
            {"role": "system", "content": "Translate the following Spanish transcript to clear, natural English. Output only the translation."},
            {"role": "user", "content": p},
        ]
        r = client.chat.completions.create(model=model, messages=msg, temperature=0)
        choice = r.choices[0]
        if choice.finish_reason == "length":
            # The model stopped at its output limit, so the tail of this
            # chunk is missing from the translation.
            print("warning: translation truncated by the model's output limit", file=sys.stderr)
        # content can be None (e.g. a refusal); treat that as empty text.
        translation = (choice.message.content or "").strip()
        # Remove overlap from translation to avoid duplicates
        if overlap_text:
            translation = remove_overlap_from_translation(translation, overlap_text)
        out_segs.append(translation)
        # Store last 3 sentences for next chunk overlap
        overlap_text = get_last_sentences(translation, 3)
        time.sleep(0.2)

    # Explicit encoding: the translated text must not depend on the platform
    # default. The context manager guarantees the file is flushed and closed.
    with open(out, 'w', encoding='utf-8') as fh:
        fh.write('\n\n'.join(out_segs))
    print(f"wrote {out}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment