Skip to content

Instantly share code, notes, and snippets.

@alrocar
Created September 24, 2025 11:17
Show Gist options
  • Select an option

  • Save alrocar/3f7c0c94d210a46bd787cc1294739dea to your computer and use it in GitHub Desktop.

Select an option

Save alrocar/3f7c0c94d210a46bd787cc1294739dea to your computer and use it in GitHub Desktop.
# translate_srt_openai.py
import re, sys, time, tiktoken
from openai import OpenAI
# Shared OpenAI client used for every translation request below.
# No arguments are passed, so credentials/configuration presumably come from
# the environment — confirm against the OpenAI SDK documentation.
client = OpenAI()
# Chat model used for translation; also selects the tokenizer on the next line.
model = "gpt-4o" # Using GPT-4o which is officially supported
# Tokenizer matching `model`, used by chunk() to measure text in tokens.
enc = tiktoken.encoding_for_model(model)
def clean_srt(path):
    """Return the spoken text of an SRT subtitle file, one cue line per line.

    Cue index lines and ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timestamp lines are
    removed, the remaining lines are stripped of surrounding whitespace, and
    blank lines are dropped.

    Args:
        path: Path of the ``.srt`` file to read.

    Returns:
        The remaining text lines joined with ``"\\n"``.
    """
    timestamp = r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}'
    # utf-8-sig: decode explicitly (not via the locale default) and drop a
    # leading BOM, which would otherwise keep the first cue index from matching.
    with open(path, encoding='utf-8-sig') as f:
        s = f.read()
    # Only treat a digits-only line as a cue index when a timestamp line
    # follows it, so numeric dialogue such as "2025" is preserved.
    s = re.sub(
        r'^[ \t]*\d+[ \t]*\r?\n(?=[ \t]*' + timestamp + r')',
        '',
        s,
        flags=re.M,
    )
    # Remove the timestamp and anything after it on the same line
    # (e.g. positioning attributes).
    s = re.sub(timestamp + r'.*', '', s)
    return '\n'.join(l.strip() for l in s.splitlines() if l.strip())
def get_last_sentences(text, num_sentences=3):
    """Extract the last few sentences of ``text`` for context overlap.

    Sentences are delimited by the literal separator ``". "``.

    Args:
        text: Text to take the trailing sentences from.
        num_sentences: How many trailing sentences to keep.

    Returns:
        ``text`` unchanged when it holds ``num_sentences`` sentences or fewer;
        otherwise the last ``num_sentences`` sentences joined with ``". "``.
        A closing period is added only if the result does not already end
        with ``.``, ``!`` or ``?``. Returns ``""`` when ``num_sentences`` is
        not positive.
    """
    if num_sentences <= 0:
        # sentences[-0:] would be the whole list, not "no sentences".
        return ''
    # Ignore whitespace-only fragments (e.g. produced by a trailing ". ").
    sentences = [s for s in text.split('. ') if s.strip()]
    if len(sentences) <= num_sentences:
        return text
    tail = '. '.join(sentences[-num_sentences:]).rstrip()
    # The final fragment keeps its own punctuation after the split, so only
    # add a period when one is actually missing (avoids "Four..").
    if not tail.endswith(('.', '!', '?')):
        tail += '.'
    return tail
def remove_overlap_from_translation(translation, overlap_text):
    """Remove the overlapping part from the translation to avoid duplicates.

    The sentences of ``overlap_text`` (split on ``". "``) are matched, in
    order, against the beginning of ``translation``. Each sentence that is
    found there, followed by a period and whitespace (or the end of the
    text), is cut off. Matching stops at the first sentence that does not
    match.

    Args:
        translation: Translated text that may start with the overlap.
        overlap_text: Text expected at the start of ``translation``; may be
            ``None`` or empty.

    Returns:
        ``translation`` without the matched leading sentences. It is returned
        unchanged when either argument is empty or nothing matches.
    """
    if not overlap_text or not translation:
        return translation
    remainder = translation
    for sentence in overlap_text.split('. '):
        # The last overlap sentence still carries its closing period after
        # the split; drop it so it compares equal to the translation's copy.
        sentence = sentence.strip().rstrip('.')
        if not sentence:
            continue
        candidate = remainder.lstrip()
        if not candidate.startswith(sentence):
            break
        after = candidate[len(sentence):]
        rest = after.lstrip('.')
        # Require a real sentence boundary (period(s) then whitespace or end
        # of text) so that "Hi" does not match the start of "Hills are ...".
        if after and (rest == after or (rest and not rest[0].isspace())):
            break
        remainder = rest.lstrip()
    return remainder
def chunk(text, max_tokens=120000):
    """Split ``text`` into chunks of whole lines of about ``max_tokens`` tokens.

    Lines are accumulated until adding the next one would push the chunk's
    estimated token count past ``max_tokens``; that line then starts a new
    chunk. Token counts come from the module-level ``enc`` tokenizer. This
    function adds no overlap between chunks.

    Args:
        text: Text to split; it is broken on line boundaries only.
        max_tokens: Token budget per chunk.

    Returns:
        A list of chunks, each made of consecutive lines joined with
        ``"\\n"``. A single line larger than ``max_tokens`` is emitted as its
        own, oversized chunk because lines are never split.
    """
    # NOTE(review): the 120k default sits close to the model's context window
    # and a translation is roughly as long as its input, so the response may
    # be truncated by the model's output-token limit — confirm against the
    # model's documented limits and consider a much smaller budget.
    parts, buf = [], []
    buf_tokens = 0
    for line in text.splitlines():
        # Count each line once and keep a running total instead of
        # re-tokenizing the whole buffer per line (which was quadratic).
        # +1 approximates the newline joining this line to the buffer, so the
        # total is an estimate rather than the exact count of the joined text.
        line_tokens = len(enc.encode(line)) + 1
        # `buf and` prevents emitting an empty chunk when the first line of a
        # chunk is already over budget on its own.
        if buf and buf_tokens + line_tokens > max_tokens:
            parts.append('\n'.join(buf))
            buf, buf_tokens = [], 0
        buf.append(line)
        buf_tokens += line_tokens
    if buf:
        parts.append('\n'.join(buf))
    return parts
# Script body: translate the SRT file given as the first argument and write
# the English text to the path given as the second argument.
if len(sys.argv) < 3:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit(f"usage: {sys.argv[0]} <input.srt> <output.txt>")
src, out = sys.argv[1], sys.argv[2]
text = clean_srt(src)
parts = chunk(text)
out_segs = []
overlap_text = None
for i, p in enumerate(parts, 1):
    # Add overlap from previous chunk for better context (except for first chunk)
    if i > 1 and overlap_text:
        p = overlap_text + "\n\n" + p
    msg = [
        {"role":"system","content":"Translate the following Spanish transcript to clear, natural English. Output only the translation."},
        {"role":"user","content":p}
    ]
    r = client.chat.completions.create(model=model, messages=msg, temperature=0)
    # The message content can be None (e.g. a refused or empty reply);
    # treat that as an empty translation rather than crashing on .strip().
    translation = (r.choices[0].message.content or "").strip()
    # Remove overlap from translation to avoid duplicates
    if i > 1 and overlap_text:
        translation = remove_overlap_from_translation(translation, overlap_text)
    out_segs.append(translation)
    # Store last 3 sentences for next chunk overlap
    overlap_text = get_last_sentences(translation, 3)
    # Short pause between requests.
    time.sleep(0.2)
# Write with an explicit encoding and close the file deterministically.
with open(out, 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(out_segs))
print(f"wrote {out}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment