alrocar/translate_openai.py

## translate_openai.py
# translate_srt_openai.py: translate a Spanish SRT transcript to English with the OpenAI chat API.
import re, sys, time, tiktoken
from openai import OpenAI
# Shared API client, constructed with no explicit arguments.
# NOTE(review): presumably the SDK reads credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm against the openai package docs.
client = OpenAI()

model = "gpt-4o"  # Using GPT-4o which is officially supported
# Tokenizer matching `model`; chunk() uses enc.encode() to count tokens.
enc = tiktoken.encoding_for_model(model)

def clean_srt(path):
    """Read an SRT subtitle file and return only its spoken text.

    Cue-number lines (lines made up solely of digits) and timestamp lines
    (``HH:MM:SS,mmm --> HH:MM:SS,mmm`` plus anything following on the same
    line) are removed. The remaining lines are stripped, blank lines are
    dropped, and the result is joined with newlines.

    Args:
        path: Path of the ``.srt`` file to read.

    Returns:
        The subtitle text as one newline-separated string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 and also swallows a leading BOM, which
    # would otherwise cling to the first cue number and keep it from matching.
    with open(path, encoding="utf-8-sig") as srt_file:
        s = srt_file.read()
    # NOTE: this also drops dialogue lines that consist only of digits.
    s = re.sub(r'^\d+\s*$', '', s, flags=re.M)
    s = re.sub(r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}.*', '', s)
    return '\n'.join(line.strip() for line in s.splitlines() if line.strip())

def get_last_sentences(text, num_sentences=3):
    """Return the trailing ``num_sentences`` sentences of ``text``.

    Sentences are delimited by the two-character sequence ``'. '``. When the
    text holds ``num_sentences`` sentences or fewer it is returned unchanged.

    Args:
        text: Text to take the tail of.
        num_sentences: How many trailing sentences to keep.

    Returns:
        The last sentences joined with ``'. '``. A closing period is added
        only when the tail does not already end in ``.``, ``!`` or ``?``,
        so the result never ends in a doubled terminator. An empty string
        is returned when ``num_sentences`` is not positive.
    """
    if num_sentences <= 0:
        return ''
    # Splitting from the right yields at most num_sentences + 1 pieces: the
    # leading remainder followed by the sentences we want to keep.
    pieces = text.rsplit('. ', num_sentences)
    if len(pieces) <= num_sentences:
        return text
    tail = '. '.join(pieces[1:])
    # The final piece keeps its own terminator; do not stack another on it.
    if tail.endswith(('.', '!', '?')):
        return tail
    return tail + '.'

def remove_overlap_from_translation(translation, overlap_text):
    """Strip a leading copy of ``overlap_text`` from ``translation``.

    Both texts are walked sentence by sentence, a sentence ending at a period
    followed by whitespace. Leading sentences of ``translation`` are dropped
    for as long as they equal the corresponding sentence of ``overlap_text``;
    surrounding whitespace and trailing periods are ignored in the
    comparison, so the overlap's final sentence (which still carries its
    period) matches too. The remainder of ``translation`` is returned
    verbatim.

    Args:
        translation: Translated text that may start with the overlap.
        overlap_text: Context text that was prepended to the request; may be
            ``None`` or empty.

    Returns:
        ``translation`` without the matched leading sentences. It is returned
        unchanged when either argument is empty or nothing matches.
    """
    if not overlap_text or not translation:
        return translation

    boundary = re.compile(r'\.\s+')

    def normalize(sentence):
        return sentence.strip().rstrip('.')

    cut = 0  # index in `translation` just past the last matched sentence
    for expected in boundary.split(overlap_text.strip()):
        match = boundary.search(translation, cut)
        sentence_end = match.start() if match else len(translation)
        if normalize(translation[cut:sentence_end]) != normalize(expected):
            break
        cut = match.end() if match else len(translation)
        if match is None:
            # The whole translation was consumed; nothing left to compare.
            break
    return translation[cut:]

def chunk(text, max_tokens=120000, encode=None):
    """Split ``text`` on line boundaries into parts of about ``max_tokens``.

    Lines are accumulated until adding the next one would push the estimated
    token count past ``max_tokens``; the buffer is then emitted as one part.
    A line is never split, so a single line larger than the limit becomes a
    part of its own. No overlap between parts is added here.

    The size is estimated as the sum of the per-line token counts plus one
    token per joining newline. Each line is therefore tokenized once instead
    of re-tokenizing the whole buffer for every line.

    NOTE(review): the default limit is sized against the model's input
    window. A translation is roughly as long as its input, so the model's
    maximum output length (believed to be far below 120k tokens for gpt-4o —
    confirm in the model documentation) may truncate replies to parts this
    large.

    Args:
        text: Text to split.
        max_tokens: Estimated token budget per part.
        encode: Callable mapping a string to a sequence of tokens. Defaults
            to the module-level tokenizer's ``enc.encode``.

    Returns:
        List of parts, each a newline-joined run of consecutive lines. Empty
        input yields an empty list.
    """
    if encode is None:
        encode = enc.encode
    parts, buf, buf_tokens = [], [], 0
    for line in text.splitlines():
        line_tokens = len(encode(line))
        # One extra token accounts for the newline that joins this line on.
        joined_tokens = buf_tokens + line_tokens + (1 if buf else 0)
        # Only flush a non-empty buffer, so an oversized first line does not
        # produce an empty leading part.
        if buf and joined_tokens > max_tokens:
            parts.append('\n'.join(buf))
            buf, buf_tokens = [line], line_tokens
        else:
            buf.append(line)
            buf_tokens = joined_tokens
    if buf:
        parts.append('\n'.join(buf))
    return parts

# Command-line driver: translate the SRT file given as the first argument and
# write the English text to the path given as the second argument.
if len(sys.argv) < 3:
    sys.exit(f"usage: {sys.argv[0]} INPUT.srt OUTPUT.txt")
src, out = sys.argv[1], sys.argv[2]
text = clean_srt(src)
parts = chunk(text)

out_segs = []
overlap_text = None  # tail of the previous translation, reused as context

for i, p in enumerate(parts, 1):
    # Add overlap from previous chunk for better context (never set on the
    # first chunk, so no index check is needed).
    if overlap_text:
        p = overlap_text + "\n\n" + p

    msg = [
        {"role":"system","content":"Translate the following Spanish transcript to clear, natural English. Output only the translation."},
        {"role":"user","content":p}
    ]
    r = client.chat.completions.create(model=model, messages=msg, temperature=0)
    choice = r.choices[0]
    # A reply cut off by the output token limit would otherwise silently
    # lose the end of the chunk.
    if choice.finish_reason == "length":
        print(
            f"warning: chunk {i}/{len(parts)} hit the output token limit; "
            "its translation is truncated",
            file=sys.stderr,
        )
    # message.content can be None (e.g. a refusal); treat that as empty text.
    translation = (choice.message.content or "").strip()

    # Remove overlap from translation to avoid duplicates
    if overlap_text:
        translation = remove_overlap_from_translation(translation, overlap_text)

    out_segs.append(translation)

    # Store last 3 sentences for next chunk overlap
    overlap_text = get_last_sentences(translation, 3)

    # Short pause between requests.
    time.sleep(0.2)

with open(out, 'w', encoding='utf-8') as out_file:
    out_file.write('\n\n'.join(out_segs))
print(f"wrote {out}")
	# translate_srt_openai.py: translate a Spanish SRT transcript to English with the OpenAI chat API.
	import re, sys, time, tiktoken
	from openai import OpenAI
	# Shared API client, constructed with no explicit arguments.
	# NOTE(review): presumably the SDK reads credentials from the environment
	# (e.g. OPENAI_API_KEY) — confirm against the openai package docs.
	client = OpenAI()

	model = "gpt-4o" # Using GPT-4o which is officially supported
	# Tokenizer matching `model`; chunk() uses enc.encode() to count tokens.
	enc = tiktoken.encoding_for_model(model)

	def clean_srt(path):
	    """Read an SRT subtitle file and return only its spoken text.

	    Cue-number lines (lines made up solely of digits) and timestamp lines
	    (``HH:MM:SS,mmm --> HH:MM:SS,mmm`` plus anything following on the same
	    line) are removed. The remaining lines are stripped, blank lines are
	    dropped, and the result is joined with newlines.

	    Args:
	        path: Path of the ``.srt`` file to read.

	    Returns:
	        The subtitle text as one newline-separated string.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    # "utf-8-sig" reads plain UTF-8 and also swallows a leading BOM, which
	    # would otherwise cling to the first cue number and keep it from matching.
	    with open(path, encoding="utf-8-sig") as srt_file:
	        s = srt_file.read()
	    # NOTE: this also drops dialogue lines that consist only of digits.
	    s = re.sub(r'^\d+\s*$', '', s, flags=re.M)
	    # Whitespace around the arrow is optional (\s*), so tightly or loosely
	    # spaced timestamp lines are both removed.
	    s = re.sub(r'\d{2}:\d{2}:\d{2},\d{3}\s*-->\s*\d{2}:\d{2}:\d{2},\d{3}.*', '', s)
	    return '\n'.join(line.strip() for line in s.splitlines() if line.strip())

	def get_last_sentences(text, num_sentences=3):
	    """Return the trailing ``num_sentences`` sentences of ``text``.

	    Sentences are delimited by the two-character sequence ``'. '``. When the
	    text holds ``num_sentences`` sentences or fewer it is returned unchanged.

	    Args:
	        text: Text to take the tail of.
	        num_sentences: How many trailing sentences to keep.

	    Returns:
	        The last sentences joined with ``'. '``. A closing period is added
	        only when the tail does not already end in ``.``, ``!`` or ``?``,
	        so the result never ends in a doubled terminator. An empty string
	        is returned when ``num_sentences`` is not positive.
	    """
	    if num_sentences <= 0:
	        return ''
	    # Splitting from the right yields at most num_sentences + 1 pieces: the
	    # leading remainder followed by the sentences we want to keep.
	    pieces = text.rsplit('. ', num_sentences)
	    if len(pieces) <= num_sentences:
	        return text
	    tail = '. '.join(pieces[1:])
	    # The final piece keeps its own terminator; do not stack another on it.
	    if tail.endswith(('.', '!', '?')):
	        return tail
	    return tail + '.'

	def remove_overlap_from_translation(translation, overlap_text):
	    """Strip a leading copy of ``overlap_text`` from ``translation``.

	    Both texts are walked sentence by sentence, a sentence ending at a period
	    followed by whitespace. Leading sentences of ``translation`` are dropped
	    for as long as they equal the corresponding sentence of ``overlap_text``;
	    surrounding whitespace and trailing periods are ignored in the
	    comparison, so the overlap's final sentence (which still carries its
	    period) matches too. The remainder of ``translation`` is returned
	    verbatim.

	    Args:
	        translation: Translated text that may start with the overlap.
	        overlap_text: Context text that was prepended to the request; may be
	            ``None`` or empty.

	    Returns:
	        ``translation`` without the matched leading sentences. It is returned
	        unchanged when either argument is empty or nothing matches.
	    """
	    if not overlap_text or not translation:
	        return translation

	    boundary = re.compile(r'\.\s+')

	    def normalize(sentence):
	        return sentence.strip().rstrip('.')

	    cut = 0  # index in `translation` just past the last matched sentence
	    for expected in boundary.split(overlap_text.strip()):
	        match = boundary.search(translation, cut)
	        sentence_end = match.start() if match else len(translation)
	        if normalize(translation[cut:sentence_end]) != normalize(expected):
	            break
	        cut = match.end() if match else len(translation)
	        if match is None:
	            # The whole translation was consumed; nothing left to compare.
	            break
	    return translation[cut:]

	def chunk(text, max_tokens=120000, encode=None):
	    """Split ``text`` on line boundaries into parts of about ``max_tokens``.

	    Lines are accumulated until adding the next one would push the estimated
	    token count past ``max_tokens``; the buffer is then emitted as one part.
	    A line is never split, so a single line larger than the limit becomes a
	    part of its own. No overlap between parts is added here.

	    The size is estimated as the sum of the per-line token counts plus one
	    token per joining newline. Each line is therefore tokenized once instead
	    of re-tokenizing the whole buffer for every line.

	    NOTE(review): the default limit is sized against the model's input
	    window. A translation is roughly as long as its input, so the model's
	    maximum output length (believed to be far below 120k tokens for gpt-4o —
	    confirm in the model documentation) may truncate replies to parts this
	    large.

	    Args:
	        text: Text to split.
	        max_tokens: Estimated token budget per part.
	        encode: Callable mapping a string to a sequence of tokens. Defaults
	            to the module-level tokenizer's ``enc.encode``.

	    Returns:
	        List of parts, each a newline-joined run of consecutive lines. Empty
	        input yields an empty list.
	    """
	    if encode is None:
	        encode = enc.encode
	    parts, buf, buf_tokens = [], [], 0
	    for line in text.splitlines():
	        line_tokens = len(encode(line))
	        # One extra token accounts for the newline that joins this line on.
	        joined_tokens = buf_tokens + line_tokens + (1 if buf else 0)
	        # Only flush a non-empty buffer, so an oversized first line does not
	        # produce an empty leading part.
	        if buf and joined_tokens > max_tokens:
	            parts.append('\n'.join(buf))
	            buf, buf_tokens = [line], line_tokens
	        else:
	            buf.append(line)
	            buf_tokens = joined_tokens
	    if buf:
	        parts.append('\n'.join(buf))
	    return parts

	# Command-line driver: translate the SRT file given as the first argument and
	# write the English text to the path given as the second argument.
	if len(sys.argv) < 3:
	    sys.exit(f"usage: {sys.argv[0]} INPUT.srt OUTPUT.txt")
	src, out = sys.argv[1], sys.argv[2]
	text = clean_srt(src)
	parts = chunk(text)

	out_segs = []
	overlap_text = None  # tail of the previous translation, reused as context

	for i, p in enumerate(parts, 1):
	    # Add overlap from previous chunk for better context (never set on the
	    # first chunk, so no index check is needed).
	    if overlap_text:
	        p = overlap_text + "\n\n" + p

	    msg = [
	        {"role":"system","content":"Translate the following Spanish transcript to clear, natural English. Output only the translation."},
	        {"role":"user","content":p}
	    ]
	    r = client.chat.completions.create(model=model, messages=msg, temperature=0)
	    choice = r.choices[0]
	    # A reply cut off by the output token limit would otherwise silently
	    # lose the end of the chunk.
	    if choice.finish_reason == "length":
	        print(
	            f"warning: chunk {i}/{len(parts)} hit the output token limit; "
	            "its translation is truncated",
	            file=sys.stderr,
	        )
	    # message.content can be None (e.g. a refusal); treat that as empty text.
	    translation = (choice.message.content or "").strip()

	    # Remove overlap from translation to avoid duplicates
	    if overlap_text:
	        translation = remove_overlap_from_translation(translation, overlap_text)

	    out_segs.append(translation)

	    # Store last 3 sentences for next chunk overlap
	    overlap_text = get_last_sentences(translation, 3)

	    # Short pause between requests.
	    time.sleep(0.2)

	with open(out, 'w', encoding='utf-8') as out_file:
	    out_file.write('\n\n'.join(out_segs))
	print(f"wrote {out}")
No results found