Skip to content

Instantly share code, notes, and snippets.

@drscotthawley
Last active December 1, 2025 03:40
Show Gist options
  • Select an option

  • Save drscotthawley/27d3774c0fff18bfdfe7986346d2a853 to your computer and use it in GitHub Desktop.

Select an option

Save drscotthawley/27d3774c0fff18bfdfe7986346d2a853 to your computer and use it in GitHub Desktop.
Gets the .tex source for a paper, given URL or (arxiv) PDF
#!/usr/bin/env python3
# Get .tex source for talking to LLMs about papers w/o them mangling PDFs
import os, re, tarfile, subprocess, tempfile
import arxiv
from fastcore.script import call_parse
def extract_arxiv_id(text: str) -> str|None:
"""Extract arxiv ID from text (URL or PDF content)."""
patterns = [
r'arxiv\.org/abs/(\d+\.\d+)', # URL format
r'arxiv\.org/pdf/(\d+\.\d+)', # PDF URL format
r'arXiv:(\d+\.\d+)', # Citation format
]
for p in patterns:
if m := re.search(p, text, re.IGNORECASE): return m.group(1)
return None
def download_arxiv_source(arxiv_id: str, dest_dir: str) -> str:
    """Download the source archive for `arxiv_id` into `dest_dir`, return its path.

    Looks the paper up through the `arxiv` client by ID and calls
    `download_source` on the first result.

    Raises:
        ValueError: if the lookup yields no result for `arxiv_id`.
    """
    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    # Use a default so an empty result set becomes a readable error instead
    # of a bare StopIteration escaping from next().
    paper = next(client.results(search), None)
    if paper is None: raise ValueError(f"No arXiv paper found for ID {arxiv_id}")
    return paper.download_source(dirpath=dest_dir)
def extract_tarball(tar_path: str, dest_dir: str) -> None:
"""Extract .tar.gz to destination directory."""
with tarfile.open(tar_path, 'r:gz') as tar:
tar.extractall(path=dest_dir, filter='data')
def find_main_tex(ext_dir: str) -> str|None:
"""Find the main .tex file (contains \\documentclass)."""
tex_files = [f for f in os.listdir(ext_dir) if f.endswith('.tex')]
for f in tex_files:
with open(os.path.join(ext_dir, f)) as fh:
if '\\documentclass' in fh.read(): return f
return tex_files[0] if tex_files else None
def find_macro_files(ext_dir: str, main_file: str) -> list[str]:
    """List the .tex filenames in `ext_dir` other than `main_file`.

    These are treated as candidate macro-definition files. Names are returned
    in directory-listing order, without the directory prefix.
    """
    others = []
    for entry in os.listdir(ext_dir):
        if entry == main_file or not entry.endswith('.tex'): continue
        others.append(entry)
    return others
def clean_latex(text: str) -> str:
    """Drop empty and whitespace-only lines from LaTeX text.

    Remaining lines keep their original content and line endings.
    """
    kept = []
    for ln in text.splitlines(keepends=True):
        if ln.strip(): kept.append(ln)
    return ''.join(kept)
def read_pdf_text(filepath: str) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    A leading ``~`` in `filepath` is expanded to the user's home directory.
    """
    from pypdf import PdfReader
    pdf_path = os.path.expanduser(filepath)
    page_texts = [pg.extract_text() for pg in PdfReader(pdf_path).pages]
    return '\n'.join(page_texts)
def expand_inputs(tex:str, ext_dir='', debug=True) -> str:
    """Inline the contents of files referenced by ``\\input{...}``.

    Args:
        tex: LaTeX source text.
        ext_dir: directory the input paths are relative to ('' = current dir).
        debug: print a progress line for each input.

    Returns:
        `tex` with every readable ``\\input{name}`` replaced by that file's
        contents. ``.tex`` is appended to names lacking it. Inputs that
        cannot be read are reported on stderr and left in place.

    Only one level is expanded: ``\\input`` directives inside the inlined
    files are not processed.
    """
    import sys
    root = os.path.realpath(ext_dir or os.curdir)
    # [^{}]* stops at the first closing brace; a greedy .* would swallow
    # everything up to the LAST '}' on the line and break lines that hold
    # more than one \input (or any other braces after it).
    inputs = re.findall(r'\\input\{([^{}]*)\}', tex)
    for i, orig_inp in enumerate(inputs):
        inp = orig_inp.strip()
        if not inp.endswith(".tex"): inp += ".tex"
        if debug: print(f"Applying input {i+1}/{len(inputs)}: {inp}")
        target = os.path.realpath(os.path.join(root, inp))
        # The source may come from a downloaded archive: refuse absolute or
        # '..' paths that would read files outside the source directory.
        try: inside = os.path.commonpath([root, target]) == root
        except ValueError: inside = False  # e.g. different drives on Windows
        if not inside:
            print(f"Exception reading {inp}: path is outside {root}", file=sys.stderr)
            continue
        try:
            with open(target, encoding='utf-8', errors='replace') as f:
                inp_text = f.read()
        except OSError as e:
            # stderr, so the message never mixes into LaTeX piped from stdout
            print(f"Exception reading {inp}: {e}", file=sys.stderr)
            continue
        tex = tex.replace(f'\\input{{{orig_inp}}}', inp_text)
    return tex
def expand_simple_macros(tex:str, debug=False) -> str:
    """Expand argument-less ``\\newcommand`` / ``\\renewcommand`` macros in place.

    A "simple" macro is a one-line definition of the form
    ``\\newcommand{\\name}{replacement}`` (optionally starred, optionally
    indented). Definitions that take arguments (``[n]`` after the name) are
    skipped. Each use of ``\\name`` that is not followed by another letter is
    replaced by the replacement text, so ``\\R`` is expanded while
    ``\\Rightarrow`` is left alone.

    The definition lines themselves are left untouched. Substituting inside
    them would turn ``\\newcommand{\\R}{...}`` into
    ``\\newcommand{<replacement>}{...}``, which is not valid LaTeX.

    Args:
        tex: LaTeX source text.
        debug: print which definitions were found or skipped.

    Returns:
        The text with simple macros expanded; line endings are preserved.
    """
    with_args = re.compile(r'\s*\\(?:new|renew)command\*?\{\\[^}]+\}\[')
    simple_def = re.compile(r'\s*\\(?:new|renew)command\*?\{(\\[^}]+)\}\{(.+)\}\s*$')
    lines = tex.splitlines(keepends=True)
    macros = []          # (name, replacement) in definition order
    def_lines = set()    # indices of simple-definition lines, never rewritten
    for idx, line in enumerate(lines):
        if '\\newcommand' not in line and '\\renewcommand' not in line: continue
        if with_args.match(line):
            if debug: print("Skipping complex newcommand:",line.rstrip('\r\n'))
            continue
        if debug: print("Found simple newcommand:",line.rstrip('\r\n'))
        m = simple_def.match(line)
        if m:
            macros.append(m.groups())
            def_lines.add(idx)
    for name, replacement in macros:
        use = re.compile(re.escape(name) + r'(?![a-zA-Z])')
        # A function replacement inserts the text literally; a plain string
        # would have its backslashes interpreted as regex escapes.
        lines = [l if i in def_lines else use.sub(lambda _m, r=replacement: r, l)
                 for i, l in enumerate(lines)]
    return ''.join(lines)
def run_xpandlatex(ext_dir: str, main_file: str, macro_files: list[str]) -> tuple[str, int]:
    """Run xpandlatex on the main file and return ``(stdout, returncode)``.

    ``xpandlatex.py`` is looked up relative to the current working directory
    and run with the interpreter executing this script. Each macro file is
    passed with ``-f``; the main file is the final argument.
    NOTE(review): ``-m on -I on`` are presumably the macro-expansion and
    input-merging switches -- confirm against xpandlatex's own usage text.

    Returns:
        The captured standard output and the process exit code. If the
        process times out (30 s) or cannot be started, ``("", -1)`` is
        returned so the caller can fall back to another approach.
    """
    import sys
    # sys.executable avoids depending on a 'python' command being on PATH
    # (and on it being the same interpreter/environment as this script).
    cmd = [sys.executable or 'python', 'xpandlatex.py', '-m', 'on', '-I', 'on']
    for mf in macro_files: cmd.extend(['-f', os.path.join(ext_dir, mf)])
    cmd.append(os.path.join(ext_dir, main_file))
    try:
        # errors='replace' keeps non-decodable tool output from raising.
        result = subprocess.run(cmd, capture_output=True, text=True, errors='replace', timeout=30)
    except (subprocess.TimeoutExpired, OSError):
        return "", -1
    return result.stdout, result.returncode
def openreview_to_arxiv(url:str, debug=False) -> str|None:
    """Given an OpenReview URL, return a matching arXiv URL, or None.

    Fetches the page, reads the paper title from its ``<title>`` tag and
    searches arXiv by title. A result whose title equals the page title
    (ignoring case and whitespace) is preferred; otherwise the first result
    is returned.

    Returns None when `url` is not an OpenReview URL, when no title can be
    read from the page, or when the arXiv search returns nothing.
    """
    import html
    import httpx
    if 'openreview' not in url.lower(): return None # we're not doing this
    response = httpx.get(url)
    m = re.search(r'<title>(.+?) \| OpenReview</title>', response.text)
    if not m: return None
    # The title comes out of HTML, so entities such as &amp; must be decoded
    # before it is used as a search string.
    paper_title = html.unescape(m.group(1)).strip()
    if debug: print("paper title:",paper_title)
    client = arxiv.Client()
    # A double quote inside the title would end the quoted phrase early.
    query_title = paper_title.replace('"', ' ')
    search = arxiv.Search(query=f'ti:"{query_title}"', max_results=5)
    def _norm(s): return ' '.join(s.lower().split())
    wanted = _norm(paper_title)
    first = None
    for r in client.results(search):
        if debug: print(f'arXiv search result: title="{r.title}", url={r}, other keys={dir(r)}')
        if first is None: first = str(r)
        if _norm(r.title) == wanted: return str(r)
    return first
def paper_scrape(path_or_url: str, debug=False) -> str:
    """Get clean LaTeX source from an arXiv URL, arXiv ID, OpenReview URL, or PDF path.

    Steps:
      1. Resolve the input to an arXiv ID. OpenReview URLs are first mapped
         to an arXiv URL; PDFs are scanned for an arXiv ID in their text.
      2. Download and unpack the paper's source into a temporary directory.
      3. Expand inputs and macros with xpandlatex; if that fails or yields
         suspiciously little output, fall back to the built-in expanders.
      4. Strip blank lines and return the result.

    Args:
        path_or_url: arXiv URL, bare arXiv ID (optionally with a ``vN``
            suffix), OpenReview URL, or path to a local PDF (``~`` allowed).
        debug: print progress messages.

    Raises:
        ValueError: if the input cannot be interpreted, no arXiv ID can be
            determined, or the source contains no .tex file.
    """
    path_or_url = path_or_url.strip()
    # Determine arxiv ID
    if 'openreview.net' in path_or_url:
        arxiv_url = openreview_to_arxiv(path_or_url, debug=debug)
        # The lookup returns None when nothing matches; fail clearly here
        # rather than with a TypeError from the regex below.
        if not arxiv_url: raise ValueError(f"No arXiv match found for OpenReview page: {path_or_url}")
        path_or_url = arxiv_url
    # Expand '~' before the file check, consistent with how the PDF reader
    # expands it when opening the file.
    local_path = os.path.expanduser(path_or_url)
    if re.match(r'^\d+\.\d+(v\d+)?$', path_or_url):
        arxiv_id = path_or_url # Already an ID
    elif 'arxiv.org' in path_or_url:
        arxiv_id = extract_arxiv_id(path_or_url) # URL
    elif os.path.isfile(local_path):
        arxiv_id = extract_arxiv_id(read_pdf_text(local_path)) # PDF
    else:
        raise ValueError(f"Cannot parse: {path_or_url}")
    if not arxiv_id: raise ValueError("Could not extract arxiv ID")
    if debug: print("arXiv ID =",arxiv_id)

    with tempfile.TemporaryDirectory() as tmpdir:
        if debug: print("Downloading source")
        tar_path = download_arxiv_source(arxiv_id, tmpdir)
        ext_dir = os.path.join(tmpdir, 'extracted')
        os.makedirs(ext_dir)
        if debug: print("Extracting Tarball")
        extract_tarball(tar_path, ext_dir)
        if debug: print("Finding main tex file")
        main_file = find_main_tex(ext_dir)
        if debug: print("Main tex file =",main_file)
        if not main_file: raise ValueError("No .tex file found")
        macro_files = find_macro_files(ext_dir, main_file)
        if debug: print("Macro files found:", macro_files)
        if debug: print("Running xpandlatex...")
        processed, rc = run_xpandlatex(ext_dir, main_file, macro_files)
        # Under 2000 characters of output is treated as a failed run as well.
        if rc != 0 or len(processed)<2000:
            if debug: print("Error with xpandlatex. Trying DIY approach...")
            # Tolerate non-UTF-8 sources instead of aborting the fallback.
            with open(os.path.join(ext_dir, main_file), encoding='utf-8', errors='replace') as f:
                tex = f.read()
            tex = expand_inputs(tex, ext_dir, debug=debug)
            processed = expand_simple_macros(tex, debug=debug)
    return clean_latex(processed)
@call_parse
def main(
    url_or_file:str, # arXiv URL, bare arXiv ID, OpenReview URL, or path to a PDF of an arXiv paper
    debug:bool=False, # Print progress messages while working
):
    "Print the LaTeX source of a paper to stdout"
    result = paper_scrape(url_or_file, debug=debug)
    print(result) # goes to stdout; you can pipe to a file if you want
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment