Gets the .tex source for a paper, given URL or (arxiv) PDF
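Typical usage is as a command-line script (a sketch, assuming the file is saved as paper_scrape.py and the arxiv, fastcore, and pypdf packages are installed; the PDF path below is hypothetical):

python paper_scrape.py https://arxiv.org/abs/1706.03762 > paper.tex
python paper_scrape.py 1706.03762 > paper.tex
python paper_scrape.py ~/Downloads/attention.pdf > paper.tex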
#!/usr/bin/env python3
# Get .tex source for talking to LLMs about papers w/o them mangling PDFs
import os, re, tarfile, subprocess, tempfile
import arxiv
from fastcore.script import call_parse
def extract_arxiv_id(text: str) -> str|None:
    """Extract arxiv ID from text (URL or PDF content)."""
    patterns = [
        r'arxiv\.org/abs/(\d+\.\d+)',   # URL format
        r'arxiv\.org/pdf/(\d+\.\d+)',   # PDF URL format
        r'arXiv:(\d+\.\d+)',            # Citation format
    ]
    for p in patterns:
        if m := re.search(p, text, re.IGNORECASE): return m.group(1)
    return None
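# Illustrative (not in the original): given 'https://arxiv.org/abs/1706.03762',
# the first pattern yields '1706.03762'. Note these patterns only match
# new-style numeric IDs; old-style IDs like 'hep-th/9901001' won't match.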
def download_arxiv_source(arxiv_id: str, dest_dir: str) -> str:
    """Download LaTeX source tarball, return path."""
    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search))
    return paper.download_source(dirpath=dest_dir)
def extract_tarball(tar_path: str, dest_dir: str) -> None:
    """Extract .tar.gz to destination directory."""
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(path=dest_dir, filter='data')
def find_main_tex(ext_dir: str) -> str|None:
    """Find the main .tex file (contains \\documentclass)."""
    tex_files = [f for f in os.listdir(ext_dir) if f.endswith('.tex')]
    for f in tex_files:
        with open(os.path.join(ext_dir, f)) as fh:
            if '\\documentclass' in fh.read(): return f
    return tex_files[0] if tex_files else None
def find_macro_files(ext_dir: str, main_file: str) -> list[str]:
    """Find .tex files that aren't the main file (likely macro definitions)."""
    return [f for f in os.listdir(ext_dir) if f.endswith('.tex') and f != main_file]
def clean_latex(text: str) -> str:
    """Remove blank lines from LaTeX text."""
    return ''.join(line for line in text.splitlines(keepends=True) if line.strip())
def read_pdf_text(filepath: str) -> str:
    """Extract text from PDF file."""
    from pypdf import PdfReader
    reader = PdfReader(os.path.expanduser(filepath))
    return '\n'.join(page.extract_text() for page in reader.pages)
def expand_inputs(tex: str, ext_dir='', debug=True) -> str:
    """Inline the contents of each \\input{...} file into the main LaTeX text."""
    inputs = re.findall(r'\\input{(.*?)}', tex)  # non-greedy, in case of multiple inputs per line
    for i, inp in enumerate(inputs):
        orig_inp = inp
        if not inp.endswith(".tex"): inp += ".tex"
        if debug: print(f"Applying input {i+1}/{len(inputs)}: {inp}")
        try:
            with open(os.path.join(ext_dir, inp)) as f:
                inp_text = f.read()
            tex = tex.replace(f'\\input{{{orig_inp}}}', inp_text)
        except Exception as e:
            print(f"Exception reading {inp}: {e}")
    return tex
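# Illustrative: a line like '\input{sections/intro}' (hypothetical path) is
# replaced inline with the contents of sections/intro.tex; expansion is a
# single pass, so \input's nested inside included files are left alone.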
def expand_simple_macros(tex: str, debug=False) -> str:
    """Expand zero-argument \\newcommand/\\renewcommand macros throughout the text."""
    cmd_lines = [l for l in tex.splitlines() if '\\newcommand' in l or '\\renewcommand' in l]
    for line in cmd_lines:
        if re.match(r'\\(?:new|renew)command\*?\{\\[^}]+\}\[', line):
            if debug: print("Skipping complex newcommand:", line)
            continue
        else:
            if debug: print("Found simple newcommand:", line)
        m = re.match(r'\\(?:new|renew)command\*?\{(\\[^}]+)\}\{(.+)\}$', line)
        if m:
            name, replacement = m.groups()
            tex = re.sub(re.escape(name) + r'(?![a-zA-Z])', lambda m: replacement, tex)
    return tex
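# Illustrative: '\newcommand{\method}{OurMethod}' (hypothetical macro) makes
# every occurrence of '\method' not followed by a letter expand to 'OurMethod';
# macros that take arguments, e.g. '\newcommand{\norm}[1]{\lVert #1 \rVert}',
# are skipped as "complex".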
def run_xpandlatex(ext_dir: str, main_file: str, macro_files: list[str]) -> tuple[str, int]:
    """Run xpandlatex to expand macros and merge inputs; return (processed LaTeX, return code)."""
    cmd = ['python', 'xpandlatex.py', '-m', 'on', '-I', 'on']
    for mf in macro_files: cmd.extend(['-f', os.path.join(ext_dir, mf)])
    cmd.append(os.path.join(ext_dir, main_file))
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        return result.stdout, result.returncode
    except subprocess.TimeoutExpired:
        return "", -1
def openreview_to_arxiv(url: str, debug=False) -> str|None:
    "Given an OpenReview URL, return a matching arXiv URL"
    import httpx
    if 'openreview' not in url.lower(): return None  # we're not doing this
    response = httpx.get(url)
    or_html = response.text
    m = re.search(r'<title>(.+?) \| OpenReview</title>', or_html)
    if m:
        paper_title = m.group(1)
        if debug: print("paper title:", paper_title)
        client = arxiv.Client()
        search = arxiv.Search(query=f'ti:"{paper_title}"', max_results=5)
        results = client.results(search)
        for r in results:
            if debug: print(f'arXiv search result: title="{r.title}", url={r}, other keys={dir(r)}')
            return str(r)  # take the first hit
    return None
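# Heuristic (noted, not in the original): this returns str() of the first
# arXiv title-search hit, which the code relies on being the paper's URL; a
# mismatched title search can therefore return the wrong paper.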
def paper_scrape(path_or_url: str, debug=False) -> str:
    """Get clean LaTeX source from arxiv URL, arxiv ID, or PDF path."""
    # Determine arxiv ID
    if 'openreview.net' in path_or_url:
        path_or_url = openreview_to_arxiv(path_or_url, debug=debug)
        if path_or_url is None: raise ValueError("Could not resolve OpenReview URL to arXiv")
    if re.match(r'^\d+\.\d+$', path_or_url):
        arxiv_id = path_or_url                                    # Already an ID
    elif 'arxiv.org' in path_or_url:
        arxiv_id = extract_arxiv_id(path_or_url)                  # URL
    elif os.path.isfile(path_or_url):
        arxiv_id = extract_arxiv_id(read_pdf_text(path_or_url))   # PDF
    else:
        raise ValueError(f"Cannot parse: {path_or_url}")
    if not arxiv_id: raise ValueError("Could not extract arxiv ID")
    if debug: print("arXiv ID =", arxiv_id)
    with tempfile.TemporaryDirectory() as tmpdir:
        if debug: print("Downloading source")
        tar_path = download_arxiv_source(arxiv_id, tmpdir)
        ext_dir = os.path.join(tmpdir, 'extracted')
        os.makedirs(ext_dir)
        if debug: print("Extracting tarball")
        extract_tarball(tar_path, ext_dir)
        if debug: print("Finding main tex file")
        main_file = find_main_tex(ext_dir)
        if debug: print("Main tex file =", main_file)
        if not main_file: raise ValueError("No .tex file found")
        macro_files = find_macro_files(ext_dir, main_file)
        if debug: print("Macro files found:", macro_files)
        if debug: print("Running xpandlatex...")
        processed, rc = run_xpandlatex(ext_dir, main_file, macro_files)
        if rc != 0 or len(processed) < 2000:
            if debug: print("Error with xpandlatex. Trying DIY approach...")
            with open(os.path.join(ext_dir, main_file)) as f:
                tex = f.read()
            tex = expand_inputs(tex, ext_dir, debug=debug)
            processed = expand_simple_macros(tex, debug=debug)
        return clean_latex(processed)
@call_parse
def main(url_or_file, debug=False):
    "Print the scraped LaTeX source for a paper, given a URL, arXiv ID, or PDF path."
    result = paper_scrape(url_or_file, debug=debug)
    print(result)  # goes to stdout; you can pipe to a file if you want