Gets the .tex source for a paper, given URL or (arxiv) PDF
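Typical usage is as a command-line script (a sketch, assuming the file is saved as paper_scrape.py and the arxiv, fastcore, and pypdf packages are installed; the PDF path below is hypothetical):

python paper_scrape.py https://arxiv.org/abs/1706.03762 > paper.tex
python paper_scrape.py 1706.03762 > paper.tex
python paper_scrape.py ~/Downloads/attention.pdf > paper.tex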
#!/usr/bin/env python3
# Get .tex source for talking to LLMs about papers w/o them mangling PDFs
import os, re, tarfile, subprocess, tempfile
import arxiv
from fastcore.script import call_parse
def extract_arxiv_id(text: str) -> str|None:
    """Extract arxiv ID from text (URL or PDF content)."""
    patterns = [
        r'arxiv\.org/abs/(\d+\.\d+)',   # URL format
        r'arxiv\.org/pdf/(\d+\.\d+)',   # PDF URL format
        r'arXiv:(\d+\.\d+)',            # Citation format
    ]
    for p in patterns:
        if m := re.search(p, text, re.IGNORECASE): return m.group(1)
    return None
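# Illustrative (not in the original): given 'https://arxiv.org/abs/1706.03762',
# the first pattern yields '1706.03762'. Note these patterns only match
# new-style numeric IDs; old-style IDs like 'hep-th/9901001' won't match.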
def download_arxiv_source(arxiv_id: str, dest_dir: str) -> str:
    """Download LaTeX source tarball, return path."""
    client = arxiv.Client()
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(client.results(search))
    return paper.download_source(dirpath=dest_dir)
def extract_tarball(tar_path: str, dest_dir: str) -> None:
    """Extract .tar.gz to destination directory."""
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(path=dest_dir, filter='data')
def find_main_tex(ext_dir: str) -> str|None:
    """Find the main .tex file (contains \\documentclass)."""
    tex_files = [f for f in os.listdir(ext_dir) if f.endswith('.tex')]
    for f in tex_files:
        with open(os.path.join(ext_dir, f)) as fh:
            if '\\documentclass' in fh.read(): return f
    return tex_files[0] if tex_files else None
def find_macro_files(ext_dir: str, main_file: str) -> list[str]:
    """Find .tex files that aren't the main file (likely macro definitions)."""
    return [f for f in os.listdir(ext_dir) if f.endswith('.tex') and f != main_file]
def clean_latex(text: str) -> str:
    """Remove blank lines from LaTeX text."""
    return ''.join(line for line in text.splitlines(keepends=True) if line.strip())
def read_pdf_text(filepath: str) -> str:
    """Extract text from PDF file."""
    from pypdf import PdfReader
    reader = PdfReader(os.path.expanduser(filepath))
    return '\n'.join(page.extract_text() for page in reader.pages)
def expand_inputs(tex: str, ext_dir='', debug=True) -> str:
    """Inline the contents of each \\input{...} file into the main LaTeX text."""
    inputs = re.findall(r'\\input{(.*?)}', tex)  # non-greedy, in case of multiple inputs per line
    for i, inp in enumerate(inputs):
        orig_inp = inp
        if not inp.endswith(".tex"): inp += ".tex"
        if debug: print(f"Applying input {i+1}/{len(inputs)}: {inp}")
        try:
            with open(os.path.join(ext_dir, inp)) as f:
                inp_text = f.read()
            tex = tex.replace(f'\\input{{{orig_inp}}}', inp_text)
        except Exception as e:
            print(f"Exception reading {inp}: {e}")
    return tex
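# Illustrative: a line like '\input{sections/intro}' (hypothetical path) is
# replaced inline with the contents of sections/intro.tex; expansion is a
# single pass, so \input's nested inside included files are left alone.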
def expand_simple_macros(tex: str, debug=False) -> str:
    """Expand zero-argument \\newcommand/\\renewcommand macros throughout the text."""
    cmd_lines = [l for l in tex.splitlines() if '\\newcommand' in l or '\\renewcommand' in l]
    for line in cmd_lines:
        if re.match(r'\\(?:new|renew)command\*?\{\\[^}]+\}\[', line):
            if debug: print("Skipping complex newcommand:", line)
            continue
        else:
            if debug: print("Found simple newcommand:", line)
        m = re.match(r'\\(?:new|renew)command\*?\{(\\[^}]+)\}\{(.+)\}$', line)
        if m:
            name, replacement = m.groups()
            tex = re.sub(re.escape(name) + r'(?![a-zA-Z])', lambda m: replacement, tex)
    return tex
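# Illustrative: '\newcommand{\method}{OurMethod}' (hypothetical macro) makes
# every occurrence of '\method' not followed by a letter expand to 'OurMethod';
# macros that take arguments, e.g. '\newcommand{\norm}[1]{\lVert #1 \rVert}',
# are skipped as "complex".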
def run_xpandlatex(ext_dir: str, main_file: str, macro_files: list[str]) -> tuple[str, int]:
    """Run xpandlatex to expand macros and merge inputs; return (processed LaTeX, return code)."""
    cmd = ['python', 'xpandlatex.py', '-m', 'on', '-I', 'on']
    for mf in macro_files: cmd.extend(['-f', os.path.join(ext_dir, mf)])
    cmd.append(os.path.join(ext_dir, main_file))
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        return result.stdout, result.returncode
    except subprocess.TimeoutExpired:
        return "", -1
def openreview_to_arxiv(url: str, debug=False) -> str|None:
    "Given an OpenReview URL, return a matching arXiv URL"
    import httpx
    if 'openreview' not in url.lower(): return None  # we're not doing this
    response = httpx.get(url)
    or_html = response.text
    m = re.search(r'<title>(.+?) \| OpenReview</title>', or_html)
    if m:
        paper_title = m.group(1)
        if debug: print("paper title:", paper_title)
        client = arxiv.Client()
        search = arxiv.Search(query=f'ti:"{paper_title}"', max_results=5)
        results = client.results(search)
        for r in results:
            if debug: print(f'arXiv search result: title="{r.title}", url={r}, other keys={dir(r)}')
            return str(r)  # take the first hit
    return None
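# Heuristic (noted, not in the original): this returns str() of the first
# arXiv title-search hit, which the code relies on being the paper's URL; a
# mismatched title search can therefore return the wrong paper.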
def paper_scrape(path_or_url: str, debug=False) -> str:
    """Get clean LaTeX source from arxiv URL, arxiv ID, or PDF path."""
    # Determine arxiv ID
    if 'openreview.net' in path_or_url:
        path_or_url = openreview_to_arxiv(path_or_url, debug=debug)
        if path_or_url is None: raise ValueError("Could not resolve OpenReview URL to arXiv")
    if re.match(r'^\d+\.\d+$', path_or_url):
        arxiv_id = path_or_url                                    # Already an ID
    elif 'arxiv.org' in path_or_url:
        arxiv_id = extract_arxiv_id(path_or_url)                  # URL
    elif os.path.isfile(path_or_url):
        arxiv_id = extract_arxiv_id(read_pdf_text(path_or_url))   # PDF
    else:
        raise ValueError(f"Cannot parse: {path_or_url}")
    if not arxiv_id: raise ValueError("Could not extract arxiv ID")
    if debug: print("arXiv ID =", arxiv_id)
    with tempfile.TemporaryDirectory() as tmpdir:
        if debug: print("Downloading source")
        tar_path = download_arxiv_source(arxiv_id, tmpdir)
        ext_dir = os.path.join(tmpdir, 'extracted')
        os.makedirs(ext_dir)
        if debug: print("Extracting tarball")
        extract_tarball(tar_path, ext_dir)
        if debug: print("Finding main tex file")
        main_file = find_main_tex(ext_dir)
        if debug: print("Main tex file =", main_file)
        if not main_file: raise ValueError("No .tex file found")
        macro_files = find_macro_files(ext_dir, main_file)
        if debug: print("Macro files found:", macro_files)
        if debug: print("Running xpandlatex...")
        processed, rc = run_xpandlatex(ext_dir, main_file, macro_files)
        if rc != 0 or len(processed) < 2000:
            if debug: print("Error with xpandlatex. Trying DIY approach...")
            with open(os.path.join(ext_dir, main_file)) as f:
                tex = f.read()
            tex = expand_inputs(tex, ext_dir, debug=debug)
            processed = expand_simple_macros(tex, debug=debug)
        return clean_latex(processed)
@call_parse
def main(url_or_file, debug=False):
    "Print the scraped LaTeX source for a paper, given a URL, arXiv ID, or PDF path."
    result = paper_scrape(url_or_file, debug=debug)
    print(result)  # goes to stdout; you can pipe to a file if you want