Last active
November 26, 2025 10:55
-
-
Save farzadhallaji/a814c33f49d6b8f27cef82a2861f9330 to your computer and use it in GitHub Desktop.
Convert .tex to .md file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import re | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from typing import Optional | |
| from texmd import tex # pip install texmd | |
| # ---------------------------- | |
| # Utils | |
| # ---------------------------- | |
def have(cmd: str) -> bool:
    """Return True when *cmd* resolves to an executable file.

    The lookup uses ``shutil.which``, so no child process is started and
    filesystem errors (for example a non-executable file of the same name)
    simply yield ``False`` instead of propagating an ``OSError``.

    Args:
        cmd: Program name to look up on ``PATH``, or a path to an executable.

    Returns:
        ``True`` if an executable was found, ``False`` otherwise.
    """
    import shutil  # function-scope import: the file header does not import it

    return shutil.which(cmd) is not None
def read_text(p: Path) -> str:
    """Return the contents of *p* decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced rather than raising.
    """
    with p.open("r", encoding="utf-8", errors="replace") as handle:
        return handle.read()
def write_text(p: Path, s: str) -> None:
    """Write *s* to *p* as UTF-8, creating missing parent directories first."""
    target_dir = p.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as handle:
        handle.write(s)
| # ---------------------------- | |
| # Flattening | |
| # ---------------------------- | |
# Matches \input{...} and \include{...}; group 1 is the command name,
# group 2 is the (untrimmed) file argument between the braces.
INPUT_RE = re.compile(r"\\(input|include)\s*\{([^}]+)\}")
# A "%" starts a comment only when it is preceded by an even number of
# backslashes (zero included): "\%" is an escaped percent sign, whereas in
# "\\%" the two backslashes form a line break and the "%" is a real comment.
# Group 1 captures those backslash pairs so they can be put back.
_TEX_COMMENT_RE = re.compile(r"(?<!\\)((?:\\\\)*)%[^\n]*")


def strip_comments(tex_src: str) -> str:
    """Remove LaTeX ``%`` comments from *tex_src*.

    Each comment is deleted from the ``%`` up to, but not including, the end
    of its line, so line structure is preserved. An escaped percent (``\\%``)
    is kept, while a percent that follows an escaped backslash is correctly
    treated as a comment.

    Args:
        tex_src: Raw LaTeX source.

    Returns:
        The source with comments removed.
    """
    return _TEX_COMMENT_RE.sub(r"\1", tex_src)
def resolve_tex_path(base_dir: Path, name: str) -> Path:
    """Turn an include argument into a resolved path below *base_dir*.

    Surrounding whitespace is trimmed and ``.tex`` is appended unless the
    name already ends with it (compared case-insensitively).
    """
    trimmed = name.strip()
    filename = trimmed if trimmed.lower().endswith(".tex") else trimmed + ".tex"
    return base_dir.joinpath(filename).resolve()
def flatten_python(main_tex: Path, max_depth: int = 80) -> str:
    """Inline every ``\\input``/``\\include`` reachable from *main_tex*.

    Comments are stripped from each file before its includes are expanded.
    A file that has already been expanded once is not expanded again; a
    marker comment is emitted in its place.

    Args:
        main_tex: Root LaTeX file.
        max_depth: Maximum include nesting depth before giving up.

    Returns:
        The flattened LaTeX source.

    Raises:
        RuntimeError: If an included file is missing or the nesting depth
            exceeds *max_depth*.
    """
    seen = set()

    def _expand(path: Path, depth: int) -> str:
        if depth > max_depth:
            raise RuntimeError(f"Max include depth exceeded at {path}")
        if not path.exists():
            raise RuntimeError(f"Missing included file: {path}")

        marker = str(path)
        if marker in seen:
            return f"\n% [flatten-python] skipped already included: {path}\n"
        seen.add(marker)

        body = strip_comments(read_text(path))
        parent_dir = path.parent  # includes are resolved relative to this file

        def _inline(match: re.Match) -> str:
            target = match.group(2).strip()
            expanded = _expand(resolve_tex_path(parent_dir, target), depth + 1)
            return "".join(
                (
                    f"\n% ===== begin input: {target} =====\n",
                    expanded,
                    f"\n% ===== end input: {target} =====\n",
                )
            )

        return INPUT_RE.sub(_inline, body)

    return _expand(main_tex.resolve(), 0)
def flatten_with_latexpand(main_tex: Path, out_tex: Path) -> Path:
    """Flatten *main_tex* by running ``latexpand`` and capturing its stdout.

    The tool is run from the directory of *main_tex* and its standard output
    is written to *out_tex* (parent directories are created as needed).

    Returns:
        *out_tex*.

    Raises:
        RuntimeError: If ``latexpand`` exits with a non-zero status; the
            message carries the captured stderr.
    """
    out_tex.parent.mkdir(parents=True, exist_ok=True)
    command = ["latexpand", main_tex.name]
    with out_tex.open("w", encoding="utf-8") as sink:
        completed = subprocess.run(
            command,
            cwd=str(main_tex.parent),
            stdout=sink,
            stderr=subprocess.PIPE,
            text=True,
        )
    if completed.returncode:
        raise RuntimeError(f"latexpand failed:\n{completed.stderr}")
    return out_tex
def flatten_auto(main_tex: Path, out_tex: Path) -> Path:
    """Flatten *main_tex* into *out_tex* with the best available method.

    ``latexpand`` is used when it is available; otherwise the pure-Python
    flattener is used. Returns *out_tex* in both cases.
    """
    if not have("latexpand"):
        write_text(out_tex, flatten_python(main_tex))
        return out_tex
    return flatten_with_latexpand(main_tex, out_tex)
| # ---------------------------- | |
| # LaTeX pre-processing (kill list/enumitem issues + preserve stuff) | |
| # ---------------------------- | |
| # Remove optional args even if they span newlines: | |
| # \begin{enumerate}[ ... possibly multiline ... ] | |
# Optional argument of a list environment, possibly spanning several lines:
#   \begin{enumerate}[ ... ]
BEGIN_LIST_OPT_RE = re.compile(
    r"""\\begin\{(enumerate|itemize|description)\}\s*\[(.*?)\]""",
    re.DOTALL,
)

# Optional label on \item[...]
ITEM_OPT_RE = re.compile(r"""\\item\s*\[(.*?)\]""", re.DOTALL)

# enumitem configuration lines (\setlist...), removed wherever they appear
SETLIST_LINE_RE = re.compile(r"""^\s*\\setlist.*$""", re.MULTILINE)

# \paragraph / \subparagraph headings (starred or not); group 2 is the title
PARA_RE = re.compile(r"""\\(sub)?paragraph\*?\{([^}]*)\}""")

# \noindent and any whitespace after it
NOINDENT_RE = re.compile(r"""\\noindent\s*""")

# Algorithm-like environments that are preserved verbatim as fenced text
ALGO_ENV_RE = re.compile(r"""\\begin\{algorithm\*?\}.*?\\end\{algorithm\*?\}""", re.DOTALL)
ALGO_INNER_RE = re.compile(r"""\\begin\{algorithmic\*?\}.*?\\end\{algorithmic\*?\}""", re.DOTALL)
ALGPSEUDO_RE = re.compile(r"""\\begin\{algpseudocode\*?\}.*?\\end\{algpseudocode\*?\}""", re.DOTALL)

# Single alternation over the three environments above. Because re.sub
# consumes non-overlapping matches left to right, an outer "algorithm"
# environment swallows the "algorithmic" block nested inside it, so each
# algorithm is fenced exactly once.
_ALGO_ANY_RE = re.compile(
    "|".join(p.pattern for p in (ALGO_ENV_RE, ALGO_INNER_RE, ALGPSEUDO_RE)),
    re.DOTALL,
)


def preprocess_latex(src: str) -> str:
    """Rewrite LaTeX constructs that are handled here before conversion.

    Steps, in order:
      0. drop ``\\setlist`` configuration lines;
      1. drop the optional argument of enumerate/itemize/description;
      2. drop the optional label of ``\\item[...]``;
      3. turn ``\\paragraph``/``\\subparagraph`` into ``\\subsubsection``
         (an empty title removes the command entirely);
      4. drop ``\\noindent``;
      5. wrap algorithm environments in a fenced ``text`` block below an
         "Algorithmic Summary" heading.

    Args:
        src: Flattened LaTeX source.

    Returns:
        The rewritten LaTeX source.
    """
    # 0) strip enumitem setup lines
    src = SETLIST_LINE_RE.sub("", src)

    # 1) remove list optional args
    src = BEGIN_LIST_OPT_RE.sub(lambda m: f"\\begin{{{m.group(1)}}}", src)

    # 2) remove \item[...]; the trailing space keeps "\item[a]text" from
    #    collapsing into the undefined control sequence "\itemtext"
    src = ITEM_OPT_RE.sub(lambda m: "\\item ", src)

    # 3) rewrite \paragraph/\subparagraph into non-starred sectioning
    def _para(m: re.Match) -> str:
        title = m.group(2).replace("\n", " ").strip()
        if not title:
            return ""
        return f"\\subsubsection{{{title}}}\n"

    src = PARA_RE.sub(_para, src)

    # 4) remove \noindent
    src = NOINDENT_RE.sub("", src)

    # 5) preserve algorithm blocks as fenced text, in one pass so that a
    #    nested algorithmic environment is not fenced a second time
    def _fence(m: re.Match) -> str:
        block = m.group(0).strip()
        return "\n\\subsubsection{Algorithmic Summary}\n```text\n" + block + "\n```\n"

    return _ALGO_ANY_RE.sub(_fence, src)
| # ---------------------------- | |
| # Conversion engines | |
| # ---------------------------- | |
def texmd_convert(tex_path: Path, bib_path: Optional[Path]) -> str:
    """Convert *tex_path* to Markdown text with the ``texmd`` parser.

    Citations are loaded from *bib_path* first when a path is given and the
    file exists.
    """
    converter = tex.TexParser()
    if bib_path:
        if bib_path.exists():
            converter.load_citations(str(bib_path))
    document = converter.load_file(str(tex_path))
    markdown = converter.to_md(document)
    return markdown.to_str()
def pandoc_convert(tex_path: Path, out_md: Path) -> None:
    """Run pandoc to convert *tex_path* (LaTeX) into *out_md*.

    The output format is ``gfm+tex_math_dollars`` with line wrapping
    disabled. ``subprocess.CalledProcessError`` is raised if pandoc exits
    with a non-zero status.
    """
    source, destination = str(tex_path), str(out_md)
    subprocess.run(
        [
            "pandoc",
            source,
            "-f", "latex",
            "-t", "gfm+tex_math_dollars",
            "--wrap=none",
            "-o", destination,
        ],
        check=True,
    )
| # ---------------------------- | |
| # Markdown post-processing | |
| # ---------------------------- | |
| # Standalone brace artifact line like "{Goal.}" or "{Encode:}" | |
# A whole line made of one brace group with 1-200 brace-free characters,
# e.g. "{Goal.}" or "{Encode:}"; group 1 is the text between the braces.
BRACE_LINE_RE = re.compile(r"^\{([^{}]{1,200})\}\s*$")
def remove_orphan_bullets(md: str) -> str:
    """Drop list markers that stand alone on a line with nothing under them.

    A line consisting only of ``*``, ``-`` or ``+`` is kept when the next
    non-empty line starts with a space or tab, and dropped otherwise. Lines
    inside fenced code blocks are never touched. The result always ends with
    a newline.
    """
    lines = md.splitlines()
    total = len(lines)
    bare_markers = ("*", "-", "+")
    kept = []
    fenced = False

    for idx, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith("```"):
            fenced = not fenced
            kept.append(line)
            continue
        if fenced or stripped not in bare_markers:
            kept.append(line)
            continue

        # Look ahead for the first line that is not blank.
        nxt = idx + 1
        while nxt < total and not lines[nxt].strip():
            nxt += 1
        follower = lines[nxt] if nxt < total else ""

        if follower.startswith((" ", "\t")):
            kept.append(line)

    return "\n".join(kept) + "\n"
def fix_brace_artifacts(md: str) -> str:
    """
    Rewrite stand-alone brace lines left behind by the converter.

    A line such as
    {Encode:}
    (title ending with a colon) becomes
    **Encode:**
    and any other brace line such as
    {Goal.}
    becomes
    #### Goal.
    Fenced code blocks are passed through unchanged.
    """

    def _convert(raw: str) -> str:
        found = BRACE_LINE_RE.match(raw.strip())
        if found is None:
            return raw
        label = found.group(1).strip()
        return f"**{label}**" if label.endswith(":") else f"#### {label}"

    rendered = []
    fenced = False
    for raw in md.splitlines():
        if raw.strip().startswith("```"):
            fenced = not fenced
            rendered.append(raw)
        elif fenced:
            rendered.append(raw)
        else:
            rendered.append(_convert(raw))
    return "\n".join(rendered) + "\n"
# \( ... \) inline math, possibly spanning lines; group 1 is the math body.
_PAREN_MATH_RE = re.compile(r"\\\((.+?)\\\)", re.DOTALL)


def normalize_math_delimiters(md: str) -> str:
    r"""
    Keep inline $...$ and force display $$...$$.

    Converts, outside fenced code blocks only:
    - \(...\) -> $...$
    - \[ and \] as lines -> $$
    - bare '$' delimiter lines -> '$$'
    - one-line '$ ... $' as whole line -> $$ ... $$ on three lines

    Fenced code blocks are copied through verbatim, including any \( \)
    sequences they contain. The result always ends with a newline.
    """

    def _is_single_dollar_line(s: str) -> bool:
        # True for a stripped line that is exactly one $...$ span with content.
        if s.startswith("$$") or s.endswith("$$"):
            return False
        if len(s) < 2 or not (s.startswith("$") and s.endswith("$")):
            return False
        if not s[1:-1].strip():
            return False
        return s.count("$") == 2

    out = []
    pending = []  # consecutive non-code lines awaiting conversion
    in_code = False

    def _flush() -> None:
        if not pending:
            return
        # Join first so that a \( ... \) pair spread over several prose
        # lines is still recognised; the substitution never adds or removes
        # newlines, so splitting afterwards restores the same line count.
        converted = _PAREN_MATH_RE.sub(r"$\1$", "\n".join(pending))
        pending.clear()
        for line in converted.split("\n"):
            s = line.strip()
            if s in {"$", r"\[", r"\]"}:
                out.append("$$")
            elif _is_single_dollar_line(s):
                out.extend(("$$", s[1:-1].strip(), "$$"))
            else:
                out.append(line)

    for line in md.splitlines():
        if line.strip().startswith("```"):
            _flush()
            in_code = not in_code
            out.append(line)
        elif in_code:
            out.append(line)
        else:
            pending.append(line)
    _flush()

    return "\n".join(out) + "\n"
def dedupe_consecutive_headings(md: str) -> str:
    """Drop a heading that repeats the most recently kept heading.

    A line whose stripped text starts with ``#`` counts as a heading. It is
    removed when it equals the last heading seen and at most five blank
    lines immediately precede it. Lines inside fenced code blocks are copied
    through unchanged, so ``#`` comment lines in code are never treated as
    headings or removed. The result always ends with a newline.
    """
    out = []
    last_heading = None
    blanks = 0  # blank lines seen since the last non-blank line
    in_code = False

    for line in md.splitlines():
        s = line.strip()

        if s.startswith("```"):
            in_code = not in_code
            blanks = 0
            out.append(line)
            continue
        if in_code:
            out.append(line)
            continue

        if s == "":
            blanks += 1
            out.append(line)
            continue

        if s.startswith("#"):
            if last_heading == s and blanks <= 5:
                blanks = 0
                continue  # duplicate heading: skip it
            last_heading = s

        blanks = 0
        out.append(line)

    return "\n".join(out) + "\n"
def postprocess(md: str) -> str:
    """Run every Markdown clean-up pass over *md*, in a fixed order."""
    pipeline = (
        fix_brace_artifacts,
        remove_orphan_bullets,
        normalize_math_delimiters,
        split_display_math_from_text,
        ensure_blank_lines_around_display_math,
        dedupe_consecutive_headings,
    )
    for stage in pipeline:
        md = stage(md)
    return md
def output_has_list_damage(md: str) -> bool:
    r"""
    Detect converter output in which list markup leaked through.

    Any one of these signals is enough to report damage:
    - a literal \item anywhere in the text;
    - a literal \begin{...} or \end{...} of enumerate, itemize or
      description;
    - five or more stand-alone brace lines such as "{Goal.}".
    """
    if "\\item" in md:
        return True

    list_envs = ("enumerate", "itemize", "description")
    if any(
        f"\\begin{{{env}}}" in md or f"\\end{{{env}}}" in md
        for env in list_envs
    ):
        return True

    brace_lines = sum(
        1 for ln in md.splitlines() if BRACE_LINE_RE.match(ln.strip())
    )
    return brace_lines >= 5
def ensure_blank_lines_around_display_math(md: str) -> str:
    """
    Separate display-math blocks from the surrounding text.

    A line that is exactly ``$$`` (ignoring indentation) alternately opens
    and closes a block. A separator line is inserted before an opening
    ``$$`` when the previous output line is not blank, and after a closing
    ``$$`` when a following line exists and is not blank. The separator
    carries the same indentation as the ``$$`` line. Nothing is inserted
    inside the block, and fenced code blocks are skipped.
    """
    source = md.splitlines()
    last_index = len(source) - 1
    result = []
    fenced = False
    math_open = False

    for pos, raw in enumerate(source):
        token = raw.strip()

        if token.startswith("```"):
            fenced = not fenced
            result.append(raw)
            continue
        if fenced or token != "$$":
            result.append(raw)
            continue

        # Whitespace-only separator with the same indent as the delimiter.
        pad = raw[: len(raw) - len(raw.lstrip())]
        if math_open:
            result.append(raw)
            if pos < last_index and source[pos + 1].strip():
                result.append(pad)
        else:
            if result and result[-1].strip():
                result.append(pad)
            result.append(raw)
        math_open = not math_open

    return "\n".join(result) + "\n"
# Leading list marker ("- ", "* ", "+ " or "1. ") including its indentation.
LIST_PREFIX_RE = re.compile(r"^(\s*(?:[-*+]\s+|\d+\.\s+))")


def split_display_math_from_text(md: str) -> str:
    """
    Put every ``$$`` delimiter, and the math it encloses, on separate lines.

    Input such as
    text $$math$$ text
    or, across lines,
    text $$math
    more
    $$ text
    is emitted as
    text
    $$
    math
    more
    $$
    text
    On a list line the first text segment keeps the list marker and all
    other emitted pieces are indented by the marker's width. Fenced code
    blocks are skipped. The open/closed state of a math block carries over
    from one line to the next.
    """
    result = []
    inside_fence = False
    inside_math = False

    for raw in md.splitlines():
        if raw.strip().startswith("```"):
            inside_fence = not inside_fence
            result.append(raw)
            continue
        if inside_fence:
            result.append(raw)
            continue

        # Split the line into marker (list lines only), padding and body.
        marker_match = LIST_PREFIX_RE.match(raw)
        if marker_match:
            marker = marker_match.group(1)
            pad = " " * len(marker)
            body = raw[len(marker):]
        else:
            marker = ""
            pad = raw[: len(raw) - len(raw.lstrip())]
            body = raw[len(pad):]

        if "$$" not in body:
            result.append(raw)
            continue

        marker_used = False
        for piece in re.split(r"(\$\$)", body):
            if piece == "$$":
                # Each delimiter gets its own line and flips the math state.
                result.append(pad + "$$")
                inside_math = not inside_math
            elif not piece:
                continue
            elif inside_math:
                result.append(pad + piece.rstrip())
            else:
                text = piece.rstrip()
                if not text.strip():
                    continue
                if marker and not marker_used:
                    result.append(marker + text)
                    marker_used = True
                else:
                    result.append(pad + text)

    return "\n".join(result) + "\n"
| # ---------------------------- | |
| # Main | |
| # ---------------------------- | |
def main():
    """Command-line entry point: convert a LaTeX file to Markdown.

    The input is flattened and preprocessed into temporary files inside the
    work directory, converted with pandoc or texmd, post-processed, and
    written to the output path.

    Engine selection:
      * ``auto``   - pandoc when available, otherwise texmd;
      * ``pandoc`` - pandoc, failing when it is not installed;
      * ``texmd``  - always texmd.

    Exit status is 1 when the input file does not exist and 2 when any step
    of the conversion fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("main_tex", type=Path)
    ap.add_argument("-o", "--output", type=Path, default=None)
    ap.add_argument("--bib", type=Path, default=None)
    ap.add_argument("--workdir", type=Path, default=Path(".build_tex2md"))
    ap.add_argument("--engine", choices=["auto", "pandoc", "texmd"], default="auto")
    ap.add_argument("--keep-temp", action="store_true")
    args = ap.parse_args()

    main_tex = args.main_tex.resolve()
    if not main_tex.exists():
        print(f"ERROR: not found: {main_tex}", file=sys.stderr)
        sys.exit(1)

    out_md = (args.output if args.output else main_tex.with_suffix(".md")).resolve()
    workdir = args.workdir.resolve()
    workdir.mkdir(parents=True, exist_ok=True)
    flattened = workdir / (main_tex.stem + ".flattened.tex")
    prepped = workdir / (main_tex.stem + ".prepped.tex")

    try:
        # flatten, then preprocess into the file the converter will read
        tex_flat = flatten_auto(main_tex, flattened)
        write_text(prepped, preprocess_latex(read_text(tex_flat)))

        bib = args.bib.resolve() if args.bib else None

        # Probe for pandoc once; the answer drives every branch below.
        use_pandoc = args.engine in {"auto", "pandoc"} and have("pandoc")
        if args.engine == "pandoc" and not use_pandoc:
            raise RuntimeError("pandoc not found. Install it or use --engine texmd.")

        if use_pandoc:
            pandoc_convert(prepped, out_md)
            md = postprocess(read_text(out_md))
        else:
            # Reached with --engine texmd, or with auto when pandoc is
            # missing, so there is no pandoc fallback here: in auto mode
            # pandoc would already have been chosen above.
            md = postprocess(texmd_convert(prepped, bib))
            if output_has_list_damage(md):
                print(
                    "WARNING: texmd output still contains list markup; "
                    "install pandoc or use --engine pandoc for better lists.",
                    file=sys.stderr,
                )
        write_text(out_md, md)

        if not args.keep_temp:
            # Best-effort cleanup: a failure is reported but does not abort.
            for tmp in (flattened, prepped):
                try:
                    tmp.unlink(missing_ok=True)
                except OSError as exc:
                    print(f"WARNING: could not remove {tmp}: {exc}", file=sys.stderr)
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    print(f"Wrote: {out_md}")
    if args.keep_temp:
        print(f"Kept: {flattened}")
        print(f"Kept: {prepped}")


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python tex2md.py main.tex --bib refs.bib -o main.md --engine auto