Skip to content

Instantly share code, notes, and snippets.

@farzadhallaji
Last active November 26, 2025 10:55
Show Gist options
  • Select an option

  • Save farzadhallaji/a814c33f49d6b8f27cef82a2861f9330 to your computer and use it in GitHub Desktop.

Select an option

Save farzadhallaji/a814c33f49d6b8f27cef82a2861f9330 to your computer and use it in GitHub Desktop.
Convert a LaTeX (.tex) document into a Markdown (.md) file
#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional
from texmd import tex # pip install texmd
# ----------------------------
# Utils
# ----------------------------
def have(cmd: str) -> bool:
    """Return True if *cmd* appears to be runnable on this system.

    Probes by executing ``cmd --version`` with output discarded;
    ``check=False`` because only launchability matters, not exit status.
    """
    try:
        subprocess.run(
            [cmd, "--version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
        return True
    except OSError:
        # FileNotFoundError covers a missing binary, but other OSErrors
        # (e.g. PermissionError for a non-executable file on PATH) used to
        # escape and crash the caller; treat all launch failures as "absent".
        return False
def read_text(p: Path) -> str:
    """Return the contents of *p* decoded as UTF-8, replacing bad bytes."""
    with open(p, "r", encoding="utf-8", errors="replace") as fh:
        return fh.read()
def write_text(p: Path, s: str) -> None:
    """Write *s* to *p* as UTF-8, creating parent directories as needed."""
    target_dir = p.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(p, "w", encoding="utf-8") as fh:
        fh.write(s)
# ----------------------------
# Flattening
# ----------------------------
# Matches \input{...} or \include{...}: group(1) is the command name,
# group(2) the (possibly extension-less) target path.
INPUT_RE = re.compile(r"""\\(input|include)\s*\{([^}]+)\}""")
def strip_comments(tex_src: str) -> str:
    r"""Strip LaTeX ``%`` comments, preserving escaped percents (``\%``).

    The terminating newline of a comment is kept so line structure survives.
    """
    out = []
    i = 0
    n = len(tex_src)
    while i < n:
        ch = tex_src[i]
        if ch == "%":
            # Count the run of backslashes immediately before the '%'.
            # Odd run -> the '%' itself is escaped (\%); even run (e.g.
            # after a \\ line break) -> it starts a real comment.  The old
            # code only looked at the single preceding char, so "\\%" was
            # wrongly kept as an escaped percent.
            j = i - 1
            while j >= 0 and tex_src[j] == "\\":
                j -= 1
            if (i - 1 - j) % 2 == 1:
                out.append(ch)
                i += 1
                continue
            # Skip to (but not past) the newline, dropping the comment body.
            while i < n and tex_src[i] != "\n":
                i += 1
            continue
        out.append(ch)
        i += 1
    return "".join(out)
def resolve_tex_path(base_dir: Path, name: str) -> Path:
    r"""Resolve an \input/\include target relative to *base_dir*.

    Appends a ``.tex`` extension when the target lacks one (case-insensitive).
    """
    cleaned = name.strip()
    if not cleaned.lower().endswith(".tex"):
        cleaned += ".tex"
    return (base_dir / cleaned).resolve()
def flatten_python(main_tex: Path, max_depth: int = 80) -> str:
    r"""Recursively inline \input/\include directives starting at *main_tex*.

    Each file is inlined at most once; a repeat inclusion is replaced by a
    marker comment.  Raises RuntimeError on a missing file or when more
    than *max_depth* nesting levels are encountered.
    """
    seen = set()

    def _inline(current: Path, level: int) -> str:
        if level > max_depth:
            raise RuntimeError(f"Max include depth exceeded at {current}")
        if not current.exists():
            raise RuntimeError(f"Missing included file: {current}")
        marker = str(current)
        if marker in seen:
            return f"\n% [flatten-python] skipped already included: {current}\n"
        seen.add(marker)
        body = strip_comments(read_text(current))
        parent = current.parent

        def _splice(match: re.Match) -> str:
            target = match.group(2).strip()
            expanded = _inline(resolve_tex_path(parent, target), level + 1)
            return (
                f"\n% ===== begin input: {target} =====\n"
                + expanded
                + f"\n% ===== end input: {target} =====\n"
            )

        return INPUT_RE.sub(_splice, body)

    return _inline(main_tex.resolve(), 0)
def flatten_with_latexpand(main_tex: Path, out_tex: Path) -> Path:
    """Flatten *main_tex* into *out_tex* using the external `latexpand` tool.

    Runs latexpand from the input's directory so relative includes resolve;
    raises RuntimeError (with latexpand's stderr) on a non-zero exit.
    """
    out_tex.parent.mkdir(parents=True, exist_ok=True)
    with open(out_tex, "w", encoding="utf-8") as sink:
        proc = subprocess.run(
            ["latexpand", main_tex.name],
            cwd=str(main_tex.parent),
            stdout=sink,
            stderr=subprocess.PIPE,
            text=True,
        )
    if proc.returncode != 0:
        raise RuntimeError(f"latexpand failed:\n{proc.stderr}")
    return out_tex
def flatten_auto(main_tex: Path, out_tex: Path) -> Path:
    """Flatten with latexpand when installed, else the pure-Python fallback."""
    if not have("latexpand"):
        write_text(out_tex, flatten_python(main_tex))
        return out_tex
    return flatten_with_latexpand(main_tex, out_tex)
# ----------------------------
# LaTeX pre-processing (kill list/enumitem issues + preserve stuff)
# ----------------------------
# Remove optional args even if they span newlines:
# \begin{enumerate}[ ... possibly multiline ... ]
BEGIN_LIST_OPT_RE = re.compile(
    r"""\\begin\{(enumerate|itemize|description)\}\s*\[(.*?)\]""",
    re.DOTALL
)
# Remove optional labels on \item[...]
ITEM_OPT_RE = re.compile(r"""\\item\s*\[(.*?)\]""", re.DOTALL)
# Remove enumitem configuration commands anywhere (they poison converters)
SETLIST_LINE_RE = re.compile(r"""^\s*\\setlist.*$""", re.MULTILINE)
# Paragraph-like headings that texmd tends to output as {...}
PARA_RE = re.compile(r"""\\(sub)?paragraph\*?\{([^}]*)\}""")
# Noindent noise
NOINDENT_RE = re.compile(r"""\\noindent\s*""")
# Algorithms: keep them as fenced text (otherwise they vanish or become raw TeX)
ALGO_ENV_RE = re.compile(r"""\\begin\{algorithm\*?\}.*?\\end\{algorithm\*?\}""", re.DOTALL)
ALGO_INNER_RE = re.compile(r"""\\begin\{algorithmic\*?\}.*?\\end\{algorithmic\*?\}""", re.DOTALL)
ALGPSEUDO_RE = re.compile(r"""\\begin\{algpseudocode\*?\}.*?\\end\{algpseudocode\*?\}""", re.DOTALL)


def preprocess_latex(src: str) -> str:
    r"""Rewrite LaTeX constructs that are known to confuse the MD converters.

    Steps: strip \setlist lines, drop enumitem optional args on list envs
    and \item, turn \paragraph/\subparagraph into \subsubsection, remove
    \noindent, and preserve algorithm environments as fenced text blocks.
    """
    # 0) strip enumitem setup lines
    src = SETLIST_LINE_RE.sub("", src)
    # 1) remove list optional args (enumitem) robustly
    src = BEGIN_LIST_OPT_RE.sub(lambda m: f"\\begin{{{m.group(1)}}}", src)
    # 2) remove \item[...]
    src = ITEM_OPT_RE.sub(r"\\item", src)

    # 3) rewrite \paragraph/\subparagraph into converter-safe sectioning
    def _para(m: re.Match) -> str:
        title = m.group(2).replace("\n", " ").strip()
        if not title:
            return ""
        return f"\\subsubsection{{{title}}}\n"

    src = PARA_RE.sub(_para, src)
    # 4) remove \noindent
    src = NOINDENT_RE.sub("", src)

    # 5) preserve algorithm blocks as fenced code so you don't lose them
    def _fence(m: re.Match) -> str:
        block = m.group(0).strip()
        return "\n\\subsubsection{Algorithmic Summary}\n```text\n" + block + "\n```\n"

    src = ALGO_ENV_RE.sub(_fence, src)

    # BUG FIX: an algorithmic/algpseudocode env nested inside an algorithm
    # env was already fenced by the substitution above; re-fencing it would
    # nest ``` fences and break the Markdown.  Only substitute in segments
    # that lie OUTSIDE existing code fences.
    def _fence_outside_code(pattern: re.Pattern, text: str) -> str:
        pieces = text.split("```")
        # Fences come in pairs, so even-indexed pieces are outside them.
        for idx in range(0, len(pieces), 2):
            pieces[idx] = pattern.sub(_fence, pieces[idx])
        return "```".join(pieces)

    src = _fence_outside_code(ALGO_INNER_RE, src)
    src = _fence_outside_code(ALGPSEUDO_RE, src)
    return src
# ----------------------------
# Conversion engines
# ----------------------------
def texmd_convert(tex_path: Path, bib_path: Optional[Path]) -> str:
    """Convert *tex_path* to Markdown via the texmd library.

    Citations are loaded from *bib_path* first when it is given and exists.
    """
    converter = tex.TexParser()
    if bib_path and bib_path.exists():
        converter.load_citations(str(bib_path))
    document = converter.load_file(str(tex_path))
    return converter.to_md(document).to_str()
def pandoc_convert(tex_path: Path, out_md: Path) -> None:
    """Convert *tex_path* to GitHub-flavoured Markdown at *out_md* via pandoc.

    Uses `$...$` math and unwrapped lines; raises CalledProcessError on failure.
    """
    subprocess.run(
        [
            "pandoc",
            str(tex_path),
            "-f", "latex",
            "-t", "gfm+tex_math_dollars",
            "--wrap=none",
            "-o", str(out_md),
        ],
        check=True,
    )
# ----------------------------
# Markdown post-processing
# ----------------------------
# Standalone brace artifact line like "{Goal.}" or "{Encode:}":
# 1-200 chars with no nested braces; group(1) captures the inner text.
# Shared by fix_brace_artifacts() and output_has_list_damage().
BRACE_LINE_RE = re.compile(r"""^\{([^{}]{1,200})\}\s*$""")
def remove_orphan_bullets(md: str) -> str:
    """Drop bare bullet markers ('*', '-', '+') that carry no content.

    A lone marker is kept only when the next non-empty line is indented
    (i.e. it is the continuation of a real list item).  Fenced code blocks
    are passed through untouched.
    """
    src = md.splitlines()
    kept = []
    inside_fence = False

    def _following(idx: int) -> str:
        # First non-empty line after src[idx], or "" at end of document.
        for cand in src[idx + 1:]:
            if cand.strip():
                return cand
        return ""

    for idx, raw in enumerate(src):
        stripped = raw.strip()
        if stripped.startswith("```"):
            inside_fence = not inside_fence
            kept.append(raw)
        elif inside_fence:
            kept.append(raw)
        elif stripped in ("*", "-", "+"):
            if _following(idx)[:1] in (" ", "\t"):
                kept.append(raw)
            # else: orphan bullet, dropped
        else:
            kept.append(raw)
    return "\n".join(kept) + "\n"
def fix_brace_artifacts(md: str) -> str:
    """Convert converter-leaked brace lines into real Markdown.

    A standalone "{Encode:}" becomes "**Encode:**" (trailing colon =>
    bold run-in label); "{Goal.}" becomes "#### Goal." (anything else =>
    level-4 heading).  Fenced code blocks are left untouched.
    """
    rebuilt = []
    fenced = False
    for raw in md.splitlines():
        stripped = raw.strip()
        if stripped.startswith("```"):
            fenced = not fenced
            rebuilt.append(raw)
            continue
        if fenced:
            rebuilt.append(raw)
            continue
        hit = BRACE_LINE_RE.match(stripped)
        if hit is None:
            rebuilt.append(raw)
            continue
        label = hit.group(1).strip()
        if label.endswith(":"):
            rebuilt.append(f"**{label}**")
        else:
            rebuilt.append(f"#### {label}")
    return "\n".join(rebuilt) + "\n"
def normalize_math_delimiters(md: str) -> str:
    r"""Normalize math delimiters: keep inline $...$, force display $$...$$.

    Converts \(...\) to $...$ (this pass runs over the raw text, matching
    the historical behaviour); then, line by line outside code fences,
    turns lone '$', '\[' or '\]' lines into '$$' and expands a whole-line
    '$ ... $' into a three-line $$ block.
    """
    md = re.sub(r"""\\\((.+?)\\\)""", r"$\1$", md, flags=re.DOTALL)

    def _whole_line_inline_math(s: str) -> bool:
        # A line that is exactly one $...$ span (not $$-delimited).
        if s.startswith("$$") or s.endswith("$$"):
            return False
        if len(s) < 2 or not (s.startswith("$") and s.endswith("$")):
            return False
        if not s[1:-1].strip():
            return False
        return s.count("$") == 2

    result = []
    fenced = False
    for raw in md.splitlines():
        stripped = raw.strip()
        if stripped.startswith("```"):
            fenced = not fenced
            result.append(raw)
        elif fenced:
            result.append(raw)
        elif stripped in ("$", "\\[", "\\]"):
            result.append("$$")
        elif _whole_line_inline_math(stripped):
            result.extend(["$$", stripped[1:-1].strip(), "$$"])
        else:
            result.append(raw)
    return "\n".join(result) + "\n"
def dedupe_consecutive_headings(md: str) -> str:
    """Remove a heading that merely repeats the immediately preceding one.

    Two identical '#'-headings separated only by up to 5 blank lines are
    duplicates; the second is dropped.  BUG FIX: the previous version never
    cleared the remembered heading when ordinary text intervened, so a
    heading legitimately repeated later in the document (after real
    content) was also deleted.
    """
    lines = md.splitlines()
    out = []
    last_heading = None   # text of the most recent heading, if nothing but blanks since
    blanks = 0            # blank lines seen since the last non-blank line
    for line in lines:
        s = line.strip()
        if s == "":
            blanks += 1
            out.append(line)
            continue
        if s.startswith("#"):
            if last_heading == s and blanks <= 5:
                blanks = 0
                continue  # drop the duplicate heading
            last_heading = s
            blanks = 0
            out.append(line)
            continue
        # Real content breaks the "consecutive" chain.
        last_heading = None
        blanks = 0
        out.append(line)
    return "\n".join(out) + "\n"
def postprocess(md: str) -> str:
    """Run the full Markdown clean-up pipeline (order matters)."""
    steps = (
        fix_brace_artifacts,
        remove_orphan_bullets,
        normalize_math_delimiters,
        split_display_math_from_text,
        ensure_blank_lines_around_display_math,
        dedupe_consecutive_headings,
    )
    for step in steps:
        md = step(md)
    return md
def output_has_list_damage(md: str) -> bool:
    r"""Heuristic: did the converter leak raw LaTeX list markup into the MD?

    Scores three signals -- leaked \item commands, leaked enumerate
    environments, and five or more standalone {...} brace-artifact lines.
    Any single signal is worth 2 points; 2 or more points means damage.
    """
    score = 0
    if "\\item" in md:
        score += 2
    if "\\begin{enumerate}" in md or "\\end{enumerate}" in md:
        score += 2
    artifact_lines = sum(
        1 for candidate in md.splitlines() if BRACE_LINE_RE.match(candidate.strip())
    )
    if artifact_lines >= 5:
        score += 2
    return score >= 2
def ensure_blank_lines_around_display_math(md: str) -> str:
    """Pad display-math blocks with blank lines outside (never inside) them.

    Inserts a separator line before an opening '$$' and after a closing
    '$$' when one is missing.  The separator carries the '$$' line's own
    indentation so blocks nested in list items stay attached.  Fenced code
    blocks are skipped.
    """
    lines = md.splitlines()
    result = []
    in_code = False
    in_math = False
    for idx, raw in enumerate(lines):
        stripped = raw.strip()
        if stripped.startswith("```"):
            in_code = not in_code
            result.append(raw)
        elif in_code:
            result.append(raw)
        elif stripped == "$$":
            pad = raw[: len(raw) - len(raw.lstrip())]  # indentation-only "blank" line
            if not in_math:
                # opening fence: separate from preceding text
                if result and result[-1].strip():
                    result.append(pad)
                result.append(raw)
                in_math = True
            else:
                # closing fence: separate from following text
                result.append(raw)
                if idx + 1 < len(lines) and lines[idx + 1].strip():
                    result.append(pad)
                in_math = False
        else:
            result.append(raw)
    return "\n".join(result) + "\n"
# Captures a Markdown list-item prefix: optional indent plus a bullet
# ("-", "*", "+") or an ordered marker ("1.") and its trailing spaces.
LIST_PREFIX_RE = re.compile(r"^(\s*(?:[-*+]\s+|\d+\.\s+))")


def split_display_math_from_text(md: str) -> str:
    """
    Turn inline/paragraph-wrapped display math like:
        text $$math$$ text
    or multi-line:
        text $$math
        more
        $$ text
    into true block math:
        text
        $$
        math
        more
        $$
        text
    Preserves list indentation. Skips fenced code blocks.

    NOTE: `in_display` persists across lines, so a `$$` opened on one line
    may be closed on a later line; lines between them that contain no `$$`
    pass through the early-continue path unchanged (their indentation is
    NOT rewritten -- presumably acceptable for converter output; confirm
    if hand-written input must be supported).
    """
    lines = md.splitlines()
    out = []
    in_code = False
    in_display = False  # carries over between lines for multi-line $$ blocks
    for line in lines:
        s = line.strip()
        # code fences toggle verbatim pass-through
        if s.startswith("```"):
            in_code = not in_code
            out.append(line)
            continue
        if in_code:
            out.append(line)
            continue
        # detect list prefix to keep blocks inside list items
        m = LIST_PREFIX_RE.match(line)
        prefix = m.group(1) if m else ""
        # base_indent: spaces matching the prefix width (list continuation
        # column) or, for non-list lines, the line's own leading whitespace
        base_indent = (" " * len(prefix)) if prefix else line[:len(line) - len(line.lstrip())]
        content = line[len(prefix):] if prefix else line[len(base_indent):]
        if "$$" not in content:
            out.append(line)
            continue
        # split keeps the "$$" delimiters as their own list entries
        parts = re.split(r"(\$\$)", content)
        first_text_emitted = False  # first text segment keeps the bullet prefix
        for part in parts:
            if part == "$$":
                # ensure any pending text ends cleanly before math
                if not in_display:
                    # opening
                    out.append(base_indent + "$$")
                    in_display = True
                else:
                    # closing
                    out.append(base_indent + "$$")
                    in_display = False
                continue
            if part == "":
                continue
            if in_display:
                # math content line (keep as-is, but indent appropriately if in list)
                out.append(base_indent + part.rstrip())
            else:
                # normal text segment
                txt = part.rstrip()
                if txt.strip() == "":
                    continue
                if not prefix:
                    # non-list line
                    out.append(base_indent + txt if base_indent else txt)
                else:
                    # list line: first segment keeps bullet/number prefix, remaining segments are continuation
                    if not first_text_emitted:
                        out.append(prefix + txt)
                        first_text_emitted = True
                    else:
                        out.append(base_indent + txt)
    return "\n".join(out) + "\n"
# ----------------------------
# Main
# ----------------------------
def main():
    """CLI entry point: flatten -> preprocess -> convert -> postprocess.

    Exit codes: 1 when the input file is missing, 2 on any pipeline error.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("main_tex", type=Path)
    ap.add_argument("-o", "--output", type=Path, default=None)
    ap.add_argument("--bib", type=Path, default=None)
    ap.add_argument("--workdir", type=Path, default=Path(".build_tex2md"))
    ap.add_argument("--engine", choices=["auto", "pandoc", "texmd"], default="auto")
    ap.add_argument("--keep-temp", action="store_true")
    args = ap.parse_args()
    main_tex = args.main_tex.resolve()
    if not main_tex.exists():
        print(f"ERROR: not found: {main_tex}", file=sys.stderr)
        sys.exit(1)
    # Default output: main.tex -> main.md alongside the input.
    out_md = (args.output if args.output else main_tex.with_suffix(".md")).resolve()
    workdir = args.workdir.resolve()
    workdir.mkdir(parents=True, exist_ok=True)
    # Intermediate artifacts live in the work directory.
    flattened = workdir / (main_tex.stem + ".flattened.tex")
    prepped = workdir / (main_tex.stem + ".prepped.tex")
    try:
        # flatten
        tex_flat = flatten_auto(main_tex, flattened)
        # preprocess
        src = read_text(tex_flat)
        src = preprocess_latex(src)
        write_text(prepped, src)
        # convert: pandoc is the reliable one for lists. Use it if possible.
        bib = args.bib.resolve() if args.bib else None
        if args.engine in {"auto", "pandoc"} and have("pandoc"):
            # NOTE: the bib path is not passed to pandoc here -- citations
            # are only wired up on the texmd path below.
            pandoc_convert(prepped, out_md)
            md = postprocess(read_text(out_md))
            write_text(out_md, md)
        elif args.engine == "pandoc" and not have("pandoc"):
            raise RuntimeError("pandoc not found. Install it or use --engine texmd.")
        else:
            md = texmd_convert(prepped, bib)
            md = postprocess(md)
            # If texmd still screwed up lists and pandoc exists, fall back.
            if args.engine == "auto" and output_has_list_damage(md) and have("pandoc"):
                pandoc_convert(prepped, out_md)
                md2 = postprocess(read_text(out_md))
                write_text(out_md, md2)
            else:
                write_text(out_md, md)
        if not args.keep_temp:
            # Best-effort cleanup of intermediates; failures are ignored.
            try:
                flattened.unlink(missing_ok=True)
            except Exception:
                pass
            try:
                prepped.unlink(missing_ok=True)
            except Exception:
                pass
    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)
    print(f"Wrote: {out_md}")
    if args.keep_temp:
        print(f"Kept: {flattened}")
        print(f"Kept: {prepped}")


if __name__ == "__main__":
    main()
@farzadhallaji
Copy link
Author

python tex2md.py main.tex --bib refs.bib -o main.md --engine auto

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment