farzadhallaji/tex2md.py

## tex2md.py
#!/usr/bin/env python3
import argparse
import re
import subprocess
import sys
from pathlib import Path
from typing import Optional

from texmd import tex  # pip install texmd


# ----------------------------
# Utils
# ----------------------------
def have(cmd: str) -> bool:
    """Report whether ``cmd`` can be launched as an executable.

    The probe runs ``cmd --version`` with both output streams discarded.
    The exit status is ignored, so any program that can be started at all
    counts as available; only a failure to find the executable yields
    ``False``.
    """
    probe = [cmd, "--version"]
    sink = subprocess.DEVNULL
    try:
        subprocess.run(probe, stdout=sink, stderr=sink, check=False)
    except FileNotFoundError:
        return False
    return True


def read_text(p: Path) -> str:
    """Return the contents of ``p`` decoded as UTF-8.

    Undecodable bytes are replaced with U+FFFD instead of raising.
    """
    with p.open("r", encoding="utf-8", errors="replace") as handle:
        return handle.read()


def write_text(p: Path, s: str) -> None:
    """Write ``s`` to ``p`` as UTF-8, creating missing parent directories."""
    folder = p.parent
    folder.mkdir(parents=True, exist_ok=True)
    with p.open("w", encoding="utf-8") as handle:
        handle.write(s)


# ----------------------------
# Flattening
# ----------------------------
# Matches \input{...} and \include{...}. Group 1 is the command name and
# group 2 the raw brace argument (everything up to the first closing brace).
# The brace must follow the command name (optionally after whitespace), so
# commands that merely start with the same letters are not matched.
INPUT_RE = re.compile(r"""\\(input|include)\s*\{([^}]+)\}""")


def strip_comments(tex_src: str) -> str:
    r"""Remove LaTeX ``%`` comments from ``tex_src``.

    A comment runs from an unescaped ``%`` to the end of the line; the
    newline itself is kept. A ``%`` is escaped only when it is preceded by
    an odd number of backslashes, so ``\%`` survives while ``\\%`` (a line
    break followed by a comment) has its comment removed.

    Args:
        tex_src: LaTeX source text.

    Returns:
        The source with every comment removed.
    """
    # The lookbehind anchors the match at the start of the backslash run, so
    # the captured run always has even length (zero or more escaped
    # backslashes). Those backslashes are put back; only the comment goes.
    return re.sub(r"(?<!\\)((?:\\\\)*)%[^\n]*", r"\1", tex_src)


def resolve_tex_path(base_dir: Path, name: str) -> Path:
    """Turn an include argument into a resolved path under ``base_dir``.

    Surrounding whitespace is stripped and ``.tex`` is appended unless the
    name already ends with it (compared case-insensitively).
    """
    trimmed = name.strip()
    already_tex = trimmed.lower().endswith(".tex")
    filename = trimmed if already_tex else trimmed + ".tex"
    return base_dir.joinpath(filename).resolve()


def flatten_python(main_tex: Path, max_depth: int = 80) -> str:
    """Inline every ``\\input``/``\\include`` reachable from ``main_tex``.

    Each file is comment-stripped before its include commands are expanded.
    A file is expanded only the first time it is encountered; later
    references are replaced by a marker comment. Expanded content is
    bracketed by ``begin input`` / ``end input`` marker comments.

    Raises:
        RuntimeError: if nesting exceeds ``max_depth`` or a referenced file
            does not exist.
    """
    seen = set()

    def _inline(path: Path, depth: int) -> str:
        if depth > max_depth:
            raise RuntimeError(f"Max include depth exceeded at {path}")
        if not path.exists():
            raise RuntimeError(f"Missing included file: {path}")

        if str(path) in seen:
            return f"\n% [flatten-python] skipped already included: {path}\n"
        seen.add(str(path))

        body = strip_comments(read_text(path))
        here = path.parent

        def _splice(match: re.Match) -> str:
            # Included names are resolved relative to the including file.
            target = match.group(2).strip()
            expanded = _inline(resolve_tex_path(here, target), depth + 1)
            return "".join(
                (
                    f"\n% ===== begin input: {target} =====\n",
                    expanded,
                    f"\n% ===== end input: {target} =====\n",
                )
            )

        return INPUT_RE.sub(_splice, body)

    return _inline(main_tex.resolve(), 0)


def flatten_with_latexpand(main_tex: Path, out_tex: Path) -> Path:
    """Flatten ``main_tex`` by running ``latexpand`` and capture it in ``out_tex``.

    The tool is run from the directory containing ``main_tex`` and its
    standard output is streamed straight into ``out_tex``.

    Raises:
        RuntimeError: if ``latexpand`` exits with a non-zero status; the
            message carries its standard error output.
    """
    out_tex.parent.mkdir(parents=True, exist_ok=True)
    command = ["latexpand", main_tex.name]
    with open(out_tex, "w", encoding="utf-8") as sink:
        proc = subprocess.run(
            command,
            cwd=str(main_tex.parent),
            stdout=sink,
            stderr=subprocess.PIPE,
            text=True,
        )
    if proc.returncode:
        raise RuntimeError(f"latexpand failed:\n{proc.stderr}")
    return out_tex


def flatten_auto(main_tex: Path, out_tex: Path) -> Path:
    """Flatten ``main_tex`` into ``out_tex`` and return ``out_tex``.

    ``latexpand`` is used when it can be launched; otherwise the built-in
    Python flattener produces the file.
    """
    if not have("latexpand"):
        write_text(out_tex, flatten_python(main_tex))
        return out_tex
    return flatten_with_latexpand(main_tex, out_tex)


# ----------------------------
# LaTeX pre-processing (kill list/enumitem issues + preserve stuff)
# ----------------------------

# Remove optional args even if they span newlines:
# \begin{enumerate}[ ... possibly multiline ... ]
BEGIN_LIST_OPT_RE = re.compile(
    r"""\\begin\{(enumerate|itemize|description)\}\s*\[(.*?)\]""",
    re.DOTALL
)

# Remove optional labels on \item[...]
ITEM_OPT_RE = re.compile(r"""\\item\s*\[(.*?)\]""", re.DOTALL)

# Remove enumitem configuration commands anywhere (they poison converters)
SETLIST_LINE_RE = re.compile(r"""^\s*\\setlist.*$""", re.MULTILINE)

# Paragraph-like headings that texmd tends to output as {...}
PARA_RE = re.compile(r"""\\(sub)?paragraph\*?\{([^}]*)\}""")

# Noindent noise
NOINDENT_RE = re.compile(r"""\\noindent\s*""")

# Algorithms: keep them as fenced text (otherwise they vanish or become raw TeX)
ALGO_ENV_RE = re.compile(r"""\\begin\{algorithm\*?\}.*?\\end\{algorithm\*?\}""", re.DOTALL)
ALGO_INNER_RE = re.compile(r"""\\begin\{algorithmic\*?\}.*?\\end\{algorithmic\*?\}""", re.DOTALL)
ALGPSEUDO_RE = re.compile(r"""\\begin\{algpseudocode\*?\}.*?\\end\{algpseudocode\*?\}""", re.DOTALL)

# Any of the three algorithm environments in a single scan. Because the scan
# is leftmost-first, an outer `algorithm` float swallows the `algorithmic`
# body nested inside it, so the body is never matched (and fenced) again.
_ALGO_BLOCK_RE = re.compile(
    "|".join(p.pattern for p in (ALGO_ENV_RE, ALGO_INNER_RE, ALGPSEUDO_RE)),
    re.DOTALL,
)


def preprocess_latex(src: str) -> str:
    r"""Rewrite LaTeX constructs that the converters handle badly.

    Steps, in order:
      0. drop lines that start with ``\setlist``;
      1. strip the optional ``[...]`` argument from list ``\begin`` commands;
      2. strip the optional ``[...]`` label from ``\item``;
      3. turn ``\paragraph`` / ``\subparagraph`` (starred or not) into
         ``\subsubsection``; empty titles are removed entirely;
      4. remove ``\noindent``;
      5. wrap each algorithm environment in a ``text`` code fence under an
         "Algorithmic Summary" heading.

    Step 5 runs as one pass over all three environment kinds, so an
    ``algorithmic`` body inside an ``algorithm`` float is fenced once,
    together with its float, rather than fenced a second time inside the
    already-fenced block.

    Args:
        src: flattened LaTeX source.

    Returns:
        The rewritten source.
    """
    # 0) strip enumitem setup lines
    src = SETLIST_LINE_RE.sub("", src)

    # 1) remove list optional args (enumitem) robustly
    src = BEGIN_LIST_OPT_RE.sub(lambda m: f"\\begin{{{m.group(1)}}}", src)

    # 2) remove \item[...]
    src = ITEM_OPT_RE.sub(r"\\item", src)

    # 3) rewrite \paragraph/\subparagraph into converter-safe sectioning (NON-starred)
    def _para(m: re.Match) -> str:
        title = m.group(2).replace("\n", " ").strip()
        if not title:
            return ""
        return f"\\subsubsection{{{title}}}\n"
    src = PARA_RE.sub(_para, src)

    # 4) remove \noindent
    src = NOINDENT_RE.sub("", src)

    # 5) preserve algorithm blocks as fenced code so you don't lose them
    def _fence(m: re.Match) -> str:
        block = m.group(0).strip()
        return "\n\\subsubsection{Algorithmic Summary}\n```text\n" + block + "\n```\n"

    src = _ALGO_BLOCK_RE.sub(_fence, src)

    return src


# ----------------------------
# Conversion engines
# ----------------------------
def texmd_convert(tex_path: Path, bib_path: Optional[Path]) -> str:
    """Convert the LaTeX file at ``tex_path`` to Markdown text via texmd.

    Citations are loaded from ``bib_path`` first when it is given and the
    file exists.
    """
    converter = tex.TexParser()
    if bib_path and bib_path.exists():
        converter.load_citations(str(bib_path))
    document = converter.load_file(str(tex_path))
    markdown = converter.to_md(document)
    return markdown.to_str()


def pandoc_convert(tex_path: Path, out_md: Path) -> None:
    """Run pandoc to convert ``tex_path`` (LaTeX) into ``out_md``.

    Output is GitHub-flavoured Markdown with dollar-delimited math and no
    line wrapping. A non-zero pandoc exit raises
    ``subprocess.CalledProcessError``.
    """
    options = [
        "-f", "latex",
        "-t", "gfm+tex_math_dollars",
        "--wrap=none",
        "-o", str(out_md),
    ]
    subprocess.run(["pandoc", str(tex_path), *options], check=True)


# ----------------------------
# Markdown post-processing
# ----------------------------

# Standalone brace artifact line like "{Goal.}" or "{Encode:}".
# Matches a whole (already stripped) line consisting of one brace pair that
# wraps 1-200 characters, none of which is itself a brace; group 1 is the
# wrapped text.
BRACE_LINE_RE = re.compile(r"""^\{([^{}]{1,200})\}\s*$""")


def remove_orphan_bullets(md: str) -> str:
    """Drop list markers that stand alone on a line with no item body.

    A line consisting only of ``*``, ``-`` or ``+`` is kept when the next
    non-blank line is indented (starts with a space or tab), and removed
    otherwise. Lines inside fenced code blocks are never touched. The
    result always ends with a single newline.
    """
    rows = md.splitlines()
    total = len(rows)
    kept = []
    fenced = False

    def _following_text(pos: int) -> str:
        # First later line that is not blank, or "" when there is none.
        return next(
            (rows[k] for k in range(pos + 1, total) if rows[k].strip()),
            "",
        )

    for pos, row in enumerate(rows):
        bare = row.strip()
        if bare.startswith("```"):
            fenced = not fenced
        elif not fenced and bare in ("*", "-", "+"):
            if not _following_text(pos).startswith((" ", "\t")):
                continue
        kept.append(row)

    return "\n".join(kept) + "\n"


def fix_brace_artifacts(md: str) -> str:
    """Rewrite lines that consist solely of a ``{...}`` group.

    Outside fenced code blocks, a line matching ``BRACE_LINE_RE`` is
    replaced:
      - ``{Encode:}`` (text ending in a colon) becomes ``**Encode:**``;
      - ``{Goal.}`` (anything else) becomes ``#### Goal.``.
    The replacement line carries no indentation. All other lines pass
    through unchanged, and the result ends with a single newline.
    """
    rendered = []
    fenced = False

    for row in md.splitlines():
        bare = row.strip()
        if bare.startswith("```"):
            fenced = not fenced
            rendered.append(row)
            continue

        hit = None if fenced else BRACE_LINE_RE.match(bare)
        if hit is None:
            rendered.append(row)
            continue

        label = hit.group(1).strip()
        rendered.append(f"**{label}**" if label.endswith(":") else f"#### {label}")

    return "\n".join(rendered) + "\n"


def normalize_math_delimiters(md: str) -> str:
    r"""Normalise math delimiters to ``$...$`` (inline) and ``$$`` (display).

    Outside fenced code blocks this converts:
      - \(...\) -> $...$ (the span may cross line breaks)
      - \[ and \] as lines -> $$
      - bare '$' delimiter lines -> '$$'
      - one-line '$ ... $' as whole line -> $$ / ... / $$ on three lines

    Fenced code blocks are copied verbatim, including any ``\(...\)`` they
    contain. The result always ends with a single newline.
    """
    out = []
    prose = []  # consecutive non-code lines awaiting conversion
    in_code = False

    def is_line_only_single_dollar_math(s: str) -> bool:
        # True for a stripped line of the form "$...$" with exactly two
        # dollar signs and non-blank content between them.
        if s.startswith("$$") or s.endswith("$$"):
            return False
        if len(s) < 2 or not (s.startswith("$") and s.endswith("$")):
            return False
        if not s[1:-1].strip():
            return False
        return s.count("$") == 2

    def flush_prose() -> None:
        # Convert the buffered prose run as one string so that \( ... \)
        # pairs spanning several lines are still recognised, without the
        # substitution ever seeing fenced code.
        if not prose:
            return
        text = re.sub(
            r"""\\\((.+?)\\\)""", r"$\1$", "\n".join(prose), flags=re.DOTALL
        )
        prose.clear()
        for line in text.split("\n"):
            s = line.strip()
            if s in {"$", r"\[", r"\]"}:
                out.append("$$")
            elif is_line_only_single_dollar_math(s):
                out.append("$$")
                out.append(s[1:-1].strip())
                out.append("$$")
            else:
                out.append(line)

    for line in md.splitlines():
        if line.strip().startswith("```"):
            flush_prose()
            in_code = not in_code
            out.append(line)
        elif in_code:
            out.append(line)
        else:
            prose.append(line)
    flush_prose()

    return "\n".join(out) + "\n"


def dedupe_consecutive_headings(md: str) -> str:
    """Remove a heading that immediately repeats the previous heading.

    A line whose stripped form starts with ``#`` is dropped when it is
    identical to the preceding heading and only blank lines (at most five)
    separate the two. Any other content between them, including a code
    fence, makes the second heading a legitimate new one and it is kept.
    Lines inside fenced code blocks are never treated as headings.

    Args:
        md: Markdown text.

    Returns:
        The text with adjacent duplicate headings removed, ending with a
        single newline.
    """
    out = []
    last_heading = None  # heading still eligible to be matched by a repeat
    blanks = 0           # blank lines seen since the last non-blank line
    in_code = False

    for line in md.splitlines():
        s = line.strip()

        if s.startswith("```"):
            in_code = not in_code
            last_heading = None
            blanks = 0
            out.append(line)
            continue
        if in_code:
            out.append(line)
            continue

        if not s:
            blanks += 1
            out.append(line)
            continue

        if s.startswith("#"):
            if s == last_heading and blanks <= 5:
                blanks = 0
                continue
            last_heading = s
        else:
            # Body text ends the run: a later identical heading is not a
            # duplicate of this one.
            last_heading = None

        blanks = 0
        out.append(line)

    return "\n".join(out) + "\n"


def postprocess(md: str) -> str:
    """Run the Markdown clean-up passes over ``md`` in their fixed order."""
    passes = (
        fix_brace_artifacts,
        remove_orphan_bullets,
        normalize_math_delimiters,
        split_display_math_from_text,
        ensure_blank_lines_around_display_math,
        dedupe_consecutive_headings,
    )
    for apply_pass in passes:
        md = apply_pass(md)
    return md


def output_has_list_damage(md: str) -> bool:
    r"""
    Tell whether converted Markdown still shows signs of broken lists.

    Any one of these is enough: a leaked \item, a leaked enumerate
    \begin/\end, or at least five standalone brace-artifact lines.
    """
    if "\\item" in md:
        return True
    if "\\begin{enumerate}" in md or "\\end{enumerate}" in md:
        return True

    brace_lines = 0
    for ln in md.splitlines():
        if BRACE_LINE_RE.match(ln.strip()):
            brace_lines += 1
    return brace_lines >= 5

def ensure_blank_lines_around_display_math(md: str) -> str:
    """
    Pad display-math blocks with a separator line on each side.

    Lines that are exactly ``$$`` (ignoring indentation) alternate between
    opening and closing a block. A separator is inserted before an opening
    delimiter when the previous output line is not blank, and after a
    closing delimiter when a following line exists and is not blank.
    Nothing is inserted inside the block.

    The separator is the delimiter's own indentation (so it stays within a
    list item). Fenced code blocks are skipped.
    """
    rows = md.splitlines()
    last = len(rows) - 1
    result = []
    fenced = False
    math_open = False

    for pos, row in enumerate(rows):
        bare = row.strip()

        if bare.startswith("```"):
            fenced = not fenced
            result.append(row)
            continue
        if fenced or bare != "$$":
            result.append(row)
            continue

        pad = row[: len(row) - len(row.lstrip())]
        if math_open:
            # closing delimiter: separator goes after it
            result.append(row)
            if pos < last and rows[pos + 1].strip():
                result.append(pad)
        else:
            # opening delimiter: separator goes before it
            if result and result[-1].strip():
                result.append(pad)
            result.append(row)
        math_open = not math_open

    return "\n".join(result) + "\n"

# Leading list marker of a Markdown line: optional indentation, then a
# bullet (-, *, +) or "<digits>." followed by at least one whitespace
# character. Group 1 is the whole prefix including that whitespace.
LIST_PREFIX_RE = re.compile(r"^(\s*(?:[-*+]\s+|\d+\.\s+))")


def split_display_math_from_text(md: str) -> str:
    """
    Put every ``$$`` delimiter, and the text around it, on its own line.

    A line such as

      text $$math$$ text

    is emitted as the separate lines ``text``, ``$$``, ``math``, ``$$`` and
    the trailing text. Display math that opens on one line and closes on a
    later one is tracked across lines; lines in between that contain no
    ``$$`` are copied unchanged.

    Pieces split out of a list item are indented to the width of the list
    marker; only the first text piece keeps the marker itself. Trailing
    whitespace of each piece is removed; leading whitespace is kept.
    Fenced code blocks are skipped.
    """
    result = []
    fenced = False
    math_open = False

    for raw in md.splitlines():
        if raw.strip().startswith("```"):
            fenced = not fenced
            result.append(raw)
            continue
        if fenced:
            result.append(raw)
            continue

        # Work out the list marker (if any), the indentation to use for
        # continuation lines, and the remaining line body.
        bullet = LIST_PREFIX_RE.match(raw)
        if bullet:
            marker = bullet.group(1)
            pad = " " * len(marker)
            body = raw[len(marker):]
        else:
            marker = ""
            body = raw.lstrip()
            pad = raw[: len(raw) - len(body)]

        if "$$" not in body:
            result.append(raw)
            continue

        marker_used = False
        for piece in re.split(r"(\$\$)", body):
            if piece == "$$":
                # Delimiters simply alternate between opening and closing.
                result.append(pad + "$$")
                math_open = not math_open
                continue
            if not piece:
                continue
            if math_open:
                result.append(pad + piece.rstrip())
                continue

            text = piece.rstrip()
            if not text.strip():
                continue
            if marker and not marker_used:
                result.append(marker + text)
                marker_used = True
            else:
                result.append(pad + text)

    return "\n".join(result) + "\n"


# ----------------------------
# Main
# ----------------------------
def main():
    """Command-line entry point: convert a LaTeX document to Markdown.

    Pipeline: flatten includes -> preprocess LaTeX -> convert -> postprocess
    Markdown -> write the output file.

    Engine selection:
      - ``auto``: pandoc when it can be launched, otherwise texmd;
      - ``pandoc``: pandoc, or an error when it cannot be launched;
      - ``texmd``: texmd only (pandoc is not probed).

    Exit status is 1 when the input file does not exist and 2 when any
    pipeline step fails. The intermediate files in the work directory are
    removed on success and on failure unless ``--keep-temp`` is given.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("main_tex", type=Path)
    ap.add_argument("-o", "--output", type=Path, default=None)
    ap.add_argument("--bib", type=Path, default=None)
    ap.add_argument("--workdir", type=Path, default=Path(".build_tex2md"))
    ap.add_argument("--engine", choices=["auto", "pandoc", "texmd"], default="auto")
    ap.add_argument("--keep-temp", action="store_true")
    args = ap.parse_args()

    main_tex = args.main_tex.resolve()
    if not main_tex.exists():
        print(f"ERROR: not found: {main_tex}", file=sys.stderr)
        sys.exit(1)

    out_md = (args.output if args.output else main_tex.with_suffix(".md")).resolve()
    workdir = args.workdir.resolve()
    workdir.mkdir(parents=True, exist_ok=True)

    flattened = workdir / (main_tex.stem + ".flattened.tex")
    prepped = workdir / (main_tex.stem + ".prepped.tex")

    try:
        # flatten
        tex_flat = flatten_auto(main_tex, flattened)

        # preprocess
        src = read_text(tex_flat)
        src = preprocess_latex(src)
        write_text(prepped, src)

        bib = args.bib.resolve() if args.bib else None

        # convert: pandoc is the reliable one for lists. Use it if possible.
        # Probe it once, and not at all when texmd was requested explicitly.
        pandoc_ok = args.engine != "texmd" and have("pandoc")

        if pandoc_ok:
            pandoc_convert(prepped, out_md)
            md = postprocess(read_text(out_md))
            write_text(out_md, md)

        elif args.engine == "pandoc":
            raise RuntimeError("pandoc not found. Install it or use --engine texmd.")

        else:
            md = texmd_convert(prepped, bib)
            md = postprocess(md)

            # This branch is only reached when pandoc is unavailable or was
            # deliberately bypassed, so there is nothing to fall back to;
            # tell the user instead of failing silently.
            if output_has_list_damage(md):
                print(
                    "WARNING: texmd output still contains raw list markup; "
                    "install pandoc or use --engine pandoc for better lists.",
                    file=sys.stderr,
                )
            write_text(out_md, md)

    except Exception as e:
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(2)

    finally:
        # Runs on success and on the sys.exit(2) path alike, so failed runs
        # do not leave intermediate files behind.
        if not args.keep_temp:
            for tmp in (flattened, prepped):
                try:
                    tmp.unlink(missing_ok=True)
                except OSError:
                    # Best-effort cleanup: a leftover temp file is harmless.
                    pass

    print(f"Wrote: {out_md}")
    if args.keep_temp:
        print(f"Kept: {flattened}")
        print(f"Kept: {prepped}")


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import argparse
	import re
	import subprocess
	import sys
	from pathlib import Path
	from typing import Optional

	from texmd import tex # pip install texmd


	# ----------------------------
	# Utils
	# ----------------------------
	def have(cmd: str) -> bool:
	try:
	subprocess.run([cmd, "--version"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
	return True
	except FileNotFoundError:
	return False


	def read_text(p: Path) -> str:
	return p.read_text(encoding="utf-8", errors="replace")


	def write_text(p: Path, s: str) -> None:
	p.parent.mkdir(parents=True, exist_ok=True)
	p.write_text(s, encoding="utf-8")


	# ----------------------------
	# Flattening
	# ----------------------------
	INPUT_RE = re.compile(r"""\\(input\|include)\s*\{([^}]+)\}""")


	def strip_comments(tex_src: str) -> str:
	# remove % comments unless escaped as \%
	out = []
	i = 0
	while i < len(tex_src):
	ch = tex_src[i]
	if ch == "%":
	if i > 0 and tex_src[i - 1] == "\\":
	out.append(ch)
	i += 1
	continue
	while i < len(tex_src) and tex_src[i] != "\n":
	i += 1
	continue
	out.append(ch)
	i += 1
	return "".join(out)


	def resolve_tex_path(base_dir: Path, name: str) -> Path:
	name = name.strip()
	if not name.lower().endswith(".tex"):
	name = name + ".tex"
	return (base_dir / name).resolve()


	def flatten_python(main_tex: Path, max_depth: int = 80) -> str:
	visited = set()

	def _expand(path: Path, depth: int) -> str:
	if depth > max_depth:
	raise RuntimeError(f"Max include depth exceeded at {path}")
	if not path.exists():
	raise RuntimeError(f"Missing included file: {path}")

	key = str(path)
	if key in visited:
	return f"\n% [flatten-python] skipped already included: {path}\n"
	visited.add(key)

	src = strip_comments(read_text(path))
	base = path.parent

	def repl(m: re.Match) -> str:
	inc = m.group(2).strip()
	inc_path = resolve_tex_path(base, inc)
	return (
	f"\n% ===== begin input: {inc} =====\n"
	+ _expand(inc_path, depth + 1)
	+ f"\n% ===== end input: {inc} =====\n"
	)

	return INPUT_RE.sub(repl, src)

	return _expand(main_tex.resolve(), 0)


	def flatten_with_latexpand(main_tex: Path, out_tex: Path) -> Path:
	out_tex.parent.mkdir(parents=True, exist_ok=True)
	with open(out_tex, "w", encoding="utf-8") as f:
	res = subprocess.run(
	["latexpand", main_tex.name],
	cwd=str(main_tex.parent),
	stdout=f,
	stderr=subprocess.PIPE,
	text=True,
	)
	if res.returncode != 0:
	raise RuntimeError(f"latexpand failed:\n{res.stderr}")
	return out_tex


	def flatten_auto(main_tex: Path, out_tex: Path) -> Path:
	if have("latexpand"):
	return flatten_with_latexpand(main_tex, out_tex)
	write_text(out_tex, flatten_python(main_tex))
	return out_tex


	# ----------------------------
	# LaTeX pre-processing (kill list/enumitem issues + preserve stuff)
	# ----------------------------

	# Remove optional args even if they span newlines:
	# \begin{enumerate}[ ... possibly multiline ... ]
	BEGIN_LIST_OPT_RE = re.compile(
	r"""\\begin\{(enumerate\|itemize\|description)\}\s\[(.?)\]""",
	re.DOTALL
	)

	# Remove optional labels on \item[...]
	ITEM_OPT_RE = re.compile(r"""\\item\s\[(.?)\]""", re.DOTALL)

	# Remove enumitem configuration commands anywhere (they poison converters)
	SETLIST_LINE_RE = re.compile(r"""^\s\\setlist.$""", re.MULTILINE)

	# Paragraph-like headings that texmd tends to output as {...}
	PARA_RE = re.compile(r"""\\(sub)?paragraph\?\{([^}])\}""")

	# Noindent noise
	NOINDENT_RE = re.compile(r"""\\noindent\s*""")

	# Algorithms: keep them as fenced text (otherwise they vanish or become raw TeX)
	ALGO_ENV_RE = re.compile(r"""\\begin\{algorithm\?\}.?\\end\{algorithm\*?\}""", re.DOTALL)
	ALGO_INNER_RE = re.compile(r"""\\begin\{algorithmic\?\}.?\\end\{algorithmic\*?\}""", re.DOTALL)
	ALGPSEUDO_RE = re.compile(r"""\\begin\{algpseudocode\?\}.?\\end\{algpseudocode\*?\}""", re.DOTALL)


	def preprocess_latex(src: str) -> str:
	# 0) strip enumitem setup lines
	src = SETLIST_LINE_RE.sub("", src)

	# 1) remove list optional args (enumitem) robustly
	src = BEGIN_LIST_OPT_RE.sub(lambda m: f"\\begin{{{m.group(1)}}}", src)

	# 2) remove \item[...]
	src = ITEM_OPT_RE.sub(r"\\item", src)

	# 3) rewrite \paragraph/\subparagraph into converter-safe sectioning (NON-starred)
	def _para(m: re.Match) -> str:
	title = m.group(2).replace("\n", " ").strip()
	if not title:
	return ""
	return f"\\subsubsection{{{title}}}\n"
	src = PARA_RE.sub(_para, src)

	# 4) remove \noindent
	src = NOINDENT_RE.sub("", src)

	# 5) preserve algorithm blocks as fenced code so you don't lose them
	def _fence(m: re.Match) -> str:
	block = m.group(0).strip()
	return "\n\\subsubsection{Algorithmic Summary}\n```text\n" + block + "\n```\n"

	src = ALGO_ENV_RE.sub(_fence, src)
	src = ALGO_INNER_RE.sub(_fence, src)
	src = ALGPSEUDO_RE.sub(_fence, src)

	return src


	# ----------------------------
	# Conversion engines
	# ----------------------------
	def texmd_convert(tex_path: Path, bib_path: Optional[Path]) -> str:
	parser = tex.TexParser()
	if bib_path and bib_path.exists():
	parser.load_citations(str(bib_path))
	doc = parser.load_file(str(tex_path))
	return parser.to_md(doc).to_str()


	def pandoc_convert(tex_path: Path, out_md: Path) -> None:
	cmd = [
	"pandoc",
	str(tex_path),
	"-f", "latex",
	"-t", "gfm+tex_math_dollars",
	"--wrap=none",
	"-o", str(out_md),
	]
	subprocess.run(cmd, check=True)


	# ----------------------------
	# Markdown post-processing
	# ----------------------------

	# Standalone brace artifact line like "{Goal.}" or "{Encode:}"
	BRACE_LINE_RE = re.compile(r"""^\{([^{}]{1,200})\}\s*$""")


	def remove_orphan_bullets(md: str) -> str:
	lines = md.splitlines()
	out = []
	in_code = False

	def next_nonempty(idx: int) -> str:
	j = idx + 1
	while j < len(lines) and lines[j].strip() == "":
	j += 1
	return lines[j] if j < len(lines) else ""

	for i, line in enumerate(lines):
	s = line.strip()

	if s.startswith("```"):
	in_code = not in_code
	out.append(line)
	continue
	if in_code:
	out.append(line)
	continue

	if s in {"*", "-", "+"}:
	nxt = next_nonempty(i)
	if nxt.startswith(" ") or nxt.startswith("\t"):
	out.append(line)
	else:
	continue
	else:
	out.append(line)

	return "\n".join(out) + "\n"


	def fix_brace_artifacts(md: str) -> str:
	"""
	If converter spits:
	{Encode:}
	turn into:
	Encode:
	If it spits:
	{Goal.}
	turn into:
	#### Goal.
	"""
	lines = md.splitlines()
	out = []
	in_code = False

	for line in lines:
	s = line.strip()
	if s.startswith("```"):
	in_code = not in_code
	out.append(line)
	continue
	if in_code:
	out.append(line)
	continue

	m = BRACE_LINE_RE.match(s)
	if m:
	title = m.group(1).strip()
	if title.endswith(":"):
	out.append(f"{title}")
	else:
	out.append(f"#### {title}")
	continue

	out.append(line)

	return "\n".join(out) + "\n"


	def normalize_math_delimiters(md: str) -> str:
	r"""
	Keep inline $...$ and force display $$...$$.
	Converts:
	- \(...\) -> $...$
	- \[ and \] as lines -> $$
	- bare '$' delimiter lines -> '$$'
	- one-line '$ ... $' as whole line -> $$ ... $$
	"""
	md = re.sub(r"""\\\((.+?)\\\)""", r"$\1$", md, flags=re.DOTALL)

	lines = md.splitlines()
	out = []
	in_code = False

	def is_fence(line: str) -> bool:
	return line.strip().startswith("```")

	def is_line_only_single_dollar_math(line: str) -> bool:
	s = line.strip()
	if s.startswith("$$") or s.endswith("$$"):
	return False
	if len(s) < 2 or not (s.startswith("$") and s.endswith("$")):
	return False
	inner = s[1:-1].strip()
	if not inner:
	return False
	return s.count("$") == 2

	for line in lines:
	if is_fence(line):
	in_code = not in_code
	out.append(line)
	continue
	if in_code:
	out.append(line)
	continue

	s = line.strip()
	if s in {"$", r"\[", r"\]"}:
	out.append("$$")
	continue

	if is_line_only_single_dollar_math(line):
	inner = line.strip()[1:-1].strip()
	out.append("$$")
	out.append(inner)
	out.append("$$")
	continue

	out.append(line)

	return "\n".join(out) + "\n"


	def dedupe_consecutive_headings(md: str) -> str:
	lines = md.splitlines()
	out = []
	last_heading = None
	blanks = 0

	def is_heading(line: str) -> bool:
	s = line.strip()
	return s.startswith("#")

	for line in lines:
	s = line.strip()
	if s == "":
	blanks += 1
	out.append(line)
	continue

	if is_heading(line):
	if last_heading == s and blanks <= 5:
	blanks = 0
	continue
	last_heading = s
	blanks = 0
	out.append(line)
	continue

	blanks = 0
	out.append(line)

	return "\n".join(out) + "\n"


	def postprocess(md: str) -> str:
	md = fix_brace_artifacts(md)
	md = remove_orphan_bullets(md)
	md = normalize_math_delimiters(md)
	md = split_display_math_from_text(md)
	md = ensure_blank_lines_around_display_math(md)
	md = dedupe_consecutive_headings(md)
	return md


	def output_has_list_damage(md: str) -> bool:
	r"""
	Detect texmd failing at lists: leaks \item or environments or lots of brace artifacts.
	"""
	bad = 0
	if "\\item" in md:
	bad += 2
	if "\\begin{enumerate}" in md or "\\end{enumerate}" in md:
	bad += 2
	if sum(1 for ln in md.splitlines() if BRACE_LINE_RE.match(ln.strip())) >= 5:
	bad += 2
	return bad >= 2

	def ensure_blank_lines_around_display_math(md: str) -> str:
	"""
	Ensure a blank line before the opening $$ and after the closing $$,
	without inserting blank lines inside the math block.

	Preserves indentation (important if $$ appears inside list items).
	Skips fenced code blocks.
	"""
	lines = md.splitlines()
	out = []
	in_code = False
	in_display = False

	for i, line in enumerate(lines):
	s = line.strip()

	# code fences
	if s.startswith("```"):
	in_code = not in_code
	out.append(line)
	continue
	if in_code:
	out.append(line)
	continue

	if s == "$$":
	indent = line[: len(line) - len(line.lstrip())]
	blank = indent # "blank" line with same indent

	if not in_display:
	# opening $$
	if out and out[-1].strip() != "":
	out.append(blank)
	out.append(line)
	in_display = True
	else:
	# closing $$
	out.append(line)
	# add blank line after closing $$ if next line exists and isn't blank
	if i + 1 < len(lines) and lines[i + 1].strip() != "":
	out.append(blank)
	in_display = False
	continue

	out.append(line)

	return "\n".join(out) + "\n"

	LIST_PREFIX_RE = re.compile(r"^(\s(?:[-+]\s+\|\d+\.\s+))")

	def split_display_math_from_text(md: str) -> str:
	"""
	Turn inline/paragraph-wrapped display math like:
	text $$math$$ text
	or multi-line:
	text $$math
	more
	$$ text
	into true block math:

	text

	$$
	math
	more
	$$

	text

	Preserves list indentation. Skips fenced code blocks.
	"""
	lines = md.splitlines()
	out = []
	in_code = False
	in_display = False

	for line in lines:
	s = line.strip()

	# code fences
	if s.startswith("```"):
	in_code = not in_code
	out.append(line)
	continue
	if in_code:
	out.append(line)
	continue

	# detect list prefix to keep blocks inside list items
	m = LIST_PREFIX_RE.match(line)
	prefix = m.group(1) if m else ""
	base_indent = (" " * len(prefix)) if prefix else line[:len(line) - len(line.lstrip())]
	content = line[len(prefix):] if prefix else line[len(base_indent):]

	if "$$" not in content:
	out.append(line)
	continue

	parts = re.split(r"(\$\$)", content)
	first_text_emitted = False

	for part in parts:
	if part == "$$":
	# ensure any pending text ends cleanly before math
	if not in_display:
	# opening
	out.append(base_indent + "$$")
	in_display = True
	else:
	# closing
	out.append(base_indent + "$$")
	in_display = False
	continue

	if part == "":
	continue

	if in_display:
	# math content line (keep as-is, but indent appropriately if in list)
	out.append(base_indent + part.rstrip())
	else:
	# normal text segment
	txt = part.rstrip()
	if txt.strip() == "":
	continue
	if not prefix:
	# non-list line
	out.append(base_indent + txt if base_indent else txt)
	else:
	# list line: first segment keeps bullet/number prefix, remaining segments are continuation
	if not first_text_emitted:
	out.append(prefix + txt)
	first_text_emitted = True
	else:
	out.append(base_indent + txt)

	return "\n".join(out) + "\n"


	# ----------------------------
	# Main
	# ----------------------------
	def main():
	ap = argparse.ArgumentParser()
	ap.add_argument("main_tex", type=Path)
	ap.add_argument("-o", "--output", type=Path, default=None)
	ap.add_argument("--bib", type=Path, default=None)
	ap.add_argument("--workdir", type=Path, default=Path(".build_tex2md"))
	ap.add_argument("--engine", choices=["auto", "pandoc", "texmd"], default="auto")
	ap.add_argument("--keep-temp", action="store_true")
	args = ap.parse_args()

	main_tex = args.main_tex.resolve()
	if not main_tex.exists():
	print(f"ERROR: not found: {main_tex}", file=sys.stderr)
	sys.exit(1)

	out_md = (args.output if args.output else main_tex.with_suffix(".md")).resolve()
	workdir = args.workdir.resolve()
	workdir.mkdir(parents=True, exist_ok=True)

	flattened = workdir / (main_tex.stem + ".flattened.tex")
	prepped = workdir / (main_tex.stem + ".prepped.tex")

	try:
	# flatten
	tex_flat = flatten_auto(main_tex, flattened)

	# preprocess
	src = read_text(tex_flat)
	src = preprocess_latex(src)
	write_text(prepped, src)

	# convert: pandoc is the reliable one for lists. Use it if possible.
	bib = args.bib.resolve() if args.bib else None

	if args.engine in {"auto", "pandoc"} and have("pandoc"):
	pandoc_convert(prepped, out_md)
	md = postprocess(read_text(out_md))
	write_text(out_md, md)

	elif args.engine == "pandoc" and not have("pandoc"):
	raise RuntimeError("pandoc not found. Install it or use --engine texmd.")

	else:
	md = texmd_convert(prepped, bib)
	md = postprocess(md)

	# If texmd still screwed up lists and pandoc exists, fall back.
	if args.engine == "auto" and output_has_list_damage(md) and have("pandoc"):
	pandoc_convert(prepped, out_md)
	md2 = postprocess(read_text(out_md))
	write_text(out_md, md2)
	else:
	write_text(out_md, md)

	if not args.keep_temp:
	try: flattened.unlink(missing_ok=True)
	except Exception: pass
	try: prepped.unlink(missing_ok=True)
	except Exception: pass

	except Exception as e:
	print(f"ERROR: {e}", file=sys.stderr)
	sys.exit(2)

	print(f"Wrote: {out_md}")
	if args.keep_temp:
	print(f"Kept: {flattened}")
	print(f"Kept: {prepped}")


	if __name__ == "__main__":
	main()
No results found