#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "markdown",
#     "requests",
#     "weasyprint",
#     "beautifulsoup4",
# ]
# ///
|
"""
Markdown to PDF/HTML converter with Mermaid diagram rendering.

External dependencies:
- mermaid.ink API (for diagram rendering, no local Chromium needed)

Usage:
    md2pdf [OPTIONS] <input.md> [output-base]

Examples:
    md2pdf notes.md
    md2pdf --format html notes.md notes.html
    md2pdf --url "https://gist.github.com/...#file-foo-md"

By default, writes a PDF; use --format html or --format both for other outputs.
"""
|
|
|
import argparse
import base64
import html
import re
import sys
from pathlib import Path
from urllib.parse import urljoin, urlparse

import markdown
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
|
|
|
|
|
def render_mermaid_via_api(mermaid_code: str) -> str | None:
    """
    Ask the mermaid.ink service to draw a diagram as a PNG.

    The diagram source is URL-safe base64 encoded into the request path.
    Returns the PNG bytes as a base64 string, or None when the request
    fails or the service answers with a non-200 status.
    """
    payload = base64.urlsafe_b64encode(mermaid_code.encode('utf-8')).decode('utf-8')
    # Note: only type=png is sent because combining it with bgColor causes
    # 400 errors on some diagrams (apparent mermaid.ink bug).
    endpoint = f"https://mermaid.ink/img/{payload}?type=png"

    print(" Fetching diagram from mermaid.ink...")
    try:
        reply = requests.get(endpoint, timeout=30)
        if reply.status_code != 200:
            print(f" Warning: mermaid.ink returned status {reply.status_code}")
            return None
        return base64.b64encode(reply.content).decode('utf-8')
    except Exception as err:
        # Best effort: any failure falls back to showing the diagram source.
        print(f" Warning: Failed to fetch from mermaid.ink: {err}")
        return None
|
|
|
|
|
def ensure_blank_lines_before_lists(content: str) -> str:
    """
    Ensure there's a blank line before the first item of each list.

    A blank line is inserted in front of a bullet (`-`, `*`, `+`) or numbered
    (`1.`) item whenever the preceding line is neither blank nor itself a
    list item. Lines inside fenced code blocks (``` or ~~~) are copied
    verbatim so that code which merely looks like a list (YAML, diffs, ...)
    is not altered.

    Args:
        content: Markdown source text.

    Returns:
        The markdown text with the extra blank lines inserted.
    """
    list_item = re.compile(r'^(\s*[-*+]|\s*\d+\.)\s')
    # group(1): the fence marker run, group(2): whatever follows it on the line
    fence_line = re.compile(r'^\s*(`{3,}|~{3,})(.*)$')

    result: list[str] = []
    open_fence: str | None = None  # marker of the code fence we are inside
    prev_line: str | None = None   # None only while handling the first line

    for line in content.split('\n'):
        fence = fence_line.match(line)

        if open_fence is not None:
            # Inside a fenced block: never touch the text, only look for the
            # closing fence (same character, at least as long, nothing after).
            result.append(line)
            if (
                fence
                and fence.group(1).startswith(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = None
            prev_line = line
            continue

        # A backtick fence may not have backticks in its info string; such a
        # line is inline code (e.g. ```x```), not the start of a block.
        if fence and not (fence.group(1)[0] == '`' and '`' in fence.group(2)):
            open_fence = fence.group(1)
        elif prev_line is not None and list_item.match(line):
            if prev_line.strip() and not list_item.match(prev_line):
                result.append('')  # Insert blank line

        result.append(line)
        prev_line = line

    return '\n'.join(result)
|
|
|
|
|
def ensure_blank_lines_before_tables(content: str) -> str:
    """
    Ensure there's a blank line before each markdown table.

    A blank line is inserted in front of a table row (a line starting with
    `|` and containing at least two pipes) whenever the preceding line is
    neither blank nor itself a table row. Lines inside fenced code blocks
    (``` or ~~~) are copied verbatim so pipe-delimited text in code samples
    is not altered.

    Args:
        content: Markdown source text.

    Returns:
        The markdown text with the extra blank lines inserted.
    """
    # group(1): the fence marker run, group(2): whatever follows it on the line
    fence_line = re.compile(r'^\s*(`{3,}|~{3,})(.*)$')

    def is_table_row(line: str) -> bool:
        """Check if line is a markdown table row (with or without trailing pipe)."""
        stripped = line.strip()
        # Must start with a pipe and have at least one more (separator or cell).
        return stripped.startswith('|') and stripped.count('|') >= 2

    result: list[str] = []
    open_fence: str | None = None  # marker of the code fence we are inside
    prev_line: str | None = None   # None only while handling the first line

    for line in content.split('\n'):
        fence = fence_line.match(line)

        if open_fence is not None:
            # Inside a fenced block: never touch the text, only look for the
            # closing fence (same character, at least as long, nothing after).
            result.append(line)
            if (
                fence
                and fence.group(1).startswith(open_fence)
                and not fence.group(2).strip()
            ):
                open_fence = None
            prev_line = line
            continue

        # A backtick fence may not have backticks in its info string; such a
        # line is inline code, not the start of a block.
        if fence and not (fence.group(1)[0] == '`' and '`' in fence.group(2)):
            open_fence = fence.group(1)
        elif prev_line is not None and is_table_row(line):
            if prev_line.strip() and not is_table_row(prev_line):
                result.append('')  # Insert blank line

        result.append(line)
        prev_line = line

    return '\n'.join(result)
|
|
|
|
|
def _embed_mermaid_diagrams(md_content: str) -> str:
    """Replace every ```mermaid fence with an embedded PNG, or its escaped source on failure."""
    blocks = list(re.finditer(r'```mermaid\s*\n(.*?)```', md_content, re.DOTALL))
    print(f"Found {len(blocks)} mermaid diagram(s)")

    pieces: list[str] = []
    cursor = 0  # end of the previously replaced block in md_content
    for diagram_num, match in enumerate(blocks, start=1):
        print(f"Rendering diagram {diagram_num} of {len(blocks)}...")

        mermaid_code = match.group(1).strip()
        b64_png = render_mermaid_via_api(mermaid_code)

        if b64_png:
            replacement = f'\n\n<div class="mermaid-diagram"><img src="data:image/png;base64,{b64_png}" alt="Mermaid Diagram {diagram_num}"></div>\n\n'
        else:
            # Escape the diagram source so characters such as < and & are
            # shown literally instead of being parsed as markup.
            escaped_code = html.escape(mermaid_code, quote=False)
            replacement = f'\n\n<pre class="mermaid-fallback"><code>{escaped_code}</code></pre>\n\n'

        pieces.append(md_content[cursor:match.start()])
        pieces.append(replacement)
        cursor = match.end()

    pieces.append(md_content[cursor:])
    return ''.join(pieces)


def _build_style_section(page_size: str, custom_css: str | None) -> str:
    """Return the <head> styling markup: the custom stylesheet if given, else the built-in one."""
    if custom_css:
        # When using custom CSS, embed it then override page size (order matters for cascade)
        custom_css_b64 = base64.b64encode(custom_css.encode()).decode()
        return f'<link rel="stylesheet" href="data:text/css;base64,{custom_css_b64}"><style>@page {{ size: {page_size}; }}</style>'

    return f'''<style>
@page {{
    size: {page_size};
    margin: 2cm;
}}
body {{
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
    line-height: 1.6;
    max-width: 900px;
    margin: 0 auto;
    padding: 2rem;
    color: #333;
    background: #fff;
}}
h1, h2, h3, h4, h5, h6 {{
    color: #2c3e50;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
    page-break-after: avoid;
}}
h1 {{ border-bottom: 2px solid #3498db; padding-bottom: 0.3em; font-size: 1.8em; }}
h2 {{ border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; font-size: 1.4em; }}
h3 {{ font-size: 1.2em; }}
h4 {{ font-size: 1.1em; }}
code {{
    background: #f4f4f4;
    padding: 0.2em 0.4em;
    border-radius: 3px;
    font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
    font-size: 0.9em;
}}
pre {{
    background: #2d2d2d;
    color: #f8f8f2;
    padding: 1em;
    border-radius: 5px;
    overflow-x: auto;
    page-break-inside: avoid;
}}
pre code {{
    background: none;
    padding: 0;
    color: inherit;
}}
table {{
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
    page-break-inside: avoid;
}}
th, td {{
    border: 1px solid #ddd;
    padding: 0.5em 1em;
    text-align: left;
}}
th {{
    background: #f5f5f5;
}}
blockquote {{
    border-left: 4px solid #3498db;
    margin: 1em 0;
    padding-left: 1em;
    color: #666;
}}
a {{
    color: #3498db;
    text-decoration: none;
}}
hr {{
    border: none;
    border-top: 1px solid #ddd;
    margin: 2em 0;
}}
ul, ol {{
    padding-left: 2em;
}}
li {{
    margin: 0.3em 0;
}}
.mermaid-fallback {{
    background: #fff3cd;
    border: 1px solid #ffc107;
    color: #856404;
}}
.mermaid-diagram {{
    text-align: center;
    margin: 1.5em 0;
    page-break-inside: avoid;
}}
.mermaid-diagram img {{
    max-width: 100%;
    height: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
}}
input[type="checkbox"] {{
    margin-right: 0.5em;
}}
/* Print-specific */
@media print {{
    body {{
        padding: 0;
        max-width: none;
    }}
    a {{
        text-decoration: none;
    }}
}}
</style>'''


def convert_md_to_html(md_content: str, title: str = "Document", landscape: bool = False, custom_css: str | None = None) -> str:
    """Convert markdown to HTML with mermaid diagrams rendered as embedded PNGs.

    Args:
        md_content: Markdown source text.
        title: Text for the document <title>; it is HTML-escaped before use.
        landscape: Use "A4 landscape" instead of "A4" for the @page size.
        custom_css: If provided (and non-empty), this stylesheet is embedded
            in place of the built-in styles; only an @page size rule is
            added after it.

    Returns:
        A complete HTML document as a string.
    """
    # Mermaid blocks are rendered BEFORE markdown conversion so the fences are
    # never seen by the markdown parser.
    modified_content = _embed_mermaid_diagrams(md_content)

    # Ensure blank lines before lists and tables for proper markdown parsing
    modified_content = ensure_blank_lines_before_lists(modified_content)
    modified_content = ensure_blank_lines_before_tables(modified_content)

    # Convert markdown to HTML
    md = markdown.Markdown(extensions=['tables', 'fenced_code', 'toc'])
    html_body = md.convert(modified_content)

    # Wrap in full HTML document with print-friendly styles
    page_size = "A4 landscape" if landscape else "A4"
    style_section = _build_style_section(page_size, custom_css)

    # The title usually comes from the document's first heading, so it may
    # contain characters that are special in HTML.
    safe_title = html.escape(title)

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{safe_title}</title>
{style_section}
</head>
<body>
{html_body}
</body>
</html>'''
|
|
|
|
|
def _fetch_raw_url(url: str) -> tuple[str, str]:
    """Fetch markdown content from a raw URL.

    Exits the process with status 1 (after printing to stderr) when the
    request fails or the server answers with a non-200 status.

    Returns (content, name_hint), where name_hint is the last path component
    of the final (post-redirect) URL, or "document.md" if the path has none.
    """
    print(f"Downloading markdown from: {url}")
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:  # pragma: no cover - network failure path
        print(f"Error: failed to fetch URL: {exc}", file=sys.stderr)
        sys.exit(1)

    if response.status_code != 200:
        print(f"Error: URL returned status {response.status_code}", file=sys.stderr)
        sys.exit(1)

    # For text/* responses that declare no charset, requests decodes
    # `.text` as ISO-8859-1, which garbles non-ASCII characters in UTF-8
    # markdown. Markdown is assumed to be UTF-8 unless the server says otherwise.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"

    parsed = urlparse(response.url)
    name_hint = Path(parsed.path).name or "document.md"
    return response.text, name_hint
|
|
|
|
|
def _fetch_md_from_gist_page(url: str, fragment: str | None) -> tuple[str, str]:
    """Fetch markdown from a GitHub gist page using the Raw link for the file.

    The fragment (e.g. "file-rio_bulk_upload_dependency_analysis-md") is used to
    locate the correct file block before resolving its Raw link. If there is no
    fragment, or the block it names has no Raw link, the first Raw link found
    anywhere on the page is used instead.

    Exits the process with status 1 (after printing to stderr) when the page
    cannot be fetched or no Raw link can be found.

    Returns (content, filename).
    """
    print(f"Downloading gist page: {url}")
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:  # pragma: no cover - network failure path
        print(f"Error: failed to fetch gist page: {exc}", file=sys.stderr)
        sys.exit(1)

    if response.status_code != 200:
        print(f"Error: gist page returned status {response.status_code}", file=sys.stderr)
        sys.exit(1)

    soup = BeautifulSoup(response.text, "html.parser")

    def is_raw_href(href: object) -> bool:
        # Match the "/raw/" path segment rather than the bare letters "raw",
        # so user or file names that merely contain "raw" are not mistaken
        # for the Raw link.
        # NOTE(review): assumes gist Raw links contain a "/raw/" segment - confirm.
        return isinstance(href, str) and "/raw/" in href

    # Search the file block named by the fragment first, then the whole page.
    scopes = []
    if fragment:
        container = soup.find(id=fragment)
        if container is not None:
            scopes.append(container)
    scopes.append(soup)

    raw_url: str | None = None
    for scope in scopes:
        link = scope.find("a", href=is_raw_href)
        if link is not None and link.has_attr("href"):
            # hrefs may be relative; resolve against the final page URL.
            raw_url = urljoin(response.url, link["href"])
            break

    if raw_url is None:
        print("Error: could not locate Raw link on gist page", file=sys.stderr)
        sys.exit(1)

    print(f" Resolved Raw URL: {raw_url}")
    return _fetch_raw_url(raw_url)
|
|
|
|
|
def fetch_markdown_from_url(url: str) -> tuple[str, str]:
    """Fetch markdown content from a URL, handling known hosts specially.

    URLs on gist.github.com (or a subdomain of it) are treated as gist pages
    and resolved through their Raw link, using the URL fragment to pick the
    file. Every other URL is downloaded as-is.

    Returns (content, name_hint).
    """
    parsed = urlparse(url)
    # `hostname` excludes any port or userinfo and is already lower-cased;
    # it is None when the URL has no network location.
    host = (parsed.hostname or "").lower()

    # Exact or dot-separated suffix match, so look-alike hosts such as
    # "notgist.github.com" are not treated as gist pages.
    if host == "gist.github.com" or host.endswith(".gist.github.com"):
        fragment = parsed.fragment or None
        return _fetch_md_from_gist_page(url, fragment)

    return _fetch_raw_url(url)
|
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """CLI entry point.

    Parses the command line, loads the markdown (from a local .md file or,
    with --url, by downloading it), converts it to HTML and writes the
    requested HTML and/or PDF output files.

    Args:
        argv: Argument list without the program name; defaults to
            sys.argv[1:].

    Exits with status 1 when the input or CSS file is missing or the input
    file is not a .md file.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(
        prog="md2pdf",
        description=(
            "Convert markdown files to PDF or HTML with Mermaid diagram rendering."
        ),
    )
    parser.add_argument(
        "input",
        metavar="INPUT",
        help="Markdown input file or URL (with --url)",
    )
    parser.add_argument(
        "output",
        nargs="?",
        metavar="OUTPUT",
        help="Optional output file (PDF or HTML). Defaults to INPUT basename.",
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=["pdf", "html", "both"],
        default="pdf",
        help="Output format: pdf (default), html, or both.",
    )
    parser.add_argument(
        "--url",
        action="store_true",
        help="Treat INPUT as a URL and download markdown before converting.",
    )
    parser.add_argument(
        "--landscape",
        action="store_true",
        help=(
            "Use landscape orientation for PDF output (note: may interact with custom CSS @page rules)."
        ),
    )
    parser.add_argument(
        "--css",
        metavar="CSS_FILE",
        help="Path to custom CSS file to override default styles.",
    )

    args = parser.parse_args(argv)

    # Base for resolving relative links (e.g. images) when rendering the PDF.
    # Only known for local files; left unset for downloaded input.
    base_url: str | None = None

    if args.url:
        md_content, name_hint = fetch_markdown_from_url(args.input)
        source_name = name_hint or "document.md"
        input_label = args.input
        default_base = Path(source_name).with_suffix("")
    else:
        input_path = Path(args.input)

        if not input_path.is_file():
            print(f"Error: File not found: {input_path}", file=sys.stderr)
            sys.exit(1)

        if input_path.suffix.lower() != ".md":
            print(f"Error: File must be markdown (.md): {input_path}", file=sys.stderr)
            sys.exit(1)

        input_label = str(input_path)
        md_content = input_path.read_text(encoding="utf-8")
        source_name = input_path.name
        default_base = input_path.with_suffix("")
        base_url = input_path.resolve().as_uri()

    if args.output:
        # Drop the extension the user typed; the real one is chosen by --format.
        output_base = Path(args.output).with_suffix("")
    else:
        output_base = default_base

    html_path: Path | None = None
    pdf_path: Path | None = None

    # Append the extension instead of using with_suffix(): a base such as
    # "report.v1" must become "report.v1.pdf", not "report.pdf".
    if args.format in ("html", "both"):
        html_path = output_base.with_name(f"{output_base.name}.html")
    if args.format in ("pdf", "both"):
        pdf_path = output_base.with_name(f"{output_base.name}.pdf")

    print(f"Reading: {input_label}")

    # Extract title from first H1; fall back to the source file's stem.
    title_match = re.search(r"^#\s+(.+)$", md_content, re.MULTILINE)
    title_fallback = Path(source_name).stem
    # strip() removes trailing whitespace, including a stray "\r" from CRLF text.
    title = title_match.group(1).strip() if title_match else title_fallback

    # Load custom CSS if provided
    custom_css = None
    if args.css:
        css_path = Path(args.css)
        if not css_path.is_file():
            print(f"Error: CSS file not found: {css_path}", file=sys.stderr)
            sys.exit(1)
        custom_css = css_path.read_text(encoding="utf-8")
        print(f"Loaded custom CSS from: {css_path}")

    print("Converting markdown to HTML...")
    html_content = convert_md_to_html(md_content, title, landscape=args.landscape, custom_css=custom_css)

    if html_path is not None:
        print(f"Writing HTML: {html_path}")
        html_path.write_text(html_content, encoding="utf-8")

    if pdf_path is not None:
        print(f"Generating PDF: {pdf_path}")
        # base_url lets WeasyPrint resolve relative URLs against the input
        # file's location instead of failing to load them.
        HTML(string=html_content, base_url=base_url).write_pdf(pdf_path)

    print("Done!")
    if html_path is not None:
        print(f" HTML: {html_path}")
    if pdf_path is not None:
        print(f" PDF: {pdf_path}")
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()