Created
March 1, 2026 19:09
-
-
Save yunho-c/f712ad0e24aef6053039d73acd4dcbc0 to your computer and use it in GitHub Desktop.
Convert PDF to Markdown using `pdf_oxide`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Simple CLI example: convert a PDF file to Markdown using pdf_oxide.""" | |
| from pathlib import Path | |
| import typer | |
| from pdf_oxide import PdfDocument | |
# Typer application object; the ``convert`` command below registers itself on it.
# Shell-completion installation options are switched off for this small example.
app = typer.Typer(
    add_completion=False,
    help="Convert a PDF file to Markdown using pdf_oxide (all pages by default).",
)
@app.command()
def convert(
    pdf_path: Path = typer.Argument(..., help="Path to the input PDF file."),
    output: Path | None = typer.Option(
        None,
        "--output",
        "-o",
        help="Output Markdown file path. Defaults to <input_stem>.md in current directory.",
    ),
    detect_headings: bool = typer.Option(
        True,
        "--detect-headings/--no-detect-headings",
        help="Detect headings based on font size.",
    ),
    include_images: bool = typer.Option(
        False,
        "--include-images/--no-include-images",
        help="Include images in Markdown output.",
    ),
    preserve_layout: bool = typer.Option(
        False,
        "--preserve-layout/--no-preserve-layout",
        help="Preserve visual layout in conversion.",
    ),
) -> None:
    """Convert a specified PDF file into Markdown."""
    # NOTE: the docstring above is deliberately left as a single line because
    # Typer surfaces a command's docstring as its help text; extended notes
    # are kept in comments so the CLI output does not change.
    #
    # Flow: validate the input path -> choose the output path -> refuse to
    # overwrite the input -> convert all pages -> write UTF-8 Markdown.
    # Every failure prints a red message to stderr and exits with status 1.

    if not pdf_path.exists():
        typer.secho(f"Input file not found: {pdf_path}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
    if not pdf_path.is_file():
        typer.secho(f"Input path is not a file: {pdf_path}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)
    if pdf_path.suffix.lower() != ".pdf":
        # Only a warning: the file may still be a valid PDF under another name.
        typer.secho(
            f"Warning: input file does not end with .pdf: {pdf_path.name}",
            fg=typer.colors.YELLOW,
            err=True,
        )

    output_path = output if output is not None else Path.cwd() / f"{pdf_path.stem}.md"

    # Guard against data loss: if the output resolves to the very file we are
    # reading (same path, hard link, or symlink), writing Markdown would
    # destroy the source document.
    try:
        clobbers_input = output_path.samefile(pdf_path)
    except OSError:
        # The output does not exist yet (or cannot be stat'ed), so it cannot
        # be the input file; any real write problem is reported further down.
        clobbers_input = False
    if clobbers_input:
        typer.secho(
            f"Output path is the same file as the input: {output_path}",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=1)

    try:
        doc = PdfDocument(str(pdf_path))
        markdown = doc.to_markdown_all(
            preserve_layout=preserve_layout,
            detect_headings=detect_headings,
            include_images=include_images,
        )
    except (OSError, RuntimeError) as exc:
        typer.secho(f"Conversion failed: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    try:
        # Create missing parent directories so "-o some/new/dir/out.md" works.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(markdown, encoding="utf-8")
    except OSError as exc:
        typer.secho(f"Failed to write output file: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc

    typer.secho(f"Wrote Markdown to: {output_path}", fg=typer.colors.GREEN)
    typer.echo(f"Characters written: {len(markdown)}")
# Script entry point: hand control to the Typer app only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    app()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment