Skip to content

Instantly share code, notes, and snippets.

@alecordev
Created July 8, 2025 16:04
Show Gist options
  • Select an option

  • Save alecordev/9537f1a85895916a4a7b7fb4ec40fd39 to your computer and use it in GitHub Desktop.

Select an option

Save alecordev/9537f1a85895916a4a7b7fb4ec40fd39 to your computer and use it in GitHub Desktop.
Python PDF to HTML
#!/usr/bin/env python3
"""
PDF to HTML Converter Module
A reliable Python module for converting PDF files to standalone HTML with embedded resources.
Uses PyMuPDF (fitz) for maximum compatibility and reliability.
PyMuPDF>=1.23.0
Pillow>=9.0.0
"""
import fitz # PyMuPDF
import base64
import io
import os
import html
import logging
from pathlib import Path
from typing import Optional, Dict, List, Tuple
import re
class PDFToHTMLConverter:
    """
    Convert a PDF into a single self-contained HTML file using PyMuPDF.

    What the generated HTML contains:
    - One ``<div class="pdf-page">`` per PDF page, with a page-number badge
    - Text spans with bold/italic classes and inline font-size/color styles
    - Images embedded as base64 ``data:`` URIs

    Vector drawings are extracted into the intermediate content dictionary
    but are not rendered into the HTML by this class.
    """

    # Bit values of the ``flags`` entry of a PyMuPDF text span.
    _SPAN_FLAG_ITALIC = 2
    _SPAN_FLAG_BOLD = 16

    def __init__(self, embed_fonts: bool = True, preserve_layout: bool = True):
        """
        Initialize the converter.

        Args:
            embed_fonts: Stored on the instance; not read by any method of
                this class.
            preserve_layout: Stored on the instance; not read by any method
                of this class.
        """
        self.embed_fonts = embed_fonts
        self.preserve_layout = preserve_layout
        self.logger = logging.getLogger(__name__)
        # Setup logging. NOTE(review): this configures the root logger from
        # inside a constructor; kept because callers may rely on the INFO
        # output being visible without any setup of their own.
        logging.basicConfig(level=logging.INFO)

    def convert_pdf_to_html(
        self, pdf_path: str, output_path: Optional[str] = None
    ) -> str:
        """
        Convert a PDF file to standalone HTML.

        Args:
            pdf_path: Path to the input PDF file.
            output_path: Optional path for the output HTML file. When omitted,
                ``<pdf stem>.html`` is written relative to the current
                working directory.

        Returns:
            Path to the generated HTML file.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: Any other conversion error is logged and re-raised.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        try:
            # The context manager closes the document even when extraction
            # fails part-way through.
            with fitz.open(pdf_path) as doc:
                html_content = self._extract_content_from_pdf(doc)
            full_html = self._generate_complete_html(html_content, pdf_path)
            if output_path is None:
                output_path = f"{Path(pdf_path).stem}.html"
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_html)
            self.logger.info("Successfully converted %s to %s", pdf_path, output_path)
            return output_path
        except Exception as e:
            self.logger.error("Error converting PDF to HTML: %s", e)
            raise

    def _extract_content_from_pdf(self, doc: "fitz.Document") -> Dict:
        """
        Extract the content of every page of the document.

        Args:
            doc: Open PyMuPDF document.

        Returns:
            Dictionary with keys ``pages`` (one entry per page), ``images``
            and ``fonts`` (created empty and not filled here) and
            ``metadata`` (``doc.metadata``).
        """
        content = {"pages": [], "images": [], "fonts": set(), "metadata": doc.metadata}
        for page_num, page in enumerate(doc):
            content["pages"].append(self._extract_page_content(page, page_num))
        return content

    def _extract_page_content(self, page: "fitz.Page", page_num: int) -> Dict:
        """
        Extract text, images and drawings from a single page.

        Args:
            page: PyMuPDF page object.
            page_num: Zero-based page index.

        Returns:
            Dictionary with ``page_num``, ``dimensions`` (width/height of
            ``page.rect``), ``text_blocks``, ``images`` and ``drawings``.
        """
        page_rect = page.rect
        return {
            "page_num": page_num,
            "dimensions": {"width": page_rect.width, "height": page_rect.height},
            "text_blocks": self._extract_text_blocks(page),
            "images": self._extract_images(page, page_num),
            "drawings": self._extract_drawings(page),
        }

    @classmethod
    def _detect_font_style(cls, font_name: str, flags: int = 0) -> Tuple[bool, bool]:
        """
        Decide whether a span is bold and/or italic.

        A span counts as bold/italic when either its font name says so
        ("bold", "italic", "oblique") or the matching bit is set in the
        span ``flags``.

        Args:
            font_name: Font name reported for the span.
            flags: Integer ``flags`` value of the span (0 when unknown).

        Returns:
            Tuple ``(bold, italic)``.
        """
        lowered = font_name.lower()
        bold = "bold" in lowered or bool(flags & cls._SPAN_FLAG_BOLD)
        italic = (
            "italic" in lowered
            or "oblique" in lowered
            or bool(flags & cls._SPAN_FLAG_ITALIC)
        )
        return bold, italic

    def _extract_text_blocks(self, page: "fitz.Page") -> List[Dict]:
        """
        Extract text blocks with per-span formatting information.

        Args:
            page: PyMuPDF page object.

        Returns:
            List of dictionaries of the form
            ``{"type": "text", "bbox": ..., "lines": [{"bbox": ..., "spans": [...]}]}``
            where each span carries ``text``, ``bbox`` and ``font_info``.
        """
        text_blocks = []
        text_dict = page.get_text("dict")
        for block in text_dict["blocks"]:
            # Only blocks that carry "lines" are text blocks.
            if "lines" not in block:
                continue
            block_data = {"type": "text", "bbox": block["bbox"], "lines": []}
            for line in block["lines"]:
                line_data = {"bbox": line["bbox"], "spans": []}
                for span in line["spans"]:
                    font_name = span.get("font", "")
                    bold, italic = self._detect_font_style(
                        font_name, span.get("flags", 0)
                    )
                    font_info = {
                        "font": font_name,
                        "size": span.get("size", 12),
                        "color": span.get("color", 0),
                        "bold": bold,
                        "italic": italic,
                    }
                    line_data["spans"].append(
                        {
                            "text": span.get("text", ""),
                            "bbox": span.get("bbox", [0, 0, 0, 0]),
                            "font_info": font_info,
                        }
                    )
                block_data["lines"].append(line_data)
            text_blocks.append(block_data)
        return text_blocks

    def _extract_images(self, page: "fitz.Page", page_num: int) -> List[Dict]:
        """
        Extract the images of a page and encode them as base64.

        Images whose extraction fails are skipped with a logged warning.

        Args:
            page: PyMuPDF page object.
            page_num: Zero-based page index (stored with each image and used
                in warnings).

        Returns:
            List of dictionaries with ``index``, ``page``, ``format``
            ("png" or "jpeg"), ``data`` (base64 text), ``width``, ``height``
            and ``rects`` (result of ``page.get_image_rects``).
        """
        images = []
        for img_index, img in enumerate(page.get_images()):
            try:
                xref = img[0]
                pix = fitz.Pixmap(page.parent, xref)
                if pix.n - pix.alpha < 4:  # Can convert to PNG
                    img_format = "png"
                else:
                    # Four or more colour components (e.g. CMYK): convert to
                    # RGB first. JPEG cannot store an alpha channel, so fall
                    # back to PNG when one is present instead of failing.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                    img_format = "png" if pix.alpha else "jpeg"
                img_data = pix.tobytes(img_format)
                images.append(
                    {
                        "index": img_index,
                        "page": page_num,
                        "format": img_format,
                        "data": base64.b64encode(img_data).decode(),
                        "width": pix.width,
                        "height": pix.height,
                        "rects": page.get_image_rects(xref),
                    }
                )
                # Drop the reference so the pixmap can be released promptly.
                pix = None
            except Exception as e:
                self.logger.warning(
                    "Could not extract image %s from page %s: %s",
                    img_index,
                    page_num,
                    e,
                )
                continue
        return images

    def _extract_drawings(self, page: "fitz.Page") -> List[Dict]:
        """
        Extract summary information about the vector drawings of a page.

        A failure is logged as a warning; the drawings collected up to that
        point are returned.

        Args:
            page: PyMuPDF page object.

        Returns:
            List of dictionaries with ``type``, ``bbox``, ``stroke_color``,
            ``fill_color`` and ``width``.
        """
        drawings = []
        try:
            for path in page.get_drawings():
                drawings.append(
                    {
                        "type": "drawing",
                        "bbox": path.get("rect", [0, 0, 0, 0]),
                        "stroke_color": path.get("color", None),
                        "fill_color": path.get("fill", None),
                        "width": path.get("width", 1),
                    }
                )
        except Exception as e:
            self.logger.warning("Could not extract drawings: %s", e)
        return drawings

    def _generate_complete_html(self, content: Dict, pdf_path: str) -> str:
        """
        Build the complete HTML document.

        The stem of ``pdf_path`` is HTML-escaped and used both as the page
        title and as the visible heading.

        Args:
            content: Extracted PDF content.
            pdf_path: Original PDF file path.

        Returns:
            Complete HTML string.
        """
        pdf_name = Path(pdf_path).stem
        css = self._generate_css(content)
        body_html = self._generate_body_html(content)
        return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{html.escape(pdf_name)}</title>
<style>
{css}
</style>
</head>
<body>
<div class="pdf-container">
<h1 class="pdf-title">{html.escape(pdf_name)}</h1>
{body_html}
</div>
</body>
</html>"""

    def _generate_css(self, content: Dict) -> str:
        """
        Return the stylesheet embedded in the HTML document.

        Args:
            content: Extracted PDF content (not used; the stylesheet is
                static).

        Returns:
            CSS string.
        """
        return """
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
.pdf-container {
max-width: 1200px;
margin: 0 auto;
background-color: white;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
border-radius: 8px;
overflow: hidden;
}
.pdf-title {
background-color: #2c3e50;
color: white;
padding: 20px;
margin: 0;
font-size: 24px;
font-weight: bold;
}
.pdf-page {
padding: 20px;
border-bottom: 2px solid #ecf0f1;
position: relative;
min-height: 400px;
}
.pdf-page:last-child {
border-bottom: none;
}
.page-number {
position: absolute;
top: 10px;
right: 20px;
background-color: #3498db;
color: white;
padding: 5px 10px;
border-radius: 15px;
font-size: 12px;
font-weight: bold;
}
.text-block {
margin: 10px 0;
line-height: 1.4;
}
.text-span {
display: inline;
}
.bold {
font-weight: bold;
}
.italic {
font-style: italic;
}
.pdf-image {
max-width: 100%;
height: auto;
display: block;
margin: 15px 0;
border: 1px solid #ddd;
border-radius: 4px;
}
.image-container {
text-align: center;
margin: 20px 0;
}
.drawing-element {
position: absolute;
border: 1px solid #ccc;
}
@media print {
.pdf-container {
box-shadow: none;
border-radius: 0;
}
.pdf-page {
page-break-after: always;
}
.pdf-page:last-child {
page-break-after: auto;
}
}
"""

    def _generate_body_html(self, content: Dict) -> str:
        """
        Render every page and join the results with newlines.

        Args:
            content: Extracted PDF content; only ``content["pages"]`` is read.

        Returns:
            HTML body string.
        """
        return "\n".join(
            self._generate_page_html(page_data) for page_data in content["pages"]
        )

    def _generate_page_html(self, page_data: Dict) -> str:
        """
        Generate HTML for a single page.

        All text blocks are emitted first, followed by all images of the
        page. Page numbers are shown one-based.

        Args:
            page_data: Page content data.

        Returns:
            HTML string for the page.
        """
        page_num = page_data["page_num"]
        html_parts = [
            f' <div class="pdf-page" id="page-{page_num + 1}">',
            f' <div class="page-number">Page {page_num + 1}</div>',
        ]
        for block in page_data["text_blocks"]:
            if block["type"] == "text":
                html_parts.append(f" {self._generate_text_block_html(block)}")
        for image in page_data["images"]:
            html_parts.append(f" {self._generate_image_html(image)}")
        html_parts.append(" </div>")
        return "\n".join(html_parts)

    def _generate_text_block_html(self, block: Dict) -> str:
        """
        Generate HTML for a text block.

        Each line becomes one row of concatenated ``<span>`` elements; span
        text is HTML-escaped. A font-size style is added only when the size
        differs from 12, and a color style only when the color is not 0.

        Args:
            block: Text block data.

        Returns:
            HTML string for the text block.
        """
        html_parts = ['<div class="text-block">']
        for line in block["lines"]:
            line_parts = []
            for span in line["spans"]:
                text = html.escape(span["text"])
                font_info = span["font_info"]

                classes = ["text-span"]
                if font_info["bold"]:
                    classes.append("bold")
                if font_info["italic"]:
                    classes.append("italic")

                styles = []
                if font_info["size"] != 12:
                    styles.append(f"font-size: {font_info['size']}px")
                # Color 0 is treated as the default black text.
                if font_info["color"] != 0:
                    styles.append(f"color: #{font_info['color']:06x}")

                class_list = " ".join(classes)
                class_attr = f' class="{class_list}"'
                style_list = "; ".join(styles)
                style_attr = f' style="{style_list}"' if styles else ""
                line_parts.append(f"<span{class_attr}{style_attr}>{text}</span>")
            # Lines without any span produce no output row.
            if line_parts:
                html_parts.append("".join(line_parts))
        html_parts.append("</div>")
        return "\n".join(html_parts)

    def _generate_image_html(self, image: Dict) -> str:
        """
        Generate HTML for an image embedded as a base64 data URI.

        Args:
            image: Image data with ``format``, ``data``, ``index``, ``width``
                and ``height``.

        Returns:
            HTML string for the image.
        """
        data_uri = f"data:image/{image['format']};base64,{image['data']}"
        return f"""<div class="image-container">
<img src="{data_uri}"
alt="PDF Image {image['index']}"
class="pdf-image"
width="{image['width']}"
height="{image['height']}">
</div>"""
def convert_pdf_to_html(
    pdf_path: str,
    output_path: Optional[str] = None,
    embed_fonts: bool = True,
    preserve_layout: bool = True,
) -> str:
    """
    One-call helper: build a PDFToHTMLConverter and run the conversion.

    Args:
        pdf_path: Path to the input PDF file.
        output_path: Optional output HTML file path.
        embed_fonts: Forwarded to the PDFToHTMLConverter constructor.
        preserve_layout: Forwarded to the PDFToHTMLConverter constructor.

    Returns:
        Path to the generated HTML file.
    """
    options = {"embed_fonts": embed_fonts, "preserve_layout": preserve_layout}
    return PDFToHTMLConverter(**options).convert_pdf_to_html(pdf_path, output_path)
if __name__ == "__main__":
    import sys

    # Command line: <pdf_file> is required, [output_file] is optional.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_to_html.py <pdf_file> [output_file]")
        sys.exit(1)

    pdf_file = cli_args[0]
    output_file = None
    if len(cli_args) > 1:
        output_file = cli_args[1]

    try:
        result = convert_pdf_to_html(pdf_file, output_file)
        print(f"Successfully converted PDF to HTML: {result}")
    except Exception as e:
        # Any failure is reported and turned into a non-zero exit status.
        print(f"Error: {str(e)}")
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment