Skip to content

Instantly share code, notes, and snippets.

@Rugby-Ball
Created March 10, 2025 15:47
Show Gist options
  • Select an option

  • Save Rugby-Ball/cb7b7a3dc0b05f8f6ca9dd606736cf9f to your computer and use it in GitHub Desktop.

Select an option

Save Rugby-Ball/cb7b7a3dc0b05f8f6ca9dd606736cf9f to your computer and use it in GitHub Desktop.
Use this to extract PDF content and images from a protected PDF file. #Python #PDF #Public
# Required Libraries
# pip install pymupdf pdfplumber Pillow
import fitz # PyMuPDF
import pdfplumber
import os
# --- Configuration ---------------------------------------------------------
pdf_file = "2024-SOC2-Report-for-Forsta-Division-Products.pdf"
output_dir = "extracted_markdown"
images_dir = os.path.join(output_dir, "images")
start_page = 35  # first page to extract (1-based, inclusive)
end_page = 141  # last page to extract (1-based, inclusive)


def clamp_page_range(first_page, last_page, page_count) -> range:
    """Return the 0-based page indices for a 1-based inclusive page span.

    The span is clipped to ``[1, page_count]`` so that a document shorter
    than the configured span yields fewer (possibly zero) indices instead
    of an out-of-range page lookup.
    """
    first_index = max(first_page - 1, 0)
    stop_index = min(last_page, page_count)
    return range(first_index, stop_index)


def clean_cell(cell) -> str:
    """Normalise one table cell for use inside a Markdown table row.

    Empty/``None`` cells become ``""``. Surrounding whitespace is stripped,
    embedded newlines are flattened to spaces, and literal ``|`` characters
    are backslash-escaped so they are not read as column separators.
    """
    if not cell:
        return ""
    return cell.strip().replace("\n", " ").replace("|", "\\|")


def format_row(row) -> str:
    """Render one sequence of cells as a single Markdown table line."""
    return "| " + " | ".join(clean_cell(cell) for cell in row) + " |"


def table_to_markdown(table) -> str:
    """Render a table (a list of rows of cells) as a Markdown table.

    The first row is used as the header. The result ends with a blank line;
    an empty table yields an empty string.
    """
    if not table:
        return ""
    header_row = table[0]
    lines = [
        format_row(header_row),
        "|" + "|".join(["---"] * len(header_row)) + "|",
    ]
    lines.extend(format_row(row) for row in table[1:])
    return "\n".join(lines) + "\n\n"


def extract_page_images(fitz_pdf, page_num, target_dir) -> list:
    """Save every image referenced by a page and return Markdown embeds.

    Each image is written to ``target_dir`` as
    ``page_<1-based page>_img_<xref>.<ext>``; the returned list holds one
    Markdown image link (relative to ``images/``) per saved file.
    """
    embeds = []
    fitz_page = fitz_pdf.load_page(page_num)
    for img in fitz_page.get_images(full=True):
        xref = img[0]  # first item of each image entry is its xref number
        base_image = fitz_pdf.extract_image(xref)
        if not base_image:
            # NOTE(review): presumably an empty result means the xref is not
            # an extractable image - confirm against the PyMuPDF docs.
            continue
        image_filename = f"page_{page_num+1}_img_{xref}.{base_image['ext']}"
        image_path = os.path.join(target_dir, image_filename)
        with open(image_path, 'wb') as img_file:
            img_file.write(base_image["image"])
        embeds.append(
            f"![Image from page {page_num+1}](images/{image_filename})\n\n"
        )
    return embeds


def main():
    """Extract text, tables and images from the configured page span.

    Writes the images under ``images_dir`` and a single Markdown file,
    ``Extracted_SOC2_Report.md``, under ``output_dir``.
    """
    os.makedirs(images_dir, exist_ok=True)

    # Collect fragments and join once instead of growing a string with +=.
    parts = []

    # Open each library's view of the PDF exactly once; both are closed when
    # the with-block exits, even if extraction fails part-way through.
    with pdfplumber.open(pdf_file) as plumber_pdf, fitz.open(pdf_file) as fitz_pdf:
        page_indices = clamp_page_range(
            start_page, end_page, len(plumber_pdf.pages)
        )
        for page_num in page_indices:
            # Page heading (was computed but never emitted before).
            parts.append(f"# Page {page_num + 1}\n\n")

            # Extract text using pdfplumber
            plumber_page = plumber_pdf.pages[page_num]
            text = plumber_page.extract_text()
            if text:
                parts.append(f"{text}\n\n")

            # Extract tables using pdfplumber and format as Markdown tables
            for table in plumber_page.extract_tables():
                parts.append(table_to_markdown(table))

            # Extract images using PyMuPDF (fitz) and embed them
            parts.extend(extract_page_images(fitz_pdf, page_num, images_dir))

    markdown_output = "".join(parts)

    # Save final Markdown file
    output_markdown_path = os.path.join(output_dir, "Extracted_SOC2_Report.md")
    with open(output_markdown_path, 'w', encoding="utf-8") as md_file:
        md_file.write(markdown_output)
    print(f"Extraction complete. Markdown saved at '{output_markdown_path}'.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment