Skip to content

Instantly share code, notes, and snippets.

@walmik
Created July 17, 2025 04:26
Show Gist options
  • Select an option

  • Save walmik/f7d96b098f5f21b98e1752437cbd02aa to your computer and use it in GitHub Desktop.

Select an option

Save walmik/f7d96b098f5f21b98e1752437cbd02aa to your computer and use it in GitHub Desktop.
PDF Summarizer
import sys
import PyPDF2
import os
from ollama import chat
from ollama import ChatResponse
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page in a PDF file.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages, each page followed by a
        newline, or ``None`` if the file could not be opened or parsed
        (the error is printed).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # A page may yield no text; fall back to "" so one empty page
            # cannot abort the whole extraction with a None + str error.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure
        # to the caller with None instead of propagating.
        print(f"Error reading PDF: {e}")
        return None
def summarize_with_ollama(text, model='nous-hermes2-mixtral:latest'):
    """Ask an Ollama chat model to distill the given document text.

    Builds a two-message conversation (system instructions plus a user
    request that embeds ``text``) and returns the content of the model's
    reply. Any exception is printed and reported to the caller as ``None``.
    """
    try:
        system_message = {
            "role": "system",
            "content": "You are a knowledge distillation expert. Extract and present a summary of the ACTUAL CONTENT, not meta-descriptions. If a chapter teaches 'how to do X', explain HOW to do X with specific steps, methods, examples, and details. If it presents data/research, include the actual findings and numbers. If it gives advice, state the specific advice. Ignore author bios, acknowledgments, and table of contents. Focus on substantive knowledge someone could use to learn the subject matter.",
        }
        user_message = {
            "role": "user",
            "content": f"Extract the substantive knowledge from this document. Don't tell me what topics are covered - tell me the actual knowledge, methods, processes, data, and insights. Present individual content of each chapter as distilled bullet points without any reference to the author:\n\n{text}",
        }
        conversation = [system_message, user_message]
        reply: ChatResponse = chat(model=model, messages=conversation)
        return reply.message.content
    except Exception as err:
        print(f"Error with Ollama: {err}")
        return None
def _write_summary_file(summary, pdf_path, output_file):
    """Write the summary to output_file between banner header and footer lines."""
    banner = "=" * 50
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(banner + "\n")
        f.write(f"SUMMARY OF: {os.path.basename(pdf_path)}\n")
        f.write(banner + "\n\n")
        f.write(summary)
        f.write("\n\n" + banner + "\n")
        f.write(f"Generated from: {pdf_path}\n")
        f.write(banner + "\n")


def main():
    """Command-line entry point: summarize the PDF named in ``sys.argv[1]``.

    Extracts the PDF text, asks for confirmation when the text exceeds
    500,000 characters, sends it to Ollama, and writes the summary to
    ``<pdf name>_summary.txt`` (a relative path, so it lands in the current
    working directory). If the file cannot be written, the summary is
    printed to the terminal instead.

    Exits with status 1 on a usage error, when no usable text could be
    extracted, or when no summary was produced. Declining the large-document
    prompt returns normally.
    """
    if len(sys.argv) != 2:
        print("Usage: python pdf_summarizer.py <path_to_pdf_file>")
        print("Example: python pdf_summarizer.py document.pdf")
        sys.exit(1)

    pdf_path = sys.argv[1]
    print(f"Extracting text from: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)

    # Extraction emits one newline per page even when a page has no text,
    # so a whitespace-only result means there is nothing to summarize.
    if not text or not text.strip():
        print("Failed to extract text from PDF.")
        sys.exit(1)

    print(f"Extracted {len(text)} characters from PDF.")
    print(f"Estimated text size: {len(text.encode('utf-8')) / 1024 / 1024:.1f} MB")

    # Warn for very large texts
    if len(text) > 500000:  # 500k characters
        print("⚠️ WARNING: This is a very large document!")
        print(" Consider using a smaller model or splitting the PDF.")
        try:
            answer = input("Continue anyway? (y/n): ")
        except EOFError:
            # No interactive stdin available: treat as "no" rather than crash.
            answer = ""
        if answer.strip().lower() != 'y':
            print("Aborted.")
            return

    print("Sending to Ollama for summarization...")
    summary = summarize_with_ollama(text)
    if not summary:
        print("Failed to generate summary.")
        # Match the other failure paths so scripts can detect the error.
        sys.exit(1)

    # Generate output filename based on input PDF
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_file = f"{pdf_name}_summary.txt"

    try:
        _write_summary_file(summary, pdf_path, output_file)
        print(f"\nSummary saved to: {output_file}")
        print(f"File size: {os.path.getsize(output_file)} bytes")
    except Exception as e:
        # Deliberate best-effort: never lose a generated summary just
        # because the output file could not be written.
        print(f"Error writing to file: {e}")
        print("\nFalling back to terminal output:")
        print("\n" + "="*50)
        print("SUMMARY:")
        print("="*50)
        print(summary)
# Run the command-line workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment