PDF Summarizer
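A small command-line script: it extracts the text of a PDF with PyPDF2, sends it to a local Ollama model (nous-hermes2-mixtral:latest by default) with a prompt that asks for the document's substantive content rather than a topic overview, and writes the result to <pdf_name>_summary.txt in the current directory. It assumes the PyPDF2 and ollama Python packages are installed and that a local Ollama server has the model available.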
import sys
import os

import PyPDF2
from ollama import chat, ChatResponse


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text() + "\n"
            return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


def summarize_with_ollama(text, model='nous-hermes2-mixtral:latest'):
    """Send text to Ollama for summarization."""
    try:
        response: ChatResponse = chat(model=model, messages=[
            {
                "role": "system",
                "content": "You are a knowledge distillation expert. Extract and present a summary of the ACTUAL CONTENT, not meta-descriptions. If a chapter teaches 'how to do X', explain HOW to do X with specific steps, methods, examples, and details. If it presents data/research, include the actual findings and numbers. If it gives advice, state the specific advice. Ignore author bios, acknowledgments, and table of contents. Focus on substantive knowledge someone could use to learn the subject matter.",
            },
            {
                "role": "user",
                "content": f"Extract the substantive knowledge from this document. Don't tell me what topics are covered - tell me the actual knowledge, methods, processes, data, and insights. Present individual content of each chapter as distilled bullet points without any reference to the author:\n\n{text}",
            }
        ])
        return response.message.content
    except Exception as e:
        print(f"Error with Ollama: {e}")
        return None


def main():
    if len(sys.argv) != 2:
        print("Usage: python pdf_summarizer.py <path_to_pdf_file>")
        print("Example: python pdf_summarizer.py document.pdf")
        sys.exit(1)

    pdf_path = sys.argv[1]
    print(f"Extracting text from: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)

    if not text:
        print("Failed to extract text from PDF.")
        sys.exit(1)

    print(f"Extracted {len(text)} characters from PDF.")
    print(f"Estimated text size: {len(text.encode('utf-8')) / 1024 / 1024:.1f} MB")

    # Warn for very large texts
    if len(text) > 500000:  # 500k characters
        print("⚠️ WARNING: This is a very large document!")
        print("   Consider using a smaller model or splitting the PDF.")
        response = input("Continue anyway? (y/n): ")
        if response.lower() != 'y':
            print("Aborted.")
            return

    print("Sending to Ollama for summarization...")
    summary = summarize_with_ollama(text)

    if summary:
        # Generate output filename based on input PDF
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_file = f"{pdf_name}_summary.txt"

        # Write to file
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("=" * 50 + "\n")
                f.write(f"SUMMARY OF: {os.path.basename(pdf_path)}\n")
                f.write("=" * 50 + "\n\n")
                f.write(summary)
                f.write("\n\n" + "=" * 50 + "\n")
                f.write(f"Generated from: {pdf_path}\n")
                f.write("=" * 50 + "\n")
            print(f"\nSummary saved to: {output_file}")
            print(f"File size: {os.path.getsize(output_file)} bytes")
        except Exception as e:
            print(f"Error writing to file: {e}")
            print("\nFalling back to terminal output:")
            print("\n" + "=" * 50)
            print("SUMMARY:")
            print("=" * 50)
            print(summary)
    else:
        print("Failed to generate summary.")


if __name__ == "__main__":
    main()
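The script only warns when the extracted text exceeds 500,000 characters and suggests splitting the PDF. One way to act on that suggestion is to chunk the extracted text and summarize each chunk with the same summarize_with_ollama() helper. The sketch below is not part of the original script; the summarize_in_chunks name and the chunk_size value are illustrative assumptions.

def summarize_in_chunks(text, chunk_size=100_000):
    """Summarize a very large document by splitting it into fixed-size chunks.

    Hypothetical helper, not part of the original script; it reuses the
    summarize_with_ollama() function defined above on each chunk.
    """
    summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        summary = summarize_with_ollama(chunk)
        if summary:
            summaries.append(summary)
    # The joined per-chunk summaries could optionally be passed through
    # summarize_with_ollama() once more for a final condensed pass.
    return "\n\n".join(summaries)

In main(), such a helper could replace the direct summarize_with_ollama(text) call when the 500k-character warning fires, instead of asking the user whether to continue with the full text.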