PDF Summarizer
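A small command-line script: it extracts the text of a PDF with PyPDF2, sends it to a local Ollama model (nous-hermes2-mixtral:latest by default) with a prompt that asks for the document's substantive content rather than a topic overview, and writes the result to <pdf_name>_summary.txt in the current directory. It assumes the PyPDF2 and ollama Python packages are installed and that a local Ollama server has the model available.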
import sys
import os

import PyPDF2
from ollama import chat, ChatResponse


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text() + "\n"
            return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None


def summarize_with_ollama(text, model='nous-hermes2-mixtral:latest'):
    """Send text to Ollama for summarization."""
    try:
        response: ChatResponse = chat(model=model, messages=[
            {
                "role": "system",
                "content": "You are a knowledge distillation expert. Extract and present a summary of the ACTUAL CONTENT, not meta-descriptions. If a chapter teaches 'how to do X', explain HOW to do X with specific steps, methods, examples, and details. If it presents data/research, include the actual findings and numbers. If it gives advice, state the specific advice. Ignore author bios, acknowledgments, and table of contents. Focus on substantive knowledge someone could use to learn the subject matter.",
            },
            {
                "role": "user",
                "content": f"Extract the substantive knowledge from this document. Don't tell me what topics are covered - tell me the actual knowledge, methods, processes, data, and insights. Present individual content of each chapter as distilled bullet points without any reference to the author:\n\n{text}",
            }
        ])
        return response.message.content
    except Exception as e:
        print(f"Error with Ollama: {e}")
        return None


def main():
    if len(sys.argv) != 2:
        print("Usage: python pdf_summarizer.py <path_to_pdf_file>")
        print("Example: python pdf_summarizer.py document.pdf")
        sys.exit(1)

    pdf_path = sys.argv[1]
    print(f"Extracting text from: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)

    if not text:
        print("Failed to extract text from PDF.")
        sys.exit(1)

    print(f"Extracted {len(text)} characters from PDF.")
    print(f"Estimated text size: {len(text.encode('utf-8')) / 1024 / 1024:.1f} MB")

    # Warn for very large texts
    if len(text) > 500000:  # 500k characters
        print("⚠️ WARNING: This is a very large document!")
        print("   Consider using a smaller model or splitting the PDF.")
        response = input("Continue anyway? (y/n): ")
        if response.lower() != 'y':
            print("Aborted.")
            return

    print("Sending to Ollama for summarization...")
    summary = summarize_with_ollama(text)

    if summary:
        # Generate output filename based on input PDF
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_file = f"{pdf_name}_summary.txt"

        # Write to file
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write("=" * 50 + "\n")
                f.write(f"SUMMARY OF: {os.path.basename(pdf_path)}\n")
                f.write("=" * 50 + "\n\n")
                f.write(summary)
                f.write("\n\n" + "=" * 50 + "\n")
                f.write(f"Generated from: {pdf_path}\n")
                f.write("=" * 50 + "\n")
            print(f"\nSummary saved to: {output_file}")
            print(f"File size: {os.path.getsize(output_file)} bytes")
        except Exception as e:
            print(f"Error writing to file: {e}")
            print("\nFalling back to terminal output:")
            print("\n" + "=" * 50)
            print("SUMMARY:")
            print("=" * 50)
            print(summary)
    else:
        print("Failed to generate summary.")


if __name__ == "__main__":
    main()
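The script only warns when the extracted text exceeds 500,000 characters and suggests splitting the PDF. One way to act on that suggestion is to chunk the extracted text and summarize each chunk with the same summarize_with_ollama() helper. The sketch below is not part of the original script; the summarize_in_chunks name and the chunk_size value are illustrative assumptions.

def summarize_in_chunks(text, chunk_size=100_000):
    """Summarize a very large document by splitting it into fixed-size chunks.

    Hypothetical helper, not part of the original script; it reuses the
    summarize_with_ollama() function defined above on each chunk.
    """
    summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        summary = summarize_with_ollama(chunk)
        if summary:
            summaries.append(summary)
    # The joined per-chunk summaries could optionally be passed through
    # summarize_with_ollama() once more for a final condensed pass.
    return "\n\n".join(summaries)

In main(), such a helper could replace the direct summarize_with_ollama(text) call when the 500k-character warning fires, instead of asking the user whether to continue with the full text.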