fedir/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Prerequisites


Install required packages:

pip install google-cloud-vision google-cloud-storage


Set up Google Cloud:

Create a Google Cloud project
Enable the Cloud Vision API
Create a service account and download the JSON key
Create a Google Cloud Storage bucket (required for PDF processing)


Set credentials:


export GOOGLE_APPLICATION_CREDENTIALS="path/to/your-service-account-key.json"
Usage

python french_ocr.py your_document.pdf your-bucket-name output-name
Key Features


Optimized for French: Uses language hints to improve accuracy
Table-aware: Vision API preserves table structure and layout
Multi-page support: Handles multi-page documents (the script waits up to 10 minutes for the OCR job to finish)
Dual output: Saves both plain text and detailed JSON with per-page confidence scores
Automatic cleanup: Removes temporary files from GCS after processing

Why GCS is Required

Google Cloud Vision requires PDFs to be in Google Cloud Storage (GCS) for processing. The script handles this automatically by:

Uploading your PDF temporarily
Processing it with Vision API
Downloading results
Cleaning up temporary files

Output

You'll get two files:

output-name_full_text.txt - Complete extracted text
output-name_detailed.json - Per-page text with confidence scores

The Vision API typically achieves 95%+ accuracy on French documents, especially with clear scans and printed text. Tables are preserved with their spatial layout!

  
## french_ocr.py
#!/usr/bin/env python3
"""
Google Cloud Vision API - PDF OCR Script for French Documents
Extracts text from PDF files with high accuracy, preserving table structure.
"""

from google.cloud import vision
from google.cloud import storage
import json
import os
from pathlib import Path

def ocr_pdf_with_vision(
    pdf_path: str,
    output_prefix: str = "ocr_output",
    gcs_bucket_name: str = None,
    language_hints: list = None
):
    """
    OCR a PDF file using the Google Cloud Vision asynchronous file API.

    The PDF is uploaded to ``temp/`` in the bucket, annotated with
    DOCUMENT_TEXT_DETECTION, and the JSON result objects found under
    ``ocr-results/<output_prefix>/`` are downloaded and merged. Two local
    files are written: ``<output_prefix>_full_text.txt`` and
    ``<output_prefix>_detailed.json``. The uploaded PDF and the result
    objects are deleted from GCS afterwards, even when processing fails.

    Args:
        pdf_path: Path to the local PDF file
        output_prefix: Prefix for the local output files and the GCS result folder
        gcs_bucket_name: GCS bucket name (required for Vision API)
        language_hints: Language hints for OCR (default: ["fr"], French)

    Returns:
        Dictionary with 'full_text' (page texts joined by blank lines),
        'pages' (list of dicts with 'page_number', 'text', 'confidence')
        and 'total_pages' (number of pages that contained text)

    Raises:
        ValueError: If gcs_bucket_name is empty or None
    """

    # Validate before creating any client so a missing bucket fails fast,
    # without needing credentials or network access.
    if not gcs_bucket_name:
        raise ValueError("GCS bucket name is required for PDF processing")

    # A list literal as default would be shared between calls.
    if language_hints is None:
        language_hints = ["fr"]

    # Initialize clients
    vision_client = vision.ImageAnnotatorClient()
    storage_client = storage.Client()

    bucket = storage_client.bucket(gcs_bucket_name)

    # Upload PDF to GCS (Vision API requires PDFs to be in GCS)
    pdf_filename = Path(pdf_path).name
    source_blob = bucket.blob(f"temp/{pdf_filename}")
    results_prefix = f"ocr-results/{output_prefix}/"

    gcs_source_uri = f"gs://{gcs_bucket_name}/temp/{pdf_filename}"
    gcs_destination_uri = f"gs://{gcs_bucket_name}/{results_prefix}"

    uploaded = False
    result_blobs = []
    try:
        print(f"Uploading {pdf_path} to {gcs_source_uri}...")
        source_blob.upload_from_filename(pdf_path)
        uploaded = True

        # Configure the request
        input_config = vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type="application/pdf"
        )

        output_config = vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=100  # Pages per output file
        )

        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

        # Add language hints for better accuracy
        image_context = vision.ImageContext(language_hints=language_hints)

        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config,
            image_context=image_context
        )

        print("Starting OCR process (this may take a few minutes)...")
        operation = vision_client.async_batch_annotate_files(requests=[async_request])

        print("Waiting for operation to complete...")
        operation.result(timeout=600)  # Wait up to 10 minutes

        print("OCR complete! Processing results...")

        # Get the output files from GCS
        result_blobs = list(bucket.list_blobs(prefix=results_prefix))

        all_pages = []
        responses_seen = 0  # running count across ALL output files

        for result_blob in result_blobs:
            payload = json.loads(result_blob.download_as_bytes().decode('utf-8'))

            # Extract text from each page
            for resp in payload.get('responses', []):
                responses_seen += 1
                annotation = resp.get('fullTextAnnotation')
                if not annotation:
                    continue  # page without detected text

                # Prefer the page number reported in the response; fall back
                # to the running count so numbering does not restart at 1
                # for every output file.
                page_number = resp.get('context', {}).get('pageNumber', responses_seen)

                # 'pages' may be missing or empty; never index an empty list.
                first_page = (annotation.get('pages') or [{}])[0]

                all_pages.append({
                    'page_number': page_number,
                    'text': annotation.get('text', ''),
                    'confidence': first_page.get('confidence', 0)
                })

        # Blob listing is alphabetical, which is not page order once output
        # file names differ in digit count; restore document order.
        all_pages.sort(key=lambda page: page['page_number'])
        all_text = [page['text'] for page in all_pages]

        # Save results locally
        output_txt = f"{output_prefix}_full_text.txt"
        output_json = f"{output_prefix}_detailed.json"

        with open(output_txt, 'w', encoding='utf-8') as f:
            f.write('\n\n=== PAGE BREAK ===\n\n'.join(all_text))

        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(all_pages, f, ensure_ascii=False, indent=2)

        print(f"\n✓ Results saved:")
        print(f"  - Full text: {output_txt}")
        print(f"  - Detailed JSON: {output_json}")
        print(f"  - Total pages processed: {len(all_pages)}")

        return {
            'full_text': '\n\n'.join(all_text),
            'pages': all_pages,
            'total_pages': len(all_pages)
        }
    finally:
        # Cleanup: delete temporary files from GCS. Runs on success and on
        # failure; a failed delete is reported but never masks the outcome
        # of the OCR run itself.
        stale_blobs = ([source_blob] if uploaded else []) + result_blobs
        for stale in stale_blobs:
            try:
                stale.delete()
            except Exception as cleanup_error:
                print(f"Warning: could not delete {stale.name} from GCS: {cleanup_error}")


def ocr_single_page_pdf(pdf_path: str, language_hints: list = None):
    """
    OCR the first page of a PDF directly without GCS (simpler for small files).

    The PDF bytes are sent inline through the Vision file-annotation call
    (``batch_annotate_files``) with an ``application/pdf`` input config,
    rather than being wrapped in ``vision.Image`` as if they were a picture.

    Args:
        pdf_path: Path to the local PDF file
        language_hints: Language hints for OCR (default: ["fr"], French)

    Returns:
        Extracted text string (empty if the API returned no page result)

    Raises:
        FileNotFoundError: If pdf_path does not exist
        Exception: If the Vision API reports an error for the file or page
    """

    # A list literal as default would be shared between calls.
    if language_hints is None:
        language_hints = ["fr"]

    # Read the file first so a bad path fails before any client is created.
    with open(pdf_path, 'rb') as f:
        content = f.read()

    vision_client = vision.ImageAnnotatorClient()

    file_request = vision.AnnotateFileRequest(
        input_config=vision.InputConfig(
            content=content,
            mime_type="application/pdf"
        ),
        features=[vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)],
        image_context=vision.ImageContext(language_hints=language_hints),
        pages=[1]  # this helper is for single-page documents
    )

    batch_response = vision_client.batch_annotate_files(requests=[file_request])

    # One request was sent, so there is one file-level response.
    file_response = batch_response.responses[0]
    if file_response.error.message:
        raise Exception(f"Error: {file_response.error.message}")

    if not file_response.responses:
        return ""

    page_response = file_response.responses[0]
    if page_response.error.message:
        raise Exception(f"Error: {page_response.error.message}")

    return page_response.full_text_annotation.text


if __name__ == "__main__":
    import sys

    # Command-line entry point:
    #   <script> <pdf_path> <gcs_bucket_name> [output_prefix]
    # The usage text shows the name the script was actually invoked as, so
    # it stays correct whatever the file is called.
    script_name = Path(sys.argv[0]).name

    # Usage and error messages go to stderr; every failure exits with 1.
    if len(sys.argv) < 3:
        print(f"Usage: python {script_name} <pdf_path> <gcs_bucket_name> [output_prefix]", file=sys.stderr)
        print("\nMake sure you have:", file=sys.stderr)
        print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable", file=sys.stderr)
        print("2. Created a GCS bucket for processing", file=sys.stderr)
        print("\nExample:", file=sys.stderr)
        print("  export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)
        print(f"  python {script_name} document.pdf my-bucket-name my-document", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2]
    prefix = sys.argv[3] if len(sys.argv) > 3 else "ocr_output"

    # Check if credentials are set
    if not os.getenv('GOOGLE_APPLICATION_CREDENTIALS'):
        print("⚠️  Warning: GOOGLE_APPLICATION_CREDENTIALS not set!", file=sys.stderr)
        print("Set it with: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)
        sys.exit(1)

    # Top-level boundary: report any failure and signal it via the exit code.
    try:
        result = ocr_pdf_with_vision(pdf_path, prefix, bucket_name)
        print(f"\n✓ Successfully processed {result['total_pages']} pages")
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
	#!/usr/bin/env python3
	"""
	Google Cloud Vision API - PDF OCR Script for French Documents
	Extracts text from PDF files with high accuracy, preserving table structure.
	"""

	from google.cloud import vision
	from google.cloud import storage
	import json
	import os
	from pathlib import Path

	def ocr_pdf_with_vision(
	    pdf_path: str,
	    output_prefix: str = "ocr_output",
	    gcs_bucket_name: str = None,
	    language_hints: list = None
	):
	    """
	    OCR a PDF file using the Google Cloud Vision asynchronous file API.

	    The PDF is uploaded to ``temp/`` in the bucket, annotated with
	    DOCUMENT_TEXT_DETECTION, and the JSON result objects found under
	    ``ocr-results/<output_prefix>/`` are downloaded and merged. Two local
	    files are written: ``<output_prefix>_full_text.txt`` and
	    ``<output_prefix>_detailed.json``. The uploaded PDF and the result
	    objects are deleted from GCS afterwards, even when processing fails.

	    Args:
	        pdf_path: Path to the local PDF file
	        output_prefix: Prefix for the local output files and the GCS result folder
	        gcs_bucket_name: GCS bucket name (required for Vision API)
	        language_hints: Language hints for OCR (default: ["fr"], French)

	    Returns:
	        Dictionary with 'full_text' (page texts joined by blank lines),
	        'pages' (list of dicts with 'page_number', 'text', 'confidence')
	        and 'total_pages' (number of pages that contained text)

	    Raises:
	        ValueError: If gcs_bucket_name is empty or None
	    """

	    # Validate before creating any client so a missing bucket fails fast,
	    # without needing credentials or network access.
	    if not gcs_bucket_name:
	        raise ValueError("GCS bucket name is required for PDF processing")

	    # A list literal as default would be shared between calls.
	    if language_hints is None:
	        language_hints = ["fr"]

	    # Initialize clients
	    vision_client = vision.ImageAnnotatorClient()
	    storage_client = storage.Client()

	    bucket = storage_client.bucket(gcs_bucket_name)

	    # Upload PDF to GCS (Vision API requires PDFs to be in GCS)
	    pdf_filename = Path(pdf_path).name
	    source_blob = bucket.blob(f"temp/{pdf_filename}")
	    results_prefix = f"ocr-results/{output_prefix}/"

	    gcs_source_uri = f"gs://{gcs_bucket_name}/temp/{pdf_filename}"
	    gcs_destination_uri = f"gs://{gcs_bucket_name}/{results_prefix}"

	    uploaded = False
	    result_blobs = []
	    try:
	        print(f"Uploading {pdf_path} to {gcs_source_uri}...")
	        source_blob.upload_from_filename(pdf_path)
	        uploaded = True

	        # Configure the request
	        input_config = vision.InputConfig(
	            gcs_source=vision.GcsSource(uri=gcs_source_uri),
	            mime_type="application/pdf"
	        )

	        output_config = vision.OutputConfig(
	            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
	            batch_size=100  # Pages per output file
	        )

	        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

	        # Add language hints for better accuracy
	        image_context = vision.ImageContext(language_hints=language_hints)

	        async_request = vision.AsyncAnnotateFileRequest(
	            features=[feature],
	            input_config=input_config,
	            output_config=output_config,
	            image_context=image_context
	        )

	        print("Starting OCR process (this may take a few minutes)...")
	        operation = vision_client.async_batch_annotate_files(requests=[async_request])

	        print("Waiting for operation to complete...")
	        operation.result(timeout=600)  # Wait up to 10 minutes

	        print("OCR complete! Processing results...")

	        # Get the output files from GCS
	        result_blobs = list(bucket.list_blobs(prefix=results_prefix))

	        all_pages = []
	        responses_seen = 0  # running count across ALL output files

	        for result_blob in result_blobs:
	            payload = json.loads(result_blob.download_as_bytes().decode('utf-8'))

	            # Extract text from each page
	            for resp in payload.get('responses', []):
	                responses_seen += 1
	                annotation = resp.get('fullTextAnnotation')
	                if not annotation:
	                    continue  # page without detected text

	                # Prefer the page number reported in the response; fall back
	                # to the running count so numbering does not restart at 1
	                # for every output file.
	                page_number = resp.get('context', {}).get('pageNumber', responses_seen)

	                # 'pages' may be missing or empty; never index an empty list.
	                first_page = (annotation.get('pages') or [{}])[0]

	                all_pages.append({
	                    'page_number': page_number,
	                    'text': annotation.get('text', ''),
	                    'confidence': first_page.get('confidence', 0)
	                })

	        # Blob listing is alphabetical, which is not page order once output
	        # file names differ in digit count; restore document order.
	        all_pages.sort(key=lambda page: page['page_number'])
	        all_text = [page['text'] for page in all_pages]

	        # Save results locally
	        output_txt = f"{output_prefix}_full_text.txt"
	        output_json = f"{output_prefix}_detailed.json"

	        with open(output_txt, 'w', encoding='utf-8') as f:
	            f.write('\n\n=== PAGE BREAK ===\n\n'.join(all_text))

	        with open(output_json, 'w', encoding='utf-8') as f:
	            json.dump(all_pages, f, ensure_ascii=False, indent=2)

	        print(f"\n✓ Results saved:")
	        print(f" - Full text: {output_txt}")
	        print(f" - Detailed JSON: {output_json}")
	        print(f" - Total pages processed: {len(all_pages)}")

	        return {
	            'full_text': '\n\n'.join(all_text),
	            'pages': all_pages,
	            'total_pages': len(all_pages)
	        }
	    finally:
	        # Cleanup: delete temporary files from GCS. Runs on success and on
	        # failure; a failed delete is reported but never masks the outcome
	        # of the OCR run itself.
	        stale_blobs = ([source_blob] if uploaded else []) + result_blobs
	        for stale in stale_blobs:
	            try:
	                stale.delete()
	            except Exception as cleanup_error:
	                print(f"Warning: could not delete {stale.name} from GCS: {cleanup_error}")


	def ocr_single_page_pdf(pdf_path: str, language_hints: list = None):
	    """
	    OCR the first page of a PDF directly without GCS (simpler for small files).

	    The PDF bytes are sent inline through the Vision file-annotation call
	    (``batch_annotate_files``) with an ``application/pdf`` input config,
	    rather than being wrapped in ``vision.Image`` as if they were a picture.

	    Args:
	        pdf_path: Path to the local PDF file
	        language_hints: Language hints for OCR (default: ["fr"], French)

	    Returns:
	        Extracted text string (empty if the API returned no page result)

	    Raises:
	        FileNotFoundError: If pdf_path does not exist
	        Exception: If the Vision API reports an error for the file or page
	    """

	    # A list literal as default would be shared between calls.
	    if language_hints is None:
	        language_hints = ["fr"]

	    # Read the file first so a bad path fails before any client is created.
	    with open(pdf_path, 'rb') as f:
	        content = f.read()

	    vision_client = vision.ImageAnnotatorClient()

	    file_request = vision.AnnotateFileRequest(
	        input_config=vision.InputConfig(
	            content=content,
	            mime_type="application/pdf"
	        ),
	        features=[vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)],
	        image_context=vision.ImageContext(language_hints=language_hints),
	        pages=[1]  # this helper is for single-page documents
	    )

	    batch_response = vision_client.batch_annotate_files(requests=[file_request])

	    # One request was sent, so there is one file-level response.
	    file_response = batch_response.responses[0]
	    if file_response.error.message:
	        raise Exception(f"Error: {file_response.error.message}")

	    if not file_response.responses:
	        return ""

	    page_response = file_response.responses[0]
	    if page_response.error.message:
	        raise Exception(f"Error: {page_response.error.message}")

	    return page_response.full_text_annotation.text


	if __name__ == "__main__":
	    import sys

	    # Command-line entry point:
	    #   <script> <pdf_path> <gcs_bucket_name> [output_prefix]
	    # The usage text shows the name the script was actually invoked as, so
	    # it stays correct whatever the file is called.
	    script_name = Path(sys.argv[0]).name

	    # Usage and error messages go to stderr; every failure exits with 1.
	    if len(sys.argv) < 3:
	        print(f"Usage: python {script_name} <pdf_path> <gcs_bucket_name> [output_prefix]", file=sys.stderr)
	        print("\nMake sure you have:", file=sys.stderr)
	        print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable", file=sys.stderr)
	        print("2. Created a GCS bucket for processing", file=sys.stderr)
	        print("\nExample:", file=sys.stderr)
	        print(" export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)
	        print(f" python {script_name} document.pdf my-bucket-name my-document", file=sys.stderr)
	        sys.exit(1)

	    pdf_path = sys.argv[1]
	    bucket_name = sys.argv[2]
	    prefix = sys.argv[3] if len(sys.argv) > 3 else "ocr_output"

	    # Check if credentials are set
	    if not os.getenv('GOOGLE_APPLICATION_CREDENTIALS'):
	        print("⚠️ Warning: GOOGLE_APPLICATION_CREDENTIALS not set!", file=sys.stderr)
	        print("Set it with: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)
	        sys.exit(1)

	    # Top-level boundary: report any failure and signal it via the exit code.
	    try:
	        result = ocr_pdf_with_vision(pdf_path, prefix, bucket_name)
	        print(f"\n✓ Successfully processed {result['total_pages']} pages")
	    except Exception as e:
	        print(f"❌ Error: {e}", file=sys.stderr)
	        sys.exit(1)
No results found