|
#!/usr/bin/env python3 |
|
""" |
|
Google Cloud Vision API - PDF OCR Script for French Documents |
|
Extracts text from PDF files with high accuracy, preserving table structure. |
|
""" |
|
|
|
from google.cloud import vision |
|
from google.cloud import storage |
|
import json |
|
import os |
|
from pathlib import Path |
|
|
|
def _first_page_confidence(annotation: dict):
    """Return the confidence of the first page of a fullTextAnnotation, or 0."""
    # `or [{}]` also covers an explicitly empty "pages" list, which would
    # otherwise raise IndexError on the [0] lookup.
    pages = annotation.get('pages') or [{}]
    return pages[0].get('confidence', 0)


def _collect_pages(payloads) -> list:
    """Flatten parsed Vision output files into page records ordered by page number."""
    records = []
    seen = 0
    for payload in payloads:
        for resp in payload.get('responses', []):
            seen += 1
            annotation = resp.get('fullTextAnnotation')
            if annotation is None:
                # No text annotation for this response; nothing to record.
                continue
            # Use the page number reported in the response context when it is
            # present; otherwise fall back to the running position so numbers
            # do not restart at 1 in every output file.
            page_number = resp.get('context', {}).get('pageNumber', seen)
            records.append({
                'page_number': page_number,
                'text': annotation.get('text', ''),
                'confidence': _first_page_confidence(annotation),
            })
    # Output files are listed by name, which is not guaranteed to match page
    # order, so order the records explicitly (sort is stable).
    records.sort(key=lambda record: record['page_number'])
    return records


def _delete_blobs_best_effort(blobs) -> None:
    """Delete each GCS blob, reporting (not raising) individual failures."""
    for stale in blobs:
        try:
            stale.delete()
        except Exception as exc:
            # Cleanup is best effort: it must not mask the real outcome of
            # the OCR run, but a leftover object should still be visible.
            print(f"⚠️  Could not delete temporary object {stale.name}: {exc}")


def ocr_pdf_with_vision(
    pdf_path: str,
    output_prefix: str = "ocr_output",
    gcs_bucket_name: str = None,
    language_hints: list = None,
):
    """
    OCR a PDF file using the Google Cloud Vision asynchronous file API.

    The PDF is uploaded to ``temp/`` in the bucket, processed with
    DOCUMENT_TEXT_DETECTION, and the JSON results written under
    ``ocr-results/<output_prefix>/`` are downloaded and parsed. The text is
    saved locally to ``<output_prefix>_full_text.txt`` and the per-page
    records to ``<output_prefix>_detailed.json``. The uploaded PDF and the
    result objects are deleted from the bucket afterwards, even on failure.

    Args:
        pdf_path: Path to the local PDF file
        output_prefix: Prefix for the local output files and the GCS result folder
        gcs_bucket_name: GCS bucket name (required)
        language_hints: Language hints for OCR (default: ``["fr"]``)

    Returns:
        Dictionary with keys ``full_text`` (page texts joined by blank
        lines), ``pages`` (list of dicts with ``page_number``, ``text`` and
        ``confidence``) and ``total_pages``.

    Raises:
        ValueError: If ``gcs_bucket_name`` is empty or None.
    """
    # Validate arguments before creating any client, so a bad call fails
    # without needing credentials or network access.
    if not gcs_bucket_name:
        raise ValueError("GCS bucket name is required for PDF processing")

    # Copy so the caller's list (or a shared default) is never aliased.
    hints = ["fr"] if language_hints is None else list(language_hints)

    # Initialize clients
    vision_client = vision.ImageAnnotatorClient()
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket_name)

    pdf_filename = Path(pdf_path).name
    source_object = f"temp/{pdf_filename}"
    results_prefix = f"ocr-results/{output_prefix}/"
    gcs_source_uri = f"gs://{gcs_bucket_name}/{source_object}"
    gcs_destination_uri = f"gs://{gcs_bucket_name}/{results_prefix}"

    # Upload PDF to GCS (the asynchronous request below reads it from there)
    source_blob = bucket.blob(source_object)
    print(f"Uploading {pdf_path} to gs://{gcs_bucket_name}/temp/{pdf_filename}...")
    source_blob.upload_from_filename(pdf_path)

    result_blobs = []
    try:
        # Configure the request
        async_request = vision.AsyncAnnotateFileRequest(
            features=[
                vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
            ],
            input_config=vision.InputConfig(
                gcs_source=vision.GcsSource(uri=gcs_source_uri),
                mime_type="application/pdf",
            ),
            output_config=vision.OutputConfig(
                gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
                batch_size=100,  # Pages per output file
            ),
            # Language hints for better accuracy
            image_context=vision.ImageContext(language_hints=hints),
        )

        print("Starting OCR process (this may take a few minutes)...")
        operation = vision_client.async_batch_annotate_files(requests=[async_request])

        print("Waiting for operation to complete...")
        operation.result(timeout=600)  # Wait up to 10 minutes

        print("OCR complete! Processing results...")

        # Get the output files from GCS
        result_blobs = list(bucket.list_blobs(prefix=results_prefix))
        payloads = [
            json.loads(result_blob.download_as_bytes().decode('utf-8'))
            for result_blob in result_blobs
            # Only JSON result files are parsed; anything else under the
            # prefix is still removed during cleanup.
            if result_blob.name.endswith('.json')
        ]

        all_pages = _collect_pages(payloads)
        all_text = [page['text'] for page in all_pages]

        # Save results locally
        output_txt = f"{output_prefix}_full_text.txt"
        output_json = f"{output_prefix}_detailed.json"

        with open(output_txt, 'w', encoding='utf-8') as f:
            f.write('\n\n=== PAGE BREAK ===\n\n'.join(all_text))

        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(all_pages, f, ensure_ascii=False, indent=2)

        print(f"\n✓ Results saved:")
        print(f"  - Full text: {output_txt}")
        print(f"  - Detailed JSON: {output_json}")
        print(f"  - Total pages processed: {len(all_pages)}")
    finally:
        # Cleanup: delete temporary files from GCS. The uploaded PDF is held
        # in its own variable so it cannot be confused with a result blob.
        _delete_blobs_best_effort([source_blob, *result_blobs])

    return {
        'full_text': '\n\n'.join(all_text),
        'pages': all_pages,
        'total_pages': len(all_pages)
    }
|
|
|
|
|
def ocr_single_page_pdf(pdf_path: str, language_hints: list = None):
    """
    OCR a single-page PDF directly without GCS (simpler for small files).

    The PDF bytes are sent inline through the synchronous file-annotation
    call with an ``application/pdf`` input config, and only page 1 is
    requested. (Passing PDF bytes as a ``vision.Image`` to the image
    endpoint does not work, because that endpoint expects image data.)

    Args:
        pdf_path: Path to the local PDF file
        language_hints: Language hints for OCR (default: ``["fr"]``)

    Returns:
        Extracted text string (empty if the service returned no page result)

    Raises:
        OSError: If the local file cannot be read.
        RuntimeError: If the Vision API reports an error for the file or page.
    """
    # Read the file first so a bad path fails before any client is created.
    with open(pdf_path, 'rb') as f:
        content = f.read()

    # Copy so the caller's list (or a shared default) is never aliased.
    hints = ["fr"] if language_hints is None else list(language_hints)

    vision_client = vision.ImageAnnotatorClient()

    file_request = vision.AnnotateFileRequest(
        input_config=vision.InputConfig(
            content=content,
            mime_type="application/pdf",
        ),
        features=[
            vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
        ],
        image_context=vision.ImageContext(language_hints=hints),
        pages=[1],  # This helper is for single-page documents
    )

    batch_response = vision_client.batch_annotate_files(requests=[file_request])

    # One request was sent, so there is one file-level response.
    file_response = batch_response.responses[0]
    if file_response.error.message:
        raise RuntimeError(f"Error: {file_response.error.message}")

    if not file_response.responses:
        return ""

    page_response = file_response.responses[0]
    if page_response.error.message:
        raise RuntimeError(f"Error: {page_response.error.message}")

    return page_response.full_text_annotation.text
|
|
|
|
|
def main(argv=None) -> int:
    """
    Command-line entry point: OCR a PDF through a GCS bucket.

    Args:
        argv: Argument list ``[pdf_path, gcs_bucket_name, [output_prefix]]``.
            Defaults to ``sys.argv[1:]`` when None.

    Returns:
        Process exit code: 0 on success, 1 on usage error, missing input
        file, or OCR failure.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)

    if len(args) < 2:
        # Usage and errors go to stderr so stdout stays clean for results.
        print("Usage: python ocr_pdf.py <pdf_path> <gcs_bucket_name> [output_prefix]", file=sys.stderr)
        print("\nMake sure you have:", file=sys.stderr)
        print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable", file=sys.stderr)
        print("2. Created a GCS bucket for processing", file=sys.stderr)
        print("\nExample:", file=sys.stderr)
        print("  export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)
        print("  python ocr_pdf.py document.pdf my-bucket-name my-document", file=sys.stderr)
        return 1

    pdf_path = args[0]
    bucket_name = args[1]
    prefix = args[2] if len(args) > 2 else "ocr_output"

    # Check the input locally before any cloud client is created.
    if not Path(pdf_path).is_file():
        print(f"❌ Error: PDF file not found: {pdf_path}", file=sys.stderr)
        return 1

    # A missing variable is only a warning: the Google client libraries can
    # also pick up credentials from other Application Default Credentials
    # sources. If none are available, the failure is reported below.
    if not os.getenv('GOOGLE_APPLICATION_CREDENTIALS'):
        print("⚠️  Warning: GOOGLE_APPLICATION_CREDENTIALS not set!", file=sys.stderr)
        print("Set it with: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'", file=sys.stderr)

    try:
        result = ocr_pdf_with_vision(pdf_path, prefix, bucket_name)
    except Exception as e:
        # Top-level boundary: report any failure and exit non-zero.
        print(f"❌ Error: {e}", file=sys.stderr)
        return 1

    print(f"\n✓ Successfully processed {result['total_pages']} pages")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())