Skip to content

Instantly share code, notes, and snippets.

@fedir
Created November 3, 2025 13:06
Show Gist options
  • Select an option

  • Save fedir/57a553e781ed75cb5dd8314f0b24b9ed to your computer and use it in GitHub Desktop.

Select an option

Save fedir/57a553e781ed75cb5dd8314f0b24b9ed to your computer and use it in GitHub Desktop.
Google Cloud Vision API - PDF OCR Script for French Documents: Extracts text from PDF files with high accuracy, preserving table structure.

Prerequisites

  1. Install required packages:
pip install google-cloud-vision google-cloud-storage
  2. Set up Google Cloud:

    • Create a Google Cloud project
    • Enable the Cloud Vision API
    • Create a service account and download the JSON key
    • Create a Google Cloud Storage bucket (required for PDF processing)
  3. Set credentials:

export GOOGLE_APPLICATION_CREDENTIALS="path/to/your-service-account-key.json"

Usage

python ocr_pdf.py your_document.pdf your-bucket-name output-name

Key Features

  • Optimized for French: Uses language hints to improve accuracy
  • Table-aware: Vision API preserves table structure and layout
  • Multi-page support: Handles documents of any size
  • Dual output: Saves both plain text and detailed JSON with per-page confidence scores
  • Automatic cleanup: Removes temporary files from GCS after processing

Why GCS is Required

Google Cloud Vision requires PDFs to be in Google Cloud Storage (GCS) for processing. The script handles this automatically by:

  1. Uploading your PDF temporarily
  2. Processing it with Vision API
  3. Downloading results
  4. Cleaning up temporary files

Output

You'll get two files:

  • output-name_full_text.txt - Complete extracted text
  • output-name_detailed.json - Per-page text with confidence scores

The Vision API typically achieves 95%+ accuracy on French documents, especially with clear scans and printed text. Tables are preserved with their spatial layout!

#!/usr/bin/env python3
"""
Google Cloud Vision API - PDF OCR Script for French Documents
Extracts text from PDF files with high accuracy, preserving table structure.
"""
from google.cloud import vision
from google.cloud import storage
import json
import os
from pathlib import Path
def ocr_pdf_with_vision(
    pdf_path: str,
    output_prefix: str = "ocr_output",
    gcs_bucket_name: str = None,
    language_hints: list = None,
    timeout: float = 600,
):
    """
    OCR a PDF file using Google Cloud Vision API.

    The PDF is uploaded to ``gs://<bucket>/temp/<file name>``, processed with
    an asynchronous DOCUMENT_TEXT_DETECTION request whose JSON output is
    written under ``gs://<bucket>/ocr-results/<output_prefix>/``, and the
    results are saved locally as ``<output_prefix>_full_text.txt`` and
    ``<output_prefix>_detailed.json``. The uploaded PDF and the result
    objects are deleted from the bucket afterwards, whether or not the OCR
    step succeeded.

    Args:
        pdf_path: Path to the local PDF file
        output_prefix: Prefix for output files (local files and GCS results)
        gcs_bucket_name: GCS bucket name (required for Vision API)
        language_hints: Language hints for OCR (default: ["fr"], French)
        timeout: Seconds to wait for the OCR operation (default: 600)

    Returns:
        Dictionary with keys 'full_text' (page texts joined by blank lines),
        'pages' (list of dicts with 'page_number', 'text', 'confidence')
        and 'total_pages'.

    Raises:
        ValueError: If gcs_bucket_name is empty or None.
    """
    # Validate before creating any client so a missing bucket name is
    # reported even when credentials are not configured.
    if not gcs_bucket_name:
        raise ValueError("GCS bucket name is required for PDF processing")
    # Build the default per call; a list default would be shared across calls.
    if language_hints is None:
        language_hints = ["fr"]

    # Initialize clients
    vision_client = vision.ImageAnnotatorClient()
    storage_client = storage.Client()
    bucket = storage_client.bucket(gcs_bucket_name)

    pdf_filename = Path(pdf_path).name
    source_blob_name = f"temp/{pdf_filename}"
    results_prefix = f"ocr-results/{output_prefix}/"
    gcs_source_uri = f"gs://{gcs_bucket_name}/{source_blob_name}"
    gcs_destination_uri = f"gs://{gcs_bucket_name}/{results_prefix}"

    # Upload PDF to GCS (Vision API requires PDFs to be in GCS)
    pdf_blob = bucket.blob(source_blob_name)
    print(f"Uploading {pdf_path} to {gcs_source_uri}...")
    pdf_blob.upload_from_filename(pdf_path)

    # Kept separate from pdf_blob so cleanup always targets the uploaded PDF
    # and each result object exactly once.
    result_blobs = []
    try:
        # Configure the request
        input_config = vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type="application/pdf",
        )
        output_config = vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=100,  # Pages per output file
        )
        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
        # Add language hints for better accuracy
        image_context = vision.ImageContext(language_hints=language_hints)
        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config,
            image_context=image_context,
        )

        print("Starting OCR process (this may take a few minutes)...")
        operation = vision_client.async_batch_annotate_files(requests=[async_request])
        print("Waiting for operation to complete...")
        operation.result(timeout=timeout)
        print("OCR complete! Processing results...")

        # Get the output files from GCS
        result_blobs = list(bucket.list_blobs(prefix=results_prefix))
        all_pages = []
        responses_seen = 0  # running count across all output files
        for result_blob in result_blobs:
            payload = json.loads(result_blob.download_as_bytes().decode('utf-8'))
            for resp in payload.get('responses', []):
                responses_seen += 1
                annotation = resp.get('fullTextAnnotation')
                if not annotation:
                    continue  # page without detected text
                # Prefer the page number recorded in the response context;
                # fall back to the running count so numbers never restart
                # at 1 when the output is split over several files.
                page_number = resp.get('context', {}).get('pageNumber', responses_seen)
                # 'pages' may be missing or empty; treat both as "no confidence".
                layout_pages = annotation.get('pages') or [{}]
                all_pages.append({
                    'page_number': page_number,
                    'text': annotation.get('text', ''),
                    'confidence': layout_pages[0].get('confidence', 0),
                })

        # Output files are listed in name order, which is not page order once
        # there are many of them, so order explicitly by page number.
        all_pages.sort(key=lambda page: page['page_number'])
        all_text = [page['text'] for page in all_pages]

        # Save results locally
        output_txt = f"{output_prefix}_full_text.txt"
        output_json = f"{output_prefix}_detailed.json"
        with open(output_txt, 'w', encoding='utf-8') as f:
            f.write('\n\n=== PAGE BREAK ===\n\n'.join(all_text))
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(all_pages, f, ensure_ascii=False, indent=2)

        print("\n✓ Results saved:")
        print(f" - Full text: {output_txt}")
        print(f" - Detailed JSON: {output_json}")
        print(f" - Total pages processed: {len(all_pages)}")
    finally:
        # Cleanup: delete temporary files from GCS. Best effort: a failed
        # delete is reported but does not hide the OCR result or an earlier
        # error.
        for temp_blob in [pdf_blob, *result_blobs]:
            try:
                temp_blob.delete()
            except Exception as cleanup_error:
                print(
                    f"⚠️ Could not delete gs://{gcs_bucket_name}/{temp_blob.name}: "
                    f"{cleanup_error}"
                )

    return {
        'full_text': '\n\n'.join(all_text),
        'pages': all_pages,
        'total_pages': len(all_pages),
    }
def ocr_single_page_pdf(pdf_path: str, language_hints: list = None):
    """
    OCR a single-page PDF directly without GCS (simpler for small files).

    The PDF bytes are sent inline through the synchronous file-annotation
    call with an explicit ``application/pdf`` MIME type, because the plain
    image endpoint only decodes raster images. Only page 1 is requested.

    Args:
        pdf_path: Path to the local PDF file
        language_hints: Language hints for OCR (default: ["fr"], French)

    Returns:
        Extracted text string (empty if the service returns no page response)

    Raises:
        Exception: If the Vision API reports an error for the file or the page.
    """
    # Build the default per call; a list default would be shared across calls.
    if language_hints is None:
        language_hints = ["fr"]

    vision_client = vision.ImageAnnotatorClient()
    with open(pdf_path, 'rb') as f:
        content = f.read()

    file_request = vision.AnnotateFileRequest(
        input_config=vision.InputConfig(content=content, mime_type="application/pdf"),
        features=[vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)],
        image_context=vision.ImageContext(language_hints=language_hints),
        pages=[1],  # single-page helper: only the first page is requested
    )
    batch_response = vision_client.batch_annotate_files(requests=[file_request])

    # One file was submitted, so there is one file-level response.
    file_response = batch_response.responses[0]
    if file_response.error.message:
        raise Exception(f"Error: {file_response.error.message}")
    if not file_response.responses:
        return ""

    page_response = file_response.responses[0]
    if page_response.error.message:
        raise Exception(f"Error: {page_response.error.message}")
    return page_response.full_text_annotation.text
if __name__ == "__main__":
    # Command-line entry point:
    #   python ocr_pdf.py <pdf_path> <gcs_bucket_name> [output_prefix]
    import sys

    if len(sys.argv) < 3:
        print("Usage: python ocr_pdf.py <pdf_path> <gcs_bucket_name> [output_prefix]")
        print("\nMake sure you have:")
        print("1. Set GOOGLE_APPLICATION_CREDENTIALS environment variable")
        print("2. Created a GCS bucket for processing")
        print("\nExample:")
        print(" export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'")
        print(" python ocr_pdf.py document.pdf my-bucket-name my-document")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2]
    prefix = sys.argv[3] if len(sys.argv) > 3 else "ocr_output"

    # Only warn when the variable is unset: the client libraries can also
    # pick up credentials from other Application Default Credentials sources
    # (e.g. gcloud user credentials), so exiting here would block valid setups.
    # A real authentication failure is still reported by the except below.
    if not os.getenv('GOOGLE_APPLICATION_CREDENTIALS'):
        print("⚠️ Warning: GOOGLE_APPLICATION_CREDENTIALS not set!", file=sys.stderr)
        print(
            "Set it with: export GOOGLE_APPLICATION_CREDENTIALS='path/to/key.json'",
            file=sys.stderr,
        )
        print("Continuing with Application Default Credentials...", file=sys.stderr)

    # Top-level boundary: report any failure and exit with a non-zero status.
    try:
        result = ocr_pdf_with_vision(pdf_path, prefix, bucket_name)
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    print(f"\n✓ Successfully processed {result['total_pages']} pages")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment