Created
November 20, 2025 14:05
-
-
Save peltho/888435f034d3567daa8419a5c4b73a5b to your computer and use it in GitHub Desktop.
Extracting Excel / Sheets table values from image into .csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from google.cloud import vision | |
| import sys | |
| import os | |
| import csv | |
| import re | |
| from PIL import Image | |
| import io | |
| from pathlib import Path | |
def process_image(image_path, client):
    """OCR a single table image with Google Vision and return structured rows.

    The image is rotated slightly (1.8°) before OCR to help Vision align the
    table rows, then words are bucketed into visual lines by Y position and
    split into columns by hard-coded X pixel ranges calibrated to the scans.

    Args:
        image_path: Path to the image file to process.
        client: An initialized ``vision.ImageAnnotatorClient``.

    Returns:
        A list of ``[name, addr1, addr2, bureau, tel, solde]`` rows;
        an empty list on OCR failure or when nothing usable was detected.
    """
    print(f"\nProcessing: {os.path.basename(image_path)}")
    try:
        # Rotate the scan by 1.8 degrees; expand=True keeps the whole image
        # visible and the white fill avoids black corner wedges.
        # Context manager closes the underlying file handle (the original
        # leaked it).
        with Image.open(image_path) as img:
            img_format = img.format if img.format else 'JPEG'
            img_rotated = img.rotate(1.8, expand=True, fillcolor='white')
        # Re-encode the rotated image into bytes for the Vision API.
        buffer = io.BytesIO()
        img_rotated.save(buffer, format=img_format)
        image = vision.Image(content=buffer.getvalue())
        # document_text_detection handles dense/tabular layouts better than
        # plain text_detection.
        response = client.document_text_detection(image=image)
        # BUGFIX: surface API-level errors instead of silently parsing an
        # empty annotation (per Google Vision error-handling guidance).
        if response.error.message:
            raise RuntimeError(response.error.message)
        document = response.full_text_annotation
        # Flatten the page/block/paragraph hierarchy into words, anchored at
        # the top-left vertex of each word's bounding box.
        words = []
        for page in document.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join(symbol.text for symbol in word.symbols)
                        vertices = word.bounding_box.vertices
                        words.append({
                            "text": word_text,
                            "x": vertices[0].x,
                            "y": vertices[0].y,
                        })
        if not words:
            print(" ⚠ No words detected")
            return []
    except Exception as e:
        # Best-effort per image: report and let the caller continue with the
        # remaining files.
        print(f" ❌ Error processing image: {e}")
        return []
    print(f" ✓ Detected {len(words)} words")
    # Group words into visual lines: words whose Y differs by less than
    # 15 px from an existing line's anchor belong to that line.
    lines = {}
    for w in words:
        closest_y = None
        for y in lines:
            if abs(y - w["y"]) < 15:  # Within 15 pixels = same line
                closest_y = y
                break
        if closest_y is not None:
            lines[closest_y].append(w)
        else:
            lines[w["y"]] = [w]
    # Top-to-bottom reading order.
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    print(f" ✓ Grouped into {len(sorted_lines)} lines")
    # Locate the header line (contains both "Nom" and "Prénom").
    header_idx = 0
    for i, (y, line_words) in enumerate(sorted_lines):
        line_text = " ".join(w["text"] for w in line_words).lower()
        if "nom" in line_text and "prénom" in line_text:
            header_idx = i
            print(f" ✓ Found header at line {i}")
            break
    # Skip the header line plus one extra line of header-area noise.
    data_lines = sorted_lines[header_idx + 2:]
    # Hoisted: one compiled cleaner for all column strings.
    clean_re = re.compile(r'[^A-Za-z0-9\s]')

    def _column(line_words_sorted, lo, hi=None):
        """Join and clean the words whose X anchor falls in [lo, hi)."""
        if hi is None:
            picked = [w for w in line_words_sorted if w["x"] >= lo]
        else:
            picked = [w for w in line_words_sorted if lo <= w["x"] < hi]
        return clean_re.sub('', " ".join(w["text"] for w in picked))

    structured_rows = []
    skipped_rows = 0
    for y, line_words in data_lines:
        # Left-to-right reading order within the line.
        line_words_sorted = sorted(line_words, key=lambda w: w["x"])
        if not line_words_sorted:
            continue
        # Drop the CODE column (X < 70); the name occupies 70 <= X < 370.
        name_words = [w for w in line_words_sorted if 70 <= w["x"] < 370]
        if not name_words:
            skipped_rows += 1
            continue
        name = " ".join(w["text"] for w in name_words)
        # Names must be uppercase alphanumeric; anything else is OCR noise.
        # (The original re-cleaned this value a second time with an
        # [^A-Za-z0-9\s] pattern — a no-op after this filter, removed.)
        name_clean = re.sub(r'[^A-Z0-9\s]', '', name)
        if not name_clean or not name_clean.isupper() or len(name_clean) < 2:
            skipped_rows += 1
            continue
        # Discard short OCR fragments that commonly leak through the filter.
        if name_clean in ['LL', 'DE', 'TE', 'M', 'B', 'B1', 'B2', 'G', 'NG', 'HY', 'ANE', 'NDO']:
            skipped_rows += 1
            continue
        # Remaining columns by calibrated X pixel ranges.
        addr1 = _column(line_words_sorted, 370, 720)
        addr2 = _column(line_words_sorted, 720, 1070)
        bureau = _column(line_words_sorted, 1070, 1370)
        tel = _column(line_words_sorted, 1370, 1670)
        solde = _column(line_words_sorted, 1670)
        structured_rows.append([name_clean, addr1, addr2, bureau, tel, solde])
    print(f" ✓ Extracted {len(structured_rows)} valid rows (skipped {skipped_rows})")
    return structured_rows
def main():
    """Command-line entry point.

    Accepts either a folder (every image inside is processed) or a single
    image file, OCRs each one via process_image(), and writes all extracted
    rows into one consolidated CSV.
    """
    print("Running OCR with Google Vision API")
    if len(sys.argv) < 2:
        print("Usage: python google_ocr.py <folder_path>")
        print(" Processes all images in the folder and generates a single CSV file")
        sys.exit(1)
    input_path = sys.argv[1]
    # Bail out early on a nonexistent path.
    if not os.path.exists(input_path):
        print(f"Path not found: {input_path}")
        sys.exit(1)
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    if os.path.isfile(input_path):
        # Single-file mode (kept for backward compatibility); CSV named
        # after the file.
        image_files = [input_path]
        output_csv = os.path.splitext(os.path.basename(input_path))[0] + "_google.csv"
    elif os.path.isdir(input_path):
        # Folder mode: collect every image file directly inside the folder,
        # sorted so processing order is deterministic.
        image_files = sorted(
            str(entry)
            for entry in Path(input_path).iterdir()
            if entry.is_file() and entry.suffix.lower() in image_extensions
        )
        if not image_files:
            print(f"No image files found in {input_path}")
            print(f"Supported formats: {', '.join(image_extensions)}")
            sys.exit(1)
        # CSV named after the folder.
        output_csv = f"{os.path.basename(os.path.normpath(input_path))}_google.csv"
    else:
        print(f"Invalid path: {input_path}")
        sys.exit(1)
    print(f"\nFound {len(image_files)} image(s) to process")
    client = vision.ImageAnnotatorClient()
    # OCR each image; an image only counts as successful when it yields rows.
    all_rows = []
    successful_images = 0
    for image_path in image_files:
        rows = process_image(image_path, client)
        if rows:
            all_rows.extend(rows)
            successful_images += 1
    print(f"\n{'='*60}")
    print("Summary:")
    print(f" Images processed: {successful_images}/{len(image_files)}")
    print(f" Total rows extracted: {len(all_rows)}")
    if not all_rows:
        print("\n⚠ No data extracted from any image")
        sys.exit(1)
    # One consolidated CSV: header row first, then every extracted row.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Nom Prénom", "1ère ligne adresse", "2ème ligne adresse", "Bureau Distributeur", "Téléphone 1", "Solde en €"])
        writer.writerows(all_rows)
    print(f"\n✅ CSV saved to {output_csv}")
    print(f"{'='*60}\n")
# Standard script guard: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment