Skip to content

Instantly share code, notes, and snippets.

@peltho
Created November 20, 2025 14:05
Show Gist options
  • Select an option

  • Save peltho/888435f034d3567daa8419a5c4b73a5b to your computer and use it in GitHub Desktop.

Select an option

Save peltho/888435f034d3567daa8419a5c4b73a5b to your computer and use it in GitHub Desktop.
Extracting Excel / Sheets table values from image into .csv
from google.cloud import vision
import sys
import os
import csv
import re
from PIL import Image
import io
from pathlib import Path
def process_image(image_path, client):
    """OCR one scanned table image with Google Vision and return structured rows.

    Parameters
    ----------
    image_path : str
        Path to the image file to process.
    client : google.cloud.vision.ImageAnnotatorClient
        An initialized Vision API client.

    Returns
    -------
    list[list[str]]
        One ``[name, addr1, addr2, bureau, tel, solde]`` row per usable
        table line; an empty list when nothing could be extracted or an
        error occurred (errors are printed, never raised to the caller).
    """
    print(f"\nProcessing: {os.path.basename(image_path)}")
    try:
        # Rotate ~2 degrees to compensate for scan skew of the source
        # documents; white fill avoids dark corner artifacts after expand.
        img = Image.open(image_path)
        img_rotated = img.rotate(1.8, expand=True, fillcolor='white')

        # Re-encode the rotated image to bytes for the Vision API,
        # preserving the original format when Pillow knows it.
        buffer = io.BytesIO()
        img_rotated.save(buffer, format=img.format if img.format else 'JPEG')
        image = vision.Image(content=buffer.getvalue())

        # document_text_detection handles dense/tabular layouts better
        # than plain text_detection.
        response = client.document_text_detection(image=image)
        # BUG FIX: the Vision API signals failures via response.error
        # instead of raising; without this check a failed request was
        # silently parsed as an empty annotation.
        if response.error.message:
            raise RuntimeError(response.error.message)
        document = response.full_text_annotation

        # Flatten the page/block/paragraph/word hierarchy into a list of
        # words tagged with the top-left corner of their bounding box.
        words = []
        for page in document.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join(symbol.text for symbol in word.symbols)
                        vertices = word.bounding_box.vertices
                        words.append({
                            "text": word_text,
                            "x": vertices[0].x,
                            "y": vertices[0].y,
                        })
        if not words:
            print(" ⚠ No words detected")
            return []
    except Exception as e:
        # Best-effort batch processing: report and skip the bad image.
        print(f" ❌ Error processing image: {e}")
        return []

    print(f" ✓ Detected {len(words)} words")

    # Group words into visual lines: words within 15 px vertically of an
    # existing line's anchor Y belong to that line.
    lines = {}
    for w in words:
        closest_y = None
        for y in lines:
            if abs(y - w["y"]) < 15:
                closest_y = y
                break
        if closest_y is not None:
            lines[closest_y].append(w)
        else:
            lines[w["y"]] = [w]

    # Top-to-bottom reading order.
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    print(f" ✓ Grouped into {len(sorted_lines)} lines")

    # Locate the header line (contains both "Nom" and "Prénom");
    # defaults to line 0 when no header is found.
    header_idx = 0
    for i, (_, line_words) in enumerate(sorted_lines):
        line_text = " ".join(w["text"] for w in line_words).lower()
        if "nom" in line_text and "prénom" in line_text:
            header_idx = i
            print(f" ✓ Found header at line {i}")
            break

    # Skip the header plus one extra line to clear the header area.
    data_lines = sorted_lines[header_idx + 2:]

    # OCR fragments that are never real names (tuned for this dataset).
    noise_words = {'LL', 'DE', 'TE', 'M', 'B', 'B1', 'B2', 'G', 'NG', 'HY', 'ANE', 'NDO'}

    structured_rows = []
    skipped_rows = 0
    for _, line_words in data_lines:
        # Left-to-right order within the line.
        line_words_sorted = sorted(line_words, key=lambda w: w["x"])
        if not line_words_sorted:
            continue

        def column_text(lo, hi):
            """Join words whose X falls in [lo, hi) and strip special chars."""
            text = " ".join(w["text"] for w in line_words_sorted if lo <= w["x"] < hi)
            return re.sub(r'[^A-Za-z0-9\s]', '', text)

        # Pixel X ranges tuned for the scanned layout; the CODE column
        # (X < 70) is intentionally dropped. Name occupies 70 <= X < 370.
        name_words = [w for w in line_words_sorted if 70 <= w["x"] < 370]
        if not name_words:
            skipped_rows += 1
            continue

        name = " ".join(w["text"] for w in name_words)
        # Names are expected to be uppercase alphanumerics; strip the rest.
        # (The original applied a second [^A-Za-z0-9\s] pass afterwards,
        # which is a no-op once lowercase letters are already removed.)
        name_clean = re.sub(r'[^A-Z0-9\s]', '', name)
        if not name_clean or not name_clean.isupper() or len(name_clean) < 2:
            skipped_rows += 1
            continue
        if name_clean in noise_words:
            skipped_rows += 1
            continue

        structured_rows.append([
            name_clean,
            column_text(370, 720),            # 1st address line
            column_text(720, 1070),           # 2nd address line
            column_text(1070, 1370),          # bureau distributeur
            column_text(1370, 1670),          # telephone
            column_text(1670, float('inf')),  # balance
        ])

    print(f" ✓ Extracted {len(structured_rows)} valid rows (skipped {skipped_rows})")
    return structured_rows
def main():
    """CLI entry point: OCR one image or every image in a folder into one CSV.

    Usage: ``python google_ocr.py <folder_path_or_image>``.
    Exits with status 1 on bad arguments, missing paths, empty folders,
    or when no data could be extracted at all.
    """
    print("Running OCR with Google Vision API")
    if len(sys.argv) < 2:
        print("Usage: python google_ocr.py <folder_path>")
        print(" Processes all images in the folder and generates a single CSV file")
        sys.exit(1)

    input_path = sys.argv[1]
    if not os.path.exists(input_path):
        print(f"Path not found: {input_path}")
        sys.exit(1)

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    if os.path.isfile(input_path):
        # Single-file mode (backward compatibility): CSV named after the file.
        image_files = [input_path]
        output_csv = os.path.splitext(os.path.basename(input_path))[0] + "_google.csv"
    elif os.path.isdir(input_path):
        # Folder mode: collect supported images, sorted for a stable order.
        folder_path = Path(input_path)
        image_files = sorted(
            str(f) for f in folder_path.iterdir()
            if f.is_file() and f.suffix.lower() in image_extensions
        )
        if not image_files:
            print(f"No image files found in {input_path}")
            # BUG FIX: sort the extensions — joining a bare set prints
            # them in arbitrary, run-to-run-varying order.
            print(f"Supported formats: {', '.join(sorted(image_extensions))}")
            sys.exit(1)
        # CSV named after the folder (normpath strips a trailing slash).
        folder_name = os.path.basename(os.path.normpath(input_path))
        output_csv = f"{folder_name}_google.csv"
    else:
        print(f"Invalid path: {input_path}")
        sys.exit(1)

    print(f"\nFound {len(image_files)} image(s) to process")

    client = vision.ImageAnnotatorClient()

    # Process every image; count only those that yielded at least one row.
    all_rows = []
    successful_images = 0
    for image_path in image_files:
        rows = process_image(image_path, client)
        if rows:
            all_rows.extend(rows)
            successful_images += 1

    print(f"\n{'=' * 60}")
    print("Summary:")
    print(f" Images processed: {successful_images}/{len(image_files)}")
    print(f" Total rows extracted: {len(all_rows)}")
    if not all_rows:
        print("\n⚠ No data extracted from any image")
        sys.exit(1)

    # Consolidated CSV: one header row, then all extracted rows.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Nom Prénom", "1ère ligne adresse", "2ème ligne adresse",
                         "Bureau Distributeur", "Téléphone 1", "Solde en €"])
        writer.writerows(all_rows)

    print(f"\n✅ CSV saved to {output_csv}")
    print(f"{'=' * 60}\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment