Created
November 20, 2025 14:05
-
-
Save peltho/888435f034d3567daa8419a5c4b73a5b to your computer and use it in GitHub Desktop.
Extracting Excel / Sheets table values from image into .csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from google.cloud import vision | |
| import sys | |
| import os | |
| import csv | |
| import re | |
| from PIL import Image | |
| import io | |
| from pathlib import Path | |
def process_image(image_path, client):
    """OCR a single table image with Google Vision and return structured rows.

    The image is rotated slightly (1.8°) before OCR to help Vision align the
    table rows, then words are bucketed into visual lines by Y position and
    split into columns by hard-coded X pixel ranges calibrated to the scans.

    Args:
        image_path: Path to the image file to process.
        client: An initialized ``vision.ImageAnnotatorClient``.

    Returns:
        A list of ``[name, addr1, addr2, bureau, tel, solde]`` rows;
        an empty list on OCR failure or when nothing usable was detected.
    """
    print(f"\nProcessing: {os.path.basename(image_path)}")
    try:
        # Rotate the scan by 1.8 degrees; expand=True keeps the whole image
        # visible and the white fill avoids black corner wedges.
        # Context manager closes the underlying file handle (the original
        # leaked it).
        with Image.open(image_path) as img:
            img_format = img.format if img.format else 'JPEG'
            img_rotated = img.rotate(1.8, expand=True, fillcolor='white')
        # Re-encode the rotated image into bytes for the Vision API.
        buffer = io.BytesIO()
        img_rotated.save(buffer, format=img_format)
        image = vision.Image(content=buffer.getvalue())
        # document_text_detection handles dense/tabular layouts better than
        # plain text_detection.
        response = client.document_text_detection(image=image)
        # BUGFIX: surface API-level errors instead of silently parsing an
        # empty annotation (per Google Vision error-handling guidance).
        if response.error.message:
            raise RuntimeError(response.error.message)
        document = response.full_text_annotation
        # Flatten the page/block/paragraph hierarchy into words, anchored at
        # the top-left vertex of each word's bounding box.
        words = []
        for page in document.pages:
            for block in page.blocks:
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join(symbol.text for symbol in word.symbols)
                        vertices = word.bounding_box.vertices
                        words.append({
                            "text": word_text,
                            "x": vertices[0].x,
                            "y": vertices[0].y,
                        })
        if not words:
            print(" ⚠ No words detected")
            return []
    except Exception as e:
        # Best-effort per image: report and let the caller continue with the
        # remaining files.
        print(f" ❌ Error processing image: {e}")
        return []
    print(f" ✓ Detected {len(words)} words")
    # Group words into visual lines: words whose Y differs by less than
    # 15 px from an existing line's anchor belong to that line.
    lines = {}
    for w in words:
        closest_y = None
        for y in lines:
            if abs(y - w["y"]) < 15:  # Within 15 pixels = same line
                closest_y = y
                break
        if closest_y is not None:
            lines[closest_y].append(w)
        else:
            lines[w["y"]] = [w]
    # Top-to-bottom reading order.
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    print(f" ✓ Grouped into {len(sorted_lines)} lines")
    # Locate the header line (contains both "Nom" and "Prénom").
    header_idx = 0
    for i, (y, line_words) in enumerate(sorted_lines):
        line_text = " ".join(w["text"] for w in line_words).lower()
        if "nom" in line_text and "prénom" in line_text:
            header_idx = i
            print(f" ✓ Found header at line {i}")
            break
    # Skip the header line plus one extra line of header-area noise.
    data_lines = sorted_lines[header_idx + 2:]
    # Hoisted: one compiled cleaner for all column strings.
    clean_re = re.compile(r'[^A-Za-z0-9\s]')

    def _column(line_words_sorted, lo, hi=None):
        """Join and clean the words whose X anchor falls in [lo, hi)."""
        if hi is None:
            picked = [w for w in line_words_sorted if w["x"] >= lo]
        else:
            picked = [w for w in line_words_sorted if lo <= w["x"] < hi]
        return clean_re.sub('', " ".join(w["text"] for w in picked))

    structured_rows = []
    skipped_rows = 0
    for y, line_words in data_lines:
        # Left-to-right reading order within the line.
        line_words_sorted = sorted(line_words, key=lambda w: w["x"])
        if not line_words_sorted:
            continue
        # Drop the CODE column (X < 70); the name occupies 70 <= X < 370.
        name_words = [w for w in line_words_sorted if 70 <= w["x"] < 370]
        if not name_words:
            skipped_rows += 1
            continue
        name = " ".join(w["text"] for w in name_words)
        # Names must be uppercase alphanumeric; anything else is OCR noise.
        # (The original re-cleaned this value a second time with an
        # [^A-Za-z0-9\s] pattern — a no-op after this filter, removed.)
        name_clean = re.sub(r'[^A-Z0-9\s]', '', name)
        if not name_clean or not name_clean.isupper() or len(name_clean) < 2:
            skipped_rows += 1
            continue
        # Discard short OCR fragments that commonly leak through the filter.
        if name_clean in ['LL', 'DE', 'TE', 'M', 'B', 'B1', 'B2', 'G', 'NG', 'HY', 'ANE', 'NDO']:
            skipped_rows += 1
            continue
        # Remaining columns by calibrated X pixel ranges.
        addr1 = _column(line_words_sorted, 370, 720)
        addr2 = _column(line_words_sorted, 720, 1070)
        bureau = _column(line_words_sorted, 1070, 1370)
        tel = _column(line_words_sorted, 1370, 1670)
        solde = _column(line_words_sorted, 1670)
        structured_rows.append([name_clean, addr1, addr2, bureau, tel, solde])
    print(f" ✓ Extracted {len(structured_rows)} valid rows (skipped {skipped_rows})")
    return structured_rows
def main():
    """Command-line entry point.

    Accepts either a folder (every image inside is processed) or a single
    image file, OCRs each one via process_image(), and writes all extracted
    rows into one consolidated CSV.
    """
    print("Running OCR with Google Vision API")
    if len(sys.argv) < 2:
        print("Usage: python google_ocr.py <folder_path>")
        print(" Processes all images in the folder and generates a single CSV file")
        sys.exit(1)
    input_path = sys.argv[1]
    # Bail out early on a nonexistent path.
    if not os.path.exists(input_path):
        print(f"Path not found: {input_path}")
        sys.exit(1)
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    if os.path.isfile(input_path):
        # Single-file mode (kept for backward compatibility); CSV named
        # after the file.
        image_files = [input_path]
        output_csv = os.path.splitext(os.path.basename(input_path))[0] + "_google.csv"
    elif os.path.isdir(input_path):
        # Folder mode: collect every image file directly inside the folder,
        # sorted so processing order is deterministic.
        image_files = sorted(
            str(entry)
            for entry in Path(input_path).iterdir()
            if entry.is_file() and entry.suffix.lower() in image_extensions
        )
        if not image_files:
            print(f"No image files found in {input_path}")
            print(f"Supported formats: {', '.join(image_extensions)}")
            sys.exit(1)
        # CSV named after the folder.
        output_csv = f"{os.path.basename(os.path.normpath(input_path))}_google.csv"
    else:
        print(f"Invalid path: {input_path}")
        sys.exit(1)
    print(f"\nFound {len(image_files)} image(s) to process")
    client = vision.ImageAnnotatorClient()
    # OCR each image; an image only counts as successful when it yields rows.
    all_rows = []
    successful_images = 0
    for image_path in image_files:
        rows = process_image(image_path, client)
        if rows:
            all_rows.extend(rows)
            successful_images += 1
    print(f"\n{'='*60}")
    print("Summary:")
    print(f" Images processed: {successful_images}/{len(image_files)}")
    print(f" Total rows extracted: {len(all_rows)}")
    if not all_rows:
        print("\n⚠ No data extracted from any image")
        sys.exit(1)
    # One consolidated CSV: header row first, then every extracted row.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Nom Prénom", "1ère ligne adresse", "2ème ligne adresse", "Bureau Distributeur", "Téléphone 1", "Solde en €"])
        writer.writerows(all_rows)
    print(f"\n✅ CSV saved to {output_csv}")
    print(f"{'='*60}\n")
# Standard script guard: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment