Created January 22, 2026 13:36
WordPress to Hugo image downloader - downloads all images from markdown files and updates references
#!/usr/bin/env python3
"""
Download all images from markdown files and update references for Hugo migration.
"""
import os
import re
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# Configuration
IMAGES_DIR = Path("images")
MAX_WORKERS = 10  # Parallel downloads
TIMEOUT = 30  # seconds

# Regex patterns to find all image URLs
IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+\.(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)
# Also match URLs in link wrappers like [](remote-url)
LINKED_IMAGE_PATTERN = re.compile(r'\]\(([^)]*(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)


def sanitize_filename(url):
    """Extract and sanitize filename from URL."""
    parsed = urlparse(url)
    # Get the filename from the path
    filename = Path(unquote(parsed.path)).name
    # Remove size suffixes like -1024x768, -768x1024 but keep the extension
    filename = re.sub(r'-\d+x\d+(\.\w+)$', r'\1', filename)
    return filename


def download_image(url, filepath):
    """Download a single image."""
    try:
        response = requests.get(url, timeout=TIMEOUT, stream=True)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True, None
    except Exception as e:
        return False, str(e)


def find_all_images(md_files):
    """Find all unique images across all markdown files."""
    images = {}  # url -> filename mapping
    file_images = defaultdict(list)  # track which files use which images
    all_urls = set()  # track all URLs we've seen

    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Find all image URLs in the content using both patterns
        for match in IMAGE_PATTERN.finditer(content):
            alt_text = match.group(1)
            url = match.group(2)
            all_urls.add(url)

        # Also check linked images pattern for remote URLs
        for match in LINKED_IMAGE_PATTERN.finditer(content):
            url = match.group(1)
            all_urls.add(url)

    # Process all found URLs
    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()

        for url in all_urls:
            # Only process remote http/https URLs from our domains
            if url.startswith(('http://', 'https://')) and not url.startswith('/images/'):
                if 'roelvanderven.com' in url or 'roelven.com' in url:
                    if url in content:
                        filename = sanitize_filename(url)
                        images[url] = filename
                        # Find any markdown image reference with this URL
                        for match in re.finditer(re.escape(url), content):
                            file_images[md_file].append((url, '', url))

    return images, file_images


def update_markdown_files(file_images, url_to_filename):
    """Update markdown files to use local image references."""
    updated_count = 0

    for md_file, image_list in file_images.items():
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()

        original_content = content

        # Replace all occurrences of each URL with the local path
        for url, _, _ in image_list:
            if url in url_to_filename:
                filename = url_to_filename[url]
                # Hugo static files are served from /
                local_path = f'/images/{filename}'
                content = content.replace(url, local_path)

        if content != original_content:
            with open(md_file, 'w', encoding='utf-8') as f:
                f.write(content)
            updated_count += 1

    return updated_count


def main():
    # Create images directory
    IMAGES_DIR.mkdir(exist_ok=True)

    # Find all markdown files
    md_files = list(Path('.').glob('*.md'))
    print(f"Found {len(md_files)} markdown files")

    # Find all images
    print("Scanning for images...")
    images, file_images = find_all_images(md_files)
    print(f"Found {len(images)} unique images")

    # Check for filename collisions
    filename_to_urls = defaultdict(list)
    for url, filename in images.items():
        filename_to_urls[filename].append(url)
    collisions = {fn: urls for fn, urls in filename_to_urls.items() if len(urls) > 1}
    if collisions:
        print(f"\nWarning: {len(collisions)} filename collisions detected:")
        for filename, urls in list(collisions.items())[:5]:
            print(f"  {filename}:")
            for url in urls:
                print(f"    - {url}")
        if len(collisions) > 5:
            print(f"  ... and {len(collisions) - 5} more")
        print()

    # Download images
    print(f"\nDownloading images to {IMAGES_DIR}/...")
    downloaded = 0
    skipped = 0
    failed = 0

    download_tasks = []
    for url, filename in images.items():
        filepath = IMAGES_DIR / filename
        # Skip if already exists
        if filepath.exists():
            skipped += 1
            continue
        download_tasks.append((url, filepath, filename))

    if download_tasks:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {
                executor.submit(download_image, url, filepath): (url, filename)
                for url, filepath, filename in download_tasks
            }
            for future in as_completed(future_to_url):
                url, filename = future_to_url[future]
                success, error = future.result()
                if success:
                    downloaded += 1
                    print(f"  [{downloaded}/{len(download_tasks)}] ✓ {filename}")
                else:
                    failed += 1
                    print(f"  [FAILED] ✗ {filename}: {error}")

    print(f"\nDownload summary:")
    print(f"  Downloaded: {downloaded}")
    print(f"  Skipped (already exist): {skipped}")
    print(f"  Failed: {failed}")
    print(f"  Total: {len(images)}")

    # Update markdown files
    if downloaded > 0 or skipped > 0:
        print(f"\nUpdating markdown files...")
        updated = update_markdown_files(file_images, images)
        print(f"  Updated {updated} markdown files")

    print("\n✓ Done! Images are now in the ./images/ directory")
    print("  When you migrate to Hugo, move this directory to your Hugo site's static/ folder")

    return 0 if failed == 0 else 1


if __name__ == '__main__':
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
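For illustration, here is a minimal sketch of the rewrite the script performs on each reference: the basename is extracted from the remote URL, the WordPress size suffix (-WIDTHxHEIGHT) is stripped as in sanitize_filename(), and the markdown is pointed at the Hugo static path. The URL below is hypothetical; only the domain comes from the script.

import re
from pathlib import Path
from urllib.parse import urlparse, unquote

# Hypothetical WordPress upload URL with a -1024x768 size suffix
url = "https://roelvanderven.com/wp-content/uploads/2024/05/photo-1024x768.jpg"

# Same steps as sanitize_filename(): basename of the URL path, then strip the size suffix
filename = Path(unquote(urlparse(url).path)).name        # 'photo-1024x768.jpg'
filename = re.sub(r'-\d+x\d+(\.\w+)$', r'\1', filename)  # 'photo.jpg'

# The markdown reference is then rewritten to the Hugo static path
print(f"/images/{filename}")                              # /images/photo.jpg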
The gist also includes a one-line requirements file pinning the only third-party dependency:

requests>=2.31.0
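A typical run, assuming the WordPress export already exists as flat .md files: install requests, then execute the script from the directory containing those files. It only globs *.md in the current directory, downloads into ./images/, and rewrites the markdown in place.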