@roelven
Created January 22, 2026 13:36
WordPress to Hugo image downloader: downloads every image referenced in the markdown files and rewrites the references to local paths.
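A minimal way to run it (a sketch; the script filename download_images.py and the requirements filename are assumptions, since the gist does not show them):

    pip install -r requirements.txt
    python3 download_images.py

Run it from the directory containing the exported markdown files; downloaded images land in ./images/ next to them.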
#!/usr/bin/env python3
"""
Download all images from markdown files and update references for Hugo migration.
"""
import re
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# Configuration
IMAGES_DIR = Path("images")
MAX_WORKERS = 10  # parallel downloads
TIMEOUT = 30  # seconds

# Regex pattern to find inline markdown images: ![alt](url.ext)
IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+\.(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)
# Also match URLs in link wrappers like [![alt](local)](remote-url); the \. before
# the extension alternation avoids matching URLs that merely contain "png" etc.
LINKED_IMAGE_PATTERN = re.compile(r'\]\(([^)]*\.(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)
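
# Illustrative matches (example URLs, not from the gist):
#   ![Sunset](https://roelvanderven.com/wp-content/uploads/sunset-1024x768.jpg)
#       IMAGE_PATTERN captures the alt text and the image URL
#   [![thumb](/images/sunset.jpg)](https://roelvanderven.com/wp-content/uploads/sunset.jpg)
#       LINKED_IMAGE_PATTERN also captures the wrapping link's remote URL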

def sanitize_filename(url):
    """Extract and sanitize filename from URL."""
    parsed = urlparse(url)
    # Get the filename from the path
    filename = Path(unquote(parsed.path)).name
    # Remove size suffixes like -1024x768, -768x1024 but keep the extension
    filename = re.sub(r'-\d+x\d+(\.\w+)$', r'\1', filename)
    return filename
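
# Example (illustrative URL):
#   sanitize_filename("https://roelvanderven.com/wp-content/uploads/2020/01/photo-1024x768.jpg")
#   -> "photo.jpg"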

def download_image(url, filepath):
    """Download a single image."""
    try:
        response = requests.get(url, timeout=TIMEOUT, stream=True)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True, None
    except Exception as e:
        return False, str(e)
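
# Example (illustrative): returns (True, None) on success, (False, message) on error.
#   ok, err = download_image("https://roelvanderven.com/wp-content/uploads/photo.jpg",
#                            Path("images/photo.jpg"))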

def find_all_images(md_files):
    """Find all unique images across all markdown files."""
    images = {}  # url -> filename mapping
    file_images = defaultdict(list)  # track which files use which images
    all_urls = set()  # track all URLs we've seen

    # First pass: collect every candidate URL using both patterns
    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        for match in IMAGE_PATTERN.finditer(content):
            all_urls.add(match.group(2))
        # Also check the linked-image pattern for remote URLs
        for match in LINKED_IMAGE_PATTERN.finditer(content):
            all_urls.add(match.group(1))

    # Second pass: keep only remote http/https URLs from our domains and
    # record which files reference them
    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        for url in all_urls:
            if url.startswith(('http://', 'https://')):
                if 'roelvanderven.com' in url or 'roelven.com' in url:
                    if url in content:
                        images[url] = sanitize_filename(url)
                        file_images[md_file].append(url)
    return images, file_images
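
# Illustrative return shape (hypothetical file and URL):
#   images == {"https://roelvanderven.com/wp-content/uploads/photo.jpg": "photo.jpg"}
#   file_images == {Path("my-post.md"): ["https://roelvanderven.com/wp-content/uploads/photo.jpg"]}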

def update_markdown_files(file_images, url_to_filename):
    """Update markdown files to use local image references."""
    updated_count = 0
    for md_file, url_list in file_images.items():
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        original_content = content
        # Replace all occurrences of each URL with the local path
        for url in url_list:
            if url in url_to_filename:
                filename = url_to_filename[url]
                # Hugo serves files in static/ from the site root
                local_path = f'/images/{filename}'
                content = content.replace(url, local_path)
        if content != original_content:
            with open(md_file, 'w', encoding='utf-8') as f:
                f.write(content)
            updated_count += 1
    return updated_count
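
# Example rewrite (illustrative): a reference like
#   ![Sunset](https://roelvanderven.com/wp-content/uploads/sunset-1024x768.jpg)
# becomes
#   ![Sunset](/images/sunset.jpg)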

def main():
    # Create images directory
    IMAGES_DIR.mkdir(exist_ok=True)

    # Find all markdown files in the current directory
    md_files = list(Path('.').glob('*.md'))
    print(f"Found {len(md_files)} markdown files")

    # Find all images
    print("Scanning for images...")
    images, file_images = find_all_images(md_files)
    print(f"Found {len(images)} unique images")

    # Check for filename collisions (different URLs sanitizing to the same name)
    filename_to_urls = defaultdict(list)
    for url, filename in images.items():
        filename_to_urls[filename].append(url)
    collisions = {fn: urls for fn, urls in filename_to_urls.items() if len(urls) > 1}
    if collisions:
        print(f"\nWarning: {len(collisions)} filename collisions detected:")
        for filename, urls in list(collisions.items())[:5]:
            print(f"  {filename}:")
            for url in urls:
                print(f"    - {url}")
        if len(collisions) > 5:
            print(f"  ... and {len(collisions) - 5} more")
        print()

    # Download images
    print(f"\nDownloading images to {IMAGES_DIR}/...")
    downloaded = 0
    skipped = 0
    failed = 0
    download_tasks = []
    for url, filename in images.items():
        filepath = IMAGES_DIR / filename
        # Skip if already downloaded
        if filepath.exists():
            skipped += 1
            continue
        download_tasks.append((url, filepath, filename))

    if download_tasks:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {
                executor.submit(download_image, url, filepath): (url, filename)
                for url, filepath, filename in download_tasks
            }
            for future in as_completed(future_to_url):
                url, filename = future_to_url[future]
                success, error = future.result()
                if success:
                    downloaded += 1
                    print(f"  [{downloaded}/{len(download_tasks)}] ✓ {filename}")
                else:
                    failed += 1
                    print(f"  [FAILED] ✗ {filename}: {error}")

    print("\nDownload summary:")
    print(f"  Downloaded: {downloaded}")
    print(f"  Skipped (already exist): {skipped}")
    print(f"  Failed: {failed}")
    print(f"  Total: {len(images)}")

    # Update markdown files
    if downloaded > 0 or skipped > 0:
        print("\nUpdating markdown files...")
        updated = update_markdown_files(file_images, images)
        print(f"  Updated {updated} markdown files")

    print("\n✓ Done! Images are now in the ./images/ directory")
    print("  When you migrate to Hugo, move this directory to your Hugo site's static/ folder")
    return 0 if failed == 0 else 1


if __name__ == '__main__':
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)

Dependencies (the gist's pinned requirements):

requests>=2.31.0