@roelven
Created January 22, 2026 13:36
WordPress to Hugo image downloader: downloads every image referenced in the markdown files and rewrites the references to local paths.
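A minimal way to run it (a sketch; the script filename download_images.py and the requirements filename are assumptions, since the gist does not show them):

    pip install -r requirements.txt
    python3 download_images.py

Run it from the directory containing the exported markdown files; downloaded images land in ./images/ next to them.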
#!/usr/bin/env python3
"""
Download all images from markdown files and update references for Hugo migration.
"""
import re
import sys
from pathlib import Path
from urllib.parse import urlparse, unquote
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

# Configuration
IMAGES_DIR = Path("images")
MAX_WORKERS = 10  # parallel downloads
TIMEOUT = 30  # seconds

# Regex pattern to find inline markdown images: ![alt](url.ext)
IMAGE_PATTERN = re.compile(r'!\[([^\]]*)\]\(([^)]+\.(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)
# Also match URLs in link wrappers like [![alt](local)](remote-url); the \. before
# the extension alternation avoids matching URLs that merely contain "png" etc.
LINKED_IMAGE_PATTERN = re.compile(r'\]\(([^)]*\.(?:jpg|jpeg|png|gif|webp|svg|heic)[^)]*)\)', re.IGNORECASE)
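
# Illustrative matches (example URLs, not from the gist):
#   ![Sunset](https://roelvanderven.com/wp-content/uploads/sunset-1024x768.jpg)
#       IMAGE_PATTERN captures the alt text and the image URL
#   [![thumb](/images/sunset.jpg)](https://roelvanderven.com/wp-content/uploads/sunset.jpg)
#       LINKED_IMAGE_PATTERN also captures the wrapping link's remote URL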

def sanitize_filename(url):
    """Extract and sanitize filename from URL."""
    parsed = urlparse(url)
    # Get the filename from the path
    filename = Path(unquote(parsed.path)).name
    # Remove size suffixes like -1024x768, -768x1024 but keep the extension
    filename = re.sub(r'-\d+x\d+(\.\w+)$', r'\1', filename)
    return filename
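
# Example (illustrative URL):
#   sanitize_filename("https://roelvanderven.com/wp-content/uploads/2020/01/photo-1024x768.jpg")
#   -> "photo.jpg"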

def download_image(url, filepath):
    """Download a single image."""
    try:
        response = requests.get(url, timeout=TIMEOUT, stream=True)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True, None
    except Exception as e:
        return False, str(e)
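
# Example (illustrative): returns (True, None) on success, (False, message) on error.
#   ok, err = download_image("https://roelvanderven.com/wp-content/uploads/photo.jpg",
#                            Path("images/photo.jpg"))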

def find_all_images(md_files):
    """Find all unique images across all markdown files."""
    images = {}  # url -> filename mapping
    file_images = defaultdict(list)  # track which files use which images
    all_urls = set()  # track all URLs we've seen

    # First pass: collect every candidate URL using both patterns
    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        for match in IMAGE_PATTERN.finditer(content):
            all_urls.add(match.group(2))
        # Also check the linked-image pattern for remote URLs
        for match in LINKED_IMAGE_PATTERN.finditer(content):
            all_urls.add(match.group(1))

    # Second pass: keep only remote http/https URLs from our domains and
    # record which files reference them
    for md_file in md_files:
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        for url in all_urls:
            if url.startswith(('http://', 'https://')):
                if 'roelvanderven.com' in url or 'roelven.com' in url:
                    if url in content:
                        images[url] = sanitize_filename(url)
                        file_images[md_file].append(url)
    return images, file_images
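
# Illustrative return shape (hypothetical file and URL):
#   images == {"https://roelvanderven.com/wp-content/uploads/photo.jpg": "photo.jpg"}
#   file_images == {Path("my-post.md"): ["https://roelvanderven.com/wp-content/uploads/photo.jpg"]}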

def update_markdown_files(file_images, url_to_filename):
    """Update markdown files to use local image references."""
    updated_count = 0
    for md_file, url_list in file_images.items():
        with open(md_file, 'r', encoding='utf-8') as f:
            content = f.read()
        original_content = content
        # Replace all occurrences of each URL with the local path
        for url in url_list:
            if url in url_to_filename:
                filename = url_to_filename[url]
                # Hugo serves files in static/ from the site root
                local_path = f'/images/{filename}'
                content = content.replace(url, local_path)
        if content != original_content:
            with open(md_file, 'w', encoding='utf-8') as f:
                f.write(content)
            updated_count += 1
    return updated_count
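
# Example rewrite (illustrative): a reference like
#   ![Sunset](https://roelvanderven.com/wp-content/uploads/sunset-1024x768.jpg)
# becomes
#   ![Sunset](/images/sunset.jpg)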

def main():
    # Create images directory
    IMAGES_DIR.mkdir(exist_ok=True)

    # Find all markdown files in the current directory
    md_files = list(Path('.').glob('*.md'))
    print(f"Found {len(md_files)} markdown files")

    # Find all images
    print("Scanning for images...")
    images, file_images = find_all_images(md_files)
    print(f"Found {len(images)} unique images")

    # Check for filename collisions (different URLs sanitizing to the same name)
    filename_to_urls = defaultdict(list)
    for url, filename in images.items():
        filename_to_urls[filename].append(url)
    collisions = {fn: urls for fn, urls in filename_to_urls.items() if len(urls) > 1}
    if collisions:
        print(f"\nWarning: {len(collisions)} filename collisions detected:")
        for filename, urls in list(collisions.items())[:5]:
            print(f"  {filename}:")
            for url in urls:
                print(f"    - {url}")
        if len(collisions) > 5:
            print(f"  ... and {len(collisions) - 5} more")
        print()

    # Download images
    print(f"\nDownloading images to {IMAGES_DIR}/...")
    downloaded = 0
    skipped = 0
    failed = 0
    download_tasks = []
    for url, filename in images.items():
        filepath = IMAGES_DIR / filename
        # Skip if already downloaded
        if filepath.exists():
            skipped += 1
            continue
        download_tasks.append((url, filepath, filename))

    if download_tasks:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            future_to_url = {
                executor.submit(download_image, url, filepath): (url, filename)
                for url, filepath, filename in download_tasks
            }
            for future in as_completed(future_to_url):
                url, filename = future_to_url[future]
                success, error = future.result()
                if success:
                    downloaded += 1
                    print(f"  [{downloaded}/{len(download_tasks)}] ✓ {filename}")
                else:
                    failed += 1
                    print(f"  [FAILED] ✗ {filename}: {error}")

    print("\nDownload summary:")
    print(f"  Downloaded: {downloaded}")
    print(f"  Skipped (already exist): {skipped}")
    print(f"  Failed: {failed}")
    print(f"  Total: {len(images)}")

    # Update markdown files
    if downloaded > 0 or skipped > 0:
        print("\nUpdating markdown files...")
        updated = update_markdown_files(file_images, images)
        print(f"  Updated {updated} markdown files")

    print("\n✓ Done! Images are now in the ./images/ directory")
    print("  When you migrate to Hugo, move this directory to your Hugo site's static/ folder")
    return 0 if failed == 0 else 1


if __name__ == '__main__':
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)

Dependencies (the gist's pinned requirements):

requests>=2.31.0