@segyges
Last active July 8, 2025 17:24
"""
Hugging Face Dataset Verification Tool
This script verifies the integrity of locally downloaded Hugging Face datasets by comparing
SHA256 checksums from the remote repository with local file hashes.
USAGE:
python verify_hf_download.py <repo_id> [local_path]
ARGUMENTS:
repo_id - Hugging Face dataset repository ID (e.g., "microsoft/DialoGPT-medium")
local_path - Optional path to local dataset directory (defaults to current directory ".")
EXAMPLES:
# Verify dataset in current directory
python verify_hf_download.py microsoft/DialoGPT-medium
# Verify dataset in specific directory
python verify_hf_download.py microsoft/DialoGPT-medium /path/to/dataset
# Verify dataset in Downloads folder
python verify_hf_download.py openai/whisper-small ~/Downloads/whisper-small
REQUIREMENTS:
- requests: pip install requests
- backoff: pip install backoff
- tqdm: pip install tqdm
EXIT CODES:
0 - All files verified successfully
1 - One or more verification failures detected
"""
import hashlib
import os
import queue
import re
import sys
import threading
import time

import backoff
import requests
from tqdm import tqdm
# More aggressive backoff: longer waits, more retries
@backoff.on_exception(backoff.expo,
                      (requests.exceptions.RequestException, requests.exceptions.HTTPError),
                      max_tries=8,     # Increased from 5
                      base=2,          # Base delay
                      max_value=300)   # Max 5 minute delay
def fetch_with_retry(url):
    """Fetch URL with exponential backoff retry"""
    response = requests.get(url, timeout=60)  # Increased timeout
    if response.status_code == 429:
        raise requests.exceptions.HTTPError("Rate limited")
    response.raise_for_status()
    return response
def get_all_files_recursive(repo_id, path="", failed_dirs=None):
    """Recursively get all files from HF API"""
    if failed_dirs is None:
        failed_dirs = []
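    # The tree API returns a JSON list of entries, each with at least a "type"
    # ("file" or "directory") and a "path"; directories are walked recursively.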
    url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main"
    if path:
        url += f"/{path}"
    try:
        response = fetch_with_retry(url)
        data = response.json()
    except Exception as e:
        error_msg = f"Failed to access directory '{path or 'root'}': {str(e)}"
        print(f"❌ DIRECTORY ACCESS FAILURE: {error_msg}")
        failed_dirs.append(path or "root")
        return [], failed_dirs

    files = []
    for item in data:
        if item.get('type') == 'file':
            files.append(item.get('path'))
        elif item.get('type') == 'directory':
            subfiles, failed_dirs = get_all_files_recursive(repo_id, item.get('path'), failed_dirs)
            files.extend(subfiles)
    return files, failed_dirs
@backoff.on_exception(backoff.expo,
                      (IOError, OSError),
                      max_tries=3,
                      base=1,
                      max_value=10)
def calculate_sha256(filepath):
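    """Compute the SHA256 hex digest of a local file, reading it in 1 MiB chunks
    on a separate thread so disk reads can overlap with hashing."""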
    hash_sha256 = hashlib.sha256()
    chunk_queue = queue.Queue(maxsize=10)
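    # Bounded queue: at most ~10 MiB of chunks sit in memory while the reader
    # thread runs ahead of the hashing loop below.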
    read_error = []

    def reader():
        try:
            with open(filepath, "rb") as f:
                while True:
                    chunk = f.read(1024 * 1024)
                    if not chunk:
                        break
                    chunk_queue.put(chunk)
        except (IOError, OSError) as e:
            # Surface read errors to the main thread; exceptions raised inside
            # the thread would otherwise be swallowed and an empty hash returned.
            read_error.append(e)
        finally:
            chunk_queue.put(None)  # Sentinel: no more chunks

    reader_thread = threading.Thread(target=reader)
    reader_thread.start()
    try:
        while True:
            chunk = chunk_queue.get()
            if chunk is None:
                break
            hash_sha256.update(chunk)
    finally:
        reader_thread.join()
    if read_error:
        # Re-raise so the backoff decorator can retry transient I/O errors.
        raise read_error[0]
    return hash_sha256.hexdigest()
def main():
    if len(sys.argv) < 2:
        print("Usage: python verify_hf_download.py <repo_id> [local_path]")
        sys.exit(1)
    repo_id = sys.argv[1]
    local_path = sys.argv[2] if len(sys.argv) > 2 else "."

    print(f"Walking {repo_id}...")
    files, failed_dirs = get_all_files_recursive(repo_id)
    # Phase 1: Get all hashes from remote WITH IMMEDIATE FAILURE REPORTING
    print(f"Fetching checksums for {len(files)} files...")
    hashes = []
    checksum_failures = []
    # Add counters for live progress
    success_count = 0
    failure_count = 0
    for i, filename in enumerate(tqdm(files, desc="Fetching checksums")):
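        # For LFS-tracked files, the /raw/main/ endpoint serves the small Git LFS
        # pointer text (which includes an "oid sha256:<hash>" line) rather than the
        # file contents, so this stays cheap even for very large shards. Files stored
        # directly in git (not via LFS) have no such line and are reported as
        # checksum-fetch failures below.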
        pointer_url = f"https://huggingface.co/datasets/{repo_id}/raw/main/{filename}"
        try:
            response = fetch_with_retry(pointer_url)
            content = response.text
            if 'sha256:' in content:
                sha_match = re.search(r'sha256:([a-f0-9]{64})', content)
                if sha_match:
                    sha256 = sha_match.group(1)
                    hashes.append((filename, sha256))
                    success_count += 1
                else:
                    failure_msg = "Found 'sha256:' marker but could not extract a 64-character hash"
                    checksum_failures.append((filename, failure_msg))
                    failure_count += 1
                    print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            else:
                failure_msg = "No SHA256 hash found in file"
                checksum_failures.append((filename, failure_msg))
                failure_count += 1
                print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
        except Exception as e:
            failure_msg = f"Failed to download checksum: {str(e)}"
            checksum_failures.append((filename, failure_msg))
            failure_count += 1
            print(f"❌ API FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            # Add a small delay after API failures to be nicer to the server
            if "rate limit" in str(e).lower() or "429" in str(e):
                print("⏱️ Rate limited, taking extra 30s break...")
                time.sleep(30)
    # Report checksum phase summary
    print(f"\n{'='*60}")
    print("CHECKSUM PHASE COMPLETE")
    print(f"{'='*60}")
    print(f"✅ Checksums fetched successfully: {success_count}")
    print(f"❌ Checksum fetch failures: {failure_count}")
    if failure_count > 0:
        print(f"⚠️ Warning: {failure_count} files failed checksum fetch - cannot verify these files")

    # Phase 2: Verify local files against checksums WITH IMMEDIATE FAILURE REPORTING
    print(f"\nVerifying {len(hashes)} files against local copies...")
    verification_successes = 0
    verification_failures = []
    for i, (filename, expected_hash) in enumerate(tqdm(hashes, desc="Verifying files")):
        local_file_path = os.path.join(local_path, filename)
        if not os.path.exists(local_file_path):
            failure_msg = "File does not exist locally"
            verification_failures.append((filename, failure_msg))
            print(f"❌ MISSING FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if not os.path.isfile(local_file_path):
            failure_msg = "Path exists but is not a file"
            verification_failures.append((filename, failure_msg))
            print(f"❌ NOT A FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        try:
            actual_hash = calculate_sha256(local_file_path)
        except (IOError, OSError) as e:
            # calculate_sha256 raises once its retries are exhausted; record the
            # failure and keep going instead of aborting the whole run.
            failure_msg = f"Failed to calculate hash: {e}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH ERROR [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if actual_hash.lower() == expected_hash.lower():
            verification_successes += 1
        else:
            failure_msg = f"Hash mismatch - expected: {expected_hash}, got: {actual_hash}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH MISMATCH [{i+1}/{len(hashes)}]: {filename}")
            print(f"   Expected: {expected_hash}")
            print(f"   Got:      {actual_hash}")
    # Final report
    print(f"\n{'='*80}")
    print("FINAL VERIFICATION REPORT")
    print(f"{'='*80}")
    print(f"Total files discovered: {len(files)}")
    print(f"Checksums successfully fetched: {len(hashes)}")
    print(f"Files successfully verified: {verification_successes}")
    print(f"Discovery failures (directories): {len(failed_dirs)}")
    print(f"Checksum fetch failures: {len(checksum_failures)}")
    print(f"Verification failures: {len(verification_failures)}")
    if len(hashes) > 0:
        print(f"Verification success rate: {verification_successes/len(hashes)*100:.1f}%")

    # Report failures in summary (since we already reported them individually)
    if failed_dirs:
        print("\nFAILED DIRECTORIES SUMMARY:")
        for d in failed_dirs:
            print(f"  - {d}")

    # Exit with appropriate status
    total_failures = len(failed_dirs) + len(checksum_failures) + len(verification_failures)
    if total_failures > 0:
        print(f"\n⚠️ WARNING: {total_failures} total failures detected!")
        sys.exit(1)
    else:
        print("\n✅ SUCCESS: All files verified successfully!")
        sys.exit(0)


if __name__ == "__main__":
    main()