@segyges
Last active July 8, 2025 17:24
"""
Hugging Face Dataset Verification Tool
This script verifies the integrity of locally downloaded Hugging Face datasets by comparing
SHA256 checksums from the remote repository with local file hashes.
USAGE:
python verify_hf_download.py <repo_id> [local_path]
ARGUMENTS:
repo_id - Hugging Face dataset repository ID (e.g., "microsoft/DialoGPT-medium")
local_path - Optional path to local dataset directory (defaults to current directory ".")
EXAMPLES:
# Verify dataset in current directory
python verify_hf_download.py microsoft/DialoGPT-medium
# Verify dataset in specific directory
python verify_hf_download.py microsoft/DialoGPT-medium /path/to/dataset
# Verify dataset in Downloads folder
python verify_hf_download.py openai/whisper-small ~/Downloads/whisper-small
REQUIREMENTS:
- requests: pip install requests
- backoff: pip install backoff
- tqdm: pip install tqdm
EXIT CODES:
0 - All files verified successfully
1 - One or more verification failures detected
"""
import hashlib
import os
import queue
import re
import sys
import threading
import time

import backoff
import requests
from tqdm import tqdm
# More aggressive backoff: longer waits, more retries
@backoff.on_exception(backoff.expo,
                      (requests.exceptions.RequestException, requests.exceptions.HTTPError),
                      max_tries=8,     # Increased from 5
                      base=2,          # Base delay
                      max_value=300)   # Max 5 minute delay
def fetch_with_retry(url):
    """Fetch URL with exponential backoff retry"""
    response = requests.get(url, timeout=60)  # Increased timeout
    if response.status_code == 429:
        raise requests.exceptions.HTTPError("Rate limited")
    response.raise_for_status()
    return response
def get_all_files_recursive(repo_id, path="", failed_dirs=None):
    """Recursively get all files from HF API"""
    if failed_dirs is None:
        failed_dirs = []
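    # The tree API returns a JSON list of entries, each with at least a "type"
    # ("file" or "directory") and a "path"; directories are walked recursively.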
    url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main"
    if path:
        url += f"/{path}"
    try:
        response = fetch_with_retry(url)
        data = response.json()
    except Exception as e:
        error_msg = f"Failed to access directory '{path or 'root'}': {str(e)}"
        print(f"❌ DIRECTORY ACCESS FAILURE: {error_msg}")
        failed_dirs.append(path or "root")
        return [], failed_dirs

    files = []
    for item in data:
        if item.get('type') == 'file':
            files.append(item.get('path'))
        elif item.get('type') == 'directory':
            subfiles, failed_dirs = get_all_files_recursive(repo_id, item.get('path'), failed_dirs)
            files.extend(subfiles)
    return files, failed_dirs
@backoff.on_exception(backoff.expo,
                      (IOError, OSError),
                      max_tries=3,
                      base=1,
                      max_value=10)
def calculate_sha256(filepath):
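    """Compute the SHA256 hex digest of a local file, reading it in 1 MiB chunks
    on a separate thread so disk reads can overlap with hashing."""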
    hash_sha256 = hashlib.sha256()
    chunk_queue = queue.Queue(maxsize=10)
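    # Bounded queue: at most ~10 MiB of chunks sit in memory while the reader
    # thread runs ahead of the hashing loop below.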
    read_error = []

    def reader():
        try:
            with open(filepath, "rb") as f:
                while True:
                    chunk = f.read(1024 * 1024)
                    if not chunk:
                        break
                    chunk_queue.put(chunk)
        except (IOError, OSError) as e:
            # Surface read errors to the main thread; exceptions raised inside
            # the thread would otherwise be swallowed and an empty hash returned.
            read_error.append(e)
        finally:
            chunk_queue.put(None)  # Sentinel: no more chunks

    reader_thread = threading.Thread(target=reader)
    reader_thread.start()
    try:
        while True:
            chunk = chunk_queue.get()
            if chunk is None:
                break
            hash_sha256.update(chunk)
    finally:
        reader_thread.join()
    if read_error:
        # Re-raise so the backoff decorator can retry transient I/O errors.
        raise read_error[0]
    return hash_sha256.hexdigest()
def main():
    if len(sys.argv) < 2:
        print("Usage: python verify_hf_download.py <repo_id> [local_path]")
        sys.exit(1)
    repo_id = sys.argv[1]
    local_path = sys.argv[2] if len(sys.argv) > 2 else "."

    print(f"Walking {repo_id}...")
    files, failed_dirs = get_all_files_recursive(repo_id)
    # Phase 1: Get all hashes from remote WITH IMMEDIATE FAILURE REPORTING
    print(f"Fetching checksums for {len(files)} files...")
    hashes = []
    checksum_failures = []
    # Add counters for live progress
    success_count = 0
    failure_count = 0
    for i, filename in enumerate(tqdm(files, desc="Fetching checksums")):
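        # For LFS-tracked files, the /raw/main/ endpoint serves the small Git LFS
        # pointer text (which includes an "oid sha256:<hash>" line) rather than the
        # file contents, so this stays cheap even for very large shards. Files stored
        # directly in git (not via LFS) have no such line and are reported as
        # checksum-fetch failures below.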
        pointer_url = f"https://huggingface.co/datasets/{repo_id}/raw/main/{filename}"
        try:
            response = fetch_with_retry(pointer_url)
            content = response.text
            if 'sha256:' in content:
                sha_match = re.search(r'sha256:([a-f0-9]{64})', content)
                if sha_match:
                    sha256 = sha_match.group(1)
                    hashes.append((filename, sha256))
                    success_count += 1
                else:
                    failure_msg = "Found 'sha256:' marker but could not extract a 64-character hash"
                    checksum_failures.append((filename, failure_msg))
                    failure_count += 1
                    print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            else:
                failure_msg = "No SHA256 hash found in file"
                checksum_failures.append((filename, failure_msg))
                failure_count += 1
                print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
        except Exception as e:
            failure_msg = f"Failed to download checksum: {str(e)}"
            checksum_failures.append((filename, failure_msg))
            failure_count += 1
            print(f"❌ API FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            # Add a small delay after API failures to be nicer to the server
            if "rate limit" in str(e).lower() or "429" in str(e):
                print("⏱️ Rate limited, taking extra 30s break...")
                time.sleep(30)
    # Report checksum phase summary
    print(f"\n{'='*60}")
    print("CHECKSUM PHASE COMPLETE")
    print(f"{'='*60}")
    print(f"✅ Checksums fetched successfully: {success_count}")
    print(f"❌ Checksum fetch failures: {failure_count}")
    if failure_count > 0:
        print(f"⚠️ Warning: {failure_count} files failed checksum fetch - cannot verify these files")

    # Phase 2: Verify local files against checksums WITH IMMEDIATE FAILURE REPORTING
    print(f"\nVerifying {len(hashes)} files against local copies...")
    verification_successes = 0
    verification_failures = []
    for i, (filename, expected_hash) in enumerate(tqdm(hashes, desc="Verifying files")):
        local_file_path = os.path.join(local_path, filename)
        if not os.path.exists(local_file_path):
            failure_msg = "File does not exist locally"
            verification_failures.append((filename, failure_msg))
            print(f"❌ MISSING FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if not os.path.isfile(local_file_path):
            failure_msg = "Path exists but is not a file"
            verification_failures.append((filename, failure_msg))
            print(f"❌ NOT A FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        try:
            actual_hash = calculate_sha256(local_file_path)
        except (IOError, OSError) as e:
            # calculate_sha256 raises once its retries are exhausted; record the
            # failure and keep going instead of aborting the whole run.
            failure_msg = f"Failed to calculate hash: {e}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH ERROR [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if actual_hash.lower() == expected_hash.lower():
            verification_successes += 1
        else:
            failure_msg = f"Hash mismatch - expected: {expected_hash}, got: {actual_hash}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH MISMATCH [{i+1}/{len(hashes)}]: {filename}")
            print(f"   Expected: {expected_hash}")
            print(f"   Got:      {actual_hash}")
    # Final report
    print(f"\n{'='*80}")
    print("FINAL VERIFICATION REPORT")
    print(f"{'='*80}")
    print(f"Total files discovered: {len(files)}")
    print(f"Checksums successfully fetched: {len(hashes)}")
    print(f"Files successfully verified: {verification_successes}")
    print(f"Discovery failures (directories): {len(failed_dirs)}")
    print(f"Checksum fetch failures: {len(checksum_failures)}")
    print(f"Verification failures: {len(verification_failures)}")
    if len(hashes) > 0:
        print(f"Verification success rate: {verification_successes/len(hashes)*100:.1f}%")

    # Report failures in summary (since we already reported them individually)
    if failed_dirs:
        print("\nFAILED DIRECTORIES SUMMARY:")
        for d in failed_dirs:
            print(f"  - {d}")

    # Exit with appropriate status
    total_failures = len(failed_dirs) + len(checksum_failures) + len(verification_failures)
    if total_failures > 0:
        print(f"\n⚠️ WARNING: {total_failures} total failures detected!")
        sys.exit(1)
    else:
        print("\n✅ SUCCESS: All files verified successfully!")
        sys.exit(0)


if __name__ == "__main__":
    main()