| """ | |
| Hugging Face Dataset Verification Tool | |
| This script verifies the integrity of locally downloaded Hugging Face datasets by comparing | |
| SHA256 checksums from the remote repository with local file hashes. | |
| USAGE: | |
| python verify_hf_download.py <repo_id> [local_path] | |
| ARGUMENTS: | |
| repo_id - Hugging Face dataset repository ID (e.g., "microsoft/DialoGPT-medium") | |
| local_path - Optional path to local dataset directory (defaults to current directory ".") | |
| EXAMPLES: | |
| # Verify dataset in current directory | |
| python verify_hf_download.py microsoft/DialoGPT-medium | |
| # Verify dataset in specific directory | |
| python verify_hf_download.py microsoft/DialoGPT-medium /path/to/dataset | |
| # Verify dataset in Downloads folder | |
| python verify_hf_download.py openai/whisper-small ~/Downloads/whisper-small | |
| REQUIREMENTS: | |
| - requests: pip install requests | |
| - backoff: pip install backoff | |
| - tqdm: pip install tqdm | |
| EXIT CODES: | |
| 0 - All files verified successfully | |
| 1 - One or more verification failures detected | |
| """ | |

# Standard library
import hashlib
import os
import queue
import re
import sys
import threading
import time

# Third-party
import backoff
import requests
from tqdm import tqdm

# Aggressive backoff: long waits and many retries, to ride out rate limiting.
@backoff.on_exception(backoff.expo,
                      (requests.exceptions.RequestException,
                       requests.exceptions.HTTPError),
                      max_tries=8,     # Up to 8 attempts
                      base=2,          # Exponential base for the delay
                      max_value=300)   # Cap each wait at 5 minutes
def fetch_with_retry(url):
    """Fetch a URL, retrying with exponential backoff."""
    response = requests.get(url, timeout=60)
    if response.status_code == 429:
        raise requests.exceptions.HTTPError("Rate limited")
    response.raise_for_status()
    return response
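
# For LFS-tracked files, the /raw/main endpoint returns the Git LFS pointer
# rather than the file contents. A pointer looks like this (hash and size
# are illustrative):
#
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
#   size 12345
#
# Phase 1 below extracts the 64-hex-character digest from the "oid sha256:"
# line of each pointer.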

def get_all_files_recursive(repo_id, path="", failed_dirs=None):
    """Recursively list all file paths in a dataset repo via the HF tree API."""
    if failed_dirs is None:
        failed_dirs = []
    url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main"
    if path:
        url += f"/{path}"
    try:
        response = fetch_with_retry(url)
        data = response.json()
    except Exception as e:
        error_msg = f"Failed to access directory '{path or 'root'}': {str(e)}"
        print(f"❌ DIRECTORY ACCESS FAILURE: {error_msg}")
        failed_dirs.append(path or "root")
        return [], failed_dirs
    files = []
    for item in data:
        if item.get('type') == 'file':
            files.append(item.get('path'))
        elif item.get('type') == 'directory':
            subfiles, failed_dirs = get_all_files_recursive(repo_id, item.get('path'), failed_dirs)
            files.extend(subfiles)
    return files, failed_dirs
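
# The tree API returns a JSON list of entries. Only 'type' and 'path' are
# used here; other fields shown below are illustrative and may vary:
#
#   [{"type": "file", "path": "data/train-00000.parquet", "size": 1234, ...},
#    {"type": "directory", "path": "data", ...}]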

@backoff.on_exception(backoff.expo,
                      (IOError, OSError),
                      max_tries=3,
                      base=1,
                      max_value=10)
def calculate_sha256(filepath):
    """SHA256-hash a file, reading chunks in a background thread so that
    disk I/O overlaps with hashing. Returns the hex digest."""
    hash_sha256 = hashlib.sha256()
    chunk_queue = queue.Queue(maxsize=10)
    read_errors = []

    def reader():
        try:
            with open(filepath, "rb") as f:
                while True:
                    chunk = f.read(1024 * 1024)
                    if not chunk:
                        break
                    chunk_queue.put(chunk)
        except OSError as e:
            # An exception raised inside the thread would otherwise be lost
            # (and the file silently mis-hashed); record it so the main
            # thread can re-raise and the backoff decorator can retry.
            read_errors.append(e)
        finally:
            chunk_queue.put(None)  # Sentinel: no more chunks

    reader_thread = threading.Thread(target=reader)
    reader_thread.start()
    try:
        while True:
            chunk = chunk_queue.get()
            if chunk is None:
                break
            hash_sha256.update(chunk)
    finally:
        reader_thread.join()
    if read_errors:
        raise read_errors[0]
    return hash_sha256.hexdigest()
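
# Design note: the bounded queue caps buffered data at roughly 10 MiB
# (10 chunks x 1 MiB) while letting reads run ahead of hashing. Usage
# (path illustrative): calculate_sha256("data/train-00000.parquet")
# -> 64-character hex digest.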

def main():
    if len(sys.argv) < 2:
        print("Usage: python verify_hf_download.py <repo_id> [local_path]")
        sys.exit(1)
    repo_id = sys.argv[1]
    local_path = sys.argv[2] if len(sys.argv) > 2 else "."

    print(f"Walking {repo_id}...")
    files, failed_dirs = get_all_files_recursive(repo_id)

    # Phase 1: fetch all checksums from the remote, reporting failures as
    # they happen.
    print(f"Fetching checksums for {len(files)} files...")
    hashes = []
    checksum_failures = []
    # Counters for live progress
    success_count = 0
    failure_count = 0
    for i, filename in enumerate(tqdm(files, desc="Fetching checksums")):
        pointer_url = f"https://huggingface.co/datasets/{repo_id}/raw/main/{filename}"
        try:
            response = fetch_with_retry(pointer_url)
            content = response.text
            if 'sha256:' in content:
                sha_match = re.search(r'sha256:([a-f0-9]{64})', content)
                if sha_match:
                    sha256 = sha_match.group(1)
                    hashes.append((filename, sha256))
                    success_count += 1
                else:
                    failure_msg = "'sha256:' marker present but no 64-hex digest followed it"
                    checksum_failures.append((filename, failure_msg))
                    failure_count += 1
                    print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            else:
                failure_msg = "No SHA256 hash found (file may not be LFS-tracked)"
                checksum_failures.append((filename, failure_msg))
                failure_count += 1
                print(f"❌ CHECKSUM FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
        except Exception as e:
            failure_msg = f"Failed to download checksum: {str(e)}"
            checksum_failures.append((filename, failure_msg))
            failure_count += 1
            print(f"❌ API FAILURE [{i+1}/{len(files)}]: {filename} - {failure_msg}")
            # Back off a little extra after rate-limit errors to be nicer
            # to the server.
            if "rate limit" in str(e).lower() or "429" in str(e):
                print("⏱️ Rate limited, taking an extra 30s break...")
                time.sleep(30)

    # Checksum-phase summary
    print(f"\n{'='*60}")
    print("CHECKSUM PHASE COMPLETE")
    print(f"{'='*60}")
    print(f"✅ Checksums fetched successfully: {success_count}")
    print(f"❌ Checksum fetch failures: {failure_count}")
    if failure_count > 0:
        print(f"⚠️ Warning: {failure_count} files failed checksum fetch - cannot verify these files")

    # Phase 2: verify local files against the fetched checksums, reporting
    # failures as they happen.
    print(f"\nVerifying {len(hashes)} files against local copies...")
    verification_successes = 0
    verification_failures = []
    for i, (filename, expected_hash) in enumerate(tqdm(hashes, desc="Verifying files")):
        local_file_path = os.path.join(local_path, filename)
        if not os.path.exists(local_file_path):
            failure_msg = "File does not exist locally"
            verification_failures.append((filename, failure_msg))
            print(f"❌ MISSING FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if not os.path.isfile(local_file_path):
            failure_msg = "Path exists but is not a file"
            verification_failures.append((filename, failure_msg))
            print(f"❌ NOT A FILE [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        try:
            actual_hash = calculate_sha256(local_file_path)
        except (IOError, OSError) as e:
            # calculate_sha256 raises once its retries are exhausted.
            failure_msg = f"Failed to calculate hash: {e}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH ERROR [{i+1}/{len(hashes)}]: {filename} - {failure_msg}")
            continue
        if actual_hash.lower() == expected_hash.lower():
            verification_successes += 1
        else:
            failure_msg = f"Hash mismatch - expected: {expected_hash}, got: {actual_hash}"
            verification_failures.append((filename, failure_msg))
            print(f"❌ HASH MISMATCH [{i+1}/{len(hashes)}]: {filename}")
            print(f"   Expected: {expected_hash}")
            print(f"   Got:      {actual_hash}")

    # Final report
    print(f"\n{'='*80}")
    print("FINAL VERIFICATION REPORT")
    print(f"{'='*80}")
    print(f"Total files discovered: {len(files)}")
    print(f"Checksums successfully fetched: {len(hashes)}")
    print(f"Files successfully verified: {verification_successes}")
    print(f"Discovery failures (directories): {len(failed_dirs)}")
    print(f"Checksum fetch failures: {len(checksum_failures)}")
    print(f"Verification failures: {len(verification_failures)}")
    if len(hashes) > 0:
        print(f"Verification success rate: {verification_successes/len(hashes)*100:.1f}%")

    # Per-file failures were already reported inline above; summarize the
    # directories that could not be listed.
    if failed_dirs:
        print("\nFAILED DIRECTORIES SUMMARY:")
        for d in failed_dirs:
            print(f"  - {d}")

    # Exit with an appropriate status code
    total_failures = len(failed_dirs) + len(checksum_failures) + len(verification_failures)
    if total_failures > 0:
        print(f"\n⚠️ WARNING: {total_failures} total failures detected!")
        sys.exit(1)
    else:
        print("\n✅ SUCCESS: All files verified successfully!")
        sys.exit(0)

if __name__ == "__main__":
    main()