Created
February 16, 2026 16:59
-
-
Save do-me/4fa9b8373fc7f5945f2f5830b2a26f94 to your computer and use it in GitHub Desktop.
Delete wrongly prefixed files in a Hugging Face (HF) dataset repository
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from huggingface_hub import HfApi, CommitOperationDelete

# Interactive cleanup script: lists the entries of one folder in a dataset
# repo, selects the "data_*.parquet" ones, shows them, and removes them in a
# single commit after the user confirms.

# Configure your repo details
repo_id = "user/reponame"
token = "your token"  # placeholder -- replace before running

api = HfApi(token=token)

# 1. List files specifically in the target folder
target_folder = "files/2026"
files = api.list_repo_tree(repo_id, path_in_repo=target_folder, repo_type="dataset")

# 2. Filter for:
#    - Files inside the specific folder
#    - Filenames starting with "data_"
#    - Files ending with ".parquet"
# The prefix is tested on the base name only (e.g. "data_2026-02-03_eng.parquet"),
# because every listed path starts with the folder name, not with "data_".
to_delete = [
    CommitOperationDelete(path_in_repo=entry.path)
    for entry in files
    if entry.path.rsplit("/", 1)[-1].startswith("data_")
    and entry.path.endswith(".parquet")
]

# 3. Execution logic
if to_delete:
    print(f"Found {len(to_delete)} files with 'data_' prefix in {target_folder}:")
    for op in to_delete:
        print(f" - {op.path_in_repo}")

    # Strip surrounding whitespace so answers such as "y " or " Y" are not
    # silently treated as a refusal.
    confirm = input("\nConfirm deletion? (y/n): ").strip().lower()
    if confirm == "y":
        # All deletions are submitted together as one commit.
        api.create_commit(
            repo_id=repo_id,
            operations=to_delete,
            commit_message=f"Cleanup: remove incorrectly prefixed data_ files from {target_folder}",
            repo_type="dataset",
        )
        print("Deletion complete.")
    else:
        print("Aborted.")
else:
    print(f"No files starting with 'data_' found in {target_folder}.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment