Skip to content

Instantly share code, notes, and snippets.

@do-me
Created February 16, 2026 16:59
Show Gist options
  • Select an option

  • Save do-me/4fa9b8373fc7f5945f2f5830b2a26f94 to your computer and use it in GitHub Desktop.

Select an option

Save do-me/4fa9b8373fc7f5945f2f5830b2a26f94 to your computer and use it in GitHub Desktop.
Delete wrongly prefixed files in a Hugging Face dataset
from huggingface_hub import HfApi, CommitOperationDelete
# Repository settings -- edit these before running the script.
repo_id = "user/reponame"
token = "your token"
api = HfApi(token=token)

# Step 1: enumerate the entries of the target folder in the dataset repo.
# `recursive` is not passed, so the library's default listing depth applies.
target_folder = "files/2026"
entries = api.list_repo_tree(repo_id, path_in_repo=target_folder, repo_type="dataset")

# Step 2: build one delete operation per entry whose base name
# (e.g. "data_2026-02-03_eng.parquet") starts with "data_" and whose
# path ends with ".parquet".
delete_ops = [
    CommitOperationDelete(path_in_repo=entry.path)
    for entry in entries
    if entry.path.rsplit("/", 1)[-1].startswith("data_")
    and entry.path.endswith(".parquet")
]

# Step 3: show what would be removed, ask for confirmation, then commit.
if not delete_ops:
    print(f"No files starting with 'data_' found in {target_folder}.")
else:
    print(f"Found {len(delete_ops)} files with 'data_' prefix in {target_folder}:")
    for delete_op in delete_ops:
        print(f" - {delete_op.path_in_repo}")

    answer = input("\nConfirm deletion? (y/n): ")
    if answer.lower() != 'y':
        # Anything other than "y"/"Y" leaves the repository untouched.
        print("Aborted.")
    else:
        # All deletions are submitted together in a single commit.
        message = f"Cleanup: remove incorrectly prefixed data_ files from {target_folder}"
        api.create_commit(
            repo_id=repo_id,
            operations=delete_ops,
            commit_message=message,
            repo_type="dataset",
        )
        print("Deletion complete.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment