@Wauplin
Created October 10, 2025 09:28
How to avoid concurrency issues when uploading to the Hugging Face Hub from many workers
# This is a hacky script, working at the time it was published.
# It uses internals of the huggingface_hub library, so expect breaking changes without prior notice.
# The biggest challenge when uploading from many workers is to avoid concurrency issues during the /commit call.
# The solution is to (pre-)upload files from the workers and put them in a queue.
#
# Then a final worker is dedicated to making commits on the Hub, using the pre-uploaded files.
# The queue used to coordinate the workers can be implemented with local files, a database, a Python queue, etc., as long as it's robust to concurrency.
# You should also add a retry mechanism in the "commit worker" in case of failure while committing.
#
# Uploading TBs of data from many workers is challenging and puts a high load on our infra.
# Please expect failures, and therefore build a robust retry mechanism client side.
#
# Note: this gist is only valid for LFS files, i.e. large ones. For smaller ones (README, config.json, etc.) you can always commit them later.
import time
from typing import Optional

from huggingface_hub import CommitOperationAdd, HfApi
from huggingface_hub.lfs import UploadInfo
####################
# Code in every worker
####################
api = HfApi()
repo_id = ...
repo_type = ...
additions = [
    CommitOperationAdd(path_in_repo=..., path_or_fileobj=...),
    # ...
]
api.preupload_lfs_files(repo_id=repo_id, repo_type=repo_type, additions=additions)
for addition in additions:
    if addition._upload_mode != "lfs":
        # If regular file, it means it's a small one => should not happen. We can handle small files (json, readme, etc.) separately.
        # But just in case, we skip it.
        continue

    # You must save this info for each addition "somewhere".
    # This is the only info required to commit the files after upload.
    addition.path_in_repo  # str
    addition.upload_info.sha256.hex()  # str
    addition.upload_info.size  # int
    addition._remote_oid  # str or None
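# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original gist): one possible way to save this
# info "somewhere". It assumes a shared directory visible to all workers
# (QUEUE_DIR is a hypothetical path) and appends one JSON line per file to a
# per-worker file, so workers never write to the same file concurrently.
# ---------------------------------------------------------------------------
import json
import os

QUEUE_DIR = "/shared/upload-queue"  # hypothetical shared location
os.makedirs(QUEUE_DIR, exist_ok=True)
queue_file = os.path.join(QUEUE_DIR, f"worker-{os.getpid()}.jsonl")
with open(queue_file, "a") as f:
    for addition in additions:
        if addition._upload_mode != "lfs":
            continue
        f.write(
            json.dumps(
                {
                    "path_in_repo": addition.path_in_repo,
                    "sha256": addition.upload_info.sha256.hex(),
                    "size": addition.upload_info.size,
                    "remote_oid": addition._remote_oid,
                }
            )
            + "\n"
        )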
####################
# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
####################
class HackyCommitOperationAdd(CommitOperationAdd):
    def __post_init__(self) -> None:
        # Important! Do not call "super().__post_init__()"!
        pass
def _build_hacky_operation(
    path_in_repo: str,
    sha256: str,
    size: int,
    remote_oid: Optional[str] = None,
) -> HackyCommitOperationAdd:
    operation = HackyCommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=b"")
    # Manually set upload_info to bypass checks
    operation.upload_info = UploadInfo(sha256=bytes.fromhex(sha256), size=size, sample=b"")
    operation._upload_mode = "lfs"
    operation._should_ignore = False
    operation._remote_oid = remote_oid
    return operation
####################
# Logic for the single worker committing the files
####################
# Retrieve the saved info for each LFS addition.
# It's good to commit in batches of e.g. 100 files.
# You also want to track which files have been committed successfully and which have not.
# In case of failure (timeout, etc.), make sure to catch the exception and put the non-committed files back in the queue.
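# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original gist): a minimal way to read a batch
# back from the hypothetical JSONL queue written by the workers above.
# `read_queue_batch` is a made-up helper; swap in whatever queue you use.
# For brevity it does not mark entries as consumed; a real queue must, so that
# successfully committed files are not committed twice.
# ---------------------------------------------------------------------------
import glob
import json


def read_queue_batch(queue_dir, batch_size=100):
    """Return up to `batch_size` pending entries from the JSONL queue files."""
    entries = []
    for path in sorted(glob.glob(f"{queue_dir}/*.jsonl")):
        with open(path) as f:
            for line in f:
                entries.append(json.loads(line))
                if len(entries) >= batch_size:
                    return entries
    return entries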
api = HfApi()
while True:
    additions = [
        _build_hacky_operation(path_in_repo=..., sha256=..., size=..., remote_oid=...),
        # ...
        # retrieve e.g. 100 files from the queue
    ]
    api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        operations=additions,
        commit_message=f"Add {len(additions)} files",
    )
    time.sleep(1)  # just in case
# Once in a while, you'll want to squash the commit history to avoid having thousands of commits in the repo:
api.super_squash_history(repo_id=repo_id, repo_type=repo_type)
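# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original gist): the retry mechanism mentioned
# above. It wraps `create_commit` with retries and exponential backoff and, if
# all attempts fail, hands the batch back to the queue. `requeue` is a
# hypothetical callback matching whatever queue you use.
# ---------------------------------------------------------------------------
def commit_with_retry(api, operations, requeue, max_retries=5):
    for attempt in range(max_retries):
        try:
            api.create_commit(
                repo_id=repo_id,
                repo_type=repo_type,
                operations=operations,
                commit_message=f"Add {len(operations)} files",
            )
            return True
        except Exception as e:
            print(f"Commit attempt {attempt + 1}/{max_retries} failed: {e}")
            time.sleep(2**attempt)  # exponential backoff before retrying
    requeue(operations)  # give the batch back to the queue for a later attempt
    return False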