How to avoid concurrency issues when uploading to the Hugging Face Hub from many workers
# This is a hacky script, working at the time it was published.
# It uses internals of the huggingface_hub library, so expect breaking changes without prior notice.
# The biggest challenge when uploading from many workers is to avoid concurrency issues during the /commit call.
# The solution is to (pre-)upload files from the workers and put them in a queue.
#
# Then a last worker is dedicated to making commits on the Hub, using the pre-uploaded files.
# The queue coordinating the workers can be built on local files, a database, a Python queue, etc., as long as it's robust to concurrency.
# You should also add a retry mechanism in the "commit worker" in case of failure while committing (a sketch is given at the end).
#
# Uploading TBs of data from many workers is challenging and puts a high load on our infra.
# Please expect failures, and therefore build a robust retry mechanism client-side.
#
# Note: this gist is only valid for LFS files, i.e. large ones. Smaller ones (README, config.json, etc.) can always be committed later.
import time
from typing import Optional

from huggingface_hub import CommitOperationAdd, HfApi
from huggingface_hub.lfs import UploadInfo
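
####################
# One possible queue implementation (illustrative sketch, not required by the Hub):
# if all workers share a filesystem, a single SQLite database can serve as the queue.
# SQLite serializes concurrent writers with its own locking, which makes it
# reasonably robust to concurrent access from many processes. The "queue.db" path
# and the table/column names below are arbitrary choices for this example.
####################
import sqlite3


def _queue_connect(db_path: str = "queue.db") -> sqlite3.Connection:
    conn = sqlite3.connect(db_path, timeout=30)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS preuploaded ("
        " path_in_repo TEXT PRIMARY KEY,"
        " sha256 TEXT NOT NULL,"
        " size INTEGER NOT NULL,"
        " remote_oid TEXT,"
        " committed INTEGER NOT NULL DEFAULT 0)"
    )
    return conn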
####################
# Code in every worker
####################

api = HfApi()
repo_id = ...
repo_type = ...

additions = [
    CommitOperationAdd(path_in_repo=..., path_or_fileobj=...),
    # ...
]
api.preupload_lfs_files(repo_id=repo_id, repo_type=repo_type, additions=additions)

for addition in additions:
    if addition._upload_mode != "lfs":
        # A "regular" (non-LFS) file means it's a small one => should not happen here.
        # Small files (json, README, etc.) can be handled separately, so just in case, skip it.
        continue
    # You must save the following info for each addition "somewhere".
    # It is the only info required to commit the files after upload.
    # (One concrete way, using the example SQLite queue, is sketched right after this loop.)
    addition.path_in_repo  # str
    addition.upload_info.sha256.hex()  # str
    addition.upload_info.size  # int
    addition._remote_oid  # str or None
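
# Illustrative sketch: persist the required info into the example SQLite queue
# defined above. "INSERT OR REPLACE" keeps re-runs of a worker idempotent.
with _queue_connect() as conn:
    conn.executemany(
        "INSERT OR REPLACE INTO preuploaded (path_in_repo, sha256, size, remote_oid)"
        " VALUES (?, ?, ?, ?)",
        [
            (a.path_in_repo, a.upload_info.sha256.hex(), a.upload_info.size, a._remote_oid)
            for a in additions
            if a._upload_mode == "lfs"
        ],
    )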
####################
# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
####################

class HackyCommitOperationAdd(CommitOperationAdd):
    def __post_init__(self) -> None:
        # Important! Do NOT call "super().__post_init__()": it would validate
        # path_or_fileobj and compute its sha256, which is exactly what we bypass here.
        pass


def _build_hacky_operation(
    path_in_repo: str,
    sha256: str,
    size: int,
    remote_oid: Optional[str] = None,
) -> HackyCommitOperationAdd:
    operation = HackyCommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=b"")
    # Manually set upload_info to bypass checks
    operation.upload_info = UploadInfo(sha256=bytes.fromhex(sha256), size=size, sample=b"")
    operation._upload_mode = "lfs"
    operation._should_ignore = False
    operation._remote_oid = remote_oid
    return operation
####################
# Logic for the single worker committing the files
####################

# Retrieve the saved info for each LFS addition.
# It's good to commit in batches of e.g. 100 files.
# You want to track which files have been committed successfully and which have not.
# In case of failure (timeout, etc.), make sure to catch the exception and put the
# non-committed files back in the queue (a fleshed-out sketch is given at the end).

api = HfApi()

while True:
    additions = [
        _build_hacky_operation(path_in_repo=..., sha256=..., size=..., remote_oid=...),
        # ...
        # retrieve e.g. 100 files from the queue
    ]
    api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        operations=additions,
        commit_message=f"Add {len(additions)} files",
    )
    time.sleep(1)  # just in case

# Once in a while, you'll want to squash the commit history to avoid accumulating
# thousands of commits in the repo:
api.super_squash_history(repo_id=repo_id, repo_type=repo_type)
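
####################
# Illustrative sketch: a fleshed-out version of the commit loop above, with the
# retry/bookkeeping logic described in the comments, built on the example SQLite
# queue. Rows keep committed = 0 until a commit succeeds, so a failed batch is
# automatically picked up again on the next pass.
####################
conn = _queue_connect()
while True:
    rows = conn.execute(
        "SELECT path_in_repo, sha256, size, remote_oid"
        " FROM preuploaded WHERE committed = 0 LIMIT 100"
    ).fetchall()
    if not rows:
        time.sleep(10)  # queue is empty: wait for workers to pre-upload more files
        continue
    additions = [
        _build_hacky_operation(path_in_repo=p, sha256=s, size=n, remote_oid=r)
        for (p, s, n, r) in rows
    ]
    try:
        api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=additions,
            commit_message=f"Add {len(additions)} files",
        )
    except Exception:
        time.sleep(5)  # commit failed: rows keep committed = 0 and will be retried
        continue
    with conn:  # mark the whole batch as committed in one transaction
        conn.executemany(
            "UPDATE preuploaded SET committed = 1 WHERE path_in_repo = ?",
            [(op.path_in_repo,) for op in additions],
        )
    time.sleep(1)  # just in case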