How to avoid concurrency issues when uploading to the Hugging Face Hub from many workers
# This is a hacky script, working at the time it was published.
# It uses internals of the huggingface_hub library, so expect breaking changes without prior notice.
# The biggest challenge when uploading from many workers is to avoid concurrency issues during the /commit call.
# The solution is to (pre-)upload files from the workers and put them in a queue.
#
# Then a last worker is dedicated to making commits on the Hub, using the pre-uploaded files.
# The queue coordinating the workers can be built on local files, a database, a Python queue, etc., as long as it's robust to concurrency.
# You should also add a retry mechanism in the "commit worker" in case of failure while committing (a sketch is given at the end).
#
# Uploading TBs of data from many workers is challenging and puts a high load on our infra.
# Please expect failures, and therefore build a robust retry mechanism client-side.
#
# Note: this gist is only valid for LFS files, i.e. large ones. Smaller ones (README, config.json, etc.) can always be committed later.
import time
from typing import Optional

from huggingface_hub import CommitOperationAdd, HfApi
from huggingface_hub.lfs import UploadInfo
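
####################
# One possible queue implementation (illustrative sketch, not required by the Hub):
# if all workers share a filesystem, a single SQLite database can serve as the queue.
# SQLite serializes concurrent writers with its own locking, which makes it
# reasonably robust to concurrent access from many processes. The "queue.db" path
# and the table/column names below are arbitrary choices for this example.
####################
import sqlite3


def _queue_connect(db_path: str = "queue.db") -> sqlite3.Connection:
    conn = sqlite3.connect(db_path, timeout=30)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS preuploaded ("
        " path_in_repo TEXT PRIMARY KEY,"
        " sha256 TEXT NOT NULL,"
        " size INTEGER NOT NULL,"
        " remote_oid TEXT,"
        " committed INTEGER NOT NULL DEFAULT 0)"
    )
    return conn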
####################
# Code in every worker
####################

api = HfApi()
repo_id = ...
repo_type = ...

additions = [
    CommitOperationAdd(path_in_repo=..., path_or_fileobj=...),
    # ...
]
api.preupload_lfs_files(repo_id=repo_id, repo_type=repo_type, additions=additions)

for addition in additions:
    if addition._upload_mode != "lfs":
        # A "regular" (non-LFS) file means it's a small one => should not happen here.
        # Small files (json, README, etc.) can be handled separately, so just in case, skip it.
        continue
    # You must save the following info for each addition "somewhere".
    # It is the only info required to commit the files after upload.
    # (One concrete way, using the example SQLite queue, is sketched right after this loop.)
    addition.path_in_repo  # str
    addition.upload_info.sha256.hex()  # str
    addition.upload_info.size  # int
    addition._remote_oid  # str or None
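
# Illustrative sketch: persist the required info into the example SQLite queue
# defined above. "INSERT OR REPLACE" keeps re-runs of a worker idempotent.
with _queue_connect() as conn:
    conn.executemany(
        "INSERT OR REPLACE INTO preuploaded (path_in_repo, sha256, size, remote_oid)"
        " VALUES (?, ?, ?, ?)",
        [
            (a.path_in_repo, a.upload_info.sha256.hex(), a.upload_info.size, a._remote_oid)
            for a in additions
            if a._upload_mode == "lfs"
        ],
    )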
####################
# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
####################

class HackyCommitOperationAdd(CommitOperationAdd):
    def __post_init__(self) -> None:
        # Important! Do NOT call "super().__post_init__()": it would validate
        # path_or_fileobj and compute its sha256, which is exactly what we bypass here.
        pass


def _build_hacky_operation(
    path_in_repo: str,
    sha256: str,
    size: int,
    remote_oid: Optional[str] = None,
) -> HackyCommitOperationAdd:
    operation = HackyCommitOperationAdd(path_in_repo=path_in_repo, path_or_fileobj=b"")
    # Manually set upload_info to bypass checks
    operation.upload_info = UploadInfo(sha256=bytes.fromhex(sha256), size=size, sample=b"")
    operation._upload_mode = "lfs"
    operation._should_ignore = False
    operation._remote_oid = remote_oid
    return operation
####################
# Logic for the single worker committing the files
####################

# Retrieve the saved info for each LFS addition.
# It's good to commit in batches of e.g. 100 files.
# You want to track which files have been committed successfully and which have not.
# In case of failure (timeout, etc.), make sure to catch the exception and put the
# non-committed files back in the queue (a fleshed-out sketch is given at the end).

api = HfApi()

while True:
    additions = [
        _build_hacky_operation(path_in_repo=..., sha256=..., size=..., remote_oid=...),
        # ...
        # retrieve e.g. 100 files from the queue
    ]
    api.create_commit(
        repo_id=repo_id,
        repo_type=repo_type,
        operations=additions,
        commit_message=f"Add {len(additions)} files",
    )
    time.sleep(1)  # just in case

# Once in a while, you'll want to squash the commit history to avoid accumulating
# thousands of commits in the repo:
api.super_squash_history(repo_id=repo_id, repo_type=repo_type)
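
####################
# Illustrative sketch: a fleshed-out version of the commit loop above, with the
# retry/bookkeeping logic described in the comments, built on the example SQLite
# queue. Rows keep committed = 0 until a commit succeeds, so a failed batch is
# automatically picked up again on the next pass.
####################
conn = _queue_connect()
while True:
    rows = conn.execute(
        "SELECT path_in_repo, sha256, size, remote_oid"
        " FROM preuploaded WHERE committed = 0 LIMIT 100"
    ).fetchall()
    if not rows:
        time.sleep(10)  # queue is empty: wait for workers to pre-upload more files
        continue
    additions = [
        _build_hacky_operation(path_in_repo=p, sha256=s, size=n, remote_oid=r)
        for (p, s, n, r) in rows
    ]
    try:
        api.create_commit(
            repo_id=repo_id,
            repo_type=repo_type,
            operations=additions,
            commit_message=f"Add {len(additions)} files",
        )
    except Exception:
        time.sleep(5)  # commit failed: rows keep committed = 0 and will be retried
        continue
    with conn:  # mark the whole batch as committed in one transaction
        conn.executemany(
            "UPDATE preuploaded SET committed = 1 WHERE path_in_repo = ?",
            [(op.path_in_repo,) for op in additions],
        )
    time.sleep(1)  # just in case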