convert_to_parquet.py
# /// script
# dependencies = [
#     "datasets==3.6.0",
# ]
# ///
# copied from datasets-cli convert_to_parquet
# run with:
#   uv run convert_to_parquet.py marcob/lambada_multilingual EleutherAI/lambada_multilingual
# adapted from https://github.com/huggingface/datasets
# Licensed under Apache 2.0
import time
from argparse import ArgumentParser
from typing import Optional

from datasets import (
    get_dataset_config_names,
    get_dataset_default_config_name,
    load_dataset,
)
from datasets.commands import BaseDatasetsCLICommand
from huggingface_hub import HfApi, get_repo_discussions


def _command_factory(args):
    return ConvertToParquetCommand(
        args.dataset_id,
        args.other_id,
        args.token,
        args.revision,
        args.trust_remote_code,
    )


class ConvertToParquetCommand(BaseDatasetsCLICommand):
    @staticmethod
    def register_subcommand(parser):
        parser: ArgumentParser = parser.add_parser(
            "convert_to_parquet", help="Convert dataset to Parquet"
        )
        parser.add_argument("dataset_id", help="source dataset ID")
        parser.add_argument("other_id", help="repo to push to")
        parser.add_argument("--token", help="access token to the Hugging Face Hub")
        parser.add_argument("--revision", help="source revision")
        parser.add_argument(
            "--trust_remote_code",
            action="store_false",
            help="pass to disable trusting the code execution of the load script (trusted by default)",
        )
        parser.set_defaults(func=_command_factory)

    def __init__(
        self,
        dataset_id: str,
        other_id: str,
        token: Optional[str],
        revision: Optional[str],
        trust_remote_code: bool,
    ):
        self._dataset_id = dataset_id
        self._other_id = other_id
        self._token = token
        self._revision = revision
        self._trust_remote_code = trust_remote_code

    def run(self) -> None:
        dataset_id = self._dataset_id
        other_id = self._other_id
        token = self._token
        revision = self._revision
        trust_remote_code = self._trust_remote_code
        print(f"{dataset_id}")
        configs = get_dataset_config_names(
            dataset_id,
            token=token,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        print(f"{configs = }")
        default_config = get_dataset_default_config_name(
            dataset_id,
            token=token,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        print(f"{default_config = }")
        if default_config:
            config = default_config
            configs.remove(default_config)
        else:
            config = configs.pop(0)
        print(f"{config = }")
        dataset = load_dataset(
            dataset_id, config, revision=revision, trust_remote_code=trust_remote_code
        )
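        # pushing with create_pr=True uploads the Parquet shards and opens a
        # pull request on the target repo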
        commit_info = dataset.push_to_hub(
            other_id,
            config_name=config,
            commit_message="Convert dataset to Parquet",
            commit_description="Convert dataset to Parquet.",
            create_pr=True,
            token=token,
            set_default=default_config is not None,
        )
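        # give the Hub a moment to register the new PR before querying or pushing to it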
        time.sleep(5)
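        # push_to_hub may not return commit info; fall back to locating the
        # newest open PR on the target repo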
        if commit_info:
            pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
        else:
            pr_revision, pr_url = infer_pr(other_id, token=token)
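        # push each remaining config as additional data files on the same PR branch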
        for config in configs:
            print(f"{config = }")
            dataset = load_dataset(
                dataset_id,
                config,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )
            dataset.push_to_hub(
                other_id,
                config_name=config,
                commit_message=f"Add {config} data files",
                revision=pr_revision,
                token=token,
            )
            time.sleep(5)
        # delete_files(dataset_id, revision=pr_revision, token=token)
        print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")


def infer_pr(dataset_id, token=None):
    discussions = get_repo_discussions(dataset_id, repo_type="dataset", token=token)
    prs = [
        discussion
        for discussion in discussions
        if discussion.is_pull_request and discussion.status == "open"
    ]
    pr = sorted(prs, key=lambda pr: pr.num)[-1]
    return pr.git_reference, pr.url


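# optional cleanup, left commented out in run(): removes the loading script, the
# legacy dataset_infos.json, auxiliary .py files, and old data files from the PR branch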
def delete_files(dataset_id, revision=None, token=None):
    dataset_name = dataset_id.split("/")[-1]
    hf_api = HfApi(token=token)
    repo_files = hf_api.list_repo_files(
        dataset_id,
        repo_type="dataset",
    )
    if repo_files:
        legacy_json_file = []
        python_files = []
        data_files = []
        for filename in repo_files:
            if filename in {".gitattributes", "README.md"}:
                continue
            elif filename == f"{dataset_name}.py":
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script",
                )
            elif filename == "dataset_infos.json":
                legacy_json_file.append(filename)
            elif filename.endswith(".py"):
                python_files.append(filename)
            else:
                data_files.append(filename)
        if legacy_json_file:
            hf_api.delete_file(
                "dataset_infos.json",
                dataset_id,
                repo_type="dataset",
                revision=revision,
                commit_message="Delete legacy dataset_infos.json",
            )
        if python_files:
            for filename in python_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script auxiliary file",
                )
        if data_files:
            for filename in data_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete data file",
                )


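# standalone entry point so the script can run directly (e.g. via uv run) instead
# of through datasets-cli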
| if __name__ == "__main__": | |
| parser = ArgumentParser(description="Convert dataset to Parquet") | |
| parser.add_argument("dataset_id", help="source dataset ID") | |
| parser.add_argument("other_id", help="repo to push to") | |
| parser.add_argument("--token", help="access token to the Hugging Face Hub") | |
| parser.add_argument("--revision", help="source revision") | |
    parser.add_argument(
        "--trust_remote_code",
        action="store_false",
        help="pass to disable trusting the code execution of the load script (trusted by default)",
    )
    args = parser.parse_args()
    command = ConvertToParquetCommand(
        args.dataset_id,
        args.other_id,
        args.token,
        args.revision,
        args.trust_remote_code,
    )
    command.run()