convert_to_parquet.py
# /// script
# dependencies = [
#     "datasets==3.6.0",
# ]
# ///
# Copied from `datasets-cli convert_to_parquet`, adapted from
# https://github.com/huggingface/datasets (licensed under Apache 2.0).
# Run with:
#   uv run convert_to_parquet.py marcob/lambada_multilingual EleutherAI/lambada_multilingual
import time
from argparse import ArgumentParser
from typing import Optional

from datasets import (
    get_dataset_config_names,
    get_dataset_default_config_name,
    load_dataset,
)
from datasets.commands import BaseDatasetsCLICommand
from huggingface_hub import HfApi, get_repo_discussions


def _command_factory(args):
    return ConvertToParquetCommand(
        args.dataset_id,
        args.other_id,
        args.token,
        args.revision,
        args.trust_remote_code,
    )


class ConvertToParquetCommand(BaseDatasetsCLICommand):
    @staticmethod
    def register_subcommand(parser):
        parser: ArgumentParser = parser.add_parser(
            "convert_to_parquet", help="Convert dataset to Parquet"
        )
        parser.add_argument("dataset_id", help="source dataset ID")
        parser.add_argument("other_id", help="repo to push to")
        parser.add_argument("--token", help="access token to the Hugging Face Hub")
        parser.add_argument("--revision", help="source revision")
        parser.add_argument(
            "--trust_remote_code",
            action="store_true",
            help="whether to trust the code execution of the load script",
        )
        parser.set_defaults(func=_command_factory)

    def __init__(
        self,
        dataset_id: str,
        other_id: str,
        token: Optional[str],
        revision: Optional[str],
        trust_remote_code: bool,
    ):
        self._dataset_id = dataset_id
        self._other_id = other_id
        self._token = token
        self._revision = revision
        self._trust_remote_code = trust_remote_code

    def run(self) -> None:
        dataset_id = self._dataset_id
        other_id = self._other_id
        token = self._token
        revision = self._revision
        trust_remote_code = self._trust_remote_code
        print(f"{dataset_id}")
        configs = get_dataset_config_names(
            dataset_id,
            token=token,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        print(f"{configs = }")
        default_config = get_dataset_default_config_name(
            dataset_id,
            token=token,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )
        print(f"{default_config = }")
        # Push the default config (or the first one) first: this commit opens the
        # PR that all subsequent config pushes will target.
        if default_config:
            config = default_config
            configs.remove(default_config)
        else:
            config = configs.pop(0)
        print(f"{config = }")
        dataset = load_dataset(
            dataset_id, config, revision=revision, trust_remote_code=trust_remote_code
        )
        commit_info = dataset.push_to_hub(
            other_id,
            config_name=config,
            commit_message="Convert dataset to Parquet",
            commit_description="Convert dataset to Parquet.",
            create_pr=True,
            token=token,
            set_default=default_config is not None,
        )
        time.sleep(5)
        if commit_info:
            pr_revision, pr_url = commit_info.pr_revision, commit_info.pr_url
        else:
            # push_to_hub may not return commit info; fall back to locating the
            # newest open PR on the destination repo.
            pr_revision, pr_url = infer_pr(other_id, token=token)
        # Push every remaining config onto the same PR branch.
        for config in configs:
            print(f"{config = }")
            dataset = load_dataset(
                dataset_id,
                config,
                revision=revision,
                trust_remote_code=trust_remote_code,
            )
            dataset.push_to_hub(
                other_id,
                config_name=config,
                commit_message=f"Add {config} data files",
                revision=pr_revision,
                token=token,
            )
            time.sleep(5)
        # Cleanup is skipped here, likely because `pr_revision` refers to the
        # destination repo rather than the source `dataset_id`.
        # delete_files(dataset_id, revision=pr_revision, token=token)
        print(f"You can find your PR to convert the dataset to Parquet at: {pr_url}")


def infer_pr(dataset_id, token=None):
    # Return the git reference and URL of the most recently opened PR on the repo.
    discussions = get_repo_discussions(dataset_id, repo_type="dataset", token=token)
    prs = [
        discussion
        for discussion in discussions
        if discussion.is_pull_request and discussion.status == "open"
    ]
    pr = sorted(prs, key=lambda pr: pr.num)[-1]
    return pr.git_reference, pr.url


def delete_files(dataset_id, revision=None, token=None):
    # Remove the legacy loading script, its auxiliary files, dataset_infos.json,
    # and the original data files, leaving only .gitattributes and README.md.
    dataset_name = dataset_id.split("/")[-1]
    hf_api = HfApi(token=token)
    repo_files = hf_api.list_repo_files(
        dataset_id,
        repo_type="dataset",
    )
    if repo_files:
        legacy_json_file = []
        python_files = []
        data_files = []
        for filename in repo_files:
            if filename in {".gitattributes", "README.md"}:
                continue
            elif filename == f"{dataset_name}.py":
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script",
                )
            elif filename == "dataset_infos.json":
                legacy_json_file.append(filename)
            elif filename.endswith(".py"):
                python_files.append(filename)
            else:
                data_files.append(filename)
        if legacy_json_file:
            hf_api.delete_file(
                "dataset_infos.json",
                dataset_id,
                repo_type="dataset",
                revision=revision,
                commit_message="Delete legacy dataset_infos.json",
            )
        if python_files:
            for filename in python_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete loading script auxiliary file",
                )
        if data_files:
            for filename in data_files:
                hf_api.delete_file(
                    filename,
                    dataset_id,
                    repo_type="dataset",
                    revision=revision,
                    commit_message="Delete data file",
                )
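

# A sketch of manual cleanup once a Parquet PR exists on the *source* repo:
# `delete_files` targets `dataset_id` itself, so the revision must be a PR branch
# on that repo. The revision value below is a placeholder, not a real PR.
#
#     delete_files("marcob/lambada_multilingual", revision="refs/pr/1", token=None)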


if __name__ == "__main__":
    parser = ArgumentParser(description="Convert dataset to Parquet")
    parser.add_argument("dataset_id", help="source dataset ID")
    parser.add_argument("other_id", help="repo to push to")
    parser.add_argument("--token", help="access token to the Hugging Face Hub")
    parser.add_argument("--revision", help="source revision")
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="whether to trust the code execution of the load script",
    )
    args = parser.parse_args()
    command = ConvertToParquetCommand(
        args.dataset_id,
        args.other_id,
        args.token,
        args.revision,
        args.trust_remote_code,
    )
    command.run()
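
# Programmatic use (a minimal sketch; importing as `convert_to_parquet` assumes
# the script is saved under the suggested filename above):
#
#     from convert_to_parquet import ConvertToParquetCommand
#
#     command = ConvertToParquetCommand(
#         "marcob/lambada_multilingual",      # source dataset with a loading script
#         "EleutherAI/lambada_multilingual",  # destination repo for the Parquet PR
#         token=None,                         # fall back to cached Hub credentials
#         revision=None,                      # default branch of the source repo
#         trust_remote_code=True,             # required to execute the loading script
#     )
#     command.run()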