Skip to content

Instantly share code, notes, and snippets.

@hppRC
Created January 30, 2025 10:02
Show Gist options
  • Select an option

  • Save hppRC/a53847119be4603fa20e809e8326c478 to your computer and use it in GitHub Desktop.

Select an option

Save hppRC/a53847119be4603fa20e809e8326c478 to your computer and use it in GitHub Desktop.
import datasets as ds
from konoha import SentenceTokenizer
def title2text():
    """Build (title, passage) pairs from Aozora Bunko and push them to the Hub.

    Each work's text is sentence-split, and sentences are greedily packed into
    passages of at least 400 characters. A short trailing remainder (< 200
    chars) is merged into the previous passage instead of being emitted as a
    fragment of its own.
    """
    dataset: ds.Dataset = ds.load_dataset(
        "globis-university/aozorabunko-clean", split="train", num_proc=16
    )

    def process(x: dict[str, list]) -> dict[str, list]:
        anc_list, pos_list = [], []
        # Hoisted out of the per-example loop: the tokenizer is configuration
        # only, so one instance per batch avoids re-constructing it per row.
        tokenizer = SentenceTokenizer()
        for text, meta in zip(x["text"], x["meta"]):
            title = meta["作品名"]
            sentences = tokenizer.tokenize(text.strip())
            current_text = ""
            output_texts: list[str] = []
            for sentence in sentences:
                current_text = (current_text + sentence).strip()
                if len(current_text) >= 400:
                    output_texts.append(current_text)
                    current_text = ""
            if len(current_text) > 0:
                # A short leftover is glued onto the last passage; anything
                # >= 200 chars (or the only passage) stands on its own.
                if len(output_texts) > 0 and len(current_text) < 200:
                    output_texts[-1] = (output_texts[-1] + current_text).strip()
                else:
                    output_texts.append(current_text.strip())
            anc_list.extend([title] * len(output_texts))
            pos_list.extend(output_texts)
        return {
            "anc": anc_list,
            "pos": pos_list,
        }

    dataset = dataset.map(
        process,
        num_proc=16,
        batched=True,
        remove_columns=dataset.column_names,
    )
    dataset = dataset.select_columns(["anc", "pos"])
    dataset.push_to_hub("hpprc/ruri-dataset-v2-pt", "aozora-title2text", private=True)
def yomi():
    """Pair each work's title with its reading (yomi) and push to the Hub.

    Rows whose reading is identical to the title are dropped before upload.
    """
    dataset: ds.Dataset = ds.load_dataset(
        "globis-university/aozorabunko-clean", split="train", num_proc=16
    )

    def process(x: dict) -> dict:
        # Map one example's metadata to an anchor/positive pair.
        return {
            "anc": x["meta"]["作品名"],
            "pos": x["meta"]["作品名読み"],
        }

    dataset = dataset.map(process, num_proc=16, remove_columns=dataset.column_names)
    dataset = dataset.filter(lambda x: x["anc"] != x["pos"], num_proc=16)
    dataset = dataset.select_columns(["anc", "pos"])
    dataset.push_to_hub("hpprc/ruri-dataset-v2-pt", "aozora-yomi", private=True)
def yomi_independent():
    """Export (title, yomi, url) triples as a standalone public dataset.

    Like ``yomi`` but keeps the source file URL and pushes to a separate repo;
    rows where title and reading coincide are filtered out.
    """
    dataset: ds.Dataset = ds.load_dataset(
        "globis-university/aozorabunko-clean", split="train", num_proc=16
    )

    def process(x: dict) -> dict:
        # Extract the three fields of interest from the example's metadata.
        meta = x["meta"]
        return {
            "title": meta["作品名"],
            "yomi": meta["作品名読み"],
            "url": meta["XHTML/HTMLファイルURL"],
        }

    dataset = dataset.map(process, num_proc=16, remove_columns=dataset.column_names)
    dataset = dataset.filter(lambda x: x["title"] != x["yomi"], num_proc=16)
    dataset = dataset.select_columns(["title", "yomi", "url"])
    dataset.push_to_hub("hpprc/aozora-yomi")
if __name__ == "__main__":
    # Only the independent yomi export is run; the other exporters were
    # presumably run previously and are kept disabled here.
    # title2text()
    # yomi()
    yomi_independent()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment