Last active
October 5, 2025 21:59
-
-
Save EvilFreelancer/dd1333ee965dc8a01a9218c7ffd95af2 to your computer and use it in GitHub Desktop.
Script for converting a Wikipedia dataset to plain text, formatted with an LLM tokenizer's chat template
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Build single-line calibration prompts from Wikimedia Wikipedia (subset like 20231101.ru). | |
| For each article: | |
| - messages = [{"role": "user", "content": f"{title}\n\n{text}"}] | |
| - render via tokenizer.apply_chat_template(..., add_generation_prompt=True) | |
| - optionally inject BOS/EOS (sample-level or message-level) | |
| - replace real newlines with literal '\n' | |
| - write ONE LINE per dataset record | |
| CLI: | |
| --subset : Wikipedia config, default 20231101.ru | |
| --split : split name, default train | |
| --limit : how many rows to take from head, default 500 | |
| --model : HF model id for tokenizer/chat_template (default: driaforall/mem-agent) | |
| --output : output text file | |
| --bos-eos-mode {none,sample,message} | |
| --bos-token / --eos-token | |
| --boundary-pattern : regex to detect message boundaries when --bos-eos-mode=message | |
| Notes: | |
| - Comments are in English by request. | |
| - Message-boundary detection is heuristic; adjust --boundary-pattern for your template. | |
| """ | |
| import argparse | |
| import io | |
| import os | |
| import re | |
| from typing import List, Dict | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| # Heuristic: common end-of-message sentinels used by many chat templates. | |
| DEFAULT_BOUNDARY_PATTERN = r"(?:<\|im_end\|>|</s>|<\|eot_id\|>)" | |
| def resolve_special_token(tok, override: str | None, which: str) -> str: | |
| """ | |
| Resolve BOS/EOS printable string. | |
| Priority: | |
| 1) explicit override, | |
| 2) tokenizer.<bos/eos>_token (string), | |
| 3) decode tokenizer.<bos/eos>_token_id, | |
| 4) empty string. | |
| """ | |
| if override is not None: | |
| return override | |
| if which == "bos": | |
| if getattr(tok, "bos_token", None): | |
| return tok.bos_token | |
| tid = getattr(tok, "bos_token_id", None) | |
| else: | |
| if getattr(tok, "eos_token", None): | |
| return tok.eos_token | |
| tid = getattr(tok, "eos_token_id", None) | |
| if tid is not None: | |
| try: | |
| return tok.decode([tid]) | |
| except Exception: | |
| return "" | |
| return "" | |
def inject_bos_eos(text: str,
                   mode: str,
                   bos_token: str,
                   eos_token: str,
                   boundary_pattern: str) -> str:
    """
    Add BOS/EOS markers to *text* depending on *mode*:

    - 'none'    : leave the text untouched.
    - 'sample'  : one BOS at the start and one EOS at the very end
                  (each injected only when not already present).
    - 'message' : BOS at the start, an EOS appended after every boundary
                  matched by *boundary_pattern*, and a guaranteed final EOS.

    Any unrecognized mode returns the text unchanged.
    """
    bos = bos_token or ""
    eos = eos_token or ""

    def _with_bos(s: str) -> str:
        # Prepend BOS only when it is non-empty and not already leading.
        return s if (not bos or s.startswith(bos)) else bos + s

    def _with_final_eos(s: str) -> str:
        # Guarantee the text ends with EOS; trailing whitespace is stripped
        # only when an EOS actually has to be appended (matches original).
        if s.rstrip().endswith(eos):
            return s
        return s.rstrip() + eos + "\n"

    if mode == "sample":
        out = _with_bos(text)
        return _with_final_eos(out) if eos else out

    if mode == "message":
        if not eos:
            # Without an EOS symbol, per-message injection is meaningless;
            # degrade to BOS-only.
            return _with_bos(text)
        out = re.compile(boundary_pattern).sub(lambda m: m.group(0) + eos, text)
        return _with_final_eos(_with_bos(out))

    # 'none' and any unknown mode: pass through untouched.
    return text
def main():
    """
    CLI entry point.

    Renders the head of a Wikimedia Wikipedia subset through the model's chat
    template (one user turn per article), optionally injects BOS/EOS, escapes
    newlines, and writes exactly one line per record to the output file.
    """
    ap = argparse.ArgumentParser(description="Generate one-line prompts from Wikipedia with optional BOS/EOS.")
    ap.add_argument("--model", default="driaforall/mem-agent",
                    help="HF model id providing tokenizer & chat_template.")
    ap.add_argument("--subset", default="20231101.ru",
                    help="Wikimedia/Wikipedia config (subset), e.g. 20231101.ru.")
    ap.add_argument("--split", default="train", help="Dataset split (default: train).")
    ap.add_argument("--limit", type=int, default=500, help="How many rows to take from the head (default: 500).")
    ap.add_argument("--output", default="calib_wiki.txt", help="Output text file.")
    ap.add_argument("--bos-eos-mode", choices=["none", "sample", "message"], default="none",
                    help="Where to inject BOS/EOS tokens.")
    ap.add_argument("--bos-token", default=None, help="Override BOS token string.")
    ap.add_argument("--eos-token", default=None, help="Override EOS token string.")
    ap.add_argument("--boundary-pattern", default=DEFAULT_BOUNDARY_PATTERN,
                    help="Regex for message boundaries when --bos-eos-mode=message.")
    args = ap.parse_args()

    # Load tokenizer with chat_template.
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Resolve printable BOS/EOS (CLI override wins over tokenizer attributes).
    bos = resolve_special_token(tok, args.bos_token, "bos")
    eos = resolve_special_token(tok, args.eos_token, "eos")

    # Load dataset.
    ds = load_dataset("wikimedia/wikipedia", args.subset, split=args.split)

    # Ensure the output directory exists (dirname of an absolute path is
    # always non-empty, so makedirs never receives "").
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    written = 0
    with open(args.output, "w", encoding="utf-8", newline="\n") as f:
        n = min(args.limit, len(ds))
        for i in range(n):
            row = ds[i]
            title = (row.get("title") or "").strip()
            text = (row.get("text") or "").strip()
            if not text:
                # Skip empty articles so every output line carries content.
                continue
            # Build a single user turn: "title\n\ntext" (title may be absent).
            content = f"{title}\n\n{text}" if title else text
            messages: List[Dict[str, str]] = [{"role": "user", "content": content}]
            # Render the conversation WITHOUT an assistant generation prefix;
            # the line is used as-is for calibration.  (Flip
            # add_generation_prompt to True if your use case needs the
            # assistant prefix appended.)
            rendered = tok.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            # Inject BOS/EOS before escaping newlines.
            rendered = inject_bos_eos(
                text=rendered,
                mode=args.bos_eos_mode,
                bos_token=bos,
                eos_token=eos,
                boundary_pattern=args.boundary_pattern,
            )
            # One record per line: normalize CRLF and lone CR first, then
            # escape every newline as the literal two-character sequence '\n'.
            # (Previously a bare '\r' leaked through unescaped and could split
            # a record in CR-aware readers.)
            line = (rendered.replace("\r\n", "\n")
                            .replace("\r", "\n")
                            .replace("\n", "\\n"))
            # Write one line per dataset record.
            f.write(line + "\n")
            written += 1
    print(f"Wrote {written} single-line prompts to: {args.output}")


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage example: