Skip to content

Instantly share code, notes, and snippets.

@EvilFreelancer
Last active October 5, 2025 21:59
Show Gist options
  • Select an option

  • Save EvilFreelancer/dd1333ee965dc8a01a9218c7ffd95af2 to your computer and use it in GitHub Desktop.

Select an option

Save EvilFreelancer/dd1333ee965dc8a01a9218c7ffd95af2 to your computer and use it in GitHub Desktop.
Script for converting a Wikipedia dataset to plain text, rendered with an LLM tokenizer's chat template (one escaped line per article)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build single-line calibration prompts from Wikimedia Wikipedia (subset like 20231101.ru).

For each article:
- messages = [{"role": "user", "content": f"{title}\n\n{text}"}]
- render via tokenizer.apply_chat_template(..., add_generation_prompt=False)
  (no assistant prefix is appended to the rendered prompt)
- optionally inject BOS/EOS (sample-level or message-level)
- replace real newlines with literal '\n'
- write ONE LINE per dataset record

CLI:
--subset : Wikipedia config, default 20231101.ru
--split : split name, default train
--limit : how many rows to take from head, default 500
--model : HF model id for tokenizer/chat_template (default: driaforall/mem-agent)
--output : output text file
--bos-eos-mode {none,sample,message}
--bos-token / --eos-token
--boundary-pattern : regex to detect message boundaries when --bos-eos-mode=message

Notes:
- Comments are in English by request.
- Message-boundary detection is heuristic; adjust --boundary-pattern for your template.
"""
import argparse
import io
import os
import re
from typing import List, Dict

from datasets import load_dataset
from transformers import AutoTokenizer

# Heuristic: common end-of-message sentinels used by many chat templates.
DEFAULT_BOUNDARY_PATTERN = r"(?:<\|im_end\|>|</s>|<\|eot_id\|>)"
def resolve_special_token(tok, override: str | None, which: str) -> str:
"""
Resolve BOS/EOS printable string.
Priority:
1) explicit override,
2) tokenizer.<bos/eos>_token (string),
3) decode tokenizer.<bos/eos>_token_id,
4) empty string.
"""
if override is not None:
return override
if which == "bos":
if getattr(tok, "bos_token", None):
return tok.bos_token
tid = getattr(tok, "bos_token_id", None)
else:
if getattr(tok, "eos_token", None):
return tok.eos_token
tid = getattr(tok, "eos_token_id", None)
if tid is not None:
try:
return tok.decode([tid])
except Exception:
return ""
return ""
def inject_bos_eos(text: str,
                   mode: str,
                   bos_token: str,
                   eos_token: str,
                   boundary_pattern: str) -> str:
    """
    Inject BOS/EOS into rendered text according to *mode*:

    - 'none'    : return the text untouched.
    - 'sample'  : BOS at the start (if absent), EOS at the very end (if absent).
    - 'message' : BOS at the start; EOS right after every boundary match,
                  plus a final EOS at the end if missing.

    Any other mode value returns the text unchanged.
    """
    if mode == "none":
        return text
    bos = bos_token or ""
    eos = eos_token or ""

    def ensure_bos(s: str) -> str:
        # Prepend BOS only when it is non-empty and not already there.
        return bos + s if bos and not s.startswith(bos) else s

    def ensure_final_eos(s: str) -> str:
        # Append EOS plus a newline unless the trimmed text already ends with it.
        return s if s.rstrip().endswith(eos) else s.rstrip() + eos + "\n"

    if mode == "sample":
        out = ensure_bos(text)
        return ensure_final_eos(out) if eos else out
    if mode == "message":
        if not eos:
            # Without an EOS symbol, message-level injection is meaningless;
            # degrade to BOS-only.
            return ensure_bos(text)
        out = re.compile(boundary_pattern).sub(lambda m: m.group(0) + eos, text)
        return ensure_final_eos(ensure_bos(out))
    return text
def main():
    """CLI entry point: render Wikipedia articles through the model's chat
    template and write one newline-escaped prompt per article to --output."""
    ap = argparse.ArgumentParser(description="Generate one-line prompts from Wikipedia with optional BOS/EOS.")
    ap.add_argument("--model", default="driaforall/mem-agent",
                    help="HF model id providing tokenizer & chat_template.")
    ap.add_argument("--subset", default="20231101.ru",
                    help="Wikimedia/Wikipedia config (subset), e.g. 20231101.ru.")
    ap.add_argument("--split", default="train", help="Dataset split (default: train).")
    ap.add_argument("--limit", type=int, default=500, help="How many rows to take from the head (default: 500).")
    ap.add_argument("--output", default="calib_wiki.txt", help="Output text file.")
    ap.add_argument("--bos-eos-mode", choices=["none", "sample", "message"], default="none",
                    help="Where to inject BOS/EOS tokens.")
    ap.add_argument("--bos-token", default=None, help="Override BOS token string.")
    ap.add_argument("--eos-token", default=None, help="Override EOS token string.")
    ap.add_argument("--boundary-pattern", default=DEFAULT_BOUNDARY_PATTERN,
                    help="Regex for message boundaries when --bos-eos-mode=message.")
    args = ap.parse_args()

    # Load tokenizer with chat_template (trust_remote_code allows custom templates).
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

    # Resolve printable BOS/EOS strings (explicit CLI overrides win).
    bos = resolve_special_token(tok, args.bos_token, "bos")
    eos = resolve_special_token(tok, args.eos_token, "eos")

    # Load the requested Wikipedia subset/split.
    ds = load_dataset("wikimedia/wikipedia", args.subset, split=args.split)

    # Ensure the output directory exists before opening the file.
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)

    written = 0
    # Plain builtin open(); the io.open alias is redundant in Python 3.
    with open(args.output, "w", encoding="utf-8", newline="\n") as f:
        # Clamp to the dataset size; an explicit max(0, ...) makes a negative
        # --limit produce zero rows instead of relying on an empty range().
        n = min(max(args.limit, 0), len(ds))
        for i in range(n):
            row = ds[i]
            title = (row.get("title") or "").strip()
            text = (row.get("text") or "").strip()
            if not text:
                # Skip empty articles; fewer than --limit lines may be written.
                continue
            # Build a single user turn: "title\n\ntext" (or just text when untitled).
            content = f"{title}\n\n{text}" if title else text
            messages: List[Dict[str, str]] = [{"role": "user", "content": content}]
            # Render via the chat template. NOTE: add_generation_prompt=False,
            # so NO assistant prefix is appended (the original comment here
            # claimed otherwise; the code's behavior is kept unchanged).
            rendered = tok.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            # Inject BOS/EOS before escaping newlines.
            rendered = inject_bos_eos(
                text=rendered,
                mode=args.bos_eos_mode,
                bos_token=bos,
                eos_token=eos,
                boundary_pattern=args.boundary_pattern,
            )
            # One record per line: normalize CRLF, then turn real newlines into
            # the two-character literal '\n'.
            line = rendered.replace("\r\n", "\n").replace("\n", "\\n")
            f.write(line + "\n")
            written += 1
    print(f"Wrote {written} single-line prompts to: {args.output}")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
@EvilFreelancer
Copy link
Author

EvilFreelancer commented Oct 5, 2025

Usage example:

pip install "datasets>=2.20.0" "transformers>=4.44"

python wikipedia_generator.py \
  --model ai-sage/GigaChat-20B-A3B-instruct \
  --subset 20231101.ru \
  --split train \
  --limit 500 \
  --output calib.txt

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment