Skip to content

Instantly share code, notes, and snippets.

@marr75
Created February 20, 2026 14:56
Show Gist options
  • Select an option

  • Save marr75/b9963c317eb682d5c652eac75f433379 to your computer and use it in GitHub Desktop.

Select an option

Save marr75/b9963c317eb682d5c652eac75f433379 to your computer and use it in GitHub Desktop.
# /// script
# dependencies = ["tiktoken"]
# ///
"""Explore how GPT-2's BPE tokenizer handles periods in different contexts."""
import tiktoken
# GPT-2 encoding object shared by show() and the vocabulary scan further down.
enc = tiktoken.get_encoding("gpt2")
def show(label: str, text: str) -> None:
    """Print a labelled breakdown of how ``text`` is tokenized.

    The report lists the label, the repr of the input text, every token
    decoded on its own, and the raw token ids, followed by one blank line.
    Encoding and decoding go through the module-level ``enc`` object.
    """
    token_ids = enc.encode(text)
    # Decode each id separately so the token boundaries show up in the output.
    pieces = [enc.decode([token_id]) for token_id in token_ids]
    report = (
        f" {label}",
        f" text: {text!r}",
        f" tokens: {pieces}",
        f" ids: {token_ids}",
    )
    print(*report, sep="\n")
    print()
# Demonstration table: each entry is a banner title followed by the
# (label, text) pairs that are passed to show() under that banner.
rule = "=" * 80
sections = [
    (
        "PERIOD AS SENTENCE ENDER",
        [
            ("The reddit example", "The temperature was 98."),
            ("Followed by space + new sentence", "The temperature was 98. The next day"),
            ("Followed by newline", "The temperature was 98.\nThe next day"),
            ("Simple sentence end", "He left."),
            ("End with space after", "He left. She stayed."),
        ],
    ),
    (
        "PERIOD IN DECIMAL NUMBERS",
        [
            ("Decimal mid-sentence", "The temperature was 98.6 degrees."),
            ("Decimal at sentence end", "The temperature was 98.6."),
            ("Small decimal", "It was 3.14 radians."),
            ("Leading zero", "The value is 0.5 percent."),
            ("Multiple decimals", "From 98.6 to 99.1 degrees."),
            ("Large decimal", "The price was 1234.56 dollars."),
        ],
    ),
    (
        "PERIOD IN ABBREVIATIONS / SPECIAL CONTEXTS",
        [
            ("Abbreviation", "Dr. Smith went home."),
            ("Initials", "J. R. R. Tolkien wrote books."),
            ("Ellipsis", "He paused... then continued."),
            ("Domain name", "Visit openai.com for details."),
            ("File extension", "Edit the file main.py please."),
            ("Version number", "Using version 3.12.1 now."),
            ("IP address", "The server is at 192.168.1.1 today."),
        ],
    ),
    (
        "AMBIGUOUS CASES — DOES CONTEXT CHANGE TOKENIZATION?",
        [
            ("98 + period alone", "98."),
            ("98 + period + digit", "98.6"),
            ("98 + period + space", "98. "),
            ("98 + period + space + upper", "98. The"),
            ("98 + period + space + lower", "98. the"),
            ("98 + period + newline", "98.\n"),
        ],
    ),
]
for title, cases in sections:
    # Banner: rule, title, rule -- then one show() report per case, in order.
    print(rule)
    print(title)
    print(rule)
    for label, text in cases:
        show(label, text)
print("=" * 80)
print("TOKEN IDENTITY CHECK — IS '.' ALWAYS THE SAME TOKEN?")
print("=" * 80)
# Encode a lone period to see which token id(s) it maps to by itself.
period_token = enc.encode(".")
print(f" Bare period '.' encodes to: {period_token} (decoded: {[enc.decode([t]) for t in period_token]})")
print()
# Walk the vocabulary in id order and list the first tokens whose decoded
# text contains a period together with other characters.
print(" Tokens containing '.' (sampling from vocab):")
sample_limit = 40  # cap on how many matching tokens are printed
count = 0
for token_id in range(enc.n_vocab):
    # Only the decode call is guarded, so mistakes in the filtering/printing
    # code below surface instead of being silently swallowed.
    try:
        decoded = enc.decode([token_id])
    except KeyError:
        # NOTE(review): tiktoken is expected to report an id that has no
        # vocabulary entry as KeyError -- confirm against the pinned version.
        continue
    if "." not in decoded or decoded == ".":
        continue
    print(f" id={token_id:>6} {decoded!r}")
    count += 1
    if count >= sample_limit:
        print(f" ... (stopping at {count}, there are more)")
        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment