marr75/gpt2_period_tokens.py

## gpt2_period_tokens.py
# /// script
# dependencies = ["tiktoken"]
# ///
"""Explore how GPT-2's BPE tokenizer handles periods in different contexts."""

import tiktoken

enc = tiktoken.get_encoding("gpt2")


def show(label: str, text: str) -> None:
    """Print a short tokenization report for ``text`` under ``label``.

    The text is encoded with the module-level ``enc``; each resulting id is
    then decoded on its own so the individual token strings are visible.
    Four indented lines are printed (label, repr of the text, token strings,
    token ids) followed by one blank line.

    Args:
        label: Heading printed above the report.
        text: String to tokenize.
    """
    token_ids = enc.encode(text)

    # Decode one id at a time so token boundaries stay visible.
    pieces = []
    for token_id in token_ids:
        pieces.append(enc.decode([token_id]))

    report = (
        f"  {label}",
        f"    text:   {text!r}",
        f"    tokens: {pieces}",
        f"    ids:    {token_ids}",
        "",  # trailing blank line separating consecutive reports
    )
    print(*report, sep="\n")


# Demo cases grouped by section. Each entry is
# (section title, ((label, sample text), ...)); sections and cases are printed
# in the order listed, each section under an 80-column "=" banner.
_RULE = "=" * 80
_SECTIONS = (
    (
        "PERIOD AS SENTENCE ENDER",
        (
            ("The reddit example", "The temperature was 98."),
            ("Followed by space + new sentence", "The temperature was 98. The next day"),
            ("Followed by newline", "The temperature was 98.\nThe next day"),
            ("Simple sentence end", "He left."),
            ("End with space after", "He left. She stayed."),
        ),
    ),
    (
        "PERIOD IN DECIMAL NUMBERS",
        (
            ("Decimal mid-sentence", "The temperature was 98.6 degrees."),
            ("Decimal at sentence end", "The temperature was 98.6."),
            ("Small decimal", "It was 3.14 radians."),
            ("Leading zero", "The value is 0.5 percent."),
            ("Multiple decimals", "From 98.6 to 99.1 degrees."),
            ("Large decimal", "The price was 1234.56 dollars."),
        ),
    ),
    (
        "PERIOD IN ABBREVIATIONS / SPECIAL CONTEXTS",
        (
            ("Abbreviation", "Dr. Smith went home."),
            ("Initials", "J. R. R. Tolkien wrote books."),
            ("Ellipsis", "He paused... then continued."),
            ("Domain name", "Visit openai.com for details."),
            ("File extension", "Edit the file main.py please."),
            ("Version number", "Using version 3.12.1 now."),
            ("IP address", "The server is at 192.168.1.1 today."),
        ),
    ),
    (
        "AMBIGUOUS CASES — DOES CONTEXT CHANGE TOKENIZATION?",
        (
            ("98 + period alone", "98."),
            ("98 + period + digit", "98.6"),
            ("98 + period + space", "98. "),
            ("98 + period + space + upper", "98. The"),
            ("98 + period + space + lower", "98. the"),
            ("98 + period + newline", "98.\n"),
        ),
    ),
)

for _title, _cases in _SECTIONS:
    print(_RULE)
    print(_title)
    print(_RULE)
    for _label, _sample in _cases:
        show(_label, _sample)

print("=" * 80)
print("TOKEN IDENTITY CHECK — IS '.' ALWAYS THE SAME TOKEN?")
print("=" * 80)
# Encode a lone "." and show which token id(s) it maps to on its own.
period_token = enc.encode(".")
print(f"  Bare period '.' encodes to: {period_token} (decoded: {[enc.decode([t]) for t in period_token]})")
print()

# Walk the vocabulary in id order and list the first 40 tokens whose decoded
# text contains a period together with at least one other character.
print("  Tokens containing '.' (sampling from vocab):")
count = 0
for token_id in range(enc.n_vocab):
    # Only the decode call is guarded: an id with no vocabulary entry is
    # skipped, while any other failure (e.g. in the formatting below) is left
    # to surface instead of being silently swallowed.
    # NOTE(review): tiktoken documents KeyError for unknown token ids —
    # confirm against the installed version.
    try:
        decoded = enc.decode([token_id])
    except KeyError:
        continue

    if "." in decoded and decoded != ".":
        print(f"    id={token_id:>6}  {decoded!r}")
        count += 1
        if count >= 40:
            # Sample cap reached; the rest of the vocabulary is not scanned.
            print(f"    ... (stopping at {count}, there are more)")
            break
	# /// script
	# dependencies = ["tiktoken"]
	# ///
	"""Explore how GPT-2's BPE tokenizer handles periods in different contexts."""

	import tiktoken

	enc = tiktoken.get_encoding("gpt2")


	def show(label: str, text: str) -> None:
	    """Print a short tokenization report for ``text`` under ``label``.

	    The text is encoded with ``enc``; each resulting id is then decoded on
	    its own so the individual token strings are visible. Prints the label,
	    the repr of the text, the token strings and the token ids, followed by
	    one blank line.

	    Args:
	        label: Heading printed above the report.
	        text: String to tokenize.
	    """
	    tokens = enc.encode(text)
	    # Decode one id at a time so token boundaries stay visible.
	    decoded = [enc.decode([t]) for t in tokens]
	    print(f" {label}")
	    print(f" text: {text!r}")
	    print(f" tokens: {decoded}")
	    print(f" ids: {tokens}")
	    print()


	# Demo cases keyed by section title; dict insertion order fixes the order
	# in which sections are printed. Each value is a list of
	# (label, sample text) pairs passed to show() in order.
	_BANNER = "=" * 80
	_CASES_BY_SECTION = {
	    "PERIOD AS SENTENCE ENDER": [
	        ("The reddit example", "The temperature was 98."),
	        ("Followed by space + new sentence", "The temperature was 98. The next day"),
	        ("Followed by newline", "The temperature was 98.\nThe next day"),
	        ("Simple sentence end", "He left."),
	        ("End with space after", "He left. She stayed."),
	    ],
	    "PERIOD IN DECIMAL NUMBERS": [
	        ("Decimal mid-sentence", "The temperature was 98.6 degrees."),
	        ("Decimal at sentence end", "The temperature was 98.6."),
	        ("Small decimal", "It was 3.14 radians."),
	        ("Leading zero", "The value is 0.5 percent."),
	        ("Multiple decimals", "From 98.6 to 99.1 degrees."),
	        ("Large decimal", "The price was 1234.56 dollars."),
	    ],
	    "PERIOD IN ABBREVIATIONS / SPECIAL CONTEXTS": [
	        ("Abbreviation", "Dr. Smith went home."),
	        ("Initials", "J. R. R. Tolkien wrote books."),
	        ("Ellipsis", "He paused... then continued."),
	        ("Domain name", "Visit openai.com for details."),
	        ("File extension", "Edit the file main.py please."),
	        ("Version number", "Using version 3.12.1 now."),
	        ("IP address", "The server is at 192.168.1.1 today."),
	    ],
	    "AMBIGUOUS CASES — DOES CONTEXT CHANGE TOKENIZATION?": [
	        ("98 + period alone", "98."),
	        ("98 + period + digit", "98.6"),
	        ("98 + period + space", "98. "),
	        ("98 + period + space + upper", "98. The"),
	        ("98 + period + space + lower", "98. the"),
	        ("98 + period + newline", "98.\n"),
	    ],
	}

	for _heading, _pairs in _CASES_BY_SECTION.items():
	    print(_BANNER)
	    print(_heading)
	    print(_BANNER)
	    for _case_label, _case_text in _pairs:
	        show(_case_label, _case_text)

	print("=" * 80)
	print("TOKEN IDENTITY CHECK — IS '.' ALWAYS THE SAME TOKEN?")
	print("=" * 80)
	# Encode a lone "." and show which token id(s) it maps to on its own.
	period_token = enc.encode(".")
	print(f" Bare period '.' encodes to: {period_token} (decoded: {[enc.decode([t]) for t in period_token]})")
	print()

	# Walk the vocabulary in id order and list the first 40 tokens whose
	# decoded text contains a period together with at least one other character.
	print(" Tokens containing '.' (sampling from vocab):")
	count = 0
	for token_id in range(enc.n_vocab):
	    # Only the decode call is guarded: an id with no vocabulary entry is
	    # skipped, while any other failure is left to surface instead of
	    # being silently swallowed.
	    # NOTE(review): tiktoken documents KeyError for unknown token ids —
	    # confirm against the installed version.
	    try:
	        decoded = enc.decode([token_id])
	    except KeyError:
	        continue

	    if "." in decoded and decoded != ".":
	        print(f" id={token_id:>6} {decoded!r}")
	        count += 1
	        if count >= 40:
	            # Sample cap reached; the rest of the vocabulary is not scanned.
	            print(f" ... (stopping at {count}, there are more)")
	            break
No results found