Skip to content

Instantly share code, notes, and snippets.

@cosminscn
Last active February 25, 2026 08:23
Show Gist options
  • Select an option

  • Save cosminscn/65a5fa5e20524495415f3cdd6bfdd7d2 to your computer and use it in GitHub Desktop.

Select an option

Save cosminscn/65a5fa5e20524495415f3cdd6bfdd7d2 to your computer and use it in GitHub Desktop.
"""190-param nanoGPT that adds any two 10-digit numbers. All weights hand-coded."""
import math
from dataclasses import dataclass
import torch, torch.nn as nn
from torch.nn import functional as F
# === NanoGPT (from github.com/karpathy/nanoGPT/blob/master/model.py) ===
# Modifications: sinusoidal PE buffer, configurable mlp_hidden, c_fc bias always on
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention.

    Note: ``config.dropout`` is stored but never applied in ``forward``
    (scaled_dot_product_attention is called without ``dropout_p``).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One fused projection produces query, key and value in a single matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout

    def forward(self, x):
        batch, seq, dim = x.size()
        head_dim = dim // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # Reshape (B, T, C) -> (B, H, T, hd) as expected by SDPA.
        q = q.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        k = k.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        v = v.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # Merge heads back into the embedding dimension before projecting.
        out = out.transpose(1, 2).contiguous().view(batch, seq, dim)
        return self.c_proj(out)
class MLP(nn.Module):
    """Two-layer feed-forward network with GELU.

    Hidden width comes from ``config.mlp_hidden``; a falsy value falls back
    to the conventional ``4 * n_embd``. The ``c_fc`` bias is always enabled,
    while ``c_proj`` honours ``config.bias``.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.mlp_hidden or 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=True)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)

    def forward(self, x):
        h = self.c_fc(x)
        h = self.gelu(h)
        return self.c_proj(h)
class Block(nn.Module):
    """Residual transformer block: attention then MLP.

    ``ln_1``/``ln_2`` are ``nn.Identity`` rather than LayerNorm, so the
    residual stream passes through unnormalized.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.Identity()
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.Identity()
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
@dataclass
class GPTConfig:
    """Hyperparameters for the tiny hand-coded adder model."""
    block_size: int = 35
    vocab_size: int = 10     # digits 0-9 only; '+' and '=' are mapped to token 0
    n_layer: int = 1
    n_head: int = 2
    n_embd: int = 4
    dropout: float = 0.0
    bias: bool = False
    mlp_hidden: int = 4      # explicit MLP width (instead of 4 * n_embd)
class GPT(nn.Module):
    """Minimal GPT: token embedding + sinusoidal PE buffer + blocks + LM head.

    ``forward`` returns logits for the last position only, paired with
    ``None`` in the loss slot (``targets`` is accepted for API compatibility
    but ignored).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
        # Positional encodings are a buffer, not a Parameter, so they are
        # excluded from parameter counts; filled in later by build_adder.
        self.register_buffer('pe', torch.zeros(config.block_size, config.n_embd))

    def forward(self, idx, targets=None):
        seq_len = idx.size(1)
        x = self.transformer.wte(idx) + self.pe[:seq_len]
        for block in self.transformer.h:
            x = block(x)
        # List index [-1] keeps the time dimension: output is (B, 1, vocab).
        logits = self.lm_head(x[:, [-1], :])
        return logits, None
# === Weight injection ===
def build_adder():
    """Construct the GPT and hand-write every weight so the model adds two
    10-digit numbers, emitting result digits least-significant first.

    Returns the fully initialized model; no training is involved.
    """
    config = GPTConfig()
    model = GPT(config)
    theta = 2 * math.pi / 11   # phase step: one full turn every 11 positions
    scale = 100.0
    n = 4                      # embedding width
    with torch.no_grad():
        # Token embeddings: digit value stored in dim 0 (tokens 0-9).
        model.transformer.wte.weight.zero_()
        for digit in range(10):
            model.transformer.wte.weight[digit, 0] = float(digit)
        # Positional encoding buffer: sin/cos phase with period 11.
        model.pe.zero_()
        for pos in range(35):
            amp = 100.0 if pos <= 21 else 1.0
            model.pe[pos, 1] = amp * math.sin(pos * theta)
            model.pe[pos, 2] = amp * math.cos(pos * theta)
        # ReLU instead of GELU.
        model.transformer.h[0].mlp.gelu = nn.ReLU()
        # Attention: 2 heads route digit pairs via sinusoidal resonance.
        # Rows 0-3 are queries, 4-7 keys, 8-11 values of the fused c_attn.
        w = torch.zeros(3 * n, n)
        w[0, 1] = -math.cos(8 * theta) * scale   # head0 q: 8-back
        w[0, 2] = math.sin(8 * theta) * scale
        w[1, 1] = math.sin(8 * theta) * scale
        w[1, 2] = math.cos(8 * theta) * scale
        w[2, 1] = -math.cos(9 * theta) * scale   # head1 q: 9-back
        w[2, 2] = math.sin(9 * theta) * scale
        w[3, 1] = math.sin(9 * theta) * scale
        w[3, 2] = math.cos(9 * theta) * scale
        w[4, 1] = w[6, 1] = 1.0                  # keys: sin
        w[5, 2] = w[7, 2] = 1.0                  # keys: cos
        w[8, 0] = w[10, 0] = 1.0                 # values: digit
        model.transformer.h[0].attn.c_attn.weight.copy_(w)
        # Projection: head0 -> dim3 (curr sum), head1 -> dim1 (prev sum).
        proj = torch.zeros(n, n)
        proj[3, 0] = 2.0
        proj[1, 2] = 2.0
        model.transformer.h[0].attn.c_proj.weight.copy_(proj)
        # MLP: 4 ReLU neurons for carry detection + mod-10 wrap.
        fw = torch.zeros(4, n)
        fb = torch.zeros(4)
        fw[0, 1] = 100;  fw[0, 0] = -100; fb[0] = -50      # carry bit
        fw[1, 1] = 100;  fw[1, 0] = -100; fb[1] = -150     # carry guard
        fw[2, 3] = 1000; fw[2, 1] = 10; fw[2, 0] = -10; fb[2] = -9005  # wrap detect
        fw[3, 3] = 1000; fw[3, 1] = 10; fw[3, 0] = -10; fb[3] = -9015  # wrap guard
        model.transformer.h[0].mlp.c_fc.weight.copy_(fw)
        model.transformer.h[0].mlp.c_fc.bias.copy_(fb)
        pw = torch.zeros(n, 4)
        pw[3, :] = torch.tensor([0.01, -0.01, -1.0, 1.0])
        model.transformer.h[0].mlp.c_proj.weight.copy_(pw)
        # LM head: parabolic decoding (argmax_v of -(v-x)^2 = nearest int).
        model.lm_head.weight.zero_()
        model.lm_head.bias.zero_()
        for v in range(10):
            model.lm_head.weight[v, 3] = 2.0 * v
            model.lm_head.bias[v] = -float(v * v)
    return model
# === Test ===
def test(model, a, b):
    """Run one addition through the model and report pass/fail.

    Builds the prompt ``"{a:010d}+{b:010d}="``, autoregressively samples 11
    digit tokens, reverses them (the model emits least-significant digit
    first), and compares against ``a + b``. Returns True on a match.
    """
    # Map '+' and '=' to token 0 (same as digit 0: no digit value in dim 0).
    punct = {'+': 0, '=': 0}
    prompt = f"{a:010d}+{b:010d}="
    seq = [int(ch) if ch.isdigit() else punct[ch] for ch in prompt]
    model.eval()
    with torch.no_grad():
        for _ in range(11):
            logits, _ = model(torch.tensor([seq]))
            seq.append(logits[0, -1].argmax().item())
    # Tokens after position 21 are the generated digits, LSD first.
    result = int("".join(str(t) for t in seq[22:])[::-1])
    ok = result == a + b
    print(f" {a:>13,d} + {b:>13,d} = {result:>13,d} {'✅' if ok else '❌'}")
    return ok
# Build the model, print a parameter breakdown, then run the test suite.
model = build_adder()
n_params = sum(p.numel() for p in model.parameters())
n_buffer = sum(b.numel() for b in model.buffers())
print(f"\n NanoGPT Adder: {n_params} params + {n_buffer} pos emb buffer\n")
for name, p in model.named_parameters():
    print(f" {name:40s} {str(list(p.shape)):>10s} = {p.numel():>3d}")
print(f" {'TOTAL':40s} {'':>10s} = {n_params:>3d}\n")
tests = [
    (5, 5), (555, 445), (99999, 1), (19492, 23919),
    (9999999999, 1), (1234567890, 987654321),
    (0, 0), (1111111111, 8888888889),
]
passed = sum(test(model, a, b) for a, b in tests)
print(f"\n {passed}/{len(tests)} passed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment