Skip to content

Instantly share code, notes, and snippets.

@cosminscn
Last active February 25, 2026 08:23
Show Gist options
  • Select an option

  • Save cosminscn/65a5fa5e20524495415f3cdd6bfdd7d2 to your computer and use it in GitHub Desktop.

Select an option

Save cosminscn/65a5fa5e20524495415f3cdd6bfdd7d2 to your computer and use it in GitHub Desktop.
"""190-param nanoGPT that adds any two 10-digit numbers. All weights hand-coded."""
import math
from dataclasses import dataclass
import torch, torch.nn as nn
from torch.nn import functional as F
# === NanoGPT (from github.com/karpathy/nanoGPT/blob/master/model.py) ===
# Modifications: sinusoidal PE buffer, configurable mlp_hidden, c_fc bias always on
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention.

    Note: ``config.dropout`` is stored but never applied in ``forward``
    (scaled_dot_product_attention is called without ``dropout_p``).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One fused projection produces query, key and value in a single matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout

    def forward(self, x):
        batch, seq, dim = x.size()
        head_dim = dim // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # Reshape (B, T, C) -> (B, H, T, hd) as expected by SDPA.
        q = q.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        k = k.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        v = v.view(batch, seq, self.n_head, head_dim).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # Merge heads back into the embedding dimension before projecting.
        out = out.transpose(1, 2).contiguous().view(batch, seq, dim)
        return self.c_proj(out)
class MLP(nn.Module):
    """Two-layer feed-forward network with GELU.

    Hidden width comes from ``config.mlp_hidden``; a falsy value falls back
    to the conventional ``4 * n_embd``. The ``c_fc`` bias is always enabled,
    while ``c_proj`` honours ``config.bias``.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.mlp_hidden or 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=True)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)

    def forward(self, x):
        h = self.c_fc(x)
        h = self.gelu(h)
        return self.c_proj(h)
class Block(nn.Module):
    """Residual transformer block: attention then MLP.

    ``ln_1``/``ln_2`` are ``nn.Identity`` rather than LayerNorm, so the
    residual stream passes through unnormalized.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.Identity()
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.Identity()
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
@dataclass
class GPTConfig:
    """Hyperparameters for the tiny hand-coded adder model."""
    block_size: int = 35
    vocab_size: int = 10     # digits 0-9 only; '+' and '=' are mapped to token 0
    n_layer: int = 1
    n_head: int = 2
    n_embd: int = 4
    dropout: float = 0.0
    bias: bool = False
    mlp_hidden: int = 4      # explicit MLP width (instead of 4 * n_embd)
class GPT(nn.Module):
    """Minimal GPT: token embedding + sinusoidal PE buffer + blocks + LM head.

    ``forward`` returns logits for the last position only, paired with
    ``None`` in the loss slot (``targets`` is accepted for API compatibility
    but ignored).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
        # Positional encodings are a buffer, not a Parameter, so they are
        # excluded from parameter counts; filled in later by build_adder.
        self.register_buffer('pe', torch.zeros(config.block_size, config.n_embd))

    def forward(self, idx, targets=None):
        seq_len = idx.size(1)
        x = self.transformer.wte(idx) + self.pe[:seq_len]
        for block in self.transformer.h:
            x = block(x)
        # List index [-1] keeps the time dimension: output is (B, 1, vocab).
        logits = self.lm_head(x[:, [-1], :])
        return logits, None
# === Weight injection ===
def build_adder():
    """Construct the GPT and hand-write every weight so the model adds two
    10-digit numbers, emitting result digits least-significant first.

    Returns the fully initialized model; no training is involved.
    """
    config = GPTConfig()
    model = GPT(config)
    theta = 2 * math.pi / 11   # phase step: one full turn every 11 positions
    scale = 100.0
    n = 4                      # embedding width
    with torch.no_grad():
        # Token embeddings: digit value stored in dim 0 (tokens 0-9).
        model.transformer.wte.weight.zero_()
        for digit in range(10):
            model.transformer.wte.weight[digit, 0] = float(digit)
        # Positional encoding buffer: sin/cos phase with period 11.
        model.pe.zero_()
        for pos in range(35):
            amp = 100.0 if pos <= 21 else 1.0
            model.pe[pos, 1] = amp * math.sin(pos * theta)
            model.pe[pos, 2] = amp * math.cos(pos * theta)
        # ReLU instead of GELU.
        model.transformer.h[0].mlp.gelu = nn.ReLU()
        # Attention: 2 heads route digit pairs via sinusoidal resonance.
        # Rows 0-3 are queries, 4-7 keys, 8-11 values of the fused c_attn.
        w = torch.zeros(3 * n, n)
        w[0, 1] = -math.cos(8 * theta) * scale   # head0 q: 8-back
        w[0, 2] = math.sin(8 * theta) * scale
        w[1, 1] = math.sin(8 * theta) * scale
        w[1, 2] = math.cos(8 * theta) * scale
        w[2, 1] = -math.cos(9 * theta) * scale   # head1 q: 9-back
        w[2, 2] = math.sin(9 * theta) * scale
        w[3, 1] = math.sin(9 * theta) * scale
        w[3, 2] = math.cos(9 * theta) * scale
        w[4, 1] = w[6, 1] = 1.0                  # keys: sin
        w[5, 2] = w[7, 2] = 1.0                  # keys: cos
        w[8, 0] = w[10, 0] = 1.0                 # values: digit
        model.transformer.h[0].attn.c_attn.weight.copy_(w)
        # Projection: head0 -> dim3 (curr sum), head1 -> dim1 (prev sum).
        proj = torch.zeros(n, n)
        proj[3, 0] = 2.0
        proj[1, 2] = 2.0
        model.transformer.h[0].attn.c_proj.weight.copy_(proj)
        # MLP: 4 ReLU neurons for carry detection + mod-10 wrap.
        fw = torch.zeros(4, n)
        fb = torch.zeros(4)
        fw[0, 1] = 100;  fw[0, 0] = -100; fb[0] = -50      # carry bit
        fw[1, 1] = 100;  fw[1, 0] = -100; fb[1] = -150     # carry guard
        fw[2, 3] = 1000; fw[2, 1] = 10; fw[2, 0] = -10; fb[2] = -9005  # wrap detect
        fw[3, 3] = 1000; fw[3, 1] = 10; fw[3, 0] = -10; fb[3] = -9015  # wrap guard
        model.transformer.h[0].mlp.c_fc.weight.copy_(fw)
        model.transformer.h[0].mlp.c_fc.bias.copy_(fb)
        pw = torch.zeros(n, 4)
        pw[3, :] = torch.tensor([0.01, -0.01, -1.0, 1.0])
        model.transformer.h[0].mlp.c_proj.weight.copy_(pw)
        # LM head: parabolic decoding (argmax_v of -(v-x)^2 = nearest int).
        model.lm_head.weight.zero_()
        model.lm_head.bias.zero_()
        for v in range(10):
            model.lm_head.weight[v, 3] = 2.0 * v
            model.lm_head.bias[v] = -float(v * v)
    return model
# === Test ===
def test(model, a, b):
    """Run one addition through the model and report pass/fail.

    Builds the prompt ``"{a:010d}+{b:010d}="``, autoregressively samples 11
    digit tokens, reverses them (the model emits least-significant digit
    first), and compares against ``a + b``. Returns True on a match.
    """
    # Map '+' and '=' to token 0 (same as digit 0: no digit value in dim 0).
    punct = {'+': 0, '=': 0}
    prompt = f"{a:010d}+{b:010d}="
    seq = [int(ch) if ch.isdigit() else punct[ch] for ch in prompt]
    model.eval()
    with torch.no_grad():
        for _ in range(11):
            logits, _ = model(torch.tensor([seq]))
            seq.append(logits[0, -1].argmax().item())
    # Tokens after position 21 are the generated digits, LSD first.
    result = int("".join(str(t) for t in seq[22:])[::-1])
    ok = result == a + b
    print(f" {a:>13,d} + {b:>13,d} = {result:>13,d} {'✅' if ok else '❌'}")
    return ok
# Build the model, print a parameter breakdown, then run the test suite.
model = build_adder()
n_params = sum(p.numel() for p in model.parameters())
n_buffer = sum(b.numel() for b in model.buffers())
print(f"\n NanoGPT Adder: {n_params} params + {n_buffer} pos emb buffer\n")
for name, p in model.named_parameters():
    print(f" {name:40s} {str(list(p.shape)):>10s} = {p.numel():>3d}")
print(f" {'TOTAL':40s} {'':>10s} = {n_params:>3d}\n")
tests = [
    (5, 5), (555, 445), (99999, 1), (19492, 23919),
    (9999999999, 1), (1234567890, 987654321),
    (0, 0), (1111111111, 8888888889),
]
passed = sum(test(model, a, b) for a, b in tests)
print(f"\n {passed}/{len(tests)} passed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment