Last active
February 25, 2026 08:23
-
-
Save cosminscn/65a5fa5e20524495415f3cdd6bfdd7d2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """190-param nanoGPT that adds any two 10-digit numbers. All weights hand-coded.""" | |
| import math | |
| from dataclasses import dataclass | |
| import torch, torch.nn as nn | |
| from torch.nn import functional as F | |
| # === NanoGPT (from github.com/karpathy/nanoGPT/blob/master/model.py) === | |
| # Modifications: sinusoidal PE buffer, configurable mlp_hidden, c_fc bias always on | |
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention (nanoGPT layout, no LayerNorm here).

    ``config`` must provide: ``n_embd`` (model width), ``n_head`` (head count,
    must divide n_embd), ``dropout`` (attention dropout probability) and
    ``bias`` (whether the qkv/projection linears carry biases).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One fused linear produces q, k and v for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection back into the residual stream.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.n_head, self.n_embd, self.dropout = config.n_head, config.n_embd, config.dropout

    def forward(self, x):
        B, T, C = x.size()  # batch, sequence length, embedding width
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim), the layout SDPA expects.
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        # Fix: actually apply the configured attention dropout at train time,
        # as upstream nanoGPT does; previously self.dropout was stored but
        # never used. With this file's dropout=0.0 the output is unchanged.
        y = F.scaled_dot_product_attention(
            q, k, v,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=True,
        )
        return self.c_proj(y.transpose(1, 2).contiguous().view(B, T, C))
class MLP(nn.Module):
    """Two-layer feed-forward block: c_fc -> gelu -> c_proj.

    The hidden width is ``config.mlp_hidden`` when truthy, otherwise the
    conventional ``4 * n_embd``. The first linear always carries a bias;
    the second one follows ``config.bias``.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.mlp_hidden if config.mlp_hidden else 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden, bias=True)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, config.n_embd, bias=config.bias)

    def forward(self, x):
        hidden = self.c_fc(x)
        hidden = self.gelu(hidden)
        return self.c_proj(hidden)
class Block(nn.Module):
    """One transformer block with the usual pre-norms replaced by identity.

    ln_1/ln_2 are nn.Identity placeholders kept so the module tree matches
    the standard nanoGPT layout; the two residual additions are unchanged.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.Identity()
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.Identity()
        self.mlp = MLP(config)

    def forward(self, x):
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out
@dataclass
class GPTConfig:
    # Hyperparameters for the hand-coded adder; the defaults give the
    # 190-parameter configuration described in the module docstring.
    block_size: int = 35    # max sequence: 22 prompt tokens + 11 answer digits (+ slack)
    vocab_size: int = 10    # digits 0-9 only; '+' and '=' are mapped to token 0
    n_layer: int = 1        # a single transformer block
    n_head: int = 2         # build_adder wires head0 to look 8 back, head1 9 back
    n_embd: int = 4         # dim0=digit value, dims1/2=sin/cos phase, dim3=sum scratch
    dropout: float = 0.0    # weights are hand-coded and never trained
    bias: bool = False      # attention / c_proj linears are bias-free (c_fc bias is forced on)
    mlp_hidden: int = 4     # MLP hidden width (overrides the usual 4*n_embd)
class GPT(nn.Module):
    """Minimal GPT: token embedding + fixed positional buffer + blocks + LM head.

    There is no LayerNorm and no learned positional embedding: ``pe`` is a
    plain buffer that is filled in externally. ``forward`` returns logits
    for the LAST position only, shaped (B, 1, vocab_size), paired with None
    to keep the familiar nanoGPT (logits, loss) call shape; ``targets`` is
    accepted but ignored since this model is never trained.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
        # A buffer (not a Parameter): hand-set positional code, untrained.
        self.register_buffer('pe', torch.zeros(config.block_size, config.n_embd))

    def forward(self, idx, targets=None):
        seq_len = idx.size(1)
        x = self.transformer.wte(idx) + self.pe[:seq_len]
        for blk in self.transformer.h:
            x = blk(x)
        logits = self.lm_head(x[:, [-1], :])
        return logits, None
# === Weight injection ===
def build_adder():
    """Construct the GPT and hand-set every weight so it adds 10-digit numbers.

    Layout of the 4 embedding dims (established by the assignments below):
    dim 0 = digit value, dims 1/2 = sin/cos positional phase of period 11,
    dim 3 = scratch written by the attention projection and MLP.
    Returns the fully wired model; nothing is ever trained.
    """
    config = GPTConfig()
    model = GPT(config)
    # th: one period-11 phase step; S: large gain (drives the attention softmax
    # toward a hard one-hot lookup); n: embedding width.
    th, S, n = 2*math.pi/11, 100.0, 4
    with torch.no_grad():
        # Token embeddings: digit value in dim 0 (tokens 0-9)
        model.transformer.wte.weight.zero_()
        for v in range(10): model.transformer.wte.weight[v, 0] = float(v)
        # Positional encoding buffer: sin/cos phase with period 11.
        # Prompt positions (p <= 21) get amplitude 100, generated positions 1 —
        # presumably so prompt keys dominate the softmax; TODO confirm.
        model.pe.zero_()
        for p in range(35):
            amp = 100.0 if p <= 21 else 1.0
            model.pe[p, 1] = amp * math.sin(p * th)
            model.pe[p, 2] = amp * math.cos(p * th)
        # ReLU instead of GELU: the carry/wrap thresholds below need exact cutoffs.
        model.transformer.h[0].mlp.gelu = nn.ReLU()
        # Attention: 2 heads route digit pairs via sinusoidal resonance.
        # c_attn rows 0-3 are queries (phases rotated so head0 resonates with
        # positions 8 back, head1 with 9 back, mod 11), rows 4-7 are keys
        # (raw sin/cos), rows 8-11 are values (the digit in dim 0).
        w = torch.zeros(3*n, n)
        w[0,1] = -math.cos(8*th)*S; w[0,2] = math.sin(8*th)*S  # head0 q: 8-back
        w[1,1] = math.sin(8*th)*S; w[1,2] = math.cos(8*th)*S
        w[2,1] = -math.cos(9*th)*S; w[2,2] = math.sin(9*th)*S  # head1 q: 9-back
        w[3,1] = math.sin(9*th)*S; w[3,2] = math.cos(9*th)*S
        w[4,1] = w[6,1] = 1.0; w[5,2] = w[7,2] = 1.0  # keys: sin/cos
        w[8,0] = w[10,0] = 1.0  # values: digit
        model.transformer.h[0].attn.c_attn.weight.copy_(w)
        # Projection: head0 -> dim3 (curr sum), head1 -> dim1 (prev sum).
        # NOTE(review): the x2 looks like it compensates for each head's
        # softmax splitting evenly across the two prompt positions that share
        # a phase (a's digit and b's digit), turning the mean into the sum —
        # confirm before relying on this reading.
        w = torch.zeros(n, n)
        w[3,0] = 2.0; w[1,2] = 2.0
        model.transformer.h[0].attn.c_proj.weight.copy_(w)
        # MLP: 4 ReLU neurons for carry detection + mod-10 wrap.
        # Inputs read: dim0 = current token's digit, dim1 = previous column
        # sum, dim3 = current column sum.
        fw, fb = torch.zeros(4, n), torch.zeros(4)
        fw[0,1]= 100; fw[0,0]=-100; fb[0]= -50  # carry bit
        fw[1,1]= 100; fw[1,0]=-100; fb[1]=-150  # carry guard
        fw[2,3]=1000; fw[2,1]= 10; fw[2,0]=-10; fb[2]=-9005  # wrap detect
        fw[3,3]=1000; fw[3,1]= 10; fw[3,0]=-10; fb[3]=-9015  # wrap guard
        model.transformer.h[0].mlp.c_fc.weight.copy_(fw)
        model.transformer.h[0].mlp.c_fc.bias.copy_(fb)
        # c_proj folds the four neurons back into dim 3 only; the paired
        # +/- weights turn each detect/guard pair into a bounded step.
        pw = torch.zeros(n, 4)
        pw[3,:] = torch.tensor([0.01, -0.01, -1.0, 1.0])
        model.transformer.h[0].mlp.c_proj.weight.copy_(pw)
        # LM head: parabolic decoding (argmax_v -(v-x)^2 = nearest int);
        # logit_v = 2*v*x - v^2 differs from -(v-x)^2 only by the v-free x^2.
        model.lm_head.weight.zero_(); model.lm_head.bias.zero_()
        for v in range(10):
            model.lm_head.weight[v,3] = 2.0*v; model.lm_head.bias[v] = -float(v*v)
    return model
| # === Test === | |
| def test(model, a, b): | |
| # Map + and = to token 0 (same as digit 0: no digit value in dim 0) | |
| tok = {'+': 0, '=': 0} | |
| seq = [int(c) if c.isdigit() else tok[c] for c in f"{a:010d}+{b:010d}="] | |
| model.eval() | |
| with torch.no_grad(): | |
| for _ in range(11): | |
| logits, _ = model(torch.tensor([seq])) | |
| seq.append(logits[0,-1].argmax().item()) | |
| result = int("".join(str(t) for t in seq[22:])[::-1]) | |
| ok = result == a + b | |
| print(f" {a:>13,d} + {b:>13,d} = {result:>13,d} {'✅' if ok else '❌'}") | |
| return ok | |
def main():
    """Build the hand-coded adder, report its size, and run the test suite."""
    model = build_adder()
    params = sum(p.numel() for p in model.parameters())
    buf = sum(b.numel() for b in model.buffers())
    print(f"\n NanoGPT Adder: {params} params + {buf} pos emb buffer\n")
    for name, p in model.named_parameters():
        print(f" {name:40s} {str(list(p.shape)):>10s} = {p.numel():>3d}")
    print(f" {'TOTAL':40s} {'':>10s} = {params:>3d}\n")
    # (a, b) pairs covering plain adds, carries, carry chains, full
    # wrap-around, zero, and the 11-digit-result case.
    tests = [(5, 5), (555, 445), (99999, 1), (19492, 23919),
             (9999999999, 1), (1234567890, 987654321),
             (0, 0), (1111111111, 8888888889)]
    passed = sum(test(model, a, b) for a, b in tests)
    print(f"\n {passed}/{len(tests)} passed")


# Guard the script entry so importing this module no longer runs the demo.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment