Last active
February 16, 2026 09:35
-
-
Save mehmetkose/f6e2cd31d60f2d49c5294d65752c8a59 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // The most atomic way to train and inference a GPT in pure, dependency-free Go. | |
| // This file is the complete algorithm. Everything else is just efficiency. | |
| // | |
| // Ported from @karpathy's pure Python GPT implementation. | |
| // https://x.com/karpathy/status/2021694437152157847 | |
| // | |
| // Build and run: | |
| // go build -o microgpt microgpt.go && ./microgpt | |
| // | |
| // NOTE: This is intentionally slow — every operation is scalar, just like the | |
| // Python original. A real implementation would use tensors/BLAS. This exists | |
| // purely for clarity and education. | |
| package main | |
| import ( | |
| "bufio" | |
| "fmt" | |
| "math" | |
| "net/http" | |
| "os" | |
| "sort" | |
| "strings" | |
| ) | |
| // ============================================================================= | |
| // AUTOGRAD ENGINE | |
| // ============================================================================= | |
// Value is a scalar node in a dynamically built computation graph. It holds
// the forward result (data), the accumulated derivative of the final output
// with respect to this node (grad), and enough bookkeeping — the input nodes
// plus the local derivative toward each one — to run backpropagation.
type Value struct {
	data       float64
	grad       float64
	children   []*Value
	localGrads []float64
}

// NewValue wraps a plain number as a leaf node with no ancestry.
func NewValue(data float64) *Value {
	return &Value{data: data}
}

// NewValueWithGraph creates a node that records which inputs produced it and
// the partial derivative of the result with respect to each input.
func NewValueWithGraph(data float64, children []*Value, localGrads []float64) *Value {
	return &Value{data: data, children: children, localGrads: localGrads}
}

// Add returns a+b; the local derivative toward each operand is 1.
func (a *Value) Add(b *Value) *Value {
	sum := a.data + b.data
	return NewValueWithGraph(sum, []*Value{a, b}, []float64{1, 1})
}

// Mul returns a*b; d(ab)/da = b and d(ab)/db = a.
func (a *Value) Mul(b *Value) *Value {
	prod := a.data * b.data
	return NewValueWithGraph(prod, []*Value{a, b}, []float64{b.data, a.data})
}

// Pow returns a^n for a constant exponent; d(a^n)/da = n*a^(n-1).
func (a *Value) Pow(n float64) *Value {
	result := math.Pow(a.data, n)
	slope := n * math.Pow(a.data, n-1)
	return NewValueWithGraph(result, []*Value{a}, []float64{slope})
}

// Log returns ln(a); d(ln a)/da = 1/a.
func (a *Value) Log() *Value {
	return NewValueWithGraph(math.Log(a.data), []*Value{a}, []float64{1 / a.data})
}

// Exp returns e^a; the derivative of e^a is e^a itself.
func (a *Value) Exp() *Value {
	expA := math.Exp(a.data)
	return NewValueWithGraph(expA, []*Value{a}, []float64{expA})
}

// ReLU returns max(0, a). The gradient passes through (1) only where a > 0.
func (a *Value) ReLU() *Value {
	if a.data > 0 {
		return NewValueWithGraph(a.data, []*Value{a}, []float64{1})
	}
	return NewValueWithGraph(0, []*Value{a}, []float64{0})
}

// Neg returns -a, expressed as multiplication by the constant -1.
func (a *Value) Neg() *Value {
	return a.MulScalar(-1)
}

// Sub returns a-b, expressed as a + (-b).
func (a *Value) Sub(b *Value) *Value {
	negB := b.Neg()
	return a.Add(negB)
}

// Div returns a/b, expressed as a * b^-1.
func (a *Value) Div(b *Value) *Value {
	inverse := b.Pow(-1)
	return a.Mul(inverse)
}

// AddScalar, MulScalar and DivScalar are convenience helpers that wrap a
// plain constant in a leaf Value before applying the operation.
func (a *Value) AddScalar(s float64) *Value {
	return a.Add(NewValue(s))
}

func (a *Value) MulScalar(s float64) *Value {
	return a.Mul(NewValue(s))
}

func (a *Value) DivScalar(s float64) *Value {
	return a.Div(NewValue(s))
}

// Backward computes gradients for every Value reachable from a via
// reverse-mode autodiff (backpropagation). Gradients accumulate into .grad;
// the caller is responsible for zeroing them between passes.
func (a *Value) Backward() {
	// Linearize the graph so each node appears after all of its inputs.
	var order []*Value
	seen := make(map[*Value]bool)
	var visit func(node *Value)
	visit = func(node *Value) {
		if seen[node] {
			return
		}
		seen[node] = true
		for _, child := range node.children {
			visit(child)
		}
		order = append(order, node)
	}
	visit(a)

	// Seed the output gradient, then walk the nodes in reverse order,
	// pushing derivatives toward the leaves via the chain rule.
	a.grad = 1
	for i := len(order) - 1; i >= 0; i-- {
		node := order[i]
		for j, child := range node.children {
			child.grad += node.localGrads[j] * node.grad
		}
	}
}
| // ============================================================================= | |
| // PSEUDORANDOM NUMBER GENERATOR | |
| // ============================================================================= | |
// RNG is a small deterministic xoshiro256** generator, used instead of
// math/rand so the whole file stays dependency-free and reproducible.
type RNG struct {
	s [4]uint64
}

// NewRNG expands one 64-bit seed into the four state words using the
// SplitMix64 mixing function, as recommended by the xoshiro authors.
func NewRNG(seed uint64) *RNG {
	rng := &RNG{}
	for i := range rng.s {
		seed += 0x9e3779b97f4a7c15
		word := seed
		word = (word ^ (word >> 30)) * 0xbf58476d1ce4e5b9
		word = (word ^ (word >> 27)) * 0x94d049bb133111eb
		rng.s[i] = word ^ (word >> 31)
	}
	return rng
}

// rotl rotates x left by k bits.
func (r *RNG) rotl(x uint64, k uint) uint64 {
	return (x << k) | (x >> (64 - k))
}

// Next advances the state and returns the next 64 random bits
// (the xoshiro256** update; the statement order is part of the algorithm).
func (r *RNG) Next() uint64 {
	out := r.rotl(r.s[1]*5, 7) * 9
	shifted := r.s[1] << 17
	r.s[2] ^= r.s[0]
	r.s[3] ^= r.s[1]
	r.s[1] ^= r.s[2]
	r.s[0] ^= r.s[3]
	r.s[2] ^= shifted
	r.s[3] = r.rotl(r.s[3], 45)
	return out
}

// Float64 maps the top 53 bits of Next to a uniform float in [0, 1).
func (r *RNG) Float64() float64 {
	return float64(r.Next()>>11) / (1 << 53)
}

// Gauss samples from N(mean, std^2) via the Box-Muller transform.
func (r *RNG) Gauss(mean, std float64) float64 {
	u1 := r.Float64()
	u2 := r.Float64()
	// log(0) would give -Inf, so redraw u1 until it is nonzero.
	for u1 == 0 {
		u1 = r.Float64()
	}
	return mean + std*math.Sqrt(-2*math.Log(u1))*math.Cos(2*math.Pi*u2)
}

// Shuffle permutes s in place with a Fisher-Yates shuffle.
func (r *RNG) Shuffle(s []string) {
	for i := len(s) - 1; i > 0; i-- {
		j := int(r.Next() % uint64(i+1))
		s[j], s[i] = s[i], s[j]
	}
}

// WeightedChoice draws an index with probability proportional to its
// (unnormalized) weight.
func (r *RNG) WeightedChoice(weights []float64) int {
	sum := 0.0
	for _, w := range weights {
		sum += w
	}
	target := r.Float64() * sum
	running := 0.0
	for i, w := range weights {
		running += w
		if running > target {
			return i
		}
	}
	// Floating-point slack: fall back to the last index.
	return len(weights) - 1
}
| // ============================================================================= | |
| // MODEL DEFINITION | |
| // ============================================================================= | |
// Model hyperparameters — deliberately tiny so scalar autograd stays tractable.
const (
	nEmbd     = 16            // embedding / hidden dimension
	nHead     = 4             // attention heads per layer
	nLayer    = 1             // number of transformer layers
	blockSize = 16            // maximum context length (positions)
	headDim   = nEmbd / nHead // per-head dimension (nEmbd must divide evenly)
)
// stateDict holds all model parameters as named 2D slices of *Value,
// keyed by matrix name (e.g. "wte", "wpe", "layer0.attn_wq").
var stateDict map[string][][]*Value

// params is a flat list of every parameter (filled by initParams); the Adam
// optimizer indexes its per-parameter moment buffers by position in this slice.
var params []*Value
| // initMatrix creates a [nout][nin] matrix of random Values. | |
| func initMatrix(rng *RNG, nout, nin int, std float64) [][]*Value { | |
| m := make([][]*Value, nout) | |
| for i := range m { | |
| m[i] = make([]*Value, nin) | |
| for j := range m[i] { | |
| m[i][j] = NewValue(rng.Gauss(0, std)) | |
| } | |
| } | |
| return m | |
| } | |
| func initParams(rng *RNG, vocabSize int) { | |
| stateDict = make(map[string][][]*Value) | |
| stateDict["wte"] = initMatrix(rng, vocabSize, nEmbd, 0.08) | |
| stateDict["wpe"] = initMatrix(rng, blockSize, nEmbd, 0.08) | |
| stateDict["lm_head"] = initMatrix(rng, vocabSize, nEmbd, 0.08) | |
| for i := 0; i < nLayer; i++ { | |
| prefix := fmt.Sprintf("layer%d", i) | |
| stateDict[prefix+".attn_wq"] = initMatrix(rng, nEmbd, nEmbd, 0.08) | |
| stateDict[prefix+".attn_wk"] = initMatrix(rng, nEmbd, nEmbd, 0.08) | |
| stateDict[prefix+".attn_wv"] = initMatrix(rng, nEmbd, nEmbd, 0.08) | |
| stateDict[prefix+".attn_wo"] = initMatrix(rng, nEmbd, nEmbd, 0.08) | |
| stateDict[prefix+".mlp_fc1"] = initMatrix(rng, 4*nEmbd, nEmbd, 0.08) | |
| stateDict[prefix+".mlp_fc2"] = initMatrix(rng, nEmbd, 4*nEmbd, 0.08) | |
| } | |
| // Flatten all parameters into a single slice | |
| params = nil | |
| for _, mat := range stateDict { | |
| for _, row := range mat { | |
| params = append(params, row...) | |
| } | |
| } | |
| } | |
| // ============================================================================= | |
| // MODEL ARCHITECTURE | |
| // ============================================================================= | |
| func linear(x []*Value, w [][]*Value) []*Value { | |
| out := make([]*Value, len(w)) | |
| for i, wo := range w { | |
| s := NewValue(0) | |
| for j := range wo { | |
| s = s.Add(wo[j].Mul(x[j])) | |
| } | |
| out[i] = s | |
| } | |
| return out | |
| } | |
| func softmax(logits []*Value) []*Value { | |
| maxVal := logits[0].data | |
| for _, v := range logits[1:] { | |
| if v.data > maxVal { | |
| maxVal = v.data | |
| } | |
| } | |
| exps := make([]*Value, len(logits)) | |
| total := NewValue(0) | |
| for i, v := range logits { | |
| exps[i] = v.AddScalar(-maxVal).Exp() | |
| total = total.Add(exps[i]) | |
| } | |
| out := make([]*Value, len(logits)) | |
| for i := range exps { | |
| out[i] = exps[i].Div(total) | |
| } | |
| return out | |
| } | |
| func rmsnorm(x []*Value) []*Value { | |
| ms := NewValue(0) | |
| for _, xi := range x { | |
| ms = ms.Add(xi.Mul(xi)) | |
| } | |
| ms = ms.DivScalar(float64(len(x))) | |
| scale := ms.AddScalar(1e-5).Pow(-0.5) | |
| out := make([]*Value, len(x)) | |
| for i, xi := range x { | |
| out[i] = xi.Mul(scale) | |
| } | |
| return out | |
| } | |
| // KVCache stores the key and value vectors for each layer across positions. | |
| type KVCache struct { | |
| keys [nLayer][][]*Value // keys[layer][position] = vector | |
| values [nLayer][][]*Value | |
| } | |
| func NewKVCache() *KVCache { | |
| kv := &KVCache{} | |
| for i := 0; i < nLayer; i++ { | |
| kv.keys[i] = nil | |
| kv.values[i] = nil | |
| } | |
| return kv | |
| } | |
// gpt runs a single token through the transformer and returns vocab-size
// logits for the next token. It processes one position at a time: attention
// over earlier positions uses the keys/values accumulated in kv, so the
// caller must feed positions 0,1,2,... in order with the same cache.
func gpt(tokenID, posID int, kv *KVCache) []*Value {
	// Embedding lookup: token + position
	tokEmb := stateDict["wte"][tokenID]
	posEmb := stateDict["wpe"][posID]
	x := make([]*Value, nEmbd)
	for i := range x {
		x[i] = tokEmb[i].Add(posEmb[i])
	}
	x = rmsnorm(x)
	for li := 0; li < nLayer; li++ {
		prefix := fmt.Sprintf("layer%d", li)
		// --- Multi-head attention --- (pre-norm residual block)
		xResidual := x
		x = rmsnorm(x)
		q := linear(x, stateDict[prefix+".attn_wq"])
		k := linear(x, stateDict[prefix+".attn_wk"])
		v := linear(x, stateDict[prefix+".attn_wv"])
		// Append this position's key/value to the cache; the loop below then
		// attends over every position up to and including this one (causal).
		kv.keys[li] = append(kv.keys[li], k)
		kv.values[li] = append(kv.values[li], v)
		xAttn := make([]*Value, 0, nEmbd)
		for h := 0; h < nHead; h++ {
			hs := h * headDim
			// Slice out this head's query
			qH := q[hs : hs+headDim]
			// Gather this head's cached keys and values
			nPos := len(kv.keys[li])
			kH := make([][]*Value, nPos)
			vH := make([][]*Value, nPos)
			for t := 0; t < nPos; t++ {
				kH[t] = kv.keys[li][t][hs : hs+headDim]
				vH[t] = kv.values[li][t][hs : hs+headDim]
			}
			// Attention scores: dot(q, k) / sqrt(head_dim)
			attnLogits := make([]*Value, nPos)
			scale := math.Sqrt(float64(headDim))
			for t := 0; t < nPos; t++ {
				dot := NewValue(0)
				for j := 0; j < headDim; j++ {
					dot = dot.Add(qH[j].Mul(kH[t][j]))
				}
				attnLogits[t] = dot.DivScalar(scale)
			}
			attnWeights := softmax(attnLogits)
			// Weighted sum of values; each head appends headDim entries,
			// so xAttn ends up as the nEmbd-long concatenation of all heads.
			for j := 0; j < headDim; j++ {
				s := NewValue(0)
				for t := 0; t < nPos; t++ {
					s = s.Add(attnWeights[t].Mul(vH[t][j]))
				}
				xAttn = append(xAttn, s)
			}
		}
		// Output projection, then add the residual stream back in.
		x = linear(xAttn, stateDict[prefix+".attn_wo"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
		// --- MLP block --- (pre-norm, 4x expansion, ReLU, residual)
		xResidual = x
		x = rmsnorm(x)
		x = linear(x, stateDict[prefix+".mlp_fc1"])
		for i := range x {
			x[i] = x[i].ReLU()
		}
		x = linear(x, stateDict[prefix+".mlp_fc2"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
	}
	// Project the final hidden state to vocabulary logits.
	return linear(x, stateDict["lm_head"])
}
| // ============================================================================= | |
| // MAIN: DATA LOADING, TRAINING, AND INFERENCE | |
| // ============================================================================= | |
| func main() { | |
| rng := NewRNG(42) | |
| // --- Load dataset --- | |
| if _, err := os.Stat("input.txt"); os.IsNotExist(err) { | |
| fmt.Println("Downloading names dataset...") | |
| resp, err := http.Get("https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt") | |
| if err != nil { | |
| panic(err) | |
| } | |
| defer resp.Body.Close() | |
| f, _ := os.Create("input.txt") | |
| defer f.Close() | |
| scanner := bufio.NewScanner(resp.Body) | |
| for scanner.Scan() { | |
| f.WriteString(scanner.Text() + "\n") | |
| } | |
| } | |
| data, _ := os.ReadFile("input.txt") | |
| lines := strings.Split(strings.TrimSpace(string(data)), "\n") | |
| var docs []string | |
| for _, l := range lines { | |
| l = strings.TrimSpace(l) | |
| if l != "" { | |
| docs = append(docs, l) | |
| } | |
| } | |
| rng.Shuffle(docs) | |
| fmt.Printf("num docs: %d\n", len(docs)) | |
| // --- Tokenizer --- | |
| charSet := make(map[rune]bool) | |
| for _, doc := range docs { | |
| for _, ch := range doc { | |
| charSet[ch] = true | |
| } | |
| } | |
| var uchars []rune | |
| for ch := range charSet { | |
| uchars = append(uchars, ch) | |
| } | |
| sort.Slice(uchars, func(i, j int) bool { return uchars[i] < uchars[j] }) | |
| charToIdx := make(map[rune]int) | |
| for i, ch := range uchars { | |
| charToIdx[ch] = i | |
| } | |
| BOS := len(uchars) | |
| vocabSize := len(uchars) + 1 | |
| fmt.Printf("vocab size: %d\n", vocabSize) | |
| // --- Initialize parameters --- | |
| initParams(rng, vocabSize) | |
| fmt.Printf("num params: %d\n", len(params)) | |
| // --- Adam optimizer buffers --- | |
| lr := 0.01 | |
| beta1 := 0.85 | |
| beta2 := 0.99 | |
| epsAdam := 1e-8 | |
| mBuf := make([]float64, len(params)) | |
| vBuf := make([]float64, len(params)) | |
| // --- Training loop --- | |
| numSteps := 1000 | |
| for step := 0; step < numSteps; step++ { | |
| doc := docs[step%len(docs)] | |
| // Tokenize: [BOS, ...chars..., BOS] | |
| tokens := make([]int, 0, len(doc)+2) | |
| tokens = append(tokens, BOS) | |
| for _, ch := range doc { | |
| tokens = append(tokens, charToIdx[ch]) | |
| } | |
| tokens = append(tokens, BOS) | |
| n := len(tokens) - 1 | |
| if n > blockSize { | |
| n = blockSize | |
| } | |
| // Forward pass | |
| kv := NewKVCache() | |
| losses := make([]*Value, 0, n) | |
| for pos := 0; pos < n; pos++ { | |
| tokenID := tokens[pos] | |
| targetID := tokens[pos+1] | |
| logits := gpt(tokenID, pos, kv) | |
| probs := softmax(logits) | |
| lossT := probs[targetID].Log().Neg() | |
| losses = append(losses, lossT) | |
| } | |
| // Average loss | |
| loss := NewValue(0) | |
| for _, l := range losses { | |
| loss = loss.Add(l) | |
| } | |
| loss = loss.DivScalar(float64(n)) | |
| // Backward pass | |
| loss.Backward() | |
| // Adam update | |
| lrT := lr * (1.0 - float64(step)/float64(numSteps)) | |
| for i, p := range params { | |
| mBuf[i] = beta1*mBuf[i] + (1-beta1)*p.grad | |
| vBuf[i] = beta2*vBuf[i] + (1-beta2)*p.grad*p.grad | |
| mHat := mBuf[i] / (1 - math.Pow(beta1, float64(step+1))) | |
| vHat := vBuf[i] / (1 - math.Pow(beta2, float64(step+1))) | |
| p.data -= lrT * mHat / (math.Sqrt(vHat) + epsAdam) | |
| p.grad = 0 | |
| } | |
| fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data) | |
| } | |
| // --- Inference --- | |
| temperature := 0.5 | |
| fmt.Println("\n--- inference (new, hallucinated names) ---") | |
| for sample := 0; sample < 20; sample++ { | |
| kv := NewKVCache() | |
| tokenID := BOS | |
| var name []rune | |
| for pos := 0; pos < blockSize; pos++ { | |
| logits := gpt(tokenID, pos, kv) | |
| // Apply temperature | |
| scaled := make([]*Value, len(logits)) | |
| for i, l := range logits { | |
| scaled[i] = l.DivScalar(temperature) | |
| } | |
| probs := softmax(scaled) | |
| weights := make([]float64, len(probs)) | |
| for i, p := range probs { | |
| weights[i] = p.data | |
| } | |
| tokenID = rng.WeightedChoice(weights) | |
| if tokenID == BOS { | |
| break | |
| } | |
| name = append(name, uchars[tokenID]) | |
| } | |
| fmt.Printf("sample %2d: %s\n", sample+1, string(name)) | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment