Skip to content

Instantly share code, notes, and snippets.

@mehmetkose
Last active February 16, 2026 09:35
Show Gist options
  • Select an option

  • Save mehmetkose/f6e2cd31d60f2d49c5294d65752c8a59 to your computer and use it in GitHub Desktop.

Select an option

Save mehmetkose/f6e2cd31d60f2d49c5294d65752c8a59 to your computer and use it in GitHub Desktop.
// The most atomic way to train and inference a GPT in pure, dependency-free Go.
// This file is the complete algorithm. Everything else is just efficiency.
//
// Ported from @karpathy's pure Python GPT implementation.
// https://x.com/karpathy/status/2021694437152157847
//
// Build and run:
// go build -o microgpt microgpt.go && ./microgpt
//
// NOTE: This is intentionally slow — every operation is scalar, just like the
// Python original. A real implementation would use tensors/BLAS. This exists
// purely for clarity and education.
package main
import (
"bufio"
"fmt"
"math"
"net/http"
"os"
"sort"
"strings"
)
// =============================================================================
// AUTOGRAD ENGINE
// =============================================================================
// The Value type wraps a single float64 and tracks how it was computed, so we
// can automatically compute gradients (derivatives) via backpropagation.
type Value struct {
	data       float64   // the scalar this node holds
	grad       float64   // d(loss)/d(this), accumulated by Backward
	children   []*Value  // operands this node was computed from (empty for leaves)
	localGrads []float64 // d(this)/d(children[i]), one entry per child
}
// NewValue wraps a raw float64 in a leaf Value with no history.
func NewValue(data float64) *Value {
	v := &Value{}
	v.data = data
	return v
}
// NewValueWithGraph builds a Value that remembers its operands and the
// local derivative with respect to each of them.
func NewValueWithGraph(data float64, children []*Value, localGrads []float64) *Value {
	out := new(Value)
	out.data = data
	out.children = children
	out.localGrads = localGrads
	return out
}
// Add returns a+b. Local gradients: d/da = d/db = 1.
func (a *Value) Add(b *Value) *Value {
	sum := a.data + b.data
	return NewValueWithGraph(sum, []*Value{a, b}, []float64{1, 1})
}
// Mul returns a*b. Local gradients: d/da = b, d/db = a.
func (a *Value) Mul(b *Value) *Value {
	prod := a.data * b.data
	return NewValueWithGraph(prod, []*Value{a, b}, []float64{b.data, a.data})
}
// Pow raises a to the constant power n. Local gradient: n * a^(n-1).
func (a *Value) Pow(n float64) *Value {
	y := math.Pow(a.data, n)
	dyda := n * math.Pow(a.data, n-1)
	return NewValueWithGraph(y, []*Value{a}, []float64{dyda})
}
// Log is the natural logarithm. Local gradient: 1/a.
func (a *Value) Log() *Value {
	y := math.Log(a.data)
	return NewValueWithGraph(y, []*Value{a}, []float64{1.0 / a.data})
}
// Exp computes e^a; conveniently, the derivative equals the output itself.
func (a *Value) Exp() *Value {
	y := math.Exp(a.data)
	return NewValueWithGraph(y, []*Value{a}, []float64{y})
}
// ReLU clamps negatives to zero. Gradient is 1 where a > 0, else 0.
func (a *Value) ReLU() *Value {
	if a.data > 0 {
		return NewValueWithGraph(a.data, []*Value{a}, []float64{1.0})
	}
	return NewValueWithGraph(0, []*Value{a}, []float64{0})
}
// Neg returns -a, expressed as multiplication by the constant -1 so the
// gradient flows through Mul.
func (a *Value) Neg() *Value {
	minusOne := NewValue(-1)
	return a.Mul(minusOne)
}
// Sub computes a-b as a + (-b).
func (a *Value) Sub(b *Value) *Value {
	negB := b.Neg()
	return a.Add(negB)
}
// Div computes a/b as a * b^-1.
func (a *Value) Div(b *Value) *Value {
	inv := b.Pow(-1)
	return a.Mul(inv)
}
// AddScalar adds a constant to a without the caller having to wrap it
// in a Value first.
func (a *Value) AddScalar(s float64) *Value {
	c := NewValue(s)
	return a.Add(c)
}
// MulScalar multiplies a by a constant, wrapping it in a Value internally.
func (a *Value) MulScalar(s float64) *Value {
	c := NewValue(s)
	return a.Mul(c)
}
// DivScalar divides a by a constant, wrapping it in a Value internally.
func (a *Value) DivScalar(s float64) *Value {
	c := NewValue(s)
	return a.Div(c)
}
// Backward computes gradients for all Values reachable from a via
// reverse-mode autodiff (backpropagation): it seeds a.grad = 1 and applies
// the chain rule in reverse topological order. Gradients accumulate with
// +=, so callers must zero them between passes.
func (a *Value) Backward() {
	// Post-order DFS yields a topological ordering of the graph.
	seen := make(map[*Value]bool)
	var order []*Value
	var visit func(node *Value)
	visit = func(node *Value) {
		if seen[node] {
			return
		}
		seen[node] = true
		for _, c := range node.children {
			visit(c)
		}
		order = append(order, node)
	}
	visit(a)

	a.grad = 1.0
	// Walk the ordering backwards; every node distributes its gradient
	// to its children weighted by the stored local derivatives.
	for i := len(order) - 1; i >= 0; i-- {
		node := order[i]
		for j, c := range node.children {
			c.grad += node.localGrads[j] * node.grad
		}
	}
}
// =============================================================================
// PSEUDORANDOM NUMBER GENERATOR
// =============================================================================
// A simple xoshiro256** PRNG so we don't need math/rand. Seeded deterministically.
type RNG struct {
	s [4]uint64 // 256 bits of generator state
}
// NewRNG builds a deterministic generator, expanding the 64-bit seed into
// 256 bits of state with the SplitMix64 mixer (the seeding procedure the
// xoshiro authors recommend).
func NewRNG(seed uint64) *RNG {
	rng := new(RNG)
	for i := range rng.s {
		seed += 0x9e3779b97f4a7c15
		z := seed
		z ^= z >> 30
		z *= 0xbf58476d1ce4e5b9
		z ^= z >> 27
		z *= 0x94d049bb133111eb
		z ^= z >> 31
		rng.s[i] = z
	}
	return rng
}
// rotl rotates x left by k bits (k must be in 1..63).
func (r *RNG) rotl(x uint64, k uint) uint64 {
	return x<<k | x>>(64-k)
}
// Next advances the xoshiro256** state and returns the next 64 random bits.
// The statement sequence below is the reference algorithm's; do not reorder —
// each xor reads state already mutated by the line above it.
func (r *RNG) Next() uint64 {
	// Output scrambler (rotl(s1*5, 7)*9), computed before the state update.
	result := r.rotl(r.s[1]*5, 7) * 9
	t := r.s[1] << 17
	r.s[2] ^= r.s[0]
	r.s[3] ^= r.s[1]
	r.s[1] ^= r.s[2]
	r.s[0] ^= r.s[3]
	r.s[2] ^= t
	r.s[3] = r.rotl(r.s[3], 45)
	return result
}
// Float64 returns a random float in [0, 1), built from the top 53 bits
// (the full precision of a float64 mantissa).
func (r *RNG) Float64() float64 {
	bits53 := r.Next() >> 11
	return float64(bits53) / (1 << 53)
}
// Gauss returns a normally distributed random number (Box-Muller transform).
// Note the draw order (u1 then u2, then redraws) is fixed to keep the
// RNG stream reproducible.
func (r *RNG) Gauss(mean, std float64) float64 {
	u1 := r.Float64()
	u2 := r.Float64()
	// log(0) would be -Inf, so redraw u1 until strictly positive.
	for u1 == 0 {
		u1 = r.Float64()
	}
	radius := math.Sqrt(-2 * math.Log(u1))
	z := radius * math.Cos(2*math.Pi*u2)
	return mean + std*z
}
// Shuffle permutes s in place with a Fisher-Yates walk from the back.
func (r *RNG) Shuffle(s []string) {
	for i := len(s) - 1; i > 0; i-- {
		// Pick a partner index in [0, i].
		j := int(r.Next() % uint64(i+1))
		s[j], s[i] = s[i], s[j]
	}
}
// WeightedChoice samples index i with probability weights[i]/sum(weights).
// Weights are unnormalized; the last index is a fallback in case floating
// point rounding leaves the threshold unreached.
func (r *RNG) WeightedChoice(weights []float64) int {
	var sum float64
	for _, w := range weights {
		sum += w
	}
	target := r.Float64() * sum
	var running float64
	for i, w := range weights {
		running += w
		if running > target {
			return i
		}
	}
	return len(weights) - 1
}
// =============================================================================
// MODEL DEFINITION
// =============================================================================
const (
	nEmbd     = 16            // embedding (model) dimension
	nHead     = 4             // attention heads per layer
	nLayer    = 1             // number of transformer layers
	blockSize = 16            // maximum context length in positions
	headDim   = nEmbd / nHead // per-head dimension
)

// stateDict holds all model parameters as named 2D slices of *Value.
var stateDict map[string][][]*Value

// params is a flat list of every parameter, for the optimizer.
var params []*Value
// initMatrix allocates a [nout][nin] matrix of Values drawn from N(0, std^2).
func initMatrix(rng *RNG, nout, nin int, std float64) [][]*Value {
	mat := make([][]*Value, nout)
	for r := range mat {
		row := make([]*Value, nin)
		for c := range row {
			row[c] = NewValue(rng.Gauss(0, std))
		}
		mat[r] = row
	}
	return mat
}
// initParams creates every model weight matrix and flattens them into the
// global params slice used by the optimizer.
//
// Matrices are created in a fixed order so the RNG stream (and hence the
// initial weights) is reproducible. The flattening step iterates the map
// keys in sorted order: Go randomizes map iteration order, so without the
// sort the params / optimizer-buffer indexing would differ from run to run.
func initParams(rng *RNG, vocabSize int) {
	stateDict = make(map[string][][]*Value)
	stateDict["wte"] = initMatrix(rng, vocabSize, nEmbd, 0.08)     // token embeddings
	stateDict["wpe"] = initMatrix(rng, blockSize, nEmbd, 0.08)     // position embeddings
	stateDict["lm_head"] = initMatrix(rng, vocabSize, nEmbd, 0.08) // output projection
	for i := 0; i < nLayer; i++ {
		prefix := fmt.Sprintf("layer%d", i)
		stateDict[prefix+".attn_wq"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wk"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wv"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wo"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".mlp_fc1"] = initMatrix(rng, 4*nEmbd, nEmbd, 0.08)
		stateDict[prefix+".mlp_fc2"] = initMatrix(rng, nEmbd, 4*nEmbd, 0.08)
	}
	// Flatten all parameters into a single slice, in deterministic key order.
	names := make([]string, 0, len(stateDict))
	for name := range stateDict {
		names = append(names, name)
	}
	sort.Strings(names)
	params = nil
	for _, name := range names {
		for _, row := range stateDict[name] {
			params = append(params, row...)
		}
	}
}
// =============================================================================
// MODEL ARCHITECTURE
// =============================================================================
// linear computes the matrix-vector product w·x, building the autograd
// graph as it goes. w is [nout][nin]; x must have length nin.
func linear(x []*Value, w [][]*Value) []*Value {
	out := make([]*Value, len(w))
	for i, row := range w {
		acc := NewValue(0)
		for j, wij := range row {
			acc = acc.Add(wij.Mul(x[j]))
		}
		out[i] = acc
	}
	return out
}
// softmax converts logits into probabilities summing to 1. The max logit
// is subtracted first — a constant shift that leaves the result unchanged
// but avoids overflow in Exp.
func softmax(logits []*Value) []*Value {
	shift := logits[0].data
	for _, l := range logits[1:] {
		shift = math.Max(shift, l.data)
	}
	exps := make([]*Value, len(logits))
	denom := NewValue(0)
	for i, l := range logits {
		exps[i] = l.AddScalar(-shift).Exp()
		denom = denom.Add(exps[i])
	}
	probs := make([]*Value, len(logits))
	for i, e := range exps {
		probs[i] = e.Div(denom)
	}
	return probs
}
// rmsnorm rescales x so its root-mean-square is ~1; the 1e-5 epsilon
// guards against division by zero for an all-zero vector.
func rmsnorm(x []*Value) []*Value {
	meanSq := NewValue(0)
	for _, xi := range x {
		meanSq = meanSq.Add(xi.Mul(xi))
	}
	meanSq = meanSq.DivScalar(float64(len(x)))
	invRMS := meanSq.AddScalar(1e-5).Pow(-0.5)
	normed := make([]*Value, len(x))
	for i, xi := range x {
		normed[i] = xi.Mul(invRMS)
	}
	return normed
}
// KVCache stores the key and value vectors for each layer across positions,
// so each new token only computes its own k/v and attends over the cache.
type KVCache struct {
	keys   [nLayer][][]*Value // keys[layer][position] = nEmbd-long vector
	values [nLayer][][]*Value // values[layer][position] = nEmbd-long vector
}
// NewKVCache returns an empty cache. The zero value is already usable:
// every keys/values slice starts nil and grows via append, so the original
// per-layer nil-assignment loop was dead code and has been removed.
func NewKVCache() *KVCache {
	return &KVCache{}
}
// gpt runs one transformer forward step for a single token at position
// posID and returns the unnormalized next-token logits (length vocabSize).
// It appends this position's key/value vectors to kv, so calling it for
// positions 0..t-1 first gives position t its full attention span.
func gpt(tokenID, posID int, kv *KVCache) []*Value {
	// Embedding lookup: token + position
	tokEmb := stateDict["wte"][tokenID]
	posEmb := stateDict["wpe"][posID]
	x := make([]*Value, nEmbd)
	for i := range x {
		x[i] = tokEmb[i].Add(posEmb[i])
	}
	x = rmsnorm(x)
	for li := 0; li < nLayer; li++ {
		prefix := fmt.Sprintf("layer%d", li)
		// --- Multi-head attention ---
		xResidual := x // saved for the residual (skip) connection
		x = rmsnorm(x) // pre-norm transformer style
		q := linear(x, stateDict[prefix+".attn_wq"])
		k := linear(x, stateDict[prefix+".attn_wk"])
		v := linear(x, stateDict[prefix+".attn_wv"])
		// Cache this position's k/v so later positions can attend to it.
		kv.keys[li] = append(kv.keys[li], k)
		kv.values[li] = append(kv.values[li], v)
		xAttn := make([]*Value, 0, nEmbd)
		for h := 0; h < nHead; h++ {
			hs := h * headDim // this head's offset into the embedding
			// Slice out this head's query
			qH := q[hs : hs+headDim]
			// Gather this head's cached keys and values (all positions so far)
			nPos := len(kv.keys[li])
			kH := make([][]*Value, nPos)
			vH := make([][]*Value, nPos)
			for t := 0; t < nPos; t++ {
				kH[t] = kv.keys[li][t][hs : hs+headDim]
				vH[t] = kv.values[li][t][hs : hs+headDim]
			}
			// Attention scores: dot(q, k) / sqrt(head_dim)
			attnLogits := make([]*Value, nPos)
			scale := math.Sqrt(float64(headDim))
			for t := 0; t < nPos; t++ {
				dot := NewValue(0)
				for j := 0; j < headDim; j++ {
					dot = dot.Add(qH[j].Mul(kH[t][j]))
				}
				attnLogits[t] = dot.DivScalar(scale)
			}
			attnWeights := softmax(attnLogits)
			// Weighted sum of values becomes this head's output slice
			for j := 0; j < headDim; j++ {
				s := NewValue(0)
				for t := 0; t < nPos; t++ {
					s = s.Add(attnWeights[t].Mul(vH[t][j]))
				}
				xAttn = append(xAttn, s)
			}
		}
		// Project the concatenated heads back to nEmbd, then add the residual.
		x = linear(xAttn, stateDict[prefix+".attn_wo"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
		// --- MLP block: norm -> expand 4x -> ReLU -> project back ---
		xResidual = x
		x = rmsnorm(x)
		x = linear(x, stateDict[prefix+".mlp_fc1"])
		for i := range x {
			x[i] = x[i].ReLU()
		}
		x = linear(x, stateDict[prefix+".mlp_fc2"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
	}
	// Final projection from the hidden state to vocabulary logits.
	return linear(x, stateDict["lm_head"])
}
// =============================================================================
// MAIN: DATA LOADING, TRAINING, AND INFERENCE
// =============================================================================
// main downloads the names dataset if it is not already on disk, trains the
// character-level GPT for a fixed number of steps, then samples new names.
//
// Fixes over the original: the os.Create / os.ReadFile / WriteString errors
// and scanner.Err() were silently ignored, and a non-200 HTTP response was
// written to disk as if it were data — any of which could leave the program
// training on an empty or corrupt input.txt.
func main() {
	rng := NewRNG(42)

	// --- Load dataset (download once, then reuse the local copy) ---
	if _, err := os.Stat("input.txt"); os.IsNotExist(err) {
		fmt.Println("Downloading names dataset...")
		resp, err := http.Get("https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt")
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			panic(fmt.Errorf("downloading names.txt: unexpected status %s", resp.Status))
		}
		f, err := os.Create("input.txt")
		if err != nil {
			panic(err)
		}
		defer f.Close()
		scanner := bufio.NewScanner(resp.Body)
		for scanner.Scan() {
			if _, err := f.WriteString(scanner.Text() + "\n"); err != nil {
				panic(err)
			}
		}
		if err := scanner.Err(); err != nil {
			panic(err)
		}
	}
	data, err := os.ReadFile("input.txt")
	if err != nil {
		panic(err)
	}
	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
	var docs []string
	for _, l := range lines {
		l = strings.TrimSpace(l)
		if l != "" {
			docs = append(docs, l)
		}
	}
	rng.Shuffle(docs)
	fmt.Printf("num docs: %d\n", len(docs))

	// --- Tokenizer: one token per unique character, plus a BOS marker ---
	charSet := make(map[rune]bool)
	for _, doc := range docs {
		for _, ch := range doc {
			charSet[ch] = true
		}
	}
	var uchars []rune
	for ch := range charSet {
		uchars = append(uchars, ch)
	}
	// Sort so token ids are deterministic despite map iteration order.
	sort.Slice(uchars, func(i, j int) bool { return uchars[i] < uchars[j] })
	charToIdx := make(map[rune]int)
	for i, ch := range uchars {
		charToIdx[ch] = i
	}
	BOS := len(uchars) // BOS doubles as the end-of-name token
	vocabSize := len(uchars) + 1
	fmt.Printf("vocab size: %d\n", vocabSize)

	// --- Initialize parameters ---
	initParams(rng, vocabSize)
	fmt.Printf("num params: %d\n", len(params))

	// --- Adam optimizer state ---
	lr := 0.01
	beta1 := 0.85
	beta2 := 0.99
	epsAdam := 1e-8
	mBuf := make([]float64, len(params)) // first-moment (mean) estimates
	vBuf := make([]float64, len(params)) // second-moment (variance) estimates

	// --- Training loop: one document per step ---
	numSteps := 1000
	for step := 0; step < numSteps; step++ {
		doc := docs[step%len(docs)]
		// Tokenize: [BOS, ...chars..., BOS]
		tokens := make([]int, 0, len(doc)+2)
		tokens = append(tokens, BOS)
		for _, ch := range doc {
			tokens = append(tokens, charToIdx[ch])
		}
		tokens = append(tokens, BOS)
		// n = number of (input, target) pairs, capped at the context size.
		n := len(tokens) - 1
		if n > blockSize {
			n = blockSize
		}
		// Forward pass: per-position cross-entropy against the next token.
		kv := NewKVCache()
		losses := make([]*Value, 0, n)
		for pos := 0; pos < n; pos++ {
			tokenID := tokens[pos]
			targetID := tokens[pos+1]
			logits := gpt(tokenID, pos, kv)
			probs := softmax(logits)
			lossT := probs[targetID].Log().Neg() // -log p(target)
			losses = append(losses, lossT)
		}
		// Average loss over the sequence.
		loss := NewValue(0)
		for _, l := range losses {
			loss = loss.Add(l)
		}
		loss = loss.DivScalar(float64(n))
		// Backward pass
		loss.Backward()
		// Adam update with linear learning-rate decay to zero.
		lrT := lr * (1.0 - float64(step)/float64(numSteps))
		for i, p := range params {
			mBuf[i] = beta1*mBuf[i] + (1-beta1)*p.grad
			vBuf[i] = beta2*vBuf[i] + (1-beta2)*p.grad*p.grad
			// Bias correction for the zero-initialized moment estimates.
			mHat := mBuf[i] / (1 - math.Pow(beta1, float64(step+1)))
			vHat := vBuf[i] / (1 - math.Pow(beta2, float64(step+1)))
			p.data -= lrT * mHat / (math.Sqrt(vHat) + epsAdam)
			p.grad = 0 // reset; Backward accumulates with +=
		}
		fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
	}

	// --- Inference: sample new names one token at a time ---
	temperature := 0.5 // < 1 sharpens the distribution toward likely names
	fmt.Println("\n--- inference (new, hallucinated names) ---")
	for sample := 0; sample < 20; sample++ {
		kv := NewKVCache()
		tokenID := BOS
		var name []rune
		for pos := 0; pos < blockSize; pos++ {
			logits := gpt(tokenID, pos, kv)
			// Apply temperature before the softmax.
			scaled := make([]*Value, len(logits))
			for i, l := range logits {
				scaled[i] = l.DivScalar(temperature)
			}
			probs := softmax(scaled)
			weights := make([]float64, len(probs))
			for i, p := range probs {
				weights[i] = p.data
			}
			tokenID = rng.WeightedChoice(weights)
			if tokenID == BOS {
				break // model emitted end-of-name
			}
			name = append(name, uchars[tokenID])
		}
		fmt.Printf("sample %2d: %s\n", sample+1, string(name))
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment