Skip to content

Instantly share code, notes, and snippets.

@mehmetkose
Last active February 16, 2026 09:35
Show Gist options
  • Select an option

  • Save mehmetkose/f6e2cd31d60f2d49c5294d65752c8a59 to your computer and use it in GitHub Desktop.

Select an option

Save mehmetkose/f6e2cd31d60f2d49c5294d65752c8a59 to your computer and use it in GitHub Desktop.
// The most atomic way to train and inference a GPT in pure, dependency-free Go.
// This file is the complete algorithm. Everything else is just efficiency.
//
// Ported from @karpathy's pure Python GPT implementation.
// https://x.com/karpathy/status/2021694437152157847
//
// Build and run:
// go build -o microgpt microgpt.go && ./microgpt
//
// NOTE: This is intentionally slow — every operation is scalar, just like the
// Python original. A real implementation would use tensors/BLAS. This exists
// purely for clarity and education.
package main
import (
"bufio"
"fmt"
"math"
"net/http"
"os"
"sort"
"strings"
)
// =============================================================================
// AUTOGRAD ENGINE
// =============================================================================
// The Value type wraps a single float64 and tracks how it was computed, so we
// can automatically compute gradients (derivatives) via backpropagation.
type Value struct {
	data       float64   // the scalar this node holds
	grad       float64   // d(loss)/d(this), accumulated by Backward
	children   []*Value  // operands this node was computed from (empty for leaves)
	localGrads []float64 // d(this)/d(children[i]), one entry per child
}
// NewValue wraps a raw float64 in a leaf Value with no history.
func NewValue(data float64) *Value {
	v := &Value{}
	v.data = data
	return v
}
// NewValueWithGraph builds a Value that remembers its operands and the
// local derivative with respect to each of them.
func NewValueWithGraph(data float64, children []*Value, localGrads []float64) *Value {
	out := new(Value)
	out.data = data
	out.children = children
	out.localGrads = localGrads
	return out
}
// Add returns a+b. Local gradients: d/da = d/db = 1.
func (a *Value) Add(b *Value) *Value {
	sum := a.data + b.data
	return NewValueWithGraph(sum, []*Value{a, b}, []float64{1, 1})
}
// Mul returns a*b. Local gradients: d/da = b, d/db = a.
func (a *Value) Mul(b *Value) *Value {
	prod := a.data * b.data
	return NewValueWithGraph(prod, []*Value{a, b}, []float64{b.data, a.data})
}
// Pow raises a to the constant power n. Local gradient: n * a^(n-1).
func (a *Value) Pow(n float64) *Value {
	y := math.Pow(a.data, n)
	dyda := n * math.Pow(a.data, n-1)
	return NewValueWithGraph(y, []*Value{a}, []float64{dyda})
}
// Log is the natural logarithm. Local gradient: 1/a.
func (a *Value) Log() *Value {
	y := math.Log(a.data)
	return NewValueWithGraph(y, []*Value{a}, []float64{1.0 / a.data})
}
// Exp computes e^a; conveniently, the derivative equals the output itself.
func (a *Value) Exp() *Value {
	y := math.Exp(a.data)
	return NewValueWithGraph(y, []*Value{a}, []float64{y})
}
// ReLU clamps negatives to zero. Gradient is 1 where a > 0, else 0.
func (a *Value) ReLU() *Value {
	if a.data > 0 {
		return NewValueWithGraph(a.data, []*Value{a}, []float64{1.0})
	}
	return NewValueWithGraph(0, []*Value{a}, []float64{0})
}
// Neg returns -a, expressed as multiplication by the constant -1 so the
// gradient flows through Mul.
func (a *Value) Neg() *Value {
	minusOne := NewValue(-1)
	return a.Mul(minusOne)
}
// Sub computes a-b as a + (-b).
func (a *Value) Sub(b *Value) *Value {
	negB := b.Neg()
	return a.Add(negB)
}
// Div computes a/b as a * b^-1.
func (a *Value) Div(b *Value) *Value {
	inv := b.Pow(-1)
	return a.Mul(inv)
}
// AddScalar adds a constant to a without the caller having to wrap it
// in a Value first.
func (a *Value) AddScalar(s float64) *Value {
	c := NewValue(s)
	return a.Add(c)
}
// MulScalar multiplies a by a constant, wrapping it in a Value internally.
func (a *Value) MulScalar(s float64) *Value {
	c := NewValue(s)
	return a.Mul(c)
}
// DivScalar divides a by a constant, wrapping it in a Value internally.
func (a *Value) DivScalar(s float64) *Value {
	c := NewValue(s)
	return a.Div(c)
}
// Backward computes gradients for all Values reachable from a via
// reverse-mode autodiff (backpropagation): it seeds a.grad = 1 and applies
// the chain rule in reverse topological order. Gradients accumulate with
// +=, so callers must zero them between passes.
func (a *Value) Backward() {
	// Post-order DFS yields a topological ordering of the graph.
	seen := make(map[*Value]bool)
	var order []*Value
	var visit func(node *Value)
	visit = func(node *Value) {
		if seen[node] {
			return
		}
		seen[node] = true
		for _, c := range node.children {
			visit(c)
		}
		order = append(order, node)
	}
	visit(a)

	a.grad = 1.0
	// Walk the ordering backwards; every node distributes its gradient
	// to its children weighted by the stored local derivatives.
	for i := len(order) - 1; i >= 0; i-- {
		node := order[i]
		for j, c := range node.children {
			c.grad += node.localGrads[j] * node.grad
		}
	}
}
// =============================================================================
// PSEUDORANDOM NUMBER GENERATOR
// =============================================================================
// A simple xoshiro256** PRNG so we don't need math/rand. Seeded deterministically.
type RNG struct {
	s [4]uint64 // 256 bits of generator state
}
// NewRNG builds a deterministic generator, expanding the 64-bit seed into
// 256 bits of state with the SplitMix64 mixer (the seeding procedure the
// xoshiro authors recommend).
func NewRNG(seed uint64) *RNG {
	rng := new(RNG)
	for i := range rng.s {
		seed += 0x9e3779b97f4a7c15
		z := seed
		z ^= z >> 30
		z *= 0xbf58476d1ce4e5b9
		z ^= z >> 27
		z *= 0x94d049bb133111eb
		z ^= z >> 31
		rng.s[i] = z
	}
	return rng
}
// rotl rotates x left by k bits (k must be in 1..63).
func (r *RNG) rotl(x uint64, k uint) uint64 {
	return x<<k | x>>(64-k)
}
// Next advances the xoshiro256** state and returns the next 64 random bits.
// The statement sequence below is the reference algorithm's; do not reorder —
// each xor reads state already mutated by the line above it.
func (r *RNG) Next() uint64 {
	// Output scrambler (rotl(s1*5, 7)*9), computed before the state update.
	result := r.rotl(r.s[1]*5, 7) * 9
	t := r.s[1] << 17
	r.s[2] ^= r.s[0]
	r.s[3] ^= r.s[1]
	r.s[1] ^= r.s[2]
	r.s[0] ^= r.s[3]
	r.s[2] ^= t
	r.s[3] = r.rotl(r.s[3], 45)
	return result
}
// Float64 returns a random float in [0, 1), built from the top 53 bits
// (the full precision of a float64 mantissa).
func (r *RNG) Float64() float64 {
	bits53 := r.Next() >> 11
	return float64(bits53) / (1 << 53)
}
// Gauss returns a normally distributed random number (Box-Muller transform).
// Note the draw order (u1 then u2, then redraws) is fixed to keep the
// RNG stream reproducible.
func (r *RNG) Gauss(mean, std float64) float64 {
	u1 := r.Float64()
	u2 := r.Float64()
	// log(0) would be -Inf, so redraw u1 until strictly positive.
	for u1 == 0 {
		u1 = r.Float64()
	}
	radius := math.Sqrt(-2 * math.Log(u1))
	z := radius * math.Cos(2*math.Pi*u2)
	return mean + std*z
}
// Shuffle permutes s in place with a Fisher-Yates walk from the back.
func (r *RNG) Shuffle(s []string) {
	for i := len(s) - 1; i > 0; i-- {
		// Pick a partner index in [0, i].
		j := int(r.Next() % uint64(i+1))
		s[j], s[i] = s[i], s[j]
	}
}
// WeightedChoice samples index i with probability weights[i]/sum(weights).
// Weights are unnormalized; the last index is a fallback in case floating
// point rounding leaves the threshold unreached.
func (r *RNG) WeightedChoice(weights []float64) int {
	var sum float64
	for _, w := range weights {
		sum += w
	}
	target := r.Float64() * sum
	var running float64
	for i, w := range weights {
		running += w
		if running > target {
			return i
		}
	}
	return len(weights) - 1
}
// =============================================================================
// MODEL DEFINITION
// =============================================================================
const (
	nEmbd     = 16            // embedding (model) dimension
	nHead     = 4             // attention heads per layer
	nLayer    = 1             // number of transformer layers
	blockSize = 16            // maximum context length in positions
	headDim   = nEmbd / nHead // per-head dimension
)

// stateDict holds all model parameters as named 2D slices of *Value.
var stateDict map[string][][]*Value

// params is a flat list of every parameter, for the optimizer.
var params []*Value
// initMatrix allocates a [nout][nin] matrix of Values drawn from N(0, std^2).
func initMatrix(rng *RNG, nout, nin int, std float64) [][]*Value {
	mat := make([][]*Value, nout)
	for r := range mat {
		row := make([]*Value, nin)
		for c := range row {
			row[c] = NewValue(rng.Gauss(0, std))
		}
		mat[r] = row
	}
	return mat
}
// initParams creates every model weight matrix and flattens them into the
// global params slice used by the optimizer.
//
// Matrices are created in a fixed order so the RNG stream (and hence the
// initial weights) is reproducible. The flattening step iterates the map
// keys in sorted order: Go randomizes map iteration order, so without the
// sort the params / optimizer-buffer indexing would differ from run to run.
func initParams(rng *RNG, vocabSize int) {
	stateDict = make(map[string][][]*Value)
	stateDict["wte"] = initMatrix(rng, vocabSize, nEmbd, 0.08)     // token embeddings
	stateDict["wpe"] = initMatrix(rng, blockSize, nEmbd, 0.08)     // position embeddings
	stateDict["lm_head"] = initMatrix(rng, vocabSize, nEmbd, 0.08) // output projection
	for i := 0; i < nLayer; i++ {
		prefix := fmt.Sprintf("layer%d", i)
		stateDict[prefix+".attn_wq"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wk"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wv"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".attn_wo"] = initMatrix(rng, nEmbd, nEmbd, 0.08)
		stateDict[prefix+".mlp_fc1"] = initMatrix(rng, 4*nEmbd, nEmbd, 0.08)
		stateDict[prefix+".mlp_fc2"] = initMatrix(rng, nEmbd, 4*nEmbd, 0.08)
	}
	// Flatten all parameters into a single slice, in deterministic key order.
	names := make([]string, 0, len(stateDict))
	for name := range stateDict {
		names = append(names, name)
	}
	sort.Strings(names)
	params = nil
	for _, name := range names {
		for _, row := range stateDict[name] {
			params = append(params, row...)
		}
	}
}
// =============================================================================
// MODEL ARCHITECTURE
// =============================================================================
// linear computes the matrix-vector product w·x, building the autograd
// graph as it goes. w is [nout][nin]; x must have length nin.
func linear(x []*Value, w [][]*Value) []*Value {
	out := make([]*Value, len(w))
	for i, row := range w {
		acc := NewValue(0)
		for j, wij := range row {
			acc = acc.Add(wij.Mul(x[j]))
		}
		out[i] = acc
	}
	return out
}
// softmax converts logits into probabilities summing to 1. The max logit
// is subtracted first — a constant shift that leaves the result unchanged
// but avoids overflow in Exp.
func softmax(logits []*Value) []*Value {
	shift := logits[0].data
	for _, l := range logits[1:] {
		shift = math.Max(shift, l.data)
	}
	exps := make([]*Value, len(logits))
	denom := NewValue(0)
	for i, l := range logits {
		exps[i] = l.AddScalar(-shift).Exp()
		denom = denom.Add(exps[i])
	}
	probs := make([]*Value, len(logits))
	for i, e := range exps {
		probs[i] = e.Div(denom)
	}
	return probs
}
// rmsnorm rescales x so its root-mean-square is ~1; the 1e-5 epsilon
// guards against division by zero for an all-zero vector.
func rmsnorm(x []*Value) []*Value {
	meanSq := NewValue(0)
	for _, xi := range x {
		meanSq = meanSq.Add(xi.Mul(xi))
	}
	meanSq = meanSq.DivScalar(float64(len(x)))
	invRMS := meanSq.AddScalar(1e-5).Pow(-0.5)
	normed := make([]*Value, len(x))
	for i, xi := range x {
		normed[i] = xi.Mul(invRMS)
	}
	return normed
}
// KVCache stores the key and value vectors for each layer across positions,
// so each new token only computes its own k/v and attends over the cache.
type KVCache struct {
	keys   [nLayer][][]*Value // keys[layer][position] = nEmbd-long vector
	values [nLayer][][]*Value // values[layer][position] = nEmbd-long vector
}
// NewKVCache returns an empty cache. The zero value is already usable:
// every keys/values slice starts nil and grows via append, so the original
// per-layer nil-assignment loop was dead code and has been removed.
func NewKVCache() *KVCache {
	return &KVCache{}
}
// gpt runs one transformer forward step for a single token at position
// posID and returns the unnormalized next-token logits (length vocabSize).
// It appends this position's key/value vectors to kv, so calling it for
// positions 0..t-1 first gives position t its full attention span.
func gpt(tokenID, posID int, kv *KVCache) []*Value {
	// Embedding lookup: token + position
	tokEmb := stateDict["wte"][tokenID]
	posEmb := stateDict["wpe"][posID]
	x := make([]*Value, nEmbd)
	for i := range x {
		x[i] = tokEmb[i].Add(posEmb[i])
	}
	x = rmsnorm(x)
	for li := 0; li < nLayer; li++ {
		prefix := fmt.Sprintf("layer%d", li)
		// --- Multi-head attention ---
		xResidual := x // saved for the residual (skip) connection
		x = rmsnorm(x) // pre-norm transformer style
		q := linear(x, stateDict[prefix+".attn_wq"])
		k := linear(x, stateDict[prefix+".attn_wk"])
		v := linear(x, stateDict[prefix+".attn_wv"])
		// Cache this position's k/v so later positions can attend to it.
		kv.keys[li] = append(kv.keys[li], k)
		kv.values[li] = append(kv.values[li], v)
		xAttn := make([]*Value, 0, nEmbd)
		for h := 0; h < nHead; h++ {
			hs := h * headDim // this head's offset into the embedding
			// Slice out this head's query
			qH := q[hs : hs+headDim]
			// Gather this head's cached keys and values (all positions so far)
			nPos := len(kv.keys[li])
			kH := make([][]*Value, nPos)
			vH := make([][]*Value, nPos)
			for t := 0; t < nPos; t++ {
				kH[t] = kv.keys[li][t][hs : hs+headDim]
				vH[t] = kv.values[li][t][hs : hs+headDim]
			}
			// Attention scores: dot(q, k) / sqrt(head_dim)
			attnLogits := make([]*Value, nPos)
			scale := math.Sqrt(float64(headDim))
			for t := 0; t < nPos; t++ {
				dot := NewValue(0)
				for j := 0; j < headDim; j++ {
					dot = dot.Add(qH[j].Mul(kH[t][j]))
				}
				attnLogits[t] = dot.DivScalar(scale)
			}
			attnWeights := softmax(attnLogits)
			// Weighted sum of values becomes this head's output slice
			for j := 0; j < headDim; j++ {
				s := NewValue(0)
				for t := 0; t < nPos; t++ {
					s = s.Add(attnWeights[t].Mul(vH[t][j]))
				}
				xAttn = append(xAttn, s)
			}
		}
		// Project the concatenated heads back to nEmbd, then add the residual.
		x = linear(xAttn, stateDict[prefix+".attn_wo"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
		// --- MLP block: norm -> expand 4x -> ReLU -> project back ---
		xResidual = x
		x = rmsnorm(x)
		x = linear(x, stateDict[prefix+".mlp_fc1"])
		for i := range x {
			x[i] = x[i].ReLU()
		}
		x = linear(x, stateDict[prefix+".mlp_fc2"])
		for i := range x {
			x[i] = x[i].Add(xResidual[i])
		}
	}
	// Final projection from the hidden state to vocabulary logits.
	return linear(x, stateDict["lm_head"])
}
// =============================================================================
// MAIN: DATA LOADING, TRAINING, AND INFERENCE
// =============================================================================
// main downloads the names dataset if it is not already on disk, trains the
// character-level GPT for a fixed number of steps, then samples new names.
//
// Fixes over the original: the os.Create / os.ReadFile / WriteString errors
// and scanner.Err() were silently ignored, and a non-200 HTTP response was
// written to disk as if it were data — any of which could leave the program
// training on an empty or corrupt input.txt.
func main() {
	rng := NewRNG(42)

	// --- Load dataset (download once, then reuse the local copy) ---
	if _, err := os.Stat("input.txt"); os.IsNotExist(err) {
		fmt.Println("Downloading names dataset...")
		resp, err := http.Get("https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt")
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			panic(fmt.Errorf("downloading names.txt: unexpected status %s", resp.Status))
		}
		f, err := os.Create("input.txt")
		if err != nil {
			panic(err)
		}
		defer f.Close()
		scanner := bufio.NewScanner(resp.Body)
		for scanner.Scan() {
			if _, err := f.WriteString(scanner.Text() + "\n"); err != nil {
				panic(err)
			}
		}
		if err := scanner.Err(); err != nil {
			panic(err)
		}
	}
	data, err := os.ReadFile("input.txt")
	if err != nil {
		panic(err)
	}
	lines := strings.Split(strings.TrimSpace(string(data)), "\n")
	var docs []string
	for _, l := range lines {
		l = strings.TrimSpace(l)
		if l != "" {
			docs = append(docs, l)
		}
	}
	rng.Shuffle(docs)
	fmt.Printf("num docs: %d\n", len(docs))

	// --- Tokenizer: one token per unique character, plus a BOS marker ---
	charSet := make(map[rune]bool)
	for _, doc := range docs {
		for _, ch := range doc {
			charSet[ch] = true
		}
	}
	var uchars []rune
	for ch := range charSet {
		uchars = append(uchars, ch)
	}
	// Sort so token ids are deterministic despite map iteration order.
	sort.Slice(uchars, func(i, j int) bool { return uchars[i] < uchars[j] })
	charToIdx := make(map[rune]int)
	for i, ch := range uchars {
		charToIdx[ch] = i
	}
	BOS := len(uchars) // BOS doubles as the end-of-name token
	vocabSize := len(uchars) + 1
	fmt.Printf("vocab size: %d\n", vocabSize)

	// --- Initialize parameters ---
	initParams(rng, vocabSize)
	fmt.Printf("num params: %d\n", len(params))

	// --- Adam optimizer state ---
	lr := 0.01
	beta1 := 0.85
	beta2 := 0.99
	epsAdam := 1e-8
	mBuf := make([]float64, len(params)) // first-moment (mean) estimates
	vBuf := make([]float64, len(params)) // second-moment (variance) estimates

	// --- Training loop: one document per step ---
	numSteps := 1000
	for step := 0; step < numSteps; step++ {
		doc := docs[step%len(docs)]
		// Tokenize: [BOS, ...chars..., BOS]
		tokens := make([]int, 0, len(doc)+2)
		tokens = append(tokens, BOS)
		for _, ch := range doc {
			tokens = append(tokens, charToIdx[ch])
		}
		tokens = append(tokens, BOS)
		// n = number of (input, target) pairs, capped at the context size.
		n := len(tokens) - 1
		if n > blockSize {
			n = blockSize
		}
		// Forward pass: per-position cross-entropy against the next token.
		kv := NewKVCache()
		losses := make([]*Value, 0, n)
		for pos := 0; pos < n; pos++ {
			tokenID := tokens[pos]
			targetID := tokens[pos+1]
			logits := gpt(tokenID, pos, kv)
			probs := softmax(logits)
			lossT := probs[targetID].Log().Neg() // -log p(target)
			losses = append(losses, lossT)
		}
		// Average loss over the sequence.
		loss := NewValue(0)
		for _, l := range losses {
			loss = loss.Add(l)
		}
		loss = loss.DivScalar(float64(n))
		// Backward pass
		loss.Backward()
		// Adam update with linear learning-rate decay to zero.
		lrT := lr * (1.0 - float64(step)/float64(numSteps))
		for i, p := range params {
			mBuf[i] = beta1*mBuf[i] + (1-beta1)*p.grad
			vBuf[i] = beta2*vBuf[i] + (1-beta2)*p.grad*p.grad
			// Bias correction for the zero-initialized moment estimates.
			mHat := mBuf[i] / (1 - math.Pow(beta1, float64(step+1)))
			vHat := vBuf[i] / (1 - math.Pow(beta2, float64(step+1)))
			p.data -= lrT * mHat / (math.Sqrt(vHat) + epsAdam)
			p.grad = 0 // reset; Backward accumulates with +=
		}
		fmt.Printf("step %4d / %4d | loss %.4f\n", step+1, numSteps, loss.data)
	}

	// --- Inference: sample new names one token at a time ---
	temperature := 0.5 // < 1 sharpens the distribution toward likely names
	fmt.Println("\n--- inference (new, hallucinated names) ---")
	for sample := 0; sample < 20; sample++ {
		kv := NewKVCache()
		tokenID := BOS
		var name []rune
		for pos := 0; pos < blockSize; pos++ {
			logits := gpt(tokenID, pos, kv)
			// Apply temperature before the softmax.
			scaled := make([]*Value, len(logits))
			for i, l := range logits {
				scaled[i] = l.DivScalar(temperature)
			}
			probs := softmax(scaled)
			weights := make([]float64, len(probs))
			for i, p := range probs {
				weights[i] = p.data
			}
			tokenID = rng.WeightedChoice(weights)
			if tokenID == BOS {
				break // model emitted end-of-name
			}
			name = append(name, uchars[tokenID])
		}
		fmt.Printf("sample %2d: %s\n", sample+1, string(name))
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment