Skip to content

Instantly share code, notes, and snippets.

@snoblenet
Created February 18, 2026 03:12
Show Gist options
  • Select an option

  • Save snoblenet/7739055e32bffb81277b6a08d33a37ef to your computer and use it in GitHub Desktop.

Select an option

Save snoblenet/7739055e32bffb81277b6a08d33a37ef to your computer and use it in GitHub Desktop.
Minimal GPT trainer and inference in pure TypeScript (no dependencies). Port of @karpathy's microgpt.py
#!/usr/bin/env bun
/**
* Minimal GPT trainer and inference in pure TypeScript (no dependencies).
* TypeScript port of Andrej Karpathy's microgpt.py.
* Original: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
*
* A GPT (Generative Pre-trained Transformer) is a neural network that learns
* to predict the next token in a sequence. This script demonstrates the full
* pipeline: autograd (automatic differentiation for computing gradients),
* a transformer architecture, training with the Adam optimizer, and
* autoregressive text generation — all in ~350 lines, zero dependencies.
*/
/** Hyperparameters that fix the transformer's shape. */
type ModelConfig = {
  /** Maximum sequence length — one positional embedding row per position. */
  blockSize: number;
  /** Width of each attention head's Q/K/V slice. */
  headDim: number;
  /** Embedding (residual stream) width; assumes nHead * headDim == nEmbd — holds for the config below. */
  nEmbd: number;
  /** Number of attention heads per layer. */
  nHead: number;
  /** Number of transformer layers. */
  nLayer: number;
  /** Number of distinct tokens (unique chars + 1 for the BOS marker). */
  vocabSize: number;
};
/** Parameter-name → weight-matrix map; insertion order defines the optimizer's flat parameter layout. */
type StateDict = Record<string, Value[][]>;
/** Deterministic PRNG — fixes the random seed so training is reproducible. */
class Rng {
  _hasSpare: boolean;
  _spare: number;
  _state: number;

  constructor(seed: number) {
    this._hasSpare = false;
    this._spare = 0;
    this._state = seed >>> 0;
  }

  /**
   * Weighted sampling: returns one element of `population`, where element i
   * is drawn with probability weights[i] / sum(weights). If floating-point
   * round-off leaves the running remainder positive after the loop, the last
   * element is returned as a fallback.
   */
  choices(population: number[], weights: number[]): number {
    let total = 0;
    for (const w of weights) total += w;
    let remaining = this.random() * total;
    for (let idx = 0; idx < population.length; idx++) {
      remaining -= weights[idx];
      if (remaining <= 0) return population[idx];
    }
    return population[population.length - 1];
  }

  /**
   * Box-Muller polar method: converts two uniform samples into one Gaussian
   * sample, caching the second ("spare") sample for the next call. Used for
   * weight initialization — starting weights from a small Gaussian (rather
   * than all zeros) breaks symmetry so each neuron can learn different
   * features.
   */
  gauss(mean: number, std: number): number {
    if (this._hasSpare) {
      this._hasSpare = false;
      return this._spare * std + mean;
    }
    let u: number;
    let v: number;
    let s: number;
    // Rejection-sample a point inside the unit circle (excluding the origin).
    do {
      u = this.random() * 2 - 1;
      v = this.random() * 2 - 1;
      s = u * u + v * v;
    } while (s >= 1 || s === 0);
    const factor = Math.sqrt((-2 * Math.log(s)) / s);
    this._spare = v * factor;
    this._hasSpare = true;
    return u * factor * std + mean;
  }

  /** mulberry32 bit mixer; advances the state and returns a uniform float in [0, 1). */
  random(): number {
    this._state = (this._state + 0x6d2b79f5) >>> 0;
    const s = this._state;
    let mixed = Math.imul(s ^ (s >>> 15), s | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  }

  /** In-place Fisher–Yates shuffle driven by this PRNG. */
  shuffle<T>(arr: T[]): void {
    for (let last = arr.length - 1; last > 0; last--) {
      const pick = Math.floor(this.random() * (last + 1));
      const tmp = arr[last];
      arr[last] = arr[pick];
      arr[pick] = tmp;
    }
  }
}
/**
 * Autograd engine — every scalar value in the network is wrapped in a Value
 * so that operations build a computation graph. Once you call `backward()` on
 * the loss, gradients flow back through the graph via the chain rule, telling
 * each parameter how to change to reduce the loss.
 */
class Value {
  /** The operands (parent nodes) that produced this value. */
  _children: readonly Value[];
  /**
   * The partial derivative of this value with respect to each child,
   * i.e. ∂(this)/∂(child[i]). Multiplied by `this.grad` during backprop
   * to give the child's contribution to the total gradient (chain rule).
   */
  _localGrads: readonly number[];
  /** The scalar carried by this node (the forward-pass result). */
  data: number;
  /** Accumulated ∂(output)/∂(this); populated by `backward()`, zeroed by the optimizer. */
  grad: number;

  constructor(
    data: number,
    children: readonly Value[] = [],
    localGrads: readonly number[] = [],
  ) {
    this._children = children;
    this._localGrads = localGrads;
    this.data = data;
    this.grad = 0;
  }

  /** Addition: ∂(a+b)/∂a = ∂(a+b)/∂b = 1. */
  add(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return new Value(this.data + o.data, [this, o], [1, 1]);
  }

  /**
   * Reverse-mode autodiff (backpropagation). Topologically sorts the
   * computation graph so every node is visited after its dependents, then
   * walks in reverse, accumulating gradients via the chain rule:
   *   child.grad += (∂this/∂child) * this.grad
   *
   * The topological sort uses an explicit stack instead of recursion:
   * graphs built by long add/sum chains (e.g. `Value.sum` over many terms)
   * are as deep as they are long and would overflow the call stack.
   */
  backward(): void {
    const topo: Value[] = [];
    const visited = new Set<Value>();
    // Iterative post-order DFS; each frame is [node, next-child-index].
    // Emits nodes in the same order as the recursive formulation.
    const stack: Array<[Value, number]> = [[this, 0]];
    visited.add(this);
    while (stack.length > 0) {
      const frame = stack[stack.length - 1];
      const [node, childIdx] = frame;
      if (childIdx < node._children.length) {
        frame[1] = childIdx + 1;
        const child = node._children[childIdx];
        if (!visited.has(child)) {
          visited.add(child);
          stack.push([child, 0]);
        }
      } else {
        // All children emitted — emit the node itself (post-order).
        stack.pop();
        topo.push(node);
      }
    }
    // Seed: d(this)/d(this) = 1, then flow gradients backwards.
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const v = topo[i];
      for (let j = 0; j < v._children.length; j++) {
        v._children[j].grad += v._localGrads[j] * v.grad;
      }
    }
  }

  /** Division a/b, expressed as a · b⁻¹ so `pow` supplies the gradient. */
  div(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return this.mul(o.pow(-1));
  }

  /** Dot product: Σ aᵢ·bᵢ. Assumes `a` and `b` have the same length. */
  static dot(a: Value[], b: Value[]): Value {
    return Value.sum(a.map((ai, i) => ai.mul(b[i])));
  }

  /** e^x — the derivative of exp is exp itself (computed once, reused). */
  exp(): Value {
    const e = Math.exp(this.data);
    return new Value(e, [this], [e]);
  }

  /** Natural log; ∂ln(x)/∂x = 1/x. NaN for negative input, -Infinity at 0. */
  log(): Value {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }

  /**
   * ∂(a·b)/∂a = b and ∂(a·b)/∂b = a — so the local gradients for
   * multiplication are simply the other operand's current data values.
   */
  mul(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return new Value(this.data * o.data, [this, o], [o.data, this.data]);
  }

  /** Unary negation, expressed as multiplication by −1. */
  neg(): Value {
    return this.mul(-1);
  }

  /** x^exp for a constant exponent; power rule ∂(xᵉ)/∂x = e·xᵉ⁻¹. */
  pow(exp: number): Value {
    return new Value(this.data ** exp, [this], [exp * this.data ** (exp - 1)]);
  }

  /** Rectified linear unit: max(0, x); gradient passes through only when x > 0. */
  relu(): Value {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }

  /** Subtraction a − b, expressed as a + (−b). */
  sub(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return this.add(o.neg());
  }

  /** Sum of a list of Values (left fold of `add`, starting from 0). */
  static sum(values: Value[]): Value {
    return values.reduce((acc, v) => acc.add(v), new Value(0));
  }
}
const flattenParams = (stateDict: StateDict): Value[] =>
Object.values(stateDict).flatMap(mat => mat.flatMap(row => row));
/**
 * Single-token forward pass through the GPT transformer. Takes the current
 * token and its position, runs it through all layers, and returns raw logits
 * (unnormalized scores) over the vocabulary — one per possible next token.
 * KV caches accumulate keys and values from previous positions so we don't
 * recompute them on each step during inference.
 *
 * @param tokenId index of the current token in the vocabulary
 * @param posId 0-based position of the token within the sequence
 * @param kvKeys per-layer key cache; this call appends one entry per layer
 * @param kvVals per-layer value cache; this call appends one entry per layer
 * @param stateDict all model weights, keyed as laid out by `initStateDict`
 * @param config model shape (assumes nHead * headDim == nEmbd — TODO confirm for other configs)
 * @returns vocabSize logits scoring each candidate next token
 */
const gpt = (
  tokenId: number,
  posId: number,
  kvKeys: Value[][][],
  kvVals: Value[][][],
  stateDict: StateDict,
  config: ModelConfig,
): Value[] => {
  const { headDim, nHead, nLayer } = config;
  // Token embedding + positional embedding: the model's only way to know
  // *what* token it's looking at and *where* in the sequence it sits.
  let x = stateDict['wte'][tokenId].map((t, i) =>
    t.add(stateDict['wpe'][posId][i]),
  );
  // Pre-normalization before the first attention layer.
  x = rmsnorm(x);
  for (let li = 0; li < nLayer; li++) {
    // --- Self-attention block ---
    const xResidual = x;
    x = rmsnorm(x);
    // Project the input into query, key, and value spaces — each a learned
    // linear transformation. Q asks "what am I looking for?", K says "what
    // do I offer?", V holds the actual content to aggregate.
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    // Append this token's key and value to the KV cache for use in later
    // positions (during inference we avoid recomputing past context).
    kvKeys[li].push(k);
    kvVals[li].push(v);
    const xAttn: Value[] = [];
    for (let h = 0; h < nHead; h++) {
      // Each head works on its own headDim-wide slice of Q/K/V.
      const hs = h * headDim;
      const qH = q.slice(hs, hs + headDim);
      const kH = kvKeys[li].map(ki => ki.slice(hs, hs + headDim));
      const vH = kvVals[li].map(vi => vi.slice(hs, hs + headDim));
      const scale = headDim ** 0.5;
      // Scaled dot-product attention: softmax(Q·K^T / √d_k) · V
      // Dividing by √d_k keeps dot products from growing large as headDim
      // increases, which would push softmax into low-gradient saturation.
      const attnLogits = kH.map(khi => Value.dot(qH, khi).div(scale));
      const attnWeights = softmax(attnLogits);
      // Weighted sum over all cached positions = this head's output slice.
      for (let j = 0; j < headDim; j++) {
        xAttn.push(Value.sum(attnWeights.map((w, t) => w.mul(vH[t][j]))));
      }
    }
    // Output projection mixes the concatenated head outputs back together.
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    // Residual connection: add back the block's input. This highway for
    // gradients lets deep networks train without vanishing gradients.
    x = x.map((xi, i) => xi.add(xResidual[i]));
    // --- MLP block: expand → ReLU → contract → residual ---
    const xResidual2 = x;
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map(xi => xi.relu());
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((xi, i) => xi.add(xResidual2[i]));
  }
  // Final projection: map from embedding space back to vocabulary logits.
  return linear(x, stateDict['lm_head']);
};
/**
 * Allocates every weight matrix in the model with Gaussian noise.
 * Insertion order into the returned record is significant — `flattenParams`
 * walks `Object.values()`, so this order defines the stable parameter
 * layout the optimizer's moment buffers rely on.
 */
const initStateDict = (config: ModelConfig, rng: Rng): StateDict => {
  const { blockSize, nEmbd, nLayer, vocabSize } = config;
  // Embedding tables and the output projection come first.
  const sd: StateDict = {
    wte: matrix(vocabSize, nEmbd, rng),
    wpe: matrix(blockSize, nEmbd, rng),
    lm_head: matrix(vocabSize, nEmbd, rng),
  };
  // Then, per layer: four attention projections plus the 4×-expanded MLP.
  for (let layer = 0; layer < nLayer; layer++) {
    sd[`layer${layer}.attn_wq`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wk`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wv`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wo`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.mlp_fc1`] = matrix(4 * nEmbd, nEmbd, rng);
    sd[`layer${layer}.mlp_fc2`] = matrix(nEmbd, 4 * nEmbd, rng);
  }
  return sd;
};
/**
 * Matrix-vector multiply: the workhorse of every layer. Row r of `w` holds
 * the weights for output r, so the result has one entry per row of `w`:
 * out[r] = w[r] · x.
 */
const linear = (x: Value[], w: Value[][]): Value[] => {
  const out: Value[] = [];
  for (const row of w) {
    out.push(Value.dot(row, x));
  }
  return out;
};
/**
 * Loads the makemore names dataset (one name per line), downloading it on
 * first use and caching it on disk. Returns the trimmed, non-empty lines.
 */
const loadDataset = async (): Promise<string[]> => {
  const url =
    'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt';
  const path = 'utils/ml/gptData.txt';
  // Fetch once; subsequent runs read the cached copy from disk.
  const cached = await Bun.file(path).exists();
  if (!cached) {
    const response = await fetch(url);
    await Bun.write(path, await response.text());
  }
  const raw = await Bun.file(path).text();
  const lines = raw.split('\n').map(line => line.trim());
  return lines.filter(line => line.length > 0);
};
/**
 * Builds an nout × nin weight matrix of Gaussian noise. The small default
 * std (0.08) breaks symmetry — every neuron starts different — without
 * letting activations or gradients explode on the first forward pass.
 */
const matrix = (nout: number, nin: number, rng: Rng, std = 0.08): Value[][] => {
  const rows: Value[][] = [];
  for (let r = 0; r < nout; r++) {
    const row: Value[] = [];
    for (let c = 0; c < nin; c++) {
      row.push(new Value(rng.gauss(0, std)));
    }
    rows.push(row);
  }
  return rows;
};
/**
 * Root Mean Square Layer Normalization: rescales `x` so its RMS is ~1
 * (the 1e-5 term guards the inverse square root near zero). Normalizing
 * activations keeps their magnitude stable as signals pass through many
 * layers.
 */
const rmsnorm = (x: Value[]): Value[] => {
  let sumSq = new Value(0);
  for (const xi of x) {
    sumSq = sumSq.add(xi.mul(xi));
  }
  const meanSq = sumSq.div(x.length);
  const invRms = meanSq.add(1e-5).pow(-0.5);
  return x.map(xi => xi.mul(invRms));
};
/**
 * Converts raw scores (logits) into a probability distribution. The max
 * logit is subtracted before exponentiating — a standard overflow guard
 * that leaves the result unchanged because softmax is shift-invariant.
 */
const softmax = (logits: Value[]): Value[] => {
  const peak = logits.reduce((best, l) => Math.max(best, l.data), -Infinity);
  const exps = logits.map(l => l.sub(peak).exp());
  const denom = Value.sum(exps);
  return exps.map(e => e.div(denom));
};
export { Rng, Value };
if (import.meta.main) {
  const docs = await loadDataset();
  const rng = new Rng(42);
  rng.shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  // Build the vocabulary: one index per unique character, plus a special BOS
  // (beginning-of-sequence) token used as both start and end marker.
  const uchars = [...new Set(docs.join(''))].sort();
  const BOS = uchars.length;
  const vocabSize = uchars.length + 1;
  console.log(`vocab size: ${vocabSize}`);
  // Model hyperparameters — deliberately tiny so CPU training stays fast.
  const blockSize = 16;
  const headDim = 4;
  const nEmbd = 16;
  const nHead = 4;
  const nLayer = 1;
  const config: ModelConfig = {
    blockSize,
    headDim,
    nEmbd,
    nHead,
    nLayer,
    vocabSize,
  };
  const stateDict = initStateDict(config, rng);
  const params = flattenParams(stateDict);
  console.log(`num params: ${params.length}`);
  // Adam hyperparameters and per-parameter moment buffers (indexed in
  // lock-step with `params`).
  const beta1 = 0.85;
  const beta2 = 0.99;
  const epsAdam = 1e-8;
  const learningRate = 0.01;
  const m = new Float64Array(params.length); // first moment (EMA of gradients)
  const numSteps = 1000;
  const v = new Float64Array(params.length); // second moment (EMA of squared gradients)
  for (let step = 0; step < numSteps; step++) {
    // Tokenize: convert each character to its vocabulary index, wrap the
    // word with BOS tokens so the model learns to start and stop.
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...[...doc].map(ch => uchars.indexOf(ch)), BOS];
    // Train on at most blockSize positions — the positional embedding table
    // has no rows beyond that.
    const n = Math.min(blockSize, tokens.length - 1);
    // Fresh KV caches per document: one list of keys/values per layer.
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    const losses: Value[] = [];
    for (let posId = 0; posId < n; posId++) {
      const tokenId = tokens[posId];
      const targetId = tokens[posId + 1];
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits);
      // Cross-entropy loss: −log P(correct_token). Near zero when the model
      // is confident and correct; large when it assigns low probability to
      // the right answer.
      losses.push(probs[targetId].log().neg());
    }
    // Mean loss over this document's positions.
    const loss = Value.sum(losses).div(n);
    // Backprop: compute ∂loss/∂param for every parameter in the network.
    loss.backward();
    // Adam optimizer: maintains per-parameter exponential moving averages of
    // the gradient (m, "momentum") and squared gradient (v, "velocity").
    // Bias-corrected estimates m̂ and v̂ correct for the cold start at zero.
    // Update rule: param -= lr · m̂ / (√v̂ + ε)
    // Linear LR decay from learningRate → 0 keeps training stable near convergence.
    const lrT = learningRate * (1 - step / numSteps);
    for (let i = 0; i < params.length; i++) {
      const g = params[i].grad;
      m[i] = beta1 * m[i] + (1 - beta1) * g;
      v[i] = beta2 * v[i] + (1 - beta2) * g * g;
      const mHat = m[i] / (1 - beta1 ** (step + 1));
      const vHat = v[i] / (1 - beta2 ** (step + 1));
      params[i].data -= lrT * (mHat / (vHat ** 0.5 + epsAdam));
      // Zero the gradient so the next step's backward() starts fresh.
      params[i].grad = 0;
    }
    // `\r` keeps the progress line in place instead of scrolling.
    process.stdout.write(
      `step ${String(step + 1).padStart(4)} / ${numSteps} | loss ${loss.data.toFixed(4)}\r`,
    );
  }
  // Temperature < 1 sharpens the probability distribution (more confident,
  // less random picks); > 1 flattens it (more creative, less coherent).
  const temperature = 0.5;
  console.log('\n--- inference (new, hallucinated names) ---');
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    // Fresh KV caches per sample, same shape as during training.
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    let tokenId = BOS;
    const sample: string[] = [];
    // Autoregressive generation: feed each predicted token back as the next
    // input, one step at a time, until the model emits BOS (its end marker)
    // or the maximum context length is reached.
    for (let posId = 0; posId < blockSize; posId++) {
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits.map(l => l.div(temperature)));
      tokenId = rng.choices(
        Array.from({ length: vocabSize }, (_, i) => i),
        probs.map(p => p.data),
      );
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join('')}`,
    );
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment