Created
February 18, 2026 03:12
-
-
Save snoblenet/7739055e32bffb81277b6a08d33a37ef to your computer and use it in GitHub Desktop.
Minimal GPT trainer and inference in pure TypeScript (no dependencies). Port of @karpathy's microgpt.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bun
/**
 * Minimal GPT trainer and inference in pure TypeScript (no dependencies).
 * TypeScript port of Andrej Karpathy's microgpt.py.
 * Original: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
 *
 * A GPT (Generative Pre-trained Transformer) is a neural network that learns
 * to predict the next token in a sequence. This script demonstrates the full
 * pipeline: autograd (automatic differentiation for computing gradients),
 * a transformer architecture, training with the Adam optimizer, and
 * autoregressive text generation — all in ~350 lines, zero dependencies.
 */
/** Hyperparameters that define the transformer's shape. */
type ModelConfig = {
  blockSize: number; // maximum context length (number of positions)
  headDim: number; // width of each attention head
  nEmbd: number; // embedding width (gpt slices heads as nHead × headDim)
  nHead: number; // attention heads per layer
  nLayer: number; // number of transformer layers
  vocabSize: number; // distinct tokens (unique chars + BOS)
};
/** All learnable parameters keyed by name; each entry is a weight matrix of scalars. */
type StateDict = Record<string, Value[][]>;
| /** Deterministic PRNG — fixes the random seed so training is reproducible. */ | |
| class Rng { | |
| _hasSpare: boolean; | |
| _spare: number; | |
| _state: number; | |
| constructor(seed: number) { | |
| this._hasSpare = false; | |
| this._spare = 0; | |
| this._state = seed >>> 0; | |
| } | |
| choices(population: number[], weights: number[]): number { | |
| const total = weights.reduce((a, b) => a + b, 0); | |
| let r = this.random() * total; | |
| for (let i = 0; i < population.length; i++) { | |
| r -= weights[i]; | |
| if (r <= 0) return population[i]; | |
| } | |
| return population[population.length - 1]; | |
| } | |
| /** | |
| * Box-Muller polar method: transforms two uniform random numbers into a | |
| * Gaussian sample. Used for weight initialization — starting weights from a | |
| * small Gaussian distribution (rather than all zeros) breaks symmetry so | |
| * each neuron can learn different features. | |
| */ | |
| gauss(mean: number, std: number): number { | |
| if (this._hasSpare) { | |
| this._hasSpare = false; | |
| return this._spare * std + mean; | |
| } | |
| let u = 0, | |
| v = 0, | |
| s = 0; | |
| do { | |
| u = this.random() * 2 - 1; | |
| v = this.random() * 2 - 1; | |
| s = u * u + v * v; | |
| } while (s >= 1 || s === 0); | |
| const mul = Math.sqrt((-2 * Math.log(s)) / s); | |
| this._spare = v * mul; | |
| this._hasSpare = true; | |
| return u * mul * std + mean; | |
| } | |
| random(): number { | |
| this._state = (this._state + 0x6d2b79f5) >>> 0; | |
| let t = Math.imul(this._state ^ (this._state >>> 15), this._state | 1); | |
| t ^= t + Math.imul(t ^ (t >>> 7), t | 61); | |
| return ((t ^ (t >>> 14)) >>> 0) / 4294967296; | |
| } | |
| shuffle<T>(arr: T[]): void { | |
| for (let i = arr.length - 1; i > 0; i--) { | |
| const j = Math.floor(this.random() * (i + 1)); | |
| [arr[i], arr[j]] = [arr[j], arr[i]]; | |
| } | |
| } | |
| } | |
| /** | |
| * Autograd engine — every scalar value in the network is wrapped in a Value | |
| * so that operations build a computation graph. Once you call `backward()` on | |
| * the loss, gradients flow back through the graph via the chain rule, telling | |
| * each parameter how to change to reduce the loss. | |
| */ | |
| class Value { | |
| /** The operands (parent nodes) that produced this value. */ | |
| _children: readonly Value[]; | |
| /** | |
| * The partial derivative of this value with respect to each child, | |
| * i.e. ∂(this)/∂(child[i]). Multiplied by `this.grad` during backprop | |
| * to give the child's contribution to the total gradient (chain rule). | |
| */ | |
| _localGrads: readonly number[]; | |
| data: number; | |
| grad: number; | |
| constructor( | |
| data: number, | |
| children: readonly Value[] = [], | |
| localGrads: readonly number[] = [], | |
| ) { | |
| this._children = children; | |
| this._localGrads = localGrads; | |
| this.data = data; | |
| this.grad = 0; | |
| } | |
| add(other: Value | number): Value { | |
| const o = other instanceof Value ? other : new Value(other); | |
| return new Value(this.data + o.data, [this, o], [1, 1]); | |
| } | |
| /** | |
| * Reverse-mode autodiff (backpropagation). Topologically sorts the | |
| * computation graph so every node is visited after its dependents, then | |
| * walks in reverse, accumulating gradients via the chain rule: | |
| * child.grad += (∂this/∂child) * this.grad | |
| */ | |
| backward(): void { | |
| const topo: Value[] = []; | |
| const visited = new Set<Value>(); | |
| const buildTopo = (v: Value): void => { | |
| if (!visited.has(v)) { | |
| visited.add(v); | |
| for (const child of v._children) { | |
| buildTopo(child); | |
| } | |
| topo.push(v); | |
| } | |
| }; | |
| buildTopo(this); | |
| this.grad = 1; | |
| for (let i = topo.length - 1; i >= 0; i--) { | |
| const v = topo[i]; | |
| for (let j = 0; j < v._children.length; j++) { | |
| v._children[j].grad += v._localGrads[j] * v.grad; | |
| } | |
| } | |
| } | |
| div(other: Value | number): Value { | |
| const o = other instanceof Value ? other : new Value(other); | |
| return this.mul(o.pow(-1)); | |
| } | |
| static dot(a: Value[], b: Value[]): Value { | |
| return Value.sum(a.map((ai, i) => ai.mul(b[i]))); | |
| } | |
| exp(): Value { | |
| return new Value(Math.exp(this.data), [this], [Math.exp(this.data)]); | |
| } | |
| log(): Value { | |
| return new Value(Math.log(this.data), [this], [1 / this.data]); | |
| } | |
| /** | |
| * ∂(a·b)/∂a = b and ∂(a·b)/∂b = a — so the local gradients for | |
| * multiplication are simply the other operand's current data values. | |
| */ | |
| mul(other: Value | number): Value { | |
| const o = other instanceof Value ? other : new Value(other); | |
| return new Value(this.data * o.data, [this, o], [o.data, this.data]); | |
| } | |
| neg(): Value { | |
| return this.mul(-1); | |
| } | |
| pow(exp: number): Value { | |
| return new Value(this.data ** exp, [this], [exp * this.data ** (exp - 1)]); | |
| } | |
| relu(): Value { | |
| return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]); | |
| } | |
| sub(other: Value | number): Value { | |
| const o = other instanceof Value ? other : new Value(other); | |
| return this.add(o.neg()); | |
| } | |
| static sum(values: Value[]): Value { | |
| return values.reduce((acc, v) => acc.add(v), new Value(0)); | |
| } | |
| } | |
| const flattenParams = (stateDict: StateDict): Value[] => | |
| Object.values(stateDict).flatMap(mat => mat.flatMap(row => row)); | |
/**
 * Single-token forward pass through the GPT transformer. Takes the current
 * token and its position, runs it through all layers, and returns raw logits
 * (unnormalized scores) over the vocabulary — one per possible next token.
 * KV caches accumulate keys and values from previous positions so we don't
 * recompute them on each step during inference.
 *
 * @param tokenId Vocabulary index of the current input token.
 * @param posId 0-based position of this token in the sequence.
 * @param kvKeys Per-layer key cache; this call appends one entry per layer (mutated).
 * @param kvVals Per-layer value cache; this call appends one entry per layer (mutated).
 * @param stateDict All model weights, keyed by name (see initStateDict).
 * @param config Model hyperparameters.
 * @returns Unnormalized logits over the vocabulary for the next token.
 */
const gpt = (
  tokenId: number,
  posId: number,
  kvKeys: Value[][][],
  kvVals: Value[][][],
  stateDict: StateDict,
  config: ModelConfig,
): Value[] => {
  const { headDim, nHead, nLayer } = config;
  // Token embedding + positional embedding: the model's only way to know
  // *what* token it's looking at and *where* in the sequence it sits.
  let x = stateDict['wte'][tokenId].map((t, i) =>
    t.add(stateDict['wpe'][posId][i]),
  );
  // Pre-normalization before the first attention layer.
  x = rmsnorm(x);
  for (let li = 0; li < nLayer; li++) {
    // --- Self-attention block ---
    const xResidual = x;
    x = rmsnorm(x);
    // Project the input into query, key, and value spaces — each a learned
    // linear transformation. Q asks "what am I looking for?", K says "what
    // do I offer?", V holds the actual content to aggregate.
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    // Append this token's key and value to the KV cache for use in later
    // positions (during inference we avoid recomputing past context).
    kvKeys[li].push(k);
    kvVals[li].push(v);
    const xAttn: Value[] = [];
    for (let h = 0; h < nHead; h++) {
      // Each head works on its own headDim-wide slice of q/k/v.
      const hs = h * headDim;
      const qH = q.slice(hs, hs + headDim);
      const kH = kvKeys[li].map(ki => ki.slice(hs, hs + headDim));
      const vH = kvVals[li].map(vi => vi.slice(hs, hs + headDim));
      const scale = headDim ** 0.5;
      // Scaled dot-product attention: softmax(Q·K^T / √d_k) · V
      // Dividing by √d_k keeps dot products from growing large as headDim
      // increases, which would push softmax into low-gradient saturation.
      const attnLogits = kH.map(khi => Value.dot(qH, khi).div(scale));
      const attnWeights = softmax(attnLogits);
      // Weighted sum of cached values: one output scalar per head dimension.
      for (let j = 0; j < headDim; j++) {
        xAttn.push(Value.sum(attnWeights.map((w, t) => w.mul(vH[t][j]))));
      }
    }
    // Output projection mixes the concatenated head outputs back together.
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    // Residual connection: add back the block's input. This highway for
    // gradients lets deep networks train without vanishing gradients.
    x = x.map((xi, i) => xi.add(xResidual[i]));
    // --- MLP block: expand → ReLU → contract → residual ---
    const xResidual2 = x;
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map(xi => xi.relu());
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((xi, i) => xi.add(xResidual2[i]));
  }
  // Final projection: map from embedding space back to vocabulary logits.
  return linear(x, stateDict['lm_head']);
};
| /** | |
| * Initializes all model parameters as random matrices. Insertion order into | |
| * `sd` matters — `flattenParams` uses `Object.values()` to produce a stable | |
| * parameter list for the optimizer. | |
| */ | |
| const initStateDict = (config: ModelConfig, rng: Rng): StateDict => { | |
| const { blockSize, nEmbd, nLayer, vocabSize } = config; | |
| const sd: StateDict = {}; | |
| sd['wte'] = matrix(vocabSize, nEmbd, rng); | |
| sd['wpe'] = matrix(blockSize, nEmbd, rng); | |
| sd['lm_head'] = matrix(vocabSize, nEmbd, rng); | |
| for (let i = 0; i < nLayer; i++) { | |
| sd[`layer${i}.attn_wq`] = matrix(nEmbd, nEmbd, rng); | |
| sd[`layer${i}.attn_wk`] = matrix(nEmbd, nEmbd, rng); | |
| sd[`layer${i}.attn_wv`] = matrix(nEmbd, nEmbd, rng); | |
| sd[`layer${i}.attn_wo`] = matrix(nEmbd, nEmbd, rng); | |
| sd[`layer${i}.mlp_fc1`] = matrix(4 * nEmbd, nEmbd, rng); | |
| sd[`layer${i}.mlp_fc2`] = matrix(nEmbd, 4 * nEmbd, rng); | |
| } | |
| return sd; | |
| }; | |
| /** | |
| * Matrix-vector multiply: the fundamental building block of neural networks. | |
| * Each row of `w` is a weight vector; dotting it with the input `x` produces | |
| * one output scalar, so the result has one entry per row of `w`. | |
| */ | |
| const linear = (x: Value[], w: Value[][]): Value[] => | |
| w.map(row => Value.dot(row, x)); | |
| const loadDataset = async (): Promise<string[]> => { | |
| const url = | |
| 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'; | |
| const path = 'utils/ml/gptData.txt'; | |
| if (!(await Bun.file(path).exists())) { | |
| const response = await fetch(url); | |
| const text = await response.text(); | |
| await Bun.write(path, text); | |
| } | |
| const text = await Bun.file(path).text(); | |
| return text | |
| .split('\n') | |
| .map(line => line.trim()) | |
| .filter(line => line.length > 0); | |
| }; | |
| /** | |
| * Initialize a weight matrix with Gaussian noise (std=0.08). The small std | |
| * breaks symmetry (each neuron starts differently) without causing the | |
| * activations or gradients to explode on the first forward pass. | |
| */ | |
| const matrix = (nout: number, nin: number, rng: Rng, std = 0.08): Value[][] => | |
| Array.from({ length: nout }, () => | |
| Array.from({ length: nin }, () => new Value(rng.gauss(0, std))), | |
| ); | |
| /** | |
| * Root Mean Square Layer Normalization. Scales each vector so its RMS is ~1. | |
| * Normalizing activations stabilizes training by preventing them from growing | |
| * or shrinking uncontrollably as signals pass through many layers. | |
| */ | |
| const rmsnorm = (x: Value[]): Value[] => { | |
| const ms = Value.sum(x.map(xi => xi.mul(xi))).div(x.length); | |
| const scale = ms.add(1e-5).pow(-0.5); | |
| return x.map(xi => xi.mul(scale)); | |
| }; | |
| /** | |
| * Converts raw scores (logits) into a probability distribution over tokens. | |
| * Subtracting the max before calling exp() is a numerical stability trick — | |
| * it prevents overflow without changing the output (softmax is shift-invariant). | |
| */ | |
| const softmax = (logits: Value[]): Value[] => { | |
| const maxVal = Math.max(...logits.map(v => v.data)); | |
| const exps = logits.map(v => v.sub(maxVal).exp()); | |
| const total = Value.sum(exps); | |
| return exps.map(e => e.div(total)); | |
| }; | |
export { Rng, Value };
// Script entry point: train the tiny GPT on names, then sample from it.
if (import.meta.main) {
  const docs = await loadDataset();
  const rng = new Rng(42);
  rng.shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  // Build the vocabulary: one index per unique character, plus a special BOS
  // (beginning-of-sequence) token used as both start and end marker.
  const uchars = [...new Set(docs.join(''))].sort();
  const BOS = uchars.length;
  const vocabSize = uchars.length + 1;
  console.log(`vocab size: ${vocabSize}`);
  // Model hyperparameters — deliberately tiny so training runs in seconds.
  const blockSize = 16;
  const headDim = 4;
  const nEmbd = 16;
  const nHead = 4;
  const nLayer = 1;
  const config: ModelConfig = {
    blockSize,
    headDim,
    nEmbd,
    nHead,
    nLayer,
    vocabSize,
  };
  const stateDict = initStateDict(config, rng);
  const params = flattenParams(stateDict);
  console.log(`num params: ${params.length}`);
  // Adam optimizer hyperparameters and state: m = first moment (mean of
  // gradients), v = second moment (mean of squared gradients), per parameter.
  const beta1 = 0.85;
  const beta2 = 0.99;
  const epsAdam = 1e-8;
  const learningRate = 0.01;
  const m = new Float64Array(params.length);
  const numSteps = 1000;
  const v = new Float64Array(params.length);
  for (let step = 0; step < numSteps; step++) {
    // Tokenize: convert each character to its vocabulary index, wrap the
    // word with BOS tokens so the model learns to start and stop.
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...[...doc].map(ch => uchars.indexOf(ch)), BOS];
    const n = Math.min(blockSize, tokens.length - 1);
    // Fresh per-document KV caches: one list of per-position keys/values per layer.
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    const losses: Value[] = [];
    for (let posId = 0; posId < n; posId++) {
      const tokenId = tokens[posId];
      const targetId = tokens[posId + 1];
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits);
      // Cross-entropy loss: −log P(correct_token). Near zero when the model
      // is confident and correct; large when it assigns low probability to
      // the right answer.
      losses.push(probs[targetId].log().neg());
    }
    // Mean loss over the sequence's positions.
    const loss = Value.sum(losses).div(n);
    // Backprop: compute ∂loss/∂param for every parameter in the network.
    loss.backward();
    // Adam optimizer: maintains per-parameter exponential moving averages of
    // the gradient (m, "momentum") and squared gradient (v, "velocity").
    // Bias-corrected estimates m̂ and v̂ correct for the cold start at zero.
    // Update rule: param -= lr · m̂ / (√v̂ + ε)
    // Linear LR decay from learningRate → 0 keeps training stable near convergence.
    const lrT = learningRate * (1 - step / numSteps);
    for (let i = 0; i < params.length; i++) {
      const g = params[i].grad;
      m[i] = beta1 * m[i] + (1 - beta1) * g;
      v[i] = beta2 * v[i] + (1 - beta2) * g * g;
      const mHat = m[i] / (1 - beta1 ** (step + 1));
      const vHat = v[i] / (1 - beta2 ** (step + 1));
      params[i].data -= lrT * (mHat / (vHat ** 0.5 + epsAdam));
      // Zero the gradient for the next step — backward() accumulates with +=.
      params[i].grad = 0;
    }
    process.stdout.write(
      `step ${String(step + 1).padStart(4)} / ${numSteps} | loss ${loss.data.toFixed(4)}\r`,
    );
  }
  // Temperature < 1 sharpens the probability distribution (more confident,
  // less random picks); > 1 flattens it (more creative, less coherent).
  const temperature = 0.5;
  console.log('\n--- inference (new, hallucinated names) ---');
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    let tokenId = BOS;
    const sample: string[] = [];
    // Autoregressive generation: feed each predicted token back as the next
    // input, one step at a time, until the model emits BOS (its end marker)
    // or the maximum context length is reached.
    for (let posId = 0; posId < blockSize; posId++) {
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits.map(l => l.div(temperature)));
      tokenId = rng.choices(
        Array.from({ length: vocabSize }, (_, i) => i),
        probs.map(p => p.data),
      );
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join('')}`,
    );
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment