Skip to content

Instantly share code, notes, and snippets.

@snoblenet
Created February 18, 2026 03:12
Show Gist options
  • Select an option

  • Save snoblenet/7739055e32bffb81277b6a08d33a37ef to your computer and use it in GitHub Desktop.

Select an option

Save snoblenet/7739055e32bffb81277b6a08d33a37ef to your computer and use it in GitHub Desktop.
Minimal GPT trainer and inference in pure TypeScript (no dependencies). Port of @karpathy's microgpt.py
#!/usr/bin/env bun
/**
* Minimal GPT trainer and inference in pure TypeScript (no dependencies).
* TypeScript port of Andrej Karpathy's microgpt.py.
* Original: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
*
* A GPT (Generative Pre-trained Transformer) is a neural network that learns
* to predict the next token in a sequence. This script demonstrates the full
* pipeline: autograd (automatic differentiation for computing gradients),
* a transformer architecture, training with the Adam optimizer, and
* autoregressive text generation — all in ~350 lines, zero dependencies.
*/
/** Hyperparameters that fix the transformer's shape. */
type ModelConfig = {
  /** Maximum sequence length — one positional embedding row per position. */
  blockSize: number;
  /** Width of each attention head's Q/K/V slice. */
  headDim: number;
  /** Embedding (residual stream) width; assumes nHead * headDim == nEmbd — holds for the config below. */
  nEmbd: number;
  /** Number of attention heads per layer. */
  nHead: number;
  /** Number of transformer layers. */
  nLayer: number;
  /** Number of distinct tokens (unique chars + 1 for the BOS marker). */
  vocabSize: number;
};
/** Parameter-name → weight-matrix map; insertion order defines the optimizer's flat parameter layout. */
type StateDict = Record<string, Value[][]>;
/** Deterministic PRNG — fixes the random seed so training is reproducible. */
class Rng {
  _hasSpare: boolean;
  _spare: number;
  _state: number;

  constructor(seed: number) {
    this._hasSpare = false;
    this._spare = 0;
    this._state = seed >>> 0;
  }

  /**
   * Weighted sampling: returns one element of `population`, where element i
   * is drawn with probability weights[i] / sum(weights). If floating-point
   * round-off leaves the running remainder positive after the loop, the last
   * element is returned as a fallback.
   */
  choices(population: number[], weights: number[]): number {
    let total = 0;
    for (const w of weights) total += w;
    let remaining = this.random() * total;
    for (let idx = 0; idx < population.length; idx++) {
      remaining -= weights[idx];
      if (remaining <= 0) return population[idx];
    }
    return population[population.length - 1];
  }

  /**
   * Box-Muller polar method: converts two uniform samples into one Gaussian
   * sample, caching the second ("spare") sample for the next call. Used for
   * weight initialization — starting weights from a small Gaussian (rather
   * than all zeros) breaks symmetry so each neuron can learn different
   * features.
   */
  gauss(mean: number, std: number): number {
    if (this._hasSpare) {
      this._hasSpare = false;
      return this._spare * std + mean;
    }
    let u: number;
    let v: number;
    let s: number;
    // Rejection-sample a point inside the unit circle (excluding the origin).
    do {
      u = this.random() * 2 - 1;
      v = this.random() * 2 - 1;
      s = u * u + v * v;
    } while (s >= 1 || s === 0);
    const factor = Math.sqrt((-2 * Math.log(s)) / s);
    this._spare = v * factor;
    this._hasSpare = true;
    return u * factor * std + mean;
  }

  /** mulberry32 bit mixer; advances the state and returns a uniform float in [0, 1). */
  random(): number {
    this._state = (this._state + 0x6d2b79f5) >>> 0;
    const s = this._state;
    let mixed = Math.imul(s ^ (s >>> 15), s | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  }

  /** In-place Fisher–Yates shuffle driven by this PRNG. */
  shuffle<T>(arr: T[]): void {
    for (let last = arr.length - 1; last > 0; last--) {
      const pick = Math.floor(this.random() * (last + 1));
      const tmp = arr[last];
      arr[last] = arr[pick];
      arr[pick] = tmp;
    }
  }
}
/**
 * Autograd engine — every scalar value in the network is wrapped in a Value
 * so that operations build a computation graph. Once you call `backward()` on
 * the loss, gradients flow back through the graph via the chain rule, telling
 * each parameter how to change to reduce the loss.
 */
class Value {
  /** The operands (parent nodes) that produced this value. */
  _children: readonly Value[];
  /**
   * The partial derivative of this value with respect to each child,
   * i.e. ∂(this)/∂(child[i]). Multiplied by `this.grad` during backprop
   * to give the child's contribution to the total gradient (chain rule).
   */
  _localGrads: readonly number[];
  /** The scalar carried by this node (the forward-pass result). */
  data: number;
  /** Accumulated ∂(output)/∂(this); populated by `backward()`, zeroed by the optimizer. */
  grad: number;

  constructor(
    data: number,
    children: readonly Value[] = [],
    localGrads: readonly number[] = [],
  ) {
    this._children = children;
    this._localGrads = localGrads;
    this.data = data;
    this.grad = 0;
  }

  /** Addition: ∂(a+b)/∂a = ∂(a+b)/∂b = 1. */
  add(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return new Value(this.data + o.data, [this, o], [1, 1]);
  }

  /**
   * Reverse-mode autodiff (backpropagation). Topologically sorts the
   * computation graph so every node is visited after its dependents, then
   * walks in reverse, accumulating gradients via the chain rule:
   *   child.grad += (∂this/∂child) * this.grad
   *
   * The topological sort uses an explicit stack instead of recursion:
   * graphs built by long add/sum chains (e.g. `Value.sum` over many terms)
   * are as deep as they are long and would overflow the call stack.
   */
  backward(): void {
    const topo: Value[] = [];
    const visited = new Set<Value>();
    // Iterative post-order DFS; each frame is [node, next-child-index].
    // Emits nodes in the same order as the recursive formulation.
    const stack: Array<[Value, number]> = [[this, 0]];
    visited.add(this);
    while (stack.length > 0) {
      const frame = stack[stack.length - 1];
      const [node, childIdx] = frame;
      if (childIdx < node._children.length) {
        frame[1] = childIdx + 1;
        const child = node._children[childIdx];
        if (!visited.has(child)) {
          visited.add(child);
          stack.push([child, 0]);
        }
      } else {
        // All children emitted — emit the node itself (post-order).
        stack.pop();
        topo.push(node);
      }
    }
    // Seed: d(this)/d(this) = 1, then flow gradients backwards.
    this.grad = 1;
    for (let i = topo.length - 1; i >= 0; i--) {
      const v = topo[i];
      for (let j = 0; j < v._children.length; j++) {
        v._children[j].grad += v._localGrads[j] * v.grad;
      }
    }
  }

  /** Division a/b, expressed as a · b⁻¹ so `pow` supplies the gradient. */
  div(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return this.mul(o.pow(-1));
  }

  /** Dot product: Σ aᵢ·bᵢ. Assumes `a` and `b` have the same length. */
  static dot(a: Value[], b: Value[]): Value {
    return Value.sum(a.map((ai, i) => ai.mul(b[i])));
  }

  /** e^x — the derivative of exp is exp itself (computed once, reused). */
  exp(): Value {
    const e = Math.exp(this.data);
    return new Value(e, [this], [e]);
  }

  /** Natural log; ∂ln(x)/∂x = 1/x. NaN for negative input, -Infinity at 0. */
  log(): Value {
    return new Value(Math.log(this.data), [this], [1 / this.data]);
  }

  /**
   * ∂(a·b)/∂a = b and ∂(a·b)/∂b = a — so the local gradients for
   * multiplication are simply the other operand's current data values.
   */
  mul(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return new Value(this.data * o.data, [this, o], [o.data, this.data]);
  }

  /** Unary negation, expressed as multiplication by −1. */
  neg(): Value {
    return this.mul(-1);
  }

  /** x^exp for a constant exponent; power rule ∂(xᵉ)/∂x = e·xᵉ⁻¹. */
  pow(exp: number): Value {
    return new Value(this.data ** exp, [this], [exp * this.data ** (exp - 1)]);
  }

  /** Rectified linear unit: max(0, x); gradient passes through only when x > 0. */
  relu(): Value {
    return new Value(Math.max(0, this.data), [this], [this.data > 0 ? 1 : 0]);
  }

  /** Subtraction a − b, expressed as a + (−b). */
  sub(other: Value | number): Value {
    const o = other instanceof Value ? other : new Value(other);
    return this.add(o.neg());
  }

  /** Sum of a list of Values (left fold of `add`, starting from 0). */
  static sum(values: Value[]): Value {
    return values.reduce((acc, v) => acc.add(v), new Value(0));
  }
}
const flattenParams = (stateDict: StateDict): Value[] =>
Object.values(stateDict).flatMap(mat => mat.flatMap(row => row));
/**
 * Single-token forward pass through the GPT transformer. Takes the current
 * token and its position, runs it through all layers, and returns raw logits
 * (unnormalized scores) over the vocabulary — one per possible next token.
 * KV caches accumulate keys and values from previous positions so we don't
 * recompute them on each step during inference.
 *
 * @param tokenId index of the current token in the vocabulary
 * @param posId 0-based position of the token within the sequence
 * @param kvKeys per-layer key cache; this call appends one entry per layer
 * @param kvVals per-layer value cache; this call appends one entry per layer
 * @param stateDict all model weights, keyed as laid out by `initStateDict`
 * @param config model shape (assumes nHead * headDim == nEmbd — TODO confirm for other configs)
 * @returns vocabSize logits scoring each candidate next token
 */
const gpt = (
  tokenId: number,
  posId: number,
  kvKeys: Value[][][],
  kvVals: Value[][][],
  stateDict: StateDict,
  config: ModelConfig,
): Value[] => {
  const { headDim, nHead, nLayer } = config;
  // Token embedding + positional embedding: the model's only way to know
  // *what* token it's looking at and *where* in the sequence it sits.
  let x = stateDict['wte'][tokenId].map((t, i) =>
    t.add(stateDict['wpe'][posId][i]),
  );
  // Pre-normalization before the first attention layer.
  x = rmsnorm(x);
  for (let li = 0; li < nLayer; li++) {
    // --- Self-attention block ---
    const xResidual = x;
    x = rmsnorm(x);
    // Project the input into query, key, and value spaces — each a learned
    // linear transformation. Q asks "what am I looking for?", K says "what
    // do I offer?", V holds the actual content to aggregate.
    const q = linear(x, stateDict[`layer${li}.attn_wq`]);
    const k = linear(x, stateDict[`layer${li}.attn_wk`]);
    const v = linear(x, stateDict[`layer${li}.attn_wv`]);
    // Append this token's key and value to the KV cache for use in later
    // positions (during inference we avoid recomputing past context).
    kvKeys[li].push(k);
    kvVals[li].push(v);
    const xAttn: Value[] = [];
    for (let h = 0; h < nHead; h++) {
      // Each head works on its own headDim-wide slice of Q/K/V.
      const hs = h * headDim;
      const qH = q.slice(hs, hs + headDim);
      const kH = kvKeys[li].map(ki => ki.slice(hs, hs + headDim));
      const vH = kvVals[li].map(vi => vi.slice(hs, hs + headDim));
      const scale = headDim ** 0.5;
      // Scaled dot-product attention: softmax(Q·K^T / √d_k) · V
      // Dividing by √d_k keeps dot products from growing large as headDim
      // increases, which would push softmax into low-gradient saturation.
      const attnLogits = kH.map(khi => Value.dot(qH, khi).div(scale));
      const attnWeights = softmax(attnLogits);
      // Weighted sum over all cached positions = this head's output slice.
      for (let j = 0; j < headDim; j++) {
        xAttn.push(Value.sum(attnWeights.map((w, t) => w.mul(vH[t][j]))));
      }
    }
    // Output projection mixes the concatenated head outputs back together.
    x = linear(xAttn, stateDict[`layer${li}.attn_wo`]);
    // Residual connection: add back the block's input. This highway for
    // gradients lets deep networks train without vanishing gradients.
    x = x.map((xi, i) => xi.add(xResidual[i]));
    // --- MLP block: expand → ReLU → contract → residual ---
    const xResidual2 = x;
    x = rmsnorm(x);
    x = linear(x, stateDict[`layer${li}.mlp_fc1`]);
    x = x.map(xi => xi.relu());
    x = linear(x, stateDict[`layer${li}.mlp_fc2`]);
    x = x.map((xi, i) => xi.add(xResidual2[i]));
  }
  // Final projection: map from embedding space back to vocabulary logits.
  return linear(x, stateDict['lm_head']);
};
/**
 * Allocates every weight matrix in the model with Gaussian noise.
 * Insertion order into the returned record is significant — `flattenParams`
 * walks `Object.values()`, so this order defines the stable parameter
 * layout the optimizer's moment buffers rely on.
 */
const initStateDict = (config: ModelConfig, rng: Rng): StateDict => {
  const { blockSize, nEmbd, nLayer, vocabSize } = config;
  // Embedding tables and the output projection come first.
  const sd: StateDict = {
    wte: matrix(vocabSize, nEmbd, rng),
    wpe: matrix(blockSize, nEmbd, rng),
    lm_head: matrix(vocabSize, nEmbd, rng),
  };
  // Then, per layer: four attention projections plus the 4×-expanded MLP.
  for (let layer = 0; layer < nLayer; layer++) {
    sd[`layer${layer}.attn_wq`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wk`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wv`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.attn_wo`] = matrix(nEmbd, nEmbd, rng);
    sd[`layer${layer}.mlp_fc1`] = matrix(4 * nEmbd, nEmbd, rng);
    sd[`layer${layer}.mlp_fc2`] = matrix(nEmbd, 4 * nEmbd, rng);
  }
  return sd;
};
/**
 * Matrix-vector multiply: the workhorse of every layer. Row r of `w` holds
 * the weights for output r, so the result has one entry per row of `w`:
 * out[r] = w[r] · x.
 */
const linear = (x: Value[], w: Value[][]): Value[] => {
  const out: Value[] = [];
  for (const row of w) {
    out.push(Value.dot(row, x));
  }
  return out;
};
/**
 * Loads the makemore names dataset (one name per line), downloading it on
 * first use and caching it on disk. Returns the trimmed, non-empty lines.
 */
const loadDataset = async (): Promise<string[]> => {
  const url =
    'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt';
  const path = 'utils/ml/gptData.txt';
  // Fetch once; subsequent runs read the cached copy from disk.
  const cached = await Bun.file(path).exists();
  if (!cached) {
    const response = await fetch(url);
    await Bun.write(path, await response.text());
  }
  const raw = await Bun.file(path).text();
  const lines = raw.split('\n').map(line => line.trim());
  return lines.filter(line => line.length > 0);
};
/**
 * Builds an nout × nin weight matrix of Gaussian noise. The small default
 * std (0.08) breaks symmetry — every neuron starts different — without
 * letting activations or gradients explode on the first forward pass.
 */
const matrix = (nout: number, nin: number, rng: Rng, std = 0.08): Value[][] => {
  const rows: Value[][] = [];
  for (let r = 0; r < nout; r++) {
    const row: Value[] = [];
    for (let c = 0; c < nin; c++) {
      row.push(new Value(rng.gauss(0, std)));
    }
    rows.push(row);
  }
  return rows;
};
/**
 * Root Mean Square Layer Normalization: rescales `x` so its RMS is ~1
 * (the 1e-5 term guards the inverse square root near zero). Normalizing
 * activations keeps their magnitude stable as signals pass through many
 * layers.
 */
const rmsnorm = (x: Value[]): Value[] => {
  let sumSq = new Value(0);
  for (const xi of x) {
    sumSq = sumSq.add(xi.mul(xi));
  }
  const meanSq = sumSq.div(x.length);
  const invRms = meanSq.add(1e-5).pow(-0.5);
  return x.map(xi => xi.mul(invRms));
};
/**
 * Converts raw scores (logits) into a probability distribution. The max
 * logit is subtracted before exponentiating — a standard overflow guard
 * that leaves the result unchanged because softmax is shift-invariant.
 */
const softmax = (logits: Value[]): Value[] => {
  const peak = logits.reduce((best, l) => Math.max(best, l.data), -Infinity);
  const exps = logits.map(l => l.sub(peak).exp());
  const denom = Value.sum(exps);
  return exps.map(e => e.div(denom));
};
export { Rng, Value };
if (import.meta.main) {
  const docs = await loadDataset();
  const rng = new Rng(42);
  rng.shuffle(docs);
  console.log(`num docs: ${docs.length}`);
  // Build the vocabulary: one index per unique character, plus a special BOS
  // (beginning-of-sequence) token used as both start and end marker.
  const uchars = [...new Set(docs.join(''))].sort();
  const BOS = uchars.length;
  const vocabSize = uchars.length + 1;
  console.log(`vocab size: ${vocabSize}`);
  // Model hyperparameters — deliberately tiny so CPU training stays fast.
  const blockSize = 16;
  const headDim = 4;
  const nEmbd = 16;
  const nHead = 4;
  const nLayer = 1;
  const config: ModelConfig = {
    blockSize,
    headDim,
    nEmbd,
    nHead,
    nLayer,
    vocabSize,
  };
  const stateDict = initStateDict(config, rng);
  const params = flattenParams(stateDict);
  console.log(`num params: ${params.length}`);
  // Adam hyperparameters and per-parameter moment buffers (indexed in
  // lock-step with `params`).
  const beta1 = 0.85;
  const beta2 = 0.99;
  const epsAdam = 1e-8;
  const learningRate = 0.01;
  const m = new Float64Array(params.length); // first moment (EMA of gradients)
  const numSteps = 1000;
  const v = new Float64Array(params.length); // second moment (EMA of squared gradients)
  for (let step = 0; step < numSteps; step++) {
    // Tokenize: convert each character to its vocabulary index, wrap the
    // word with BOS tokens so the model learns to start and stop.
    const doc = docs[step % docs.length];
    const tokens = [BOS, ...[...doc].map(ch => uchars.indexOf(ch)), BOS];
    // Train on at most blockSize positions — the positional embedding table
    // has no rows beyond that.
    const n = Math.min(blockSize, tokens.length - 1);
    // Fresh KV caches per document: one list of keys/values per layer.
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    const losses: Value[] = [];
    for (let posId = 0; posId < n; posId++) {
      const tokenId = tokens[posId];
      const targetId = tokens[posId + 1];
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits);
      // Cross-entropy loss: −log P(correct_token). Near zero when the model
      // is confident and correct; large when it assigns low probability to
      // the right answer.
      losses.push(probs[targetId].log().neg());
    }
    // Mean loss over this document's positions.
    const loss = Value.sum(losses).div(n);
    // Backprop: compute ∂loss/∂param for every parameter in the network.
    loss.backward();
    // Adam optimizer: maintains per-parameter exponential moving averages of
    // the gradient (m, "momentum") and squared gradient (v, "velocity").
    // Bias-corrected estimates m̂ and v̂ correct for the cold start at zero.
    // Update rule: param -= lr · m̂ / (√v̂ + ε)
    // Linear LR decay from learningRate → 0 keeps training stable near convergence.
    const lrT = learningRate * (1 - step / numSteps);
    for (let i = 0; i < params.length; i++) {
      const g = params[i].grad;
      m[i] = beta1 * m[i] + (1 - beta1) * g;
      v[i] = beta2 * v[i] + (1 - beta2) * g * g;
      const mHat = m[i] / (1 - beta1 ** (step + 1));
      const vHat = v[i] / (1 - beta2 ** (step + 1));
      params[i].data -= lrT * (mHat / (vHat ** 0.5 + epsAdam));
      // Zero the gradient so the next step's backward() starts fresh.
      params[i].grad = 0;
    }
    // `\r` keeps the progress line in place instead of scrolling.
    process.stdout.write(
      `step ${String(step + 1).padStart(4)} / ${numSteps} | loss ${loss.data.toFixed(4)}\r`,
    );
  }
  // Temperature < 1 sharpens the probability distribution (more confident,
  // less random picks); > 1 flattens it (more creative, less coherent).
  const temperature = 0.5;
  console.log('\n--- inference (new, hallucinated names) ---');
  for (let sampleIdx = 0; sampleIdx < 20; sampleIdx++) {
    // Fresh KV caches per sample, same shape as during training.
    const kvKeys: Value[][][] = Array.from({ length: nLayer }, () => []);
    const kvVals: Value[][][] = Array.from({ length: nLayer }, () => []);
    let tokenId = BOS;
    const sample: string[] = [];
    // Autoregressive generation: feed each predicted token back as the next
    // input, one step at a time, until the model emits BOS (its end marker)
    // or the maximum context length is reached.
    for (let posId = 0; posId < blockSize; posId++) {
      const logits = gpt(tokenId, posId, kvKeys, kvVals, stateDict, config);
      const probs = softmax(logits.map(l => l.div(temperature)));
      tokenId = rng.choices(
        Array.from({ length: vocabSize }, (_, i) => i),
        probs.map(p => p.data),
      );
      if (tokenId === BOS) break;
      sample.push(uchars[tokenId]);
    }
    console.log(
      `sample ${String(sampleIdx + 1).padStart(2)}: ${sample.join('')}`,
    );
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment