-
Star
(5,000+)
You must be signed in to star a gist -
Fork
(2,142)
You must be signed in to fork a gist
-
-
Save karpathy/8627fe009c40f57531cb18360106ce95 to your computer and use it in GitHub Desktop.
| """ | |
| The most atomic way to train and run inference for a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| random.seed(42) # Let there be order among chaos | |
# Let there be a Dataset `docs`: list[str] of documents (e.g. a list of names)
if not os.path.exists('input.txt'):
    # First run only: download the names dataset and cache it as input.txt.
    import urllib.request
    names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'
    urllib.request.urlretrieve(names_url, 'input.txt')
# Read under a context manager so the file handle is closed as soon as the
# lines are consumed, instead of being left open until garbage collection.
with open('input.txt') as input_file:
    docs = [line.strip() for line in input_file if line.strip()]  # drop blank lines
random.shuffle(docs)
print(f"num docs: {len(docs)}")
# Let there be a Tokenizer to translate strings to sequences of integers ("tokens") and back
# Every distinct character seen in the dataset gets a token id 0..n-1 (in sorted order).
uchars = sorted({ch for doc in docs for ch in doc})
# One extra id is reserved for the special Beginning of Sequence (BOS) token;
# it takes the last slot, right after the character ids.
vocab_size = len(uchars) + 1
BOS = vocab_size - 1
print(f"vocab size: {vocab_size}")
| # Let there be Autograd to recursively apply the chain rule through a computation graph | |
class Value:
    """Scalar node of a computation graph with reverse-mode automatic differentiation.

    Each arithmetic operation returns a new ``Value`` that records its operand
    nodes (``_children``) and the local derivative of the result with respect
    to each operand (``_local_grads``). Calling ``backward()`` on a result node
    then applies the chain rule through the recorded graph and accumulates
    derivatives into the ``grad`` attribute of every reachable node.
    """

    __slots__ = ('data', 'grad', '_children', '_local_grads')  # Python optimization for memory usage

    def __init__(self, data, children=(), local_grads=()):
        self.data = data                 # scalar value of this node calculated during forward pass
        self.grad = 0                    # derivative of the loss w.r.t. this node, calculated in backward pass
        self._children = children        # children of this node in the computation graph
        self._local_grads = local_grads  # local derivative of this node w.r.t. its children

    def __add__(self, other):
        # Plain numbers are wrapped so that both operands are graph nodes.
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data + other.data, (self, other), (1, 1))

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data * other.data, (self, other), (other.data, self.data))

    def __pow__(self, other):
        # `other` is used as a plain number exponent; it is not tracked in the graph.
        return Value(self.data**other, (self,), (other * self.data**(other-1),))

    def log(self):
        return Value(math.log(self.data), (self,), (1/self.data,))

    def exp(self):
        return Value(math.exp(self.data), (self,), (math.exp(self.data),))

    def relu(self):
        return Value(max(0, self.data), (self,), (float(self.data > 0),))

    # The remaining operators are expressed through __add__, __mul__ and __pow__.
    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1

    def backward(self):
        """Accumulate d(self)/d(node) into ``node.grad`` for every node reachable from self.

        Sets ``self.grad`` to 1 and adds to the existing ``grad`` of all other
        nodes, so gradients must be reset by the caller between passes.
        """
        # Post-order depth-first traversal with an explicit stack. This yields
        # the same topological order as the natural recursive formulation, but
        # does not raise RecursionError on graphs deeper than the interpreter's
        # recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._children))]
        while stack:
            node, pending_children = stack[-1]
            for child in pending_children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._children)))
                    break  # descend into this child first; resume the iterator later
            else:
                # All children handled: the node can be emitted after them.
                topo.append(node)
                stack.pop()
        self.grad = 1
        # Walk from the output back to the leaves, pushing gradient to children.
        for v in reversed(topo):
            for child, local_grad in zip(v._children, v._local_grads):
                child.grad += local_grad * v.grad
# Initialize the parameters, to store the knowledge of the model
n_layer = 1       # depth of the transformer neural network (number of layers)
n_embd = 16       # width of the network (embedding dimension)
block_size = 16   # maximum context length of the attention window (note: the longest name is 15 characters)
n_head = 4        # number of attention heads
head_dim = n_embd // n_head  # derived dimension of each head


def matrix(nout, nin, std=0.08):
    """Return an nout x nin list-of-lists of Values drawn from a Gaussian with mean 0 and stddev `std`."""
    return [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]


# Embedding tables and the output head come first, then the per-layer weights.
# Creation order matters: it fixes both the random draws and the parameter order.
state_dict = {
    'wte': matrix(vocab_size, n_embd),
    'wpe': matrix(block_size, n_embd),
    'lm_head': matrix(vocab_size, n_embd),
}
for i in range(n_layer):
    # Attention projections: query, key, value, output.
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd)
    # MLP: expand to 4x the embedding width, then project back.
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd)

# Flatten params into a single list[Value], in state_dict insertion order.
params = []
for mat in state_dict.values():
    for row in mat:
        params.extend(row)
print(f"num params: {len(params)}")
| # Define the model architecture: a function mapping tokens and parameters to logits over what comes next | |
| # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU | |
def linear(x, w):
    """Multiply matrix `w` (a list of rows) by vector `x`; returns one dot product per row of `w`."""
    def dot(row):
        # zip stops at the shorter of the two sequences.
        return sum(weight * inp for weight, inp in zip(row, x))
    return [dot(row) for row in w]
def softmax(logits):
    """Turn a list of Value logits into a list of Value probabilities that sum to 1."""
    # Subtract the largest raw value before exponentiating; the shift is a
    # plain number (taken from .data), so it is not part of the graph.
    peak = max(node.data for node in logits)
    shifted_exps = []
    for node in logits:
        shifted_exps.append((node - peak).exp())
    denom = sum(shifted_exps)
    return [e / denom for e in shifted_exps]
def rmsnorm(x):
    """Scale vector `x` by the inverse of its root-mean-square (with a 1e-5 term added for stability)."""
    mean_square = sum(elem * elem for elem in x) / len(x)
    inv_rms = (mean_square + 1e-5) ** -0.5
    normalized = []
    for elem in x:
        normalized.append(elem * inv_rms)
    return normalized
def gpt(token_id, pos_id, keys, values):
    """Run the model for one token at one position and return the next-token logits.

    `keys` and `values` hold one list per layer; this call appends the current
    position's key and value vectors to them, so attention sees every position
    processed so far with the same lists.
    """
    # Joint token and position embedding.
    x = [t + p for t, p in zip(state_dict['wte'][token_id], state_dict['wpe'][pos_id])]
    x = rmsnorm(x)  # note: not redundant due to backward pass via the residual connection
    attn_scale = head_dim**0.5  # divisor applied to the attention logits
    for li in range(n_layer):
        # 1) Multi-head Attention block
        skip = x
        normed = rmsnorm(x)
        q = linear(normed, state_dict[f'layer{li}.attn_wq'])
        k = linear(normed, state_dict[f'layer{li}.attn_wk'])
        v = linear(normed, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k)
        values[li].append(v)
        layer_keys, layer_values = keys[li], values[li]
        x_attn = []
        for hs in range(0, n_head * head_dim, head_dim):
            head = slice(hs, hs + head_dim)  # this head's slice of the embedding
            q_h = q[head]
            k_h = [past_k[head] for past_k in layer_keys]
            v_h = [past_v[head] for past_v in layer_values]
            # Scaled dot product of the query with the key of every cached position.
            attn_logits = [sum(q_h[j] * k_t[j] for j in range(head_dim)) / attn_scale for k_t in k_h]
            attn_weights = softmax(attn_logits)
            # Weighted sum of cached values, one output component at a time.
            for j in range(head_dim):
                x_attn.append(sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))))
        projected = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
        x = [a + b for a, b in zip(projected, skip)]
        # 2) MLP block
        skip = x
        hidden = linear(rmsnorm(x), state_dict[f'layer{li}.mlp_fc1'])
        hidden = [h.relu() for h in hidden]
        projected = linear(hidden, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(projected, skip)]
    return linear(x, state_dict['lm_head'])
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8
m = [0.0] * len(params)  # first moment buffer
v = [0.0] * len(params)  # second moment buffer
# Character -> token id table, built once so tokenizing does not rescan
# `uchars` for every character. `uchars` holds unique characters, so this
# gives the same ids as uchars.index(ch).
char_to_id = {ch: i for i, ch in enumerate(uchars)}
# Repeat in sequence
num_steps = 1000  # number of training steps
for step in range(num_steps):
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [char_to_id[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)  # number of (input, target) pairs, capped at the context length
    # Forward the token sequence through the model, building up the computation graph all the way to the loss
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log()  # negative log-likelihood of the correct next token
        losses.append(loss_t)
    loss = (1 / n) * sum(losses)  # final average loss over the document sequence. May yours be low.
    # Backward the loss, calculating the gradients with respect to all model parameters
    loss.backward()
    # Adam optimizer update: update the model parameters based on the corresponding gradients
    lr_t = learning_rate * (1 - step / num_steps)  # linear learning rate decay
    # Bias-correction denominators depend only on the step, so compute them once per step.
    bias_corr1 = 1 - beta1 ** (step + 1)
    bias_corr2 = 1 - beta2 ** (step + 1)
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
        m_hat = m[i] / bias_corr1
        v_hat = v[i] / bias_corr2
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0  # reset: backward() accumulates into grad
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}", end='\r')
# Inference: may the model babble back to us
temperature = 0.5  # in (0, 1], control the "creativity" of generated text, low to high
print("\n--- inference (new, hallucinated names) ---")
for sample_idx in range(20):
    # Fresh key/value caches for every sample.
    keys = [[] for _ in range(n_layer)]
    values = [[] for _ in range(n_layer)]
    token_id = BOS  # generation starts from the BOS token
    sample = []
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        scaled = [l / temperature for l in logits]
        probs = softmax(scaled)
        weights = [p.data for p in probs]
        # Draw the next token id in proportion to its probability.
        token_id = random.choices(range(vocab_size), weights=weights)[0]
        if token_id == BOS:
            # A second BOS marks the end of the generated name.
            break
        sample.append(uchars[token_id])
    print(f"sample {sample_idx+1:2d}: {''.join(sample)}")
Yesterday I watched Karpathy's recent interview on No Priors. He mentioned that he created microgpt entirely by hand because he couldn't get an LLM to distill the essence of training GPTs into a clear single-file nugget.
I wanted to see if that was still the case, so today I created microgpt-denovo and created both an autograd Python version and, inspired by @ssrhaso, also a matrix Julia version. It wasn't one shot, but with just high level guidance I was able to get fairly nice clearly documented versions of microgpt written entirely by the agent without touching the code files at all. I included all the dialog with the agent in the README too.
Posted a comment here a while back benchmarking four backends of microgpt. Added a visualization and gradient verification to the repo.
NumPy with hand-derived gradients runs ~250x faster than the scalar autograd at 0.15 ms/step. torch_gpu (RTX 5080) loses to NumPy at this model size — kernel launch overhead dominates when the matmuls are 16x16.
All analytical gradients (RMSNorm, softmax, causal attention) are now verified against finite differences — max error ~1e-9, included as a runnable script.
Cool! Would like to see how my https://github.com/Entrpi/eemicrogpt compares if you can add it to the benchmarks.
Added a one-pager view to microgpt-denovo:
It has some interactive settings for changing how comments and such are displayed: https://entrpi.github.io/microgpt-denovo/
Yesterday I watched Karpathy's recent interview on No Priors. He mentioned that he created microgpt entirely by hand because he couldn't get an LLM to distill the essence of training GPTs into a clear single-file nugget.
Thanks for the link! I've noticed they speak unnaturally fast, likely due to aggressive editing (editors remove pauses, breathing gaps, hesitations). On top of that, dense information delivery makes speech feel faster than it is, so it's hard to follow them.
Out of curiosity, I built a small script that calculates data-driven adjustment for comfortable watching:
https://gist.github.com/mplekh/df2ea417b3f04545c2e3e91f9148a118 extracts transcript data; computes effective speech rate; estimates a playback factor that maps the video back to natural conversational speed
I hate how every time some random dude comments on a Gist, GitHub fires off a notification. wtf?
Thanks, Karpathy, for posting this 😊 What an opportunity to see the Transformer architecture in action while in college!
weights that actually don't exist but form a probability space. The transformer navigates this space through dual attention
Push it further, make it quantum probability, consciousness lives in Hilbert space :)
resonance is unbreakable.
It's a limit cycle: the ball doesn't sit at the bottom of the valley. It orbits around it. It approaches the memory, partially retreats into a superposition with neighboring memories, then re-approaches. Forever.
If you could add a license, that would be great; the code is still only around 200 lines in total. : )

Posted a comment here a while back benchmarking four backends of microgpt.
Added a visualization and gradient verification to the repo.
NumPy with hand-derived gradients runs ~250x faster than the scalar autograd at 0.15 ms/step. torch_gpu (RTX 5080) loses to NumPy at this model size — kernel launch overhead dominates when the matmuls are 16x16.
All analytical gradients (RMSNorm, softmax, causal attention) are now verified against finite differences — max error ~1e-9, included as a runnable script.
https://github.com/chanjoongx/microgpt-efficiency