Port of microgpt.py to Elixir

# microgpt_atomics_inline.exs
# Port of microgpt.py by @karpathy
#
# Storage: :atomics array (7 slots per node) in place of Python dicts.
# Refs threaded: both the data ref and the counter ref are passed as args, so
# there are zero :persistent_term.get calls during training/inference.
# @compile inline on all hot V and MicroGPT functions.
#
# Optimizations that diverge from the Python original:
#
# 1. Monotonic-ID backward (biggest win)
#    Every node only points to children with smaller IDs, so reverse-ID order
#    is a valid topological order. Backward iterates loss_id..num_params
#    directly — no DFS, no MapSet, no recursion, no topo-list allocation.
#    Nodes with zero grad are skipped (the vg != 0.0 guard).
#
# 2. Scalar ops (mul_scalar, add_scalar)
#    Expressions like V.mul(x, V.new(constant)) created a throwaway leaf node
#    plus a binary node. mul_scalar(x, s) and add_scalar(x, s) produce a
#    single unary node, cutting the total node count and the backward range.
#
# 3. Fused cross-entropy backward
#    Instead of building the full softmax -> log -> neg graph per token,
#    softmax is computed numerically and the loss is a leaf node. During
#    backward, CE gradients (p_i - one_hot_i) are injected directly onto the
#    logit nodes. Eliminates ~5 graph nodes per logit per position.
#
# 4. Fused attention softmax
#    The same idea applied to attention weights. Softmax is computed
#    numerically, leaf weight nodes are created, and during backward the
#    Jacobian p_i * (g_i - dot(g, p)) is injected onto the attention logit
#    nodes. Eliminates the sub/exp/div/sum graph per attention head.

defmodule V do
  @sn 7
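
  # Arena layout: each node owns @sn = 7 consecutive atomics slots. With the
  # 1-indexed base b = id * @sn + 1:
  #   b + 0 data (float bits)   b + 1 grad (float bits)   b + 2 child count
  #   b + 3 child 1 id          b + 4 child 2 id
  #   b + 5 local grad 1        b + 6 local grad 2
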
  @compile {:inline,
            f2i: 1, i2f: 1,
            data: 2, grad: 2, set_data: 3,
            next_id: 1, new: 3, new1: 5, new2: 7,
            add: 4, mul: 4, vpow: 4, relu: 3,
            mul_scalar: 4, add_scalar: 4, add_grad: 3,
            vsum: 3}

  def init(max_nodes \\ 200_000) do
    ref = :atomics.new(max_nodes * @sn, signed: true)
    ctr = :atomics.new(1, signed: true)
    :persistent_term.put(:v_ref, ref)
    :persistent_term.put(:v_ctr, ctr)
  end

  def ref, do: :persistent_term.get(:v_ref)
  def ctr, do: :persistent_term.get(:v_ctr)

  defp next_id(c), do: :atomics.add_get(c, 1, 1) - 1
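
  # :atomics slots hold 64-bit integers only, so floats are bit-cast through a
  # binary: f2i packs a float's IEEE-754 bits into a signed integer and i2f
  # unpacks them, so i2f(f2i(x)) == x exactly. The `f / 1` coerces integer
  # inputs to floats before packing.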
  defp f2i(f) do
    <<i::signed-integer-64>> = <<(f / 1)::float-64>>
    i
  end

  defp i2f(i) do
    <<f::float-64>> = <<i::signed-integer-64>>
    f
  end

  def data(id, r), do: i2f(:atomics.get(r, id * @sn + 1))
  def grad(id, r), do: i2f(:atomics.get(r, id * @sn + 2))
  def set_data(id, val, r), do: :atomics.put(r, id * @sn + 1, f2i(val))

  def new(data, r, c) do
    id = next_id(c)
    b = id * @sn + 1
    :atomics.put(r, b, f2i(data))
    :atomics.put(r, b + 1, 0)
    :atomics.put(r, b + 2, 0)
    id
  end

  def new1(data, c1, lg1, r, c) do
    id = next_id(c)
    b = id * @sn + 1
    :atomics.put(r, b, f2i(data))
    :atomics.put(r, b + 1, 0)
    :atomics.put(r, b + 2, 1)
    :atomics.put(r, b + 3, c1)
    :atomics.put(r, b + 5, f2i(lg1))
    id
  end

  def new2(data, c1, c2, lg1, lg2, r, c) do
    id = next_id(c)
    b = id * @sn + 1
    :atomics.put(r, b, f2i(data))
    :atomics.put(r, b + 1, 0)
    :atomics.put(r, b + 2, 2)
    :atomics.put(r, b + 3, c1)
    :atomics.put(r, b + 4, c2)
    :atomics.put(r, b + 5, f2i(lg1))
    :atomics.put(r, b + 6, f2i(lg2))
    id
  end

  def add(a, b, r, c), do: new2(data(a, r) + data(b, r), a, b, 1.0, 1.0, r, c)

  def mul(a, b, r, c) do
    da = data(a, r)
    db = data(b, r)
    new2(da * db, a, b, db, da, r, c)
  end

  def vpow(a, p, r, c) do
    da = data(a, r)
    new1(:math.pow(da, p), a, p * :math.pow(da, p - 1), r, c)
  end

  def relu(a, r, c) do
    da = data(a, r)
    new1(max(0.0, da), a, if(da > 0, do: 1.0, else: 0.0), r, c)
  end
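
  # Optimization 2 in action: V.mul(x, V.new(2.0, r, c), r, c) would allocate
  # a leaf node plus a binary node, while mul_scalar(x, 2.0, r, c) records the
  # same value and the same local gradient (d(s*x)/dx = s) in a single unary
  # node. Likewise for add_scalar, whose local gradient is 1.0.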
  def mul_scalar(a, s, r, c), do: new1(data(a, r) * s, a, s, r, c)
  def add_scalar(a, s, r, c), do: new1(data(a, r) + s, a, 1.0, r, c)

  def vsum([x], _r, _c), do: x
  def vsum([h | t], r, c), do: Enum.reduce(t, h, fn x, acc -> add(acc, x, r, c) end)

  def peek_next_id(c), do: :atomics.get(c, 1)
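
  # Optimization 1 plus the fusions: backward sweeps the arena in
  # descending-id segments. With injections [{b1, f1}, {b2, f2}]
  # (loss_id > b1 > b2 > num_params) it runs
  #   loss_id..b1 -> f1.() -> (b1 - 1)..b2 -> f2.() -> (b2 - 1)..num_params
  # so each inject_fn deposits its fused gradients onto nodes below its
  # boundary before the plain chain-rule sweep reaches them.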
  def backward_fused(loss_id, injections, num_params, r) do
    :atomics.put(r, loss_id * @sn + 2, f2i(1.0))

    final_from =
      Enum.reduce(injections, loss_id, fn {boundary, inject_fn}, from ->
        backward_range(from, boundary, r)
        inject_fn.()
        boundary - 1
      end)

    backward_range(final_from, num_params, r)
  end
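
  # Fused cross-entropy gradient (optimization 3): for
  # loss = -log(softmax(z)[t]) the analytic gradient is
  #   d(loss)/d(z_i) = p_i - 1[i == t]
  # scaled by the upstream gradient already deposited on the leaf loss node
  # (inv_n after the mean over positions).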
  def inject_ce(logit_ids, probs, target_id, loss_t_id, r) do
    lt_grad = grad(loss_t_id, r)

    logit_ids
    |> Enum.zip(probs)
    |> Enum.with_index()
    |> Enum.each(fn {{lid, pi}, i} ->
      ce_grad = (pi - if(i == target_id, do: 1.0, else: 0.0)) * lt_grad
      add_grad(lid, ce_grad, r)
    end)
  end
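
  # Fused attention softmax backward (optimization 4): with p = softmax(z) and
  # upstream grads g on the leaf weight nodes, the Jacobian-vector product
  #   d(L)/d(z_i) = sum_j g_j * p_j * (1[i == j] - p_i) = p_i * (g_i - <g, p>)
  # is injected onto each attention logit node.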
  def inject_softmax(logit_ids, probs, weight_ids, r) do
    gs = Enum.map(weight_ids, fn wid -> grad(wid, r) end)
    dot_gp = Enum.zip(gs, probs) |> Enum.reduce(0.0, fn {g, p}, acc -> acc + g * p end)

    logit_ids
    |> Enum.zip(probs)
    |> Enum.zip(gs)
    |> Enum.each(fn {{lid, pi}, gi} ->
      add_grad(lid, pi * (gi - dot_gp), r)
    end)
  end

  defp add_grad(id, val, r) do
    g_slot = id * @sn + 2
    :atomics.put(r, g_slot, f2i(i2f(:atomics.get(r, g_slot)) + val))
  end

  defp backward_range(from, to, r) do
    for v <- from..to//-1 do
      b = v * @sn + 1
      vg = i2f(:atomics.get(r, b + 1))

      if vg != 0.0 do
        nc = :atomics.get(r, b + 2)

        if nc >= 1 do
          c1 = :atomics.get(r, b + 3)
          lg1 = i2f(:atomics.get(r, b + 5))
          c1g = c1 * @sn + 2
          :atomics.put(r, c1g, f2i(i2f(:atomics.get(r, c1g)) + lg1 * vg))

          if nc == 2 do
            c2 = :atomics.get(r, b + 4)
            lg2 = i2f(:atomics.get(r, b + 6))
            c2g = c2 * @sn + 2
            :atomics.put(r, c2g, f2i(i2f(:atomics.get(r, c2g)) + lg2 * vg))
          end
        end
      end
    end
  end
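
  # reset reuses the arena between steps: parameter grads (ids
  # 0..num_params-1) are zeroed and the id counter rewinds to num_params, so
  # the next forward pass overwrites last step's intermediate nodes in place.
  # Stale grads on non-parameter slots are harmless because new/new1/new2 zero
  # the grad slot on creation.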
  def reset(num_params, r, c) do
    zero = f2i(0.0)

    for id <- 0..(num_params - 1) do
      :atomics.put(r, id * @sn + 2, zero)
    end

    :atomics.put(c, 1, num_params)
  end
end

defmodule MicroGPT do
  @compile {:inline, linear: 4, rmsnorm: 3}

  def linear(x, w, r, c) do
    Enum.map(w, fn wo ->
      Enum.zip(wo, x)
      |> Enum.map(fn {wi, xi} -> V.mul(wi, xi, r, c) end)
      |> V.vsum(r, c)
    end)
  end
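
  # RMSNorm without a learned gain: y_i = x_i / sqrt(mean(x^2) + 1.0e-5). It
  # is built from V primitives, so the normalization itself is differentiated
  # by the generic backward sweep (unlike the fused softmax paths).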
  def rmsnorm(x, r, c) do
    n = length(x)
    squares = Enum.map(x, fn xi -> V.mul(xi, xi, r, c) end)
    ms = V.mul_scalar(V.vsum(squares, r, c), 1.0 / n, r, c)
    scale = V.vpow(V.add_scalar(ms, 1.0e-5, r, c), -0.5, r, c)
    Enum.map(x, fn xi -> V.mul(xi, scale, r, c) end)
  end
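
  # Note on the KV cache: keys/values are prepended per position, so every
  # per-head list runs newest-to-oldest. That is safe because attn_weights and
  # v_h are zipped in the same order, and attending over everything cached so
  # far is exactly the causal mask of autoregressive decoding.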
  def gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c) do
    tok_emb = Enum.at(sd["wte"], token_id)
    pos_emb = Enum.at(sd["wpe"], pos_id)
    x = Enum.zip(tok_emb, pos_emb) |> Enum.map(fn {t, p} -> V.add(t, p, r, c) end)
    x = rmsnorm(x, r, c)

    {x, keys, values, sm_infos} =
      Enum.reduce(0..(n_layer - 1), {x, keys, values, []}, fn li, {x, keys, values, sm_infos} ->
        x_res = x
        x = rmsnorm(x, r, c)
        q = linear(x, sd["layer#{li}.attn_wq"], r, c)
        k = linear(x, sd["layer#{li}.attn_wk"], r, c)
        v = linear(x, sd["layer#{li}.attn_wv"], r, c)
        keys = List.update_at(keys, li, &[k | &1])
        values = List.update_at(values, li, &[v | &1])

        {x_attn, sm_infos} =
          Enum.reduce(0..(n_head - 1), {[], sm_infos}, fn h, {x_attn_acc, sm_infos} ->
            hs = h * head_dim
            q_h = Enum.slice(q, hs, head_dim)
            k_h = Enum.map(Enum.at(keys, li), fn ki -> Enum.slice(ki, hs, head_dim) end)
            v_h = Enum.map(Enum.at(values, li), fn vi -> Enum.slice(vi, hs, head_dim) end)

            attn_logits =
              Enum.map(k_h, fn k_t ->
                dot =
                  Enum.zip(q_h, k_t)
                  |> Enum.map(fn {a, b} -> V.mul(a, b, r, c) end)
                  |> V.vsum(r, c)

                V.mul_scalar(dot, inv_scale, r, c)
              end)

            # Numerical softmax — no graph nodes
            logit_data = Enum.map(attn_logits, fn id -> V.data(id, r) end)
            max_l = Enum.max(logit_data)
            exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
            sum_exp = Enum.sum(exps)
            attn_probs = Enum.map(exps, fn e -> e / sum_exp end)

            # Create leaf weight nodes
            first_weight_id = V.peek_next_id(c)
            attn_weights = Enum.map(attn_probs, fn p -> V.new(p, r, c) end)
            sm_info = {first_weight_id, attn_logits, attn_probs, attn_weights}

            head_out =
              Enum.map(0..(head_dim - 1), fn j ->
                Enum.zip(attn_weights, v_h)
                |> Enum.map(fn {w, vt} -> V.mul(w, Enum.at(vt, j), r, c) end)
                |> V.vsum(r, c)
              end)

            {x_attn_acc ++ head_out, [sm_info | sm_infos]}
          end)

        x = linear(x_attn, sd["layer#{li}.attn_wo"], r, c)
        x = Enum.zip(x, x_res) |> Enum.map(fn {a, b} -> V.add(a, b, r, c) end)
        x_res = x
        x = rmsnorm(x, r, c)
        x = linear(x, sd["layer#{li}.mlp_fc1"], r, c)
        x = Enum.map(x, fn xi -> V.relu(xi, r, c) end)
        x = linear(x, sd["layer#{li}.mlp_fc2"], r, c)
        x = Enum.zip(x, x_res) |> Enum.map(fn {a, b} -> V.add(a, b, r, c) end)
        {x, keys, values, sm_infos}
      end)

    logits = linear(x, sd["lm_head"], r, c)
    {logits, keys, values, sm_infos}
  end
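
  # Samples an item proportionally to `weights`, e.g. weighted_choice([0, 1],
  # [0.9, 0.1]) returns 0 about 90% of the time. The List.last fallback only
  # fires if float rounding leaves the running sum just below `rand`.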
  def weighted_choice(items, weights) do
    total = Enum.sum(weights)
    rand = :rand.uniform() * total

    result =
      Enum.zip(items, weights)
      |> Enum.reduce_while(0.0, fn {item, w}, acc ->
        new_acc = acc + w
        if new_acc >= rand, do: {:halt, {:ok, item}}, else: {:cont, new_acc}
      end)

    case result do
      {:ok, item} -> item
      _ -> List.last(items)
    end
  end
  def run do
    V.init(200_000)
    r = V.ref()
    c = V.ctr()
    :rand.seed(:exsss, 42)

    unless File.exists?("input.txt") do
      IO.puts("Downloading names dataset...")
      {:ok, _} = Application.ensure_all_started(:inets)
      {:ok, _} = Application.ensure_all_started(:ssl)

      url =
        String.to_charlist(
          "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
        )

      {:ok, {{_, 200, _}, _, body}} =
        :httpc.request(:get, {url, []}, [ssl: [verify: :verify_none]], body_format: :binary)

      File.write!("input.txt", body)
    end

    docs =
      File.read!("input.txt")
      |> String.trim()
      |> String.split("\n")
      |> Enum.map(&String.trim/1)
      |> Enum.filter(&(&1 != ""))
      |> Enum.shuffle()

    IO.puts("num docs: #{length(docs)}")
    uchars = docs |> Enum.join() |> String.graphemes() |> Enum.uniq() |> Enum.sort()
    bos = length(uchars)
    vocab_size = length(uchars) + 1
    char_to_idx = uchars |> Enum.with_index() |> Map.new()
    IO.puts("vocab size: #{vocab_size}")

    n_embd = 16
    n_head = 4
    n_layer = 1
    block_size = 16
    head_dim = div(n_embd, n_head)
    inv_scale = 1.0 / :math.sqrt(head_dim)
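
    # With n_embd = 16 and n_head = 4, each head gets head_dim = 4 and the
    # attention logits are scaled by inv_scale = 1 / sqrt(4) = 0.5 (standard
    # scaled dot-product attention).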
    std = 0.08

    matrix = fn nout, nin ->
      for _ <- 1..nout, do: for(_ <- 1..nin, do: V.new(:rand.normal() * std, r, c))
    end

    sd = %{
      "wte" => matrix.(vocab_size, n_embd),
      "wpe" => matrix.(block_size, n_embd),
      "lm_head" => matrix.(vocab_size, n_embd)
    }

    sd =
      Enum.reduce(0..(n_layer - 1), sd, fn i, sd ->
        sd
        |> Map.put("layer#{i}.attn_wq", matrix.(n_embd, n_embd))
        |> Map.put("layer#{i}.attn_wk", matrix.(n_embd, n_embd))
        |> Map.put("layer#{i}.attn_wv", matrix.(n_embd, n_embd))
        |> Map.put("layer#{i}.attn_wo", matrix.(n_embd, n_embd))
        |> Map.put("layer#{i}.mlp_fc1", matrix.(4 * n_embd, n_embd))
        |> Map.put("layer#{i}.mlp_fc2", matrix.(n_embd, 4 * n_embd))
      end)

    param_keys =
      ["wte", "wpe", "lm_head"] ++
        Enum.flat_map(0..(n_layer - 1), fn i ->
          [
            "layer#{i}.attn_wq",
            "layer#{i}.attn_wk",
            "layer#{i}.attn_wv",
            "layer#{i}.attn_wo",
            "layer#{i}.mlp_fc1",
            "layer#{i}.mlp_fc2"
          ]
        end)

    params = for key <- param_keys, row <- sd[key], p <- row, do: p
    num_params = length(params)
    IO.puts("num params: #{num_params}")

    learning_rate = 0.01
    beta1 = 0.85
    beta2 = 0.99
    eps_adam = 1.0e-8
    num_steps = 1000
    m0 = List.duplicate(0.0, num_params)
    v0 = List.duplicate(0.0, num_params)

    {_m, _v} =
      Enum.reduce(0..(num_steps - 1), {m0, v0}, fn step, {m, v} ->
        V.reset(num_params, r, c)
        doc = Enum.at(docs, rem(step, length(docs)))
        tokens = [bos] ++ Enum.map(String.graphemes(doc), &Map.fetch!(char_to_idx, &1)) ++ [bos]
        n = min(block_size, length(tokens) - 1)
        inv_n = 1.0 / n
        init_keys = for _ <- 1..n_layer, do: []
        init_values = for _ <- 1..n_layer, do: []

        {losses, ce_info, sm_infos, _keys, _values} =
          Enum.reduce(0..(n - 1), {[], [], [], init_keys, init_values}, fn pos_id,
                                                                           {losses, ce_info,
                                                                            sm_infos, keys,
                                                                            values} ->
            token_id = Enum.at(tokens, pos_id)
            target_id = Enum.at(tokens, pos_id + 1)

            {logits, keys, values, new_sm_infos} =
              gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c)

            # Compute cross-entropy numerically
            logit_data = Enum.map(logits, fn id -> V.data(id, r) end)
            max_l = Enum.max(logit_data)
            exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
            sum_exp = Enum.sum(exps)
            probs = Enum.map(exps, fn e -> e / sum_exp end)
            loss_val = -:math.log(Enum.at(probs, target_id))
            loss_t = V.new(loss_val, r, c)

            {[loss_t | losses], [{logits, probs, target_id, loss_t} | ce_info],
             new_sm_infos ++ sm_infos, keys, values}
          end)

        agg_start = V.peek_next_id(c)
        loss = V.mul_scalar(V.vsum(losses, r, c), inv_n, r, c)

        # Build injection list: CE at agg_start, then softmax injections (already descending)
        ce_injection =
          {agg_start,
           fn ->
             Enum.each(ce_info, fn {logit_ids, probs, target_id, loss_t_id} ->
               V.inject_ce(logit_ids, probs, target_id, loss_t_id, r)
             end)
           end}

        sm_injections =
          Enum.map(sm_infos, fn {first_weight_id, logit_ids, probs, weight_ids} ->
            {first_weight_id, fn -> V.inject_softmax(logit_ids, probs, weight_ids, r) end}
          end)

        V.backward_fused(loss, [ce_injection | sm_injections], num_params, r)
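
        # Adam with bias correction and a linearly decayed learning rate:
        #   m <- b1*m + (1 - b1)*g        m_hat = m / (1 - b1^(step+1))
        #   v <- b2*v + (1 - b2)*g^2      v_hat = v / (1 - b2^(step+1))
        #   p <- p - lr_t * m_hat / (sqrt(v_hat) + eps)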
        lr_t = learning_rate * (1 - step / num_steps)

        {new_m, new_v} =
          params
          |> Enum.zip(m)
          |> Enum.zip(v)
          |> Enum.map(fn {{p, mi}, vi} ->
            g = V.grad(p, r)
            new_mi = beta1 * mi + (1 - beta1) * g
            new_vi = beta2 * vi + (1 - beta2) * g * g
            m_hat = new_mi / (1 - :math.pow(beta1, step + 1))
            v_hat = new_vi / (1 - :math.pow(beta2, step + 1))
            V.set_data(p, V.data(p, r) - lr_t * m_hat / (:math.sqrt(v_hat) + eps_adam), r)
            {new_mi, new_vi}
          end)
          |> Enum.unzip()

        loss_str = :erlang.float_to_binary(V.data(loss, r), decimals: 4)
        step_str = String.pad_leading(Integer.to_string(step + 1), 4)
        IO.puts("step #{step_str} / #{num_steps} | loss #{loss_str}")
        {new_m, new_v}
      end)

    temperature = 0.5
    inv_temp = 1.0 / temperature
    IO.puts("\n--- inference (new, hallucinated names) ---")

    for sample_idx <- 1..20 do
      V.reset(num_params, r, c)
      init_keys = for _ <- 1..n_layer, do: []
      init_values = for _ <- 1..n_layer, do: []

      {sample, _token_id, _keys, _values} =
        Enum.reduce_while(0..(block_size - 1), {[], bos, init_keys, init_values}, fn pos_id,
                                                                                     {sample,
                                                                                      token_id,
                                                                                      keys,
                                                                                      values} ->
          {logits, keys, values, _sm_infos} =
            gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c)

          logit_data = Enum.map(logits, fn id -> V.data(id, r) * inv_temp end)
          max_l = Enum.max(logit_data)
          exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
          sum_exp = Enum.sum(exps)
          weights = Enum.map(exps, fn e -> e / sum_exp end)
          next_token = weighted_choice(Enum.to_list(0..(vocab_size - 1)), weights)

          if next_token == bos do
            {:halt, {sample, next_token, keys, values}}
          else
            {:cont, {sample ++ [Enum.at(uchars, next_token)], next_token, keys, values}}
          end
        end)

      sample_str = String.pad_leading(Integer.to_string(sample_idx), 2)
      IO.puts("sample #{sample_str}: #{Enum.join(sample)}")
    end
  end
end

MicroGPT.run()
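
# Usage sketch (assumes a reasonably recent toolchain, roughly Elixir 1.12+ /
# OTP 22+ for :atomics, :rand.seed(:exsss, _) and stepped ranges):
#   elixir microgpt_atomics_inline.exs
# The first run downloads names.txt to input.txt; it then trains for 1000
# steps and samples 20 names.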