Skip to content

Instantly share code, notes, and snippets.

@lukaszsamson
Created February 16, 2026 14:14
Show Gist options
  • Select an option

  • Save lukaszsamson/43b77a944e6a1f88a17aad992efb53c0 to your computer and use it in GitHub Desktop.

Select an option

Save lukaszsamson/43b77a944e6a1f88a17aad992efb53c0 to your computer and use it in GitHub Desktop.
Port of microgpt.py to Elixir
# microgpt_atomics_inline.exs
# Port of microgpt.py by @karpathy
#
# Storage: :atomics array (7 slots per node) in place of Python dicts.
# Refs threaded: both data ref and counter ref passed as args, zero
# :persistent_term.get calls during training/inference.
# @compile inline on all hot V and MicroGPT functions.
#
# Optimizations that diverge from the Python original:
#
# 1. Monotonic-ID backward (biggest win)
# Every node only points to children with smaller IDs, so reverse-ID
# order is valid topological order. Backward iterates loss_id..num_params
# directly — no DFS, no MapSet, no recursion, no topo list allocation.
# Nodes with zero grad are skipped (vg != 0.0 guard).
#
# 2. Scalar ops (mul_scalar, add_scalar)
# Expressions like V.mul(x, V.new(constant)) created a throwaway leaf
# node + a binary node. mul_scalar(x, s) and add_scalar(x, s) produce
# a single unary node, cutting total node count and backward range.
#
# 3. Fused cross-entropy backward
# Instead of building the full softmax -> log -> neg graph per token,
# softmax is computed numerically and the loss is a leaf node. During
# backward, CE gradients (p_i - one_hot_i) are injected directly onto
# logit nodes. Eliminates ~5 graph nodes per logit per position.
#
# 4. Fused attention softmax
# Same idea applied to attention weights. Softmax is computed numerically,
# leaf weight nodes are created, and during backward the Jacobian
# p_i * (g_i - dot(g, p)) is injected onto attention logit nodes.
# Eliminates the sub/exp/div/sum graph per attention head.
defmodule V do
# Scalar autograd engine ("micrograd"-style): the entire computation graph
# lives in one flat :atomics array instead of per-node structs/maps.
#
# Slot layout (atomics indices are 1-based; node ids are 0-based, @sn slots
# per node):
#   id*@sn+1 data        id*@sn+2 grad        id*@sn+3 child count (0|1|2)
#   id*@sn+4 child1 id   id*@sn+5 child2 id   id*@sn+6 local grad 1
#   id*@sn+7 local grad 2
# Floats are stored by bit-casting to/from signed 64-bit integers (f2i/i2f).
#
# Invariant relied on by backward_range/3: every node's children have
# strictly smaller ids (ids are allocated monotonically at creation), so a
# descending-id sweep is a valid reverse topological order.
@sn 7
@compile {:inline,
f2i: 1, i2f: 1,
data: 2, grad: 2, set_data: 3,
next_id: 1, new: 3, new1: 5, new2: 7,
add: 4, mul: 4, vpow: 4, relu: 3,
mul_scalar: 4, add_scalar: 4, add_grad: 3,
vsum: 3}
# Allocates the backing array (max_nodes * @sn slots) plus the node-id
# counter and publishes both via :persistent_term. Capacity is fixed and
# overflow is NOT checked — callers must keep the per-pass node count under
# max_nodes.
def init(max_nodes \\ 200_000) do
ref = :atomics.new(max_nodes * @sn, signed: true)
ctr = :atomics.new(1, signed: true)
:persistent_term.put(:v_ref, ref)
:persistent_term.put(:v_ctr, ctr)
end
# Fetch the published refs once up front; hot code then threads them as
# plain arguments so no :persistent_term.get happens in inner loops.
def ref, do: :persistent_term.get(:v_ref)
def ctr, do: :persistent_term.get(:v_ctr)
# Next 0-based node id (counter slot 1 holds the count of allocated nodes).
defp next_id(c), do: :atomics.add_get(c, 1, 1) - 1
# Bit-cast float -> signed int64. The `f / 1` coerces integer input to float
# so the float-64 binary construction cannot fail.
defp f2i(f) do
<<i::signed-integer-64>> = <<(f / 1)::float-64>>
i
end
# Bit-cast signed int64 -> float (inverse of f2i/1).
defp i2f(i) do
<<f::float-64>> = <<i::signed-integer-64>>
f
end
def data(id, r), do: i2f(:atomics.get(r, id * @sn + 1))
def grad(id, r), do: i2f(:atomics.get(r, id * @sn + 2))
def set_data(id, val, r), do: :atomics.put(r, id * @sn + 1, f2i(val))
# Leaf node: value only, zero grad, no children. Returns the new node id.
def new(data, r, c) do
id = next_id(c)
b = id * @sn + 1
:atomics.put(r, b, f2i(data))
:atomics.put(r, b + 1, 0)
:atomics.put(r, b + 2, 0)
id
end
# Unary node: one child c1 with precomputed local gradient lg1
# (d out / d c1). Slot b+4 (child2) is left untouched — the child count of 1
# keeps backward from reading it.
def new1(data, c1, lg1, r, c) do
id = next_id(c)
b = id * @sn + 1
:atomics.put(r, b, f2i(data))
:atomics.put(r, b + 1, 0)
:atomics.put(r, b + 2, 1)
:atomics.put(r, b + 3, c1)
:atomics.put(r, b + 5, f2i(lg1))
id
end
# Binary node: children c1/c2 with local gradients lg1/lg2.
def new2(data, c1, c2, lg1, lg2, r, c) do
id = next_id(c)
b = id * @sn + 1
:atomics.put(r, b, f2i(data))
:atomics.put(r, b + 1, 0)
:atomics.put(r, b + 2, 2)
:atomics.put(r, b + 3, c1)
:atomics.put(r, b + 4, c2)
:atomics.put(r, b + 5, f2i(lg1))
:atomics.put(r, b + 6, f2i(lg2))
id
end
# a + b; local gradients are 1 for both operands.
def add(a, b, r, c), do: new2(data(a, r) + data(b, r), a, b, 1.0, 1.0, r, c)
# a * b; local gradients are the opposite operand's value.
def mul(a, b, r, c) do
da = data(a, r)
db = data(b, r)
new2(da * db, a, b, db, da, r, c)
end
# a^p for a constant exponent p; local gradient p * a^(p-1).
def vpow(a, p, r, c) do
da = data(a, r)
new1(:math.pow(da, p), a, p * :math.pow(da, p - 1), r, c)
end
# max(0, a); gradient passes through only where the input was positive.
def relu(a, r, c) do
da = data(a, r)
new1(max(0.0, da), a, if(da > 0, do: 1.0, else: 0.0), r, c)
end
# Multiply/add by a plain float constant: a single unary node instead of a
# throwaway constant leaf plus a binary node (optimization #2 in the header).
def mul_scalar(a, s, r, c), do: new1(data(a, r) * s, a, s, r, c)
def add_scalar(a, s, r, c), do: new1(data(a, r) + s, a, 1.0, r, c)
# Sum of a non-empty list of node ids as a chain of add/4 nodes.
# A single-element list allocates nothing; an empty list crashes (on purpose).
def vsum([x], _r, _c), do: x
def vsum([h | t], r, c), do: Enum.reduce(t, h, fn x, acc -> add(acc, x, r, c) end)
# Id the next new/1,new1/5,new2/7 call will return (counter read, no bump).
def peek_next_id(c), do: :atomics.get(c, 1)
# Backward pass with fused-gradient injections. Seeds grad(loss)=1, then for
# each {boundary, inject_fn} (which must be ordered by DESCENDING boundary
# id) sweeps ids down to the boundary, runs the injection (which pushes
# gradients onto nodes below the boundary), and resumes at boundary-1.
# The final sweep stops at id == num_params: parameters are leaves, so
# visiting ids below that would propagate nothing.
def backward_fused(loss_id, injections, num_params, r) do
:atomics.put(r, loss_id * @sn + 2, f2i(1.0))
final_from =
Enum.reduce(injections, loss_id, fn {boundary, inject_fn}, from ->
backward_range(from, boundary, r)
inject_fn.()
boundary - 1
end)
backward_range(final_from, num_params, r)
end
# Fused cross-entropy backward (optimization #3): adds
# (p_i - 1{i == target}) * grad(loss_t) directly onto each logit node,
# where loss_t is the per-position loss leaf. target_id is an INDEX into
# logit_ids/probs, not a node id.
def inject_ce(logit_ids, probs, target_id, loss_t_id, r) do
lt_grad = grad(loss_t_id, r)
logit_ids
|> Enum.zip(probs)
|> Enum.with_index()
|> Enum.each(fn {{lid, pi}, i} ->
ce_grad = (pi - if(i == target_id, do: 1.0, else: 0.0)) * lt_grad
add_grad(lid, ce_grad, r)
end)
end
# Fused softmax backward (optimization #4): given grads g accumulated on the
# softmax-output leaf nodes (weight_ids), adds the Jacobian-vector product
# p_i * (g_i - dot(g, p)) onto the corresponding pre-softmax logit nodes.
def inject_softmax(logit_ids, probs, weight_ids, r) do
gs = Enum.map(weight_ids, fn wid -> grad(wid, r) end)
dot_gp = Enum.zip(gs, probs) |> Enum.reduce(0.0, fn {g, p}, acc -> acc + g * p end)
logit_ids
|> Enum.zip(probs)
|> Enum.zip(gs)
|> Enum.each(fn {{lid, pi}, gi} ->
add_grad(lid, pi * (gi - dot_gp), r)
end)
end
# grad(id) += val. Read-modify-write is NOT atomic as a whole; safe here
# because backward runs in a single process.
defp add_grad(id, val, r) do
g_slot = id * @sn + 2
:atomics.put(r, g_slot, f2i(i2f(:atomics.get(r, g_slot)) + val))
end
# Sweep node ids from `from` down to `to` (inclusive), accumulating
# grad(child) += local_grad * grad(node). Nodes whose grad is exactly 0.0
# are skipped — they contribute nothing downstream.
defp backward_range(from, to, r) do
for v <- from..to//-1 do
b = v * @sn + 1
vg = i2f(:atomics.get(r, b + 1))
if vg != 0.0 do
nc = :atomics.get(r, b + 2)
if nc >= 1 do
c1 = :atomics.get(r, b + 3)
lg1 = i2f(:atomics.get(r, b + 5))
c1g = c1 * @sn + 2
:atomics.put(r, c1g, f2i(i2f(:atomics.get(r, c1g)) + lg1 * vg))
if nc == 2 do
c2 = :atomics.get(r, b + 4)
lg2 = i2f(:atomics.get(r, b + 6))
c2g = c2 * @sn + 2
:atomics.put(r, c2g, f2i(i2f(:atomics.get(r, c2g)) + lg2 * vg))
end
end
end
end
end
# Zero the gradients of the first num_params nodes (the parameters) and
# rewind the id counter so the graph region above them is recycled. Grads of
# recycled slots are stale at this point, but every node constructor zeroes
# its grad slot on creation, so no stale value is ever read.
def reset(num_params, r, c) do
zero = f2i(0.0)
for id <- 0..(num_params - 1) do
:atomics.put(r, id * @sn + 2, zero)
end
:atomics.put(c, 1, num_params)
end
end
defmodule MicroGPT do
# Decoder-only transformer (character-level GPT) built on the V autograd
# engine. "Tensors" are plain lists of V node ids; `r` (atomics data ref)
# and `c` (id-counter ref) are threaded through every call.
@compile {:inline, linear: 4, rmsnorm: 3}
# Matrix-vector product: w is a list of rows of parameter node ids; returns
# one node id per row (dot(row, x)). No bias term.
def linear(x, w, r, c) do
Enum.map(w, fn wo ->
Enum.zip(wo, x)
|> Enum.map(fn {wi, xi} -> V.mul(wi, xi, r, c) end)
|> V.vsum(r, c)
end)
end
# RMSNorm without a learned gain: x_i * (mean(x^2) + 1e-5)^(-1/2).
def rmsnorm(x, r, c) do
n = length(x)
squares = Enum.map(x, fn xi -> V.mul(xi, xi, r, c) end)
ms = V.mul_scalar(V.vsum(squares, r, c), 1.0 / n, r, c)
scale = V.vpow(V.add_scalar(ms, 1.0e-5, r, c), -0.5, r, c)
Enum.map(x, fn xi -> V.mul(xi, scale, r, c) end)
end
# Single-token forward pass at position pos_id with per-layer KV caches
# (`keys`/`values` are lists of lists; the newest position is PREPENDED).
# Returns {logits, keys, values, sm_infos} where each sm_info is
# {first_weight_id, attn_logit_ids, probs, weight_ids} for one attention
# head, consumed later by V.inject_softmax/4 during the fused backward.
def gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c) do
tok_emb = Enum.at(sd["wte"], token_id)
pos_emb = Enum.at(sd["wpe"], pos_id)
x = Enum.zip(tok_emb, pos_emb) |> Enum.map(fn {t, p} -> V.add(t, p, r, c) end)
x = rmsnorm(x, r, c)
{x, keys, values, sm_infos} =
Enum.reduce(0..(n_layer - 1), {x, keys, values, []}, fn li,
{x, keys, values, sm_infos} ->
x_res = x
x = rmsnorm(x, r, c)
q = linear(x, sd["layer#{li}.attn_wq"], r, c)
k = linear(x, sd["layer#{li}.attn_wk"], r, c)
v = linear(x, sd["layer#{li}.attn_wv"], r, c)
keys = List.update_at(keys, li, &([k | &1]))
values = List.update_at(values, li, &([v | &1]))
{x_attn, sm_infos} =
Enum.reduce(0..(n_head - 1), {[], sm_infos}, fn h, {x_attn_acc, sm_infos} ->
hs = h * head_dim
q_h = Enum.slice(q, hs, head_dim)
k_h = Enum.map(Enum.at(keys, li), fn ki -> Enum.slice(ki, hs, head_dim) end)
v_h = Enum.map(Enum.at(values, li), fn vi -> Enum.slice(vi, hs, head_dim) end)
# Scaled dot-product attention logits against every cached position.
attn_logits =
Enum.map(k_h, fn k_t ->
dot =
Enum.zip(q_h, k_t)
|> Enum.map(fn {a, b} -> V.mul(a, b, r, c) end)
|> V.vsum(r, c)
V.mul_scalar(dot, inv_scale, r, c)
end)
# Numerical softmax — no graph nodes
logit_data = Enum.map(attn_logits, fn id -> V.data(id, r) end)
max_l = Enum.max(logit_data)
exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
sum_exp = Enum.sum(exps)
attn_probs = Enum.map(exps, fn e -> e / sum_exp end)
# Create leaf weight nodes
first_weight_id = V.peek_next_id(c)
attn_weights = Enum.map(attn_probs, fn p -> V.new(p, r, c) end)
sm_info = {first_weight_id, attn_logits, attn_probs, attn_weights}
# Weighted sum of cached value vectors — one output scalar per head dim.
head_out =
Enum.map(0..(head_dim - 1), fn j ->
Enum.zip(attn_weights, v_h)
|> Enum.map(fn {w, vt} -> V.mul(w, Enum.at(vt, j), r, c) end)
|> V.vsum(r, c)
end)
{x_attn_acc ++ head_out, [sm_info | sm_infos]}
end)
# Output projection + residual, then the MLP block + residual.
x = linear(x_attn, sd["layer#{li}.attn_wo"], r, c)
x = Enum.zip(x, x_res) |> Enum.map(fn {a, b} -> V.add(a, b, r, c) end)
x_res = x
x = rmsnorm(x, r, c)
x = linear(x, sd["layer#{li}.mlp_fc1"], r, c)
x = Enum.map(x, fn xi -> V.relu(xi, r, c) end)
x = linear(x, sd["layer#{li}.mlp_fc2"], r, c)
x = Enum.zip(x, x_res) |> Enum.map(fn {a, b} -> V.add(a, b, r, c) end)
{x, keys, values, sm_infos}
end)
logits = linear(x, sd["lm_head"], r, c)
{logits, keys, values, sm_infos}
end
# Samples one item with probability proportional to its weight, by walking
# the cumulative sum until it reaches a uniform draw in [0, total].
# The fallback clause returns the last item when floating-point rounding
# lets the loop finish without halting.
def weighted_choice(items, weights) do
total = Enum.sum(weights)
rand = :rand.uniform() * total
result =
Enum.zip(items, weights)
|> Enum.reduce_while(0.0, fn {item, w}, acc ->
new_acc = acc + w
if new_acc >= rand, do: {:halt, {:ok, item}}, else: {:cont, new_acc}
end)
case result do
{:ok, item} -> item
_ -> List.last(items)
end
end
# Script entry point: fetch data, build the model, train with Adam, sample.
def run do
# NOTE(review): assumes each forward+loss pass stays under the 200_000-node
# capacity — V does not bounds-check; confirm if hyperparameters grow.
V.init(200_000)
r = V.ref()
c = V.ctr()
# Fixed seed: the whole run (shuffle, init, sampling) is deterministic.
:rand.seed(:exsss, 42)
unless File.exists?("input.txt") do
IO.puts("Downloading names dataset...")
{:ok, _} = Application.ensure_all_started(:inets)
{:ok, _} = Application.ensure_all_started(:ssl)
url =
String.to_charlist(
"https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
)
{:ok, {{_, 200, _}, _, body}} =
:httpc.request(:get, {url, []}, [ssl: [verify: :verify_none]], body_format: :binary)
File.write!("input.txt", body)
end
docs =
File.read!("input.txt")
|> String.trim()
|> String.split("\n")
|> Enum.map(&String.trim/1)
|> Enum.filter(&(&1 != ""))
|> Enum.shuffle()
IO.puts("num docs: #{length(docs)}")
# Character vocabulary; the extra index `bos` doubles as BOS and EOS.
uchars = docs |> Enum.join() |> String.graphemes() |> Enum.uniq() |> Enum.sort()
bos = length(uchars)
vocab_size = length(uchars) + 1
char_to_idx = uchars |> Enum.with_index() |> Map.new()
IO.puts("vocab size: #{vocab_size}")
n_embd = 16
n_head = 4
n_layer = 1
block_size = 16
head_dim = div(n_embd, n_head)
inv_scale = 1.0 / :math.sqrt(head_dim)
std = 0.08
# Parameter matrix of Gaussian-initialized leaf nodes (created first, so
# parameter ids occupy 0..num_params-1 — required by V.reset/backward).
matrix = fn nout, nin ->
for _ <- 1..nout, do: for(_ <- 1..nin, do: V.new(:rand.normal() * std, r, c))
end
sd =
%{
"wte" => matrix.(vocab_size, n_embd),
"wpe" => matrix.(block_size, n_embd),
"lm_head" => matrix.(vocab_size, n_embd)
}
sd =
Enum.reduce(0..(n_layer - 1), sd, fn i, sd ->
sd
|> Map.put("layer#{i}.attn_wq", matrix.(n_embd, n_embd))
|> Map.put("layer#{i}.attn_wk", matrix.(n_embd, n_embd))
|> Map.put("layer#{i}.attn_wv", matrix.(n_embd, n_embd))
|> Map.put("layer#{i}.attn_wo", matrix.(n_embd, n_embd))
|> Map.put("layer#{i}.mlp_fc1", matrix.(4 * n_embd, n_embd))
|> Map.put("layer#{i}.mlp_fc2", matrix.(n_embd, 4 * n_embd))
end)
param_keys =
["wte", "wpe", "lm_head"] ++
Enum.flat_map(0..(n_layer - 1), fn i ->
[
"layer#{i}.attn_wq",
"layer#{i}.attn_wk",
"layer#{i}.attn_wv",
"layer#{i}.attn_wo",
"layer#{i}.mlp_fc1",
"layer#{i}.mlp_fc2"
]
end)
params = for key <- param_keys, row <- sd[key], p <- row, do: p
num_params = length(params)
IO.puts("num params: #{num_params}")
# Adam hyperparameters; learning rate decays linearly to 0 over num_steps.
learning_rate = 0.01
beta1 = 0.85
beta2 = 0.99
eps_adam = 1.0e-8
num_steps = 1000
# First/second moment estimates, one per parameter, as plain float lists.
m0 = List.duplicate(0.0, num_params)
v0 = List.duplicate(0.0, num_params)
{_m, _v} =
Enum.reduce(0..(num_steps - 1), {m0, v0}, fn step, {m, v} ->
V.reset(num_params, r, c)
# One document per step, cycled in (shuffled) order.
doc = Enum.at(docs, rem(step, length(docs)))
tokens =
[bos] ++ Enum.map(String.graphemes(doc), &Map.fetch!(char_to_idx, &1)) ++ [bos]
n = min(block_size, length(tokens) - 1)
inv_n = 1.0 / n
init_keys = for _ <- 1..n_layer, do: []
init_values = for _ <- 1..n_layer, do: []
{losses, ce_info, sm_infos, _keys, _values} =
Enum.reduce(0..(n - 1), {[], [], [], init_keys, init_values}, fn pos_id,
{losses, ce_info,
sm_infos, keys,
values} ->
token_id = Enum.at(tokens, pos_id)
target_id = Enum.at(tokens, pos_id + 1)
{logits, keys, values, new_sm_infos} =
gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c)
# Compute cross-entropy numerically
logit_data = Enum.map(logits, fn id -> V.data(id, r) end)
max_l = Enum.max(logit_data)
exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
sum_exp = Enum.sum(exps)
probs = Enum.map(exps, fn e -> e / sum_exp end)
loss_val = -:math.log(Enum.at(probs, target_id))
# Loss is a LEAF node; its gradient w.r.t. the logits is injected
# during backward via V.inject_ce (optimization #3).
loss_t = V.new(loss_val, r, c)
{[loss_t | losses],
[{logits, probs, target_id, loss_t} | ce_info],
new_sm_infos ++ sm_infos,
keys, values}
end)
# Mean loss over positions; agg_start marks where the aggregation
# subgraph begins, i.e. the boundary for the CE injection.
agg_start = V.peek_next_id(c)
loss = V.mul_scalar(V.vsum(losses, r, c), inv_n, r, c)
# Build injection list: CE at agg_start, then softmax injections (already descending)
ce_injection =
{agg_start,
fn ->
Enum.each(ce_info, fn {logit_ids, probs, target_id, loss_t_id} ->
V.inject_ce(logit_ids, probs, target_id, loss_t_id, r)
end)
end}
sm_injections =
Enum.map(sm_infos, fn {first_weight_id, logit_ids, probs, weight_ids} ->
{first_weight_id,
fn -> V.inject_softmax(logit_ids, probs, weight_ids, r) end}
end)
V.backward_fused(loss, [ce_injection | sm_injections], num_params, r)
# Bias-corrected Adam update with linearly decayed learning rate.
lr_t = learning_rate * (1 - step / num_steps)
{new_m, new_v} =
params
|> Enum.zip(m)
|> Enum.zip(v)
|> Enum.map(fn {{p, mi}, vi} ->
g = V.grad(p, r)
new_mi = beta1 * mi + (1 - beta1) * g
new_vi = beta2 * vi + (1 - beta2) * g * g
m_hat = new_mi / (1 - :math.pow(beta1, step + 1))
v_hat = new_vi / (1 - :math.pow(beta2, step + 1))
V.set_data(p, V.data(p, r) - lr_t * m_hat / (:math.sqrt(v_hat) + eps_adam), r)
{new_mi, new_vi}
end)
|> Enum.unzip()
loss_str = :erlang.float_to_binary(V.data(loss, r), decimals: 4)
step_str = String.pad_leading(Integer.to_string(step + 1), 4)
IO.puts("step #{step_str} / #{num_steps} | loss #{loss_str}")
{new_m, new_v}
end)
# Inference: autoregressive sampling with temperature applied by scaling
# the logits before the numerical softmax. Stops early on BOS/EOS.
temperature = 0.5
inv_temp = 1.0 / temperature
IO.puts("\n--- inference (new, hallucinated names) ---")
for sample_idx <- 1..20 do
V.reset(num_params, r, c)
init_keys = for _ <- 1..n_layer, do: []
init_values = for _ <- 1..n_layer, do: []
{sample, _token_id, _keys, _values} =
Enum.reduce_while(0..(block_size - 1), {[], bos, init_keys, init_values}, fn pos_id,
{sample,
token_id,
keys,
values} ->
{logits, keys, values, _sm_infos} =
gpt(token_id, pos_id, keys, values, sd, n_layer, n_head, head_dim, inv_scale, r, c)
logit_data = Enum.map(logits, fn id -> V.data(id, r) * inv_temp end)
max_l = Enum.max(logit_data)
exps = Enum.map(logit_data, fn d -> :math.exp(d - max_l) end)
sum_exp = Enum.sum(exps)
weights = Enum.map(exps, fn e -> e / sum_exp end)
next_token = weighted_choice(Enum.to_list(0..(vocab_size - 1)), weights)
if next_token == bos do
{:halt, {sample, next_token, keys, values}}
else
{:cont, {sample ++ [Enum.at(uchars, next_token)], next_token, keys, values}}
end
end)
sample_str = String.pad_leading(Integer.to_string(sample_idx), 2)
IO.puts("sample #{sample_str}: #{Enum.join(sample)}")
end
end
end
# Script entry point: `elixir microgpt_atomics_inline.exs` runs training then sampling.
MicroGPT.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment