A100 - spec: 312 TFLOPS peak (BF16/FP16 tensor core)
40 GB or 80 GB HBM2e; 40 MB L2 cache
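A quick back-of-envelope roofline check on these headline numbers (the bandwidth figures are my approximations for the 40 GB / 80 GB SXM parts, not from the note above):

```python
# Assumed A100 figures: 312 TFLOP/s dense BF16/FP16 tensor-core peak,
# ~1.6 TB/s HBM bandwidth (40 GB SKU), ~2.0 TB/s (80 GB SKU).
peak_flops = 312e12      # FLOP/s
hbm_bw_40gb = 1.6e12     # bytes/s, approximate
hbm_bw_80gb = 2.0e12     # bytes/s, approximate

# Arithmetic intensity (FLOPs per byte moved) a kernel needs to reach
# the compute roof instead of being memory-bound:
ridge_40 = peak_flops / hbm_bw_40gb   # ~195 FLOPs/byte
ridge_80 = peak_flops / hbm_bw_80gb   # 156 FLOPs/byte
```

Anything well below these intensities (e.g. elementwise ops) is bandwidth-bound on this part.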
| """ | |
| Optimized NanoGPT Adder: 66 params + 0 buffers | |
| transformer.wte.A [10, 1] = 10 | |
| transformer.wte.B [1, 4] = 4 | |
| transformer.h.0.attn.q_proj.angle_h0 [1] = 1 | |
| transformer.h.0.attn.q_proj.angle_h1 [1] = 1 | |
| transformer.h.0.attn.q_proj.scale [1] = 1 | |
| transformer.h.0.attn.k_proj.weight [4, 2] = 8 | |
| transformer.h.0.attn.v_proj.u [4, 1] = 4 |
| """58-param nanoGPT that adds any two 10-digit numbers. No training. | |
| Down from 130 params. HONEST counting: every stored numerical value is an | |
| nn.Parameter. Zero buffers. The only "free" things are structural choices | |
| (which dim connects where), control flow, and pure math (sin/cos/arange | |
| for PE generation — same convention as the original). | |
| Parameter budget: | |
| wte.A (10×1) + wte.B (1×4) = 14 [factorized embedding] | |
| Q (2 angles + 1 scale) = 3 [rotation-parameterized] |
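One plausible reading of the rotation-parameterized Q budget above, sketched with NumPy (hypothetical reconstruction: the function name, the block-diagonal layout, and sharing one scale across both heads are my assumptions, not the repo's code):

```python
import numpy as np

def rotation_q_proj(angle_h0, angle_h1, scale):
    # Expand 3 scalars into a 4x4 block-diagonal Q projection:
    # one scaled 2-D rotation per head (2 heads x head_dim 2).
    def rot(theta):
        c, s = np.cos(theta), np.sin(theta)
        return np.array([[c, -s], [s, c]])
    W = np.zeros((4, 4))
    W[:2, :2] = scale * rot(angle_h0)   # head 0
    W[2:, 2:] = scale * rot(angle_h1)   # head 1
    return W
```

The point is the counting: a dense 4×4 Q would cost 16 params, but constraining each head's block to a scaled rotation leaves only 2 angles + 1 scale = 3 stored values.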
| """ | |
| Dynamic NanoGPT Adder: 130 params + 0 buffers | |
| transformer.wte.A [10, 1] = 10 | |
| transformer.wte.B [1, 4] = 4 | |
| transformer.h.0.attn.c_attn.weight [12, 4] = 48 | |
| transformer.h.0.attn.c_proj.weight [4, 4] = 16 | |
| transformer.h.0.mlp.c_fc.weight [4, 4] = 16 | |
| transformer.h.0.mlp.c_fc.bias [4] = 4 | |
| transformer.h.0.mlp.c_proj.u [4, 1] = 4 |
| """190-param nanoGPT that adds any two 10-digit numbers. All weights hand-coded.""" | |
| import math | |
| from dataclasses import dataclass | |
| import torch, torch.nn as nn | |
| from torch.nn import functional as F | |
| # === NanoGPT (from github.com/karpathy/nanoGPT/blob/master/model.py) === | |
| # Modifications: sinusoidal PE buffer, configurable mlp_hidden, c_fc bias always on | |
| class CausalSelfAttention(nn.Module): |
# This solver covers a rectangular grid of '.' and 'X' with two non-rotatable tiles:
#   • A (U-shape): on row i it occupies (i,j) and (i,j+3), and on row i+1 it occupies (i+1, j..j+3).
#   • B (horizontal domino): occupies (i,j) and (i,j+1) on the same row.
# We scan left→right, row→row using top-down DP with memoization. The DP state is
# (i, j, needA, anchors):
#   – i, j: current row and column being decided.
#   – needA: tuple[bool] of length W marking cells in the CURRENT row that are forced to 'A'
#     because they are the bottom strip of some A placed in the PREVIOUS row.
#   – anchors: tuple of columns where we have already anchored an A in the CURRENT row before j;
#     each anchor at column a also forces (i, a+3) to be 'A' (its top-right leg) and
#     the bottom strip (i+1, a..a+3) to be 'A' in the NEXT row.
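A minimal runnable sketch of this DP (my reconstruction, not the solver's actual code: it counts tilings, and it folds needA/anchors into two coverage bitmasks, cur for the current row and nxt for the forced bottom strips of the next row, instead of tuples):

```python
from functools import lru_cache

def count_tilings(grid):
    """Count ways to tile '.' cells ('X' = blocked) with the two tiles:
    A (U-shape): (i,j), (i,j+3) on row i plus (i+1, j..j+3);
    B (horizontal domino): (i,j), (i,j+1). Neither tile may rotate."""
    H, W = len(grid), len(grid[0])

    @lru_cache(maxsize=None)
    def dp(i, j, cur, nxt):
        # cur: bitmask of row-i cells already covered by earlier placements
        # nxt: bitmask of row-(i+1) cells covered by As anchored in row i
        if i == H:
            return 1
        if j == W:
            return dp(i + 1, 0, nxt, 0)     # carry forced cells into next row
        if grid[i][j] == 'X' or (cur >> j) & 1:
            return dp(i, j + 1, cur, nxt)   # blocked or already covered: skip
        ways = 0
        # B: horizontal domino on (i,j)-(i,j+1)
        if j + 1 < W and grid[i][j + 1] == '.' and not (cur >> (j + 1)) & 1:
            ways += dp(i, j + 2, cur, nxt)
        # A: U-shape anchored at (i,j); needs its top-right leg and bottom strip free
        if (j + 3 < W and i + 1 < H
                and grid[i][j + 3] == '.' and not (cur >> (j + 3)) & 1
                and all(grid[i + 1][k] == '.' and not (nxt >> k) & 1
                        for k in range(j, j + 4))):
            ways += dp(i, j + 1, cur | (1 << (j + 3)), nxt | (0b1111 << j))
        return ways

    return dp(0, 0, 0, 0)
```

For example, a 2×4 all-'.' grid has exactly two tilings: one A plus one B, or four Bs.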
| Year | Acquirer | Target | Price / Structure | Core Focus |
|---|---|---|---|---|
| 2010 | Apple | Siri | ≈ $200 M | Voice assistant / NLP |
| 2013 | Google | DNNresearch (Hinton) | ≈ $44 M | Deep CNN research (AlexNet) |
| 2014 | Google | DeepMind | $500–650 M | Deep RL & general AI |
| 2015 | Facebook | Wit.ai | n/d | Speech / NLU APIs |
| 2016 | Intel | Nervana | $350–400 M | DL ASIC & framework |
| 2016 |