Skip to content

Instantly share code, notes, and snippets.

@eiz
Last active February 26, 2026 07:09
Show Gist options
  • Select an option

  • Save eiz/5a85c2465b1df44a97544719bed5b2c8 to your computer and use it in GitHub Desktop.

Select an option

Save eiz/5a85c2465b1df44a97544719bed5b2c8 to your computer and use it in GitHub Desktop.
// asm.s — self-hosting aarch64 assembler
//
// reads an aarch64 assembly source file (GAS-compatible subset),
// emits a static PIE ELF binary directly. no linker required.
//
// usage: asm <input.s> <output>
//
// this file was created solely by Claude (Opus 4.6 and Sonnet 4.6) and is
// in the public domain (or CC0 1.0, if you prefer).
//
// to bootstrap with GNU tools: gcc -o asm0 -nostdlib asm.s && ./asm0 asm.s asm
//
// current binary size: 5543 bytes
//
// ── supported instructions ────────────────────────────────────────────────
//
// arithmetic/logic:
// add Rd, Rn, #imm12 | Rm [, lsl #N] sub (same forms)
// adds Rd, Rn, #imm12 | Rm [, lsl #N] subs (same forms)
// cmp Rn, #imm12 | Rm [, lsl #N] cmn (same forms)
// and Rd, Rn, #bitmask | Rm [, lsl #N] orr, eor (same forms)
// ands Rd, Rn, #bitmask | Rm [, lsl #N] (flag-setting AND)
// tst Rn, #bitmask | Rm (ANDS alias, Rd=XZR)
// bic Rd, Rn, Rm
// neg Rd, Rm mvn Rd, Rm
// mul Rd, Rn, Rm msub Rd, Rn, Rm, Ra
// madd Rd, Rn, Rm, Ra (Rd = Ra + Rn*Rm)
// udiv Rd, Rn, Rm sdiv Rd, Rn, Rm
// nop
//
// moves:
// mov Rd, Rm | #imm (MOVZ/MOVN-encodable) | SP
// movz Rd, #imm16 [, lsl #N] movn, movk (same forms)
//
// shifts:
// lsl Rd, Rn, Rm | #N lsr, asr (same forms)
// ror Rd, Rn, Rm
//
// bitfield:
// ubfm Rd, Rn, #immr, #imms sbfm, bfm (same forms)
// ubfx Rd, Rn, #lsb, #width sbfx (same form)
// ubfiz Rd, Rn, #lsb, #width sbfiz, bfi (same form)
// bfxil Rd, Rn, #lsb, #width
// sxtb Rd, Wn sxth Rd, Wn sxtw Rd, Wn
// uxtb Wd, Wn uxth Wd, Wn
//
// bit manipulation:
// clz Rd, Rn rbit Rd, Rn
//
// branches:
// b label bl label br Xn blr Xn ret
// b.cc label (eq ne hs lo mi pl vs vc hi ls ge lt gt le al cs cc)
// cbz Rt, label cbnz Rt, label
// tbz Rt, #bit, label tbnz Rt, #bit, label
//
// conditional:
// csel Rd, Rn, Rm, cc csinc Rd, Rn, Rm, cc
// cset Rd, cc
//
// address:
// adr Rd, expr adrp Rd, symbol
//
// load/store (single):
// ldr Rt, [Rn {, #imm | :lo12:sym}] str (same forms)
// ldr Rt, [Rn, Rm {, lsl #N}] str (same forms)
// ldr Rt, [Rn, #simm9]! str (pre-index)
// ldr Rt, [Rn], #simm9 str (post-index)
// ldr Rt, label (PC-relative literal)
// ldrb (same addressing modes) strb
// ldrh (same addressing modes) strh
// ldrsb Rt, [Rn {, ...}] ldrsh, ldrsw
//
// load/store (pair):
// ldp Rt1, Rt2, [Rn {, #imm}] stp (same forms)
// ldp Rt1, Rt2, [Rn, #imm]! stp (pre-index)
// ldp Rt1, Rt2, [Rn], #imm stp (post-index)
//
// system:
// svc #imm16
//
// ── registers ─────────────────────────────────────────────────────────────
// x0-x30, w0-w30, xzr, wzr, sp (no fp/lr aliases)
//
// ── directives ────────────────────────────────────────────────────────────
// .text .bss .section .rodata .global name .equ name, expr
// .word expr .ascii "str" .asciz "str"
// .align N .skip N
//
// ── expressions ───────────────────────────────────────────────────────────
// operators: | & + - * << >> unary: ~ - grouping: ( )
// atoms: 123 0xFF 'A' '\n' . label :lo12:expr
// labels: name: N: (numeric 0-9, ref as Nf/Nb)
// comments: //
//
// ── output ────────────────────────────────────────────────────────────────
// ELF64 static PIE, single LOAD segment (RWX), no section headers.
// .text is dictionary-compressed; decompressor stub runs at entry.
//
// ── syscall numbers ───────────────────────────────────────────────────────
// Linux syscall numbers, loaded into x8 before `svc #0`.
.equ SYS_exit, 93
.equ SYS_read, 63
.equ SYS_write, 64
.equ SYS_openat, 56
.equ SYS_close, 57
.equ SYS_fchmod, 52
// ── file constants ────────────────────────────────────────────────────────
.equ AT_FDCWD, -100
.equ O_RDONLY, 0
.equ O_WRONLY_CREAT_TRUNC, 577 // O_WRONLY|O_CREAT|O_TRUNC = 1|64|512
.equ STDERR, 2
// ── ELF constants ─────────────────────────────────────────────────────────
.equ ELF_HEADER_SIZE, 64
.equ PHDR_SIZE, 56
.equ CODE_START, 120 // ELF_HEADER_SIZE + PHDR_SIZE
// ── compression constants ─────────────────────────────────────────────────
// STUB_SIZE and STUB_DATA_* are computed from labels after _decomp_stub_end
.equ FULL_DICT_ENTRIES, 126
.equ HALF_DICT_ENTRIES, 128
.equ FULL_DICT_SIZE, 504 // 126 * 4
.equ HALF_DICT_SIZE, 256 // 128 * 2
// ── section IDs ───────────────────────────────────────────────────────────
// The IDs double as byte offsets of the matching ST_*_POS slot, and
// (id >> 3) + ST_TEXT_BASE/8 selects the matching ST_*_BASE slot.
.equ SEC_TEXT, 0 // pre-multiplied by 8 for direct state block indexing
.equ SEC_RODATA, 8
.equ SEC_BSS, 16
// ── state block offsets (all u64) ─────────────────────────────────────────
// The block is addressed through x28. Offset 96 is not assigned to any field.
.equ ST_TEXT_POS, 0 // current offset within .text
.equ ST_RODATA_POS, 8 // current offset within .rodata
.equ ST_BSS_POS, 16 // current offset within .bss
.equ ST_CUR_SEC, 24 // current section (SEC_TEXT/RODATA/BSS)
.equ ST_TEXT_BASE, 32 // virtual address of .text start
.equ ST_RODATA_BASE, 40 // virtual address of .rodata start
.equ ST_BSS_BASE, 48 // virtual address of .bss start
.equ ST_PASS, 56 // current pass (1 or 2)
.equ ST_LINE_NUM, 64 // current source line number
.equ ST_INPUT_LEN, 72 // input file length in bytes
.equ ST_FILE_SIZE, 80 // total output file size
.equ ST_MEM_SIZE, 88 // total memory size (file + bss)
.equ ST_INPUT_NAME, 104 // pointer to input filename string
.equ ST_OUTPUT_NAME, 112 // pointer to output filename string
.equ ST_SIZE, 120 // size of the state block in bytes (last field ends at 112 + 8)
// ── symbol table entry layout (32 bytes) ──────────────────────────────────
// name_ptr u64 @ 0 pointer to name in input buffer (0 = empty slot)
// name_len u32 @ 8 length of name
// flags u64 @ 16 SYMF_* bits
// value u64 @ 24 address or .equ value
.equ SYM_ENT_SIZE, 32
.equ SYM_NAME_PTR, 0
.equ SYM_NAME_LEN, 8
.equ SYM_FLAGS, 16
.equ SYM_VALUE, 24
.equ SYM_TBL_SLOTS, 1024 // must be power of 2
// ── symbol flags ──────────────────────────────────────────────────────────
.equ SYMF_DEFINED, 1
.equ SYMF_GLOBAL, 2
.equ SYMF_EQU, 4
.equ SYMF_SEC_SHIFT, 4 // section stored in bits 5:4 of flags
// ── buffer sizes ──────────────────────────────────────────────────────────
.equ INPUT_BUF_SIZE, 1048576 // 1 MB
.equ TEXT_BUF_SIZE, 1048576 // 1 MB
.equ RODATA_BUF_SIZE, 1048576 // 1 MB
.equ SYM_TBL_BYTES, 32768 // SYM_TBL_SLOTS * SYM_ENT_SIZE
// ── BSS offsets from x28 (state block pointer) ────────────────────────────
// These mirror the order of the `.skip` reservations in the .bss section.
// NUMLAB_DIGITS is used here before its definition a few lines below.
.equ NUMLAB_CNTS_OFF, ST_SIZE // 120
.equ NUMLAB_CURS_OFF, NUMLAB_CNTS_OFF + NUMLAB_DIGITS * 8 // 200
.equ INPUT_BUF_OFF, NUMLAB_CURS_OFF + NUMLAB_DIGITS * 8 // 280
// ── numeric labels ────────────────────────────────────────────────────────
.equ NUMLAB_MAX_DEFS, 128 // max definitions per digit
.equ NUMLAB_DIGITS, 10 // digits 0-9
// ══════════════════════════════════════════════════════════════════════════
// BSS
// ══════════════════════════════════════════════════════════════════════════
// The order of these reservations is load-bearing: code reaches the buffers
// by arithmetic from x28 (= state) instead of through their labels.
//   input_buf  = x28 + INPUT_BUF_OFF
//   text_buf   = input_buf + 1 MB            (pinned in x27 by _start)
//   rodata_buf = text_buf + 1 MB
//   sym_table  = text_buf + 2 MB
//   numlab_defs directly follows sym_table   (relied on by rebase_symbols)
.bss
.align 4
state: .skip ST_SIZE
numlab_cnts: .skip NUMLAB_DIGITS * 8 // per-digit count of definitions seen in pass 1
numlab_curs: .skip NUMLAB_DIGITS * 8 // per-digit cursor, advanced during pass 2
input_buf: .skip INPUT_BUF_SIZE
text_buf: .skip TEXT_BUF_SIZE
rodata_buf: .skip RODATA_BUF_SIZE
sym_table: .skip SYM_TBL_BYTES
// numeric label storage: 10 digits × 128 defs × 8 bytes
numlab_defs: .skip NUMLAB_DIGITS * NUMLAB_MAX_DEFS * 8
// ══════════════════════════════════════════════════════════════════════════
// Read-only data
// ══════════════════════════════════════════════════════════════════════════
.section .rodata
// Diagnostic strings. Each is NUL-terminated and already ends in '\n';
// die_msg measures them with strlen_x1 and writes them to stderr.
msg_usage: .asciz "usage: asm <input.s> <output>\n"
msg_open: .asciz "cannot open input file\n"
msg_create: .asciz "cannot create output file\n"
msg_syntax: .asciz "syntax error\n"
msg_undef: .asciz "undefined symbol\n"
msg_badins: .asciz "unknown instruction\n"
msg_badimm: .asciz "invalid immediate\n"
// condition code XOR lookup: cond_xor_tbl[(c0^c1) & 0x1F] = cond code (31=invalid)
// 32 one-byte entries packed four per .word (little-endian); the code that
// indexes this table is outside this section of the file.
cond_xor_tbl:
.word 0x030A0803
.word 0x1F1F0604
.word 0x011F0D1F
.word 0x1F1F0E1F
.word 0x0C1F1F02
.word 0x1F1F0700
.word 0x021F1F0B
.word 0x091F1F05
// operator table for expression parser: 2-byte entries (char, packed+0x20), sentinel=\0
// packed = (prec<<4)|opcode: | →0x10 & →0x21 + →0x32 - →0x33 * →0x44
// parse_expr subtracts 32 from the second byte after loading it. The
// two-character operators << and >> are not in this table; parse_expr
// recognises them separately.
op_table: .ascii "|0&A+R-S*d\0"
// ══════════════════════════════════════════════════════════════════════════
// Code
// ══════════════════════════════════════════════════════════════════════════
.text
.global _start
// ──────────────────────────────────────────────────────────────────────────
// _start — entry point
//
// Flow: check argc → read the input file → pass 1 → compute section bases →
// rebase symbols → pass 2 → compress .text → build ELF header + program
// header on the stack → write the output file → fchmod → exit.
//
// Registers pinned here and left untouched for the rest of the run:
//   x28 = &state   x29 = 0x100000 (1 MB)   x27 = text_buf
// Later in this routine: x19 = output fd, x20 = compressed stream size,
// x21 = rodata size.
// The results of the write syscalls are not checked.
// ──────────────────────────────────────────────────────────────────────────
_start:
// grab argc / argv from the stack
ldr x0, [sp] // argc
cmp x0, #3
b.lt err_usage // need both <input.s> and <output>
// set up state block pointer (x28 is callee-saved, lives forever)
adr x28, state
// pin x29 = 0x100000 (1 MB stride between section buffers)
movz x29, #0x10, lsl #16
// pin x27 = text_buf (x28 + INPUT_BUF_OFF + INPUT_BUF_SIZE)
add x27, x28, x29
add x27, x27, #INPUT_BUF_OFF
// store input/output filenames
ldp x1, x0, [sp, #16] // x1=argv[1] (input), x0=argv[2] (output)
stp x1, x0, [x28, #ST_INPUT_NAME] // fills ST_INPUT_NAME and ST_OUTPUT_NAME
// ── open and read the input file ──────────────────────────────────────
// x1 already holds input filename from ldp above
mov x0, #AT_FDCWD
mov x2, #O_RDONLY
mov x8, #SYS_openat
svc #0
tbnz x0, #63, err_open // negative return = error
// x0 = fd. A single read call: at most INPUT_BUF_SIZE bytes are taken and
// a short read is not retried. The fd is not closed afterwards.
add x1, x28, #INPUT_BUF_OFF // input_buf
mov x2, #INPUT_BUF_SIZE
mov x8, #SYS_read
svc #0
tbnz x0, #63, err_open
str x0, [x28, #ST_INPUT_LEN]
// ── pass 1: collect symbols and measure sections ──────────────────────
mov x0, #1
bl run_pass
// ── compute section base addresses ────────────────────────────────────
ldp x1, x2, [x28, #ST_TEXT_POS] // text_pos, rodata_pos
mov x0, #CODE_START // text_base
add x1, x0, x1 // rodata_base = text_base + text_size
stp x0, x1, [x28, #ST_TEXT_BASE] // fills ST_TEXT_BASE and ST_RODATA_BASE
add x2, x1, x2 // bss_base = rodata_base + rodata_size
str x2, [x28, #ST_BSS_BASE]
ldr x3, [x28, #ST_BSS_POS]
add x3, x2, x3 // mem_size = bss_base + bss_size
stp x2, x3, [x28, #ST_FILE_SIZE] // ST_FILE_SIZE = bss_base, ST_MEM_SIZE = mem_size
// ── rebase symbols: add section bases to label addresses ──────────────
bl rebase_symbols
// ── pass 2: encode instructions and emit data ─────────────────────────
mov x0, #2
bl run_pass
// ── compress .text section ────────────────────────────────────────────
bl compress_text
mov x20, x0 // x20 = compressed stream size
// ── allocate stack: 128 (ELF header) ──────────────────────────────────
// Only the first CODE_START (120) bytes are filled and written.
sub sp, sp, #128
// p_filesz = CODE_START + STUB_SIZE + FULL_DICT_SIZE + HALF_DICT_SIZE + stream + rodata
ldr x21, [x28, #ST_RODATA_POS] // x21 = rodata_size (callee-saved)
add x12, x20, x21 // stream + rodata
add x12, x12, #(CODE_START + STUB_SIZE + FULL_DICT_SIZE + HALF_DICT_SIZE)
// DECOMP_DEST_OFF = ceil_page(p_filesz) — offset from stub base
add x11, x12, #0xFFF
and x11, x11, #0xFFFFFFFFFFFFF000 // ceil to page
// p_memsz = ceil_page(p_filesz) + total_mem_size
ldr x13, [x28, #ST_MEM_SIZE]
add x13, x13, x11
// ELF magic + e_ident[0..7] + zeros [8..15]
// bytes: 7f 'E' 'L' 'F', 02 (64-bit class), 01 (little-endian), 01 (version), 00
movz x9, #0x457f
movk x9, #0x464c, lsl #16
movk x9, #0x0102, lsl #32
movk x9, #0x0001, lsl #48
stp x9, xzr, [sp]
// e_type=3, e_machine=0xB7, e_version=1 + e_entry=0x78 (stub entry)
movz x9, #3
movk x9, #0x00B7, lsl #16
movk x9, #1, lsl #32
mov x10, #CODE_START // e_entry = stub at 0x78
stp x9, x10, [sp, #16]
// e_phoff=64 + e_shoff=0
mov x9, #64
stp x9, xzr, [sp, #32]
// e_flags=0|e_ehsize=64|e_phentsize=56 + e_phnum=1|rest=0
lsl x9, x9, #32 // x9 was 64 → 64 << 32 places it in the e_ehsize field
movk x9, #0x0038, lsl #48
mov x10, #1
stp x9, x10, [sp, #48]
// p_type=1|p_flags=7 + p_offset=0
movk x10, #7, lsl #32 // x10 still holds 1 (p_type) in its low half
stp x10, xzr, [sp, #64]
// p_vaddr=0, p_paddr=0
stp xzr, xzr, [sp, #80]
// p_filesz + p_memsz (compressed values)
stp x12, x13, [sp, #96]
// p_align = 0x10000
mov x9, #0x10000
str x9, [sp, #112]
// ── open output file ──────────────────────────────────────────────────
mov x0, #AT_FDCWD
ldr x1, [x28, #ST_OUTPUT_NAME]
mov x2, #O_WRONLY_CREAT_TRUNC
mov w3, #493 // 0755 octal
mov x8, #SYS_openat
svc #0
tbnz x0, #63, err_create
mov x19, x0 // fd
// write ELF header + program header (120 bytes)
// x8 is set once; the following svc_x19 calls reuse it as SYS_write.
mov x8, #SYS_write
mov x1, sp
mov x2, #CODE_START
bl svc_x19
// header buffer no longer needed — reuse sp[0..7] for stub data (32-bit)
// w11 = DECOMP_DEST_OFF, w21 = rodata size
stp w11, w21, [sp]
// write decompressor stub code (from .text, excludes data block)
adr x1, _decomp_stub_start
mov x2, #STUB_DATA_DECOMP_DEST
bl svc_x19
// write patched stub data block (from stack)
mov x1, sp
mov x2, #(STUB_SIZE - STUB_DATA_DECOMP_DEST)
bl svc_x19
// write full_dict + half_dict (adjacent in memory)
adr x1, full_dict
mov x2, #(FULL_DICT_SIZE + HALF_DICT_SIZE)
bl svc_x19
// write compressed stream
// The stream is read from input_buf; presumably compress_text leaves it
// there — confirm against compress_text.
add x1, x28, #INPUT_BUF_OFF
mov x2, x20
bl svc_x19
// write .rodata section
add x1, x27, x29 // rodata_buf = text_buf + 1 MB
mov x2, x21 // rodata_size (saved in x21)
bl svc_x19
// fchmod to make executable (returns 0 on success = our exit code)
mov x1, #493
mov x8, #SYS_fchmod
bl svc_x19
b exit_common
// ──────────────────────────────────────────────────────────────────────────
// Error exits
// ──────────────────────────────────────────────────────────────────────────
// Fatal-error entry points. Each loads a message pointer into x1 and reaches
// die_msg (err_create by falling through). None of them returns.
err_usage:
adr x1, msg_usage
b die_msg
err_open:
adr x1, msg_open
b die_msg
err_create:
adr x1, msg_create
// die_msg — write the NUL-terminated string at x1 to stderr, then exit(1)
die_msg:
bl strlen_x1 // x2 = length of the message
bl write2
mov x0, #1 // exit status 1
// exit_common — exit with the status currently in x0
exit_common:
mov x8, #SYS_exit
svc #0
// svc_x19 — issue a syscall on the file descriptor held in x19
// in:  x8 = syscall number, x1/x2 = remaining arguments, x19 = fd
// out: x0 = syscall result
svc_x19:
mov x0, x19
svc #0
ret
// write2 — write x2 bytes starting at x1 to stderr
// out: x0 = syscall result; x8 is left holding SYS_write
write2:
mov x0, #STDERR
mov x8, #SYS_write
svc #0
ret
// strlen_x1 — length of a NUL-terminated string
// in:  x1 = string pointer (left unchanged)
// out: x2 = number of bytes before the terminator
// clobbers: w10 (holds the terminator, 0, on return); flags are not touched
strlen_x1:
mov x2, #0 // index of the byte under test
strlen_x1_scan:
ldrb w10, [x1, x2]
cbz w10, strlen_x1_done // terminator found: x2 is the length
add x2, x2, #1
b strlen_x1_scan
strlen_x1_done:
ret
// ══════════════════════════════════════════════════════════════════════════
// Utility functions (spec §8.5)
//
// Calling convention: args in x0-x7, return in x0 (x1 for pairs).
// Leaf functions — no stack frame needed.
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// skip_ws — advance pointer past whitespace
// x0 = pointer
// returns x0 = first position whose byte is NUL or greater than ' ' (0x20)
//         w9 = the byte at that position (several callers rely on this)
//
// Every non-NUL byte <= 0x20 is skipped, not only space and tab.
// skip1_ws is a second entry point that first steps over one byte
// unconditionally.
// ──────────────────────────────────────────────────────────────────────────
skip1_ws:
add x0, x0, #1
skip_ws:
1: ldrb w9, [x0]
cbz w9, 2f // NUL: stop without advancing
cmp w9, #' '
csinc x0, x0, x0, hi // byte > ' ': keep x0; otherwise x0 += 1
b.ls 1b // byte <= ' ' was skipped: examine the next one
2: ret
// Thin wrappers around skip_ws that take the start pointer from another
// register. All return as skip_ws does (x0 = pointer, w9 = byte there).
// The two composite wrappers park the return address in x16 around the
// inner call, so x16 is clobbered by them.

// ws_x2_skip1 — skip whitespace from x2, step over one byte, skip whitespace again
ws_x2_skip1:
mov x16, x30
bl ws_x2
mov x30, x16
b skip1_ws
// ws_x1 — skip_ws starting at x1
ws_x1:
mov x0, x1
b skip_ws
// ws_x19 — skip_ws starting at x19
ws_x19:
mov x0, x19
b skip_ws
// ws_x2 — skip_ws starting at x2
ws_x2:
mov x0, x2
b skip_ws
// ws_x21 — skip_ws starting at x21
ws_x21:
mov x0, x21
b skip_ws
// ws_x21_parse_reg — skip whitespace from x21, then tail-call parse_register
// (skip_ws leaves w9 loaded, which parse_register expects)
ws_x21_parse_reg:
mov x16, x30
bl ws_x21
mov x30, x16
b parse_register
// ──────────────────────────────────────────────────────────────────────────
// decode_escape — decode backslash escape character
// w9 = char after backslash; returns w9 = decoded character
//
// Mapping: '0' → 0, 'n' → 10, 't' → 9; any other character is returned
// unchanged (so "\\" and "\'" yield the character itself).
// Clobbers w10 and the flags.
// ──────────────────────────────────────────────────────────────────────────
decode_escape:
cmp w9, #'0'
csel w9, wzr, w9, eq // '0' → NUL
cmp w9, #'n'
mov w10, #10
csel w9, w10, w9, eq // 'n' → newline
cmp w9, #'t'
mov w10, #9
csel w9, w10, w9, eq // 't' → tab
ret
// ──────────────────────────────────────────────────────────────────────────
// parse_int — parse decimal, hex, or character literal
// x0 = pointer (at first character of the number)
// returns x0 = value, x1 = pointer past the parsed number
//
// formats: 123 -42 0x1F 0xFF 'A' '\n'
//
// Notes:
//   - no digits at all yields value 0 with the pointer advanced only past
//     an optional '-' (and "0x" if present)
//   - overflow is not detected; the accumulator wraps at 64 bits
//   - a character literal's closing quote is skipped without being checked
// Clobbers w9, w10, x11, x12, flags; x16 on the escape path.
// ──────────────────────────────────────────────────────────────────────────
parse_int:
ldrb w9, [x0]
// character literal?
cmp w9, #'\''
b.eq parse_int_char
// negative?
cmp w9, #'-'
csinc x0, x0, x0, ne // advance past '-' if negative
cset x11, eq // sign flag: 1 if '-', else 0
ldrb w9, [x0] // reload current char
// hex prefix?
cmp w9, #'0'
b.ne parse_int_dec
ldrb w10, [x0, #1]
orr w10, w10, #0x20 // accept both "0x" and "0X"
cmp w10, #'x'
b.eq parse_int_hex
parse_int_dec:
mov x12, #0 // accumulator
2: ldrb w9, [x0]
sub w10, w9, #'0'
cmp w10, #9
b.hi parse_int_done // not a decimal digit: number ends here
add x12, x12, x12, lsl #2 // x12 * 5
add x12, x10, x12, lsl #1 // digit + x12*10
add x0, x0, #1
b 2b
parse_int_hex:
add x0, x0, #2 // skip "0x"
mov x12, #0
3: ldrb w9, [x0]
sub w10, w9, #'0'
cmp w10, #9
b.ls 4f // 0-9: w10 already holds the digit value
orr w10, w9, #0x20 // fold uppercase to lowercase
sub w10, w10, #'a'
cmp w10, #5
b.hi parse_int_done // not a-f: number ends here
add w10, w10, #10
4: add x12, x10, x12, lsl #4 // acc = acc*16 + digit
add x0, x0, #1
b 3b
parse_int_done:
cbz x11, parse_int_ret
neg x12, x12 // apply the leading '-'
parse_int_ret:
mov x1, x0
mov x0, x12
ret
parse_int_char:
add x0, x0, #1 // skip opening quote
ldrb w9, [x0], #1 // load char, advance past it
cmp w9, #'\\'
b.ne 1f
// escape: x0 is past backslash already
ldrb w9, [x0], #1 // load escape char, advance past it
mov x16, x30 // no stack frame: keep the return address in x16
bl decode_escape
mov x30, x16
1: mov x12, x9
add x0, x0, #1 // skip closing quote
b parse_int_ret // the sign flag in x11 is not consulted on this path
// ──────────────────────────────────────────────────────────────────────────
// parse_ident — parse an identifier [a-zA-Z_][a-zA-Z0-9_]*
// x0 = pointer
// returns x0 = start of ident, x1 = length, x2 = pointer past ident
// if no valid identifier, x1 = 0
//
// Clobbers x9, w10, w11 and the flags.
// ──────────────────────────────────────────────────────────────────────────
parse_ident:
mov x9, x0 // start
ldrb w10, [x0], #1
b pi_check_first // the first character skips the digit test
1: ldrb w10, [x0], #1
// loop: accept digits (not valid for first char)
sub w11, w10, #'0'
cmp w11, #9
b.ls 1b
pi_check_first:
// accept underscore and letters
cmp w10, #'_'
b.eq 1b
orr w11, w10, #0x20 // fold to lowercase for the range test
sub w11, w11, #'a'
cmp w11, #25
b.ls 1b
// end of identifier (or not an identifier if x0 == x9)
sub x2, x0, #1 // end pointer (x0 is one past due to post-index)
sub x1, x2, x9 // length (0 if no ident)
mov x0, x9 // start
ret
// ──────────────────────────────────────────────────────────────────────────
// parse_register — parse register name
// x0 = pointer, w9 = byte at [x0] (as left by skip_ws)
// returns x0 = reg number (0-31), x1 = is_64bit, x2 = pointer past
// on error: x0 = -1
//
// Accepts sp (→ 31, 64-bit), xzr/wzr (→ 31), x0-x30 and w0-w30.
// Failures branch to parse_reg_fail, which is defined outside this routine.
// The xzr/wzr and numbered forms do not check the character that follows
// the name. Clobbers w10-w13 and the flags.
// ──────────────────────────────────────────────────────────────────────────
parse_register:
// w9 pre-loaded by caller (skip_ws sets w9 = first non-ws char)
// sp?
ldrh w10, [x0]
movz w11, #0x7073 // 'sp' in little-endian
cmp w10, w11
b.ne 1f
// make sure it's not a longer ident (e.g. "spaghetti")
ldrb w10, [x0, #2]
orr w11, w10, #0x20
sub w11, w11, #'a'
cmp w11, #25
b.ls 1f // followed by a letter
sub w11, w10, #'0'
cmp w11, #9
b.ls 1f // followed by a digit
cmp w10, #'_'
b.eq 1f // followed by an underscore
add x2, x0, #2 // end pointer (before clobbering x0)
mov x0, #31
mov x1, #1
ret
1: cmp w9, #'x'
cset x1, eq // x1=1 if 'x' (64-bit), else 0
b.eq parse_reg_xw
cmp w9, #'w'
b.ne parse_reg_fail
parse_reg_xw:
// check for xzr/wzr — load 4 bytes, extract bytes 1-2 as 16-bit LE
ldr w10, [x0]
ubfx w10, w10, #8, #16
movz w11, #0x727A // 'z' | ('r' << 8) in little-endian
cmp w10, w11
b.ne parse_reg_num
add x2, x0, #3
mov x0, #31
ret
parse_reg_num:
// x1 = is_64bit from cset above; not modified by this code
ldrb w12, [x0, #1] // first digit
sub w12, w12, #'0'
cmp w12, #9
b.hi parse_reg_fail
ldrb w10, [x0, #2]
sub w11, w10, #'0'
cmp w11, #9
add x2, x0, #2 // end pointer (single digit); flags unaffected
b.hi 1f // single digit
add w13, w12, w12, lsl #2 // first * 5
add w12, w11, w13, lsl #1 // second + first * 10
add x2, x2, #1
1: cmp w12, #30
b.hi parse_reg_fail // x31/w31 and above are not register names
mov x0, x12
ret
// ──────────────────────────────────────────────────────────────────────────
// sym_lookup — find a symbol in the hash table
// x0 = name pointer, x1 = name length
// returns x0 = pointer to entry, x1 = 1 if found (0 if empty slot)
//
// Open addressing with linear probing. The table address is derived from
// the pinned registers: sym_table = x27 (text_buf) + 2 * x29 (1 MB).
// When the name is absent, x0 is the empty slot where it would be inserted.
//
// Also leaves x15 = name pointer and x16 = name length; sym_define reads
// them after the call. Clobbers x9-x14, x17 and the flags. x3 is not touched.
// The probe loop has no full-table check: it only ends on a match or an
// empty slot.
// ──────────────────────────────────────────────────────────────────────────
sym_lookup:
// leaf function — no frame needed, uses scratch registers only
mov x15, x0 // name ptr
mov x16, x1 // name len
// hash the name (inlined djb2)
// bytes are consumed from the last to the first
mov x9, #5381
cbz x1, 2f
1: sub x1, x1, #1
ldrb w12, [x0, x1]
add x9, x9, x9, lsl #5 // hash * 33
add x9, x9, x12
cbnz x1, 1b
// slot = hash & (SYM_TBL_SLOTS - 1)
2: and x17, x9, #(SYM_TBL_SLOTS - 1)
add x14, x27, x29, lsl #1 // sym_table = text_buf + 2*1MB
sym_lookup_probe:
// entry = &sym_table[slot * 32]
add x13, x14, x17, lsl #5 // entry pointer in x13
// check if slot is empty (name_ptr == NULL)
ldr x12, [x13, #SYM_NAME_PTR]
cbz x12, sym_lookup_empty
// compare name_len
ldr w11, [x13, #SYM_NAME_LEN]
cmp w11, w16
b.ne sym_lookup_next
// compare name bytes (x12 = direct pointer into input buffer)
// the loop assumes a length of at least 1
mov x0, x16 // counter
1: sub x0, x0, #1
ldrb w9, [x12, x0]
ldrb w10, [x15, x0]
cmp w9, w10
b.ne sym_lookup_next
cbnz x0, 1b
mov x1, #1
b sym_lookup_ret
sym_lookup_next:
add x17, x17, #1
and x17, x17, #(SYM_TBL_SLOTS - 1) // wrap around the table
b sym_lookup_probe
sym_lookup_empty:
mov x1, #0
sym_lookup_ret:
mov x0, x13
ret
// ──────────────────────────────────────────────────────────────────────────
// sym_define — insert or update a symbol
// x0 = name pointer, x1 = name length, x2 = value, x3 = flags
// returns x0 = pointer to the symbol's table entry
//
// if new, stores the direct name pointer (into the input buffer) and length.
// flags are always OR'd into the entry's existing flags.
// the value is stored only when x3 includes SYMF_DEFINED.
//
// Why the value store is conditional: `.global name` calls this with
// value 0 and flags SYMF_GLOBAL on both passes, while labels are defined
// on pass 1 only. Storing unconditionally would reset an already-defined
// label's address to 0 whenever its .global line is processed again.
// A symbol that is new and only flagged keeps the zero value of its
// never-written BSS slot.
// ──────────────────────────────────────────────────────────────────────────
sym_define:
stp x19, x30, [sp, #-16]!
mov x19, x2 // value (callee-saved)
// x3 = flags (preserved across sym_lookup — leaf, doesn't touch x3)
bl sym_lookup
// x0 = entry, x15 = name ptr, x16 = name len (set by sym_lookup)
cbnz x1, sym_define_update
// ── new entry: store name pointer directly ────────────────────────────
str x15, [x0, #SYM_NAME_PTR]
str w16, [x0, #SYM_NAME_LEN]
sym_define_update:
// bit 0 = SYMF_DEFINED: flag-only calls must leave the value alone
tbz x3, #0, sym_define_flags
str x19, [x0, #SYM_VALUE]
sym_define_flags:
// OR in flags (don't clobber existing bits)
ldr x9, [x0, #SYM_FLAGS]
orr x9, x9, x3
str x9, [x0, #SYM_FLAGS]
ldp x19, x30, [sp], #16
ret
// ──────────────────────────────────────────────────────────────────────────
// error_at — print "filename:line: msg\n" to stderr and exit(1)
// x0 = message pointer (null-terminated)
//
// uses state block for filename and line number
//
// Does not return: it ends by branching to die_msg. For that reason it
// overwrites x19 and the link register freely, and builds the line-number
// text inside whatever stack frame is live at sp (bytes at and below
// sp+45) without reserving space.
// ──────────────────────────────────────────────────────────────────────────
error_at:
mov x19, x0 // msg ptr
// write filename
ldr x1, [x28, #ST_INPUT_NAME]
bl strlen_x1
bl write2
// build ":[linenum]: " in frame buffer and write it
ldr x9, [x28, #ST_LINE_NUM]
add x11, sp, #44 // x11 = one past the last digit
mov x10, x11 // x10 = write cursor, moves downward
mov x12, #10
3: udiv x13, x9, x12 // quotient
msub x14, x13, x12, x9 // remainder = next least-significant digit
add w14, w14, #'0'
strb w14, [x10, #-1]!
mov x9, x13
cbnz x9, 3b
mov w14, #':'
strb w14, [x10, #-1]! // leading ':' in front of the digits
movz w14, #0x203A // ": " as a little-endian halfword
strh w14, [x11] // trailing ": " after the digits
mov x1, x10
sub x2, x11, x10
add x2, x2, #2 // length = ':' + digits + ": "
bl write2
// write message (includes \n) and exit
mov x1, x19
b die_msg
// ══════════════════════════════════════════════════════════════════════════
// Pass driver and line processing
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// run_pass — iterate over all source lines
// x0 = pass number (1 or 2)
//
// Resets the section positions, the current section and the numeric-label
// cursors, then feeds each line of input_buf to process_line as a
// NUL-terminated string. The line terminator byte is overwritten with NUL
// for the duration of the call and restored afterwards.
//
// Register roles: x19 = start of current line, x20 = end of input,
// x21 = line number, x22 = end of current line, w23 = saved terminator.
// The frame has the same layout as process_line's, and the routine returns
// through process_line's epilogue at pl_done.
// ──────────────────────────────────────────────────────────────────────────
run_pass:
stp x30, x19, [sp, #-64]!
stp x20, x21, [sp, #16]
stp x22, x23, [sp, #32]
str x0, [x28, #ST_PASS]
// reset section positions and current section
stp xzr, xzr, [x28, #ST_TEXT_POS] // ST_TEXT_POS, ST_RODATA_POS
stp xzr, xzr, [x28, #ST_BSS_POS] // ST_BSS_POS, ST_CUR_SEC (0 = SEC_TEXT)
// reset line number (x21 = line counter, synced to state before process_line)
mov x21, #1
// reset numeric label cursors (inline zero fill)
add x0, x28, #NUMLAB_CURS_OFF
mov x2, #(NUMLAB_DIGITS * 8)
1: sub x2, x2, #1
strb wzr, [x0, x2]
cbnz x2, 1b
// set up input pointers
add x19, x28, #INPUT_BUF_OFF // input_buf
ldr x9, [x28, #ST_INPUT_LEN]
add x20, x19, x9 // x20 = end of input
run_pass_loop:
cmp x19, x20
b.ge pl_done // all input consumed: restore registers and return
// find end of line (newline or end of buffer)
mov x22, x19
1: cmp x22, x20
b.ge 2f
ldrb w10, [x22], #1
cmp w10, #'\n'
b.ne 1b
sub x22, x22, #1 // back up to newline
2:
// temporarily null-terminate
// (for an unterminated last line x22 == x20, the byte just past the input)
ldrb w23, [x22]
strb wzr, [x22]
// process the line
str x21, [x28, #ST_LINE_NUM]
mov x0, x19
bl process_line
// restore original byte
strb w23, [x22]
// advance past newline
add x19, x22, #1
add x21, x21, #1
b run_pass_loop
// ──────────────────────────────────────────────────────────────────────────
// process_line — handle one null-terminated source line
// x0 = line start (null-terminated)
//
// A line is any number of labels ("name:" or "N:") followed by at most one
// directive or instruction. Blank lines and "//" comments are ignored.
//
// Directives are dispatched on the first letter of their name only:
//   b → .bss     s → .section (name length 7) or .skip (any other length)
//   a → .align (5th letter 'n') or .ascii/.asciz
//   e → .equ     g → .global     w → .word     t → .text
// Any other directive is silently ignored.
//
// x19 tracks the parse position. The epilogue at pl_done is shared with
// run_pass and parse_expr, which build the same 64-byte frame.
// ──────────────────────────────────────────────────────────────────────────
process_line:
stp x30, x19, [sp, #-64]!
stp x20, x21, [sp, #16]
stp x22, x23, [sp, #32]
bl skip_ws
mov x19, x0
pl_check_content:
// empty line? (w9 pre-loaded by skip_ws / ws_x19)
cbz w9, pl_done
// comment? ( // )
ldrh w10, [x19]
movz w11, #0x2F2F // "//" in little-endian
cmp w10, w11
b.eq pl_done
// ── check for numeric label (digit followed by ':') ───────────────────
// w10 = ldrh from [x19]; for "N:", w10 = 0x3A30..0x3A39
movz w11, #0x3A30 // ':' << 8 | '0'
sub w10, w10, w11
cmp w10, #9
b.hi pl_not_numlab
// numeric label — record in pass 1
mov x0, x10 // digit (0-9)
bl handle_numlab
add x19, x19, #2 // past "N:"
b pl_after_label
pl_not_numlab:
// ── check for named label or mnemonic ─────────────────────────────────
cmp w9, #'.'
b.eq pl_directive
mov x0, x19
bl parse_ident
cbz x1, pl_done // no identifier → skip
// is it a label (followed by ':')?
ldrb w9, [x2]
cmp w9, #':'
b.ne pl_instruction
// ── named label ───────────────────────────────────────────────────────
add x19, x2, #1 // past ':'
// only define in pass 1 (pass 2 uses rebased values)
ldr x9, [x28, #ST_PASS]
tbnz x9, #1, pl_after_label // bit 1 is set for pass 2, clear for pass 1
// value = current section offset
ldr x11, [x28, #ST_CUR_SEC]
ldr x2, [x28, x11] // section IDs are the byte offsets of the ST_*_POS slots
// flags = DEFINED | (cur_section << SEC_SHIFT); x11 = sec*8
lsl x3, x11, #1 // sec*8 << 1 = sec << 4
orr x3, x3, #SYMF_DEFINED
bl sym_define // x0/x1 still hold the name from parse_ident
pl_after_label:
// something may follow the label on the same line: go round again
bl ws_x19
mov x19, x0
b pl_check_content
// ── directive (starts with '.') ───────────────────────────────────────
pl_directive:
add x0, x19, #1 // skip '.'
bl parse_ident
cbz x1, pl_done
// x0 = name start, x1 = name length, x2 = end pointer
mov x20, x0 // directive name
mov x21, x1 // directive length
mov x19, x2 // position after directive name
// dispatch on directive name — check first char then length
ldrb w9, [x20]
cmp w9, #'b'
mov x10, #SEC_BSS // doesn't affect flags
b.eq dir_sec_set
cmp w9, #'s'
b.ne 5f
cmp x21, #7 // 7 letters = "section"; anything else is taken as .skip
b.ne dir_skip
mov x10, #SEC_RODATA // the operand of .section is not examined
b dir_sec_set
5: cmp w9, #'a'
b.ne 6f
ldrb w10, [x20, #4] // 5th letter: 'n' (align), 'i' (ascii) or 'z' (asciz)
cmp w10, #'n'
b.eq dir_align
b dir_str_common
6: cmp w9, #'e'
b.ne 7f
// inline dir_equ:
// runs on both passes, so the value is re-evaluated on pass 2
bl ws_x19
bl parse_ident
cbz x1, pl_done
mov x20, x0 // symbol name
mov x21, x1 // symbol name length
bl ws_x2_skip1 // step over the separator after the name
bl parse_expr0
mov x2, x0
mov x3, #(SYMF_DEFINED | SYMF_EQU)
mov x0, x20
mov x1, x21
bl sym_define
b pl_done
7: cmp w9, #'g'
b.ne dir_word
// inline dir_global:
bl ws_x19
bl parse_ident
cbz x1, pl_done
mov x2, #0 // value argument: 0
mov x3, #SYMF_GLOBAL
bl sym_define
b pl_done
// ── instruction ───────────────────────────────────────────────────────
pl_instruction:
// x0 = mnemonic start, x1 = mnemonic length, x2 = position after
ldr x9, [x28, #ST_PASS]
tbnz x9, #1, encode_instruction // pass 2: encode
// pass 1: advance text pos by 4 (write garbage to buf — overwritten in pass 2)
b emit_inst_done
dir_word:
cmp w9, #'w'
b.ne dir_text
// .word <expr> — emit 4-byte little-endian value
bl parse_expr0_x19
mov x22, x0 // value
// always write to buffer (pass 1 writes are harmless, overwritten in pass 2)
ldr x11, [x28, #ST_CUR_SEC]
ldr x10, [x28, x11] // current pos
add x0, x27, x11, lsl #17 // text_buf + sec * 1MB
str w22, [x0, x10] // store 4 bytes
mov x0, #4
b advance_sec_pos
dir_text:
cmp w9, #'t'
mov x10, #SEC_TEXT // doesn't affect flags
b.ne pl_done // unrecognised directive: ignore the line
dir_sec_set:
str x10, [x28, #ST_CUR_SEC]
// shared epilogue: also the return path of run_pass and parse_expr
pl_done:
ldp x22, x23, [sp, #32]
ldp x20, x21, [sp, #16]
ldp x30, x19, [sp], #64
ret
// ══════════════════════════════════════════════════════════════════════════
// Directive handlers
//
// On entry: x19 = parse position after directive name
// x20, x21 available (saved by process_line's frame)
// Must jump to pl_done when finished.
//
// These are continuations of process_line, not callable functions: they
// run inside its stack frame and leave through its epilogue.
// ══════════════════════════════════════════════════════════════════════════
// .align N — align to 2^N boundary
// Only the section position is advanced; no padding bytes are written.
dir_align:
bl parse_expr0_x19
// x0 = N (alignment power)
mov x10, x0
ldr x11, [x28, #ST_CUR_SEC]
ldr x0, [x28, x11] // current position
// aligned = (pos + mask) & ~mask where mask = (1<<N)-1
mov x9, #1
lsl x9, x9, x10 // 1 << N
sub x9, x9, #1 // mask
add x0, x0, x9 // pos + mask
bic x0, x0, x9 // & ~mask = aligned position
str x0, [x28, x11]
b pl_done
// .skip N — advance by N bytes
dir_skip:
bl parse_expr0_x19
b advance_sec_pos
// .ascii/.asciz "string" — w10 still holds directive[4] ('i' or 'z')
dir_str_common:
cmp w10, #'z'
cset x21, eq // null_flag: 1 if asciz, 0 if ascii
bl ws_x19 // x0 = pointer to '"'
// always compute dest buffer (pass 1 writes are harmless, overwritten in pass 2)
ldr x11, [x28, #ST_CUR_SEC] // x0 preserved
add x20, x27, x11, lsl #17 // text_buf + sec * 1MB
ldr x10, [x28, x11]
add x20, x20, x10 // x20 = destination for the string bytes
mov x1, x20
bl parse_string // x0 = count, x1 = ptr past
cbz x21, 2f // not asciz: skip null
strb wzr, [x20, x0] // write null terminator
2: add x0, x0, x21 // count + null_flag
// advance_sec_pos — add x0 to the current section's position, then finish
// the line (also the tail of .word and .skip)
advance_sec_pos:
ldr x11, [x28, #ST_CUR_SEC]
ldr x10, [x28, x11]
add x10, x10, x0
str x10, [x28, x11]
b pl_done
// ──────────────────────────────────────────────────────────────────────────
// handle_numlab — record a numeric label definition
// x0 = digit (0-9)
//
// Pass 1: appends the current .text position to numlab_defs[digit][count]
//         and increments numlab_cnts[digit]. The text position is used
//         whatever the current section is.
// Pass 2: increments numlab_curs[digit], the count of definitions of this
//         digit seen so far.
// Clobbers x10-x14.
// NOTE(review): the count is not checked against NUMLAB_MAX_DEFS, so a
// 129th definition of one digit would land in the next digit's row.
// ──────────────────────────────────────────────────────────────────────────
handle_numlab:
// x0 = digit (0-9) — leaf function, no frame needed
ldr x10, [x28, #ST_PASS]
tbnz x10, #1, handle_numlab_p2 // bit 1 set = pass 2
// pass 1: numeric labels are always in .text
ldr x10, [x28, #ST_TEXT_POS]
// count = numlab_cnts[digit]
add x11, x28, #NUMLAB_CNTS_OFF
ldr x12, [x11, x0, lsl #3] // count
// store address: numlab_defs[digit * MAX_DEFS + count]
adrp x13, numlab_defs
add x13, x13, :lo12:numlab_defs
lsl x14, x0, #7 // digit * 128
add x14, x14, x12
str x10, [x13, x14, lsl #3]
// increment count
add x12, x12, #1
str x12, [x11, x0, lsl #3]
ret
handle_numlab_p2:
add x11, x28, #NUMLAB_CURS_OFF
ldr x10, [x11, x0, lsl #3]
add x10, x10, #1
str x10, [x11, x0, lsl #3]
ret
// ──────────────────────────────────────────────────────────────────────────
// rebase_symbols — after pass 1, add section bases to label values
//
// Walks every slot of the symbol table. Each occupied entry without
// SYMF_EQU gets the base address of the section recorded in its flag
// bits 5:4 added to its value. Then every recorded numeric-label
// definition gets the .text base added.
// No arguments; reads the bases from the state block. Clobbers x0, x9-x17
// and the flags.
// ──────────────────────────────────────────────────────────────────────────
rebase_symbols:
add x9, x27, x29, lsl #1 // sym_table = text_buf + 2*1MB
mov x10, #SYM_TBL_SLOTS // slots left to visit
rebase_loop:
ldr x13, [x9, #SYM_NAME_PTR]
cbz x13, rebase_next // empty slot
ldp x14, x17, [x9, #SYM_FLAGS] // flags, value
tbnz x14, #2, rebase_next // bit 2 = SYMF_EQU, skip
// extract section from flags bits 5:4
ubfx x15, x14, #SYMF_SEC_SHIFT, #2
// base[section] = state[ST_TEXT_BASE + section*8]
add x16, x28, x15, lsl #3
ldr x16, [x16, #ST_TEXT_BASE]
add x17, x17, x16
str x17, [x9, #SYM_VALUE]
rebase_next:
add x9, x9, #SYM_ENT_SIZE
sub x10, x10, #1
cbnz x10, rebase_loop
// x9 now points to numlab_defs (sym_table + SYM_TBL_BYTES)
// (this depends on numlab_defs directly following sym_table in .bss)
add x10, x28, #NUMLAB_CNTS_OFF
// numeric labels are always in the text section for now
ldr x16, [x28, #ST_TEXT_BASE]
mov x11, #0 // digit
rebase_numlab_digit:
ldr x12, [x10, x11, lsl #3] // count for this digit
lsl x14, x11, #7 // base index = digit * 128
cbz x12, 5f // skip if count = 0
// walk this digit's definitions from the last recorded down to index 0
4: sub x12, x12, #1
add x17, x14, x12
ldr x0, [x9, x17, lsl #3]
add x0, x0, x16
str x0, [x9, x17, lsl #3]
cbnz x12, 4b
5: add x11, x11, #1
cmp x11, #NUMLAB_DIGITS
b.lt rebase_numlab_digit
ret
// ══════════════════════════════════════════════════════════════════════════
// Expression evaluator — precedence climbing
//
// Each function: x0 = pointer → x0 = value, x1 = pointer past expr
//
// Precedence (low to high): | & +/- * <</>> unary(~ -) atom
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// parse_expr — Pratt binary expression parser
// x0 = pointer, x1 = min_prec (0 for top-level callers)
// returns x0 = value, x1 = pointer past expr
//
// Precedence: | (1) < & (2) < +/- (3) < * (4) < <<,>> (5)
//
// Entry points:
//   parse_expr0_x19 — start at x19 with min_prec 0
//   parse_expr0     — start at x0 with min_prec 0
//   parse_expr      — start at x0 with min_prec in x1
//
// All binary operators are left-associative: the right operand is parsed
// with min_prec = prec + 1. An operator whose precedence is below min_prec
// is left unconsumed for the caller, so the returned pointer is at the
// operator's first character. That holds for the two-character shifts
// too: they are only stepped over after the precedence test passes, which
// is what makes chains such as "1 << 2 << 3" parse correctly.
//
// Register roles: x19 = accumulated value, x20 = position, x21 = packed
// (prec<<4)|opcode, x22 = min_prec, x23 = opcode.
// The frame matches process_line's and the routine returns through the
// shared epilogue at pl_done.
// ──────────────────────────────────────────────────────────────────────────
parse_expr0_x19:
mov x0, x19
parse_expr0:
mov x1, #0
parse_expr:
stp x30, x19, [sp, #-64]!
stp x20, x21, [sp, #16]
stp x22, x23, [sp, #32]
mov x22, x1 // min_prec
bl parse_expr_unary
mov x19, x0 // lhs value
mov x20, x1 // current position
// Operator dispatch: x21 encodes (prec<<4)|opcode
// | → 0x10 & → 0x21 + → 0x32 - → 0x33 * → 0x44 << → 0x55 >> → 0x56
pe_loop:
mov x0, x20
bl skip_ws // w9 = candidate operator character
mov x20, x0
// single-character operators: linear scan of op_table
adr x10, op_table
1: ldrb w11, [x10]
cbz w11, 5f // sentinel: not a single-character operator
cmp w9, w11
ldrb w21, [x10, #1] // the three instructions below leave the flags alone
sub x21, x21, #32 // remove the +0x20 bias of the table byte
add x10, x10, #2
b.ne 1b
b pe_check_prec
5: cmp w9, #'<'
b.eq pe_shift
cmp w9, #'>'
b.ne pe_done // not an operator: the expression ends here
pe_shift:
// a shift needs the same character twice ("<<" or ">>")
ldrb w10, [x20, #1]
cmp w10, w9
b.ne pe_done
lsr w21, w9, #1 // '<' → 30, '>' → 31
add x21, x21, #55 // → 0x55 (<<) or 0x56 (>>)
pe_check_prec:
lsr x9, x21, #4 // prec = x21 >> 4
and x23, x21, #0xF // opcode = x21 & 0xF (callee-saved)
cmp x9, x22 // op_prec vs min_prec
b.lt pe_done // op_prec < min_prec: not ours, x20 still at the operator
// consume the operator: opcodes 5 and 6 are two characters long
cmp x23, #5
csinc x20, x20, x20, lo // opcode >= 5: skip the extra shift character
add x20, x20, #1 // skip operator char
mov x0, x20
add x1, x9, #1 // recurse with prec+1
bl parse_expr
mov x20, x1 // update position
// computed jump into pe_ops: each entry is two instructions (8 bytes)
adr x9, pe_ops
add x9, x9, x23, lsl #3
br x9
pe_ops:
orr x19, x19, x0 // opcode 0: |
b pe_loop
and x19, x19, x0 // opcode 1: &
b pe_loop
add x19, x19, x0 // opcode 2: +
b pe_loop
sub x19, x19, x0 // opcode 3: -
b pe_loop
mul x19, x19, x0 // opcode 4: *
b pe_loop
lsl x19, x19, x0 // opcode 5: <<
b pe_loop
lsr x19, x19, x0 // opcode 6: >>
b pe_loop
pe_done:
mov x0, x19
mov x1, x20
b pl_done // shared epilogue: restores x19-x23 and x30, pops the frame
// ──────────────────────────────────────────────────────────────────────────
// parse_expr_unary — handles '~', unary '-', then falls through to atom
//   in:  x0 = pointer
//   out: x0 = value, x1 = pointer past the operand
// Atoms: '(' expr ')', '.' (location counter), a literal handed to
// parse_int (leading digit or '\''), or a symbol name.
// Frame: 16 bytes holding x30 and x20. pea_ret / pea_ret_x20 are the
// epilogues; parse_label_ref, which pushes the same frame, also exits
// through pea_ret_x20 and jumps to err_undef.
// ──────────────────────────────────────────────────────────────────────────
parse_expr_unary:
stp x30, x20, [sp, #-16]!
bl skip_ws
// w9 = current character after skip_ws (all tests below rely on it)
cmp w9, #'~'
b.eq pe_unary_not
cmp w9, #'-'
b.eq pe_unary_neg
// not unary, fall through to parse atom (skip_ws already done)
// '(' — grouped expression
cmp w9, #'('
b.eq pe_atom_paren
// '.' — current location counter
cmp w9, #'.'
b.eq pe_atom_dot
// digit or '\'' — numeric / character literal ('-' was handled above)
sub w10, w9, #'0'
cmp w10, #9
b.ls pe_atom_num
cmp w9, #'\''
b.eq pe_atom_num
// identifier — symbol reference
bl parse_ident
cbz x1, pe_atom_err
mov x20, x2 // end pointer (return this)
// look up symbol
bl sym_lookup
cbz x1, pe_atom_undef
// return value
ldr x0, [x0, #SYM_VALUE]
b pea_ret_x20
pe_atom_undef:
// in pass 1, undefined symbols get 0 (forward ref in instruction)
ldr x9, [x28, #ST_PASS]
tbz x9, #1, 1f // bit 1 of ST_PASS clear → not pass 2
// pass 2: error
err_undef:
adr x0, msg_undef
bl error_at
1: mov x0, #0
b pea_ret_x20
pe_atom_paren:
add x0, x0, #1 // skip '('
bl parse_expr0
mov x20, x0 // value
bl ws_x1
cmp w9, #')'
b.ne pe_atom_err
add x1, x0, #1 // pointer past ')'
mov x0, x20
b pea_ret
pe_atom_dot:
mov x20, x0 // save pointer to '.'
ldr x11, [x28, #ST_CUR_SEC]
ldr x0, [x28, x11] // section offset
ldr x10, [x28, #ST_PASS]
tbz x10, #1, 1f
// pass 2: add section base (x11 = sec*8)
add x11, x28, x11
ldr x11, [x11, #ST_TEXT_BASE]
add x0, x0, x11
1: add x1, x20, #1 // pointer past '.'
b pea_ret
pe_atom_num:
bl parse_int
b pea_ret
// shared epilogues: pea_ret_x20 returns the pointer saved in x20
pea_ret_x20:
mov x1, x20
pea_ret:
ldp x30, x20, [sp], #16
ret
pe_atom_err:
adr x0, msg_syntax
bl error_at
// NOTE(review): nothing follows the call above, so control would fall into
// pe_unary_not if error_at returned — presumably it does not; confirm.
pe_unary_not:
add x0, x0, #1
bl parse_expr_unary // recursive
mvn x0, x0
b pea_ret
pe_unary_neg:
add x0, x0, #1
bl parse_expr_unary // recursive
sub x0, xzr, x0 // neg
b pea_ret
// ══════════════════════════════════════════════════════════════════════════
// String parsing
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// parse_string — parse a quoted string, count or emit bytes
// x0 = pointer (at the opening '"')
// x1 = destination (NULL to just count)
// returns x0 = byte count, x1 = pointer past closing '"'
//
// With a NULL destination nothing is stored; only the count is produced.
// If a NUL byte is reached before the closing quote (unterminated string),
// x1 is left pointing AT that NUL so the caller never continues past the
// end of the text.
// The return address is kept in x16 across the call to decode_escape, so
// decode_escape must leave x0 and x14-x16 intact (as the original code
// already required).
// ──────────────────────────────────────────────────────────────────────────
parse_string:
mov x16, x30
mov x15, x1 // dest (or NULL)
add x0, x0, #1 // skip opening '"'
mov x14, #0 // byte count
ps_loop:
ldrb w9, [x0], #1 // load + advance
cbz w9, ps_eof // unterminated string
cmp w9, #'"'
b.eq ps_done // closing quote (x0 already past it)
cmp w9, #'\\'
b.eq ps_escape
// plain character — x0 already advanced by post-increment
ps_store:
cbz x15, 1f // count-only mode: no buffer to write to
strb w9, [x15, x14]
1: add x14, x14, #1
b ps_loop
ps_escape:
ldrb w9, [x0], #1 // load escape char, advance (past backslash)
bl decode_escape
b ps_store
ps_eof:
sub x0, x0, #1 // step back onto the terminating NUL
ps_done:
mov x1, x0
mov x0, x14 // byte count
br x16
// ══════════════════════════════════════════════════════════════════════════
// Pass 2 infrastructure
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// Shared instruction-emit tails, reached via 'b' from encode_instruction.
//   emit_with_sf   — w0 = instruction word, w23 = sf: copy sf to w24, then
//                    fall into emit_with_sf24
//   emit_with_sf24 — w0 = instruction word, w24 = sf: OR sf into bit 31
//   emit_inst_done — w0 = finished instruction word: store it at
//                    text_buf (x27) + ST_TEXT_POS, advance ST_TEXT_POS by 4,
//                    then leave through pl_done (defined elsewhere)
// Note that emit_with_sf overwrites w24.
// ──────────────────────────────────────────────────────────────────────────
// emit_with_sf — apply sf bit into bit 31 of w0, then emit
emit_with_sf:
mov w24, w23
emit_with_sf24:
orr w0, w0, w24, lsl #31
emit_inst_done:
ldr x9, [x28, #ST_TEXT_POS]
str w0, [x27, x9] // x27 = output buffer base, x9 = byte offset
add x9, x9, #4
str x9, [x28, #ST_TEXT_POS]
b pl_done
// ──────────────────────────────────────────────────────────────────────────
// parse_label_pc_rel — parse label ref then compute PC-relative offset
// x0 = pointer (passed straight through to parse_label_ref)
// uses [sp, #48] for return address (a slot in the caller's frame, so this
// must only be called while that frame is the current top of stack)
// returns x0 = signed offset in instruction units, i.e.
//         (target - (ST_TEXT_BASE + ST_TEXT_POS)) >> 2
// x1 is whatever parse_label_ref left there (pointer past the label)
// ──────────────────────────────────────────────────────────────────────────
parse_label_pc_rel:
str x30, [sp, #48]
bl parse_label_ref
ldr x9, [x28, #ST_TEXT_BASE]
ldr x10, [x28, #ST_TEXT_POS]
add x9, x9, x10 // address of the instruction being assembled
sub x0, x0, x9
asr x0, x0, #2 // bytes → instructions (arithmetic: keeps the sign)
ldr x30, [sp, #48]
ret
// parse_x23_ws — parse first register into x23, skip comma+ws
// x1 preserved (sf/is_64bit from parse_register)
// uses [sp, #48] for return address
// Tail-calls ws_x2_skip1 after restoring x30, so that helper returns
// directly to our caller; callers read w9 (next character) and x0
// (pointer) as left by it.
parse_x23_ws:
str x30, [sp, #48]
bl ws_x21_parse_reg
mov x23, x0
ldr x30, [sp, #48]
b ws_x2_skip1
// ──────────────────────────────────────────────────────────────────────────
// parse_2reg — parse "Rd, Rn" from operands (x21)
// returns x22 = Rd, x23 = sf, x0 = Rn
// NOTE: uses [sp, #56] for return address; called from encode_instruction
// sf is taken from the FIRST register (Rd); the final register is parsed by
// a tail call to parse_register, which returns straight to our caller.
// ──────────────────────────────────────────────────────────────────────────
parse_2reg:
str x30, [sp, #56]
bl parse_x23_ws
mov x22, x23 // Rd
mov x23, x1 // sf
b p23_tail
// ──────────────────────────────────────────────────────────────────────────
// parse_3reg — parse "Rd, Rn, Rm" from operands (x21)
// returns x22 = Rd, x23 = sf, x24 = Rn, x0 = Rm
// NOTE: uses [sp, #56] for return address; called from encode_instruction
// ──────────────────────────────────────────────────────────────────────────
parse_3reg:
str x30, [sp, #56]
bl parse_x23_ws
mov x22, x23 // Rd
mov x23, x1 // sf
bl parse_register
mov x24, x0 // Rn
bl ws_x2_skip1 // skip ','
// shared tail: restore the saved return address and parse the last register
p23_tail:
ldr x30, [sp, #56]
b parse_register
// skip_lsl — advance x0 until it points at the next '#', then fall into
// parse_hash_imm. Used for ", lsl #N": the shift keyword itself is not
// inspected. The scan starts at x0+1 and has no terminator check.
// NOTE(review): an operand with no '#' after this point would make the loop
// run past the end of the line — confirm callers guarantee one.
skip_lsl:
1: ldrb w9, [x0, #1]!
cmp w9, #'#'
b.ne 1b
// falls through to parse_hash_imm
// ──────────────────────────────────────────────────────────────────────────
// parse_hash_imm — parse #expr or #:lo12:expr (the '#' is optional)
// x0 = pointer (at '#', or directly at the expression / ':')
// returns x0 = value, x1 = pointer past
// For the :lo12: form the value is masked to its low 12 bits.
// x2 is NOT written by this routine; no separate "was :lo12:" flag is
// returned.
// uses [sp, #48] for return address
// ──────────────────────────────────────────────────────────────────────────
parse_hash_imm:
str x30, [sp, #48]
// check first char: '#' or ':'
ldrb w9, [x0]
cmp w9, #'#'
csinc x0, x0, x0, ne // skip '#' if found
ldrb w9, [x0]
cmp w9, #':'
b.ne phi_plain
ldrb w10, [x0, #1]
cmp w10, #'l' // only ":l" is checked, not the full ":lo12:"
b.ne phi_plain
// :lo12: — skip 6 chars
add x0, x0, #6
bl parse_expr0
and x0, x0, #0xFFF
b phi_ret
phi_plain:
bl parse_expr0
phi_ret:
ldr x30, [sp, #48]
ret
// ──────────────────────────────────────────────────────────────────────────
// parse_label_ref — parse branch target (named label or Nf/Nb)
// x0 = pointer
// returns x0 = target address, x1 = pointer past
// Pushes the same 16-byte {x30, x20} frame as parse_expr_unary and leaves
// through that routine's pea_ret_x20 epilogue; failures jump to its
// err_undef.
// ──────────────────────────────────────────────────────────────────────────
parse_label_ref:
stp x30, x20, [sp, #-16]!
bl skip_ws
// numeric label ref? digit followed by 'f' or 'b'
sub w10, w9, #'0'
cmp w10, #9
b.hi plr_named
ldrb w11, [x0, #1]
cmp w11, #'b'
cset x1, eq // x1=1 backward, 0 forward
b.eq plr_numlab_common
cmp w11, #'f'
b.ne plr_named
plr_numlab_common:
add x20, x0, #2 // pointer past "Nf"/"Nb"
mov x0, x10 // digit
// inlined rnl_entry
// index into numlab_defs = digit*128 + (cursor - backward), 8-byte entries
add x9, x28, #NUMLAB_CURS_OFF
ldr x10, [x9, x0, lsl #3] // cursor
sub x10, x10, x1 // backward: cursor-1, forward: cursor
adrp x11, numlab_defs
add x11, x11, :lo12:numlab_defs
lsl x12, x0, #7 // digit * 128
add x12, x12, x10 // + cursor
ldr x0, [x11, x12, lsl #3]
b pea_ret_x20
plr_named:
bl parse_ident
cbz x1, err_undef // no identifier at all is also reported as "undefined"
mov x20, x2 // save end pointer
bl sym_lookup
cbz x1, err_undef // unlike parse_expr_unary, no pass-1 leniency here
ldr x0, [x0, #SYM_VALUE]
b pea_ret_x20
// ──────────────────────────────────────────────────────────────────────────
// encode_logical_imm — encode bitmask immediate for logical instructions
// x0 = value, x1 = is_32bit (1=replicate low 32 to full 64)
// returns x0 = (N << 12) | (immr << 6) | imms
// An unencodable value does NOT return: control branches to ei_logical_bad
// (the bad-immediate error path).
// Clobbers x9-x16.
// ──────────────────────────────────────────────────────────────────────────
encode_logical_imm:
// makes no calls and needs no frame — uses scratch registers only
// for 32-bit, replicate low 32 bits
cbz x1, eli_start
and x0, x0, #0xFFFFFFFF
orr x0, x0, x0, lsl #32
eli_start:
mov x13, x0 // val
// reject all-zeros and all-ones
cbz x13, ei_logical_bad
mvn x9, x13
cbz x9, ei_logical_bad
// rotation = ctz(val & (val + 1))
// (ctz is computed as clz(rbit(x)); it yields 64 when the operand is 0,
// and ror by 64 is then a rotation by 0)
add x9, x13, #1
and x9, x13, x9
rbit x10, x9
clz x14, x10 // rotation
// normalized = ror(val, rotation)
ror x9, x13, x14
// zeroes = clz(normalized)
clz x10, x9
// ones = ctz(~normalized) = clz(rbit(~normalized))
mvn x11, x9
rbit x11, x11
clz x15, x11 // ones
// size = zeroes + ones
add x16, x10, x15
// validate: ror(val, size) == val
ror x9, x13, x16
cmp x9, x13
b.ne ei_logical_bad
// immr = (-rotation) & (size - 1)
neg x9, x14
sub x10, x16, #1
and x9, x9, x10 // immr
// imms = (-(size << 1) | (ones - 1)) & 0x3F
sub x11, xzr, x16, lsl #1
sub x12, x15, #1
orr x11, x11, x12
and x11, x11, #0x3F // imms
// result = (N << 12) | (immr << 6) | imms where N = size >> 6
lsr x12, x16, #6
orr x0, x11, x9, lsl #6
orr x0, x0, x12, lsl #12
ret
// ──────────────────────────────────────────────────────────────────────────
// parse_cond — parse a two-letter condition code (eq, ne, lt, ge, hi, ...)
// x0 = pointer (at first char of condition)
// returns x0 = pointer past the two letters, x1 = cond code
// The code is looked up in cond_xor_tbl (one byte per slot) at index
// (char0 ^ char1) & 0x1F. The two letters are not validated.
// Clobbers w9, w10, x11.
// ──────────────────────────────────────────────────────────────────────────
parse_cond:
adr x11, cond_xor_tbl
ldrh w9, [x0], #2 // both letters at once; x0 now past them
ubfx w10, w9, #8, #8 // char1
eor w10, w10, w9 // low byte = char0 ^ char1
and w10, w10, #0x1F // keep the 5-bit table index
ldrb w1, [x11, x10]
ret
// parse_reg_fail — return x0 = -1. Not referenced within this part of the
// file; presumably the failure exit of parse_register (confirm).
parse_reg_fail:
mov x0, #-1
ret
// ──────────────────────────────────────────────────────────────────────────
// encode_instruction — dispatch mnemonic, parse operands, emit
// x0 = mnemonic start, x1 = mnemonic length, x2 = operands start
// Kept for all handlers: x19 = mnemonic start, x20 = mnemonic length,
// x21 = operands start; at dispatch w9 = first and w10 = second mnemonic
// character. Handlers end by branching to one of the emit tails, which
// leave through pl_done; nothing returns to this entry point.
// Where a mov sits between a cmp and its b.eq/b.ne, it preloads a value for
// the branch target without disturbing the flags.
// ──────────────────────────────────────────────────────────────────────────
encode_instruction:
// x19 already equals x0 (set by process_line before parse_ident)
mov x20, x1
mov x21, x2
// dispatch on first character of mnemonic
ldrb w9, [x19]
ldrb w10, [x19, #1]
cmp w9, #'a'
b.eq ei_a
cmp w9, #'b'
b.eq ei_b
cmp w9, #'c'
b.eq ei_c
cmp w9, #'e'
mov x22, #2 // eor opc (doesn't affect flags)
b.eq ei_logical
cmp w9, #'l'
b.eq ei_l
cmp w9, #'m'
b.eq ei_m
cmp w9, #'n'
b.eq ei_n
cmp w9, #'o'
mov x22, #1 // orr opc (doesn't affect flags)
b.eq ei_logical
cmp w9, #'r'
b.eq ei_r
cmp w9, #'s'
b.eq ei_s
cmp w9, #'t'
b.eq ei_t
cmp w9, #'u'
b.ne ei_bad
// udiv Rd, Rn, Rm / ubfx / ubfm / ubfiz / uxtb / uxth
ei_u:
cmp w10, #'b'
b.eq ei_bfm_unified
cmp w10, #'x'
b.eq ei_sxt_uxt
// any other 'u' mnemonic is treated as udiv
ei_udiv:
mov w25, #0
// shared by udiv (w25 = 0) and sdiv (w25 = 0x400, set at ei_sd):
// low opcode bits become 0x0800 (UDIV) or 0x0C00 (SDIV)
ei_div_common:
bl parse_3reg
orr w9, w25, #0x0800
b emit_3reg_1AC0_tail
// ── 'a' mnemonics: add/adds, adr/adrp, and/ands, asr ─────────────────────
// second char 'd' → add*/adr*; 'n' → and/ands; anything else → asr
ei_a:
cmp w10, #'d'
b.eq ei_a_d
cmp w10, #'n'
movz w25, #0x2800 // ASRV opcode (speculative, harmless if AND)
b.ne ei_shift_common
sub x22, x20, #3 // len=3→0 (AND), len=4→1
add x22, x22, x22, lsl #1 // 0→0, 1→3 (ANDS opc)
b ei_logical
// sxtb/sxth/sxtw/uxtb/uxth Rd, Rn — SBFM/UBFM Rd, Rn, #0, #imms
// hands w10 = immr and w11 = imms to the bitfield emit code
ei_sxt_uxt:
bl parse_2reg
mov x24, x0
mov x10, #0 // immr = 0
ldrb w9, [x19, #3] // suffix: 'b', 'h', or 'w'
ubfx w11, w9, #3, #2 // 'b'→0, 'h'→1, 'w'→2
mov w12, #8
lsl w11, w12, w11 // 8, 16, 32
sub w11, w11, #1 // 7, 15, 31
ldrb w9, [x19] // 's' or 'u'
cmp w9, #'s'
b.ne ei_ubfm_emit // uxt → UBFM path (sxt falls through)
// also the target for "asr Rd, Rn, #imm" (see ei_lsr_asr_imm)
ei_asr_sbfm:
movz w0, #0x1300, lsl #16 // 32-bit SBFM base (sf+N applied later)
b ei_bfm_apply_n_sf
ei_a_d:
ldrb w10, [x19, #2]
cmp w10, #'d'
b.eq ei_add
cmp w10, #'r'
b.ne ei_bad
// adr/adrp shared: parse Rd, skip comma, precompute PC
bl ws_x21_parse_reg
mov x22, x0 // Rd
bl ws_x2_skip1 // skip ','
ldr x9, [x28, #ST_TEXT_BASE]
ldr x10, [x28, #ST_TEXT_POS]
add x25, x9, x10 // x25 = PC
cmp x20, #3
b.eq ei_adr_body
// adrp: page-relative offset
bl parse_label_ref
and x23, x0, #~0xFFF
and x9, x25, #~0xFFF
sub x23, x23, x9
asr x23, x23, #12
b ei_adr_encode
// adr: the target is parsed as a general expression
ei_adr_body:
bl parse_expr0
sub x23, x0, x25 // imm21 = target - PC
ei_adr_encode:
// encoding: immlo = imm21[1:0], immhi = imm21[20:2]
// (no range check: bits above imm21[20] are dropped)
and w9, w23, #3 // immlo
ubfx w10, w23, #2, #19 // immhi (19 bits)
sub w23, w20, #3 // sf: 0=ADR(len3), 1=ADRP(len4)
movz w0, #0x1000, lsl #16 // ADR base opcode
orr w0, w0, w22
orr w0, w0, w9, lsl #29
orr w0, w0, w10, lsl #5
b emit_with_sf
// ── 'b' mnemonics: b, bl, b.cond, br, blr, bic, bfm, bfi, bfxil ──────────
// second char 'f' → bitfield ops; length 3 → blr/bic; length > 3 → error;
// second char 'r' → br; otherwise b / bl / b.cond
ei_b:
cmp w10, #'f'
b.eq ei_bfm_unified
cmp x20, #3
b.eq ei_b3
b.hi ei_bad
cmp w10, #'r'
b.eq ei_br
// b (len=1) or bl (len=2): bit 31 = len-1
sub x9, x20, #1
movz w22, #0x1400, lsl #16
orr w22, w22, w9, lsl #31
bl ws_x21
// an operand starting with '.' is taken as the condition of "b.cc"
cmp w9, #'.'
b.eq ei_bcond
// B/BL: parse label, compute pc-relative offset
bl parse_label_pc_rel
and w0, w0, #0x3FFFFFF // imm26 (no range check)
orr w0, w0, w22
b emit_inst_done
// 3-char 'b' mnemonics: blr or bic
ei_b3:
ldrb w9, [x19, #2]
cmp w9, #'r'
b.eq ei_blr
// bic Rd, Rn, Rm — AND Rd, Rn, ~Rm
// sf 00 01010 sh 1 Rm imm6 Rn Rd
ei_bic:
bl parse_3reg
movz w9, #0x0A20, lsl #16 // 32-bit BIC
b emit_3reg_sf_tail
// br Xn / blr Xn — branch (with link) to register
// br: x20=2 (len), blr: x20=3 → sub 2 gives 0 or 1 for bit 21
ei_br:
ei_blr:
bl ws_x21_parse_reg
sub w10, w20, #2
movz w9, #0xD61F, lsl #16
orr w9, w9, w10, lsl #21 // blr: set bit 21 → 0xD63F
orr w0, w9, w0, lsl #5
b emit_inst_done
// ── 'c' mnemonics: cmp, cmn, cbz, cbnz, clz, cset, csel, csinc ───────────
// second char 'm' → cmp/cmn; 'b' → cbz/cbnz; 'l' → clz; otherwise the 4th
// char selects csinc ('n'), csel ('l') or, by default, cset
ei_c:
cmp w10, #'m'
b.eq ei_c_cm
cmp w10, #'b'
b.ne 2f
ldrb w10, [x19, #2]
cmp w10, #'z'
cset x22, ne // x22=0 for cbz, 1 for cbnz
b.eq ei_cbz_common
cmp w10, #'n'
b.ne ei_bad
// cbz/cbnz Rt, label — x22 = op bit (bit 24), x23 = Rt, x24 = sf
ei_cbz_common:
bl parse_x23_ws
mov x24, x1 // sf
bl parse_label_pc_rel
and w0, w0, #0x7FFFF // imm19 (no range check)
orr w0, w23, w0, lsl #5
orr w0, w0, w22, lsl #24
movz w9, #0x3400, lsl #16
b ei_addsub_sf_emit
2: cmp w10, #'l'
b.eq ei_clz
ldrb w10, [x19, #3]
cmp w10, #'n'
movz w26, #0x0400 // CSINC bit (speculative)
b.eq ei_csel_common
cmp w10, #'l'
movz w26, #0 // CSEL: no extra bits (speculative)
b.eq ei_csel_common
// cset Rd, cond — alias for CSINC Rd, xzr, xzr, invert(cond)
// encoding: 0x9A9F0000 | (inv_cond << 12) | 0x07E0 | Rd
// The 64-bit form (sf=1) is always emitted; the register width returned by
// ws_x21_parse_reg is not consulted here.
ei_cset:
bl ws_x21_parse_reg // x0 = Rd, x2 = pointer past
mov x22, x0 // Rd
bl ws_x2_skip1 // skip ','
bl parse_cond // x1 = cond code
eor w1, w1, #1 // invert condition (flip bit 0)
orr w0, w22, w1, lsl #12 // Rd | (inv_cond << 12)
orr w0, w0, #0x7E0 // | Rn=xzr<<5, o2=1
movk w0, #0x9A9F, lsl #16 // | sf=1, opc, Rm=xzr
b emit_inst_done
// csel / csinc Rd, Rn, Rm, cond (w26 = 0 for csel, 0x0400 for csinc)
// encoding: sf 00 11010100 Rm cond 00 Rn Rd
// 64-bit base: 0x9A800000
ei_csel_common:
bl parse_3reg // x22=Rd, x23=sf, x24=Rn, x0=Rm
mov x25, x0 // save Rm
bl ws_x2_skip1 // skip ','
bl parse_cond // x1 = cond
movz w9, #0x1A80, lsl #16 // CSEL 32-bit base
orr w9, w9, w26 // | CSINC bit if set
orr w9, w9, w1, lsl #12 // | (cond << 12)
mov x0, x25 // Rm for emit_3reg_sf_tail
b emit_3reg_sf_tail
// ── 'l' mnemonics: ld* (loads), lsl, lsr ──────────────────────────────────
ei_l:
cmp w10, #'d'
b.eq ei_ld
// lsl/lsr — immediate (UBFM alias) or register (LSLV/LSRV)
ei_ls_shift:
ldrb w10, [x19, #2]
movz w25, #0x2000 // LSLV
cmp w10, #'r'
b.ne ei_shift_common
movz w25, #0x2400 // LSRV
// shared by lsl/lsr/asr: w25 = register-form opcode bits
// (0x2000 LSLV, 0x2400 LSRV, 0x2800 ASRV — asr arrives from ei_a)
ei_shift_common:
bl parse_2reg // x22=Rd, x23=sf, x0=Rn, x2=ptr past
mov x24, x0 // Rn
bl ws_x2_skip1
cmp w9, #'#'
b.eq ei_shift_imm_dispatch
// register form
bl parse_register // Rm
mov w9, w25
b emit_3reg_1AC0_tail
// ── 'm' mnemonics: mov, movz, movn, movk, mul, msub, madd, mvn ────────
// second char: 'o' → mov*, 'u' → mul, 's' → msub, 'a' → madd, else mvn
ei_m:
cmp w10, #'o'
b.eq ei_mo
cmp w10, #'u'
b.eq ei_mul
cmp w10, #'s'
movz x26, #0x8000 // MSUB bit15 (speculative)
b.eq ei_madd_msub_common
cmp w10, #'a'
mov x26, #0 // MADD bit15 (speculative)
b.eq ei_madd_msub_common
// mvn Rd, Rm — alias for orn Rd, xzr, Rm
ei_mvn:
movz w25, #0x2A20, lsl #16 // 32-bit ORN base
b ei_neg_mvn_common
ei_mo:
// mov (3 chars) vs movz/movn/movk (4 chars)
cmp x20, #3
b.eq ei_mov
// 4th char picks the wide-move base opcode, passed on in w22
ldrb w10, [x19, #3]
cmp w10, #'z'
movz w22, #0x5280, lsl #16 // MOVZ base (speculative)
b.eq ei_movwide
cmp w10, #'n'
movz w22, #0x1280, lsl #16 // MOVN base (speculative)
b.eq ei_movwide
cmp w10, #'k'
b.ne ei_bad
movz w22, #0x7280, lsl #16 // MOVK base
b ei_movwide
// ── 's' mnemonics: sub, st*, svc, sbfm, sbfx, sbfiz, sxt*, sdiv ──────────
// second char: 'u' → sub/subs, 't' → stores, 'v' → svc, 'b' → bitfield,
// 'x' → sign-extend; anything else is treated as sdiv
ei_s:
cmp w10, #'u'
b.eq ei_su
cmp w10, #'t'
b.eq ei_st
cmp w10, #'v'
b.eq ei_svc
cmp w10, #'b'
b.eq ei_bfm_unified
cmp w10, #'x'
b.eq ei_sxt_uxt
ei_sd:
mov w25, #0x400 // turns the UDIV opcode bits into SDIV
b ei_div_common
// unknown mnemonic: report msg_badins
// NOTE(review): falls into ei_ret if error_at returns — presumably it does
// not; confirm.
ei_bad:
adr x0, msg_badins
bl error_at
// ret — always emits RET x30 (0xD65F03C0); operands are not parsed
ei_ret:
movz w0, #0x03C0
movk w0, #0xD65F, lsl #16
b emit_inst_done
// svc #imm16
ei_svc:
bl ws_x21
bl parse_hash_imm // x0 = imm16 value
and w9, w0, #0xFFFF
movz w0, #0x0001
movk w0, #0xD400, lsl #16 // 0xD4000001
orr w0, w0, w9, lsl #5
b emit_inst_done
// b.cond label — entered from ei_b with x0 pointing at the '.'
ei_bcond:
add x0, x0, #1 // skip '.'
movz w22, #0x5400, lsl #16 // 0x54000000
bl parse_cond
orr w22, w22, w1 // base | cond
bl parse_label_pc_rel
// 0x54000000 | (imm19 << 5) | cond
and w0, w0, #0x7FFFF
orr w0, w22, w0, lsl #5
b emit_inst_done
// clz Rd, Rn — 64-bit: 0xDAC01000, 32-bit: 0x5AC01000
ei_clz:
mov w25, #0x1000
b ei_clz_rbit_common
// 'r' mnemonics: ret, ror, rbit (the default)
// rbit Rd, Rn — 64-bit: 0xDAC00000, 32-bit: 0x5AC00000
ei_r:
cmp w10, #'e'
b.eq ei_ret
cmp w10, #'o'
b.eq ei_ror
ei_rbit:
mov w25, #0
ei_clz_rbit_common:
bl parse_2reg // x22=Rd, x23=sf, x0=Rn
mov x24, x0 // Rn for emit_3reg_sf_tail
movz w9, #0x5AC0, lsl #16 // 32-bit base
orr w9, w9, w25 // opcode (0x1000 for clz, 0 for rbit)
mov x0, #0 // no Rm field
b emit_3reg_sf_tail
// ror Rd, Rn, Rm — RORV: 0x1AC02C00 (32-bit) / 0x9AC02C00 (64-bit)
// (register form only; an immediate rotate is not handled here)
ei_ror:
bl parse_3reg
movz w9, #0x2C00
b emit_3reg_1AC0_tail
// add/adds Rd, Rn, #imm / Rm [, lsl #N] / :lo12:sym
// Register roles for the whole add/sub/cmp/cmn family:
//   x22 = op and S bits (bits 30 and 29), x23 = Rd, x24 = sf, x25 = Rn
ei_add:
mov x22, #0 // op=0 (ADD)
b ei_addsub_s
// sub/subs Rd, Rn, #imm / Rm
ei_su:
ei_sub:
movz x22, #0x4000, lsl #16 // op=1 (SUB)
ei_addsub_s:
sub x9, x20, #3 // 0 for len=3, 1 for len=4
orr x22, x22, x9, lsl #29 // set S flag if len=4
ei_addsub:
bl parse_x23_ws
mov x24, x1 // sf
bl parse_register
mov x25, x0 // save Rn
bl ws_x2_skip1 // skip ','
// is the third operand a register or immediate?
// (w9 = its first character; cmp/cmn join here with Rd preset to 31)
ei_addsub_operand:
cmp w9, #'a'
b.lo ei_addsub_imm // '#' or ':lo12:' (both < 'a')
// register form: add Rd, Rn, Rm [, lsl #N]
bl parse_register
mov x21, x0 // Rm
bl ws_x2
// check for optional ", lsl #N"
cmp w9, #','
mov x9, #0 // shift amount default 0 (doesn't affect flags)
b.ne ei_addsub_reg_emit
bl skip_lsl // skip ", lsl" + parse_hash_imm
mov x9, x0 // shift amount
ei_addsub_reg_emit:
// sf op 0 01011 shift 0 Rm imm6 Rn Rd
// shift = 00 (LSL) — the shift keyword is never inspected, so any
// ", <shift> #N" is encoded as LSL
and w11, w9, #0x3F
orr w0, w23, w25, lsl #5 // Rd | (Rn << 5)
orr w0, w0, w11, lsl #10 // imm6
orr w0, w0, w21, lsl #16 // Rm
orr w0, w0, w22 // op|S bits
movz w9, #0x0B00, lsl #16
b ei_addsub_sf_emit
ei_addsub_imm:
// immediate form: #expr or #:lo12:expr
bl parse_hash_imm // x0=val (already masked to 12 bits for :lo12:)
lsr x9, x0, #12
cbnz x9, ei_logical_bad // imm12 out of range (0-4095); negatives too
// sf op 0 10001 shift imm12 Rn Rd
orr w9, w23, w25, lsl #5 // Rd | (Rn << 5)
orr w9, w9, w0, lsl #10 // imm12 (bits 12+ known zero)
orr w9, w9, w22 // op|S bits
movz w0, #0x1100, lsl #16
// shared tail: OR w9 into w0, then apply sf from x24 and emit.
// Also used by cbz/cbnz and the logical instructions.
ei_addsub_sf_emit:
orr w0, w0, w9
b emit_with_sf24
// cmp/cmn Rn, #imm / cmp/cmn Rn, Rm — reuse addsub with Rd=xzr
ei_c_cm:
ldrb w10, [x19, #2]
cmp w10, #'n'
movz x22, #0x6000, lsl #16 // CMP: SUBS bits 30:29 = 11
b.ne 1f
movz x22, #0x2000, lsl #16 // CMN: ADDS bits 30:29 = 01
1: mov x23, #31 // Rd = xzr
bl ws_x21_parse_reg // first operand = Rn
mov x24, x1 // sf
mov x25, x0 // save Rn
bl ws_x2_skip1 // skip ','
b ei_addsub_operand
// and/ands/eor/orr — immediate (bitmask) or register
// Register roles: x22 = opc (0 AND, 1 ORR, 2 EOR, 3 ANDS), x23 = Rd,
// x24 = sf, x26 = Rn
ei_logical:
bl parse_x23_ws
mov x24, x1 // sf
bl parse_register
mov x26, x0 // Rn
bl ws_x2_skip1
// w9 = first character of the last operand; tst joins here
ei_logical_operand:
cmp w9, #'#'
b.eq ei_logical_imm
// register form: sf opc 01010 sh 0 Rm imm6 Rn Rd
bl parse_register
mov x21, x0 // Rm
bl ws_x2
mov w25, #0 // shift amount = 0 default
cmp w9, #','
b.ne ei_logical_reg_emit
bl skip_lsl // skip ", lsl" + parse_hash_imm
mov w25, w0 // shift amount
ei_logical_reg_emit:
orr w0, w23, w26, lsl #5 // Rd | (Rn << 5)
orr w0, w0, w25, lsl #10 // imm6 (shift amount, not masked)
orr w0, w0, w21, lsl #16
orr w0, w0, w22, lsl #29
movz w9, #0x0A00, lsl #16
b ei_addsub_sf_emit
ei_logical_imm:
bl parse_hash_imm
eor x1, x24, #1 // is_32bit = !sf
bl encode_logical_imm // does not return if the value is unencodable
// x0 = (N<<12)|(immr<<6)|imms
orr w9, w23, w26, lsl #5 // Rd | (Rn << 5)
orr w9, w9, w0, lsl #10 // | N/immr/imms
orr w9, w9, w22, lsl #29
movz w0, #0x1200, lsl #16 // 100100 in bits 28:23
b ei_addsub_sf_emit // orr w0|w9, apply sf, emit
// tst Rn, #imm / Rm — alias for ANDS XZR, Rn, operand
ei_tst:
mov x22, #3 // opc = ANDS
mov x23, #31 // Rd = XZR
bl ws_x21_parse_reg // parse Rn
mov x24, x1 // sf
mov x26, x0 // Rn
bl ws_x2_skip1 // skip ','
b ei_logical_operand
// bad-immediate error: shared by add/sub, mov and encode_logical_imm
// NOTE(review): falls into ei_ld if error_at returns — presumably it does
// not; confirm.
ei_logical_bad:
adr x0, msg_badimm
bl error_at
// ldr/ldrb/ldrh/ldrs*/str/strb/strh/ldp/stp — multiple addressing modes
// Entered with w9 = first mnemonic character ('l' or 's').
// Register roles for single-register loads/stores:
//   x22 = opc field (1 load, 0 store; 2/3 for sign-extending loads)
//   x24 = mnemonic length - 3 (0 for plain ldr/str), later reused for Rm
//   x20 = size field (0 byte, 1 half, 2 word, 3 doubleword)
//   x21 = sf of Rt, x23 = Rt, x25 = Rn
ei_ld:
ei_st:
cmp w9, #'l'
cset x22, eq // 1 for load ('l'), 0 for store ('s')
ei_ldst_dispatch:
ldrb w10, [x19, #2]
cmp w10, #'p'
b.eq ei_ldst_pair
sub x24, x20, #3 // 0 for ldr/str (len=3), 1 for ldrb/strb/ldrh/strh (len=4)
ei_ldst:
bl parse_x23_ws
mov x21, x1 // sf (size for non-byte)
// precompute size encoding: 0=byte, 1=half, 2=32bit, 3=64bit
add w20, w21, #2 // 2 or 3
cbz x24, 1f // len=3: use sf+2
ldrb w20, [x19, #3] // 'b'=0x62, 'h'=0x68, 's'=0x73
cmp w20, #'s'
b.eq ei_ldrs_size // sign-extending load (ldrsb/ldrsh/ldrsw)
ubfx w20, w20, #3, #2 // 0 for byte, 1 for half
1: // literal load check: ldr Rt, label (no bracket)
cbz x22, ei_ldst_bracket // store: must have [
cbnz x24, ei_ldst_bracket // ldrb/ldrh: must have [
cmp w9, #'['
b.ne ei_ldr_literal
ei_ldst_bracket:
bl skip1_ws // skip '['
bl parse_register // Rn
mov x25, x0 // save Rn
bl ws_x2
cmp w9, #']'
b.eq ei_ldst_base_only
cmp w9, #','
b.ne pe_atom_err // anything else after Rn is a syntax error
bl skip1_ws
cmp w9, #'a'
b.lo ei_ldst_uimm // '#' or ':lo12:' (both < 'a')
// register offset: Rm [, lsl #N]
bl parse_register
mov x24, x0 // save Rm
bl ws_x2
mov w10, #0 // S=0
cmp w9, #']'
b.eq ei_ldst_reg_emit
bl skip_lsl // skip ", lsl" + parse_hash_imm
cbz x0, ei_ldst_reg_emit
mov w10, #1 // S=1 (any nonzero amount; the amount itself is not checked)
ei_ldst_reg_emit:
bl ldst_base
orr w0, w0, w10, lsl #12 // S bit
orr w0, w0, w24, lsl #16 // Rm
movz w9, #0x6800 // 0x800 | 0x6000
movk w9, #0x3820, lsl #16 // | 0x38000000 | 0x00200000
b orr_w9_emit
// "[Rn]" — either plain base (offset 0) or post-index "[Rn], #imm"
ei_ldst_base_only:
bl skip1_ws // skip ']'
cmp w9, #','
b.eq ei_ldst_post
mov x0, #0
b ei_ldst_uimm_encode
ei_ldst_uimm:
bl parse_hash_imm // x0=value, x1=ptr past imm
// check for pre-index: [Rn, #simm9]!
// (looks at the two bytes right after the immediate; no whitespace skipped)
ldrb w9, [x1]
cmp w9, #']'
b.ne ei_ldst_uimm_encode
ldrb w9, [x1, #1]
cmp w9, #'!'
b.ne ei_ldst_uimm_encode
// pre-index encoding
and w10, w0, #0x1FF
bl ldst_base
orr w0, w0, #0x00000C00 // pre-index: bits[11:10] = 11
b ei_ldst_simm9_tail
// unsigned scaled offset: imm12 = offset >> size
// NOTE(review): the offset is shifted and masked without checking that it
// is a multiple of the access size or fits in 12 bits.
ei_ldst_uimm_encode:
tbnz x0, #63, ei_ldst_unscaled // negative → LDUR/STUR encoding
lsr x0, x0, x20
and w10, w0, #0xFFF
bl ldst_base
orr w0, w0, w10, lsl #10
movz w9, #0x3900, lsl #16
b orr_w9_emit
ei_ldst_unscaled:
and w10, w0, #0x1FF
bl ldst_base
b ei_ldst_simm9_tail // bits[11:10] = 00 (unscaled)
ei_ldst_post:
bl skip1_ws // skip ','
bl parse_hash_imm
and w10, w0, #0x1FF
bl ldst_base
orr w0, w0, #0x00000400 // post-index: bits[11:10] = 01
ei_ldst_simm9_tail: // shared by pre-index, post-index and unscaled
orr w0, w0, w10, lsl #12 // imm9 at [20:12]
orr w0, w0, #0x38000000
b emit_inst_done
// sign-extending load: determine size and opc from mnemonic suffix + dest register
// x21=sf (from parse_x23_ws), x19=mnemonic
// w9 is overwritten here; ei_ldst_bracket does not read it before calling
// skip1_ws.
ei_ldrs_size:
ldrb w9, [x19, #4] // 5th char: 'b','h','w'
ubfx w20, w9, #3, #2 // 'b'→0, 'h'→1, 'w'→2
mov w22, #3
sub w22, w22, w21 // opc = 3 - sf (Xd→2, Wd→3)
b ei_ldst_bracket
// ldr Rt, label — PC-relative literal load
// x23=Rt, x21=sf, x0=pointer to label
ei_ldr_literal:
bl parse_label_pc_rel // x0 = (target - PC) / 4
ubfiz w0, w0, #5, #19 // imm19 << 5
orr w0, w0, w23 // Rt
movz w9, #0x1800, lsl #16 // 32-bit base (0x18000000)
orr w9, w9, w21, lsl #30 // sf=1 → 0x58000000 for 64-bit
b orr_w9_emit
// ldp/stp Rt, Rt2, [Rn] | [Rn, #imm] | [Rn, #imm]! | [Rn], #imm
// Register roles: x22 = L (1 load, 0 store), x21 = sf, x23 = Rt,
// x24 = Rt2, x25 = Rn, x26 = bits XORed into the signed-offset base opcode
// to select the addressing mode, x20 = saved immediate
ei_ldst_pair:
// x22=L (already set by ei_ld/ei_st)
bl parse_x23_ws
mov x21, x1 // save sf (0=32-bit, 1=64-bit)
bl parse_register // Rt2
mov x24, x0 // Rt2
bl ws_x2_skip1 // skip ','
bl skip1_ws // skip '['
bl parse_register // Rn
mov x25, x0 // Rn
mov x26, #0 // addressing mode: 0=signed-offset
bl ws_x2
cmp w9, #']'
b.eq ei_pair_close
bl skip1_ws // skip ','
bl parse_hash_imm
// x0=value, x1=ptr past imm
// check for pre-index: ']' then '!'
mov x20, x0 // save imm value
bl ws_x1 // skip_ws from ptr past imm
cmp w9, #']'
b.ne ei_pair_pre_done
bl skip1_ws
cmp w9, #'!'
b.ne ei_pair_pre_done
movz w26, #0x0080, lsl #16 // pre-index: XOR sets bit 23
ei_pair_pre_done:
mov x0, x20 // restore imm value
b ei_pair_encode
ei_pair_close:
// saw ']' — check for post-index: ], #imm
bl skip1_ws // skip ']'
cmp w9, #','
b.ne 1f
bl skip1_ws // skip ','
bl parse_hash_imm
movz w26, #0x0180, lsl #16 // post-index: XOR flips bit24 off, bit23 on
b ei_pair_encode
1: mov x0, #0 // base-only: offset=0
ei_pair_encode:
// imm7 = offset scaled by the register size (no alignment or range check)
add w10, w21, #2 // shift: 2 (32-bit) or 3 (64-bit)
asr w0, w0, w10
and w0, w0, #0x7F // imm7
movz w9, #0x2900, lsl #16 // 32-bit STP/LDP base (signed offset)
orr w9, w9, w21, lsl #31 // sf=1 → 0xA900
eor w9, w9, w26 // apply addressing mode bits
orr w9, w9, w22, lsl #22
orr w9, w9, w0, lsl #15
orr w9, w9, w24, lsl #10
orr w9, w9, w25, lsl #5
orr w0, w9, w23
b emit_inst_done
// madd/msub Rd, Rn, Rm, Ra — 0x1B000000 (32) / 0x9B000000 (64)
// x26 = 0 for madd, 0x8000 (bit 15) for msub, preset by ei_m
ei_madd_msub_common:
bl parse_3reg // x22=Rd, x23=sf, x24=Rn, x0=Rm
mov x25, x0 // save Rm
bl ws_x2_skip1 // skip ','
bl parse_register // Ra
orr w9, w22, w0, lsl #10 // Rd | (Ra << 10)
orr w9, w9, w24, lsl #5 // | (Rn << 5)
orr w9, w9, w25, lsl #16 // | (Rm << 16)
mov w0, w26 // bit15 (0 or 0x8000)
movk w0, #0x1B00, lsl #16 // 32-bit base
orr w0, w0, w23, lsl #31 // sf
b orr_w9_emit
// mul Rd, Rn, Rm — MADD Rd, Rn, Rm, XZR
// 64-bit: 0x9B007C00 | (Rm<<16) | (Rn<<5) | Rd
// 32-bit: 0x1B007C00 | ...
ei_mul:
bl parse_3reg
movz w9, #0x7C00 // Ra = 31 (XZR) in bits 14:10
movk w9, #0x1B00, lsl #16 // 32-bit base
b emit_3reg_sf_tail
// lsl/lsr/asr Rd, Rn, #n — immediate shifts, encoded as UBFM/SBFM.
// Entered from ei_shift_common with x22 = Rd, x23 = sf, x24 = Rn,
// w25 = register-form opcode (only used here to tell asr from lsr) and
// x0 pointing at the '#'.
ei_shift_imm_dispatch:
ei_shift_imm:
bl parse_hash_imm
mov x21, x0 // shift amount
mov x11, #31
add x11, x11, x23, lsl #5 // size-1 = 31 or 63 (shared)
ldrb w9, [x19, #2]
cmp w9, #'r' // third letter 'r' → lsr or asr
b.eq ei_lsr_asr_imm
// LSL #n: UBFM Rd, Rn, #(-n mod size), #(size-1-n)
neg x10, x21
and x10, x10, x11 // immr = (-n) & (size-1)
sub x11, x11, x21 // imms = (size-1) - n
b ei_ubfm_emit
ei_lsr_asr_imm:
mov x10, x21 // immr = n
tbnz w25, #11, ei_asr_sbfm // bit 11 set in w25 = ASR (0x2800)
// Bitfield emit chain. Inputs: w0 = base opcode (set here or by the
// caller), w10 = immr, w11 = imms, x22 = Rd, x23 = sf, x24 = Rn.
ei_ubfm_emit:
movz w0, #0x5300, lsl #16 // UBFM base (sf+N applied below)
ei_bfm_apply_n_sf:
orr w0, w0, w23, lsl #22 // N bit = sf
ei_ubfm_orr:
orr w0, w0, w22
orr w0, w0, w24, lsl #5
orr w0, w0, w11, lsl #10
orr w0, w0, w10, lsl #16
b emit_with_sf
// ── unified bitfield handler (ubfx/ubfm/ubfiz/sbfx/sbfm/sbfiz/bfm/bfi/bfxil)
// The first mnemonic character selects the base opcode (s → SBFM,
// u → UBFM, otherwise BFM); the character after the "bf" part selects how
// the two immediates are turned into immr/imms ('x' extract, 'm' raw,
// otherwise insert).
ei_bfm_unified:
bl parse_2reg // x22=Rd, x23=sf, x0=Rn, x2=ptr past
mov x24, x0 // Rn
bl ws_x2_skip1 // skip ','
bl parse_hash_imm // #op3
mov x25, x0
bl ws_x1
bl skip1_ws // skip ','
bl parse_hash_imm // #op4
mov x9, x0 // op4 in x9
// determine base opcode from mnemonic first char (x19 preserved)
ldrb w10, [x19]
mov w11, #3 // suffix offset for u*/s* prefix
movz w0, #0x1300, lsl #16 // SBFM
cmp w10, #'s'
b.eq 1f
movz w0, #0x5300, lsl #16 // UBFM
cmp w10, #'u'
b.eq 1f
movz w0, #0x3300, lsl #16 // BFM
mov w11, #2 // suffix offset for b* prefix
1: ldrb w11, [x19, x11] // load distinguishing char
cmp w11, #'x'
b.eq bfm_extract_apply
cmp w11, #'m'
b.eq bfm_raw_apply
// insert: immr=(-lsb) mod size, imms=width-1 (fall-through from dispatch)
bfm_insert_apply:
sub x11, x9, #1
mov x10, #31
add x10, x10, x23, lsl #5 // size-1 = 31 or 63
neg x9, x25
and x10, x9, x10
b ei_bfm_apply_n_sf
// extract: immr=lsb(x25), imms=lsb+width-1 (falls through to raw)
bfm_extract_apply:
add x9, x25, x9
sub x9, x9, #1
// raw: immr=x25, imms=x9
bfm_raw_apply:
mov x10, x25
mov x11, x9
b ei_bfm_apply_n_sf
// (bitfield handlers unified into ei_bfm_unified above)
// mov — multiple forms: register, SP, or a MOVZ/MOVN-encodable immediate
// x22 = Rd, x23 = sf
ei_mov:
bl ws_x21_parse_reg
mov x22, x0 // Rd
mov x23, x1 // sf
bl ws_x2_skip1 // skip ','
cmp w9, #'#'
b.eq ei_mov_imm
// register form
bl parse_register
// if either reg is 31, use ADD Rd, Rn, #0 (handles SP)
// NOTE(review): if parse_register also reports xzr/wzr as 31, a
// "mov Rd, xzr" would take this path and be encoded as a move from SP —
// confirm how parse_register numbers the zero register.
cmp x22, #31
b.eq ei_mov_add
cmp x0, #31
b.eq ei_mov_add
// ORR Rd, XZR, Rm — x0 = Rm from parse_register
movz w9, #0x03E0
movk w9, #0x2A00, lsl #16
orr w9, w9, w22
orr w0, w9, w0, lsl #16
b emit_with_sf // x23=sf: sets bit 31 if 64-bit
ei_mov_add:
// ADD Rd, Rn, #0 — x0 = Rm from parse_register
orr w0, w22, w0, lsl #5
movk w0, #0x1100, lsl #16 // replaces bits 31:16; the low half keeps Rd/Rn
b emit_with_sf // x23 = sf
// mov Rd, #imm — look for a single 16-bit chunk at shift 0/16/32/48 that
// reproduces the value (MOVZ); failing that, try the same on the inverted
// value (MOVN); failing both, report a bad immediate.
// The search always works on the full 64-bit value, also for a W register.
ei_mov_imm:
bl parse_hash_imm
mov x24, x0
mov x26, #0 // phase: 0=MOVZ, 1=MOVN
ei_mov_try_phase:
mov x25, #0 // hw shift counter (reset each phase)
ei_mov_hw_loop:
lsr x9, x24, x25
and x9, x9, #0xFFFF
lsl x11, x9, x25
cmp x11, x24
b.eq ei_mov_found
add x25, x25, #16
cmp x25, #64
b.lt ei_mov_hw_loop
// try MOVN phase
cbnz x26, ei_logical_bad
mvn x24, x24
mov x26, #1
b ei_mov_try_phase
ei_mov_found:
// x9 = imm16, x25 = shift, x26 = phase (0=MOVZ, 1=MOVN)
movz w0, #0x5280, lsl #16 // MOVZ base
sub w0, w0, w26, lsl #30 // MOVN: subtract 0x40000000 (clear bit 30)
orr w0, w0, w23, lsl #31 // sf bit
orr w0, w0, w22 // Rd
orr w0, w0, w9, lsl #5 // imm16
orr w0, w0, w25, lsl #17 // hw (shift_amount << 17 = hw << 21)
b emit_inst_done
// movz/movn/movk Rd, #imm16 [, lsl #N]
// w22 = base opcode preset by ei_mo; x23 = Rd, x24 = sf, w25 = imm16
ei_movwide:
bl parse_x23_ws
mov x24, x1 // sf
bl parse_hash_imm // #imm16
and w25, w0, #0xFFFF // imm16 (callee-saved)
// check for optional ", lsl #N"
bl ws_x1
cmp w9, #','
mov w10, #0 // hw = 0 default (doesn't affect flags)
b.ne ei_movwide_emit
bl skip_lsl // skip ", lsl" + parse_hash_imm
mov w10, w0 // raw shift amount (not validated)
ei_movwide_emit:
orr w0, w22, w24, lsl #31 // base | sf
orr w0, w0, w23 // | Rd
orr w0, w0, w25, lsl #5 // | imm16
orr w0, w0, w10, lsl #17 // | hw (shift<<17 = hw<<21)
b emit_inst_done
// 't' mnemonics: tst, tbz, tbnz
// tbz/tbnz Rt, #bit, label — b5 011011 op b40 imm14 Rt
ei_t:
cmp w10, #'s'
b.eq ei_tst
ldrb w9, [x19, #2]
cmp w9, #'z'
b.eq 1f
cmp w9, #'n'
b.ne ei_bad
1: sub x22, x20, #3 // 0 for tbz (len=3), 1 for tbnz (len=4)
ei_tbz_common:
bl parse_x23_ws // x23 = Rt
bl parse_hash_imm
mov x24, x0 // bit number
bl ws_x1
add x0, x0, #1 // skip ','
bl parse_label_pc_rel
and w0, w0, #0x3FFF // imm14 (no range check)
orr w0, w23, w0, lsl #5
bfi w0, w24, #19, #5 // b40 = bit number [4:0]
lsr w9, w24, #5
orr w0, w0, w9, lsl #31 // b5 = bit number [5]
orr w0, w0, w22, lsl #24 // op: 0 = tbz, 1 = tbnz
movz w9, #0x3600, lsl #16
// shared tail: OR w9 into w0 and emit (also used by load/store and
// madd/msub)
orr_w9_emit:
orr w0, w0, w9
b emit_inst_done
// 'n' mnemonics: nop (second char 'o'), otherwise neg
// neg Rd, Rm — alias for sub Rd, xzr, Rm
ei_n:
cmp w10, #'o'
b.eq ei_nop
ei_neg:
movz w25, #0x4B00, lsl #16 // 32-bit SUB base
// shared by neg (w25 = SUB base) and mvn (w25 = ORN base, set at ei_mvn):
// both encode "op Rd, xzr, Rm"
ei_neg_mvn_common:
bl parse_2reg // x22=Rd, x23=sf, x0=Rm
mov x24, #31 // Rn = xzr
mov w9, w25
b emit_3reg_sf_tail
// nop — 0xD503201F
ei_nop:
movz w0, #0x201F
movk w0, #0xD503, lsl #16
b emit_inst_done
// ── shared emit tails ─────────────────────────────────────────────────────
// Three entry points into one fall-through chain:
// emit_3reg_1AC0_tail: w9=low opcode bits, x23=sf, x0=Rm, x22=Rd, x24=Rn
// sets the upper half of w9 to 0x1AC0, then continues below
// emit_3reg_sf_tail: w9=32-bit base, x23=sf -> set bit31 if sf, then emit_3reg_tail
// emit_3reg_tail: w9=base, x0=Rm, x22=Rd, x24=Rn -> emit and done
// Final word = w9 | Rd | (Rn << 5) | (Rm << 16).
emit_3reg_1AC0_tail:
movk w9, #0x1AC0, lsl #16 // keeps the low 16 bits, replaces bits 31:16
emit_3reg_sf_tail:
orr w9, w9, w23, lsl #31
emit_3reg_tail:
orr w9, w9, w22
orr w9, w9, w24, lsl #5
orr w0, w9, w0, lsl #16
b emit_inst_done
// ldst_base: build the field skeleton shared by the load/store encodings
//   In:   x20 = size, x22 = opc, w23 = Rt, x25 = Rn
//   Out:  w0  = size<<30 | opc<<22 | Rn<<5 | Rt
//   Writes only w0; flags are left untouched.
// The size field is merged as a shifted ORR operand instead of a separate
// leading LSL, which keeps the routine at three instructions plus ret.
ldst_base:
    orr     w0, w23, w25, lsl #5    // Rt | Rn << 5
    orr     w0, w0, w22, lsl #22    // | opc << 22
    orr     w0, w0, w20, lsl #30    // | size << 30
    ret
// ══════════════════════════════════════════════════════════════════════════
// Compression — two-tier dictionary encoder
// ══════════════════════════════════════════════════════════════════════════
// ──────────────────────────────────────────────────────────────────────────
// compress_text — encode text_buf, one 32-bit word at a time, into input_buf
//
// In:   x27 = text_buf, [x28, #ST_TEXT_POS] = text size in bytes
// Out:  x0  = bytes written to input_buf (end marker included)
//
// Record written for each text word:
//   tag 1 .. FULL_DICT_ENTRIES            word == full_dict[tag - 1]
//   tag 0x80 .. 0x80+HALF_DICT_ENTRIES-1  upper 16 bits == half_dict[tag - 0x80],
//                                         followed by the low 16 bits
//   tag 0x7F                              raw escape, followed by the whole word
// A single 0x00 byte terminates the stream.
//
// input_buf doubles as the output area; per the original note this is safe
// because the input has already been consumed. Leaf routine (no bl).
// ──────────────────────────────────────────────────────────────────────────
compress_text:
    ldr     x1, [x28, #ST_TEXT_POS] // x1  = text size
    mov     x12, x27                // x12 = read cursor
    adr     x10, full_dict          // x10 = tier-1 table
    add     x2, x28, #INPUT_BUF_OFF // x2  = write cursor (input_buf)
    add     x13, x27, x1            // x13 = end of text
    add     x11, x10, #FULL_DICT_SIZE // x11 = half_dict, placed right after full_dict
ct_loop:
    cmp     x12, x13
    b.hs    ct_done                 // all words consumed
    ldr     w3, [x12], #4           // w3 = next text word
    // tier 1: exact match against full_dict, w5 counts tags from 1
    mov     w5, #1
    mov     x4, x10
ct_full:
    ldr     w6, [x4], #4
    cmp     w6, w3
    b.eq    ct_emit_full
    add     w5, w5, #1
    cmp     w5, #(FULL_DICT_ENTRIES + 1)
    b.lo    ct_full
    // tier 2: match the upper halfword against half_dict, tags from 0x80
    mov     w5, #0x80
    mov     x4, x11
    lsr     w7, w3, #16             // w7 = upper 16 bits of the word
ct_half:
    ldrh    w6, [x4], #2
    cmp     w6, w7
    b.eq    ct_emit_half
    add     w5, w5, #1
    cmp     w5, #(0x80 + HALF_DICT_ENTRIES)
    b.lo    ct_half
    // no table hit: escape tag plus the untouched word
    mov     w5, #0x7F
    strb    w5, [x2], #1
    str     w3, [x2], #4
    b       ct_loop
ct_emit_half:
    strb    w5, [x2], #1            // tag
    strh    w3, [x2], #2            // low 16 bits of the word
    b       ct_loop
ct_emit_full:
    strb    w5, [x2], #1            // tag alone identifies the word
    b       ct_loop
ct_done:
    strb    wzr, [x2], #1           // end marker
    sub     x0, x2, #INPUT_BUF_OFF
    sub     x0, x0, x28             // x0 = write cursor - start of input_buf
    ret
// ── full instruction dictionary (126 entries) ──────────────────
// generated by gen_dict.py — do not edit manually
//
// One 32-bit instruction word per entry. compress_text scans this table in
// order and, on an exact match, emits the single tag byte (position + 1), so
// tags run from 1 to 126. Tag 0 is the end-of-stream marker and 0x7F the raw
// escape, which is why the table stops at 126 entries.
// The decompressor stub indexes this table directly with the tag
// (base biased by -4), so entry order must stay in sync with the encoder.
full_dict:
.word 0xd65f03c0
.word 0x91000400
.word 0xaa0003f8
.word 0x540000a1
.word 0x39400009
.word 0xaa0003f4
.word 0xaa0003f9
.word 0x54000060
.word 0x7100b13f
.word 0x7101b95f
.word 0xaa0103f8
.word 0xaa1303e0
.word 0xd4000001
.word 0xf9400f8b
.word 0x14000002
.word 0x2a160000
.word 0x39400a6a
.word 0x54000041
.word 0x7100255f
.word 0x71008d3f
.word 0x7101753f
.word 0xaa0003f3
.word 0xaa1403e0
.word 0x17fffff5
.word 0x38401409
.word 0x5100c12a
.word 0x7101853f
.word 0xaa0003f6
.word 0xaa0103f5
.word 0xaa1e03f0
.word 0xd2800000
.word 0x1200200a
.word 0x14000005
.word 0x17ffffef
.word 0x2a160129
.word 0x2a170000
.word 0x38001445
.word 0x39400a69
.word 0x5100c14b
.word 0x52800019
.word 0x54000061
.word 0x54000080
.word 0x540000c0
.word 0x54000100
.word 0x54000120
.word 0x54000140
.word 0x540001a0
.word 0x540001e1
.word 0x54ffff63
.word 0x6b0b015f
.word 0x7100257f
.word 0x7100b93f
.word 0x7101895f
.word 0x7101915f
.word 0x7101b15f
.word 0x7101b93f
.word 0x7101bd5f
.word 0x7101c93f
.word 0x7101c95f
.word 0x7101cd3f
.word 0x7101cd5f
.word 0x7101d13f
.word 0x7101d55f
.word 0x7101e15f
.word 0x7101e95f
.word 0xa90157f4
.word 0xa9025ff6
.word 0xa9bc4ffe
.word 0xaa0003e9
.word 0xaa0003f5
.word 0xaa0103f4
.word 0xaa0103f7
.word 0xaa1003fe
.word 0xaa1403e1
.word 0xd280001a
.word 0xf1000e9f
.word 0xf86b6b8a
.word 0xf9001bfe
.word 0xf940038a
.word 0xf9401bfe
.word 0xf9401f89
.word 0x00000000
.word 0x110004a5
.word 0x12004800
.word 0x14000003
.word 0x14000004
.word 0x14000006
.word 0x14000008
.word 0x14000009
.word 0x1400017b
.word 0x17ffffd7
.word 0x17ffffdd
.word 0x17ffffdf
.word 0x17fffff9
.word 0x2a0016c0
.word 0x2a0016e0
.word 0x2a004120
.word 0x2a090000
.word 0x2a091400
.word 0x2a0a3000
.word 0x2a0b2800
.word 0x2a154000
.word 0x321b014b
.word 0x3940040a
.word 0x5101856b
.word 0x52a26000
.word 0x52aa6000
.word 0x54000081
.word 0x540000a0
.word 0x54000101
.word 0x540001c0
.word 0x540002a0
.word 0x7100657f
.word 0x71009d3f
.word 0x7100b53f
.word 0x7100c13f
.word 0x7100e93f
.word 0x7101713f
.word 0x71017d5f
.word 0x7101893f
.word 0x7101953f
.word 0x7101dd3f
.word 0x92800c60
.word 0x9400000c
.word 0x97fffe94
.word 0x9a9f17e1
// ── top-half dictionary (128 entries, packed as 64 words) ─────
// Each entry is a 16-bit value that compress_text compares against the upper
// halfword of a text word; it reads the table one halfword at a time (ldrh,
// post-increment 2), assigning tags upward from 0x80. On a hit the encoder
// writes the tag followed by the word's low 16 bits.
// Two entries share each .word below; assuming the usual little-endian data
// layout, the low halfword of a .word is the lower-numbered entry.
// Must directly follow full_dict: the encoder locates it at
// full_dict + FULL_DICT_SIZE.
half_dict:
.word 0x540097ff
.word 0x17ffd280
.word 0x14009100
.word 0x94007101
.word 0x3940f940
.word 0xb4005280
.word 0x54ffd100
.word 0x2a19f900
.word 0xaa007100
.word 0xb5ff1200
.word 0x2a001000
.word 0x2a182a16
.word 0x91048b0c
.word 0xaa029240
.word 0x2a172a0a
.word 0x38403400
.word 0xf10052a2
.word 0x52841100
.word 0x8b0b8b09
.word 0x91038b1d
.word 0xaa09aa01
.word 0xb7f8aa0c
.word 0x2a01dac0
.word 0x2a1a2a09
.word 0x5000321b
.word 0x52855100
.word 0x700052a6
.word 0x8b0a72a3
.word 0xa9419a9f
.word 0xaa0aa9bf
.word 0xb500aa15
.word 0xd503b840
.word 0xf86bf860
.word 0x32161a89
.word 0x37083608
.word 0x38603800
.word 0x52a152a0
.word 0x52aa52a5
.word 0x53035302
.word 0x784072ba
.word 0x8a0a7940
.word 0x8b108b00
.word 0x8b178b15
.word 0x92749101
.word 0x9a809280
.word 0xa8c19ad9
.word 0xa905a902
.word 0xaa13aa0d
.word 0xcb00b4ff
.word 0xcb19cb09
.word 0xd2a0d000
.word 0xd50bd379
.word 0xeb0dd61f
.word 0x2a15f2c0
.word 0x3900381f
.word 0x52a3528e
.word 0x6b065308
.word 0x8b0f8b01
.word 0xa9009ac0
.word 0xa906a901
.word 0xaa0baa07
.word 0xaa19aa17
.word 0xb940b800
.word 0xd344cb15
// ══════════════════════════════════════════════════════════════════════════
// Decompressor stub — copied verbatim to output at CODE_START (0x78)
//
// Runs at the ELF entry point. Decompresses .text to a page-aligned
// address (DECOMP_DEST = ceil_page(p_filesz) + 0x78) so all ADRP+ADD
// encodings are preserved. Copies rodata, flushes icache, jumps.
// The stub runs in place and is never overwritten.
//
// Register roles:
//   x6 = stub base                x7 = decompression destination / final entry
//   x8 = rodata byte count        x0 = stream read cursor, x1 = output write cursor
//   x2 = full_dict - 4            x3 = half_dict - 256
//        (both table bases are biased so the tag byte indexes them directly)
//
// Stream tags: 0 = end, 1..0x7E = full_dict index, 0x7F = raw 32-bit word,
// 0x80..0xFF = half_dict index followed by the low 16 bits.
//
// Unsupported instructions encoded as .word constants:
//   dc cvau, ic ivau, dsb ish, isb
// ══════════════════════════════════════════════════════════════════════════
_decomp_stub_start:
    adr     x6, .                   // get our address
    // compute decompression destination + preload rodata_size
    ldp     w7, w8, [x6, #STUB_DATA_DECOMP_DEST]
    add     x7, x6, x7              // x7 = stub_base + offset
    // set up dict/stream pointers (right after stub in file)
    add     x2, x6, #(STUB_SIZE - 4) // full_dict - 4 (1-based index → 0-based via ptr adjust)
    add     x3, x2, #(FULL_DICT_SIZE - 256 + 4) // half_dict adjusted for 0x80-based index
    add     x0, x2, #(FULL_DICT_SIZE + HALF_DICT_SIZE + 4) // stream
    mov     x1, x7                  // output dest
    // ── decompress ────────────────────────────────────────────────────────
3:  ldrb    w4, [x0], #1            // next tag byte
    cbz     w4, _decomp_copy_rodata // tag 0 ends the stream
    ldr     w5, [x2, x4, lsl #2]    // speculative full dict (harmless if half/raw)
    tbz     w4, #7, 5f              // bit 7 clear → full dict or raw
    ldrh    w5, [x3, x4, lsl #1]    // half dict: upper 16 bits
    ldrh    w9, [x0], #2            // low 16 bits come from the stream
    orr     w5, w9, w5, lsl #16
5:  cmp     w4, #0x7F
    b.ne    6f
    ldr     w5, [x0], #4            // raw: overwrite with stream word
6:  str     w5, [x1], #4
    b       3b
    // ── copy rodata ───────────────────────────────────────────────────────
    // x8 bytes following the end marker are copied behind the decompressed text.
_decomp_copy_rodata:
    cbz     x8, _decomp_flush
7:  ldrb    w3, [x0], #1
    strb    w3, [x1], #1
    sub     x8, x8, #1
    cbnz    x8, 7b
    // ── icache flush (x7=start, x1=end) ───────────────────────────────────
    // The walk advances in 64-byte steps (a 64-byte line is assumed; CTR_EL0
    // is not consulted). x7 need not be line-aligned, so the walk starts at
    // the beginning of the line that contains x7; starting at x7 itself could
    // end the loop before reaching the line that holds the last bytes written.
_decomp_flush:
    lsr     x0, x7, #6
    lsl     x0, x0, #6              // x0 = x7 rounded down to a 64-byte boundary
7:  .word   0xd50b7b20              // dc cvau, x0
    .word   0xd5033b9f              // dsb ish
    .word   0xd50b7520              // ic ivau, x0
    add     x0, x0, #64
    cmp     x0, x1
    b.lo    7b
    .word   0xd5033b9f              // dsb ish
    .word   0xd5033fdf              // isb
    // jump to decompressed entry
    br      x7
    // data block (2 x uint32, patched by assembler at output time)
_decomp_data_decomp_dest:
    .word   0
_decomp_data_rodata_size:
    .word   0
_decomp_stub_end:
// computed stub constants (auto-adjust when stub changes)
.equ STUB_SIZE, (_decomp_stub_end - _decomp_stub_start)
.equ STUB_DATA_DECOMP_DEST, (_decomp_data_decomp_dest - _decomp_stub_start)
.equ STUB_DATA_RODATA_SIZE, (_decomp_data_rodata_size - _decomp_stub_start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment