eiz/asm.s

## asm.s
// asm.s — self-hosting aarch64 assembler
//
// reads an aarch64 assembly source file (GAS-compatible subset),
// emits a static PIE ELF binary directly. no linker required.
//
// usage: asm <input.s> <output>
//
// this file was created solely by Claude (Opus 4.6 and Sonnet 4.6) and is
// in the public domain (or CC0 1.0, if you prefer).
//
// to bootstrap with GNU tools: gcc -o asm0 -nostdlib asm.s && ./asm0 asm.s asm
//
// current binary size: 5543 bytes
//
// ── supported instructions ────────────────────────────────────────────────
//
//  arithmetic/logic:
//    add   Rd, Rn, #imm12 | Rm [, lsl #N]     sub (same forms)
//    adds  Rd, Rn, #imm12 | Rm [, lsl #N]     subs (same forms)
//    cmp   Rn, #imm12 | Rm [, lsl #N]         cmn (same forms)
//    and   Rd, Rn, #bitmask | Rm [, lsl #N]   orr, eor (same forms)
//    ands  Rd, Rn, #bitmask | Rm [, lsl #N]   (flag-setting AND)
//    tst   Rn, #bitmask | Rm                  (ANDS alias, Rd=XZR)
//    bic   Rd, Rn, Rm
//    neg   Rd, Rm                             mvn Rd, Rm
//    mul   Rd, Rn, Rm                         msub Rd, Rn, Rm, Ra
//    madd  Rd, Rn, Rm, Ra                     (Rd = Ra + Rn*Rm)
//    udiv  Rd, Rn, Rm                         sdiv Rd, Rn, Rm
//    nop
//
//  moves:
//    mov   Rd, Rm | #imm (MOVZ/MOVN-encodable) | SP
//    movz  Rd, #imm16 [, lsl #N]              movn, movk (same forms)
//
//  shifts:
//    lsl   Rd, Rn, Rm | #N                    lsr, asr (same forms)
//    ror   Rd, Rn, Rm
//
//  bitfield:
//    ubfm  Rd, Rn, #immr, #imms            sbfm, bfm (same forms)
//    ubfx  Rd, Rn, #lsb, #width            sbfx (same form)
//    ubfiz Rd, Rn, #lsb, #width            sbfiz, bfi (same form)
//    bfxil Rd, Rn, #lsb, #width
//    sxtb  Rd, Wn    sxth Rd, Wn    sxtw Rd, Wn
//    uxtb  Wd, Wn    uxth Wd, Wn
//
//  bit manipulation:
//    clz   Rd, Rn                             rbit Rd, Rn
//
//  branches:
//    b     label       bl label       br Xn      blr Xn     ret
//    b.cc  label       (eq ne hs lo mi pl vs vc hi ls ge lt gt le al cs cc)
//    cbz   Rt, label   cbnz Rt, label
//    tbz   Rt, #bit, label            tbnz Rt, #bit, label
//
//  conditional:
//    csel  Rd, Rn, Rm, cc             csinc Rd, Rn, Rm, cc
//    cset  Rd, cc
//
//  address:
//    adr   Rd, expr                   adrp Rd, symbol
//
//  load/store (single):
//    ldr   Rt, [Rn {, #imm | :lo12:sym}]      str (same forms)
//    ldr   Rt, [Rn, Rm {, lsl #N}]            str (same forms)
//    ldr   Rt, [Rn, #simm9]!                  str (pre-index)
//    ldr   Rt, [Rn], #simm9                   str (post-index)
//    ldr   Rt, label                           (PC-relative literal)
//    ldrb  (same addressing modes)             strb
//    ldrh  (same addressing modes)             strh
//    ldrsb Rt, [Rn {, ...}]                   ldrsh, ldrsw
//
//  load/store (pair):
//    ldp   Rt1, Rt2, [Rn {, #imm}]            stp (same forms)
//    ldp   Rt1, Rt2, [Rn, #imm]!              stp (pre-index)
//    ldp   Rt1, Rt2, [Rn], #imm               stp (post-index)
//
//  system:
//    svc   #imm16
//
// ── registers ─────────────────────────────────────────────────────────────
//    x0-x30, w0-w30, xzr, wzr, sp   (no fp/lr aliases)
//
// ── directives ────────────────────────────────────────────────────────────
//    .text  .bss  .section .rodata  .global name  .equ name, expr
//    .word expr   .ascii "str"      .asciz "str"
//    .align N     .skip N
//
// ── expressions ───────────────────────────────────────────────────────────
//    operators: | & + - * << >>   unary: ~ -   grouping: ( )
//    atoms: 123  0xFF  'A'  '\n'  .  label  :lo12:expr
//    labels: name:  N: (numeric 0-9, ref as Nf/Nb)
//    comments: //
//
// ── output ────────────────────────────────────────────────────────────────
//    ELF64 static PIE, single LOAD segment (RWX), no section headers.
//    .text is dictionary-compressed; decompressor stub runs at entry.
//

// ── syscall numbers ───────────────────────────────────────────────────────
// Linux AArch64 syscall numbers; loaded into x8 before `svc #0`.
.equ SYS_exit,       93
.equ SYS_read,       63
.equ SYS_write,      64
.equ SYS_openat,     56
.equ SYS_close,      57
.equ SYS_fchmod,     52

// ── file constants ────────────────────────────────────────────────────────
// AT_FDCWD is the openat dirfd value meaning "relative to the current directory".
.equ AT_FDCWD,             -100
.equ O_RDONLY,             0
.equ O_WRONLY_CREAT_TRUNC, 577   // O_WRONLY|O_CREAT|O_TRUNC = 1|64|512
.equ STDERR,               2

// ── ELF constants ─────────────────────────────────────────────────────────
// CODE_START is written by hand and must be kept equal to the sum of the two
// sizes above it. _start uses it as the write length of the header block,
// as e_entry, and as the base address of .text.
.equ ELF_HEADER_SIZE, 64
.equ PHDR_SIZE,       56
.equ CODE_START,      120        // ELF_HEADER_SIZE + PHDR_SIZE

// ── compression constants ─────────────────────────────────────────────────
// STUB_SIZE and STUB_DATA_* are computed from labels after _decomp_stub_end
// The *_DICT_SIZE values are hand-computed from the entry counts.
.equ FULL_DICT_ENTRIES, 126
.equ HALF_DICT_ENTRIES, 128
.equ FULL_DICT_SIZE,    504        // 126 * 4
.equ HALF_DICT_SIZE,    256        // 128 * 2

// ── section IDs ───────────────────────────────────────────────────────────
// A section ID is also the byte offset of that section's ST_*_POS slot, so
// `ldr x, [x28, <id>]` reads the section's current position directly.
.equ SEC_TEXT,       0          // pre-multiplied by 8 for direct state block indexing
.equ SEC_RODATA,     8
.equ SEC_BSS,        16

// ── state block offsets (all u64) ─────────────────────────────────────────
// Adjacent slots are deliberately paired so that one ldp/stp can touch two
// of them (e.g. TEXT_POS+RODATA_POS, FILE_SIZE+MEM_SIZE, INPUT_NAME+OUTPUT_NAME).
// Offset 96 has no name assigned in this table.
.equ ST_TEXT_POS,    0          // current offset within .text
.equ ST_RODATA_POS,  8          // current offset within .rodata
.equ ST_BSS_POS,     16         // current offset within .bss
.equ ST_CUR_SEC,     24         // current section (SEC_TEXT/RODATA/BSS)
.equ ST_TEXT_BASE,   32         // virtual address of .text start
.equ ST_RODATA_BASE, 40         // virtual address of .rodata start
.equ ST_BSS_BASE,    48         // virtual address of .bss start
.equ ST_PASS,        56         // current pass (1 or 2)
.equ ST_LINE_NUM,    64         // current source line number
.equ ST_INPUT_LEN,   72         // input file length in bytes
.equ ST_FILE_SIZE,   80         // end of file-backed image; _start stores bss_base here
.equ ST_MEM_SIZE,    88         // total memory size (file + bss)
.equ ST_INPUT_NAME,  104        // pointer to input filename string
.equ ST_OUTPUT_NAME, 112        // pointer to output filename string
.equ ST_SIZE,        120

// ── symbol table entry layout (32 bytes) ──────────────────────────────────
// name_ptr  u64 @ 0   pointer to name in input buffer (0 = empty slot)
// name_len  u32 @ 8   length of name
// flags     u64 @ 16  SYMF_* bits
// value     u64 @ 24  address or .equ value
// The table is open-addressed (linear probing, see sym_lookup); the slot
// index is computed with a mask, hence the power-of-2 requirement.
.equ SYM_ENT_SIZE,   32
.equ SYM_NAME_PTR,   0
.equ SYM_NAME_LEN,   8
.equ SYM_FLAGS,      16
.equ SYM_VALUE,      24
.equ SYM_TBL_SLOTS,  1024       // must be power of 2

// ── symbol flags ──────────────────────────────────────────────────────────
.equ SYMF_DEFINED,   1
.equ SYMF_GLOBAL,    2
.equ SYMF_EQU,       4
.equ SYMF_SEC_SHIFT, 4          // section stored in bits 5:4 of flags

// ── buffer sizes ──────────────────────────────────────────────────────────
// The three 1 MB buffers must stay the same size: code steps between them
// with a single 1 MB stride held in x29.
.equ INPUT_BUF_SIZE,  1048576    // 1 MB
.equ TEXT_BUF_SIZE,   1048576    // 1 MB
.equ RODATA_BUF_SIZE, 1048576    // 1 MB
.equ SYM_TBL_BYTES,   32768      // SYM_TBL_SLOTS * SYM_ENT_SIZE

// ── BSS offsets from x28 (state block pointer) ────────────────────────────
// These mirror the order of the .bss definitions (state, numlab_cnts,
// numlab_curs, input_buf). They reference NUMLAB_DIGITS, which is defined a
// few lines further down. The trailing numbers are the resulting values for
// ST_SIZE = 120 and NUMLAB_DIGITS = 10.
.equ NUMLAB_CNTS_OFF, ST_SIZE                             // 120
.equ NUMLAB_CURS_OFF, NUMLAB_CNTS_OFF + NUMLAB_DIGITS * 8 // 200
.equ INPUT_BUF_OFF,   NUMLAB_CURS_OFF + NUMLAB_DIGITS * 8 // 280

// ── numeric labels ────────────────────────────────────────────────────────
.equ NUMLAB_MAX_DEFS, 128        // max definitions per digit
.equ NUMLAB_DIGITS,   10         // digits 0-9

// ══════════════════════════════════════════════════════════════════════════
//  BSS
// ══════════════════════════════════════════════════════════════════════════
.bss
.align 4
// The order and sizes below are load-bearing. Code never names these labels
// (apart from `state`); it reaches them by arithmetic:
//   x28           = state
//   x28 + *_OFF   = numlab_cnts / numlab_curs / input_buf
//   x27           = text_buf   (input_buf + 1 MB)
//   x27 + 1 MB    = rodata_buf
//   x27 + 2 MB    = sym_table
state:        .skip ST_SIZE
numlab_cnts:  .skip NUMLAB_DIGITS * 8
numlab_curs:  .skip NUMLAB_DIGITS * 8
input_buf:    .skip INPUT_BUF_SIZE
text_buf:     .skip TEXT_BUF_SIZE
rodata_buf:   .skip RODATA_BUF_SIZE
sym_table:    .skip SYM_TBL_BYTES
// numeric label storage: 10 digits × 128 defs × 8 bytes
numlab_defs:  .skip NUMLAB_DIGITS * NUMLAB_MAX_DEFS * 8

// ══════════════════════════════════════════════════════════════════════════
//  Read-only data
// ══════════════════════════════════════════════════════════════════════════
.section .rodata

// Diagnostic strings. Each one carries its own trailing newline and a NUL
// terminator (.asciz); die_msg measures them with strlen_x1.
msg_usage:    .asciz "usage: asm <input.s> <output>\n"
msg_open:     .asciz "cannot open input file\n"
msg_create:   .asciz "cannot create output file\n"
msg_syntax:   .asciz "syntax error\n"
msg_undef:    .asciz "undefined symbol\n"

msg_badins:   .asciz "unknown instruction\n"
msg_badimm:   .asciz "invalid immediate\n"

// condition code XOR lookup: cond_xor_tbl[(c0^c1) & 0x1F] = cond code (31=invalid)
// The table is 32 bytes, stored as little-endian words (byte 0 is the low
// byte of the first .word). c0/c1 are the two letters of the condition name.
// Examples: "eq" → 'e'^'q' = 0x14 → byte 20 = 0x00 (EQ)
//           "ne" → 'n'^'e' = 0x0B → byte 11 = 0x01 (NE)
//           "cc" → 'c'^'c' = 0x00 → byte 0  = 0x03 (same code as "lo")
cond_xor_tbl:
    .word 0x030A0803
    .word 0x1F1F0604
    .word 0x011F0D1F
    .word 0x1F1F0E1F
    .word 0x0C1F1F02
    .word 0x1F1F0700
    .word 0x021F1F0B
    .word 0x091F1F05

// operator table for expression parser: 2-byte entries (char, packed+0x20), sentinel=\0
// packed = (prec<<4)|opcode: | →0x10 & →0x21 + →0x32 - →0x33 * →0x44
// The +0x20 bias keeps every second byte printable so the whole table fits in
// one .ascii string: '0'=0x30, 'A'=0x41, 'R'=0x52, 'S'=0x53, 'd'=0x64.
op_table:     .ascii "|0&A+R-S*d\0"

// ══════════════════════════════════════════════════════════════════════════
//  Code
// ══════════════════════════════════════════════════════════════════════════
.text
.global _start

// ──────────────────────────────────────────────────────────────────────────
//  _start — entry point
//
//  Flow: check argc → read the input file → pass 1 → lay out sections →
//  rebase symbols → pass 2 → compress .text → build the ELF/program header
//  on the stack → write header, stub, dictionaries, stream and .rodata →
//  fchmod → exit.
//
//  Registers pinned for the lifetime of the program:
//    x28 = state block          x29 = 0x100000 (1 MB buffer stride)
//    x27 = text_buf
//  Registers used during output:
//    x19 = output fd            x20 = compressed stream size
//    x21 = rodata size          x11 = ceil_page(p_filesz)
//    x12 = p_filesz             x13 = p_memsz
// ──────────────────────────────────────────────────────────────────────────
_start:
    // grab argc / argv from the stack
    ldr     x0, [sp]                // argc
    cmp     x0, #3
    b.lt    err_usage

    // set up state block pointer (x28 is callee-saved, lives forever)
    adr     x28, state
    // pin x29 = 0x100000 (1 MB stride between section buffers)
    movz    x29, #0x10, lsl #16
    // pin x27 = text_buf (x28 + INPUT_BUF_OFF + INPUT_BUF_SIZE = x28 + 0x100118)
    add     x27, x28, x29
    add     x27, x27, #INPUT_BUF_OFF

    // store input/output filenames
    ldp     x1, x0, [sp, #16]       // x1=argv[1] (input), x0=argv[2] (output)
    stp     x1, x0, [x28, #ST_INPUT_NAME]

    // ── open and read the input file ──────────────────────────────────────
    // x1 already holds input filename from ldp above
    mov     x0, #AT_FDCWD
    mov     x2, #O_RDONLY
    mov     x8, #SYS_openat
    svc     #0
    tbnz    x0, #63, err_open       // negative return = -errno

    // x0 = input fd. A single read is issued: at most INPUT_BUF_SIZE bytes
    // are taken, and anything beyond that is not seen by the passes.
    add     x1, x28, #INPUT_BUF_OFF    // input_buf
    mov     x2, #INPUT_BUF_SIZE
    mov     x8, #SYS_read
    svc     #0
    tbnz    x0, #63, err_open       // read failure reuses the "cannot open" message
    str     x0, [x28, #ST_INPUT_LEN]

    // ── pass 1: collect symbols and measure sections ──────────────────────
    mov     x0, #1
    bl      run_pass

    // ── compute section base addresses ────────────────────────────────────
    // Sections are laid out back to back: .text at CODE_START, then .rodata,
    // then .bss.
    ldp     x1, x2, [x28, #ST_TEXT_POS] // text_pos, rodata_pos
    mov     x0, #CODE_START
    add     x1, x0, x1                  // rodata_base = text_base + text_size
    stp     x0, x1, [x28, #ST_TEXT_BASE]

    add     x2, x1, x2                  // bss_base = rodata_base + rodata_size
    str     x2, [x28, #ST_BSS_BASE]

    ldr     x3, [x28, #ST_BSS_POS]
    add     x3, x2, x3                  // mem_size = bss_base + bss_size
    stp     x2, x3, [x28, #ST_FILE_SIZE]   // FILE_SIZE = bss_base, MEM_SIZE = mem_size

    // ── rebase symbols: add section bases to label addresses ──────────────
    bl      rebase_symbols

    // ── pass 2: encode instructions and emit data ─────────────────────────
    mov     x0, #2
    bl      run_pass

    // ── compress .text section ────────────────────────────────────────────
    bl      compress_text
    mov     x20, x0                   // x20 = compressed stream size

    // ── allocate stack: 128 (ELF header) ──────────────────────────────────
    // Only the first CODE_START (120) bytes are filled in and written.
    sub     sp, sp, #128

    // p_filesz = CODE_START + STUB_SIZE + FULL_DICT_SIZE + HALF_DICT_SIZE + stream + rodata
    ldr     x21, [x28, #ST_RODATA_POS]    // x21 = rodata_size (callee-saved)
    add     x12, x20, x21              // stream + rodata
    add     x12, x12, #(CODE_START + STUB_SIZE + FULL_DICT_SIZE + HALF_DICT_SIZE)

    // DECOMP_DEST_OFF = ceil_page(p_filesz) — offset from stub base
    add     x11, x12, #0xFFF
    and     x11, x11, #0xFFFFFFFFFFFFF000  // ceil to page

    // p_memsz = ceil_page(p_filesz) + total_mem_size
    ldr     x13, [x28, #ST_MEM_SIZE]
    add     x13, x13, x11

    // ELF magic + e_ident[0..7] + zeros [8..15]
    // bytes: 7f 'E' 'L' 'F' | 02 (64-bit) | 01 (little-endian) | 01 (version) | 00
    movz    x9, #0x457f
    movk    x9, #0x464c, lsl #16
    movk    x9, #0x0102, lsl #32
    movk    x9, #0x0001, lsl #48
    stp     x9, xzr, [sp]

    // e_type=3, e_machine=0xB7, e_version=1 + e_entry=0x78 (stub entry)
    movz    x9, #3
    movk    x9, #0x00B7, lsl #16
    movk    x9, #1, lsl #32
    mov     x10, #CODE_START           // e_entry = stub at 0x78
    stp     x9, x10, [sp, #16]

    // e_phoff=64 + e_shoff=0
    mov     x9, #64
    stp     x9, xzr, [sp, #32]

    // e_flags=0|e_ehsize=64|e_phentsize=56 + e_phnum=1|rest=0
    lsl     x9, x9, #32             // x9 was 64 → 0x0040_0000_0000
    movk    x9, #0x0038, lsl #48
    mov     x10, #1
    stp     x9, x10, [sp, #48]

    // p_type=1|p_flags=7 + p_offset=0
    // x10 still holds 1 from above; the movk adds p_flags in the upper word.
    movk    x10, #7, lsl #32
    stp     x10, xzr, [sp, #64]

    // p_vaddr=0, p_paddr=0
    stp     xzr, xzr, [sp, #80]

    // p_filesz + p_memsz (compressed values)
    stp     x12, x13, [sp, #96]

    // p_align = 0x10000
    mov     x9, #0x10000
    str     x9, [sp, #112]

    // ── open output file ──────────────────────────────────────────────────
    mov     x0, #AT_FDCWD
    ldr     x1, [x28, #ST_OUTPUT_NAME]
    mov     x2, #O_WRONLY_CREAT_TRUNC
    mov     w3, #493                  // 0755 octal
    mov     x8, #SYS_openat
    svc     #0
    tbnz    x0, #63, err_create
    mov     x19, x0                   // fd

    // write ELF header + program header (120 bytes)
    // x8 is set to SYS_write once here and reused by every svc_x19 call up to
    // the fchmod below. The results of the writes are not inspected.
    mov     x8, #SYS_write
    mov     x1, sp
    mov     x2, #CODE_START
    bl      svc_x19

    // header buffer no longer needed — reuse sp[0..7] for stub data (32-bit)
    // sp[0..3] = ceil_page(p_filesz), sp[4..7] = rodata size
    stp     w11, w21, [sp]

    // write decompressor stub code (from .text, excludes data block)
    adr     x1, _decomp_stub_start
    mov     x2, #STUB_DATA_DECOMP_DEST
    bl      svc_x19

    // write patched stub data block (from stack)
    mov     x1, sp
    mov     x2, #(STUB_SIZE - STUB_DATA_DECOMP_DEST)
    bl      svc_x19

    // write full_dict + half_dict (adjacent in memory)
    adr     x1, full_dict
    mov     x2, #(FULL_DICT_SIZE + HALF_DICT_SIZE)
    bl      svc_x19

    // write compressed stream
    // (read from the input buffer area, x20 bytes as returned by compress_text)
    add     x1, x28, #INPUT_BUF_OFF
    mov     x2, x20
    bl      svc_x19

    // write .rodata section
    add     x1, x27, x29               // rodata_buf = text_buf + 1 MB
    mov     x2, x21                    // rodata_size (saved in x21)
    bl      svc_x19

    // fchmod to make executable (returns 0 on success = our exit code)
    mov     x1, #493
    mov     x8, #SYS_fchmod
    bl      svc_x19
    b       exit_common

// ──────────────────────────────────────────────────────────────────────────
//  Error exits
// ──────────────────────────────────────────────────────────────────────────
// Each err_* entry loads a message address into x1 and joins die_msg.
// err_create falls straight through into die_msg.
err_usage:
    adr     x1, msg_usage
    b       die_msg
err_open:
    adr     x1, msg_open
    b       die_msg
err_create:
    adr     x1, msg_create
// die_msg — x1 = NUL-terminated message; writes it to stderr and exits with 1
die_msg:
    bl      strlen_x1
    bl      write2
    mov     x0, #1
// exit_common — x0 = exit status; does not return
exit_common:
    mov     x8, #SYS_exit
    svc     #0

// svc_x19 — issue a syscall with x19 (the output fd) as the first argument
//  in:  x8 = syscall number, x1.. = remaining arguments, x19 = fd
//  out: x0 = kernel return value (not checked here)
svc_x19:
    mov     x0, x19
    svc     #0
    ret

// write2 — write a buffer to stderr
//  in:  x1 = buffer, x2 = length
//  out: x0 = kernel return value (not checked by the callers above)
//  sets x8 = SYS_write
write2:
    mov     x0, #STDERR
    mov     x8, #SYS_write
    svc     #0
    ret

// strlen_x1 — compute length of null-terminated string in x1 → x2
//  x1 is left untouched so the (x1, x2) pair can be handed straight to write2.
//  clobbers: w10
strlen_x1:
    mov     x2, #-1                 // start at -1: the loop increments before testing
1:  add     x2, x2, #1
    ldrb    w10, [x1, x2]
    cbnz    w10, 1b
    ret

// ══════════════════════════════════════════════════════════════════════════
//  Utility functions (spec §8.5)
//
//  Calling convention: args in x0-x7, return in x0 (x1 for pairs).
//  Leaf functions — no stack frame needed.
// ══════════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────────
//  skip_ws — advance pointer past whitespace
//  x0 = pointer
//  returns x0 = first non-whitespace position
//          w9 = the byte at that position (0 at end of line)
//
//  "Whitespace" here is every non-NUL byte <= ' ' (0x20), so tabs, CR and
//  other control characters are skipped as well as spaces. A NUL stops the
//  scan. Callers rely on w9 being left loaded with the current character.
//
//  skip1_ws is a second entry point that first steps over one character
//  (whatever it is) and then skips whitespace.
// ──────────────────────────────────────────────────────────────────────────
skip1_ws:
    add     x0, x0, #1
skip_ws:
1:  ldrb    w9, [x0]
    cbz     w9, 2f
    cmp     w9, #' '
    csinc   x0, x0, x0, hi          // byte > ' ': keep x0; otherwise x0 += 1
    b.ls    1b                      // byte <= ' ': look at the next one
2:  ret

// ── skip_ws thunks ────────────────────────────────────────────────────────
// Each ws_<reg> entry copies a cursor register into x0 and tail-calls
// skip_ws, so it returns x0 = first non-whitespace position and w9 = the
// byte there. The two composite entries below make a nested call and
// therefore park the return address in x16 (clobbered).

// ws_x2_skip1 — skip whitespace starting at x2, step over the one character
// found there, then skip whitespace again (via skip1_ws).
ws_x2_skip1:
    mov     x16, x30
    bl      ws_x2
    mov     x30, x16
    b       skip1_ws

ws_x1:
    mov     x0, x1
    b       skip_ws

ws_x19:
    mov     x0, x19
    b       skip_ws

ws_x2:
    mov     x0, x2
    b       skip_ws

ws_x21:
    mov     x0, x21
    b       skip_ws

// ws_x21_parse_reg — skip whitespace starting at x21, then tail-call
// parse_register with x0/w9 as left by skip_ws.
ws_x21_parse_reg:
    mov     x16, x30
    bl      ws_x21
    mov     x30, x16
    b       parse_register

// ──────────────────────────────────────────────────────────────────────────
//  decode_escape — translate the character that follows a backslash
//  in:  w9 = escape character
//  out: w9 = 10 for 'n', 9 for 't', 0 for '0'; every other character comes
//       back unchanged (so \\ \' \" stand for the character itself)
//  clobbers: w10 (left holding 9), flags (left from the compare against 't')
// ──────────────────────────────────────────────────────────────────────────
decode_escape:
    mov     w10, #10                 // newline
    cmp     w9, #'n'
    csel    w9, w10, w9, eq
    cmp     w9, #'0'
    csel    w9, wzr, w9, eq          // NUL
    mov     w10, #9                  // tab
    cmp     w9, #'t'
    csel    w9, w10, w9, eq
    ret

// ──────────────────────────────────────────────────────────────────────────
//  parse_int — parse decimal, hex, or character literal
//  x0 = pointer (at first character of the number)
//  returns x0 = value, x1 = pointer past the parsed number
//
//  formats: 123  -42  0x1F  0xFF  'A'  '\n'
//
//  Notes:
//   - a leading '-' negates decimal and hex values; it does not apply to
//     character literals (those return before the sign is looked at)
//   - the "0x" prefix is matched case-insensitively, as are hex digits
//   - if no digit follows, the value is 0 and x1 points at the offending char
//   - values accumulate in 64 bits with no overflow check
//   - for character literals the closing quote is stepped over, not verified
//  clobbers: x9-x12, and x16 on the escape path (holds the return address
//  across the nested call to decode_escape)
// ──────────────────────────────────────────────────────────────────────────
parse_int:
    ldrb    w9, [x0]

    // character literal?
    cmp     w9, #'\''
    b.eq    parse_int_char

    // negative?
    cmp     w9, #'-'
    csinc   x0, x0, x0, ne          // advance past '-' if negative
    cset    x11, eq                  // sign flag: 1 if '-', else 0
    ldrb    w9, [x0]                 // reload current char

    // hex prefix?
    cmp     w9, #'0'
    b.ne    parse_int_dec
    ldrb    w10, [x0, #1]
    orr     w10, w10, #0x20          // fold 'X' to 'x'
    cmp     w10, #'x'
    b.eq    parse_int_hex

parse_int_dec:
    mov     x12, #0                  // accumulator
2:  ldrb    w9, [x0]
    sub     w10, w9, #'0'
    cmp     w10, #9                  // unsigned compare: also rejects chars below '0'
    b.hi    parse_int_done
    add     x12, x12, x12, lsl #2   // x12 * 5
    add     x12, x10, x12, lsl #1   // digit + x12*10
    add     x0, x0, #1
    b       2b

parse_int_hex:
    add     x0, x0, #2              // skip "0x"
    mov     x12, #0
3:  ldrb    w9, [x0]
    sub     w10, w9, #'0'
    cmp     w10, #9
    b.ls    4f                      // '0'..'9': w10 already holds the digit value
    orr     w10, w9, #0x20          // fold uppercase to lowercase
    sub     w10, w10, #'a'
    cmp     w10, #5
    b.hi    parse_int_done          // not a..f: end of number
    add     w10, w10, #10
4:  add     x12, x10, x12, lsl #4   // acc = acc*16 + digit
    add     x0, x0, #1
    b       3b

parse_int_done:
    cbz     x11, parse_int_ret
    neg     x12, x12
parse_int_ret:
    mov     x1, x0
    mov     x0, x12
    ret

parse_int_char:
    add     x0, x0, #1              // skip opening quote
    ldrb    w9, [x0], #1            // load char, advance past it
    cmp     w9, #'\\'
    b.ne    1f
    // escape: x0 is past backslash already
    ldrb    w9, [x0], #1            // load escape char, advance past it
    mov     x16, x30                // bl below overwrites x30
    bl      decode_escape
    mov     x30, x16
1:  mov     x12, x9
    add     x0, x0, #1              // skip closing quote
    b       parse_int_ret

// ──────────────────────────────────────────────────────────────────────────
//  parse_ident — parse an identifier [a-zA-Z_][a-zA-Z0-9_]*
//  x0 = pointer
//  returns x0 = start of ident, x1 = length, x2 = pointer past ident
//  if no valid identifier, x1 = 0
//
//  The loop is entered at pi_check_first so that the digit test is skipped
//  for the first character only. Every accepted character branches back to
//  the load at 1:, and the first rejected one falls out of the bottom.
//  clobbers: x9-x11
// ──────────────────────────────────────────────────────────────────────────
parse_ident:
    mov     x9, x0                   // start
    ldrb    w10, [x0], #1
    b       pi_check_first
1:  ldrb    w10, [x0], #1
    // loop: accept digits (not valid for first char)
    sub     w11, w10, #'0'
    cmp     w11, #9
    b.ls    1b
pi_check_first:
    // accept underscore and letters
    cmp     w10, #'_'
    b.eq    1b
    orr     w11, w10, #0x20         // fold to lowercase for a single range test
    sub     w11, w11, #'a'
    cmp     w11, #25
    b.ls    1b
    // end of identifier (or not an identifier if x0 == x9)
    sub     x2, x0, #1             // end pointer (x0 is one past due to post-index)
    sub     x1, x2, x9             // length (0 if no ident)
    mov     x0, x9                  // start
    ret

// ──────────────────────────────────────────────────────────────────────────
//  parse_register — parse register name
//  x0 = pointer, w9 = the byte at x0 (as left by skip_ws)
//  returns x0 = reg number (0-31), x1 = is_64bit, x2 = pointer past
//  on error: x0 = -1
//
//  Accepted spellings: sp (31, 64-bit), xzr / wzr (31), x0-x30, w0-w30.
//  "sp" is only taken when the following character cannot continue an
//  identifier. That check is not made after xzr/wzr or after a register
//  number, so trailing identifier characters are left for the caller.
//  Failure paths branch to parse_reg_fail, which is defined elsewhere in
//  this file.
//  clobbers: x10-x13
// ──────────────────────────────────────────────────────────────────────────
parse_register:
    // w9 pre-loaded by caller (skip_ws sets w9 = first non-ws char)

    // sp?
    ldrh    w10, [x0]
    movz    w11, #0x7073             // 'sp' in little-endian
    cmp     w10, w11
    b.ne    1f
    // make sure it's not a longer ident (e.g. "spaghetti")
    ldrb    w10, [x0, #2]
    orr     w11, w10, #0x20
    sub     w11, w11, #'a'
    cmp     w11, #25
    b.ls    1f                      // letter follows
    sub     w11, w10, #'0'
    cmp     w11, #9
    b.ls    1f                      // digit follows
    cmp     w10, #'_'
    b.eq    1f                      // underscore follows
    add     x2, x0, #2              // end pointer (before clobbering x0)
    mov     x0, #31
    mov     x1, #1
    ret

1:  cmp     w9, #'x'
    cset    x1, eq                   // x1=1 if 'x' (64-bit), else 0
    b.eq    parse_reg_xw
    cmp     w9, #'w'
    b.ne    parse_reg_fail
parse_reg_xw:
    // check for xzr/wzr — load 4 bytes, extract bytes 1-2 as 16-bit LE
    ldr     w10, [x0]
    ubfx    w10, w10, #8, #16
    movz    w11, #0x727A             // 'z' | ('r' << 8) in little-endian
    cmp     w10, w11
    b.ne    parse_reg_num
    add     x2, x0, #3
    mov     x0, #31
    ret

parse_reg_num:
    // x1 = is_64bit from cset above; not modified by this code
    ldrb    w12, [x0, #1]           // first digit
    sub     w12, w12, #'0'
    cmp     w12, #9
    b.hi    parse_reg_fail
    ldrb    w10, [x0, #2]
    sub     w11, w10, #'0'
    cmp     w11, #9
    add     x2, x0, #2              // end pointer (single digit); flags unaffected
    b.hi    1f                       // single digit
    add     w13, w12, w12, lsl #2   // first * 5
    add     w12, w11, w13, lsl #1   // second + first * 10
    add     x2, x2, #1
1:  cmp     w12, #30                // x31/w31 are not valid register names
    b.hi    parse_reg_fail
    mov     x0, x12
    ret


// ──────────────────────────────────────────────────────────────────────────
//  sym_lookup — find a symbol in the hash table
//  x0 = name pointer, x1 = name length
//  returns x0 = pointer to entry, x1 = 1 if found (0 if empty slot)
//          x15 = name pointer, x16 = name length (copies of the inputs;
//          sym_define relies on these to fill in a new entry)
//
//  Locates sym_table as x27 + 2*x29 (text_buf + 2 MB); x28 is not used.
//  Open addressing with linear probing: when the name is absent, the entry
//  returned is the first empty slot on the probe path, ready to be filled.
//  There is no probe limit, so a lookup for an absent name in a completely
//  full table never terminates. Visible callers only pass non-empty names.
//  clobbers: x9-x17 (x2-x8 are left alone)
// ──────────────────────────────────────────────────────────────────────────
sym_lookup:
    // leaf function — no frame needed, uses scratch registers only
    mov     x15, x0                  // name ptr
    mov     x16, x1                  // name len

    // hash the name (inlined djb2)
    // bytes are consumed from the last character back to the first
    mov     x9, #5381
    cbz     x1, 2f
1:  sub     x1, x1, #1
    ldrb    w12, [x0, x1]
    add     x9, x9, x9, lsl #5      // hash * 33
    add     x9, x9, x12
    cbnz    x1, 1b
    // slot = hash & (SYM_TBL_SLOTS - 1)
2:  and     x17, x9, #(SYM_TBL_SLOTS - 1)

    add     x14, x27, x29, lsl #1      // sym_table = text_buf + 2*1MB

sym_lookup_probe:
    // entry = &sym_table[slot * 32]
    add     x13, x14, x17, lsl #5  // entry pointer in x13

    // check if slot is empty (name_ptr == NULL)
    ldr     x12, [x13, #SYM_NAME_PTR]
    cbz     x12, sym_lookup_empty

    // compare name_len
    ldr     w11, [x13, #SYM_NAME_LEN]
    cmp     w11, w16
    b.ne    sym_lookup_next

    // compare name bytes (x12 = direct pointer into input buffer)
    // compared back to front; lengths are already known to be equal
    mov     x0, x16                  // counter
1:  sub     x0, x0, #1
    ldrb    w9, [x12, x0]
    ldrb    w10, [x15, x0]
    cmp     w9, w10
    b.ne    sym_lookup_next
    cbnz    x0, 1b
    mov     x1, #1
    b       sym_lookup_ret

sym_lookup_next:
    add     x17, x17, #1
    and     x17, x17, #(SYM_TBL_SLOTS - 1)   // wrap around the table
    b       sym_lookup_probe

sym_lookup_empty:
    mov     x1, #0
sym_lookup_ret:
    mov     x0, x13
    ret

// ──────────────────────────────────────────────────────────────────────────
//  sym_define — insert or update a symbol
//  x0 = name pointer, x1 = name length, x2 = value, x3 = flags
//
//  New symbols record the name pointer and length; the name itself stays in
//  the input buffer and is not copied. x3 is always OR'd into the entry's
//  flags, so existing bits are kept.
//
//  The value is written only when x3 carries SYMF_DEFINED. Flag-only callers
//  such as `.global name` (x2 = 0, x3 = SYMF_GLOBAL) must not disturb a value
//  that a label or .equ has already stored: directives are processed in both
//  passes, so an unconditional store would reset every global symbol to 0
//  during pass 2, before the instructions that reference it are encoded.
//
//  clobbers: x0, x1, x9-x17 (via sym_lookup); x3 is preserved
// ──────────────────────────────────────────────────────────────────────────
sym_define:
    stp     x19, x30, [sp, #-16]!
    mov     x19, x2                  // value (callee-saved)
    // x3 = flags (preserved across sym_lookup — leaf, doesn't touch x3)
    bl      sym_lookup
    // x0 = entry, x15 = name ptr, x16 = name len (set by sym_lookup)

    cbnz    x1, sym_define_update

    // ── new entry: store name pointer directly ────────────────────────────
    str     x15, [x0, #SYM_NAME_PTR]
    str     w16, [x0, #SYM_NAME_LEN]

sym_define_update:
    // bit 0 of the flags is SYMF_DEFINED: without it this is a flag-only
    // update and the stored value is left as it is
    tbz     x3, #0, sym_define_flags
    str     x19, [x0, #SYM_VALUE]
sym_define_flags:
    // OR in flags (don't clobber existing bits)
    ldr     x9, [x0, #SYM_FLAGS]
    orr     x9, x9, x3
    str     x9, [x0, #SYM_FLAGS]

    ldp     x19, x30, [sp], #16
    ret

// ──────────────────────────────────────────────────────────────────────────
//  error_at — print "filename:line: msg\n" to stderr and exit(1)
//  x0 = message pointer (null-terminated)
//
//  uses state block for filename and line number
//
//  Does not return: it ends by branching to die_msg. Because of that it
//  takes x19 without saving it and builds the ":<line>: " text directly in
//  the stack memory just above sp (digits grow downward from sp+43, the
//  trailing ": " sits at sp+44), overwriting whatever the caller kept there.
// ──────────────────────────────────────────────────────────────────────────
error_at:
    mov     x19, x0                  // msg ptr

    // write filename
    ldr     x1, [x28, #ST_INPUT_NAME]
    bl      strlen_x1
    bl      write2

    // build ":[linenum]: " in frame buffer and write it
    ldr     x9, [x28, #ST_LINE_NUM]
    add     x11, sp, #44             // x11 = where the trailing ": " goes
    mov     x10, x11                 // x10 = write cursor, moves downward
    mov     x12, #10
3:  udiv    x13, x9, x12
    msub    x14, x13, x12, x9        // x14 = x9 % 10
    add     w14, w14, #'0'
    strb    w14, [x10, #-1]!
    mov     x9, x13
    cbnz    x9, 3b
    mov     w14, #':'
    strb    w14, [x10, #-1]!         // leading ':'
    movz    w14, #0x203A             // ':' then ' ' in little-endian
    strh    w14, [x11]
    mov     x1, x10
    sub     x2, x11, x10
    add     x2, x2, #2               // include the two trailing bytes
    bl      write2

    // write message (includes \n) and exit
    mov     x1, x19
    b       die_msg

// ══════════════════════════════════════════════════════════════════════════
//  Pass driver and line processing
// ══════════════════════════════════════════════════════════════════════════


// ──────────────────────────────────────────────────────────────────────────
//  run_pass — iterate over all source lines
//  x0 = pass number (1 or 2)
//
//  Resets the three section positions, the current section (to SEC_TEXT = 0)
//  and the numeric-label cursors, then feeds the input to process_line one
//  line at a time. Each line is NUL-terminated in place for the duration of
//  the call and the overwritten byte is put back afterwards.
//
//  There is no epilogue here: the loop exits through pl_done, the epilogue
//  of process_line, which matches because both routines build the same
//  64-byte frame (x30/x19, x20/x21, x22/x23).
//
//  x19 = start of current line   x20 = end of input
//  x21 = line number             x22 = end of current line
//  w23 = byte replaced by the temporary NUL
// ──────────────────────────────────────────────────────────────────────────
run_pass:
    stp     x30, x19, [sp, #-64]!
    stp     x20, x21, [sp, #16]
    stp     x22, x23, [sp, #32]

    str     x0, [x28, #ST_PASS]

    // reset section positions and current section
    stp     xzr, xzr, [x28, #ST_TEXT_POS]   // text_pos, rodata_pos
    stp     xzr, xzr, [x28, #ST_BSS_POS]    // bss_pos, cur_sec

    // reset line number (x21 = line counter, synced to state before process_line)
    mov     x21, #1

    // reset numeric label cursors (inline zero fill)
    add     x0, x28, #NUMLAB_CURS_OFF
    mov     x2, #(NUMLAB_DIGITS * 8)
1:  sub     x2, x2, #1
    strb    wzr, [x0, x2]
    cbnz    x2, 1b

    // set up input pointers
    add     x19, x28, #INPUT_BUF_OFF   // input_buf
    ldr     x9, [x28, #ST_INPUT_LEN]
    add     x20, x19, x9              // x20 = end of input

run_pass_loop:
    cmp     x19, x20
    b.ge    pl_done                 // all input consumed: return via shared epilogue

    // find end of line (newline or end of buffer)
    mov     x22, x19
1:  cmp     x22, x20
    b.ge    2f
    ldrb    w10, [x22], #1
    cmp     w10, #'\n'
    b.ne    1b
    sub     x22, x22, #1           // back up to newline
2:

    // temporarily null-terminate
    // (for a final line without a newline, x22 == x20 and the byte just past
    // the input is the one saved and restored)
    ldrb    w23, [x22]
    strb    wzr, [x22]

    // process the line
    str     x21, [x28, #ST_LINE_NUM]
    mov     x0, x19
    bl      process_line

    // restore original byte
    strb    w23, [x22]

    // advance past newline
    add     x19, x22, #1
    add     x21, x21, #1

    b       run_pass_loop

// ──────────────────────────────────────────────────────────────────────────
//  process_line — handle one null-terminated source line
//  x0 = line start (null-terminated)
//
//  Grammar handled here: any number of labels ("name:" or "N:"), followed by
//  at most one directive or instruction. Blank lines and lines whose content
//  starts with "//" are ignored.
//
//  Directives are recognised by the first letter of their name only:
//    b → .bss                    s → .section (name length 7) or .skip
//    a → .align if name[4]=='n', otherwise .ascii/.asciz
//    e → .equ                    g → .global
//    w → .word                   t → .text
//  Anything else is silently ignored. `.section` always selects .rodata;
//  its operand is not examined.
//
//  x19 = parse cursor; x20/x21 = directive or .equ name pointer/length;
//  x22 = .word value. All exits go through pl_done, which is also used as
//  the epilogue of run_pass and as the return point of the directive
//  handlers further down.
//  handle_numlab, parse_expr0, parse_expr0_x19, encode_instruction and
//  emit_inst_done are defined elsewhere in this file.
// ──────────────────────────────────────────────────────────────────────────
process_line:
    stp     x30, x19, [sp, #-64]!
    stp     x20, x21, [sp, #16]
    stp     x22, x23, [sp, #32]

    bl      skip_ws
    mov     x19, x0

pl_check_content:
    // empty line? (w9 pre-loaded by skip_ws / ws_x19)
    cbz     w9, pl_done

    // comment?  ( // )
    ldrh    w10, [x19]
    movz    w11, #0x2F2F            // "//" in little-endian
    cmp     w10, w11
    b.eq    pl_done

    // ── check for numeric label (digit followed by ':') ───────────────────
    // w10 = ldrh from [x19]; for "N:", w10 = 0x3A30..0x3A39
    movz    w11, #0x3A30             // ':' << 8 | '0'
    sub     w10, w10, w11
    cmp     w10, #9
    b.hi    pl_not_numlab

    // numeric label — record in pass 1
    mov     x0, x10                  // digit (0-9)
    bl      handle_numlab
    add     x19, x19, #2            // step over "N:"
    b       pl_after_label

pl_not_numlab:
    // ── check for named label or mnemonic ─────────────────────────────────
    // w9 is still the first character of the content (nothing above changed it)
    cmp     w9, #'.'
    b.eq    pl_directive

    mov     x0, x19
    bl      parse_ident
    cbz     x1, pl_done              // no identifier → skip

    // is it a label (followed by ':')?
    ldrb    w9, [x2]
    cmp     w9, #':'
    b.ne    pl_instruction

    // ── named label ───────────────────────────────────────────────────────
    add     x19, x2, #1             // past ':'

    // only define in pass 1 (pass 2 uses rebased values)
    ldr     x9, [x28, #ST_PASS]
    tbnz    x9, #1, pl_after_label  // bit 1 is set for pass 2, clear for pass 1

    // value = current section offset
    ldr     x11, [x28, #ST_CUR_SEC]
    ldr     x2, [x28, x11]          // section ID doubles as the offset of its *_POS slot
    // flags = DEFINED | (cur_section << SEC_SHIFT); x11 = sec*8
    lsl     x3, x11, #1
    orr     x3, x3, #SYMF_DEFINED
    bl      sym_define              // x0/x1 still hold the name from parse_ident

pl_after_label:
    // something may follow the label on the same line: go round again
    bl      ws_x19
    mov     x19, x0
    b       pl_check_content

    // ── directive (starts with '.') ───────────────────────────────────────
pl_directive:
    add     x0, x19, #1            // skip '.'
    bl      parse_ident
    cbz     x1, pl_done
    // x0 = name start, x1 = name length, x2 = end pointer
    mov     x20, x0                  // directive name
    mov     x21, x1                  // directive length
    mov     x19, x2                  // position after directive name

    // dispatch on directive name — check first char then length
    ldrb    w9, [x20]

    cmp     w9, #'b'
    mov     x10, #SEC_BSS            // doesn't affect flags
    b.eq    dir_sec_set

    cmp     w9, #'s'
    b.ne    5f
    cmp     x21, #7                  // 7 = length of "section"
    b.ne    dir_skip
    mov     x10, #SEC_RODATA
    b       dir_sec_set

5:  cmp     w9, #'a'
    b.ne    6f
    ldrb    w10, [x20, #4]           // "align"[4]='n', "ascii"[4]='i', "asciz"[4]='z'
    cmp     w10, #'n'
    b.eq    dir_align
    b       dir_str_common

6:  cmp     w9, #'e'
    b.ne    7f
    // inline dir_equ:
    bl      ws_x19
    bl      parse_ident
    cbz     x1, pl_done
    mov     x20, x0                  // symbol name
    mov     x21, x1                  // symbol name length
    bl      ws_x2_skip1              // step over the separator after the name
    bl      parse_expr0
    mov     x2, x0
    mov     x3, #(SYMF_DEFINED | SYMF_EQU)
    mov     x0, x20
    mov     x1, x21
    bl      sym_define
    b       pl_done

7:  cmp     w9, #'g'
    b.ne    dir_word
    // inline dir_global:
    bl      ws_x19
    bl      parse_ident
    cbz     x1, pl_done
    mov     x2, #0
    mov     x3, #SYMF_GLOBAL
    bl      sym_define
    b       pl_done

    // ── instruction ───────────────────────────────────────────────────────
pl_instruction:
    // x0 = mnemonic start, x1 = mnemonic length, x2 = position after
    ldr     x9, [x28, #ST_PASS]
    tbnz    x9, #1, encode_instruction
    // pass 1: advance text pos by 4 (write garbage to buf — overwritten in pass 2)
    b       emit_inst_done

dir_word:
    cmp     w9, #'w'
    b.ne    dir_text
    // .word <expr> — emit 4-byte little-endian value
    bl      parse_expr0_x19
    mov     x22, x0                     // value
    // always write to buffer (pass 1 writes are harmless, overwritten in pass 2)
    ldr     x11, [x28, #ST_CUR_SEC]
    ldr     x10, [x28, x11]             // current pos
    add     x0, x27, x11, lsl #17       // text_buf + sec * 1MB
    str     w22, [x0, x10]              // store 4 bytes
    mov     x0, #4
    b       advance_sec_pos

dir_text:
    cmp     w9, #'t'
    mov     x10, #SEC_TEXT           // doesn't affect flags
    b.ne    pl_done                  // unrecognised directive: ignore the line
dir_sec_set:
    str     x10, [x28, #ST_CUR_SEC]

pl_done:
    ldp     x22, x23, [sp, #32]
    ldp     x20, x21, [sp, #16]
    ldp     x30, x19, [sp], #64
    ret

// ══════════════════════════════════════════════════════════════════════════
//  Directive handlers
//
//  On entry: x19 = parse position after directive name
//            x20, x21 available (saved by process_line's frame)
//  Must jump to pl_done when finished.
// ══════════════════════════════════════════════════════════════════════════

// .align N — align to 2^N boundary
// in:  x19 = parse position after the directive name
// Rounds the current section's position counter up to the next multiple of
// 2^N. Only the counter is updated; no padding bytes are written here.
dir_align:
    bl      parse_expr0_x19
    // x0 = N (alignment power)
    mov     x10, x0

    ldr     x11, [x28, #ST_CUR_SEC]
    ldr     x0, [x28, x11]          // current position

    // aligned = (pos + mask) & ~mask where mask = (1<<N)-1
    mov     x9, #1
    lsl     x9, x9, x10             // 1 << N
    sub     x9, x9, #1              // mask
    add     x0, x0, x9              // pos + mask
    bic     x0, x0, x9              // & ~mask = aligned position

    str     x0, [x28, x11]
    b       pl_done

// .skip N — advance by N bytes
// in:  x19 = parse position after the directive name
// The parsed value stays in x0, which is advance_sec_pos's byte count.
dir_skip:
    bl      parse_expr0_x19
    b       advance_sec_pos

// .ascii/.asciz "string" — w10 still holds directive[4] ('i' or 'z')
// in:  x19 = parse position after the directive name
// The string is decoded straight into the current section's buffer at its
// current position; .asciz additionally appends a zero byte.
// x20 = destination, x21 = 1 for .asciz / 0 for .ascii.
dir_str_common:
    cmp     w10, #'z'
    cset    x21, eq                  // null_flag: 1 if asciz, 0 if ascii
    bl      ws_x19                   // x0 = pointer to '"'
    // always compute dest buffer (pass 1 writes are harmless, overwritten in pass 2)
    ldr     x11, [x28, #ST_CUR_SEC]  // x0 preserved
    add     x20, x27, x11, lsl #17   // text_buf + sec * 1MB
    ldr     x10, [x28, x11]
    add     x20, x20, x10
    mov     x1, x20
    bl      parse_string             // x0 = count, x1 = ptr past
    cbz     x21, 2f                  // not asciz: skip null
    strb    wzr, [x20, x0]           // write null terminator
2:  add     x0, x0, x21             // count + null_flag
// advance_sec_pos — add x0 bytes to the current section's position counter,
// then leave through pl_done. Shared tail of .word, .skip and .ascii/.asciz.
advance_sec_pos:
    ldr     x11, [x28, #ST_CUR_SEC]
    ldr     x10, [x28, x11]
    add     x10, x10, x0
    str     x10, [x28, x11]
    b       pl_done

// ──────────────────────────────────────────────────────────────────────────
//  handle_numlab — a numeric label "N:" was seen on the current line
//  in:  x0 = label digit (0-9)
//  Pass 1 appends the current text offset to that digit's definition list.
//  Pass 2 only steps the digit's cursor past the definition.
//  Leaf routine: no stack frame; scratch registers x10-x14.
// ──────────────────────────────────────────────────────────────────────────
handle_numlab:
    ldr     x10, [x28, #ST_PASS]
    tbnz    x10, #1, handle_numlab_p2

    // pass 1 — locate both tables up front
    adrp    x13, numlab_defs
    add     x13, x13, :lo12:numlab_defs
    add     x11, x28, #NUMLAB_CNTS_OFF

    // slot index = digit * 128 + numlab_cnts[digit]
    ldr     x12, [x11, x0, lsl #3]
    lsl     x14, x0, #7
    add     x14, x14, x12

    // numeric labels are always in .text, so the text offset is recorded
    ldr     x10, [x28, #ST_TEXT_POS]
    str     x10, [x13, x14, lsl #3]

    // numlab_cnts[digit] += 1
    add     x12, x12, #1
    str     x12, [x11, x0, lsl #3]
    ret

handle_numlab_p2:
    // pass 2 — numlab_curs[digit] += 1
    add     x11, x28, #NUMLAB_CURS_OFF
    ldr     x10, [x11, x0, lsl #3]
    add     x10, x10, #1
    str     x10, [x11, x0, lsl #3]
    ret

// ──────────────────────────────────────────────────────────────────────────
//  rebase_symbols — after pass 1, add section bases to label values
//
//  Walks all SYM_TBL_SLOTS symbol slots, then every recorded numeric-label
//  definition. Symbols flagged SYMF_EQU keep their value; every other
//  occupied slot gets the base of the section stored in its flags added.
//  Numeric labels always get the text base.
//  Leaf routine; scratch x0, x9-x17.
//  NOTE(review): x29 is presumably the 1MB region size set up elsewhere —
//  confirm against the initialisation code.
// ──────────────────────────────────────────────────────────────────────────
rebase_symbols:
    add     x9, x27, x29, lsl #1       // sym_table = text_buf + 2*1MB
    mov     x10, #SYM_TBL_SLOTS

rebase_loop:
    ldr     x13, [x9, #SYM_NAME_PTR]
    cbz     x13, rebase_next         // empty slot

    ldp     x14, x17, [x9, #SYM_FLAGS]  // flags, value
    tbnz    x14, #2, rebase_next    // bit 2 = SYMF_EQU, skip

    // extract section from flags bits 5:4
    ubfx    x15, x14, #SYMF_SEC_SHIFT, #2

    // base[section] = state[ST_TEXT_BASE + section*8]
    add     x16, x28, x15, lsl #3
    ldr     x16, [x16, #ST_TEXT_BASE]
    add     x17, x17, x16
    str     x17, [x9, #SYM_VALUE]

rebase_next:
    add     x9, x9, #SYM_ENT_SIZE
    sub     x10, x10, #1
    cbnz    x10, rebase_loop
    // x9 now points to numlab_defs (sym_table + SYM_TBL_BYTES)
    add     x10, x28, #NUMLAB_CNTS_OFF

    // numeric labels are always in the text section for now
    ldr     x16, [x28, #ST_TEXT_BASE]

    mov     x11, #0                  // digit
rebase_numlab_digit:
    ldr     x12, [x10, x11, lsl #3] // count for this digit
    lsl     x14, x11, #7            // base index = digit * 128
    cbz     x12, 5f                  // skip if count = 0
    // walk this digit's definitions from the last one down to index 0
4:  sub     x12, x12, #1
    add     x17, x14, x12
    ldr     x0, [x9, x17, lsl #3]
    add     x0, x0, x16
    str     x0, [x9, x17, lsl #3]
    cbnz    x12, 4b
5:  add     x11, x11, #1
    cmp     x11, #NUMLAB_DIGITS
    b.lt    rebase_numlab_digit
    ret

// ══════════════════════════════════════════════════════════════════════════
//  Expression evaluator — recursive descent
//
//  Each function: x0 = pointer → x0 = value, x1 = pointer past expr
//
//  Precedence (low to high): |  &  +/-  *  <</>>  unary(~ -)  atom
// ══════════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────────
//  parse_expr — Pratt binary expression parser
//  x0 = pointer, x1 = min_prec (0 for top-level callers)
//  returns x0 = value, x1 = pointer past expr
//
//  Precedence: | (1) < & (2) < +/- (3) < * (4) < <<,>> (5)
//
//  Entry points:
//    parse_expr0_x19 — parse from x19 with min_prec 0
//    parse_expr0     — parse from x0  with min_prec 0
//    parse_expr      — parse from x0  with min_prec x1
//
//  Frame: 64 bytes holding x30/x19, x20/x21, x22/x23 — the layout pl_done
//  restores, so pe_done leaves through pl_done.
//  Live registers: x19 = lhs value, x20 = position, x22 = min_prec,
//                  x21 = (prec<<4)|opcode, x23 = opcode (kept across the
//                  recursive call).
//  An operator binds only when its precedence is >= min_prec. Its right-hand
//  side is parsed with min_prec = prec+1, so operators of equal precedence
//  associate to the left. When an operator is rejected, the returned pointer
//  is the operator's FIRST character so the caller can look at it again.
// ──────────────────────────────────────────────────────────────────────────
parse_expr0_x19:
    mov     x0, x19
parse_expr0:
    mov     x1, #0
parse_expr:
    stp     x30, x19, [sp, #-64]!
    stp     x20, x21, [sp, #16]
    stp     x22, x23, [sp, #32]
    mov     x22, x1                  // min_prec
    bl      parse_expr_unary
    mov     x19, x0                  // lhs value
    mov     x20, x1                  // current position

// Operator dispatch: x21 encodes (prec<<4)|opcode
// | → 0x10  & → 0x21  + → 0x32  - → 0x33  * → 0x44  << → 0x55  >> → 0x56
pe_loop:
    mov     x0, x20
    bl      skip_ws
    mov     x20, x0
    // w9 is read below without a reload, so skip_ws is expected to leave
    // the current character there.
    // op_table: (operator char, code + 32) byte pairs, ended by a zero byte.
    adr     x10, op_table
1:  ldrb    w11, [x10]
    cbz     w11, 5f                  // end of table: try the shift operators
    cmp     w9, w11
    ldrb    w21, [x10, #1]           // loads/adds below keep the cmp flags
    sub     x21, x21, #32
    add     x10, x10, #2
    b.ne    1b
    b       pe_check_prec
5:  cmp     w9, #'<'
    b.eq    pe_shift
    cmp     w9, #'>'
    b.ne    pe_done
pe_shift:
    ldrb    w10, [x20, #1]
    cmp     w10, w9
    b.ne    pe_done                  // lone '<' or '>': not an operator
    // x20 is deliberately NOT advanced yet: if the precedence check below
    // rejects the operator, pe_done must hand back a pointer to its first
    // character, not to the middle of "<<" / ">>".
    lsr     w21, w9, #1              // '<' (60) → 30, '>' (62) → 31
    add     x21, x21, #55            // → 0x55 (<<) or 0x56 (>>)

pe_check_prec:
    lsr     x9, x21, #4              // prec = x21 >> 4
    and     x23, x21, #0xF           // opcode = x21 & 0xF (callee-saved)
    cmp     x9, x22                  // op_prec vs min_prec
    b.lt    pe_done                  // op_prec < min_prec: not ours
    cmp     x9, #5                   // prec 5 = the two-character << / >>
    csinc   x20, x20, x20, ne        // shift operator: skip its first char
    add     x20, x20, #1            // skip (last) operator char
    mov     x0, x20
    add     x1, x9, #1              // recurse with prec+1
    bl      parse_expr
    mov     x20, x1                  // update position
    // jump table: one 8-byte (two-instruction) entry per opcode
    adr     x9, pe_ops
    add     x9, x9, x23, lsl #3
    br      x9
pe_ops:
    orr     x19, x19, x0            // opcode 0: |
    b       pe_loop
    and     x19, x19, x0            // opcode 1: &
    b       pe_loop
    add     x19, x19, x0            // opcode 2: +
    b       pe_loop
    sub     x19, x19, x0            // opcode 3: -
    b       pe_loop
    mul     x19, x19, x0            // opcode 4: *
    b       pe_loop
    lsl     x19, x19, x0            // opcode 5: <<
    b       pe_loop
    lsr     x19, x19, x0            // opcode 6: >>
    b       pe_loop

pe_done:
    mov     x0, x19
    mov     x1, x20
    b       pl_done                  // shared epilogue (same frame layout)

// ──────────────────────────────────────────────────────────────────────────
//  parse_expr_unary — handles '~', unary '-', then falls through to atom
//  x0 = pointer
//  returns x0 = value, x1 = pointer past the operand
//
//  Atoms: '(' expr ')', '.' (current location), numeric / character
//  literal (parse_int), or a symbol name.
//  Frame: 16 bytes holding x30 and x20. The exits pea_ret / pea_ret_x20 and
//  the err_undef label are also used by parse_label_ref, which pushes the
//  same frame.
//  w9 is read right after skip_ws / ws_x1 without a reload, so those helpers
//  are expected to leave the current character there.
// ──────────────────────────────────────────────────────────────────────────
parse_expr_unary:
    stp     x30, x20, [sp, #-16]!

    bl      skip_ws

    cmp     w9, #'~'
    b.eq    pe_unary_not
    cmp     w9, #'-'
    b.eq    pe_unary_neg

    // not unary, fall through to parse atom (skip_ws already done)

    // '(' — grouped expression
    cmp     w9, #'('
    b.eq    pe_atom_paren

    // '.' — current location counter
    cmp     w9, #'.'
    b.eq    pe_atom_dot

    // digit or '\'' — numeric literal ('-' was already taken as unary minus)
    sub     w10, w9, #'0'
    cmp     w10, #9
    b.ls    pe_atom_num
    cmp     w9, #'\''
    b.eq    pe_atom_num

    // identifier — symbol reference
    bl      parse_ident
    cbz     x1, pe_atom_err
    mov     x20, x2                  // end pointer (return this)

    // look up symbol
    bl      sym_lookup
    cbz     x1, pe_atom_undef

    // return value
    ldr     x0, [x0, #SYM_VALUE]
    b       pea_ret_x20

pe_atom_undef:
    // in pass 1, undefined symbols get 0 (forward ref in instruction)
    ldr     x9, [x28, #ST_PASS]
    tbz     x9, #1, 1f
    // pass 2: error
err_undef:
    adr     x0, msg_undef
    bl      error_at
    // NOTE(review): error_at presumably does not return — confirm.
1:  mov     x0, #0
    b       pea_ret_x20

pe_atom_paren:
    add     x0, x0, #1              // skip '('
    bl      parse_expr0
    mov     x20, x0                  // value
    bl      ws_x1
    cmp     w9, #')'
    b.ne    pe_atom_err
    add     x1, x0, #1              // pointer past ')'
    mov     x0, x20
    b       pea_ret

pe_atom_dot:
    mov     x20, x0                 // save pointer to '.'
    ldr     x11, [x28, #ST_CUR_SEC]
    ldr     x0, [x28, x11]          // section offset
    ldr     x10, [x28, #ST_PASS]
    tbz     x10, #1, 1f
    // pass 2: add section base (x11 = sec*8)
    add     x11, x28, x11
    ldr     x11, [x11, #ST_TEXT_BASE]
    add     x0, x0, x11
1:  add     x1, x20, #1             // pointer past '.'
    b       pea_ret

pe_atom_num:
    bl      parse_int
    b       pea_ret
// pea_ret_x20 — return with x1 = x20 (saved end pointer)
pea_ret_x20:
    mov     x1, x20
// pea_ret — pop the 16-byte frame and return x0 / x1 as they are
pea_ret:
    ldp     x30, x20, [sp], #16
    ret

pe_atom_err:
    adr     x0, msg_syntax
    bl      error_at
    // NOTE(review): nothing stops execution here from running into
    // pe_unary_not, so error_at presumably never returns — confirm.

pe_unary_not:
    add     x0, x0, #1              // skip '~'
    bl      parse_expr_unary         // recursive
    mvn     x0, x0
    b       pea_ret

pe_unary_neg:
    add     x0, x0, #1              // skip '-'
    bl      parse_expr_unary         // recursive
    sub     x0, xzr, x0             // neg
    b       pea_ret

// ══════════════════════════════════════════════════════════════════════════
//  String parsing
// ══════════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────────
//  parse_string — decode a quoted string into a buffer
//  x0 = pointer (at the opening '"')
//  x1 = destination buffer; every decoded byte is stored unconditionally,
//       so this must be a writable address (there is no count-only mode)
//  returns x0 = byte count, x1 = pointer past closing '"'
//
//  An unterminated string stops at the NUL byte; x1 then points one past
//  that NUL, because the load post-increments before the test.
//  No stack frame: the return address is parked in x16 across the call to
//  decode_escape. Scratch: w9, x14 (count), x15 (dest), x16.
// ──────────────────────────────────────────────────────────────────────────
parse_string:
    mov     x16, x30
    mov     x15, x1                  // dest
    add     x0, x0, #1              // skip opening '"'
    mov     x14, #0                  // byte count

ps_loop:
    ldrb    w9, [x0], #1            // load + advance
    cbz     w9, ps_done              // unterminated string
    cmp     w9, #'"'
    b.eq    ps_done                  // closing quote (x0 already past it)
    cmp     w9, #'\\'
    b.eq    ps_escape

    // plain character — x0 already advanced by post-increment
ps_store:
    strb    w9, [x15, x14]
    add     x14, x14, #1
    b       ps_loop

ps_escape:
    ldrb    w9, [x0], #1            // load escape char, advance (past backslash)
    // decode_escape presumably maps the escape letter in w9 to its byte
    // value in w9 (ps_store writes w9) — confirm against its definition.
    bl      decode_escape
    b       ps_store

ps_done:
    mov     x1, x0
    mov     x0, x14                  // byte count
    br      x16                      // return via the saved link register

// ══════════════════════════════════════════════════════════════════════════
//  Pass 2 infrastructure
// ══════════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────────
//  emit_inst_done — emit instruction word then restore encode_instruction frame
//  x0 = instruction word; reached via 'b' from within encode_instruction
//
//  Three entry points that fall through into one another:
//    emit_with_sf    — sf taken from w23
//    emit_with_sf24  — sf taken from w24
//    emit_inst_done  — w0 is already complete
//  The word is always stored at text_buf (x27) + ST_TEXT_POS, whatever the
//  current section is; ST_TEXT_POS then advances by 4 and control leaves
//  through pl_done.
// ──────────────────────────────────────────────────────────────────────────
// emit_with_sf — apply sf bit into bit 31 of w0, then emit
emit_with_sf:
    mov     w24, w23
emit_with_sf24:
    orr     w0, w0, w24, lsl #31
emit_inst_done:
    ldr     x9, [x28, #ST_TEXT_POS]
    str     w0, [x27, x9]
    add     x9, x9, #4
    str     x9, [x28, #ST_TEXT_POS]
    b       pl_done

// ──────────────────────────────────────────────────────────────────────────
//  parse_label_pc_rel — parse label ref then compute PC-relative offset
//  x0 = pointer to the label reference
//  uses [sp, #48] for return address (a spare slot in the caller's 64-byte
//  frame, so it must only be called while that frame is on top of the stack)
//  returns x0 = signed offset in instruction units
//
//  PC is ST_TEXT_BASE + ST_TEXT_POS, i.e. the address of the instruction
//  being encoded. The byte distance is divided by 4 with an arithmetic
//  shift so backward branches stay negative. No range check is done here;
//  callers mask the result to their field width.
// ──────────────────────────────────────────────────────────────────────────
parse_label_pc_rel:
    str     x30, [sp, #48]
    bl      parse_label_ref
    ldr     x9, [x28, #ST_TEXT_BASE]
    ldr     x10, [x28, #ST_TEXT_POS]
    add     x9, x9, x10              // x9 = address of this instruction
    sub     x0, x0, x9
    asr     x0, x0, #2
    ldr     x30, [sp, #48]
    ret


// parse_x23_ws — parse first register into x23, skip comma+ws
// x1 preserved (sf/is_64bit from parse_register)
// uses [sp, #48] for return address
// The return address is reloaded before the final branch, so ws_x2_skip1
// runs as a tail call and returns straight to this routine's caller.
parse_x23_ws:
    str     x30, [sp, #48]
    bl      ws_x21_parse_reg
    mov     x23, x0
    ldr     x30, [sp, #48]
    b       ws_x2_skip1

// ──────────────────────────────────────────────────────────────────────────
//  parse_2reg — parse "Rd, Rn" from operands (x21)
//  returns x22 = Rd, x23 = sf, x0 = Rn
//  NOTE: uses [sp, #56] for return address; called from encode_instruction
//  ([sp, #48] is taken by the nested parse_x23_ws, hence the second slot)
// ──────────────────────────────────────────────────────────────────────────
parse_2reg:
    str     x30, [sp, #56]
    bl      parse_x23_ws
    mov     x22, x23                  // Rd
    mov     x23, x1                  // sf
    b       p23_tail

// ──────────────────────────────────────────────────────────────────────────
//  parse_3reg — parse "Rd, Rn, Rm" from operands (x21)
//  returns x22 = Rd, x23 = sf, x24 = Rn, x0 = Rm
//  NOTE: uses [sp, #56] for return address; called from encode_instruction
// ──────────────────────────────────────────────────────────────────────────
parse_3reg:
    str     x30, [sp, #56]
    bl      parse_x23_ws
    mov     x22, x23                  // Rd
    mov     x23, x1                  // sf
    bl      parse_register
    mov     x24, x0                  // Rn
    bl      ws_x2_skip1                 // skip ','
// p23_tail — shared ending: reload the return address, then tail-call
// parse_register so the last register comes back in x0.
p23_tail:
    ldr     x30, [sp, #56]
    b       parse_register

// skip_lsl — step over ", lsl " and parse the "#N" that follows
// x0 = pointer (at the ',')
// Scans forward one byte at a time, starting at x0+1, until a '#' is found,
// then falls through into parse_hash_imm. The shift keyword itself is never
// inspected.
// NOTE(review): the scan has no other exit, so an operand without '#'
// runs past the end of the line.
skip_lsl:
1:  ldrb    w9, [x0, #1]!
    cmp     w9, #'#'
    b.ne    1b
    // falls through to parse_hash_imm

// ──────────────────────────────────────────────────────────────────────────
//  parse_hash_imm — parse #expr or #:lo12:expr (the '#' is optional)
//  x0 = pointer (at '#', or directly at the expression / ':')
//  returns x0 = value, x1 = pointer past
//  For the :lo12: form the value is reduced to its low 12 bits. No flag
//  distinguishing the two forms is returned.
//  uses [sp, #48] for return address
// ──────────────────────────────────────────────────────────────────────────
parse_hash_imm:
    str     x30, [sp, #48]

    // check first char: '#' or ':'
    ldrb    w9, [x0]
    cmp     w9, #'#'
    csinc   x0, x0, x0, ne          // skip '#' if found
    ldrb    w9, [x0]
    cmp     w9, #':'
    b.ne    phi_plain
    ldrb    w10, [x0, #1]
    cmp     w10, #'l'
    b.ne    phi_plain

    // :lo12: — skip 6 chars (only ":l" is actually compared)
    add     x0, x0, #6
    bl      parse_expr0
    and     x0, x0, #0xFFF
    b       phi_ret

phi_plain:
    bl      parse_expr0
phi_ret:
    ldr     x30, [sp, #48]
    ret

// ──────────────────────────────────────────────────────────────────────────
//  parse_label_ref — parse branch target (named label or Nf/Nb)
//  x0 = pointer
//  returns x0 = target address, x1 = pointer past
//
//  Numeric references index numlab_defs[digit * 128 + k], where k is the
//  digit's pass-2 cursor for "Nf" and cursor - 1 for "Nb". The index is not
//  range-checked against the number of recorded definitions.
//  Pushes the same 16-byte frame (x30, x20) as parse_expr_unary and shares
//  its pea_ret_x20 exit and its err_undef error path.
// ──────────────────────────────────────────────────────────────────────────
parse_label_ref:
    stp     x30, x20, [sp, #-16]!

    bl      skip_ws

    // numeric label ref? digit followed by 'f' or 'b'
    sub     w10, w9, #'0'
    cmp     w10, #9
    b.hi    plr_named

    ldrb    w11, [x0, #1]
    cmp     w11, #'b'
    cset    x1, eq                   // x1=1 backward, 0 forward
    b.eq    plr_numlab_common
    cmp     w11, #'f'
    b.ne    plr_named
plr_numlab_common:
    add     x20, x0, #2             // pointer past "Nf"/"Nb"
    mov     x0, x10                  // digit
    // inlined rnl_entry
    add     x9, x28, #NUMLAB_CURS_OFF
    ldr     x10, [x9, x0, lsl #3]   // cursor
    sub     x10, x10, x1             // backward: cursor-1, forward: cursor
    adrp    x11, numlab_defs
    add     x11, x11, :lo12:numlab_defs
    lsl     x12, x0, #7             // digit * 128
    add     x12, x12, x10           // + cursor
    ldr     x0, [x11, x12, lsl #3]
    b       pea_ret_x20

plr_named:
    bl      parse_ident
    cbz     x1, err_undef            // no identifier at all
    mov     x20, x2                  // save end pointer

    bl      sym_lookup
    cbz     x1, err_undef            // unknown symbol

    ldr     x0, [x0, #SYM_VALUE]
    b       pea_ret_x20

// ──────────────────────────────────────────────────────────────────────────
//  encode_logical_imm — encode bitmask immediate for logical instructions
//  x0 = value, x1 = is_32bit (1=replicate low 32 to full 64)
//  returns x0 = (N << 12) | (immr << 6) | imms
//  An unencodable value does not return: control branches to ei_logical_bad.
//
//  Method: rotate the value right until its lowest run of ones sits at
//  bit 0, measure that run ("ones") and the leading zeroes above the
//  repeating element ("zeroes"); the element size is their sum. The value is
//  encodable exactly when rotating it by that size reproduces it.
//  Rotations by 64 behave as rotations by 0 (ROR uses the count mod 64),
//  which covers the case where the run of ones already starts at bit 0.
//  Scratch: x9-x16.
// ──────────────────────────────────────────────────────────────────────────
encode_logical_imm:
    // leaf function — no frame needed, uses scratch registers only
    // for 32-bit, replicate low 32 bits
    cbz     x1, eli_start
    and     x0, x0, #0xFFFFFFFF
    orr     x0, x0, x0, lsl #32

eli_start:
    mov     x13, x0                  // val

    // reject all-zeros and all-ones
    cbz     x13, ei_logical_bad
    mvn     x9, x13
    cbz     x9, ei_logical_bad

    // rotation = ctz(val & (val + 1))
    add     x9, x13, #1
    and     x9, x13, x9
    rbit    x10, x9
    clz     x14, x10                 // rotation

    // normalized = ror(val, rotation)
    ror     x9, x13, x14

    // zeroes = clz(normalized)
    clz     x10, x9

    // ones = ctz(~normalized) = clz(rbit(~normalized))
    mvn     x11, x9
    rbit    x11, x11
    clz     x15, x11                 // ones

    // size = zeroes + ones
    add     x16, x10, x15

    // validate: ror(val, size) == val
    ror     x9, x13, x16
    cmp     x9, x13
    b.ne    ei_logical_bad

    // immr = (-rotation) & (size - 1)
    neg     x9, x14
    sub     x10, x16, #1
    and     x9, x9, x10             // immr

    // imms = (-(size << 1) | (ones - 1)) & 0x3F
    sub     x11, xzr, x16, lsl #1
    sub     x12, x15, #1
    orr     x11, x11, x12
    and     x11, x11, #0x3F         // imms

    // result = (N << 12) | (immr << 6) | imms  where N = size >> 6
    lsr     x12, x16, #6
    orr     x0, x11, x9, lsl #6
    orr     x0, x0, x12, lsl #12
    ret

// ──────────────────────────────────────────────────────────────────────────
//  parse_cond — decode a two-letter condition mnemonic (eq, ne, lt, ge, ...)
//  in:  x0 = pointer to the first letter
//  out: x0 = pointer just past the two letters
//       x1 = condition code, read from cond_xor_tbl[(char0 ^ char1) & 0x1F]
//  The letters are only hashed, never validated.
//  Leaf routine; scratch w9, w10, x11.
// ──────────────────────────────────────────────────────────────────────────
parse_cond:
    adr     x11, cond_xor_tbl
    ldrh    w9, [x0]                // both letters in one halfword load
    lsr     w10, w9, #8             // second letter
    eor     w10, w10, w9            // low byte = char0 ^ char1
    and     w10, w10, #0x1F         // 5-bit table index
    ldrb    w1, [x11, x10]
    add     x0, x0, #2              // step over both letters
    ret

// parse_reg_fail — failure exit: return with x0 = -1 (all bits set)
parse_reg_fail:
    movn    x0, #0                  // ~0 = -1
    ret

// ──────────────────────────────────────────────────────────────────────────
//  encode_instruction — dispatch mnemonic, parse operands, emit
//  x0 = mnemonic start, x1 = mnemonic length, x2 = operands start
//
//  Register roles for all ei_* handlers:
//    x19 = mnemonic start, x20 = mnemonic length, x21 = operands start
//    w9  = first mnemonic character, w10 = second
//  Handlers never return with 'ret': each one ends by branching to an
//  emit_* tail, which leaves through pl_done. That is why 'bl' can be used
//  freely here without saving x30.
//  Dispatch looks only at as many characters as are needed to tell the
//  supported mnemonics apart; unsupported spellings that share a prefix are
//  not rejected.
// ──────────────────────────────────────────────────────────────────────────
encode_instruction:
    // x19 already equals x0 (set by process_line before parse_ident)
    mov     x20, x1
    mov     x21, x2

    // dispatch on first character of mnemonic
    ldrb    w9, [x19]
    ldrb    w10, [x19, #1]

    cmp     w9, #'a'
    b.eq    ei_a
    cmp     w9, #'b'
    b.eq    ei_b
    cmp     w9, #'c'
    b.eq    ei_c
    cmp     w9, #'e'
    mov     x22, #2                 // eor opc (doesn't affect flags)
    b.eq    ei_logical
    cmp     w9, #'l'
    b.eq    ei_l
    cmp     w9, #'m'
    b.eq    ei_m
    cmp     w9, #'n'
    b.eq    ei_n
    cmp     w9, #'o'
    mov     x22, #1                 // orr opc (doesn't affect flags)
    b.eq    ei_logical
    cmp     w9, #'r'
    b.eq    ei_r
    cmp     w9, #'s'
    b.eq    ei_s
    cmp     w9, #'t'
    b.eq    ei_t
    cmp     w9, #'u'
    b.ne    ei_bad
// udiv Rd, Rn, Rm / ubfx / ubfm / ubfiz / uxtb / uxth
ei_u:
    cmp     w10, #'b'
    b.eq    ei_bfm_unified
    cmp     w10, #'x'
    b.eq    ei_sxt_uxt
ei_udiv:
    mov     w25, #0
// ei_div_common — w25 = 0 for udiv, 0x400 for sdiv; 0x0800 is the opcode
// field common to both divides.
ei_div_common:
    bl      parse_3reg
    orr     w9, w25, #0x0800
    b       emit_3reg_1AC0_tail

    // ── 'a' mnemonics: add, and, adrp ─────────────────────────────────────
    // Second letter: 'd' → add/adds/adr/adrp, 'n' → and/ands, anything else
    // is treated as asr and sent to the shared shift handler.
ei_a:
    cmp     w10, #'d'
    b.eq    ei_a_d
    cmp     w10, #'n'
    movz    w25, #0x2800             // ASRV opcode (speculative, harmless if AND)
    b.ne    ei_shift_common
    sub     x22, x20, #3             // len=3→0 (AND), len=4→1
    add     x22, x22, x22, lsl #1   // 0→0, 1→3 (ANDS opc)
    b       ei_logical
// sxtb/sxth/sxtw/uxtb/uxth Rd, Rn — SBFM/UBFM Rd, Rn, #0, #imms
// Sets x24 = Rn, w10 = immr (0), w11 = imms (7, 15 or 31) for the bitfield
// emit code it branches to.
ei_sxt_uxt:
    bl      parse_2reg
    mov     x24, x0
    mov     x10, #0                  // immr = 0
    ldrb    w9, [x19, #3]            // suffix: 'b', 'h', or 'w'
    ubfx    w11, w9, #3, #2          // 'b'→0, 'h'→1, 'w'→2
    mov     w12, #8
    lsl     w11, w12, w11            // 8, 16, 32
    sub     w11, w11, #1             // 7, 15, 31
    ldrb    w9, [x19]               // 's' or 'u'
    cmp     w9, #'s'
    b.ne    ei_ubfm_emit             // uxt → UBFM path (sxt falls through)
ei_asr_sbfm:
    movz    w0, #0x1300, lsl #16     // 32-bit SBFM base (sf+N applied later)
    b       ei_bfm_apply_n_sf

// "ad" prefix: the third letter separates add/adds ('d') from adr/adrp ('r')
ei_a_d:
    ldrb    w10, [x19, #2]
    cmp     w10, #'d'
    b.eq    ei_add
    cmp     w10, #'r'
    b.ne    ei_bad
    // adr/adrp shared: parse Rd, skip comma, precompute PC
    bl      ws_x21_parse_reg
    mov     x22, x0                  // Rd
    bl      ws_x2_skip1                 // skip ','
    ldr     x9, [x28, #ST_TEXT_BASE]
    ldr     x10, [x28, #ST_TEXT_POS]
    add     x25, x9, x10             // x25 = PC
    cmp     x20, #3
    b.eq    ei_adr_body
    // adrp: page-relative offset
    bl      parse_label_ref
    and     x23, x0, #~0xFFF         // page of the target
    and     x9, x25, #~0xFFF         // page of this instruction
    sub     x23, x23, x9
    asr     x23, x23, #12            // signed page count
    b       ei_adr_encode
ei_adr_body:
    // adr takes a general expression, not a parse_label_ref target
    bl      parse_expr0
    sub     x23, x0, x25             // imm21 = target - PC
ei_adr_encode:
    // encoding: immlo = imm21[1:0], immhi = imm21[20:2]
    and     w9, w23, #3              // immlo
    ubfx    w10, w23, #2, #19        // immhi (19 bits)
    sub     w23, w20, #3             // sf: 0=ADR(len3), 1=ADRP(len4)
    movz    w0, #0x1000, lsl #16     // ADR base opcode
    orr     w0, w0, w22
    orr     w0, w0, w9, lsl #29
    orr     w0, w0, w10, lsl #5
    b       emit_with_sf             // bit 31 (from w23) turns ADR into ADRP

    // ── 'b' mnemonics: b, bl, b.cond, bic, bfm, bfi, bfxil ────────────────
    // Second letter 'f' → bitfield group. Otherwise the mnemonic length
    // decides: 3 → blr/bic, more than 3 → error, 1 or 2 → b / bl / br.
    // A conditional branch arrives here as mnemonic "b" with the operand
    // text starting at '.'.
ei_b:
    cmp     w10, #'f'
    b.eq    ei_bfm_unified
    cmp     x20, #3
    b.eq    ei_b3
    b.hi    ei_bad
    cmp     w10, #'r'
    b.eq    ei_br
    // b (len=1) or bl (len=2): bit 31 = len-1
    sub     x9, x20, #1
    movz    w22, #0x1400, lsl #16
    orr     w22, w22, w9, lsl #31
    bl      ws_x21
    cmp     w9, #'.'
    b.eq    ei_bcond
    // B/BL: parse label, compute pc-relative offset
    bl      parse_label_pc_rel
    and     w0, w0, #0x3FFFFFF       // imm26
    orr     w0, w0, w22
    b       emit_inst_done
// 3-char 'b' mnemonics: blr or bic
ei_b3:
    ldrb    w9, [x19, #2]
    cmp     w9, #'r'
    b.eq    ei_blr
// bic Rd, Rn, Rm — AND Rd, Rn, ~Rm
// sf 00 01010 sh 1 Rm imm6 Rn Rd
ei_bic:
    bl      parse_3reg
    movz    w9, #0x0A20, lsl #16     // 32-bit BIC
    b       emit_3reg_sf_tail
// br Xn / blr Xn — branch (with link) to register
// br: x20=2 (len), blr: x20=3 → sub 2 gives 0 or 1 for bit 21
ei_br:
ei_blr:
    bl      ws_x21_parse_reg
    sub     w10, w20, #2
    movz    w9, #0xD61F, lsl #16
    orr     w9, w9, w10, lsl #21     // blr: set bit 21 → 0xD63F
    orr     w0, w9, w0, lsl #5       // Rn field
    b       emit_inst_done

    // ── 'c' mnemonics: cmp, cbz, cbnz, clz, cset ──────────────────────────
    // Second letter: 'm' → cmp/cmn, 'b' → cbz/cbnz, 'l' → clz. Everything
    // else is told apart by the FOURTH letter: 'n' → csinc, 'l' → csel,
    // otherwise cset.
ei_c:
    cmp     w10, #'m'
    b.eq    ei_c_cm
    cmp     w10, #'b'
    b.ne    2f
    ldrb    w10, [x19, #2]
    cmp     w10, #'z'
    cset    x22, ne                  // x22=0 for cbz, 1 for cbnz
    b.eq    ei_cbz_common
    cmp     w10, #'n'
    b.ne    ei_bad
// cbz/cbnz Rt, label — x22 supplies bit 24 (0 = cbz, 1 = cbnz)
ei_cbz_common:
    bl      parse_x23_ws             // x23 = Rt
    mov     x24, x1                  // sf
    bl      parse_label_pc_rel
    and     w0, w0, #0x7FFFF         // imm19
    orr     w0, w23, w0, lsl #5
    orr     w0, w0, w22, lsl #24
    movz    w9, #0x3400, lsl #16
    b       ei_addsub_sf_emit        // shared "orr base, apply sf, emit" tail
2:  cmp     w10, #'l'
    b.eq    ei_clz
    ldrb    w10, [x19, #3]
    cmp     w10, #'n'
    movz    w26, #0x0400                // CSINC bit (speculative)
    b.eq    ei_csel_common
    cmp     w10, #'l'
    movz    w26, #0                     // CSEL: no extra bits (speculative)
    b.eq    ei_csel_common
// cset Rd, cond — alias for CSINC Rd, xzr, xzr, invert(cond)
// encoding: 0x9A9F0000 | (inv_cond << 12) | 0x07E0 | Rd
// The 64-bit form is always emitted, whatever register width was written.
ei_cset:
    bl      ws_x21_parse_reg            // x0 = Rd, x2 = pointer past
    mov     x22, x0                     // Rd
    bl      ws_x2_skip1                     // skip ','
    bl      parse_cond                  // x1 = cond code
    eor     w1, w1, #1                  // invert condition (flip bit 0)
    orr     w0, w22, w1, lsl #12        // Rd | (inv_cond << 12)
    orr     w0, w0, #0x7E0              // | Rn=xzr<<5, o2=1
    movk    w0, #0x9A9F, lsl #16        // | sf=1, opc, Rm=xzr
    b       emit_inst_done

// csel Rd, Rn, Rm, cond
// encoding: sf 00 11010100 Rm cond 00 Rn Rd
//   64-bit base: 0x9A800000
// Also used for csinc, with w26 = 0x0400 (o2 bit) set by the dispatch above.
ei_csel_common:
    bl      parse_3reg                  // x22=Rd, x23=sf, x24=Rn, x0=Rm
    mov     x25, x0                     // save Rm
    bl      ws_x2_skip1                 // skip ','
    bl      parse_cond                  // x1 = cond
    movz    w9, #0x1A80, lsl #16        // CSEL 32-bit base
    orr     w9, w9, w26                 // | CSINC bit if set
    orr     w9, w9, w1, lsl #12        // | (cond << 12)
    mov     x0, x25                     // Rm for emit_3reg_sf_tail
    b       emit_3reg_sf_tail

    // ── 'l' mnemonics: ldr, ldrb, lsl, lsr ────────────────────────────────
    // Second letter 'd' → loads; everything else is a shift.
ei_l:
    cmp     w10, #'d'
    b.eq    ei_ld
// lsl/lsr — immediate (UBFM alias) or register (LSLV/LSRV)
ei_ls_shift:
    ldrb    w10, [x19, #2]
    movz    w25, #0x2000             // LSLV
    cmp     w10, #'r'
    b.ne    ei_shift_common
    movz    w25, #0x2400             // LSRV
// ei_shift_common — shared by lsl/lsr/asr
// in: w25 = register-form opcode bits (0x2000 LSLV, 0x2400 LSRV, 0x2800 ASRV)
// A third operand starting with '#' selects the immediate form.
ei_shift_common:
    bl      parse_2reg               // x22=Rd, x23=sf, x0=Rn, x2=ptr past
    mov     x24, x0                  // Rn
    bl      ws_x2_skip1
    cmp     w9, #'#'
    b.eq    ei_shift_imm_dispatch
    // register form
    bl      parse_register           // Rm
    mov     w9, w25
    b       emit_3reg_1AC0_tail

    // ── 'm' mnemonics: mov, movz, movn, movk, mul, msub, madd, mvn ────────
    // Second letter: 'o' → mov family, 'u' → mul, 's' → msub, 'a' → madd,
    // anything else → mvn. The movz/mov written between a cmp and its branch
    // do not change the flags.
ei_m:
    cmp     w10, #'o'
    b.eq    ei_mo
    cmp     w10, #'u'
    b.eq    ei_mul
    cmp     w10, #'s'
    movz    x26, #0x8000                // MSUB bit15 (speculative)
    b.eq    ei_madd_msub_common
    cmp     w10, #'a'
    mov     x26, #0                     // MADD bit15 (speculative)
    b.eq    ei_madd_msub_common
// mvn Rd, Rm — alias for orn Rd, xzr, Rm
ei_mvn:
    movz    w25, #0x2A20, lsl #16     // 32-bit ORN base
    b       ei_neg_mvn_common

ei_mo:
    // mov (3 chars) vs movz/movn/movk (4 chars)
    cmp     x20, #3
    b.eq    ei_mov
    // the fourth letter picks the move-wide opcode; w22 = 32-bit base
    ldrb    w10, [x19, #3]
    cmp     w10, #'z'
    movz    w22, #0x5280, lsl #16    // MOVZ base (speculative)
    b.eq    ei_movwide
    cmp     w10, #'n'
    movz    w22, #0x1280, lsl #16    // MOVN base (speculative)
    b.eq    ei_movwide
    cmp     w10, #'k'
    b.ne    ei_bad
    movz    w22, #0x7280, lsl #16    // MOVK base
    b       ei_movwide

    // ── 's' mnemonics: sub, str, strb, svc, sbfm, sbfx, sbfiz, sxt* ──────
    // Second letter: 'u' → sub/subs, 't' → stores, 'v' → svc, 'b' → bitfield
    // group, 'x' → sign extends; anything else is encoded as sdiv.
ei_s:
    cmp     w10, #'u'
    b.eq    ei_su
    cmp     w10, #'t'
    b.eq    ei_st
    cmp     w10, #'v'
    b.eq    ei_svc
    cmp     w10, #'b'
    b.eq    ei_bfm_unified
    cmp     w10, #'x'
    b.eq    ei_sxt_uxt
ei_sd:
    mov     w25, #0x400              // extra opcode bit that turns udiv into sdiv
    b       ei_div_common

// ei_bad — report an unknown instruction
ei_bad:
    adr     x0, msg_badins
    bl      error_at
    // NOTE(review): nothing stops execution here from running into ei_ret,
    // so error_at presumably never returns — confirm.

// ret — always encodes RET x30 (0xD65F03C0); operands are not parsed
ei_ret:
    movz    w0, #0x03C0
    movk    w0, #0xD65F, lsl #16
    b       emit_inst_done

// svc #imm16 — 0xD4000001 | (imm16 << 5)
ei_svc:
    bl      ws_x21
    bl      parse_hash_imm           // x0 = imm16 value
    and     w9, w0, #0xFFFF
    movz    w0, #0x0001
    movk    w0, #0xD400, lsl #16     // 0xD4000001
    orr     w0, w0, w9, lsl #5
    b       emit_inst_done

// b.cond label — entered from ei_b with x0 pointing at the '.'
ei_bcond:
    add     x0, x0, #1              // skip '.'
    movz    w22, #0x5400, lsl #16    // 0x54000000
    bl      parse_cond
    orr     w22, w22, w1             // base | cond
    bl      parse_label_pc_rel
    // 0x54000000 | (imm19 << 5) | cond
    and     w0, w0, #0x7FFFF
    orr     w0, w22, w0, lsl #5
    b       emit_inst_done

// clz Rd, Rn — 64-bit: 0xDAC01000, 32-bit: 0x5AC01000
ei_clz:
    mov     w25, #0x1000
    b       ei_clz_rbit_common

// 'r' mnemonics — second letter: 'e' → ret, 'o' → ror, anything else → rbit
// rbit Rd, Rn — 64-bit: 0xDAC00000, 32-bit: 0x5AC00000
ei_r:
    cmp     w10, #'e'
    b.eq    ei_ret
    cmp     w10, #'o'
    b.eq    ei_ror
ei_rbit:
    mov     w25, #0
// ei_clz_rbit_common — w25 = opcode bits to merge into the 0x5AC00000 base
ei_clz_rbit_common:
    bl      parse_2reg               // x22=Rd, x23=sf, x0=Rn
    mov     x24, x0                  // Rn for emit_3reg_sf_tail
    movz    w9, #0x5AC0, lsl #16     // 32-bit base
    orr     w9, w9, w25              // opcode (0x1000 for clz, 0 for rbit)
    mov     x0, #0                   // no Rm field
    b       emit_3reg_sf_tail

// ror Rd, Rn, Rm — RORV: 0x1AC02C00 (32-bit) / 0x9AC02C00 (64-bit)
// Only the register form is handled here.
ei_ror:
    bl      parse_3reg
    movz    w9, #0x2C00
    b       emit_3reg_1AC0_tail

// add/adds Rd, Rn, #imm / Rm [, lsl #N] / :lo12:sym
ei_add:
    mov     x22, #0                  // op=0 (ADD)
    b       ei_addsub_s
// sub/subs Rd, Rn, #imm / Rm
ei_su:
ei_sub:
    movz    x22, #0x4000, lsl #16    // op=1 (SUB)
ei_addsub_s:
    sub     x9, x20, #3             // 0 for len=3, 1 for len=4
    orr     x22, x22, x9, lsl #29   // set S flag if len=4
ei_addsub:
    bl      parse_x23_ws
    mov     x24, x1                  // sf
    bl      parse_register
    mov     x25, x0                  // save Rn
    bl      ws_x2_skip1                 // skip ','

    // is the third operand a register or immediate?
ei_addsub_operand:
    cmp     w9, #'a'
    b.lo    ei_addsub_imm            // '#' or ':lo12:' (both < 'a')

    // register form: add Rd, Rn, Rm [, lsl #N]
    bl      parse_register
    mov     x21, x0                  // Rm
    bl      ws_x2
    // check for optional ", lsl #N"
    cmp     w9, #','
    mov     x9, #0                   // shift amount default 0 (doesn't affect flags)
    b.ne    ei_addsub_reg_emit
    bl      skip_lsl                 // skip ", lsl" + parse_hash_imm
    mov     x9, x0                   // shift amount

ei_addsub_reg_emit:
    // sf op 0 01011 shift 0 Rm imm6 Rn Rd
    // shift = 00 (LSL)
    and     w11, w9, #0x3F
    orr     w0, w23, w25, lsl #5     // Rd | (Rn << 5)
    orr     w0, w0, w11, lsl #10     // imm6
    orr     w0, w0, w21, lsl #16     // Rm
    orr     w0, w0, w22              // op|S bits
    movz    w9, #0x0B00, lsl #16
    b       ei_addsub_sf_emit

ei_addsub_imm:
    // immediate form: #expr or #:lo12:expr
    bl      parse_hash_imm           // x0=val, x2=is_lo12
    lsr     x9, x0, #12
    cbnz    x9, ei_logical_bad       // imm12 out of range (0-4095)
    // sf op 0 10001 shift imm12 Rn Rd
    orr     w9, w23, w25, lsl #5     // Rd | (Rn << 5)
    orr     w9, w9, w0, lsl #10     // imm12 (bits 12+ known zero)
    orr     w9, w9, w22              // op|S bits
    movz    w0, #0x1100, lsl #16

// shared tail: w9=opcode bits (0x0B00 or 0x1100 << 16), x24=sf, w0=partial insn
ei_addsub_sf_emit:
    orr     w0, w0, w9
    b       emit_with_sf24

// cmp/cmn Rn, <#imm | Rm [, lsl #N]>
// Encoded as SUBS/ADDS with the destination field forced to register 31; the
// operand parsing and emission are shared with add/sub via ei_addsub_operand.
ei_c_cm:
    mov     x23, #31                 // destination field = 31 (result discarded)
    movz    x22, #0x6000, lsl #16    // default to cmp: op=1, S=1 (SUBS)
    ldrb    w10, [x19, #2]           // third mnemonic character: 'p' or 'n'
    cmp     w10, #'n'
    b.ne    1f
    movz    x22, #0x2000, lsl #16    // cmn: op=0, S=1 (ADDS)
1:  bl      ws_x21_parse_reg         // parse the first operand
    mov     x25, x0                  // Rn
    mov     x24, x1                  // sf
    bl      ws_x2_skip1              // step over ','
    b       ei_addsub_operand

// and/eor/orr — immediate (bitmask) or register
//
// On entry x22 holds the 2-bit opc value (it is shifted to bits 30:29 below).
// ei_logical_operand is a second entry point used by tst, which arrives with
// x23=Rd, x24=sf, x26=Rn already set and w9 = next source character.
ei_logical:
    bl      parse_x23_ws
    mov     x24, x1                  // sf
    bl      parse_register
    mov     x26, x0                  // Rn
    bl      ws_x2_skip1
ei_logical_operand:
    cmp     w9, #'#'
    b.eq    ei_logical_imm

    // register form: sf opc 01010 sh 0 Rm imm6 Rn Rd
    bl      parse_register
    mov     x21, x0                  // Rm
    bl      ws_x2
    mov     w25, #0                  // shift amount = 0 default
    cmp     w9, #','
    b.ne    ei_logical_reg_emit
    bl      skip_lsl                 // skip ", lsl" + parse_hash_imm
    mov     w25, w0                  // shift amount
ei_logical_reg_emit:
    // NOTE: unlike the add/sub register form, the shift amount is not masked
    // to 6 bits before being placed in imm6.
    orr     w0, w23, w26, lsl #5     // Rd | (Rn << 5)
    orr     w0, w0, w25, lsl #10     // imm6 (shift amount)
    orr     w0, w0, w21, lsl #16     // Rm
    orr     w0, w0, w22, lsl #29     // opc
    movz    w9, #0x0A00, lsl #16     // shifted-register logical class bits
    b       ei_addsub_sf_emit

ei_logical_imm:
    bl      parse_hash_imm
    eor     x1, x24, #1             // is_32bit = !sf
    bl      encode_logical_imm
    // x0 = (N<<12)|(immr<<6)|imms
    orr     w9, w23, w26, lsl #5     // Rd | (Rn << 5)
    orr     w9, w9, w0, lsl #10      // | N/immr/imms
    orr     w9, w9, w22, lsl #29     // opc
    movz    w0, #0x1200, lsl #16     // 100100 in bits 28:23
    b       ei_addsub_sf_emit        // orr w0|w9, apply sf, emit

// tst Rn, <#bitmask | Rm>
// Assembled as ANDS with the destination field set to 31, reusing the operand
// handling of the logical-instruction encoder.
ei_tst:
    mov     x23, #31                 // destination field = 31 (result discarded)
    mov     x22, #3                  // opc 3 selects ANDS
    bl      ws_x21_parse_reg         // parse the only register operand
    mov     x26, x0                  // Rn
    mov     x24, x1                  // sf
    bl      ws_x2_skip1              // step over ','
    b       ei_logical_operand

// Common "bad immediate" error exit: reports msg_badimm via error_at.
// Used by the add/sub imm12 range check and by mov #imm when no MOVZ/MOVN
// encoding exists. Nothing follows the bl, so error_at is presumably
// non-returning — confirm against its definition.
ei_logical_bad:
    adr     x0, msg_badimm
    bl      error_at


// ldr/ldrb/str/strb/ldp/stp — multiple addressing modes
//
// Register roles while a single-register load/store is encoded:
//   x19 = mnemonic text; x20 = mnemonic length on entry, then reused as the
//         size field (0=byte, 1=half, 2=32-bit, 3=64-bit)
//   x22 = opc field: 0 store, 1 load, 2/3 sign-extending load
//   x23 = Rt, x25 = Rn
//   x24 = nonzero when the mnemonic has a size suffix; later reused for Rm
//   w10 = S bit, imm9 or imm12 depending on the addressing mode
// w9 is expected to hold the first mnemonic character on entry to ei_ld/ei_st
// (set by the dispatcher, not visible here).
// None of the immediate paths range-check the offset: it is shifted and masked
// into its field as-is.
ei_ld:
ei_st:
    cmp     w9, #'l'
    cset    x22, eq                  // 1 for load ('l'), 0 for store ('s')
ei_ldst_dispatch:
    ldrb    w10, [x19, #2]
    cmp     w10, #'p'
    b.eq    ei_ldst_pair
    sub     x24, x20, #3            // 0 for ldr/str (len=3), 1 for ldrb/strb/ldrh/strh (len=4)
ei_ldst:
    bl      parse_x23_ws
    mov     x21, x1                  // sf (size for non-byte)
    // precompute size encoding: 0=byte, 1=half, 2=32bit, 3=64bit
    add     w20, w21, #2             // 2 or 3
    cbz     x24, 1f                  // len=3: use sf+2
    ldrb    w20, [x19, #3]          // 'b'=0x62, 'h'=0x68, 's'=0x73
    cmp     w20, #'s'
    b.eq    ei_ldrs_size             // sign-extending load (ldrsb/ldrsh/ldrsw)
    ubfx    w20, w20, #3, #2        // 0 for byte, 1 for half
1:  // literal load check: ldr Rt, label (no bracket)
    cbz     x22, ei_ldst_bracket     // store: must have [
    cbnz    x24, ei_ldst_bracket     // ldrb/ldrh: must have [
    cmp     w9, #'['
    b.ne    ei_ldr_literal
ei_ldst_bracket:
    bl      skip1_ws                 // skip '['
    bl      parse_register           // Rn
    mov     x25, x0                  // save Rn
    bl      ws_x2
    cmp     w9, #']'
    b.eq    ei_ldst_base_only
    cmp     w9, #','
    b.ne    pe_atom_err              // neither ']' nor ',' after the base register
    bl      skip1_ws
    cmp     w9, #'a'
    b.lo    ei_ldst_uimm            // '#' or ':lo12:' (both < 'a')

    // register offset: Rm [, lsl #N]
    bl      parse_register
    mov     x24, x0                 // save Rm
    bl      ws_x2
    mov     w10, #0                  // S=0
    cmp     w9, #']'
    b.eq    ei_ldst_reg_emit
    bl      skip_lsl                 // skip ", lsl" + parse_hash_imm
    cbz     x0, ei_ldst_reg_emit     // lsl #0 keeps S=0
    mov     w10, #1                  // S=1 (any nonzero amount; not checked against the access size)

ei_ldst_reg_emit:
    bl      ldst_base
    orr     w0, w0, w10, lsl #12     // S bit
    orr     w0, w0, w24, lsl #16     // Rm
    movz    w9, #0x6800              // 0x800 | 0x6000
    movk    w9, #0x3820, lsl #16     // | 0x38000000 | 0x00200000
    b       orr_w9_emit

    // [Rn] — either plain base (offset 0) or the start of a post-index form
ei_ldst_base_only:
    bl      skip1_ws                 // skip ']'
    cmp     w9, #','
    b.eq    ei_ldst_post
    mov     x0, #0
    b       ei_ldst_uimm_encode

ei_ldst_uimm:
    bl      parse_hash_imm           // x0=value, x1=ptr past imm
    // check for pre-index: [Rn, #simm9]!
    // (the two bytes at x1 are inspected directly, without skipping whitespace)
    ldrb    w9, [x1]
    cmp     w9, #']'
    b.ne    ei_ldst_uimm_encode
    ldrb    w9, [x1, #1]
    cmp     w9, #'!'
    b.ne    ei_ldst_uimm_encode
    // pre-index encoding
    and     w10, w0, #0x1FF          // imm9 (two's-complement, truncated)
    bl      ldst_base
    orr     w0, w0, #0x00000C00      // pre-index: bits[11:10] = 11
    b       ei_ldst_simm9_tail
ei_ldst_uimm_encode:
    tbnz    x0, #63, ei_ldst_unscaled   // negative → LDUR/STUR encoding
    lsr     x0, x0, x20              // scale by access size; low bits are dropped, no alignment check
    and     w10, w0, #0xFFF          // imm12 (higher bits are dropped, no range check)
    bl      ldst_base
    orr     w0, w0, w10, lsl #10
    movz    w9, #0x3900, lsl #16     // unsigned-offset class bits
    b       orr_w9_emit
ei_ldst_unscaled:
    and     w10, w0, #0x1FF          // imm9 (two's-complement, truncated)
    bl      ldst_base
    b       ei_ldst_simm9_tail          // bits[11:10] = 00 (unscaled)

ei_ldst_post:
    bl      skip1_ws                 // skip ','
    bl      parse_hash_imm
    and     w10, w0, #0x1FF          // imm9 (two's-complement, truncated)
    bl      ldst_base
    orr     w0, w0, #0x00000400      // post-index: bits[11:10] = 01
ei_ldst_simm9_tail:                  // shared by pre-index and post-index
    orr     w0, w0, w10, lsl #12     // imm9 at [20:12]
    orr     w0, w0, #0x38000000
    b       emit_inst_done

// sign-extending load: determine size and opc from mnemonic suffix + dest register
// x21=sf (from parse_x23_ws), x19=mnemonic
// Goes straight to ei_ldst_bracket, so only bracketed addressing is accepted.
ei_ldrs_size:
    ldrb    w9, [x19, #4]           // 5th char: 'b','h','w'
    ubfx    w20, w9, #3, #2         // 'b'→0, 'h'→1, 'w'→2
    mov     w22, #3
    sub     w22, w22, w21            // opc = 3 - sf (Xd→2, Wd→3)
    b       ei_ldst_bracket

// ldr Rt, label — PC-relative literal load
// x23=Rt, x21=sf, x0=pointer to label
ei_ldr_literal:
    bl      parse_label_pc_rel       // x0 = (target - PC) / 4
    ubfiz   w0, w0, #5, #19         // imm19 << 5
    orr     w0, w0, w23              // Rt
    movz    w9, #0x1800, lsl #16     // 32-bit base (0x18000000)
    orr     w9, w9, w21, lsl #30     // sf=1 → 0x58000000 for 64-bit
    b       orr_w9_emit

// ldp/stp Rt, Rt2, [Rn] | [Rn, #imm] | [Rn, #imm]! | [Rn], #imm
//
// Register roles: x22=L (load bit), x23=Rt, x24=Rt2, x25=Rn, x21=sf,
// x26 = addressing-mode XOR mask applied to the signed-offset base opcode,
// x20 = scratch copy of the immediate while the "]!" check runs.
// The offset is scaled and masked to imm7 without a range or alignment check.
ei_ldst_pair:
    // x22=L (already set by ei_ld/ei_st)
    bl      parse_x23_ws
    mov     x21, x1                  // save sf (0=32-bit, 1=64-bit)
    bl      parse_register           // Rt2
    mov     x24, x0                  // Rt2
    bl      ws_x2_skip1                 // skip ','
    bl      skip1_ws                 // skip '['
    bl      parse_register           // Rn
    mov     x25, x0                  // Rn
    mov     x26, #0                  // addressing mode: 0=signed-offset
    bl      ws_x2
    cmp     w9, #']'
    b.eq    ei_pair_close
    bl      skip1_ws                 // skip ','
    bl      parse_hash_imm
    // x0=value, x1=ptr past imm
    // check for pre-index: ']' then '!'
    mov     x20, x0                  // save imm value
    bl      ws_x1                    // skip_ws from ptr past imm
    cmp     w9, #']'
    b.ne    ei_pair_pre_done
    bl      skip1_ws
    cmp     w9, #'!'
    b.ne    ei_pair_pre_done
    movz    w26, #0x0080, lsl #16    // pre-index: XOR sets bit 23
ei_pair_pre_done:
    mov     x0, x20                  // restore imm value
    b       ei_pair_encode
ei_pair_close:
    // saw ']' — check for post-index: ], #imm
    bl      skip1_ws                 // skip ']'
    cmp     w9, #','
    b.ne    1f
    bl      skip1_ws                 // skip ','
    bl      parse_hash_imm
    movz    w26, #0x0180, lsl #16    // post-index: XOR flips bit24 off, bit23 on
    b       ei_pair_encode
1:  mov     x0, #0                  // base-only: offset=0
ei_pair_encode:
    add     w10, w21, #2             // shift: 2 (32-bit) or 3 (64-bit)
    asr     w0, w0, w10              // arithmetic shift keeps the sign of negative offsets
    and     w0, w0, #0x7F            // imm7
    movz    w9, #0x2900, lsl #16     // 32-bit STP/LDP base (signed offset)
    orr     w9, w9, w21, lsl #31     // sf=1 → 0xA900
    eor     w9, w9, w26              // apply addressing mode bits
    orr     w9, w9, w22, lsl #22     // L bit
    orr     w9, w9, w0, lsl #15      // imm7
    orr     w9, w9, w24, lsl #10     // Rt2
    orr     w9, w9, w25, lsl #5      // Rn
    orr     w0, w9, w23              // Rt
    b       emit_inst_done

// madd/msub Rd, Rn, Rm, Ra
// Base opcode is 0x1B000000 (32-bit) or 0x9B000000 (64-bit). The caller
// supplies bit 15 in w26 (0 or 0x8000), which selects between the two
// mnemonics.
ei_madd_msub_common:
    bl      parse_3reg                  // x22=Rd, x23=sf, x24=Rn, x0=Rm
    mov     x25, x0                     // keep Rm across the next calls
    bl      ws_x2_skip1                 // step over ','
    bl      parse_register              // x0 = Ra
    // register fields, collected in w9
    orr     w9, w22, w24, lsl #5        // Rd | (Rn << 5)
    orr     w9, w9, w25, lsl #16        // | (Rm << 16)
    orr     w9, w9, w0, lsl #10         // | (Ra << 10)
    // opcode bits, collected in w0
    mov     w0, w26                     // bit 15 from the caller
    movk    w0, #0x1B00, lsl #16        // 32-bit base in the upper half
    orr     w0, w0, w23, lsl #31        // sf selects the 64-bit variant
    b       orr_w9_emit                 // w0 |= w9, then emit

// mul Rd, Rn, Rm — MADD Rd, Rn, Rm, XZR
// 64-bit: 0x9B007C00 | (Rm<<16) | (Rn<<5) | Rd
// 32-bit: 0x1B007C00 | ...
// The 0x7C00 constant is Ra=31 (XZR) pre-shifted into bits 14:10; the sf bit
// and register fields are merged by emit_3reg_sf_tail.
ei_mul:
    bl      parse_3reg
    movz    w9, #0x7C00
    movk    w9, #0x1B00, lsl #16     // 32-bit base
    b       emit_3reg_sf_tail

// lsl/lsr/asr Rd, Rn, #N — immediate shifts, encoded as UBFM/SBFM aliases.
//
// Expects x22=Rd, x23=sf, x24=Rn and x19=mnemonic text to be set by the code
// that branches here (not visible in this section); w25 carries the
// register-shift opcode bits, of which bit 11 distinguishes ASR.
// The shift amount is not range-checked against the register size.
//
// ei_ubfm_emit / ei_bfm_apply_n_sf / ei_ubfm_orr are shared entry points:
//   w10 = immr, w11 = imms, and (from ei_bfm_apply_n_sf on) w0 = base opcode.
ei_shift_imm_dispatch:
ei_shift_imm:
    bl      parse_hash_imm
    mov     x21, x0                  // shift amount
    mov     x11, #31
    add     x11, x11, x23, lsl #5   // size-1 = 31 or 63 (shared)
    ldrb    w9, [x19, #2]            // third mnemonic char: 'l' for lsl, 'r' for lsr/asr
    cmp     w9, #'r'
    b.eq    ei_lsr_asr_imm
    // LSL #n: UBFM Rd, Rn, #(-n mod size), #(size-1-n)
    neg     x10, x21
    and     x10, x10, x11           // immr = (-n) & (size-1)
    sub     x11, x11, x21           // imms = (size-1) - n
    b       ei_ubfm_emit

ei_lsr_asr_imm:
    // LSR/ASR #n: immr = n, imms = size-1 (x11 is already size-1)
    mov     x10, x21                 // immr = n
    tbnz    w25, #11, ei_asr_sbfm    // bit 11 set in w25 = ASR (0x2800)

ei_ubfm_emit:
    movz    w0, #0x5300, lsl #16     // UBFM base (sf+N applied below)
ei_bfm_apply_n_sf:
    orr     w0, w0, w23, lsl #22     // N bit = sf
ei_ubfm_orr:
    orr     w0, w0, w22              // Rd
    orr     w0, w0, w24, lsl #5      // Rn
    orr     w0, w0, w11, lsl #10     // imms
    orr     w0, w0, w10, lsl #16     // immr
    b       emit_with_sf

// ── unified bitfield handler (ubfx/ubfm/ubfiz/sbfx/sbfm/sbfiz/bfm/bfi/bfxil)
//
// Syntax handled: <mnemonic> Rd, Rn, #op3, #op4
// The base opcode comes from the first mnemonic character ('s' → SBFM,
// 'u' → UBFM, anything else → BFM). The operand interpretation comes from one
// later character: index 3 for s*/u* mnemonics, index 2 for b* mnemonics:
//   'x' → extract  (ubfx/sbfx/bfxil)   op3=lsb,  op4=width
//   'm' → raw      (ubfm/sbfm/bfm)     op3=immr, op4=imms
//   else → insert  (ubfiz/sbfiz/bfi)   op3=lsb,  op4=width
// All three paths leave immr in x10 and imms in x11 and finish through
// ei_bfm_apply_n_sf with the base opcode in w0. Operands are not range-checked.
ei_bfm_unified:
    bl      parse_2reg               // x22=Rd, x23=sf, x0=Rn, x2=ptr past
    mov     x24, x0                  // Rn
    bl      ws_x2_skip1              // skip ','
    bl      parse_hash_imm           // #op3
    mov     x25, x0
    bl      ws_x1
    bl      skip1_ws                 // skip ','
    bl      parse_hash_imm           // #op4
    mov     x9, x0                   // op4 in x9
    // determine base opcode from mnemonic first char (x19 preserved)
    ldrb    w10, [x19]
    mov     w11, #3                  // suffix offset for u*/s* prefix
    movz    w0, #0x1300, lsl #16     // SBFM
    cmp     w10, #'s'
    b.eq    1f
    movz    w0, #0x5300, lsl #16     // UBFM
    cmp     w10, #'u'
    b.eq    1f
    movz    w0, #0x3300, lsl #16     // BFM
    mov     w11, #2                  // suffix offset for b* prefix
1:  ldrb    w11, [x19, x11]          // load distinguishing char
    cmp     w11, #'x'
    b.eq    bfm_extract_apply
    cmp     w11, #'m'
    b.eq    bfm_raw_apply

// insert: immr=(-lsb) mod size, imms=width-1 (fall-through from dispatch)
bfm_insert_apply:
    sub     x11, x9, #1              // imms = width - 1
    mov     x10, #31
    add     x10, x10, x23, lsl #5   // size-1 = 31 or 63
    neg     x9, x25
    and     x10, x9, x10             // immr = (-lsb) & (size-1)
    b       ei_bfm_apply_n_sf

// extract: immr=lsb(x25), imms=lsb+width-1 (falls through to raw)
bfm_extract_apply:
    add     x9, x25, x9
    sub     x9, x9, #1
// raw: immr=x25, imms=x9
bfm_raw_apply:
    mov     x10, x25
    mov     x11, x9
    b       ei_bfm_apply_n_sf

// (bitfield handlers unified into ei_bfm_unified above)

// mov — multiple forms
//
//   mov Rd, Rm     → ORR Rd, XZR, Rm, or ADD Rd, Rn, #0 when either register
//                    number is 31 (so that 31 is interpreted as SP)
//   mov Rd, #imm   → a single MOVZ or MOVN; anything that needs more than one
//                    16-bit lane in both the value and its complement is
//                    rejected through ei_logical_bad
// Register roles: x22=Rd, x23=sf, x24=value being matched, x25=lane shift
// (0/16/32/48), x26=phase (0=MOVZ, 1=MOVN).
// NOTE(review): if parse_register returns 31 for xzr/wzr as well as sp, then
// `mov Xd, xzr` is assembled as `mov Xd, sp` by the register-31 test below —
// confirm against parse_register.
ei_mov:
    bl      ws_x21_parse_reg
    mov     x22, x0                  // Rd
    mov     x23, x1                  // sf
    bl      ws_x2_skip1                 // skip ','

    cmp     w9, #'#'
    b.eq    ei_mov_imm

    // register form
    bl      parse_register
    // if either reg is 31, use ADD Rd, Rn, #0 (handles SP)
    cmp     x22, #31
    b.eq    ei_mov_add
    cmp     x0, #31
    b.eq    ei_mov_add
    // ORR Rd, XZR, Rm — x0 = Rm from parse_register
    movz    w9, #0x03E0              // Rn field = 31 (XZR)
    movk    w9, #0x2A00, lsl #16     // 32-bit ORR (shifted register) base
    orr     w9, w9, w22
    orr     w0, w9, w0, lsl #16
    b       emit_with_sf            // x23=sf: sets bit 31 if 64-bit

ei_mov_add:
    // ADD Rd, Rn, #0 — x0 = Rm from parse_register
    orr     w0, w22, w0, lsl #5
    movk    w0, #0x1100, lsl #16     // register numbers fit in the low half, so movk is safe here
    b       emit_with_sf            // x23 = sf

ei_mov_imm:
    bl      parse_hash_imm
    mov     x24, x0
    mov     x26, #0                  // phase: 0=MOVZ, 1=MOVN
ei_mov_try_phase:
    mov     x25, #0                  // hw shift counter (reset each phase)
ei_mov_hw_loop:
    // Does the value consist of exactly one 16-bit lane at this shift?
    // All four lanes are tried regardless of sf.
    lsr     x9, x24, x25
    and     x9, x9, #0xFFFF
    lsl     x11, x9, x25
    cmp     x11, x24
    b.eq    ei_mov_found
    add     x25, x25, #16
    cmp     x25, #64
    b.lt    ei_mov_hw_loop
    // try MOVN phase
    cbnz    x26, ei_logical_bad      // already tried the complement: not encodable
    mvn     x24, x24
    mov     x26, #1
    b       ei_mov_try_phase

ei_mov_found:
    // x9 = imm16, x25 = shift, x26 = phase (0=MOVZ, 1=MOVN)
    movz    w0, #0x5280, lsl #16    // MOVZ base
    sub     w0, w0, w26, lsl #30    // MOVN: subtract 0x40000000 (clear bit 30)
    orr     w0, w0, w23, lsl #31    // sf bit
    orr     w0, w0, w22             // Rd
    orr     w0, w0, w9, lsl #5      // imm16
    orr     w0, w0, w25, lsl #17    // hw (shift_amount << 17 = hw << 21)
    b       emit_inst_done

// movz/movn/movk Rd, #imm16 [, lsl #N]
// The caller provides the 32-bit base opcode for the specific mnemonic in w22.
// The shift amount is written in bits and lands in the hw field because
// (N << 17) == ((N / 16) << 21) for N in {0, 16, 32, 48}.
ei_movwide:
    bl      parse_x23_ws
    mov     x24, x1                  // sf
    bl      parse_hash_imm           // value of #imm16
    and     w25, w0, #0xFFFF         // keep 16 bits; x25 survives the calls below

    // optional ", lsl #N"
    bl      ws_x1
    mov     w10, #0                  // no shift unless one is given
    cmp     w9, #','
    b.ne    ei_movwide_emit
    bl      skip_lsl                 // consumes ", lsl" and parses the amount
    mov     w10, w0                  // shift amount in bits

ei_movwide_emit:
    orr     w0, w23, w25, lsl #5     // Rd | imm16
    orr     w0, w0, w10, lsl #17     // | hw
    orr     w0, w0, w24, lsl #31     // | sf
    orr     w0, w0, w22              // | base opcode
    b       emit_inst_done

// tbz/tbnz Rt, #bit, label — b5 011011 op b40 imm14 Rt
//
// ei_t is the dispatcher for mnemonics starting with 't': w10 is expected to
// hold the second mnemonic character (set by the caller, not visible here), so
// "ts…" goes to tst; otherwise the third character must be 'z' or 'n'.
// x22 = op (0 = tbz, 1 = tbnz), x23 = Rt, x24 = bit number.
ei_t:
    cmp     w10, #'s'
    b.eq    ei_tst
    ldrb    w9, [x19, #2]
    cmp     w9, #'z'
    b.eq    1f
    cmp     w9, #'n'
    b.ne    ei_bad
1:  sub     x22, x20, #3            // 0 for tbz (len=3), 1 for tbnz (len=4)
ei_tbz_common:
    bl      parse_x23_ws
    bl      parse_hash_imm
    mov     x24, x0                  // bit number
    bl      ws_x1
    add     x0, x0, #1              // skip ','
    bl      parse_label_pc_rel
    and     w0, w0, #0x3FFF          // imm14 (branch offset in instructions)
    orr     w0, w23, w0, lsl #5      // Rt | imm14 << 5
    bfi     w0, w24, #19, #5         // b40 = low 5 bits of the bit number
    lsr     w9, w24, #5
    orr     w0, w0, w9, lsl #31      // b5 = bit 5 of the bit number
    orr     w0, w0, w22, lsl #24     // op
    movz    w9, #0x3600, lsl #16     // TBZ base opcode
// Shared tail used by several encoders: merge w9 into w0 and emit.
orr_w9_emit:
    orr     w0, w0, w9
    b       emit_inst_done

// neg Rd, Rm — alias for sub Rd, xzr, Rm  /  nop
//
// ei_n dispatches mnemonics starting with 'n' on w10, expected to hold the
// second mnemonic character (set by the caller, not visible here): 'o' → nop,
// anything else → neg.
// ei_neg_mvn_common is a shared entry: the caller passes the 32-bit base
// opcode in w25 and the instruction is emitted with Rn forced to 31.
ei_n:
    cmp     w10, #'o'
    b.eq    ei_nop
ei_neg:
    movz    w25, #0x4B00, lsl #16     // 32-bit SUB base
ei_neg_mvn_common:
    bl      parse_2reg               // x22=Rd, x23=sf, x0=Rm
    mov     x24, #31                 // Rn = xzr
    mov     w9, w25
    b       emit_3reg_sf_tail

// nop — fixed encoding 0xD503201F, no operands to parse
ei_nop:
    movz    w0, #0xD503, lsl #16     // upper half first (clears the lower half)
    movk    w0, #0x201F              // then drop in the lower half
    b       emit_inst_done

// ── shared emit tails ─────────────────────────────────────────────────────
// Three stacked entry points for three-register instructions; each one falls
// through into the next.
//   emit_3reg_1AC0_tail  in: w9 = low opcode bits; installs 0x1AC0 in the
//                        upper half of w9, then continues below
//   emit_3reg_sf_tail    in: w9 = 32-bit base, x23 = sf; sets bit 31 when sf=1
//   emit_3reg_tail       in: w9 = base, x22 = Rd, x24 = Rn, x0 = Rm;
//                        assembles the word in w0 and emits it
emit_3reg_1AC0_tail:
    movk    w9, #0x1AC0, lsl #16
emit_3reg_sf_tail:
    orr     w9, w9, w23, lsl #31
emit_3reg_tail:
    orr     w9, w9, w24, lsl #5      // Rn
    orr     w9, w9, w22              // Rd
    orr     w0, w9, w0, lsl #16      // Rm, result in w0
    b       emit_inst_done
// ldst_base: build the fields common to every single-register load/store.
//   in:  x20 = size (bits 31:30), x22 = opc (from bit 22), w23 = Rt, x25 = Rn
//   out: w0  = size<<30 | opc<<22 | Rn<<5 | Rt
// Touches no register other than w0.
ldst_base:
    orr     w0, w23, w25, lsl #5     // Rt | Rn<<5
    orr     w0, w0, w22, lsl #22     // | opc<<22
    orr     w0, w0, w20, lsl #30     // | size<<30
    ret

// ══════════════════════════════════════════════════════════════════════════
//  Compression — two-tier dictionary encoder
// ══════════════════════════════════════════════════════════════════════════

// ──────────────────────────────────────────────────────────────────────────
//  compress_text — dictionary-encode text_buf into input_buf
//
//  In:   x27 = text_buf, [x28, #ST_TEXT_POS] = number of text bytes
//  Out:  x0  = size of the encoded stream in bytes (including the end marker)
//  The encoded stream overwrites input_buf, which is free at this point
//  because the source has already been consumed.
//
//  Each 32-bit word of text becomes one of:
//    0x01..0x7E                  index+1 of an exact match in full_dict
//    0x80.. , lo16               index of a half_dict entry equal to the
//                                word's upper 16 bits, then the lower 16 bits
//    0x7F, word                  escape followed by the raw word
//  A single 0x00 byte terminates the stream.
// ──────────────────────────────────────────────────────────────────────────
compress_text:
    // makes no calls, so the link register is never disturbed
    add     x2, x28, #INPUT_BUF_OFF        // x2  = write cursor (input_buf)
    mov     x12, x27                       // x12 = read cursor (text_buf)
    ldr     x1, [x28, #ST_TEXT_POS]
    add     x13, x12, x1                   // x13 = end of text
    adr     x10, full_dict                 // x10 = full-word table
    add     x11, x10, #FULL_DICT_SIZE      // x11 = half-word table, right after it

ct_loop:
    cmp     x12, x13
    b.hs    ct_done
    ldr     w3, [x12], #4                  // w3 = next instruction word

    // 1) exact match against the full-word table; w5 is the 1-based code
    mov     w5, #1
    mov     x4, x10
ct_full:
    ldr     w6, [x4], #4
    cmp     w3, w6
    b.eq    ct_emit_full
    add     w5, w5, #1
    cmp     w5, #(FULL_DICT_ENTRIES + 1)
    b.lo    ct_full

    // 2) match on the upper half only; codes start at 0x80
    mov     w5, #0x80
    mov     x4, x11
    lsr     w7, w3, #16
ct_half:
    ldrh    w6, [x4], #2
    cmp     w7, w6
    b.eq    ct_emit_half
    add     w5, w5, #1
    cmp     w5, #(0x80 + HALF_DICT_ENTRIES)
    b.lo    ct_half

    // 3) no match: escape byte followed by the whole word
    mov     w5, #0x7F
    strb    w5, [x2], #1
    str     w3, [x2], #4
    b       ct_loop

ct_emit_full:
    strb    w5, [x2], #1                   // code only
    b       ct_loop

ct_emit_half:
    strb    w5, [x2], #1                   // code ...
    strh    w3, [x2], #2                   // ... plus the lower 16 bits
    b       ct_loop

ct_done:
    strb    wzr, [x2], #1                  // terminator
    sub     x0, x2, x28
    sub     x0, x0, #INPUT_BUF_OFF         // bytes written = cursor - input_buf
    ret

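// ── full instruction dictionary (126 entries) ──────────────────
// generated by gen_dict.py — do not edit manually
// Entry k (0-based) is written to the compressed stream as the single byte
// k+1. Byte 0 terminates the stream and 0x7F is the raw-word escape, which is
// why the table holds at most 126 entries.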
full_dict:
    .word 0xd65f03c0
    .word 0x91000400
    .word 0xaa0003f8
    .word 0x540000a1
    .word 0x39400009
    .word 0xaa0003f4
    .word 0xaa0003f9
    .word 0x54000060
    .word 0x7100b13f
    .word 0x7101b95f
    .word 0xaa0103f8
    .word 0xaa1303e0
    .word 0xd4000001
    .word 0xf9400f8b
    .word 0x14000002
    .word 0x2a160000
    .word 0x39400a6a
    .word 0x54000041
    .word 0x7100255f
    .word 0x71008d3f
    .word 0x7101753f
    .word 0xaa0003f3
    .word 0xaa1403e0
    .word 0x17fffff5
    .word 0x38401409
    .word 0x5100c12a
    .word 0x7101853f
    .word 0xaa0003f6
    .word 0xaa0103f5
    .word 0xaa1e03f0
    .word 0xd2800000
    .word 0x1200200a
    .word 0x14000005
    .word 0x17ffffef
    .word 0x2a160129
    .word 0x2a170000
    .word 0x38001445
    .word 0x39400a69
    .word 0x5100c14b
    .word 0x52800019
    .word 0x54000061
    .word 0x54000080
    .word 0x540000c0
    .word 0x54000100
    .word 0x54000120
    .word 0x54000140
    .word 0x540001a0
    .word 0x540001e1
    .word 0x54ffff63
    .word 0x6b0b015f
    .word 0x7100257f
    .word 0x7100b93f
    .word 0x7101895f
    .word 0x7101915f
    .word 0x7101b15f
    .word 0x7101b93f
    .word 0x7101bd5f
    .word 0x7101c93f
    .word 0x7101c95f
    .word 0x7101cd3f
    .word 0x7101cd5f
    .word 0x7101d13f
    .word 0x7101d55f
    .word 0x7101e15f
    .word 0x7101e95f
    .word 0xa90157f4
    .word 0xa9025ff6
    .word 0xa9bc4ffe
    .word 0xaa0003e9
    .word 0xaa0003f5
    .word 0xaa0103f4
    .word 0xaa0103f7
    .word 0xaa1003fe
    .word 0xaa1403e1
    .word 0xd280001a
    .word 0xf1000e9f
    .word 0xf86b6b8a
    .word 0xf9001bfe
    .word 0xf940038a
    .word 0xf9401bfe
    .word 0xf9401f89
    .word 0x00000000
    .word 0x110004a5
    .word 0x12004800
    .word 0x14000003
    .word 0x14000004
    .word 0x14000006
    .word 0x14000008
    .word 0x14000009
    .word 0x1400017b
    .word 0x17ffffd7
    .word 0x17ffffdd
    .word 0x17ffffdf
    .word 0x17fffff9
    .word 0x2a0016c0
    .word 0x2a0016e0
    .word 0x2a004120
    .word 0x2a090000
    .word 0x2a091400
    .word 0x2a0a3000
    .word 0x2a0b2800
    .word 0x2a154000
    .word 0x321b014b
    .word 0x3940040a
    .word 0x5101856b
    .word 0x52a26000
    .word 0x52aa6000
    .word 0x54000081
    .word 0x540000a0
    .word 0x54000101
    .word 0x540001c0
    .word 0x540002a0
    .word 0x7100657f
    .word 0x71009d3f
    .word 0x7100b53f
    .word 0x7100c13f
    .word 0x7100e93f
    .word 0x7101713f
    .word 0x71017d5f
    .word 0x7101893f
    .word 0x7101953f
    .word 0x7101dd3f
    .word 0x92800c60
    .word 0x9400000c
    .word 0x97fffe94
    .word 0x9a9f17e1

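// ── top-half dictionary (128 entries, packed as 64 words) ─────
// Each 16-bit entry is the upper half of an instruction word; entry k is
// written to the stream as byte 0x80+k followed by the instruction's lower
// 16 bits. Entries are read with ldrh, so on a little-endian target the low
// half of each .word below is the earlier entry.
// This table must directly follow full_dict: compress_text locates it at
// full_dict + FULL_DICT_SIZE.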
half_dict:
    .word 0x540097ff
    .word 0x17ffd280
    .word 0x14009100
    .word 0x94007101
    .word 0x3940f940
    .word 0xb4005280
    .word 0x54ffd100
    .word 0x2a19f900
    .word 0xaa007100
    .word 0xb5ff1200
    .word 0x2a001000
    .word 0x2a182a16
    .word 0x91048b0c
    .word 0xaa029240
    .word 0x2a172a0a
    .word 0x38403400
    .word 0xf10052a2
    .word 0x52841100
    .word 0x8b0b8b09
    .word 0x91038b1d
    .word 0xaa09aa01
    .word 0xb7f8aa0c
    .word 0x2a01dac0
    .word 0x2a1a2a09
    .word 0x5000321b
    .word 0x52855100
    .word 0x700052a6
    .word 0x8b0a72a3
    .word 0xa9419a9f
    .word 0xaa0aa9bf
    .word 0xb500aa15
    .word 0xd503b840
    .word 0xf86bf860
    .word 0x32161a89
    .word 0x37083608
    .word 0x38603800
    .word 0x52a152a0
    .word 0x52aa52a5
    .word 0x53035302
    .word 0x784072ba
    .word 0x8a0a7940
    .word 0x8b108b00
    .word 0x8b178b15
    .word 0x92749101
    .word 0x9a809280
    .word 0xa8c19ad9
    .word 0xa905a902
    .word 0xaa13aa0d
    .word 0xcb00b4ff
    .word 0xcb19cb09
    .word 0xd2a0d000
    .word 0xd50bd379
    .word 0xeb0dd61f
    .word 0x2a15f2c0
    .word 0x3900381f
    .word 0x52a3528e
    .word 0x6b065308
    .word 0x8b0f8b01
    .word 0xa9009ac0
    .word 0xa906a901
    .word 0xaa0baa07
    .word 0xaa19aa17
    .word 0xb940b800
    .word 0xd344cb15
// ══════════════════════════════════════════════════════════════════════════
//  Decompressor stub — copied verbatim to output at CODE_START (0x78)
//
//  Runs at the ELF entry point. Decompresses .text to a page-aligned
//  address (DECOMP_DEST = ceil_page(p_filesz) + 0x78) so all ADRP+ADD
//  encodings are preserved. Copies rodata, flushes icache, jumps.
//  The stub runs in place and is never overwritten.
//
//  File layout assumed by the pointer arithmetic below:
//    stub | full_dict | half_dict | compressed stream | rodata bytes
//
//  Stream format (one code byte per instruction word):
//    0x00        end of stream
//    0x01..0x7E  full_dict[code - 1]
//    0x7F        raw: the next 4 stream bytes are the word
//    0x80..0xFF  (half_dict[code - 0x80] << 16) | next 2 stream bytes
//
//  Register roles: x6 = stub base, x7 = output start, x1 = output cursor,
//  x0 = stream cursor, x2/x3 = biased dictionary bases, x8 = rodata bytes left.
//
//  Instructions this assembler cannot encode are emitted as .word constants:
//    dc cvau, ic ivau, dsb ish, isb
// ══════════════════════════════════════════════════════════════════════════
_decomp_stub_start:
    adr     x6, .                        // get our address

    // compute decompression destination + preload rodata_size
    ldp     w7, w8, [x6, #STUB_DATA_DECOMP_DEST]
    add     x7, x6, x7                  // x7 = stub_base + offset

    // set up dict/stream pointers (right after stub in file)
    add     x2, x6, #(STUB_SIZE - 4)    // full_dict - 4 (1-based index → 0-based via ptr adjust)
    add     x3, x2, #(FULL_DICT_SIZE - 256 + 4) // half_dict adjusted for 0x80-based index
    add     x0, x2, #(FULL_DICT_SIZE + HALF_DICT_SIZE + 4) // stream
    mov     x1, x7                      // output dest

    // ── decompress ────────────────────────────────────────────────────────
3:  ldrb    w4, [x0], #1                // next code byte
    cbz     w4, _decomp_copy_rodata     // 0 = end of stream
    ldr     w5, [x2, x4, lsl #2]       // speculative full dict (harmless if half/raw)
    tbz     w4, #7, 5f                  // bit 7 clear → full dict or raw
    ldrh    w5, [x3, x4, lsl #1]       // half dict: upper 16 bits
    ldrh    w9, [x0], #2               // lower 16 bits come from the stream
    orr     w5, w9, w5, lsl #16
5:  cmp     w4, #0x7F
    b.ne    6f
    ldr     w5, [x0], #4               // raw: overwrite with stream word
6:  str     w5, [x1], #4
    b       3b

    // ── copy rodata ───────────────────────────────────────────────────────
    // rodata follows the stream terminator byte-for-byte; x8 = byte count
_decomp_copy_rodata:
    cbz     x8, _decomp_flush
7:  ldrb    w3, [x0], #1
    strb    w3, [x1], #1
    sub     x8, x8, #1
    cbnz    x8, 7b

    // ── icache flush (x7=start, x1=end) ───────────────────────────────────
    // The loop advances in 64-byte steps, so it must start on a 64-byte
    // boundary. x7 is not aligned (it ends in 0x78); starting from x7 itself
    // would leave the cache line holding the last bytes before x1 unvisited
    // whenever the output length modulo 64 exceeds 8. Rounding the start
    // down makes every step land on a distinct line and guarantees that the
    // line containing x1-1 is covered.
    // (A 64-byte line size is assumed, as in the original loop.)
_decomp_flush:
    and     x0, x7, #-64                // start of the line containing x7
7:  .word 0xd50b7b20                    // dc cvau, x0
    .word 0xd5033b9f                    // dsb ish
    .word 0xd50b7520                    // ic ivau, x0
    add     x0, x0, #64
    cmp     x0, x1
    b.lo    7b
    .word 0xd5033b9f                    // dsb ish
    .word 0xd5033fdf                    // isb
    // jump to decompressed entry
    br      x7
// data block (2 x uint32, patched by assembler at output time)
// Loaded as a pair by the ldp at the top of the stub, so the two words must
// stay adjacent and in this order.
_decomp_data_decomp_dest:
    .word 0
_decomp_data_rodata_size:
    .word 0
_decomp_stub_end:
// computed stub constants (auto-adjust when stub changes)
.equ STUB_SIZE, (_decomp_stub_end - _decomp_stub_start)
.equ STUB_DATA_DECOMP_DEST, (_decomp_data_decomp_dest - _decomp_stub_start)
.equ STUB_DATA_RODATA_SIZE, (_decomp_data_rodata_size - _decomp_stub_start)
No results found