Skip to content

Instantly share code, notes, and snippets.

@NickBarnes
Created November 11, 2025 09:50
Show Gist options
  • Select an option

  • Save NickBarnes/b4e4bb1d3f3bb8f9d3d8509733880631 to your computer and use it in GitHub Desktop.

Select an option

Save NickBarnes/b4e4bb1d3f3bb8f9d3d8509733880631 to your computer and use it in GitHub Desktop.
This is a complete annotated disassembly of do_some_marking from a377580282ddd0f5ee5f5905fe02b7c896dc2df7, including
perf mem reports from an in-house test case. The memory performance here is excellent, and IMO there's very little in the
way of low-hanging fruit for hand optimisation. Incidentally, the compiler code generation quality is really high too.
(compiled with gcc (GCC) 14.2.1 20250110 (Red Hat 14.2.1-7)).
<+0>: cs cs cs cs push %r15 # prologue
<+6>: cs cs cs xor %ecx,%ecx
<+11>: mov %rdi,%r15
<+14>: push %r14
<+16>: mov $0x40,%r14d
<+22>: push %r13
<+24>: xor %r13d,%r13d
<+27>: push %r12
<+29>: xor %r12d,%r12d
<+32>: push %rbp
<+33>: mov %rsi,%rbp
<+36>: push %rbx
<+37>: sub $0x848,%rsp
<+44>: mov 0x6adc4d(%rip),%r11 # <caml_global_heap_state>
<+51>: mov 0x6adc4e(%rip),%r8 # <caml_global_heap_state+8>
.loop1
<+58>: mov %r12,%rax
<+61>: sub %r13,%rax
<+64>: cmp %rax,%r14
<+67>: jb <+384 .above_waterline>
<+73>: test %rbp,%rbp
<+76>: jle <+87> # budget <= 0
<+78>: mov 0x8(%r15),%rax # stk->count
<+82>: test %rax,%rax
<+85>: jne <+144 .mark_stack_pop>
<+87>: nopw 0x0(%rax,%rax,1)
<+96>: test %r14,%r14 # waterline?
<+99>: jne <+536 .enter_drain_mode>
<+105>: mov 0x2d04c8(%rip),%rax
<+112>: mov %fs:(%rax),%rax # Caml_state
<+116>: add %rcx,0x200(%rax) # stat_blocks_marked += blocks_marked
<+123>: add $0x848,%rsp
<+130>: mov %rbp,%rax # return budget
<+133>: pop %rbx
<+134>: pop %rbp
<+135>: pop %r12
<+137>: pop %r13
<+139>: pop %r14
<+141>: pop %r15
<+143>: ret
.mark_stack_pop
<+144>: sub $0x1,%rax # -- stk->count
<+148>: mov %rax,0x8(%r15) # save to stk (perf: 0.06% L1 hit)
<+152>: shl $0x4,%rax # sizeof(mark_entry) == 16
<+156>: add (%r15),%rax # &stk->stack[stk->count]
<+159>: mov (%rax),%rbx # me.start
<+162>: mov 0x8(%rax),%rsi # me.end
.mark_an_entry
rcx blocks_marked
r15 stk
r14 pb.waterline
r13 pb.dequeued
r12 pb.enqueued
rbp budget
r11 MARKED
r8 UNMARKED
rbx me.start
rsi me.end
<+166>: mov %rsi,%rax
<+169>: mov %rsi,%rdx # scan_end
<+172>: sub %rbx,%rax
<+175>: sar $0x3,%rax
<+179>: cmp %rbp,%rax
<+182>: jle <+199>
<+184>: cs cs xor %eax,%eax
<+188>: test %rbp,%rbp
<+191>: cmovns %rbp,%rax # scan_len = budget < 0 ? 0 : budget
<+195>: lea (%rbx,%rax,8),%rdx # scan_end
<+199>: mov 0x6ade42(%rip),%rax # 0x294e970 <caml_minor_heaps_start>
<+206>: mov 0x6ade33(%rip),%rdi # 0x294e968 <caml_minor_heaps_end>
<+213>: lea 0x100(%r13),%r9
<+220>: mov %rax,(%rsp) # spill caml_minor_heaps_start (perf: 0.48% L1 hit; 0.01% LFB/MAB hit)
<+224>: cmp %rdx,%rbx # me.start < scan_end ?
<+227>: jae <+302 .mark_loop_over>
<+229>: data16 cs nopw 0x0(%rax,%rax,1)
<+240>: mov (%rbx),%rax # child = *me.start (perf: 0.42% RAM; 0.01% L3; 0.07% L2; 0.02% L1; 1.92% LFB/MAB)
<+243>: sub $0x1,%rbp
<+247>: tzcnt %rax,%r10 # Is_block hack
<+252>: jbe <+293 .not_markable>
<+254>: xchg %ax,%ax
<+256>: cmp %rdi,%rax # caml_minor_heaps_end
<+259>: jae <+267 .is_markable>
<+261>: cmp %rax,(%rsp) # caml_minor_heaps_start
<+265>: jb <+293 .not_markable>
.is_markable
<+267>: cmp %r12,%r9
<+270>: je <+302 .mark_loop_over> # prefetch buffer full: pb.enqueued == pb.dequeued + 0x100
<+272>: movzbl %r12b,%r10d
<+276>: prefetcht0 -0x8(%rax) # prefetch_block (perf: 0.86% L1 hit)
<+280>: prefetcht0 0x18(%rax) # prefetch_block (perf: 1.13% L1 hit)
<+284>: add $0x1,%r12 # ++ pb.enqueued
<+288>: mov %rax,0x38(%rsp,%r10,8) # pb_push (perf: 0.93% L1 hit, 0.01% L1 miss)
.not_markable
<+293>: add $0x8,%rbx # me.start++
<+297>: cmp %rdx,%rbx # me.start < scan_end
<+300>: jb <+240>
.mark_loop_over
<+302>: cmp %rsi,%rbx # me.start < me.end?
<+305>: jae <+58 .loop1> # continue;
<+311>: mov 0x8(%r15),%rax # stk->count : inlined mark_stack_push_range
<+315>: nopl 0x0(%rax,%rax,1)
<+320>: cmp 0x10(%r15),%rax # stk->size
<+324>: je <+745 .call_realloc> # go out-of-line to call realloc_mark_stack
.realloc_return
<+330>: lea 0x1(%rax),%rdx # stk->count++
<+334>: shl $0x4,%rax
<+338>: add (%r15),%rax # &stk->stack[stk->count]
<+341>: prefetcht0 0x8(%rbx) # caml_prefetch(me.start + 1)
<+345>: mov %rdx,0x8(%r15) # save stk->count++
<+349>: mov %rbx,(%rax) # me->start = start
<+352>: mov %rsi,0x8(%rax) # me->end = end
<+356>: mov %r12,%rax # pb.enqueued
<+359>: sub %r13,%rax # pb_size(&pb)
<+362>: cmp $0x40,%rax # PREFETCH_BUFFER_MIN
<+366>: jbe <+58 .loop1> # continue;
<+372>: mov $0x40,%r14d # pb_fill_mode
<+378>: nopw 0x0(%rax,%rax,1) # fall-through: compiler has deduced above_waterline(!)
.above_waterline
rcx blocks_marked
rbp budget
r15 stk
r14 pb_waterline
r13 pb_dequeued
r12 pb_enqueued
r11 MARKED
r8 UNMARKED
<+384>: movzbl %r13b,%eax # pb_dequeued & 255
<+388>: add $0x1,%r13 # ++ pb_dequeued
<+392>: mov 0x38(%rsp,%rax,8),%rdi # rdi = block = pb_pop
<+397>: mov -0x8(%rdi),%rax # rax = hd = Hd_val(block) (perf: 0.16% L2; 0.00% L1; 0.01% LFB/MAB)
<+401>: cmp $0xf9,%al # Infix_tag
<+403>: je <+640 .infix_tag>
<+409>: mov %rax,%rdx
<+412>: and $0x300,%edx # status bits
<+418>: cmp %rdx,%r8 # UNMARKED?
<+421>: jne <+58 .loop1> # continue;
<+427>: add $0x1,%rcx # ++blocks_marked
<+431>: movzbl %al,%edx # edx = tag
<+434>: cmp $0xf5,%al # Cont_tag
<+436>: je <+672 .cont_tag>
<+442>: mov %eax,%esi # bottom 32 bits of hd
<+444>: and $0xfd,%esi # if tag = 0xf4 (Forcing_tag)
<+450>: cmp $0xf4,%esi # ... or 0xf6 (Lazy_tag)
<+456>: jne <+831 .not_lazy_forcing>
.lazy_forcing
<+462>: lea -0x8(%rdi),%rsi # rsi = Hp_val(block)
<+466>: mov %rax,%rdx # rdx = hd
<+469>: and $0xfc,%dh # mask colour bits
<+472>: or %r11,%rdx # With_status_hd(hd, MARKED)
<+475>: lock cmpxchg %rdx,(%rsi) # atomic_compare_exchange_strong (on failure, go to .raced)
<+480>: jne <+805 .raced>
<+486>: movzbl %al,%edx # tag
.after_marking
<+489>: mov %rax,%r9 # r9 = hd
<+492>: sub $0x1,%rbp # -- budget
<+496>: shr $0xa,%r9 # r9 = hd >> 10
<+500>: cmp $0xfa,%edx # Scannable?
<+506>: jbe <+552 .scannable>
<+508>: movabs $0x3fffffffffff,%rax
<+518>: and %r9,%rax # budget -= Wosize_hd(hd)
<+521>: sub %rax,%rbp
<+524>: jmp <+58 .loop1> # continue;
<+529>: nopl 0x0(%rax)
.enter_drain_mode
<+536>: xor %r14d,%r14d # pb.waterline = 0
<+539>: nopl 0x0(%rax,%rax,1)
<+544>: jmp <+58 .loop1> # continue;
<+549>: nopl (%rax)
.scannable
<+552>: shr $0x38,%rax # rax = Reserved_hd(hd)
<+556>: mov %rdi,%rbx # me.start = Op_val(block)
<+559>: je <+736 .no_reserved>
<+565>: movabs $0x3fffffffffff,%r10 # r10 = wosize mask
<+575>: sub $0x1,%rax # reserved-1 (scannable_wosize)
<+579>: lea (%rdi,%rax,8),%rsi # me.end = me.start + scannable_wosize
<+583>: and %r10,%r9 # r9 = Wosize_hd(hd)
<+586>: add %rbp,%rax # ... + budget
<+589>: sub %r9,%rax # - Wosize_hd(hd)
<+592>: mov %rax,%rbp # budget -= Wosize_hd(hd) - scannable_wosize
.has_mark_entry
<+595>: cmp $0xf7,%edx # Closure_tag
<+601>: jne <+166 .mark_an_entry>
<+607>: mov 0x8(%rdi),%rax # closinfo
<+611>: shl $0x9,%rax # shift arity away
<+615>: shr $0xa,%rax # untag
<+619>: sub %rax,%rbp # budget -= env_offset
<+622>: lea (%rdi,%rax,8),%rbx # me.start += env_offset
<+626>: jmp <+166 .mark_an_entry>
<+631>: nopw 0x0(%rax,%rax,1)
.infix_tag
<+640>: shl $0x8,%rax # lose 8 top bits ...
<+644>: shr $0x12,%rax # ... and 10 bottom bits
<+648>: shl $0x3,%rax # * 8
<+652>: sub %rax,%rdi # block -= Infix_offset_hd(hd)
<+655>: mov -0x8(%rdi),%rax # hd = Hd_val(block)
<+659>: jmp <+409>
<+664>: nopl 0x0(%rax,%rax,1)
.cont_tag
<+672>: mov %r11,0x18(%rsp) # save r11 (MARKED)
<+677>: sub $0x1,%rbp # -- budget
<+681>: mov %rcx,0x10(%rsp) # save rcx (blocks_marked)
<+686>: mov %r8,0x8(%rsp) # save r8 (UNMARKED)
<+691>: mov %rax,(%rsp) # save rax (hd)
<+695>: call <caml_darken_cont> # happily rdi = block already
<+700>: mov (%rsp),%rax # restore rax (hd)
<+704>: mov 0x8(%rsp),%r8 # restore r8 (UNMARKED)
<+709>: mov 0x10(%rsp),%rcx # restore rcx (blocks_marked)
<+714>: mov 0x18(%rsp),%r11 # restore r11 (MARKED)
<+719>: shl $0x8,%rax # Wosize_hd(hd): lose 8 top bits ...
<+723>: shr $0x12,%rax # ... and 10 bottom bits
<+727>: sub %rax,%rbp # budget -= Whsize(hd) (the header word was already charged at <+677>)
<+730>: jmp <+58 .loop1> # continue;
<+735>: nop
.no_reserved
<+736>: lea (%rdi,%r9,8),%rsi # me.end = me.start+Wosize_hd(hd)
<+740>: jmp <+595 .has_mark_entry>
.call_realloc
<+745>: cs cs cs cs mov %r15,%rdi # rdi = stk
<+752>: cs mov %r11,0x18(%rsp) # spill r11 (MARKED)
<+758>: mov %rcx,0x10(%rsp) # ... rcx (blocks_marked)
<+763>: mov %r8,0x8(%rsp) # ... r8 (UNMARKED)
<+768>: mov %rsi,(%rsp) # ... rsi (me.end)
<+772>: call 0x229f840 <realloc_mark_stack>
<+777>: mov 0x8(%r15),%rax # restore stk->count
<+781>: mov 0x18(%rsp),%r11 # restore r11
<+786>: mov 0x10(%rsp),%rcx # ... rcx
<+791>: mov 0x8(%rsp),%r8 # ... r8
<+796>: mov (%rsp),%rsi # ... rsi
<+800>: jmp <+330 .realloc_return>
.raced
<+805>: mov (%rsi),%rax # hd = Hd_val(block)
<+808>: mov %eax,%edx # Lazy/Forcing test again
<+810>: and $0xfd,%edx
<+816>: cmp $0xf4,%edx
<+822>: je <+466 .lazy_forcing+4> # loop, skipping the lea (rsi is already Hp_val(block)) / fall through
<+828>: movzbl %al,%edx # edx gets tag again
.not_lazy_forcing
<+831>: mov %rax,%rsi # hd
<+834>: and $0xfffffffffffffcff,%rsi # mask out colour bits
<+841>: or %r11,%rsi # marked
<+844>: mov %rsi,-0x8(%rdi) # mark block (perf: 1.36% L1)
<+848>: jmp <+489 .after_marking>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment