Created
November 11, 2025 09:50
-
-
Save NickBarnes/b4e4bb1d3f3bb8f9d3d8509733880631 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| This is a complete annotated disassembly of do_some_marking from a377580282ddd0f5ee5f5905fe02b7c896dc2df7, including | |
| perf mem reports from an in-house test case. The memory performance here is excellent, and IMO there's very little in the | |
| way of low-hanging fruit for hand optimisation. Incidentally, the compiler code generation quality is really high too. | |
| (compiled with gcc (GCC) 14.2.1 20250110 (Red Hat 14.2.1-7)). | |
| <+0>: cs cs cs cs push %r15 # prologue | |
| <+6>: cs cs cs xor %ecx,%ecx | |
| <+11>: mov %rdi,%r15 | |
| <+14>: push %r14 | |
| <+16>: mov $0x40,%r14d | |
| <+22>: push %r13 | |
| <+24>: xor %r13d,%r13d | |
| <+27>: push %r12 | |
| <+29>: xor %r12d,%r12d | |
| <+32>: push %rbp | |
| <+33>: mov %rsi,%rbp | |
| <+36>: push %rbx | |
| <+37>: sub $0x848,%rsp | |
| <+44>: mov 0x6adc4d(%rip),%r11 # <caml_global_heap_state> | |
| <+51>: mov 0x6adc4e(%rip),%r8 # <caml_global_heap_state+8> | |
| .loop1 | |
| <+58>: mov %r12,%rax | |
| <+61>: sub %r13,%rax | |
| <+64>: cmp %rax,%r14 | |
| <+67>: jb <+384 .above_waterline> | |
| <+73>: test %rbp,%rbp | |
| <+76>: jle <+87> # budget <= 0 | |
| <+78>: mov 0x8(%r15),%rax # stk->count | |
| <+82>: test %rax,%rax | |
| <+85>: jne <+144 .mark_stack_pop> | |
| <+87>: nopw 0x0(%rax,%rax,1) | |
| <+96>: test %r14,%r14 # waterline? | |
| <+99>: jne <+536 .enter_drain_mode> | |
| <+105>: mov 0x2d04c8(%rip),%rax | |
| <+112>: mov %fs:(%rax),%rax # Caml_state | |
| <+116>: add %rcx,0x200(%rax) # stat_blocks_marked += blocks_marked | |
| <+123>: add $0x848,%rsp | |
| <+130>: mov %rbp,%rax # return budget | |
| <+133>: pop %rbx | |
| <+134>: pop %rbp | |
| <+135>: pop %r12 | |
| <+137>: pop %r13 | |
| <+139>: pop %r14 | |
| <+141>: pop %r15 | |
| <+143>: ret | |
| .mark_stack_pop | |
| <+144>: sub $0x1,%rax # -- stk->count | |
| <+148>: mov %rax,0x8(%r15) # save to stk (perf: 0.06% L1 hit) | |
| <+152>: shl $0x4,%rax # sizeof(mark_entry) == 16 | |
| <+156>: add (%r15),%rax # stk->stack[stk->count] | |
| <+159>: mov (%rax),%rbx # me.start | |
| <+162>: mov 0x8(%rax),%rsi # me.end | |
| .mark_an_entry: | |
| rcx blocks_marked | |
| r15 stk | |
| r14 pb.waterline | |
| r13 pb.dequeued | |
| r12 pb.enqueued | |
| rbp budget | |
| r11 MARKED | |
| r8 UNMARKED | |
| rbx me.start | |
| rsi me.end | |
| <+166>: mov %rsi,%rax | |
| <+169>: mov %rsi,%rdx # scan_end | |
| <+172>: sub %rbx,%rax | |
| <+175>: sar $0x3,%rax | |
| <+179>: cmp %rbp,%rax | |
| <+182>: jle <+199> | |
| <+184>: cs cs xor %eax,%eax | |
| <+188>: test %rbp,%rbp | |
| <+191>: cmovns %rbp,%rax # scan_len = budget < 0 ? 0 : budget | |
| <+195>: lea (%rbx,%rax,8),%rdx # scan_end | |
| <+199>: mov 0x6ade42(%rip),%rax # 0x294e970 <caml_minor_heaps_start> | |
| <+206>: mov 0x6ade33(%rip),%rdi # 0x294e968 <caml_minor_heaps_end> | |
| <+213>: lea 0x100(%r13),%r9 | |
| <+220>: mov %rax,(%rsp) # spill caml_minor_heaps_start (perf: 0.48% L1 hit; 0.01% LFB/MAB hit) | |
| <+224>: cmp %rdx,%rbx # me.start < scan_end ? | |
| <+227>: jae <+302 .mark_loop_over> | |
| <+229>: data16 cs nopw 0x0(%rax,%rax,1) | |
| <+240>: mov (%rbx),%rax # child = *me.start (perf: 0.42% RAM; 0.01% L3; 0.07% L2; 0.02% L1; 1.92% LFB/MAB) | |
| <+243>: sub $0x1,%rbp | |
| <+247>: tzcnt %rax,%r10 # Is_block hack | |
| <+252>: jbe <+293 .not_markable> | |
| <+254>: xchg %ax,%ax | |
| <+256>: cmp %rdi,%rax # caml_minor_heaps_end | |
| <+259>: jae <+267 .is_markable> | |
| <+261>: cmp %rax,(%rsp) # caml_minor_heaps_start | |
| <+265>: jb <+293 .not_markable> | |
| .is_markable | |
| <+267>: cmp %r12,%r9 | |
| <+270>: je <+302 .mark_loop_over> # pb full (enqueued == dequeued + 256): break | |
| <+272>: movzbl %r12b,%r10d | |
| <+276>: prefetcht0 -0x8(%rax) # prefetch_block (perf: 0.86% L1 hit) | |
| <+280>: prefetcht0 0x18(%rax) # prefetch_block (perf: 1.13% L1 hit) | |
| <+284>: add $0x1,%r12 # ++ pb.enqueued | |
| <+288>: mov %rax,0x38(%rsp,%r10,8) # pb_push (perf: 0.93% L1 hit, 0.01% L1 miss) | |
| .not_markable | |
| <+293>: add $0x8,%rbx # me.start++ | |
| <+297>: cmp %rdx,%rbx # me.start < scan_end | |
| <+300>: jb <+240> | |
| .mark_loop_over | |
| <+302>: cmp %rsi,%rbx # me.start < me.end? | |
| <+305>: jae <+58 .loop1> # continue; | |
| <+311>: mov 0x8(%r15),%rax # stk->count : inlined mark_stack_push_range | |
| <+315>: nopl 0x0(%rax,%rax,1) | |
| <+320>: cmp 0x10(%r15),%rax # stk->size | |
| <+324>: je <+745 .call_realloc> # go out-of-line to call realloc_mark_stack | |
| .realloc_return | |
| <+330>: lea 0x1(%rax),%rdx # stk->count++ | |
| <+334>: shl $0x4,%rax | |
| <+338>: add (%r15),%rax # &stk->stack[stk->count] | |
| <+341>: prefetcht0 0x8(%rbx) # caml_prefetch(me.start + 1) | |
| <+345>: mov %rdx,0x8(%r15) # save stk->count++ | |
| <+349>: mov %rbx,(%rax) # me->start = start | |
| <+352>: mov %rsi,0x8(%rax) # me->end = end | |
| <+356>: mov %r12,%rax # pb.enqueued | |
| <+359>: sub %r13,%rax # pb_size(&pb) | |
| <+362>: cmp $0x40,%rax # PREFETCH_BUFFER_MIN | |
| <+366>: jbe <+58 .loop1> # continue; | |
| <+372>: mov $0x40,%r14d # pb_fill_mode | |
| <+378>: nopw 0x0(%rax,%rax,1) # fall-through: compiler has deduced above_waterline(!) | |
| .above_waterline | |
| rcx blocks_marked | |
| rbp budget | |
| r15 stk | |
| r14 pb_waterline | |
| r13 pb_dequeued | |
| r12 pb_enqueued | |
| r11 MARKED | |
| r8 UNMARKED | |
| <+384>: movzbl %r13b,%eax # pb_dequeued & 255 | |
| <+388>: add $0x1,%r13 # ++ pb_dequeued | |
| <+392>: mov 0x38(%rsp,%rax,8),%rdi # rdi = block = pb_pop | |
| <+397>: mov -0x8(%rdi),%rax # rax = hd = Hd_val(block) (perf: 0.16% L2; 0.00% L1; 0.01% LFB/MAB) | |
| <+401>: cmp $0xf9,%al # Infix_tag | |
| <+403>: je <+640 .infix_tag> | |
| <+409>: mov %rax,%rdx | |
| <+412>: and $0x300,%edx # status bits | |
| <+418>: cmp %rdx,%r8 # UNMARKED? | |
| <+421>: jne <+58 .loop1> # continue; | |
| <+427>: add $0x1,%rcx # ++blocks_marked | |
| <+431>: movzbl %al,%edx # edx = tag | |
| <+434>: cmp $0xf5,%al # Cont_tag | |
| <+436>: je <+672 .cont_tag> | |
| <+442>: mov %eax,%esi # bottom 32 bits of hd | |
| <+444>: and $0xfd,%esi # if tag = 0xf4 (Forcing_tag) | |
| <+450>: cmp $0xf4,%esi # ... or 0xf6 (Lazy_tag) | |
| <+456>: jne <+831 .not_lazy_forcing> | |
| .lazy_forcing | |
| <+462>: lea -0x8(%rdi),%rsi # rsi = Hp_val(block) | |
| <+466>: mov %rax,%rdx # rdx = hd | |
| <+469>: and $0xfc,%dh # mask colour bits | |
| <+472>: or %r11,%rdx # With_status_hd(hd, MARKED) | |
| <+475>: lock cmpxchg %rdx,(%rsi) # atomic compare-exchange of the header (rax = expected hd), not a plain store | |
| <+480>: jne <+805 .raced> | |
| <+486>: movzbl %al,%edx # tag | |
| .after_marking | |
| <+489>: mov %rax,%r9 # r9 = hd | |
| <+492>: sub $0x1,%rbp # -- budget | |
| <+496>: shr $0xa,%r9 # r9 = hd >> 10 | |
| <+500>: cmp $0xfa,%edx # Scannable? | |
| <+506>: jbe <+552 .scannable> | |
| <+508>: movabs $0x3fffffffffff,%rax | |
| <+518>: and %r9,%rax # budget -= Wosize_hd(hd) | |
| <+521>: sub %rax,%rbp | |
| <+524>: jmp <+58 .loop1> # continue; | |
| <+529>: nopl 0x0(%rax) | |
| .enter_drain_mode | |
| <+536>: xor %r14d,%r14d # pb.waterline = 0 | |
| <+539>: nopl 0x0(%rax,%rax,1) | |
| <+544>: jmp <+58 .loop1> # continue; | |
| <+549>: nopl (%rax) | |
| .scannable | |
| <+552>: shr $0x38,%rax # rax = Reserved_hd(hd) | |
| <+556>: mov %rdi,%rbx # me.start = Op_val(block) | |
| <+559>: je <+736 .no_reserved> | |
| <+565>: movabs $0x3fffffffffff,%r10 # r10 = wosize mask | |
| <+575>: sub $0x1,%rax # reserved-1 | |
| <+579>: lea (%rdi,%rax,8),%rsi # me.end = Op_val(block) + (reserved-1) | |
| <+583>: and %r10,%r9 # r9 = Wosize_hd(hd) | |
| <+586>: add %rbp,%rax # ... + budget | |
| <+589>: sub %r9,%rax # - Wosize_hd(hd) | |
| <+592>: mov %rax,%rbp # budget -= Wosize_hd(hd) - scannable_wosize | |
| .has_mark_entry | |
| <+595>: cmp $0xf7,%edx # Closure_tag | |
| <+601>: jne <+166 .mark_an_entry> | |
| <+607>: mov 0x8(%rdi),%rax # closinfo | |
| <+611>: shl $0x9,%rax # shift arity away | |
| <+615>: shr $0xa,%rax # untag | |
| <+619>: sub %rax,%rbp # budget -= env_offset | |
| <+622>: lea (%rdi,%rax,8),%rbx # me.start += env_offset | |
| <+626>: jmp <+166 .mark_an_entry> | |
| <+631>: nopw 0x0(%rax,%rax,1) | |
| .infix_tag | |
| <+640>: shl $0x8,%rax # lose 8 top bits ... | |
| <+644>: shr $0x12,%rax # ... and 10 bottom bits | |
| <+648>: shl $0x3,%rax # * 8 | |
| <+652>: sub %rax,%rdi # block -= Infix_offset_hd | |
| <+655>: mov -0x8(%rdi),%rax # hd = Hd_val(block) | |
| <+659>: jmp <+409> | |
| <+664>: nopl 0x0(%rax,%rax,1) | |
| .cont_tag | |
| <+672>: mov %r11,0x18(%rsp) # save r11 (MARKED) | |
| <+677>: sub $0x1,%rbp # -- budget | |
| <+681>: mov %rcx,0x10(%rsp) # save rcx (blocks_marked) | |
| <+686>: mov %r8,0x8(%rsp) # save r8 (UNMARKED) | |
| <+691>: mov %rax,(%rsp) # save rax (hd) | |
| <+695>: call <caml_darken_cont> # happily rdi = block already | |
| <+700>: mov (%rsp),%rax # restore rax (hd) | |
| <+704>: mov 0x8(%rsp),%r8 # restore r8 (UNMARKED) | |
| <+709>: mov 0x10(%rsp),%rcx # restore rcx (blocks_marked) | |
| <+714>: mov 0x18(%rsp),%r11 # restore r11 (MARKED) | |
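| <+719>: shl $0x8,%rax # Wosize_hd(hd): lose 8 top bits ... | |
| <+723>: shr $0x12,%rax # ... and 10 bottom bits | |
| <+727>: sub %rax,%rbp # budget -= Wosize_hd(hd); with the -- budget at <+677> that is Whsize in total | |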
| <+730>: jmp <+58 .loop1> # continue; | |
| <+735>: nop | |
| .no_reserved | |
| <+736>: lea (%rdi,%r9,8),%rsi # me.end = me.start+Wosize_hd(hd) | |
| <+740>: jmp <+595 .has_mark_entry> | |
| .call_realloc | |
| <+745>: cs cs cs cs mov %r15,%rdi # rdi = stk | |
| <+752>: cs mov %r11,0x18(%rsp) # spill r11 (MARKED) | |
| <+758>: mov %rcx,0x10(%rsp) # ... rcx (blocks_marked) | |
| <+763>: mov %r8,0x8(%rsp) # ... r8 (UNMARKED) | |
| <+768>: mov %rsi,(%rsp) # ... rsi (me.end) | |
| <+772>: call 0x229f840 <realloc_mark_stack> | |
| <+777>: mov 0x8(%r15),%rax # restore stk->count | |
| <+781>: mov 0x18(%rsp),%r11 # restore r11 | |
| <+786>: mov 0x10(%rsp),%rcx # ... rcx | |
| <+791>: mov 0x8(%rsp),%r8 # ... r8 | |
| <+796>: mov (%rsp),%rsi # ... rsi | |
| <+800>: jmp <+330 .realloc_return> | |
| .raced | |
| <+805>: mov (%rsi),%rax # hd = Hd_val(block) | |
| <+808>: mov %eax,%edx # Lazy/Forcing test again | |
| <+810>: and $0xfd,%edx | |
| <+816>: cmp $0xf4,%edx | |
| <+822>: je <+466 .lazy_forcing> # still lazy/forcing: retry the CAS (target is just past the lea; rsi is still Hp_val(block)), else fall through | |
| <+828>: movzbl %al,%edx # edx gets tag again | |
| .not_lazy_forcing: | |
| <+831>: mov %rax,%rsi # hd | |
| <+834>: and $0xfffffffffffffcff,%rsi # mask out colour bits | |
| <+841>: or %r11,%rsi # marked | |
| <+844>: mov %rsi,-0x8(%rdi) # mark block (perf: 1.36% L1) | |
| <+848>: jmp <+489 .after_marking> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment