Skip to content

Instantly share code, notes, and snippets.

@uchan-nos
Last active November 7, 2025 04:01
Show Gist options
  • Select an option

  • Save uchan-nos/7eafbcc075ddf3ad1606ab316c7c8cf4 to your computer and use it in GitHub Desktop.

Select an option

Save uchan-nos/7eafbcc075ddf3ad1606ab316c7c8cf4 to your computer and use it in GitHub Desktop.
関数の入口・出口における処理はpush/popとsub/addでどのくらい性能差が出るか
# asm.s - the two routines compared by the benchmark driver.
# Syntax: GNU as, Intel operand order without register prefixes.
# NOTE(review): the routines take their argument in rdi and return in rax,
# which matches the System V AMD64 calling convention - confirm the target.
.text
.intel_syntax noprefix
#-----------------------------------------------------------------------
# uint64_t pushpop(uint64_t *p)
# Benchmark variant that moves the stack pointer with push/pop on
# entry and exit.
# In:    rdi = p
# Out:   rax = value that was stored at *p before the call
# Effect: *p is incremented by one
# Writes: rax, rcx
#-----------------------------------------------------------------------
.globl pushpop
# Align the entry point to 16 bytes, padding with 0x90 (nop).
.p2align 4, 0x90
.type pushpop,@function
pushpop:
# Entry: lower rsp by 8 via push; the value pushed is never read back.
push rax
# rax = *p (this is also the return value)
mov rax, qword ptr [rdi]
# rcx = *p + 1, computed with lea so flags are left untouched
lea rcx, [rax + 1]
# *p = rcx
mov qword ptr [rdi], rcx
# Exit: raise rsp by 8 via pop; the popped value lands in rcx and is unused.
pop rcx
ret
#-----------------------------------------------------------------------
# uint64_t subadd(uint64_t *p)
# Benchmark variant that moves the stack pointer with sub/add on
# entry and exit.
# In:    rdi = p
# Out:   rax = value that was stored at *p before the call
# Effect: *p is incremented by one
# Writes: rax, rcx, flags (sub/add)
#-----------------------------------------------------------------------
.globl subadd
# Align the entry point to 16 bytes, padding with 0x90 (nop).
.p2align 4, 0x90
.type subadd,@function
subadd:
# Entry: lower rsp by 8 with an explicit subtraction.
sub rsp, 8
# rax = *p (this is also the return value)
mov rax, qword ptr [rdi]
# rcx = *p + 1
lea rcx, [rax + 1]
# *p = rcx
mov qword ptr [rdi], rcx
# Exit: raise rsp by 8 with an explicit addition.
add rsp, 8
ret
# Mark the stack as non-executable (suppresses a warning at build time).
.section ".note.GNU-stack","",@progbits
objdump -d -Mintel a.out
a.out: file format elf64-x86-64
Disassembly of section .init:
0000000000001000 <_init>:
1000: f3 0f 1e fa endbr64
1004: 48 83 ec 08 sub rsp,0x8
1008: 48 8b 05 c1 2f 00 00 mov rax,QWORD PTR [rip+0x2fc1] # 3fd0 <__gmon_start__@Base>
100f: 48 85 c0 test rax,rax
1012: 74 02 je 1016 <_init+0x16>
1014: ff d0 call rax
1016: 48 83 c4 08 add rsp,0x8
101a: c3 ret
Disassembly of section .plt:
0000000000001020 <printf@plt-0x10>:
1020: ff 35 ca 2f 00 00 push QWORD PTR [rip+0x2fca] # 3ff0 <_GLOBAL_OFFSET_TABLE_+0x8>
1026: ff 25 cc 2f 00 00 jmp QWORD PTR [rip+0x2fcc] # 3ff8 <_GLOBAL_OFFSET_TABLE_+0x10>
102c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000001030 <printf@plt>:
1030: ff 25 ca 2f 00 00 jmp QWORD PTR [rip+0x2fca] # 4000 <printf@GLIBC_2.2.5>
1036: 68 00 00 00 00 push 0x0
103b: e9 e0 ff ff ff jmp 1020 <_init+0x20>
Disassembly of section .plt.got:
0000000000001040 <__cxa_finalize@plt>:
1040: ff 25 9a 2f 00 00 jmp QWORD PTR [rip+0x2f9a] # 3fe0 <__cxa_finalize@GLIBC_2.2.5>
1046: 66 90 xchg ax,ax
Disassembly of section .text:
0000000000001050 <_start>:
1050: f3 0f 1e fa endbr64
1054: 31 ed xor ebp,ebp
1056: 49 89 d1 mov r9,rdx
1059: 5e pop rsi
105a: 48 89 e2 mov rdx,rsp
105d: 48 83 e4 f0 and rsp,0xfffffffffffffff0
1061: 50 push rax
1062: 54 push rsp
1063: 45 31 c0 xor r8d,r8d
1066: 31 c9 xor ecx,ecx
1068: 48 8d 3d e1 00 00 00 lea rdi,[rip+0xe1] # 1150 <main>
106f: ff 15 4b 2f 00 00 call QWORD PTR [rip+0x2f4b] # 3fc0 <__libc_start_main@GLIBC_2.34>
1075: f4 hlt
1076: 66 2e 0f 1f 84 00 00 cs nop WORD PTR [rax+rax*1+0x0]
107d: 00 00 00
0000000000001080 <deregister_tm_clones>:
1080: 48 8d 3d 91 2f 00 00 lea rdi,[rip+0x2f91] # 4018 <__TMC_END__>
1087: 48 8d 05 8a 2f 00 00 lea rax,[rip+0x2f8a] # 4018 <__TMC_END__>
108e: 48 39 f8 cmp rax,rdi
1091: 74 15 je 10a8 <deregister_tm_clones+0x28>
1093: 48 8b 05 2e 2f 00 00 mov rax,QWORD PTR [rip+0x2f2e] # 3fc8 <_ITM_deregisterTMCloneTable@Base>
109a: 48 85 c0 test rax,rax
109d: 74 09 je 10a8 <deregister_tm_clones+0x28>
109f: ff e0 jmp rax
10a1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10a8: c3 ret
10a9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
00000000000010b0 <register_tm_clones>:
10b0: 48 8d 3d 61 2f 00 00 lea rdi,[rip+0x2f61] # 4018 <__TMC_END__>
10b7: 48 8d 35 5a 2f 00 00 lea rsi,[rip+0x2f5a] # 4018 <__TMC_END__>
10be: 48 29 fe sub rsi,rdi
10c1: 48 89 f0 mov rax,rsi
10c4: 48 c1 ee 3f shr rsi,0x3f
10c8: 48 c1 f8 03 sar rax,0x3
10cc: 48 01 c6 add rsi,rax
10cf: 48 d1 fe sar rsi,1
10d2: 74 14 je 10e8 <register_tm_clones+0x38>
10d4: 48 8b 05 fd 2e 00 00 mov rax,QWORD PTR [rip+0x2efd] # 3fd8 <_ITM_registerTMCloneTable@Base>
10db: 48 85 c0 test rax,rax
10de: 74 08 je 10e8 <register_tm_clones+0x38>
10e0: ff e0 jmp rax
10e2: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
10e8: c3 ret
10e9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
00000000000010f0 <__do_global_dtors_aux>:
10f0: f3 0f 1e fa endbr64
10f4: 80 3d 1d 2f 00 00 00 cmp BYTE PTR [rip+0x2f1d],0x0 # 4018 <__TMC_END__>
10fb: 75 2b jne 1128 <__do_global_dtors_aux+0x38>
10fd: 55 push rbp
10fe: 48 83 3d da 2e 00 00 cmp QWORD PTR [rip+0x2eda],0x0 # 3fe0 <__cxa_finalize@GLIBC_2.2.5>
1105: 00
1106: 48 89 e5 mov rbp,rsp
1109: 74 0c je 1117 <__do_global_dtors_aux+0x27>
110b: 48 8b 3d fe 2e 00 00 mov rdi,QWORD PTR [rip+0x2efe] # 4010 <__dso_handle>
1112: e8 29 ff ff ff call 1040 <__cxa_finalize@plt>
1117: e8 64 ff ff ff call 1080 <deregister_tm_clones>
111c: c6 05 f5 2e 00 00 01 mov BYTE PTR [rip+0x2ef5],0x1 # 4018 <__TMC_END__>
1123: 5d pop rbp
1124: c3 ret
1125: 0f 1f 00 nop DWORD PTR [rax]
1128: c3 ret
1129: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
0000000000001130 <frame_dummy>:
1130: f3 0f 1e fa endbr64
1134: e9 77 ff ff ff jmp 10b0 <register_tm_clones>
1139: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
0000000000001140 <inc>:
1140: 8b 07 mov eax,DWORD PTR [rdi]
1142: 8d 48 01 lea ecx,[rax+0x1]
1145: 89 0f mov DWORD PTR [rdi],ecx
1147: c3 ret
1148: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
114f: 00
0000000000001150 <main>:
1150: 41 57 push r15
1152: 41 56 push r14
1154: 53 push rbx
1155: 48 83 ec 10 sub rsp,0x10
1159: 48 c7 44 24 08 00 00 mov QWORD PTR [rsp+0x8],0x0
1160: 00 00
1162: 0f 31 rdtsc
1164: 48 89 d3 mov rbx,rdx
1167: 48 c1 e3 20 shl rbx,0x20
116b: 48 09 c3 or rbx,rax
116e: 41 bf 00 ca 9a 3b mov r15d,0x3b9aca00
1174: 4c 8d 74 24 08 lea r14,[rsp+0x8]
1179: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
1180: 4c 89 f7 mov rdi,r14
1183: e8 f8 00 00 00 call 1280 <pushpop>
1188: 49 ff cf dec r15
118b: 75 f3 jne 1180 <main+0x30>
118d: 0f 31 rdtsc
118f: 48 c1 e2 20 shl rdx,0x20
1193: 48 09 c2 or rdx,rax
1196: 48 29 da sub rdx,rbx
1199: 66 48 0f 6e ca movq xmm1,rdx
119e: 66 0f 62 0d 6a 0e 00 punpckldq xmm1,XMMWORD PTR [rip+0xe6a] # 2010 <_IO_stdin_used+0x10>
11a5: 00
11a6: 66 0f 5c 0d 72 0e 00 subpd xmm1,XMMWORD PTR [rip+0xe72] # 2020 <_IO_stdin_used+0x20>
11ad: 00
11ae: 66 0f 28 c1 movapd xmm0,xmm1
11b2: 66 0f 15 c1 unpckhpd xmm0,xmm1
11b6: f2 0f 58 c1 addsd xmm0,xmm1
11ba: f2 0f 5e 05 6e 0e 00 divsd xmm0,QWORD PTR [rip+0xe6e] # 2030 <_IO_stdin_used+0x30>
11c1: 00
11c2: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
11c7: 48 8d 3d 6a 0e 00 00 lea rdi,[rip+0xe6a] # 2038 <_IO_stdin_used+0x38>
11ce: 41 bf 00 ca 9a 3b mov r15d,0x3b9aca00
11d4: be 00 ca 9a 3b mov esi,0x3b9aca00
11d9: b0 01 mov al,0x1
11db: e8 50 fe ff ff call 1030 <printf@plt>
11e0: 48 c7 44 24 08 00 00 mov QWORD PTR [rsp+0x8],0x0
11e7: 00 00
11e9: 0f 31 rdtsc
11eb: 48 89 d3 mov rbx,rdx
11ee: 48 c1 e3 20 shl rbx,0x20
11f2: 48 09 c3 or rbx,rax
11f5: 4c 8d 74 24 08 lea r14,[rsp+0x8]
11fa: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
1200: 4c 89 f7 mov rdi,r14
1203: e8 88 00 00 00 call 1290 <subadd>
1208: 49 ff cf dec r15
120b: 75 f3 jne 1200 <main+0xb0>
120d: 0f 31 rdtsc
120f: 48 c1 e2 20 shl rdx,0x20
1213: 48 09 c2 or rdx,rax
1216: 48 29 da sub rdx,rbx
1219: 66 48 0f 6e ca movq xmm1,rdx
121e: 66 0f 62 0d ea 0d 00 punpckldq xmm1,XMMWORD PTR [rip+0xdea] # 2010 <_IO_stdin_used+0x10>
1225: 00
1226: 66 0f 5c 0d f2 0d 00 subpd xmm1,XMMWORD PTR [rip+0xdf2] # 2020 <_IO_stdin_used+0x20>
122d: 00
122e: 66 0f 28 c1 movapd xmm0,xmm1
1232: 66 0f 15 c1 unpckhpd xmm0,xmm1
1236: f2 0f 58 c1 addsd xmm0,xmm1
123a: f2 0f 5e 05 ee 0d 00 divsd xmm0,QWORD PTR [rip+0xdee] # 2030 <_IO_stdin_used+0x30>
1241: 00
1242: 48 8b 4c 24 08 mov rcx,QWORD PTR [rsp+0x8]
1247: 48 8d 3d 24 0e 00 00 lea rdi,[rip+0xe24] # 2072 <_IO_stdin_used+0x72>
124e: be 00 ca 9a 3b mov esi,0x3b9aca00
1253: b0 01 mov al,0x1
1255: e8 d6 fd ff ff call 1030 <printf@plt>
125a: 48 8d 3d 4a 0e 00 00 lea rdi,[rip+0xe4a] # 20ab <_IO_stdin_used+0xab>
1261: be 02 00 00 00 mov esi,0x2
1266: ba 03 00 00 00 mov edx,0x3
126b: 31 c0 xor eax,eax
126d: e8 be fd ff ff call 1030 <printf@plt>
1272: 31 c0 xor eax,eax
1274: 48 83 c4 10 add rsp,0x10
1278: 5b pop rbx
1279: 41 5e pop r14
127b: 41 5f pop r15
127d: c3 ret
127e: 66 90 xchg ax,ax
0000000000001280 <pushpop>:
1280: 50 push rax
1281: 48 8b 07 mov rax,QWORD PTR [rdi]
1284: 48 8d 48 01 lea rcx,[rax+0x1]
1288: 48 89 0f mov QWORD PTR [rdi],rcx
128b: 59 pop rcx
128c: c3 ret
128d: 0f 1f 00 nop DWORD PTR [rax]
0000000000001290 <subadd>:
1290: 48 83 ec 08 sub rsp,0x8
1294: 48 8b 07 mov rax,QWORD PTR [rdi]
1297: 48 8d 48 01 lea rcx,[rax+0x1]
129b: 48 89 0f mov QWORD PTR [rdi],rcx
129e: 48 83 c4 08 add rsp,0x8
12a2: c3 ret
Disassembly of section .fini:
00000000000012a4 <_fini>:
12a4: f3 0f 1e fa endbr64
12a8: 48 83 ec 08 sub rsp,0x8
12ac: 48 83 c4 08 add rsp,0x8
12b0: c3 ret
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#define LOOP 1000000000L
uint64_t pushpop(uint64_t*);
uint64_t subadd(uint64_t*);
/*
 * Time `loop` consecutive calls of `f` and report how many cycle-counter
 * ticks they took in total.
 *
 *   name  label for the routine under test; accepted but not used here
 *   f     routine to call; it receives a pointer to a local counter
 *   loop  number of calls to make
 *
 * Returns the difference between the cycle counter read after the last
 * call and the one read before the first call.
 */
uint64_t run_bench(const char *name, uint64_t (*f)(uint64_t*), size_t loop) {
  (void)name;

  const uint64_t tsc_begin = __builtin_readcyclecounter();

  /* Scratch word handed to f on every call; its final value is not inspected. */
  uint64_t counter = 0;
  size_t done = 0;
  while (done < loop) {
    f(&counter);
    ++done;
  }

  const uint64_t tsc_end = __builtin_readcyclecounter();
  return tsc_end - tsc_begin;
}
/*
 * Parse a strictly positive integer from `text` (base auto-detected by
 * strtol: decimal, 0x hex, or leading-0 octal).
 * Returns 1 and stores the value in *out on success; returns 0 and leaves
 * *out untouched when the text is empty, has trailing junk, or is <= 0.
 */
static int parse_count(const char *text, size_t *out) {
  char *end = NULL;
  long value = strtol(text, &end, 0);
  if (end == text || *end != '\0' || value <= 0) {
    return 0;
  }
  *out = (size_t)value;
  return 1;
}

/*
 * Benchmark driver.
 *
 * Usage: a.out [loop [num_bench]]
 *   loop       calls per measurement (default LOOP)
 *   num_bench  number of measurement rounds (default 3)
 *
 * Each round times pushpop and subadd with run_bench and prints one table
 * row with the elapsed cycle count and the per-call average; a final row
 * prints the averages over all rounds.
 *
 * Both counts must be positive: they are used as divisors for the averages,
 * so zero or negative input is rejected with EXIT_FAILURE instead of
 * producing inf/NaN columns or a wrapped-around loop count.
 */
int main(int argc, char *argv[]) {
  size_t loop = LOOP;
  size_t num_bench = 3;
  if (argc >= 2 && !parse_count(argv[1], &loop)) {
    fprintf(stderr, "invalid loop count: %s\n", argv[1]);
    return EXIT_FAILURE;
  }
  if (argc >= 3 && !parse_count(argv[2], &num_bench)) {
    fprintf(stderr, "invalid num_bench: %s\n", argv[2]);
    return EXIT_FAILURE;
  }
  printf("num_bench: %zu\n"
         "loop: %zu\n"
         "\n",
         num_bench, loop);
  printf("| run | pp elapsed | pp average | sa elapsed | sa average |\n"
         "|-----|--------------|------------|--------------|------------|\n");
  uint64_t sum_pp = 0, sum_sa = 0;
  for (size_t i = 0; i < num_bench; ++i) {
    uint64_t elapsed_pp = run_bench("pushpop", pushpop, loop);
    uint64_t elapsed_sa = run_bench("subadd ", subadd, loop);
    sum_pp += elapsed_pp;
    sum_sa += elapsed_sa;
    /* uint64_t is not unsigned long on every platform; print via unsigned long long. */
    printf("| %3zu | %12llu | %#10.6f | %12llu | %#10.6f |\n",
           i + 1,
           (unsigned long long)elapsed_pp, (double)elapsed_pp / loop,
           (unsigned long long)elapsed_sa, (double)elapsed_sa / loop);
  }
  printf("| avg | %#12.1f | %#10.6f | %#12.1f | %#10.6f |\n",
         (double)sum_pp / num_bench, (double)sum_pp / loop / num_bench,
         (double)sum_sa / num_bench, (double)sum_sa / loop / num_bench);
  return 0;
}
# Build the push/pop vs sub/add micro-benchmark from the C driver and the
# hand-written assembly file.
SRCS := main.c asm.s

# `all` and `disas` name actions, not files, so they must always run.
.PHONY: all disas

# Default target: re-invoke make with built-in implicit rules disabled (-r).
# $(MAKE) is used instead of a literal `make` so that command-line flags
# and the jobserver are passed on to the sub-make.
all:
	$(MAKE) -r a.out

# Rebuild whenever a source file or this Makefile changes.
a.out: $(SRCS) Makefile
	clang -O2 -o $@ $(SRCS)

# Print an Intel-syntax disassembly of the binary, building it first if needed.
disas: a.out
	objdump -d -Mintel a.out
$ ./a.out
pushpop: 1000000000 times => elapsed 5713678920, average 5.71368, cnt = 1000000000
subadd: 1000000000 times => elapsed 5709335488, average 5.70934, cnt = 1000000000
inc(&i) = 2, i = 3
$ ./a.out
pushpop: 1000000000 times => elapsed 5702241796, average 5.70224, cnt = 1000000000
subadd: 1000000000 times => elapsed 5710384354, average 5.71038, cnt = 1000000000
inc(&i) = 2, i = 3
$ ./a.out
pushpop: 1000000000 times => elapsed 5707916838, average 5.70792, cnt = 1000000000
subadd: 1000000000 times => elapsed 5708659500, average 5.70866, cnt = 1000000000
inc(&i) = 2, i = 3
| 実験回 | pushpop | subadd | diff (pp - sa) |
|--------|---------|---------|----------------|
| 1 | 5.71368 | 5.70934 | 0.00434 |
| 2 | 5.70224 | 5.71038 | -0.00814 |
| 3 | 5.70792 | 5.70866 | -0.00074 |

差は無いと言って良い?

@uchan-nos
Copy link
Author

Ubuntu 24.04 on WSL2
CPU: AMD Ryzen 9 5950X 16-Core Processor 3.4GHz

$ time ./a.out
num_bench: 3
loop: 1000000000

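| run | pp elapsed   | pp average | sa elapsed   | sa average |
|-----|--------------|------------|--------------|------------|
|   1 |   5714961367 |   5.714961 |   5695248421 |   5.695248 |
|   2 |   5697037688 |   5.697038 |   5693585820 |   5.693586 |
|   3 |   5694287530 |   5.694288 |   5695270470 |   5.695270 |
| avg | 5702095528.3 |   5.702096 | 5694701570.3 |   5.694702 |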

real 0m10.013s
user 0m10.057s
sys 0m0.001s

@uchan-nos
Copy link
Author

Ubuntu 24.04 on WSL2
CPU: Intel(R) Core(TM) Ultra 7 165U 2.688GHz

$ time ./a.out
num_bench: 3
loop: 1000000000

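| run | pp elapsed   | pp average | sa elapsed   | sa average |
|-----|--------------|------------|--------------|------------|
|   1 |   1605336124 |   1.605336 |   1691755508 |   1.691756 |
|   2 |   1647749740 |   1.647750 |   1680450364 |   1.680450 |
|   3 |   1690682586 |   1.690683 |   1666465438 |   1.666465 |
| avg | 1647922816.7 |   1.647923 | 1679557103.3 |   1.679557 |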

real 0m3.640s
user 0m3.714s
sys 0m0.001s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment