Skip to content

Instantly share code, notes, and snippets.

@bulk88
Created December 16, 2025 10:26
Show Gist options
  • Select an option

  • Save bulk88/ad7d96dd3c80a9b7258411422411de78 to your computer and use it in GitHub Desktop.

Select an option

Save bulk88/ad7d96dd3c80a9b7258411422411de78 to your computer and use it in GitHub Desktop.
90nmXeonNetburstVsCore.patch
@@ -1,204 +1,215 @@
static const
-struct processor_costs nocona_cost = {
+struct processor_costs core_cost = {
{
/* Start of register allocator costs. integer->integer move cost is 2. */
/* cost for loading QImode using movzbl */
- 4,
+ 6,
/* cost of loading integer registers
{4, 4, 4},
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
/* cost of storing integer registers */
- {4, 4, 4},
+ {6, 6, 6},
/* cost of reg,reg fld/fst */
- 12,
+ 2,
/* cost of loading fp registers
- {14, 14, 14},
+ {6, 6, 8},
in SFmode, DFmode and XFmode */
/* cost of storing fp registers
- {14, 14, 14},
+ {6, 6, 10},
in SFmode, DFmode and XFmode */
/* cost of moving MMX register */
- 14,
+ 2,
/* cost of loading MMX registers
- {12, 12},
+ {6, 6},
in SImode and DImode */
/* cost of storing MMX registers
- {12, 12},
+ {6, 6},
in SImode and DImode */
/* cost of moving XMM,YMM,ZMM register */
- 6, 12, 24,
+ 2, 2, 4,
/* cost of loading SSE registers
- {12, 12, 12, 24, 48},
+ {6, 6, 6, 6, 12},
in 32,64,128,256 and 512-bit */
/* cost of storing SSE registers
- {12, 12, 12, 24, 48},
+ {6, 6, 6, 6, 12},
in 32,64,128,256 and 512-bit */
/* SSE->integer and integer->SSE moves */
- 20, 12,
+ 6, 6,
/* mask->integer and integer->mask moves */
- 20, 12,
+ 6, 6,
/* cost of loading mask register
{4, 4, 4},
in QImode, HImode, SImode. */
/* cost of storing mask register
- {4, 4, 4},
+ {6, 6, 6},
in QImode, HImode, SImode. */
/* cost of moving mask register. */
2,
/* End of register allocator costs. */
},
/* cost of an add instruction */
COSTS_N_INSNS (1),
+ /* On all chips taken into consideration lea is 2 cycles or more. With
+ this cost however our current implementation of synth_mult results in
+ use of unnecessary temporary registers causing regression on several
+ SPECfp benchmarks. */
/* cost of a lea instruction */
- COSTS_N_INSNS (1),
+ COSTS_N_INSNS (1) + 1,
/* variable shift costs */
COSTS_N_INSNS (1),
/* constant shift costs */
COSTS_N_INSNS (1),
/* cost of starting multiply for QI */
- {COSTS_N_INSNS (10),
+ {COSTS_N_INSNS (3),
/* HI */
- COSTS_N_INSNS (10),
+ COSTS_N_INSNS (4),
/* SI */
- COSTS_N_INSNS (10),
+ COSTS_N_INSNS (3),
+ /* Here we tune for Sandybridge or newer. */
/* DI */
- COSTS_N_INSNS (10),
+ COSTS_N_INSNS (3),
/* other */
- COSTS_N_INSNS (10)},
-/* cost of multiply per each bit set */
- 0,
+ COSTS_N_INSNS (3)},
+ 0, /* cost of multiply per each bit set */
+ /* Expanding div/mod currently doesn't consider parallelism. So the cost
+ model is not realistic. We compensate by increasing the latencies a bit. */
/* cost of a divide/mod for QI */
- {COSTS_N_INSNS (66),
+ {COSTS_N_INSNS (11),
/* HI */
- COSTS_N_INSNS (66),
+ COSTS_N_INSNS (11),
/* SI */
- COSTS_N_INSNS (66),
+ COSTS_N_INSNS (14),
/* DI */
- COSTS_N_INSNS (66),
+ COSTS_N_INSNS (81),
/* other */
- COSTS_N_INSNS (66)},
+ COSTS_N_INSNS (81)},
/* cost of movsx */
COSTS_N_INSNS (1),
/* cost of movzx */
COSTS_N_INSNS (1),
/* "large" insn */
- 16,
+ 8,
/* MOVE_RATIO */
17,
/* CLEAR_RATIO */
6,
/* cost of loading integer registers
{4, 4, 4},
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
/* cost of storing integer registers */
- {4, 4, 4},
+ {6, 6, 6},
/* cost of loading SSE register
- {12, 12, 12, 24, 48},
+ {6, 6, 6, 6, 12},
in 32bit, 64bit, 128bit, 256bit and 512bit */
/* cost of storing SSE register
- {12, 12, 12, 24, 48},
+ {6, 6, 6, 6, 12},
in 32bit, 64bit, 128bit, 256bit and 512bit */
/* cost of unaligned loads. */
- {24, 24, 24, 48, 96},
+ {6, 6, 6, 6, 12},
/* cost of unaligned stores. */
- {24, 24, 24, 48, 96},
+ {6, 6, 6, 6, 12},
/* cost of moving XMM,YMM,ZMM register */
- 6, 12, 24,
+ 2, 2, 4,
/* cost of moving SSE register to integer. */
- 20,
+ 2,
/* cost of moving integer register to SSE. */
- 20,
+ 2,
+ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
+ rec. throughput 6.
+ So 5 uops statically and one uop per load. */
/* Gather load static, per_elt. */
- 12, 12,
+ 10, 6,
/* Gather store static, per_elt. */
- 12, 12,
+ 10, 6,
/* size of l1 cache. */
- 8,
+ 64,
/* size of l2 cache. */
- 1024,
+ 512,
/* size of prefetch block */
64,
/* number of parallel prefetches */
- 8,
+ 6,
+ /* FIXME: perhaps a more appropriate value is 5. */
/* Branch cost */
- 1,
+ 3,
/* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (6),
+ COSTS_N_INSNS (3),
/* cost of FMUL instruction. */
- COSTS_N_INSNS (8),
+ COSTS_N_INSNS (5),
+ /* 10-24 */
/* cost of FDIV instruction. */
- COSTS_N_INSNS (40),
+ COSTS_N_INSNS (24),
/* cost of FABS instruction. */
- COSTS_N_INSNS (3),
+ COSTS_N_INSNS (1),
/* cost of FCHS instruction. */
- COSTS_N_INSNS (3),
+ COSTS_N_INSNS (1),
/* cost of FSQRT instruction. */
- COSTS_N_INSNS (44),
+ COSTS_N_INSNS (23),
/* cost of cheap SSE instruction. */
- COSTS_N_INSNS (2),
+ COSTS_N_INSNS (1),
/* cost of ADDSS/SD SUBSS/SD insns. */
- COSTS_N_INSNS (5),
+ COSTS_N_INSNS (3),
/* cost of MULSS instruction. */
- COSTS_N_INSNS (7),
+ COSTS_N_INSNS (4),
/* cost of MULSD instruction. */
- COSTS_N_INSNS (7),
+ COSTS_N_INSNS (5),
/* cost of FMA SS instruction. */
- COSTS_N_INSNS (7),
+ COSTS_N_INSNS (5),
/* cost of FMA SD instruction. */
- COSTS_N_INSNS (7),
+ COSTS_N_INSNS (5),
/* cost of DIVSS instruction. */
- COSTS_N_INSNS (32),
+ COSTS_N_INSNS (18),
/* cost of DIVSD instruction. */
- COSTS_N_INSNS (40),
-/* cost of SQRTSS instruction. */
COSTS_N_INSNS (32),
+/* cost of SQRTSS instruction. */
+ COSTS_N_INSNS (30),
/* cost of SQRTSD instruction. */
- COSTS_N_INSNS (41),
+ COSTS_N_INSNS (58),
/* cost of CVTSS2SD etc. */
- COSTS_N_INSNS (10),
+ COSTS_N_INSNS (2),
/* cost of 256bit VCVTPS2PD etc. */
- COSTS_N_INSNS (20),
+ COSTS_N_INSNS (2),
/* cost of 512bit VCVTPS2PD etc. */
- COSTS_N_INSNS (40),
+ COSTS_N_INSNS (2),
/* cost of CVTSI2SS instruction. */
- COSTS_N_INSNS (20),
+ COSTS_N_INSNS (6),
/* cost of CVT(T)SS2SI instruction. */
- COSTS_N_INSNS (17),
+ COSTS_N_INSNS (6),
/* cost of CVTPI2PS instruction. */
- COSTS_N_INSNS (12),
+ COSTS_N_INSNS (6),
/* cost of CVT(T)PS2PI instruction. */
- COSTS_N_INSNS (8),
+ COSTS_N_INSNS (7),
/* reassoc int, fp, vec_int, vec_fp. */
- 1, 1, 1, 1,
+ 1, 4, 2, 2,
/* latency times throughput of
- {1, 1, 1},
+ {8, 1, 3},
FMA/DOT_PROD_EXPR/SAD_EXPR,
it's used to determine unroll
factor in the vectorizer. */
/* Limit how much the autovectorizer
1,
may unroll a loop. */
- nocona_memcpy,
- nocona_memset,
+ core_memcpy,
+ core_memset,
/* cond_taken_branch_cost. */
COSTS_N_INSNS (3),
/* cond_not_taken_branch_cost. */
COSTS_N_INSNS (1),
/* Loop alignment. */
- NULL,
+ "16:11:8",
/* Jump alignment. */
- NULL,
+ "16:11:8",
/* Label alignment. */
- NULL,
+ "0:0:8",
/* Func alignment. */
- NULL,
+ "16",
/* Small unroll limit. */
4,
/* Small unroll factor. */
2,
/* Branch mispredict scale. */
COSTS_N_INSNS (2),
-};
\ No newline at end of file
+};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment