Created
December 16, 2025 10:26
-
-
Save bulk88/ad7d96dd3c80a9b7258411422411de78 to your computer and use it in GitHub Desktop.
90nmXeonNetburstVsCore.patch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @@ -1,204 +1,215 @@ | |
| static const | |
| -struct processor_costs nocona_cost = { | |
| +struct processor_costs core_cost = { | |
| { | |
| /* Start of register allocator costs. integer->integer move cost is 2. */ | |
| /* cost for loading QImode using movzbl */ | |
| - 4, | |
| + 6, | |
| /* cost of loading integer registers | |
| {4, 4, 4}, | |
| in QImode, HImode and SImode. | |
| Relative to reg-reg move (2). */ | |
| /* cost of storing integer registers */ | |
| - {4, 4, 4}, | |
| + {6, 6, 6}, | |
| /* cost of reg,reg fld/fst */ | |
| - 12, | |
| + 2, | |
| /* cost of loading fp registers | |
| - {14, 14, 14}, | |
| + {6, 6, 8}, | |
| in SFmode, DFmode and XFmode */ | |
| /* cost of storing fp registers | |
| - {14, 14, 14}, | |
| + {6, 6, 10}, | |
| in SFmode, DFmode and XFmode */ | |
| /* cost of moving MMX register */ | |
| - 14, | |
| + 2, | |
| /* cost of loading MMX registers | |
| - {12, 12}, | |
| + {6, 6}, | |
| in SImode and DImode */ | |
| /* cost of storing MMX registers | |
| - {12, 12}, | |
| + {6, 6}, | |
| in SImode and DImode */ | |
| /* cost of moving XMM,YMM,ZMM register */ | |
| - 6, 12, 24, | |
| + 2, 2, 4, | |
| /* cost of loading SSE registers | |
| - {12, 12, 12, 24, 48}, | |
| + {6, 6, 6, 6, 12}, | |
| in 32,64,128,256 and 512-bit */ | |
| /* cost of storing SSE registers | |
| - {12, 12, 12, 24, 48}, | |
| + {6, 6, 6, 6, 12}, | |
| in 32,64,128,256 and 512-bit */ | |
| /* SSE->integer and integer->SSE moves */ | |
| - 20, 12, | |
| + 6, 6, | |
| /* mask->integer and integer->mask moves */ | |
| - 20, 12, | |
| + 6, 6, | |
| /* cost of loading mask register | |
| {4, 4, 4}, | |
| in QImode, HImode, SImode. */ | |
| /* cost of storing mask register | |
| - {4, 4, 4}, | |
| + {6, 6, 6}, | |
| in QImode, HImode, SImode. */ | |
| /* cost of moving mask register. */ | |
| 2, | |
| /* End of register allocator costs. */ | |
| }, | |
| /* cost of an add instruction */ | |
| COSTS_N_INSNS (1), | |
| + /* On all chips taken into consideration lea is 2 cycles or more. With | |
| + this cost however our current implementation of synth_mult results in | |
| + use of unnecessary temporary registers causing regression on several | |
| + SPECfp benchmarks. */ | |
| /* cost of a lea instruction */ | |
| - COSTS_N_INSNS (1), | |
| + COSTS_N_INSNS (1) + 1, | |
| /* variable shift costs */ | |
| COSTS_N_INSNS (1), | |
| /* constant shift costs */ | |
| COSTS_N_INSNS (1), | |
| /* cost of starting multiply for QI */ | |
| - {COSTS_N_INSNS (10), | |
| + {COSTS_N_INSNS (3), | |
| /* HI */ | |
| - COSTS_N_INSNS (10), | |
| + COSTS_N_INSNS (4), | |
| /* SI */ | |
| - COSTS_N_INSNS (10), | |
| + COSTS_N_INSNS (3), | |
| + /* Here we tune for Sandybridge or newer. */ | |
| /* DI */ | |
| - COSTS_N_INSNS (10), | |
| + COSTS_N_INSNS (3), | |
| /* other */ | |
| - COSTS_N_INSNS (10)}, | |
| -/* cost of multiply per each bit set */ | |
| - 0, | |
| + COSTS_N_INSNS (3)}, | |
| + 0, /* cost of multiply per each bit set */ | |
| + /* Expanding div/mod currently doesn't consider parallelism. So the cost | |
| + model is not realistic. We compensate by increasing the latencies a bit. */ | |
| /* cost of a divide/mod for QI */ | |
| - {COSTS_N_INSNS (66), | |
| + {COSTS_N_INSNS (11), | |
| /* HI */ | |
| - COSTS_N_INSNS (66), | |
| + COSTS_N_INSNS (11), | |
| /* SI */ | |
| - COSTS_N_INSNS (66), | |
| + COSTS_N_INSNS (14), | |
| /* DI */ | |
| - COSTS_N_INSNS (66), | |
| + COSTS_N_INSNS (81), | |
| /* other */ | |
| - COSTS_N_INSNS (66)}, | |
| + COSTS_N_INSNS (81)}, | |
| /* cost of movsx */ | |
| COSTS_N_INSNS (1), | |
| /* cost of movzx */ | |
| COSTS_N_INSNS (1), | |
| /* "large" insn */ | |
| - 16, | |
| + 8, | |
| /* MOVE_RATIO */ | |
| 17, | |
| /* CLEAR_RATIO */ | |
| 6, | |
| /* cost of loading integer registers | |
| {4, 4, 4}, | |
| in QImode, HImode and SImode. | |
| Relative to reg-reg move (2). */ | |
| /* cost of storing integer registers */ | |
| - {4, 4, 4}, | |
| + {6, 6, 6}, | |
| /* cost of loading SSE register | |
| - {12, 12, 12, 24, 48}, | |
| + {6, 6, 6, 6, 12}, | |
| in 32bit, 64bit, 128bit, 256bit and 512bit */ | |
| /* cost of storing SSE register | |
| - {12, 12, 12, 24, 48}, | |
| + {6, 6, 6, 6, 12}, | |
| in 32bit, 64bit, 128bit, 256bit and 512bit */ | |
| /* cost of unaligned loads. */ | |
| - {24, 24, 24, 48, 96}, | |
| + {6, 6, 6, 6, 12}, | |
| /* cost of unaligned stores. */ | |
| - {24, 24, 24, 48, 96}, | |
| + {6, 6, 6, 6, 12}, | |
| /* cost of moving XMM,YMM,ZMM register */ | |
| - 6, 12, 24, | |
| + 2, 2, 4, | |
| /* cost of moving SSE register to integer. */ | |
| - 20, | |
| + 2, | |
| /* cost of moving integer register to SSE. */ | |
| - 20, | |
| + 2, | |
| + /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, | |
| + rec. throughput 6. | |
| + So 5 uops statically and one uop per load. */ | |
| /* Gather load static, per_elt. */ | |
| - 12, 12, | |
| + 10, 6, | |
| /* Gather store static, per_elt. */ | |
| - 12, 12, | |
| + 10, 6, | |
| /* size of l1 cache. */ | |
| - 8, | |
| + 64, | |
| /* size of l2 cache. */ | |
| - 1024, | |
| + 512, | |
| /* size of prefetch block */ | |
| 64, | |
| /* number of parallel prefetches */ | |
| - 8, | |
| + 6, | |
| + /* FIXME perhaps more appropriate value is 5. */ | |
| /* Branch cost */ | |
| - 1, | |
| + 3, | |
| /* cost of FADD and FSUB insns. */ | |
| - COSTS_N_INSNS (6), | |
| + COSTS_N_INSNS (3), | |
| /* cost of FMUL instruction. */ | |
| - COSTS_N_INSNS (8), | |
| + COSTS_N_INSNS (5), | |
| + /* 10-24 */ | |
| /* cost of FDIV instruction. */ | |
| - COSTS_N_INSNS (40), | |
| + COSTS_N_INSNS (24), | |
| /* cost of FABS instruction. */ | |
| - COSTS_N_INSNS (3), | |
| + COSTS_N_INSNS (1), | |
| /* cost of FCHS instruction. */ | |
| - COSTS_N_INSNS (3), | |
| + COSTS_N_INSNS (1), | |
| /* cost of FSQRT instruction. */ | |
| - COSTS_N_INSNS (44), | |
| + COSTS_N_INSNS (23), | |
| /* cost of cheap SSE instruction. */ | |
| - COSTS_N_INSNS (2), | |
| + COSTS_N_INSNS (1), | |
| /* cost of ADDSS/SD SUBSS/SD insns. */ | |
| - COSTS_N_INSNS (5), | |
| + COSTS_N_INSNS (3), | |
| /* cost of MULSS instruction. */ | |
| - COSTS_N_INSNS (7), | |
| + COSTS_N_INSNS (4), | |
| /* cost of MULSD instruction. */ | |
| - COSTS_N_INSNS (7), | |
| + COSTS_N_INSNS (5), | |
| /* cost of FMA SS instruction. */ | |
| - COSTS_N_INSNS (7), | |
| + COSTS_N_INSNS (5), | |
| /* cost of FMA SD instruction. */ | |
| - COSTS_N_INSNS (7), | |
| + COSTS_N_INSNS (5), | |
| /* cost of DIVSS instruction. */ | |
| - COSTS_N_INSNS (32), | |
| + COSTS_N_INSNS (18), | |
| /* cost of DIVSD instruction. */ | |
| - COSTS_N_INSNS (40), | |
| -/* cost of SQRTSS instruction. */ | |
| COSTS_N_INSNS (32), | |
| +/* cost of SQRTSS instruction. */ | |
| + COSTS_N_INSNS (30), | |
| /* cost of SQRTSD instruction. */ | |
| - COSTS_N_INSNS (41), | |
| + COSTS_N_INSNS (58), | |
| /* cost of CVTSS2SD etc. */ | |
| - COSTS_N_INSNS (10), | |
| + COSTS_N_INSNS (2), | |
| /* cost of 256bit VCVTPS2PD etc. */ | |
| - COSTS_N_INSNS (20), | |
| + COSTS_N_INSNS (2), | |
| /* cost of 512bit VCVTPS2PD etc. */ | |
| - COSTS_N_INSNS (40), | |
| + COSTS_N_INSNS (2), | |
| /* cost of CVTSI2SS instruction. */ | |
| - COSTS_N_INSNS (20), | |
| + COSTS_N_INSNS (6), | |
| /* cost of CVT(T)SS2SI instruction. */ | |
| - COSTS_N_INSNS (17), | |
| + COSTS_N_INSNS (6), | |
| /* cost of CVTPI2PS instruction. */ | |
| - COSTS_N_INSNS (12), | |
| + COSTS_N_INSNS (6), | |
| /* cost of CVT(T)PS2PI instruction. */ | |
| - COSTS_N_INSNS (8), | |
| + COSTS_N_INSNS (7), | |
| /* reassoc int, fp, vec_int, vec_fp. */ | |
| - 1, 1, 1, 1, | |
| + 1, 4, 2, 2, | |
| /* latency times throughput of | |
| - {1, 1, 1}, | |
| + {8, 1, 3}, | |
| FMA/DOT_PROD_EXPR/SAD_EXPR, | |
| it's used to determine unroll | |
| factor in the vectorizer. */ | |
| /* Limit how much the autovectorizer | |
| 1, | |
| may unroll a loop. */ | |
| - nocona_memcpy, | |
| - nocona_memset, | |
| + core_memcpy, | |
| + core_memset, | |
| /* cond_taken_branch_cost. */ | |
| COSTS_N_INSNS (3), | |
| /* cond_not_taken_branch_cost. */ | |
| COSTS_N_INSNS (1), | |
| /* Loop alignment. */ | |
| - NULL, | |
| + "16:11:8", | |
| /* Jump alignment. */ | |
| - NULL, | |
| + "16:11:8", | |
| /* Label alignment. */ | |
| - NULL, | |
| + "0:0:8", | |
| /* Func alignment. */ | |
| - NULL, | |
| + "16", | |
| /* Small unroll limit. */ | |
| 4, | |
| /* Small unroll factor. */ | |
| 2, | |
| /* Branch mispredict scale. */ | |
| COSTS_N_INSNS (2), | |
| -}; | |
| \ No newline at end of file | |
| +}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment