bulk88/90nmXeonNetburstVsCore.patch

## 90nmXeonNetburstVsCore.patch
@@ -1,204 +1,215 @@
 static const
-struct processor_costs nocona_cost = {
+struct processor_costs core_cost = {
   {
   /* Start of register allocator costs.  integer->integer move cost is 2. */
 /* cost for loading QImode using movzbl */
-  4,
+  6,
 /* cost of loading integer registers
   {4, 4, 4},
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 /* cost of storing integer registers */
-  {4, 4, 4},
+  {6, 6, 6},
 /* cost of reg,reg fld/fst */
-  12,
+  2,
 /* cost of loading fp registers
-  {14, 14, 14},
+  {6, 6, 8},
 					   in SFmode, DFmode and XFmode */
 /* cost of storing fp registers
-  {14, 14, 14},
+  {6, 6, 10},
 					   in SFmode, DFmode and XFmode */
 /* cost of moving MMX register */
-  14,
+  2,
 /* cost of loading MMX registers
-  {12, 12},
+  {6, 6},
 					   in SImode and DImode */
 /* cost of storing MMX registers
-  {12, 12},
+  {6, 6},
 					   in SImode and DImode */
 /* cost of moving XMM,YMM,ZMM register */
-  6, 12, 24,
+  2, 2, 4,
 /* cost of loading SSE registers
-  {12, 12, 12, 24, 48},
+  {6, 6, 6, 6, 12},
 					   in 32,64,128,256 and 512-bit */
 /* cost of storing SSE registers
-  {12, 12, 12, 24, 48},
+  {6, 6, 6, 6, 12},
 					   in 32,64,128,256 and 512-bit */
 /* SSE->integer and integer->SSE moves */
-  20, 12,
+  6, 6,
 /* mask->integer and integer->mask moves */
-  20, 12,
+  6, 6,
 /* cost of loading mask register
   {4, 4, 4},
 					   in QImode, HImode, SImode.  */
 /* cost of storing mask register
-  {4, 4, 4},
+  {6, 6, 6},
 					   in QImode, HImode, SImode.  */
 /* cost of moving mask register.  */
   2,
   /* End of register allocator costs.  */
   },
 /* cost of an add instruction */
   COSTS_N_INSNS (1),
+  /* On all chips taken into consideration lea is 2 cycles or more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
 /* cost of a lea instruction */
-  COSTS_N_INSNS (1),
+  COSTS_N_INSNS (1) + 1,
 /* variable shift costs */
   COSTS_N_INSNS (1),
 /* constant shift costs */
   COSTS_N_INSNS (1),
 /* cost of starting multiply for QI */
-  {COSTS_N_INSNS (10),
+  {COSTS_N_INSNS (3),
 /*				 HI */
-   COSTS_N_INSNS (10),
+   COSTS_N_INSNS (4),
 /*				 SI */
-   COSTS_N_INSNS (10),
+   COSTS_N_INSNS (3),
+   /* Here we tune for Sandybridge or newer.  */
 /*				 DI */
-   COSTS_N_INSNS (10),
+   COSTS_N_INSNS (3),
 /*			      other */
-   COSTS_N_INSNS (10)},
-/* cost of multiply per each bit set */
-  0,
+   COSTS_N_INSNS (3)},
+  0,					/* cost of multiply per each bit set */
+  /* Expanding div/mod currently doesn't consider parallelism. So the cost
+     model is not realistic. We compensate by increasing the latencies a bit.  */
 /* cost of a divide/mod for QI */
-  {COSTS_N_INSNS (66),
+  {COSTS_N_INSNS (11),
 /*			    HI */
-   COSTS_N_INSNS (66),
+   COSTS_N_INSNS (11),
 /*			    SI */
-   COSTS_N_INSNS (66),
+   COSTS_N_INSNS (14),
 /*			    DI */
-   COSTS_N_INSNS (66),
+   COSTS_N_INSNS (81),
 /*			    other */
-   COSTS_N_INSNS (66)},
+   COSTS_N_INSNS (81)},
 /* cost of movsx */
   COSTS_N_INSNS (1),
 /* cost of movzx */
   COSTS_N_INSNS (1),
 /* "large" insn */
-  16,
+  8,
 /* MOVE_RATIO */
   17,
 /* CLEAR_RATIO */
   6,
 /* cost of loading integer registers
   {4, 4, 4},
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
 /* cost of storing integer registers */
-  {4, 4, 4},
+  {6, 6, 6},
 /* cost of loading SSE register
-  {12, 12, 12, 24, 48},
+  {6, 6, 6, 6, 12},
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
 /* cost of storing SSE register
-  {12, 12, 12, 24, 48},
+  {6, 6, 6, 6, 12},
 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
 /* cost of unaligned loads.  */
-  {24, 24, 24, 48, 96},
+  {6, 6, 6, 6, 12},
 /* cost of unaligned stores.  */
-  {24, 24, 24, 48, 96},
+  {6, 6, 6, 6, 12},
 /* cost of moving XMM,YMM,ZMM register */
-  6, 12, 24,
+  2, 2, 4,
 /* cost of moving SSE register to integer.  */
-  20,
+  2,
 /* cost of moving integer register to SSE.  */
-  20,
+  2,
+  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
+     rec. throughput 6.
+     So 5 uops statically and one uop per load.  */
 /* Gather load static, per_elt.  */
-  12, 12,
+  10, 6,
 /* Gather store static, per_elt.  */
-  12, 12,
+  10, 6,
 /* size of l1 cache.  */
-  8,
+  64,
 /* size of l2 cache.  */
-  1024,
+  512,
 /* size of prefetch block */
   64,
 /* number of parallel prefetches */
-  8,
+  6,
+  /* FIXME perhaps more appropriate value is 5.  */
 /* Branch cost */
-  1,
+  3,
 /* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (6),
+  COSTS_N_INSNS (3),
 /* cost of FMUL instruction.  */
-  COSTS_N_INSNS (8),
+  COSTS_N_INSNS (5),
+  /* 10-24 */
 /* cost of FDIV instruction.  */
-  COSTS_N_INSNS (40),
+  COSTS_N_INSNS (24),
 /* cost of FABS instruction.  */
-  COSTS_N_INSNS (3),
+  COSTS_N_INSNS (1),
 /* cost of FCHS instruction.  */
-  COSTS_N_INSNS (3),
+  COSTS_N_INSNS (1),
 /* cost of FSQRT instruction.  */
-  COSTS_N_INSNS (44),
+  COSTS_N_INSNS (23),
 /* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (2),
+  COSTS_N_INSNS (1),
 /* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (5),
+  COSTS_N_INSNS (3),
 /* cost of MULSS instruction.  */
-  COSTS_N_INSNS (7),
+  COSTS_N_INSNS (4),
 /* cost of MULSD instruction.  */
-  COSTS_N_INSNS (7),
+  COSTS_N_INSNS (5),
 /* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (7),
+  COSTS_N_INSNS (5),
 /* cost of FMA SD instruction.  */
-  COSTS_N_INSNS (7),
+  COSTS_N_INSNS (5),
 /* cost of DIVSS instruction.  */
-  COSTS_N_INSNS (32),
+  COSTS_N_INSNS (18),
 /* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (40),
-/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (32),
+/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (30),
 /* cost of SQRTSD instruction.  */
-  COSTS_N_INSNS (41),
+  COSTS_N_INSNS (58),
 /* cost of CVTSS2SD etc.  */
-  COSTS_N_INSNS (10),
+  COSTS_N_INSNS (2),
 /* cost of 256bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (20),
+  COSTS_N_INSNS (2),
 /* cost of 512bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (40),
+  COSTS_N_INSNS (2),
 /* cost of CVTSI2SS instruction.  */
-  COSTS_N_INSNS (20),
+  COSTS_N_INSNS (6),
 /* cost of CVT(T)SS2SI instruction.  */
-  COSTS_N_INSNS (17),
+  COSTS_N_INSNS (6),
 /* cost of CVTPI2PS instruction.  */
-  COSTS_N_INSNS (12),
+  COSTS_N_INSNS (6),
 /* cost of CVT(T)PS2PI instruction.  */
-  COSTS_N_INSNS (8),
+  COSTS_N_INSNS (7),
 /* reassoc int, fp, vec_int, vec_fp.  */
-  1, 1, 1, 1,
+  1, 4, 2, 2,
 /* latency times throughput of
-  {1, 1, 1},
+  {8, 1, 3},
 					   FMA/DOT_PROD_EXPR/SAD_EXPR,
 					   it's used to determine unroll
 					   factor in the vectorizer.  */
 /* Limit how much the autovectorizer
   1,
 					   may unroll a loop.  */
-  nocona_memcpy,
-  nocona_memset,
+  core_memcpy,
+  core_memset,
 /* cond_taken_branch_cost.  */
   COSTS_N_INSNS (3),
 /* cond_not_taken_branch_cost.  */
   COSTS_N_INSNS (1),
 /* Loop alignment.  */
-  NULL,
+  "16:11:8",
 /* Jump alignment.  */
-  NULL,
+  "16:11:8",
 /* Label alignment.  */
-  NULL,
+  "0:0:8",
 /* Func alignment.  */
-  NULL,
+  "16",
 /* Small unroll limit.  */
   4,
 /* Small unroll factor.  */
   2,
 /* Branch mispredict scale.  */
   COSTS_N_INSNS (2),
-};
\ No newline at end of file
+};
	@@ -1,204 +1,215 @@
	static const
	-struct processor_costs nocona_cost = {
	+struct processor_costs core_cost = {
	{
	/* Start of register allocator costs. integer->integer move cost is 2. */
	/* cost for loading QImode using movzbl */
	- 4,
	+ 6,
	/* cost of loading integer registers
	{4, 4, 4},
	in QImode, HImode and SImode.
	Relative to reg-reg move (2). */
	/* cost of storing integer registers */
	- {4, 4, 4},
	+ {6, 6, 6},
	/* cost of reg,reg fld/fst */
	- 12,
	+ 2,
	/* cost of loading fp registers
	- {14, 14, 14},
	+ {6, 6, 8},
	in SFmode, DFmode and XFmode */
	/* cost of storing fp registers
	- {14, 14, 14},
	+ {6, 6, 10},
	in SFmode, DFmode and XFmode */
	/* cost of moving MMX register */
	- 14,
	+ 2,
	/* cost of loading MMX registers
	- {12, 12},
	+ {6, 6},
	in SImode and DImode */
	/* cost of storing MMX registers
	- {12, 12},
	+ {6, 6},
	in SImode and DImode */
	/* cost of moving XMM,YMM,ZMM register */
	- 6, 12, 24,
	+ 2, 2, 4,
	/* cost of loading SSE registers
	- {12, 12, 12, 24, 48},
	+ {6, 6, 6, 6, 12},
	in 32,64,128,256 and 512-bit */
	/* cost of storing SSE registers
	- {12, 12, 12, 24, 48},
	+ {6, 6, 6, 6, 12},
	in 32,64,128,256 and 512-bit */
	/* SSE->integer and integer->SSE moves */
	- 20, 12,
	+ 6, 6,
	/* mask->integer and integer->mask moves */
	- 20, 12,
	+ 6, 6,
	/* cost of loading mask register
	{4, 4, 4},
	in QImode, HImode, SImode. */
	/* cost of storing mask register
	- {4, 4, 4},
	+ {6, 6, 6},
	in QImode, HImode, SImode. */
	/* cost of moving mask register. */
	2,
	/* End of register allocator costs. */
	},
	/* cost of an add instruction */
	COSTS_N_INSNS (1),
	+ /* On all chips taken into consideration lea is 2 cycles or more. With
	+ this cost however our current implementation of synth_mult results in
	+ use of unnecessary temporary registers causing regression on several
	+ SPECfp benchmarks. */
	/* cost of a lea instruction */
	- COSTS_N_INSNS (1),
	+ COSTS_N_INSNS (1) + 1,
	/* variable shift costs */
	COSTS_N_INSNS (1),
	/* constant shift costs */
	COSTS_N_INSNS (1),
	/* cost of starting multiply for QI */
	- {COSTS_N_INSNS (10),
	+ {COSTS_N_INSNS (3),
	/* HI */
	- COSTS_N_INSNS (10),
	+ COSTS_N_INSNS (4),
	/* SI */
	- COSTS_N_INSNS (10),
	+ COSTS_N_INSNS (3),
	+ /* Here we tune for Sandybridge or newer. */
	/* DI */
	- COSTS_N_INSNS (10),
	+ COSTS_N_INSNS (3),
	/* other */
	- COSTS_N_INSNS (10)},
	-/* cost of multiply per each bit set */
	- 0,
	+ COSTS_N_INSNS (3)},
	+ 0, /* cost of multiply per each bit set */
	+ /* Expanding div/mod currently doesn't consider parallelism. So the cost
	+ model is not realistic. We compensate by increasing the latencies a bit. */
	/* cost of a divide/mod for QI */
	- {COSTS_N_INSNS (66),
	+ {COSTS_N_INSNS (11),
	/* HI */
	- COSTS_N_INSNS (66),
	+ COSTS_N_INSNS (11),
	/* SI */
	- COSTS_N_INSNS (66),
	+ COSTS_N_INSNS (14),
	/* DI */
	- COSTS_N_INSNS (66),
	+ COSTS_N_INSNS (81),
	/* other */
	- COSTS_N_INSNS (66)},
	+ COSTS_N_INSNS (81)},
	/* cost of movsx */
	COSTS_N_INSNS (1),
	/* cost of movzx */
	COSTS_N_INSNS (1),
	/* "large" insn */
	- 16,
	+ 8,
	/* MOVE_RATIO */
	17,
	/* CLEAR_RATIO */
	6,
	/* cost of loading integer registers
	{4, 4, 4},
	in QImode, HImode and SImode.
	Relative to reg-reg move (2). */
	/* cost of storing integer registers */
	- {4, 4, 4},
	+ {6, 6, 6},
	/* cost of loading SSE register
	- {12, 12, 12, 24, 48},
	+ {6, 6, 6, 6, 12},
	in 32bit, 64bit, 128bit, 256bit and 512bit */
	/* cost of storing SSE register
	- {12, 12, 12, 24, 48},
	+ {6, 6, 6, 6, 12},
	in 32bit, 64bit, 128bit, 256bit and 512bit */
	/* cost of unaligned loads. */
	- {24, 24, 24, 48, 96},
	+ {6, 6, 6, 6, 12},
	/* cost of unaligned stores. */
	- {24, 24, 24, 48, 96},
	+ {6, 6, 6, 6, 12},
	/* cost of moving XMM,YMM,ZMM register */
	- 6, 12, 24,
	+ 2, 2, 4,
	/* cost of moving SSE register to integer. */
	- 20,
	+ 2,
	/* cost of moving integer register to SSE. */
	- 20,
	+ 2,
	+ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
	+ rec. throughput 6.
	+ So 5 uops statically and one uop per load. */
	/* Gather load static, per_elt. */
	- 12, 12,
	+ 10, 6,
	/* Gather store static, per_elt. */
	- 12, 12,
	+ 10, 6,
	/* size of l1 cache. */
	- 8,
	+ 64,
	/* size of l2 cache. */
	- 1024,
	+ 512,
	/* size of prefetch block */
	64,
	/* number of parallel prefetches */
	- 8,
	+ 6,
	+ /* FIXME perhaps more appropriate value is 5. */
	/* Branch cost */
	- 1,
	+ 3,
	/* cost of FADD and FSUB insns. */
	- COSTS_N_INSNS (6),
	+ COSTS_N_INSNS (3),
	/* cost of FMUL instruction. */
	- COSTS_N_INSNS (8),
	+ COSTS_N_INSNS (5),
	+ /* 10-24 */
	/* cost of FDIV instruction. */
	- COSTS_N_INSNS (40),
	+ COSTS_N_INSNS (24),
	/* cost of FABS instruction. */
	- COSTS_N_INSNS (3),
	+ COSTS_N_INSNS (1),
	/* cost of FCHS instruction. */
	- COSTS_N_INSNS (3),
	+ COSTS_N_INSNS (1),
	/* cost of FSQRT instruction. */
	- COSTS_N_INSNS (44),
	+ COSTS_N_INSNS (23),
	/* cost of cheap SSE instruction. */
	- COSTS_N_INSNS (2),
	+ COSTS_N_INSNS (1),
	/* cost of ADDSS/SD SUBSS/SD insns. */
	- COSTS_N_INSNS (5),
	+ COSTS_N_INSNS (3),
	/* cost of MULSS instruction. */
	- COSTS_N_INSNS (7),
	+ COSTS_N_INSNS (4),
	/* cost of MULSD instruction. */
	- COSTS_N_INSNS (7),
	+ COSTS_N_INSNS (5),
	/* cost of FMA SS instruction. */
	- COSTS_N_INSNS (7),
	+ COSTS_N_INSNS (5),
	/* cost of FMA SD instruction. */
	- COSTS_N_INSNS (7),
	+ COSTS_N_INSNS (5),
	/* cost of DIVSS instruction. */
	- COSTS_N_INSNS (32),
	+ COSTS_N_INSNS (18),
	/* cost of DIVSD instruction. */
	- COSTS_N_INSNS (40),
	-/* cost of SQRTSS instruction. */
	COSTS_N_INSNS (32),
	+/* cost of SQRTSS instruction. */
	+ COSTS_N_INSNS (30),
	/* cost of SQRTSD instruction. */
	- COSTS_N_INSNS (41),
	+ COSTS_N_INSNS (58),
	/* cost of CVTSS2SD etc. */
	- COSTS_N_INSNS (10),
	+ COSTS_N_INSNS (2),
	/* cost of 256bit VCVTPS2PD etc. */
	- COSTS_N_INSNS (20),
	+ COSTS_N_INSNS (2),
	/* cost of 512bit VCVTPS2PD etc. */
	- COSTS_N_INSNS (40),
	+ COSTS_N_INSNS (2),
	/* cost of CVTSI2SS instruction. */
	- COSTS_N_INSNS (20),
	+ COSTS_N_INSNS (6),
	/* cost of CVT(T)SS2SI instruction. */
	- COSTS_N_INSNS (17),
	+ COSTS_N_INSNS (6),
	/* cost of CVTPI2PS instruction. */
	- COSTS_N_INSNS (12),
	+ COSTS_N_INSNS (6),
	/* cost of CVT(T)PS2PI instruction. */
	- COSTS_N_INSNS (8),
	+ COSTS_N_INSNS (7),
	/* reassoc int, fp, vec_int, vec_fp. */
	- 1, 1, 1, 1,
	+ 1, 4, 2, 2,
	/* latency times throughput of
	- {1, 1, 1},
	+ {8, 1, 3},
	FMA/DOT_PROD_EXPR/SAD_EXPR,
	it's used to determine unroll
	factor in the vectorizer. */
	/* Limit how much the autovectorizer
	1,
	may unroll a loop. */
	- nocona_memcpy,
	- nocona_memset,
	+ core_memcpy,
	+ core_memset,
	/* cond_taken_branch_cost. */
	COSTS_N_INSNS (3),
	/* cond_not_taken_branch_cost. */
	COSTS_N_INSNS (1),
	/* Loop alignment. */
	- NULL,
	+ "16:11:8",
	/* Jump alignment. */
	- NULL,
	+ "16:11:8",
	/* Label alignment. */
	- NULL,
	+ "0:0:8",
	/* Func alignment. */
	- NULL,
	+ "16",
	/* Small unroll limit. */
	4,
	/* Small unroll factor. */
	2,
	/* Branch mispredict scale. */
	COSTS_N_INSNS (2),
	-};
	\ No newline at end of file
	+};
No results found