"16", /* Func alignment. */
};
+/* This table currently replicates the znver3_cost table. */
+struct processor_costs znver4_cost = {
+ {
+ /* Start of register allocator costs. integer->integer move cost is 2. */
+
+ /* reg-reg moves are done by renaming and thus they are even cheaper than
+ 1 cycle. Because the reg-reg move cost is 2 and the following tables
+ correspond to doubles of latencies, we do not model this correctly. It
+ does not seem to make a practical difference to bump the prices up even
+ more. */
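+ /* For example, an integer load cost of 6 below encodes a ~3-cycle
+ latency on this doubled scale (2 * 3 = 6). */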
+ 6, /* cost for loading QImode using
+ movzbl. */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ 2, /* cost of reg,reg fld/fst. */
+ {14, 14, 17}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode. */
+ {12, 12, 16}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode. */
+ 2, /* cost of moving MMX register. */
+ {6, 6}, /* cost of loading MMX registers
+ in SImode and DImode. */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode. */
+ 2, 2, 3, /* cost of moving XMM,YMM,ZMM
+ register. */
+ {6, 6, 10, 10, 12}, /* cost of loading SSE registers
+ in 32,64,128,256 and 512-bit. */
+ {8, 8, 8, 12, 12}, /* cost of storing SSE registers
+ in 32,64,128,256 and 512-bit. */
+ 6, 8, /* SSE->integer and integer->SSE
+ moves. */
+ 8, 8, /* mask->integer and integer->mask moves. */
+ {6, 6, 6}, /* cost of loading mask register
+ in QImode, HImode, SImode. */
+ {8, 8, 8}, /* cost of storing mask register
+ in QImode, HImode, SImode. */
+ 2, /* cost of moving mask register. */
+ /* End of register allocator costs. */
+ },
+
+ COSTS_N_INSNS (1), /* cost of an add instruction. */
+ /* TODO: Lea with 3 components has cost 2. */
+ COSTS_N_INSNS (1), /* cost of a lea instruction. */
+ COSTS_N_INSNS (1), /* variable shift costs. */
+ COSTS_N_INSNS (1), /* constant shift costs. */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
+ COSTS_N_INSNS (3), /* HI. */
+ COSTS_N_INSNS (3), /* SI. */
+ COSTS_N_INSNS (3), /* DI. */
+ COSTS_N_INSNS (3)}, /* other. */
+ 0, /* cost of multiply per each bit
+ set. */
+ {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
+ COSTS_N_INSNS (13), /* HI. */
+ COSTS_N_INSNS (13), /* SI. */
+ COSTS_N_INSNS (18), /* DI. */
+ COSTS_N_INSNS (18)}, /* other. */
+ COSTS_N_INSNS (1), /* cost of movsx. */
+ COSTS_N_INSNS (1), /* cost of movzx. */
+ 8, /* "large" insn. */
+ 9, /* MOVE_RATIO. */
+ 6, /* CLEAR_RATIO. */
+ {6, 6, 6}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {8, 8, 8}, /* cost of storing integer
+ registers. */
+ {6, 6, 10, 10, 12}, /* cost of loading SSE registers
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {8, 8, 8, 12, 12}, /* cost of storing SSE register
+ in 32bit, 64bit, 128bit, 256bit and 512bit */
+ {6, 6, 6, 6, 6}, /* cost of unaligned loads. */
+ {8, 8, 8, 8, 8}, /* cost of unaligned stores. */
+ 2, 2, 2, /* cost of moving XMM,YMM,ZMM
+ register. */
+ 6, /* cost of moving SSE register to integer. */
+ /* VGATHERDPD is 17 uops with a throughput of 4; VGATHERDPS is 24 uops
+ with a throughput of 5. Approximately 7 uops do not depend on the vector
+ size, and every load is 5 uops. */
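+ /* Presumably those uop counts, doubled to match the scale used here,
+ give the gather-load costs below: 2 * 7 = 14 static and 2 * 5 = 10
+ per element. */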
+ 14, 10, /* Gather load static, per_elt. */
+ 14, 20, /* Gather store static, per_elt. */
+ 32, /* size of l1 cache. */
+ 1024, /* size of l2 cache. */
+ 64, /* size of prefetch block. */
+ /* New AMD processors never drop prefetches; if they cannot be performed
+ immediately, they are queued. We set the number of simultaneous prefetches
+ to a large constant to reflect this (it is probably not a good idea to
+ leave the number of prefetches completely unlimited, as their execution
+ also takes some time). */
+ 100, /* number of parallel prefetches. */
+ 3, /* Branch cost. */
+ COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (7), /* cost of FMUL instruction. */
+ /* Latency of fdiv is 8-15. */
+ COSTS_N_INSNS (15), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (1), /* cost of FABS instruction. */
+ COSTS_N_INSNS (1), /* cost of FCHS instruction. */
+ /* Latency of fsqrt is 4-10. */
+ COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
+
+ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_INSNS (3), /* cost of MULSS instruction. */
+ COSTS_N_INSNS (3), /* cost of MULSD instruction. */
+ COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
+ COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
+ COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
+ /* Latency of DIVSD is 9-13. */
+ COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
+ COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
+ COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
+ /* Zen can execute 4 integer operations per cycle. FP operations
+ take 3 cycles and it can execute 2 integer additions and 2
+ multiplications per cycle, so reassociation may make sense up to a
+ width of 6 (two pipes times the 3-cycle latency gives 6 independent
+ chains). SPEC2k6 benchmarks suggest that 4 works better than 6,
+ probably due to register pressure.
+
+ Integer vector operations are handled by the FP unit, which executes
+ 3 vector plus/minus operations per cycle but only one multiply. This
+ is adjusted in ix86_reassociation_width. */
+ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ znver2_memcpy,
+ znver2_memset,
+ COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
+ COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
+ "16", /* Loop alignment. */
+ "16", /* Jump alignment. */
+ "0:0:8", /* Label alignment. */
+ "16", /* Func alignment. */
+};
+
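A note on the units used throughout this table: the COSTS_N_INSNS values are
on GCC's rtx-cost scale from rtl.h, where COSTS_N_INSNS (N) expands to
(N) * 4, while the register allocator block at the top of the struct is on a
separate scale anchored at a reg-reg move cost of 2 (roughly doubled cycle
latencies). The standalone sketch below illustrates the two conventions;
struct mini_costs and its fields are simplified stand-ins for this
illustration, not GCC's actual processor_costs layout.

/* Standalone sketch (not GCC source): the two cost scales used above.  */
#include <stdio.h>

/* GCC's definition from rtl.h: rtx costs count 4 units per insn.  */
#define COSTS_N_INSNS(N) ((N) * 4)

/* Simplified stand-in for a few processor_costs fields.  */
struct mini_costs
{
  int add;       /* rtx-cost scale.  */
  int fdiv;      /* rtx-cost scale.  */
  int int_load;  /* allocator scale: reg-reg move == 2.  */
};

static const struct mini_costs c = {
  COSTS_N_INSNS (1),   /* cost of an add, as in znver4_cost.  */
  COSTS_N_INSNS (15),  /* cost of FDIV, as in znver4_cost.  */
  6,                   /* integer load, as in znver4_cost.  */
};

int
main (void)
{
  /* FDIV costs 15 adds on the rtx scale: 60 / 4.  */
  printf ("fdiv / add = %d\n", c.fdiv / c.add);
  /* 6 equals 3 reg-reg moves; since the table doubles latencies,
     this encodes a ~3-cycle load.  */
  printf ("int load / reg-reg move = %d\n", c.int_load / 2);
  return 0;
}

Compiling and running this prints "fdiv / add = 15" and "int load / reg-reg
move = 3", matching the ratios encoded in znver4_cost above.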
/* skylake_cost should produce code tuned for the Skylake family of CPUs. */
static stringop_algs skylake_memcpy[2] = {
{libcall,