i386: Add br_mispredict_scale in cost table.

author Hongyu Wang <hongyu.wang@intel.com>

Thu, 2 Jan 2025 02:29:27 +0000 (10:29 +0800)

committer Hongyu Wang <hongyu.wang@intel.com>

Wed, 8 Jan 2025 02:19:21 +0000 (10:19 +0800)
author Hongyu Wang <hongyu.wang@intel.com>
Thu, 2 Jan 2025 02:29:27 +0000 (10:29 +0800)
committer Hongyu Wang <hongyu.wang@intel.com>
Wed, 8 Jan 2025 02:19:21 +0000 (10:19 +0800)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index 655335e2f47f805c5413e898dc134e099b9f6f75..11770aa8a50004817435ab65f00abed7d58894f4 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25088,7 +25088,13 @@ ix86_max_noce_ifcvt_seq_cost (edge e)
         return param_max_rtl_if_conversion_unpredictable_cost;
      }
  
-  return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
+  /* For modern machines with deeper pipelines, the penalty for branch
+     misprediction could be higher than before to reset the pipeline
+     slots.  Add parameter br_mispredict_scale as a factor to describe
+     the impact of resetting the pipeline.  */
+
+  return BRANCH_COST (true, predictable_p)
+        * ix86_tune_cost->br_mispredict_scale;
  }
  
  /* Return true if SEQ is a good candidate as a replacement for the
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index 00626d539a9894232432e8c30809ba1d05a314ae..e8e528c7811bebdb500cc2f8e93c8ae65b30f29b 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -232,6 +232,8 @@ struct processor_costs {
                                            to be unrolled.  */
    const unsigned small_unroll_factor;   /* Unroll factor for small loop to
                                            be unrolled.  */
+  const int br_mispredict_scale;       /* Branch mispredict scale for ifcvt
+                                          threshold.  */
  };
  
  extern const struct processor_costs *ix86_cost;
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index 56a09f12b94dc45ce78390e0ed3a31d485d2f25c..a4a128cd5dde671bd203e517f290366814ebf88b 100644 (file)
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -137,6 +137,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
    NULL,                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* Processor costs (relative to an add) */
@@ -248,6 +249,7 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
    "4",                                 /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs i486_memcpy[2] = {
@@ -360,6 +362,7 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs pentium_memcpy[2] = {
@@ -470,6 +473,7 @@ struct processor_costs pentium_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static const
@@ -573,6 +577,7 @@ struct processor_costs lakemont_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -691,6 +696,7 @@ struct processor_costs pentiumpro_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs geode_memcpy[2] = {
@@ -800,6 +806,7 @@ struct processor_costs geode_cost = {
    NULL,                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs k6_memcpy[2] = {
@@ -912,6 +919,7 @@ struct processor_costs k6_cost = {
    "32",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -1025,6 +1033,7 @@ struct processor_costs athlon_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -1147,6 +1156,7 @@ struct processor_costs k8_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1277,6 +1287,7 @@ struct processor_costs amdfam10_cost = {
    "32",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /*  BDVER has optimized REP instruction for medium sized blocks, but for
@@ -1400,6 +1411,7 @@ const struct processor_costs bdver_cost = {
    "11",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  
@@ -1555,6 +1567,7 @@ struct processor_costs znver1_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1714,6 +1727,7 @@ struct processor_costs znver2_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  struct processor_costs znver3_cost = {
@@ -1848,6 +1862,7 @@ struct processor_costs znver3_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* This table currently replicates znver3_cost table. */
@@ -1984,6 +1999,7 @@ struct processor_costs znver4_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* This table currently replicates znver4_cost table. */
@@ -2137,6 +2153,7 @@ struct processor_costs znver5_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
@@ -2263,6 +2280,7 @@ struct processor_costs skylake_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* icelake_cost should produce code tuned for Icelake family of CPUs.
@@ -2391,6 +2409,7 @@ struct processor_costs icelake_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2) + 3,               /* Branch mispredict scale.  */
  };
  
  /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
@@ -2513,6 +2532,7 @@ struct processor_costs alderlake_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2) + 3,               /* Branch mispredict scale.  */
  };
  
    /* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -2628,6 +2648,7 @@ const struct processor_costs btver1_cost = {
    "11",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs btver2_memcpy[2] = {
@@ -2740,6 +2761,7 @@ const struct processor_costs btver2_cost = {
    "11",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs pentium4_memcpy[2] = {
@@ -2851,6 +2873,7 @@ struct processor_costs pentium4_cost = {
    NULL,                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs nocona_memcpy[2] = {
@@ -2965,6 +2988,7 @@ struct processor_costs nocona_cost = {
    NULL,                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs atom_memcpy[2] = {
@@ -3077,6 +3101,7 @@ struct processor_costs atom_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs slm_memcpy[2] = {
@@ -3189,6 +3214,7 @@ struct processor_costs slm_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs tremont_memcpy[2] = {
@@ -3315,6 +3341,7 @@ struct processor_costs tremont_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  static stringop_algs intel_memcpy[2] = {
@@ -3427,6 +3454,7 @@ struct processor_costs intel_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
@@ -3544,6 +3572,7 @@ struct processor_costs lujiazui_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU.  */
@@ -3659,6 +3688,7 @@ struct processor_costs yongfeng_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU.  */
@@ -3774,6 +3804,7 @@ struct processor_costs shijidadao_cost = {
    "16",                                /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  
@@ -3897,6 +3928,7 @@ struct processor_costs generic_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
  /* core_cost should produce code tuned for Core familly of CPUs.  */
@@ -4025,5 +4057,6 @@ struct processor_costs core_cost = {
    "16",                                        /* Func alignment.  */
    4,                                   /* Small unroll limit.  */
    2,                                   /* Small unroll factor.  */
+  COSTS_N_INSNS (2),                   /* Branch mispredict scale.  */
  };
  
diff --git a/gcc/testsuite/gcc.target/i386/cmov12.c b/gcc/testsuite/gcc.target/i386/cmov12.c

new file mode 100644 (file)

index 0000000..87de4f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmov12.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx2 -mtune=sapphirerapids" } */                             
+/* { dg-final { scan-assembler-times "cmovg" 3 } } */
+
+void foo(int *a, int n, int k)  /* Moves the value at a[k - 1] down the array; resembles a 1-based heap sift-down.  */
+{
+  int j, v;  /* j: 1-based index compared against; v: saved copy of a[k - 1].  */
+
+  v = a[k - 1];
+  while (k <= n / 2) {  /* For n >= 0 this keeps j = 2k <= n.  */
+      j = k + k;
+      if ((j < n) && (a[j - 1] < a[j]))  /* Step to the next element when it is the larger one.  */
+       j++;
+      if (v >= a[j - 1])  /* Saved value is not smaller; stop here.  */
+       break;
+      a[k - 1] = a[j - 1];  /* Copy the chosen element into slot k.  */
+      k = j;
+  }
+  a[k - 1] = v;  /* Store the saved value in the slot where the loop stopped.  */
+}
+
author	Hongyu Wang <hongyu.wang@intel.com>
	Thu, 2 Jan 2025 02:29:27 +0000 (10:29 +0800)
committer	Hongyu Wang <hongyu.wang@intel.com>
	Wed, 8 Jan 2025 02:19:21 +0000 (10:19 +0800)
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/config/i386/i386.h		patch \| blob \| blame \| history
gcc/config/i386/x86-tune-costs.h		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/cmov12.c	[new file with mode: 0644]	patch \| blob