Enable small loop unrolling for O2
author     Hongyu Wang <hongyu.wang@intel.com>
           Thu, 8 Sep 2022 08:52:02 +0000 (16:52 +0800)
committer  Hongyu Wang <hongyu.wang@intel.com>
           Mon, 14 Nov 2022 05:38:06 +0000 (13:38 +0800)
Modern processors have multi-way instruction decoders. For x86,
Icelake/Zen3 can handle 5 uops per cycle, so for a small loop with
<= 4 instructions (usually 3 uops, since the cmp/jmp pair can be
macro-fused), the decoder would have a 2-uop bubble on each iteration
and the pipeline could not be fully utilized.
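
As a rough illustration (hypothetical example, not part of the patch;
actual code generation depends on target and options), a reduction loop
such as the following compiles to a body of roughly 3-4 instructions,
so each iteration feeds only about 3 uops into a 5-uop decode window:

  /* Hypothetical small hot loop: the body is roughly a load+add, an
     index increment, and a cmp/jmp pair that macro-fuses into one uop,
     i.e. about 3 uops per iteration against a 5-uop decode width.  */
  long
  sum (const long *a, long n)
  {
    long s = 0;
    for (long i = 0; i < n; i++)
      s += a[i];
    return s;
  }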

Therefore, this patch enables loop unrolling for small loops at O2 to
keep the decoder as full as possible. It turns on RTL loop unrolling
when targetm.loop_unroll_adjust exists, at O2 and above and only when
optimizing for speed. In the x86 backend the default behavior is to
unroll small loops of at most 4 insns by a factor of 2.
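
Conceptually (a sketch of the effect only, not the unroller's actual
output), unrolling the loop above by a factor of 2 lets one cmp/jmp
pair cover two copies of the body, roughly halving the per-element
decode bubble:

  /* Sketch of the 2x-unrolled shape; the RTL unroller also emits the
     usual remainder handling for trip counts that are not a multiple
     of 2.  */
  long
  sum_unrolled (const long *a, long n)
  {
    long s = 0;
    long i = 0;
    for (; i + 1 < n; i += 2)
      {
        s += a[i];
        s += a[i + 1];
      }
    for (; i < n; i++)    /* remainder iteration */
      s += a[i];
    return s;
  }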

This improves 548.exchange2 by 9% on Icelake and by 7.4% on Zen3, with
a 0.9% code-size increase. For the other benchmarks the variations are
minor, and overall code size increases by 0.2%.

The kernel image size increases by 0.06%, and there is no impact on EEMBC.

gcc/ChangeLog:

* common/config/i386/i386-common.cc (ix86_option_optimization_table):
Enable small loop unrolling at O2 by default.
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
factor if -munroll-only-small-loops is enabled and -funroll-loops/
-funroll-all-loops are disabled.
* config/i386/i386.h (struct processor_costs): Add two fields,
small_unroll_ninsns and small_unroll_factor.
* config/i386/i386.opt: Add -munroll-only-small-loops.
* doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst:
Document -munroll-only-small-loops.
* doc/gcc/gcc-command-options/option-summary.rst: Likewise.
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable RTL loop
unrolling at -O2 and above when optimizing for speed if the
target hook loop_unroll_adjust exists.
(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
when target hook loop_unroll_adjust exists.
* config/i386/x86-tune-costs.h: Update all processor costs
with small_unroll_ninsns = 4 and small_unroll_factor = 2.

gcc/testsuite/ChangeLog:

* gcc.dg/guality/loop-1.c: Add additional option
-mno-unroll-only-small-loops.
* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
* gcc.target/i386/pr93002.c: Likewise.

gcc/common/config/i386/i386-common.cc
gcc/config/i386/i386.cc
gcc/config/i386/i386.h
gcc/config/i386/i386.opt
gcc/config/i386/x86-tune-costs.h
gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst
gcc/doc/gcc/gcc-command-options/option-summary.rst
gcc/loop-init.cc
gcc/testsuite/gcc.dg/guality/loop-1.c
gcc/testsuite/gcc.target/i386/pr86270.c
gcc/testsuite/gcc.target/i386/pr93002.c

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 431fd0d3ad1ab3211dd605450ad5e235d97eb968..2f491b2f84bc770bfdb51b48d6800303b0db2895 100644
@@ -1803,6 +1803,7 @@ static const struct default_options ix86_option_optimization_table[] =
     /* The STC algorithm produces the smallest code at -Os, for x86.  */
     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
       REORDER_BLOCKS_ALGORITHM_STC },
+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
     /* Turn off -fschedule-insns by default.  It tends to make the
        problem with not enough registers even worse.  */
     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f8586499cd10e4ad5706fe01e8fc2a847c592451..292b32c5e99a71314d97c008ba41ec52e42253dd 100644
@@ -23827,6 +23827,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
   unsigned i;
   unsigned mem_count = 0;
 
+  /* Unroll small size loop when unroll factor is not explicitly
+     specified.  */
+  if (!(flag_unroll_loops
+       || flag_unroll_all_loops
+       || loop->unroll))
+    {
+      nunroll = 1;
+
+      /* Any explicit -f{no-}unroll-{all-}loops turns off
+        -munroll-only-small-loops.  */
+      if (ix86_unroll_only_small_loops
+         && !OPTION_SET_P (flag_unroll_loops)
+         && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+       nunroll = ix86_cost->small_unroll_factor;
+
+      return nunroll;
+    }
+
   if (!TARGET_ADJUST_UNROLL)
      return nunroll;
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index a5ad9f387f7e10e7f1dc193e5713065879253c17..3869db8f2d35754b732bc43266c9c7692e2e2222 100644
@@ -219,6 +219,11 @@ struct processor_costs {
   const char *const align_jump;                /* Jump alignment.  */
   const char *const align_label;       /* Label alignment.  */
   const char *const align_func;                /* Function alignment.  */
+
+  const unsigned small_unroll_ninsns;  /* Insn count limit for small loop
+                                          to be unrolled.  */
+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
+                                          be unrolled.  */
 };
 
 extern const struct processor_costs *ix86_cost;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 415c52e1bb4417fa40754934b278d252a9b7896f..d6b80efa04deba3a70e3787edb5da059b238b232 100644
@@ -1246,3 +1246,7 @@ Support PREFETCHI built-in functions and code generation.
 mraoint
 Target Mask(ISA2_RAOINT) Var(ix86_isa_flags2) Save
 Support RAOINT built-in functions and code generation.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index aeaa7eb008e2e6776072c48069f63cae1ab90ec7..f01b8ee9eef190852960d6bcfa37537f3bc01a27 100644
@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* Processor costs (relative to an add) */
@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
   "4",                                 /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   "4",                                 /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs i486_memcpy[2] = {
@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs pentium_memcpy[2] = {
@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static const
@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs geode_memcpy[2] = {
@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs k6_memcpy[2] = {
@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
   "32:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "32",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
   "32:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "32",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /*  BDVER has optimized REP instruction for medium sized blocks, but for
@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 
@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 struct processor_costs znver3_cost = {
@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* This table currently replicates znver3_cost table. */
@@ -1951,6 +1981,8 @@ struct processor_costs znver4_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
@@ -2075,6 +2107,8 @@ struct processor_costs skylake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* icelake_cost should produce code tuned for Icelake family of CPUs.
@@ -2201,6 +2235,8 @@ struct processor_costs icelake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
@@ -2321,6 +2357,8 @@ struct processor_costs alderlake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -2434,6 +2472,8 @@ const struct processor_costs btver1_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs btver2_memcpy[2] = {
@@ -2544,6 +2584,8 @@ const struct processor_costs btver2_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs pentium4_memcpy[2] = {
@@ -2653,6 +2695,8 @@ struct processor_costs pentium4_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs nocona_memcpy[2] = {
@@ -2765,6 +2809,8 @@ struct processor_costs nocona_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs atom_memcpy[2] = {
@@ -2875,6 +2921,8 @@ struct processor_costs atom_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs slm_memcpy[2] = {
@@ -2985,6 +3033,8 @@ struct processor_costs slm_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs tremont_memcpy[2] = {
@@ -3109,6 +3159,8 @@ struct processor_costs tremont_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs intel_memcpy[2] = {
@@ -3219,6 +3271,8 @@ struct processor_costs intel_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
@@ -3334,6 +3388,8 @@ struct processor_costs lujiazui_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* Generic should produce code tuned for Core-i7 (and newer chips)
@@ -3453,6 +3509,8 @@ struct processor_costs generic_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
@@ -3579,5 +3637,7 @@ struct processor_costs core_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
diff --git a/gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst b/gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst
index 6f015e9e96a3eb510312c40efdf0259c9f539d8e..5e18fd77f870b8885d5cbc75437a89a7423ed60e 100644
@@ -1614,3 +1614,9 @@ on x86-64 processors in 64-bit environments.
 .. option:: -mdirect-extern-access
 
   Default setting; overrides :option:`-mno-direct-extern-access`.
+
+.. option:: -munroll-only-small-loops
+  Controls conservative small loop unrolling.  It is enabled by default
+  at O2 and unrolls loops with at most 4 insns by a factor of 2.  Any
+  explicit -f[no-]unroll-[all-]loops option disables this flag to avoid
+  unintended unrolling behavior.
diff --git a/gcc/doc/gcc/gcc-command-options/option-summary.rst b/gcc/doc/gcc/gcc-command-options/option-summary.rst
index b90b6600d700bbbc276e865ecac01cdb1989df94..02898fb65cd57983e8bd8808b62464fd9a952c8e 100644
@@ -1490,7 +1490,8 @@ in the following sections.
   :option:`-mgeneral-regs-only`  :option:`-mcall-ms2sysv-xlogues` :option:`-mrelax-cmpxchg-loop` |gol|
   :option:`-mindirect-branch=choice`  :option:`-mfunction-return=choice` |gol|
   :option:`-mindirect-branch-register` :option:`-mharden-sls=choice` |gol|
-  :option:`-mindirect-branch-cs-prefix` :option:`-mneeded` :option:`-mno-direct-extern-access`
+  :option:`-mindirect-branch-cs-prefix` :option:`-mneeded` :option:`-mno-direct-extern-access` |gol|
+  :option:`-munroll-only-small-loops`
 
   *x86 Windows Options*
 
diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
index b9e07973dd6a905670eda7a9a45ace30ff0fcaf2..9789efa1e11a740956ffa8e218d86262e31ece91 100644
@@ -565,9 +565,12 @@ public:
   {}
 
   /* opt_pass methods: */
-  bool gate (function *) final override
+  bool gate (function *fun) final override
     {
-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+             || (targetm.loop_unroll_adjust
+                 && optimize >= 2
+                 && optimize_function_for_speed_p (fun)));
     }
 
   unsigned int execute (function *) final override;
@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
       if (dump_file)
        df_dump (dump_file);
 
-      if (flag_unroll_loops)
+      if (flag_unroll_loops
+         || targetm.loop_unroll_adjust)
        flags |= UAP_UNROLL;
       if (flag_unroll_all_loops)
        flags |= UAP_UNROLL_ALL;
diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
index 1b1f6d32271b76a86351fc0c236a6e4acb6d55fd..a32ea445a3f7feb0b2af9a364e7cc297d8a6e440 100644
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+
 
 #include "../nop.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
index 81841ef5bd70e4ea4ad103fda6f1131f47703098..cbc9fbb0450485a7409a82e97152fa6620f4c1cb 100644
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 
 int *a;
 long len;
diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
index 0248fcc00a5a63c52f221ffa4294405724d10827..f75a847f75dfab39f5171b126f8b82c405ee7137 100644
@@ -1,6 +1,6 @@
 /* PR target/93002 */
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
 
 volatile int sink;