Enable small loop unrolling for O2
author     Hongyu Wang <hongyu.wang@intel.com>
           Thu, 8 Sep 2022 08:52:02 +0000 (16:52 +0800)
committer  Hongyu Wang <hongyu.wang@intel.com>
           Mon, 14 Nov 2022 05:38:06 +0000 (13:38 +0800)
Modern processors have multi-way instruction decoders. For x86,
Icelake/Zen3 can handle 5 uops per cycle, so for a small loop with
<= 4 instructions (usually 3 uops, since the cmp/jmp pair can be
macro-fused), the decoder would have a 2-uop bubble on each iteration
and the pipeline could not be fully utilized.
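
As a rough illustration (hypothetical example, not part of the patch;
actual code generation depends on target and options), a reduction loop
such as the following compiles to a body of roughly 3-4 instructions,
so each iteration feeds only about 3 uops into a 5-uop decode window:

  /* Hypothetical small hot loop: the body is roughly a load+add, an
     index increment, and a cmp/jmp pair that macro-fuses into one uop,
     i.e. about 3 uops per iteration against a 5-uop decode width.  */
  long
  sum (const long *a, long n)
  {
    long s = 0;
    for (long i = 0; i < n; i++)
      s += a[i];
    return s;
  }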

Therefore, this patch enables loop unrolling for small loops at O2 to
keep the decoder as full as possible. It turns on RTL loop unrolling
when targetm.loop_unroll_adjust exists, at O2 and above and only when
optimizing for speed. In the x86 backend the default behavior is to
unroll small loops of at most 4 insns by a factor of 2.
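
Conceptually (a sketch of the effect only, not the unroller's actual
output), unrolling the loop above by a factor of 2 lets one cmp/jmp
pair cover two copies of the body, roughly halving the per-element
decode bubble:

  /* Sketch of the 2x-unrolled shape; the RTL unroller also emits the
     usual remainder handling for trip counts that are not a multiple
     of 2.  */
  long
  sum_unrolled (const long *a, long n)
  {
    long s = 0;
    long i = 0;
    for (; i + 1 < n; i += 2)
      {
        s += a[i];
        s += a[i + 1];
      }
    for (; i < n; i++)    /* remainder iteration */
      s += a[i];
    return s;
  }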

This improves 548.exchange2 by 9% on Icelake and by 7.4% on Zen3, with
a 0.9% code-size increase. For the other benchmarks the variations are
minor, and overall code size increases by 0.2%.

The kernel image size increases by 0.06%, and there is no impact on EEMBC.

gcc/ChangeLog:

* common/config/i386/i386-common.cc (ix86_option_optimization_table):
Enable small loop unrolling at O2 by default.
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
factor if -munroll-only-small-loops is enabled and -funroll-loops/
-funroll-all-loops are disabled.
* config/i386/i386.h (struct processor_costs): Add two fields,
small_unroll_ninsns and small_unroll_factor.
* config/i386/i386.opt: Add -munroll-only-small-loops.
* doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst:
Document -munroll-only-small-loops.
* doc/gcc/gcc-command-options/option-summary.rst: Likewise.
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable RTL loop
unrolling at -O2 and above when optimizing for speed if the
target hook loop_unroll_adjust exists.
(pass_rtl_unroll_loops::execute): Set UAP_UNROLL flag
when target hook loop_unroll_adjust exists.
* config/i386/x86-tune-costs.h: Update all processor costs
with small_unroll_ninsns = 4 and small_unroll_factor = 2.

gcc/testsuite/ChangeLog:

* gcc.dg/guality/loop-1.c: Add additional option
-mno-unroll-only-small-loops.
* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
* gcc.target/i386/pr93002.c: Likewise.

gcc/common/config/i386/i386-common.cc
gcc/config/i386/i386.cc
gcc/config/i386/i386.h
gcc/config/i386/i386.opt
gcc/config/i386/x86-tune-costs.h
gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst
gcc/doc/gcc/gcc-command-options/option-summary.rst
gcc/loop-init.cc
gcc/testsuite/gcc.dg/guality/loop-1.c
gcc/testsuite/gcc.target/i386/pr86270.c
gcc/testsuite/gcc.target/i386/pr93002.c

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 431fd0d3ad1ab3211dd605450ad5e235d97eb968..2f491b2f84bc770bfdb51b48d6800303b0db2895 100644
@@ -1803,6 +1803,7 @@ static const struct default_options ix86_option_optimization_table[] =
     /* The STC algorithm produces the smallest code at -Os, for x86.  */
     { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
       REORDER_BLOCKS_ALGORITHM_STC },
+    { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
     /* Turn off -fschedule-insns by default.  It tends to make the
        problem with not enough registers even worse.  */
     { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f8586499cd10e4ad5706fe01e8fc2a847c592451..292b32c5e99a71314d97c008ba41ec52e42253dd 100644
@@ -23827,6 +23827,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop *loop)
   unsigned i;
   unsigned mem_count = 0;
 
+  /* Unroll small size loop when unroll factor is not explicitly
+     specified.  */
+  if (!(flag_unroll_loops
+       || flag_unroll_all_loops
+       || loop->unroll))
+    {
+      nunroll = 1;
+
+      /* Any explicit -f{no-}unroll-{all-}loops turns off
+        -munroll-only-small-loops.  */
+      if (ix86_unroll_only_small_loops
+         && !OPTION_SET_P (flag_unroll_loops)
+         && loop->ninsns <= ix86_cost->small_unroll_ninsns)
+       nunroll = ix86_cost->small_unroll_factor;
+
+      return nunroll;
+    }
+
   if (!TARGET_ADJUST_UNROLL)
      return nunroll;
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index a5ad9f387f7e10e7f1dc193e5713065879253c17..3869db8f2d35754b732bc43266c9c7692e2e2222 100644
@@ -219,6 +219,11 @@ struct processor_costs {
   const char *const align_jump;                /* Jump alignment.  */
   const char *const align_label;       /* Label alignment.  */
   const char *const align_func;                /* Function alignment.  */
+
+  const unsigned small_unroll_ninsns;  /* Insn count limit for small loop
+                                          to be unrolled.  */
+  const unsigned small_unroll_factor;   /* Unroll factor for small loop to
+                                          be unrolled.  */
 };
 
 extern const struct processor_costs *ix86_cost;
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 415c52e1bb4417fa40754934b278d252a9b7896f..d6b80efa04deba3a70e3787edb5da059b238b232 100644
@@ -1246,3 +1246,7 @@ Support PREFETCHI built-in functions and code generation.
 mraoint
 Target Mask(ISA2_RAOINT) Var(ix86_isa_flags2) Save
 Support RAOINT built-in functions and code generation.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index aeaa7eb008e2e6776072c48069f63cae1ab90ec7..f01b8ee9eef190852960d6bcfa37537f3bc01a27 100644
@@ -135,6 +135,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* Processor costs (relative to an add) */
@@ -244,6 +246,8 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
   "4",                                 /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   "4",                                 /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs i486_memcpy[2] = {
@@ -354,6 +358,8 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs pentium_memcpy[2] = {
@@ -462,6 +468,8 @@ struct processor_costs pentium_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static const
@@ -563,6 +571,8 @@ struct processor_costs lakemont_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -679,6 +689,8 @@ struct processor_costs pentiumpro_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs geode_memcpy[2] = {
@@ -786,6 +798,8 @@ struct processor_costs geode_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs k6_memcpy[2] = {
@@ -896,6 +910,8 @@ struct processor_costs k6_cost = {
   "32:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "32",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -1007,6 +1023,8 @@ struct processor_costs athlon_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -1127,6 +1145,8 @@ struct processor_costs k8_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -1255,6 +1275,8 @@ struct processor_costs amdfam10_cost = {
   "32:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "32",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /*  BDVER has optimized REP instruction for medium sized blocks, but for
@@ -1376,6 +1398,8 @@ const struct processor_costs bdver_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 
@@ -1529,6 +1553,8 @@ struct processor_costs znver1_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1686,6 +1712,8 @@ struct processor_costs znver2_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 struct processor_costs znver3_cost = {
@@ -1818,6 +1846,8 @@ struct processor_costs znver3_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* This table currently replicates znver3_cost table. */
@@ -1951,6 +1981,8 @@ struct processor_costs znver4_cost = {
   "16",                                        /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
@@ -2075,6 +2107,8 @@ struct processor_costs skylake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* icelake_cost should produce code tuned for Icelake family of CPUs.
@@ -2201,6 +2235,8 @@ struct processor_costs icelake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* alderlake_cost should produce code tuned for alderlake family of CPUs.  */
@@ -2321,6 +2357,8 @@ struct processor_costs alderlake_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -2434,6 +2472,8 @@ const struct processor_costs btver1_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs btver2_memcpy[2] = {
@@ -2544,6 +2584,8 @@ const struct processor_costs btver2_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "11",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs pentium4_memcpy[2] = {
@@ -2653,6 +2695,8 @@ struct processor_costs pentium4_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs nocona_memcpy[2] = {
@@ -2765,6 +2809,8 @@ struct processor_costs nocona_cost = {
   NULL,                                        /* Jump alignment.  */
   NULL,                                        /* Label alignment.  */
   NULL,                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs atom_memcpy[2] = {
@@ -2875,6 +2921,8 @@ struct processor_costs atom_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs slm_memcpy[2] = {
@@ -2985,6 +3033,8 @@ struct processor_costs slm_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs tremont_memcpy[2] = {
@@ -3109,6 +3159,8 @@ struct processor_costs tremont_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 static stringop_algs intel_memcpy[2] = {
@@ -3219,6 +3271,8 @@ struct processor_costs intel_cost = {
   "16:8:8",                            /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
@@ -3334,6 +3388,8 @@ struct processor_costs lujiazui_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* Generic should produce code tuned for Core-i7 (and newer chips)
@@ -3453,6 +3509,8 @@ struct processor_costs generic_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
@@ -3579,5 +3637,7 @@ struct processor_costs core_cost = {
   "16:11:8",                           /* Jump alignment.  */
   "0:0:8",                             /* Label alignment.  */
   "16",                                        /* Func alignment.  */
+  4,                                   /* Small unroll limit.  */
+  2,                                   /* Small unroll factor.  */
 };
 
diff --git a/gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst b/gcc/doc/gcc/gcc-command-options/machine-dependent-options/x86-options.rst
index 6f015e9e96a3eb510312c40efdf0259c9f539d8e..5e18fd77f870b8885d5cbc75437a89a7423ed60e 100644
@@ -1614,3 +1614,9 @@ on x86-64 processors in 64-bit environments.
 .. option:: -mdirect-extern-access
 
   Default setting; overrides :option:`-mno-direct-extern-access`.
+
+.. option:: -munroll-only-small-loops
+  Controls conservative small loop unrolling.  It is enabled by default
+  at O2 and unrolls loops with at most 4 insns by a factor of 2.  Any
+  explicit -f[no-]unroll-[all-]loops option disables this flag to avoid
+  unintended unrolling behavior.
diff --git a/gcc/doc/gcc/gcc-command-options/option-summary.rst b/gcc/doc/gcc/gcc-command-options/option-summary.rst
index b90b6600d700bbbc276e865ecac01cdb1989df94..02898fb65cd57983e8bd8808b62464fd9a952c8e 100644
@@ -1490,7 +1490,8 @@ in the following sections.
   :option:`-mgeneral-regs-only`  :option:`-mcall-ms2sysv-xlogues` :option:`-mrelax-cmpxchg-loop` |gol|
   :option:`-mindirect-branch=choice`  :option:`-mfunction-return=choice` |gol|
   :option:`-mindirect-branch-register` :option:`-mharden-sls=choice` |gol|
-  :option:`-mindirect-branch-cs-prefix` :option:`-mneeded` :option:`-mno-direct-extern-access`
+  :option:`-mindirect-branch-cs-prefix` :option:`-mneeded` :option:`-mno-direct-extern-access` |gol|
+  :option:`-munroll-only-small-loops`
 
   *x86 Windows Options*
 
diff --git a/gcc/loop-init.cc b/gcc/loop-init.cc
index b9e07973dd6a905670eda7a9a45ace30ff0fcaf2..9789efa1e11a740956ffa8e218d86262e31ece91 100644
@@ -565,9 +565,12 @@ public:
   {}
 
   /* opt_pass methods: */
-  bool gate (function *) final override
+  bool gate (function *fun) final override
     {
-      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll);
+      return (flag_unroll_loops || flag_unroll_all_loops || cfun->has_unroll
+             || (targetm.loop_unroll_adjust
+                 && optimize >= 2
+                 && optimize_function_for_speed_p (fun)));
     }
 
   unsigned int execute (function *) final override;
@@ -583,7 +586,8 @@ pass_rtl_unroll_loops::execute (function *fun)
       if (dump_file)
        df_dump (dump_file);
 
-      if (flag_unroll_loops)
+      if (flag_unroll_loops
+         || targetm.loop_unroll_adjust)
        flags |= UAP_UNROLL;
       if (flag_unroll_all_loops)
        flags |= UAP_UNROLL_ALL;
diff --git a/gcc/testsuite/gcc.dg/guality/loop-1.c b/gcc/testsuite/gcc.dg/guality/loop-1.c
index 1b1f6d32271b76a86351fc0c236a6e4acb6d55fd..a32ea445a3f7feb0b2af9a364e7cc297d8a6e440 100644
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-fno-tree-scev-cprop -fno-tree-vectorize -g" } */
+/* { dg-additional-options "-mno-unroll-only-small-loops" { target ia32 } } */
+
 
 #include "../nop.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/pr86270.c b/gcc/testsuite/gcc.target/i386/pr86270.c
index 81841ef5bd70e4ea4ad103fda6f1131f47703098..cbc9fbb0450485a7409a82e97152fa6620f4c1cb 100644
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 
 int *a;
 long len;
diff --git a/gcc/testsuite/gcc.target/i386/pr93002.c b/gcc/testsuite/gcc.target/i386/pr93002.c
index 0248fcc00a5a63c52f221ffa4294405724d10827..f75a847f75dfab39f5171b126f8b82c405ee7137 100644
@@ -1,6 +1,6 @@
 /* PR target/93002 */
 /* { dg-do compile } */
-/* { dg-options "-O2" } */
+/* { dg-options "-O2 -mno-unroll-only-small-loops" } */
 /* { dg-final { scan-assembler-not "cmp\[^\n\r]*-1" } } */
 
 volatile int sink;