add masked-epilogue tuning

author Richard Biener <rguenther@suse.de>

Sun, 25 May 2025 17:29:04 +0000 (19:29 +0200)

committer Richard Biener <rguenth@gcc.gnu.org>

Tue, 8 Jul 2025 08:12:30 +0000 (10:12 +0200)
author Richard Biener <rguenther@suse.de>
Sun, 25 May 2025 17:29:04 +0000 (19:29 +0200)
committer Richard Biener <rguenth@gcc.gnu.org>
Tue, 8 Jul 2025 08:12:30 +0000 (10:12 +0200)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index fd3f35de14d3d14633f289bcaeceaf1201a40648..ad7360ec71a43b865e9374fd26ff6c2e76a447a0 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26295,6 +26295,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
        && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16)
      m_suggested_epilogue_mode = V8QImode;
  
+  /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use
+     a masked epilogue if that doesn't seem detrimental.  */
+  if (loop_vinfo
+      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+      && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
+      && !OPTION_SET_P (param_vect_partial_vector_usage))
+    {
+      bool avoid = false;
+      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+       {
+         unsigned int peel_niter
+           = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+         if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+           peel_niter += 1;
+         /* When we know the number of scalar iterations of the epilogue,
+            avoid masking when a single vector epilog iteration handles
+            it in full.  */
+         if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter)
+                        % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()))
+           avoid = true;
+       }
+      if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo))))
+       for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo))
+         {
+           if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+             ;
+           else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+             ;
+           else
+             {
+               int loop_depth
+                   = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
+                                         DDR_LOOP_NEST (ddr));
+               if (DDR_NUM_DIST_VECTS (ddr) == 1
+                   && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0)
+                 {
+                   /* Avoid the case when there's an outer loop that might
+                      traverse a multi-dimensional array with the inner
+                      loop just executing the masked epilogue with a
+                      read-write where the next outer iteration might
+                      read from the masked part of the previous write,
+                      'n' filling half a vector.
+                        for (j = 0; j < m; ++j)
+                          for (i = 0; i < n; ++i)
+                            a[j][i] = c * a[j][i];  */
+                   avoid = true;
+                   break;
+                 }
+             }
+         }
+      if (!avoid)
+       {
+         m_suggested_epilogue_mode = loop_vinfo->vector_mode;
+         m_masked_epilogue = 1;
+       }
+    }
+
    vector_costs::finish_cost (scalar_costs);
  }
  
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def

index 91cdca7fbfc26ceb221048dea26cc6b1f9c952a4..4773e5dd5ad1e5daa434e6d0b6d1e7feae9677d1 100644 (file)
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -639,6 +639,11 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
  DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
           m_ZNVER4 | m_ZNVER5)
  
+/* X86_TUNE_AVX512_MASKED_EPILOGUES: Use a masked vector epilogue
+   when that does not seem detrimental.  */
+DEF_TUNE (X86_TUNE_AVX512_MASKED_EPILOGUES, "avx512_masked_epilogues",
+         m_ZNVER4 | m_ZNVER5)
+
  /*****************************************************************************/
  /*****************************************************************************/
  /* Historical relics: tuning flags that helps a specific old CPU designs     */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c

index 0ee610f5e3ef850b7efe00482b9fcabc9236f764..e88ab30c770f2bc983d0e5c47fa4292a97c7a6e7 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-3.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-O3 -mavx512bw -mtune=znver4 -fdump-tree-vect-optimized" } */
+/* { dg-options "-O3 -mavx512bw -mtune=znver4 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
  
  int test (signed char *data, int n)
  {
diff --git a/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c

new file mode 100644 (file)

index 0000000..55519aa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void bar (double *a, double *b, double c, int n, int m)
+{
+  for (int j = 0; j < m; ++j)
+    for (int i = 0; i < n; ++i)
+      a[j*n + i] = b[j*n + i] + c;
+}
+
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c

new file mode 100644 (file)

index 0000000..3dc28b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void foo (double *a, double b, double c, int n, int m)
+{
+  for (int j = 0; j < m; ++j)
+    for (int i = 0; i < n; ++i)
+      a[j*n + i] = a[j*n + i] * b + c;
+}
+
+/* We do not want to use a masked epilogue for the inner loop as the next
+   outer iteration will possibly immediately read from elements masked off
+   by the previous inner loop epilogue and that never forwards.  */
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using 32 byte vectors" "vect" } } */
author	Richard Biener <rguenther@suse.de>
	Sun, 25 May 2025 17:29:04 +0000 (19:29 +0200)
committer	Richard Biener <rguenth@gcc.gnu.org>
	Tue, 8 Jul 2025 08:12:30 +0000 (10:12 +0200)
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/config/i386/x86-tune.def		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/vect-epilogues-3.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/vect-mask-epilogue-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/i386/vect-mask-epilogue-2.c	[new file with mode: 0644]	patch \| blob