Enhance gather fallback for PR65518 with SLP

author Richard Biener <rguenther@suse.de>

Wed, 16 Oct 2024 08:09:36 +0000 (10:09 +0200)

committer Richard Biener <rguenth@gcc.gnu.org>

Wed, 16 Oct 2024 11:04:38 +0000 (13:04 +0200)
author Richard Biener <rguenther@suse.de>
Wed, 16 Oct 2024 08:09:36 +0000 (10:09 +0200)
committer Richard Biener <rguenth@gcc.gnu.org>
Wed, 16 Oct 2024 11:04:38 +0000 (13:04 +0200)
diff --git a/gcc/testsuite/gcc.dg/vect/pr65518.c b/gcc/testsuite/gcc.dg/vect/pr65518.c

index 189a65534f61372cfaf9777f246f7cf89f2d93bd..6d8515061694abaac70498d64ea21a0f6048c447 100644 (file)
--- a/gcc/testsuite/gcc.dg/vect/pr65518.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65518.c
@@ -1,54 +1,55 @@
-#include "tree-vect.h"\r
-\r
-#if VECTOR_BITS > 256\r
-#define NINTS (VECTOR_BITS / 32)\r
-#else\r
-#define NINTS 8\r
-#endif\r
-\r
-#define N (NINTS * 2)\r
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)\r
-\r
-extern void abort (void);\r
-\r
-typedef struct giga\r
-{\r
-  unsigned int g[N];\r
-} giga;\r
-\r
-unsigned long __attribute__((noinline,noclone))\r
-addfst(giga const *gptr, int num)\r
-{\r
-  unsigned int retval = 0;\r
-  int i;\r
-  for (i = 0; i < num; i++)\r
-    retval += gptr[i].g[0];\r
-  return retval;\r
-}\r
-\r
-int main ()\r
-{\r
-  struct giga g[NINTS];\r
-  unsigned int n = 1;\r
-  int i, j;\r
-  check_vect ();\r
-  for (i = 0; i < NINTS; ++i)\r
-    for (j = 0; j < N; ++j)\r
-      {\r
-       g[i].g[j] = n++;\r
-       __asm__ volatile ("");\r
-      }\r
-  if (addfst (g, NINTS) != RESULT)\r
-    abort ();\r
-  return 0;\r
-}\r
-\r
-/* We don't want to vectorize the single-element interleaving in the way\r
-   we currently do that (without ignoring not needed vectors in the\r
-   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very\r
-   sub-optimal and causes memory explosion (even though the cost model\r
-   should reject that in the end).  */\r
-\r
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */\r
-/* We end up using gathers for the strided load on RISC-V which would be OK.  */\r
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */\r
+#include "tree-vect.h"
+
+#if VECTOR_BITS > 256
+#define NINTS (VECTOR_BITS / 32)
+#else
+#define NINTS 8
+#endif
+
+#define N (NINTS * 2)
+#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
+
+extern void abort (void);
+
+typedef struct giga
+{
+  unsigned int g[N];
+} giga;
+
+unsigned long __attribute__((noinline,noclone))
+addfst(giga const *gptr, int num)
+{
+  unsigned int retval = 0;
+  int i;
+  for (i = 0; i < num; i++)
+    retval += gptr[i].g[0];
+  return retval;
+}
+
+int main ()
+{
+  struct giga g[NINTS];
+  unsigned int n = 1;
+  int i, j;
+  check_vect ();
+  for (i = 0; i < NINTS; ++i)
+    for (j = 0; j < N; ++j)
+      {
+       g[i].g[j] = n++;
+       __asm__ volatile ("");
+      }
+  if (addfst (g, NINTS) != RESULT)
+    abort ();
+  return 0;
+}
+
+/* We don't want to vectorize the single-element interleaving in the way
+   we currently do that (without ignoring not needed vectors in the
+   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
+   sub-optimal and causes memory explosion (even though the cost model
+   should reject that in the end).  */
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
+/* We should end up using gathers for the strided load on RISC-V.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */
+/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc

index 9b14b96cb5a6ae0aa38ae3bc4ed68efce936d650..6967d50288e9e37de40a0e3e7afab30aaa49c247 100644 (file)
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2081,6 +2081,35 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
           else
             *memory_access_type = VMAT_CONTIGUOUS;
  
+         /* If this is single-element interleaving with an element
+            distance that leaves unused vector loads around punt - we
+            at least create very sub-optimal code in that case (and
+            blow up memory, see PR65518).  */
+         if (loop_vinfo
+             && *memory_access_type == VMAT_CONTIGUOUS
+             && single_element_p
+             && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+           {
+             if (SLP_TREE_LANES (slp_node) == 1)
+               {
+                 *memory_access_type = VMAT_ELEMENTWISE;
+                 overrun_p = false;
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "single-element interleaving not supported "
+                                    "for not adjacent vector loads, using "
+                                    "elementwise access\n");
+               }
+             else
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "single-element interleaving not supported "
+                                    "for not adjacent vector loads\n");
+                 return false;
+               }
+           }
+
           overrun_p = loop_vinfo && gap != 0;
           if (overrun_p && vls_type != VLS_LOAD)
             {
@@ -2149,6 +2178,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                                  "Peeling for outer loop is not supported\n");
               return false;
             }
+
           /* Peeling for gaps assumes that a single scalar iteration
              is enough to make sure the last vector iteration doesn't
              access excess elements.  */
@@ -2179,34 +2209,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                   return false;
                 }
             }
-
-         /* If this is single-element interleaving with an element
-            distance that leaves unused vector loads around punt - we
-            at least create very sub-optimal code in that case (and
-            blow up memory, see PR65518).  */
-         if (loop_vinfo
-             && *memory_access_type == VMAT_CONTIGUOUS
-             && single_element_p
-             && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
-           {
-             if (SLP_TREE_LANES (slp_node) == 1)
-               {
-                 *memory_access_type = VMAT_ELEMENTWISE;
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "single-element interleaving not supported "
-                                    "for not adjacent vector loads, using "
-                                    "elementwise access\n");
-               }
-             else
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "single-element interleaving not supported "
-                                    "for not adjacent vector loads\n");
-                 return false;
-               }
-           }
         }
      }
    else
author	Richard Biener <rguenther@suse.de>
	Wed, 16 Oct 2024 08:09:36 +0000 (10:09 +0200)
committer	Richard Biener <rguenth@gcc.gnu.org>
	Wed, 16 Oct 2024 11:04:38 +0000 (13:04 +0200)
gcc/testsuite/gcc.dg/vect/pr65518.c		patch | blob | blame | history
gcc/tree-vect-stmts.cc		patch | blob | blame | history