-#include "tree-vect.h"\r
-\r
-#if VECTOR_BITS > 256\r
-#define NINTS (VECTOR_BITS / 32)\r
-#else\r
-#define NINTS 8\r
-#endif\r
-\r
-#define N (NINTS * 2)\r
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)\r
-\r
-extern void abort (void);\r
-\r
-typedef struct giga\r
-{\r
- unsigned int g[N];\r
-} giga;\r
-\r
-unsigned long __attribute__((noinline,noclone))\r
-addfst(giga const *gptr, int num)\r
-{\r
- unsigned int retval = 0;\r
- int i;\r
- for (i = 0; i < num; i++)\r
- retval += gptr[i].g[0];\r
- return retval;\r
-}\r
-\r
-int main ()\r
-{\r
- struct giga g[NINTS];\r
- unsigned int n = 1;\r
- int i, j;\r
- check_vect ();\r
- for (i = 0; i < NINTS; ++i)\r
- for (j = 0; j < N; ++j)\r
- {\r
- g[i].g[j] = n++;\r
- __asm__ volatile ("");\r
- }\r
- if (addfst (g, NINTS) != RESULT)\r
- abort ();\r
- return 0;\r
-}\r
-\r
-/* We don't want to vectorize the single-element interleaving in the way\r
- we currently do that (without ignoring not needed vectors in the\r
- gap between gptr[0].g[0] and gptr[1].g[0]), because that's very\r
- sub-optimal and causes memory explosion (even though the cost model\r
- should reject that in the end). */\r
-\r
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */\r
-/* We end up using gathers for the strided load on RISC-V which would be OK. */\r
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */\r
+#include "tree-vect.h"
+
+#if VECTOR_BITS > 256
+#define NINTS (VECTOR_BITS / 32)
+#else
+#define NINTS 8
+#endif
+
+#define N (NINTS * 2)
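+/* main stores g[i].g[j] = i * N + j + 1, so addfst sums
+   g[i].g[0] = i * N + 1 for i in [0, NINTS), i.e.
+   N * NINTS * (NINTS - 1) / 2 + NINTS.  */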
+#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
+
+extern void abort (void);
+
+typedef struct giga
+{
+ unsigned int g[N];
+} giga;
+
+unsigned long __attribute__((noinline,noclone))
+addfst(giga const *gptr, int num)
+{
+ unsigned int retval = 0;
+ int i;
+ for (i = 0; i < num; i++)
+ retval += gptr[i].g[0];
+ return retval;
+}
+
+int main ()
+{
+ struct giga g[NINTS];
+ unsigned int n = 1;
+ int i, j;
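+ /* Provided by tree-vect.h; exits the test early at runtime if the
+ target cannot execute the vector instructions under test.  */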
+ check_vect ();
+ for (i = 0; i < NINTS; ++i)
+ for (j = 0; j < N; ++j)
+ {
+ g[i].g[j] = n++;
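+ /* The empty volatile asm keeps the compiler from vectorizing or
+ otherwise collapsing this initialization loop (a common idiom in
+ the vect testsuite), so the dump scans below stay deterministic.  */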
+ __asm__ volatile ("");
+ }
+ if (addfst (g, NINTS) != RESULT)
+ abort ();
+ return 0;
+}
+
+/* We don't want to vectorize the single-element interleaving in the way
+   we currently do (without skipping the unneeded vectors in the gap
+   between gptr[0].g[0] and gptr[1].g[0]), because that is very
+   sub-optimal and causes memory explosion (even though the cost model
+   should reject it in the end).  */
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
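+/* The expected count of 2 assumes one "vectorized 0 loops" line per
+   function in the dump, i.e. one for addfst and one for main.  */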
+/* We should end up using gathers for the strided load on RISC-V. */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */
+/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
else
*memory_access_type = VMAT_CONTIGUOUS;
+ /* If this is single-element interleaving with an element
+ distance that leaves unused vector loads around, punt - at
+ best we create very sub-optimal code in that case (and
+ blow up memory, see PR65518).  */
+ if (loop_vinfo
+ && *memory_access_type == VMAT_CONTIGUOUS
+ && single_element_p
+ && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ if (SLP_TREE_LANES (slp_node) == 1)
+ {
+ *memory_access_type = VMAT_ELEMENTWISE;
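+ /* Elementwise accesses load each scalar separately and never
+ read past the end of the group, so no peeling for gaps is
+ needed.  */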
+ overrun_p = false;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads, using "
+ "elementwise access\n");
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads\n");
+ return false;
+ }
+ }
+
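+ /* A nonzero gap means a full vector load would touch elements
+ past the group; record that so the peeling-for-gaps checks
+ below can reject the unsupported cases.  */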
overrun_p = loop_vinfo && gap != 0;
if (overrun_p && vls_type != VLS_LOAD)
{
"Peeling for outer loop is not supported\n");
return false;
}
+
/* Peeling for gaps assumes that a single scalar iteration
is enough to make sure the last vector iteration doesn't
access excess elements. */
return false;
}
}
-
- /* If this is single-element interleaving with an element
- distance that leaves unused vector loads around punt - we
- at least create very sub-optimal code in that case (and
- blow up memory, see PR65518). */
- if (loop_vinfo
- && *memory_access_type == VMAT_CONTIGUOUS
- && single_element_p
- && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
- {
- if (SLP_TREE_LANES (slp_node) == 1)
- {
- *memory_access_type = VMAT_ELEMENTWISE;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads, using "
- "elementwise access\n");
- }
- else
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads\n");
- return false;
- }
- }
}
}
else