The following adjusts two very similar testcases where, when
vector cost comparison is enabled and generic tuning is used,
the SSE vector size is chosen for the vector epilogue, as that
reduces the number of iterations that may be left for the scalar
epilogue following it and thus speeds up the overall epilogue
processing in the majority of cases.  I have chosen to duplicate
the testcases for --param ix86-vect-compare-costs=0 and =1.
* gcc.target/i386/vect-epilogues-2.c: Add
--param ix86-vect-compare-costs=0.
* gcc.target/i386/vect-epilogues-2b.c: Duplicate from
gcc.target/i386/vect-epilogues-2.c, add
--param ix86-vect-compare-costs=1 and adjust expected
vectorization.
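* gcc.target/i386/vect-pr113078.c: Add
--param ix86-vect-compare-costs=0.
* gcc.target/i386/vect-pr113078b.c: Duplicate from
gcc.target/i386/vect-pr113078.c, add -mtune=generic and
--param ix86-vect-compare-costs=1 and adjust expected
vectorization.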
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512bw -mtune=generic -fdump-tree-vect-optimized" } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic --param ix86-vect-compare-costs=0 -fdump-tree-vect-optimized" } */
int test (signed char *data, int n)
{
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512bw -mtune=generic --param ix86-vect-compare-costs=1 -fdump-tree-vect-optimized" } */
+
+int test (signed char *data, int n)
+{
+ int sum = 0;
+ for (int i = 0; i < n; ++i)
+ sum += data[i];
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump "loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "loop vectorized using 8 byte vectors" "vect" { target { ! ia32 } } } } */
/* { dg-do compile } */
-/* { dg-options "-O3 -mavx512vl" } */
+/* { dg-options "-O3 -mavx512vl --param ix86-vect-compare-costs=0" } */
int
foo (int n, int* p, int* pi)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -mtune=generic --param ix86-vect-compare-costs=1" } */
+
+int
+foo (int n, int* p, int* pi)
+{
+ int sum = 0;
+ for (int i = 0; i != n; i++)
+ {
+ if (pi[i] > 0)
+ sum -= p[i];
+ }
+ return sum;
+}
+
+/* We vectorize with 64 byte vectors and 16 byte vector epilog which
+ we completely peel. */
+/* { dg-final { scan-assembler-times "vpsub\[^\r\n\]*%k" 4 } } */