--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: three statements per iteration feed one reduction on RES.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_2 char *restrict c,
+ SIGNEDNESS_2 char *restrict d,
+ SIGNEDNESS_1 int *restrict e)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ res += a[i] * b[i]; /* First char * char product accumulated into the int sum.  */
+ res += c[i] * d[i]; /* Second char * char product on the same accumulator.  */
+ res += e[i]; /* Plain int addition in the same reduction chain.  */
+ }
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_2 char c[N], d[N];
+ SIGNEDNESS_1 int e[N];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+
+ #pragma GCC novector
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ c[i] = BASE + i * 2;
+ d[i] = BASE + OFFSET + i * 3;
+ e[i] = i;
+ expected += a[i] * b[i]; /* Mirror the three accumulations performed in f.  */
+ expected += c[i] * d[i];
+ expected += e[i];
+ }
+
+ if (f (0x12345, a, b, c, d, e) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: products of three different input types plus plain additions feed one reduction on RES.  */
+fn (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_3 char *restrict c,
+ SIGNEDNESS_3 char *restrict d,
+ SIGNEDNESS_4 short *restrict e,
+ SIGNEDNESS_4 short *restrict f,
+ SIGNEDNESS_1 int *restrict g)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ res += a[i] * b[i]; /* SIGNEDNESS_2 char product.  */
+ res += i + 1; /* Addition that depends only on the loop index.  */
+ res += c[i] * d[i]; /* SIGNEDNESS_3 char product.  */
+ res += e[i] * f[i]; /* SIGNEDNESS_4 short product.  */
+ res += g[i]; /* Plain int addition.  */
+ }
+ return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of fn.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_3 char c[N], d[N];
+ SIGNEDNESS_4 short e[N], f[N];
+ SIGNEDNESS_1 int g[N];
+ int expected = 0x12345; /* Same seed that is passed to fn as the initial RES.  */
+
+#pragma GCC novector
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE2 + i * 5;
+ b[i] = BASE2 + OFFSET + i * 4;
+ c[i] = BASE3 + i * 2;
+ d[i] = BASE3 + OFFSET + i * 3;
+ e[i] = BASE4 + i * 6;
+ f[i] = BASE4 + OFFSET + i * 5;
+ g[i] = i;
+ expected += a[i] * b[i]; /* Mirror the five accumulations performed in fn, in the same order.  */
+ expected += i + 1;
+ expected += c[i] * d[i];
+ expected += e[i] * f[i];
+ expected += g[i];
+ }
+
+ if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: an absolute difference, a short product and an int addition feed one reduction on RES.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_3 short *restrict c,
+ SIGNEDNESS_3 short *restrict d,
+ SIGNEDNESS_1 int *restrict e)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ short diff = a[i] - b[i]; /* Difference of the two char inputs.  */
+ SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff; /* Its absolute value; the dg-final scan of this test looks for SAD_EXPR.  */
+ res += abs;
+ res += c[i] * d[i]; /* short * short product; the dg-final scan looks for DOT_PROD_EXPR.  */
+ res += e[i]; /* Plain int addition in the same chain.  */
+ }
+ return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_3 short c[N], d[N];
+ SIGNEDNESS_1 int e[N];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+
+#pragma GCC novector
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE2 + i * 5;
+ b[i] = BASE2 - i * 4; /* B moves in the opposite direction from A.  */
+ c[i] = BASE3 + i * 2;
+ d[i] = BASE3 + OFFSET + i * 3;
+ e[i] = i;
+ short diff = a[i] - b[i]; /* Same absolute-difference computation as in f.  */
+ SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+ expected += abs;
+ expected += c[i] * d[i];
+ expected += e[i];
+ }
+
+ if (f (0x12345, a, b, c, d, e) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: sixteen char products per iteration accumulate into RES; A and B advance by STEP.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *a,
+ SIGNEDNESS_2 char *b,
+ int step, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[0] * b[0]; /* Products of elements 0..15 relative to the current A and B.  */
+ res += a[1] * b[1];
+ res += a[2] * b[2];
+ res += a[3] * b[3];
+ res += a[4] * b[4];
+ res += a[5] * b[5];
+ res += a[6] * b[6];
+ res += a[7] * b[7];
+ res += a[8] * b[8];
+ res += a[9] * b[9];
+ res += a[10] * b[10];
+ res += a[11] * b[11];
+ res += a[12] * b[12];
+ res += a[13] * b[13];
+ res += a[14] * b[14];
+ res += a[15] * b[15];
+
+ a += step; /* Move both inputs forward by STEP elements for the next iteration.  */
+ b += step;
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[100], b[100];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+ int step = 16; /* With n = 2 the kernel reads elements 0..31 of the 100-element arrays.  */
+ int n = 2;
+ int t = 0; /* Running offset that stands in for the pointer advance done in f.  */
+
+#pragma GCC novector
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ }
+
+#pragma GCC novector
+ for (int i = 0; i < n; i++)
+ {
+ expected += a[t + 0] * b[t + 0]; /* Mirror the sixteen accumulations performed in f.  */
+ expected += a[t + 1] * b[t + 1];
+ expected += a[t + 2] * b[t + 2];
+ expected += a[t + 3] * b[t + 3];
+ expected += a[t + 4] * b[t + 4];
+ expected += a[t + 5] * b[t + 5];
+ expected += a[t + 6] * b[t + 6];
+ expected += a[t + 7] * b[t + 7];
+ expected += a[t + 8] * b[t + 8];
+ expected += a[t + 9] * b[t + 9];
+ expected += a[t + 10] * b[t + 10];
+ expected += a[t + 11] * b[t + 11];
+ expected += a[t + 12] * b[t + 12];
+ expected += a[t + 13] * b[t + 13];
+ expected += a[t + 14] * b[t + 14];
+ expected += a[t + 15] * b[t + 15];
+ t += step;
+ }
+
+ if (f (0x12345, a, b, step, n) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: five char products per iteration, read in groups of five elements, accumulate into RES.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *a,
+ SIGNEDNESS_2 char *b,
+ int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[5 * i + 0] * b[5 * i + 0]; /* Elements 0..4 of the I-th group of five.  */
+ res += a[5 * i + 1] * b[5 * i + 1];
+ res += a[5 * i + 2] * b[5 * i + 2];
+ res += a[5 * i + 3] * b[5 * i + 3];
+ res += a[5 * i + 4] * b[5 * i + 4];
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[100], b[100];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+ int n = 18; /* Highest index read is 5 * 17 + 4 = 89, inside the 100-element arrays.  */
+
+#pragma GCC novector
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ }
+
+#pragma GCC novector
+ for (int i = 0; i < n; i++)
+ {
+ expected += a[5 * i + 0] * b[5 * i + 0]; /* Mirror the five accumulations performed in f.  */
+ expected += a[5 * i + 1] * b[5 * i + 1];
+ expected += a[5 * i + 2] * b[5 * i + 2];
+ expected += a[5 * i + 3] * b[5 * i + 3];
+ expected += a[5 * i + 4] * b[5 * i + 4];
+ }
+
+ if (f (0x12345, a, b, n) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 5 "vect" } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: eight short products per iteration accumulate into RES; A and B advance by STEP.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 short *a,
+ SIGNEDNESS_2 short *b,
+ int step, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[0] * b[0]; /* Products of elements 0..7 relative to the current A and B.  */
+ res += a[1] * b[1];
+ res += a[2] * b[2];
+ res += a[3] * b[3];
+ res += a[4] * b[4];
+ res += a[5] * b[5];
+ res += a[6] * b[6];
+ res += a[7] * b[7];
+
+ a += step; /* Move both inputs forward by STEP elements for the next iteration.  */
+ b += step;
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 short a[100], b[100];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+ int step = 8; /* With n = 2 the kernel reads elements 0..15 of the 100-element arrays.  */
+ int n = 2;
+ int t = 0; /* Running offset that stands in for the pointer advance done in f.  */
+
+#pragma GCC novector
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ }
+
+#pragma GCC novector
+ for (int i = 0; i < n; i++)
+ {
+ expected += a[t + 0] * b[t + 0]; /* Mirror the eight accumulations performed in f.  */
+ expected += a[t + 1] * b[t + 1];
+ expected += a[t + 2] * b[t + 2];
+ expected += a[t + 3] * b[t + 3];
+ expected += a[t + 4] * b[t + 4];
+ expected += a[t + 5] * b[t + 5];
+ expected += a[t + 6] * b[t + 6];
+ expected += a[t + 7] * b[t + 7];
+ t += step;
+ }
+
+ if (f (0x12345, a, b, step, n) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: three short products per iteration, read in groups of three elements, accumulate into RES.  */
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 short *a,
+ SIGNEDNESS_2 short *b,
+ int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[3 * i + 0] * b[3 * i + 0]; /* Elements 0..2 of the I-th group of three.  */
+ res += a[3 * i + 1] * b[3 * i + 1];
+ res += a[3 * i + 2] * b[3 * i + 2];
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void) /* Driver: fill the inputs, build a reference sum, compare it with the result of f.  */
+{
+ check_vect ();
+
+ SIGNEDNESS_2 short a[100], b[100];
+ int expected = 0x12345; /* Same seed that is passed to f as the initial RES.  */
+ int n = 18; /* Highest index read is 3 * 17 + 2 = 53, inside the 100-element arrays.  */
+
+#pragma GCC novector
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ }
+
+#pragma GCC novector
+ for (int i = 0; i < n; i++)
+ {
+ expected += a[3 * i + 0] * b[3 * i + 0]; /* Mirror the three accumulations performed in f.  */
+ expected += a[3 * i + 1] * b[3 * i + 1];
+ expected += a[3 * i + 2] * b[3 * i + 2];
+ }
+
+ if (f (0x12345, a, b, n) != expected)
+ __builtin_abort (); /* A mismatch between kernel and reference fails the test.  */
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 3 "vect" { target vect_sdot_hi } } } */
--- /dev/null
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0 -fdump-tree-optimized" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa)) /* Kernel under test: sixteen independent accumulators, each fed by one char product per iteration.  */
+f (SIGNEDNESS_1 int res0,
+ SIGNEDNESS_1 int res1,
+ SIGNEDNESS_1 int res2,
+ SIGNEDNESS_1 int res3,
+ SIGNEDNESS_1 int res4,
+ SIGNEDNESS_1 int res5,
+ SIGNEDNESS_1 int res6,
+ SIGNEDNESS_1 int res7,
+ SIGNEDNESS_1 int res8,
+ SIGNEDNESS_1 int res9,
+ SIGNEDNESS_1 int resA,
+ SIGNEDNESS_1 int resB,
+ SIGNEDNESS_1 int resC,
+ SIGNEDNESS_1 int resD,
+ SIGNEDNESS_1 int resE,
+ SIGNEDNESS_1 int resF,
+ SIGNEDNESS_2 char *a,
+ SIGNEDNESS_2 char *b)
+{
+ for (int i = 0; i < 64; i += 16) /* Four iterations, each consuming sixteen consecutive elements.  */
+ {
+ res0 += a[i + 0x00] * b[i + 0x00]; /* Lane K of each group of sixteen goes to accumulator resK.  */
+ res1 += a[i + 0x01] * b[i + 0x01];
+ res2 += a[i + 0x02] * b[i + 0x02];
+ res3 += a[i + 0x03] * b[i + 0x03];
+ res4 += a[i + 0x04] * b[i + 0x04];
+ res5 += a[i + 0x05] * b[i + 0x05];
+ res6 += a[i + 0x06] * b[i + 0x06];
+ res7 += a[i + 0x07] * b[i + 0x07];
+ res8 += a[i + 0x08] * b[i + 0x08];
+ res9 += a[i + 0x09] * b[i + 0x09];
+ resA += a[i + 0x0A] * b[i + 0x0A];
+ resB += a[i + 0x0B] * b[i + 0x0B];
+ resC += a[i + 0x0C] * b[i + 0x0C];
+ resD += a[i + 0x0D] * b[i + 0x0D];
+ resE += a[i + 0x0E] * b[i + 0x0E];
+ resF += a[i + 0x0F] * b[i + 0x0F];
+ }
+
+ return res0 ^ res1 ^ res2 ^ res3 ^ res4 ^ res5 ^ res6 ^ res7 ^ /* XOR keeps all sixteen accumulators live in the result.  */
+ res8 ^ res9 ^ resA ^ resB ^ resC ^ resD ^ resE ^ resF;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-not "DOT_PROD_EXPR" "optimized" } } */
if (!gimple_extract_op (orig_stmt_info->stmt, &op))
gcc_unreachable ();
- bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
if (reduction_type == EXTRACT_LAST_REDUCTION)
/* No extra instructions are needed in the prologue. The loop body
operations are costed in vectorizable_condition. */
initial result of the data reduction, initial value of the index
reduction. */
prologue_stmts = 4;
- else if (emulated_mixed_dot_prod)
- /* We need the initial reduction value and two invariants:
- one that contains the minimum signed value and one that
- contains half of its negative. */
- prologue_stmts = 3;
else
+ /* We need the initial reduction value. */
prologue_stmts = 1;
prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
scalar_to_vec, stmt_info, 0,
}
}
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+ the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
+ and the analysis is for slp if SLP_NODE is not NULL.
+
+ Return true if the operation is accepted; in that case its loop-body cost
+ (and, for an emulated mixed-sign dot-product, an extra prologue cost) has
+ been recorded and STMT_INFO has been marked to be transformed as a
+ reduction. Return false otherwise.
+
+ For a lane-reducing operation, the loop reduction path that it lies in
+ may contain normal operations, or other lane-reducing operations of
+ different input type size, for example:
+
+ int sum = 0;
+ for (i)
+ {
+ ...
+ sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += w[i]; // widen-sum <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+ sum += n[i]; // normal <vector(4) int>
+ ...
+ }
+
+ Vectorization factor is essentially determined by operation whose input
+ vectype has the most lanes ("vector(16) char" in the example), while we
+ need to choose input vectype with the least lanes ("vector(4) int" in the
+ example) to determine effective number of vector reduction PHIs. */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+ slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+ gimple *stmt = stmt_info->stmt;
+
+ if (!lane_reducing_stmt_p (stmt))
+ return false;
+
+ tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+ if (!INTEGRAL_TYPE_P (type))
+ return false;
+
+ /* Do not try to vectorize bit-precision reductions. */
+ if (!type_has_mode_precision_p (type))
+ return false;
+
+ stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+ /* TODO: Support lane-reducing operation that does not directly participate
+ in loop reduction. */
+ if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+ return false;
+
+ /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+ recognized. */
+ gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+ gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+ /* Validate every operand and settle its vector type. */
+ for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+ {
+ stmt_vec_info def_stmt_info;
+ slp_tree slp_op;
+ tree op;
+ tree vectype;
+ enum vect_def_type dt;
+
+ if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+ &slp_op, &dt, &vectype, &def_stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "use not simple.\n");
+ return false;
+ }
+
+ if (!vectype)
+ {
+ vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+ slp_op);
+ if (!vectype)
+ return false;
+ }
+
+ if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
+ /* The reduction operand itself is allowed to be a cycle def. */
+ if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+ continue;
+
+ /* There should be at most one cycle def in the stmt. */
+ if (VECTORIZABLE_CYCLE_DEF (dt))
+ return false;
+ }
+
+ tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+
+ gcc_assert (vectype_in);
+
+ /* Compute number of effective vector statements for costing. */
+ unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
+ vectype_in);
+ gcc_assert (ncopies_for_cost >= 1);
+
+ if (vect_is_emulated_mixed_dot_prod (stmt_info))
+ {
+ /* We need two extra invariants: one that contains the minimum signed
+ value and one that contains half of its negative. */
+ int prologue_stmts = 2;
+ unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+ scalar_to_vec, stmt_info, 0,
+ vect_prologue);
+ /* COST is unsigned, so print it with %u. */
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+ "extra prologue_cost = %u .\n", cost);
+
+ /* Three dot-products and a subtraction. */
+ ncopies_for_cost *= 4;
+ }
+
+ record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
+ 0, vect_body);
+
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ enum tree_code code = gimple_assign_rhs_code (stmt);
+ vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+ slp_node, code, type,
+ vectype_in);
+ }
+
+ /* Transform via vect_transform_reduction. */
+ STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+ return true;
+}
+
/* Function vectorizable_reduction.
Check if STMT_INFO performs a reduction operation that can be vectorized.
}
/* For lane-reducing operation vectorizable analysis needs the
- reduction PHI information */
+ reduction PHI information. */
STMT_VINFO_REDUC_DEF (def) = phi_info;
/* Each lane-reducing operation has its own input vectype, while
if (!type_has_mode_precision_p (op.type))
return false;
- /* For lane-reducing ops we're reducing the number of reduction PHIs
- which means the only use of that may be in the lane-reducing operation. */
- if (lane_reducing
- && reduc_chain_length != 1
- && !only_slp_reduc_chain)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "lane-reducing reduction with extra stmts.\n");
- return false;
- }
-
/* Lane-reducing ops also never can be used in a SLP reduction group
since we'll mix lanes belonging to different reductions. But it's
OK to use them in a reduction chain or when the reduction group
&& loop_vinfo->suggested_unroll_factor == 1)
single_defuse_cycle = true;
- if (single_defuse_cycle || lane_reducing)
+ if (single_defuse_cycle && !lane_reducing)
{
gcc_assert (op.code != COND_EXPR);
- /* 4. Supportable by target? */
- bool ok = true;
-
- /* 4.1. check support for the operation in the loop
+ /* 4. check support for the operation in the loop
This isn't necessary for the lane reduction codes, since they
can only be produced by pattern matching, and it's up to the
mixed-sign dot-products can be implemented using signed
dot-products. */
machine_mode vec_mode = TYPE_MODE (vectype_in);
- if (!lane_reducing
- && !directly_supported_p (op.code, vectype_in, optab_vector))
+ if (!directly_supported_p (op.code, vectype_in, optab_vector))
{
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "op not supported by target.\n");
if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
|| !vect_can_vectorize_without_simd_p (op.code))
- ok = false;
+ single_defuse_cycle = false;
else
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "proceeding using word mode.\n");
dump_printf (MSG_NOTE, "using word mode not possible.\n");
return false;
}
-
- /* lane-reducing operations have to go through vect_transform_reduction.
- For the other cases try without the single cycle optimization. */
- if (!ok)
- {
- if (lane_reducing)
- return false;
- else
- single_defuse_cycle = false;
- }
}
if (dump_enabled_p () && single_defuse_cycle)
dump_printf_loc (MSG_NOTE, vect_location,
"multiple vectors to one in the loop body\n");
STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
- /* If the reduction stmt is one of the patterns that have lane
- reduction embedded we cannot handle the case of ! single_defuse_cycle. */
- if ((ncopies > 1 && ! single_defuse_cycle)
- && lane_reducing)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "multi def-use cycle not possible for lane-reducing "
- "reduction operation\n");
- return false;
- }
+ /* For lane-reducing operation, the below processing related to single
+ defuse-cycle will be done in its own vectorizable function. One more
+ thing to note is that the operation must not be involved in fold-left
+ reduction. */
+ single_defuse_cycle &= !lane_reducing;
if (slp_node
- && !(!single_defuse_cycle
- && !lane_reducing
- && reduction_type != FOLD_LEFT_REDUCTION))
+ && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
for (i = 0; i < (int) op.num_ops; i++)
if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
{
vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
reduction_type, ncopies, cost_vec);
/* Cost the reduction op inside the loop if transformed via
- vect_transform_reduction. Otherwise this is costed by the
- separate vectorizable_* routines. */
- if (single_defuse_cycle || lane_reducing)
- {
- int factor = 1;
- if (vect_is_emulated_mixed_dot_prod (stmt_info))
- /* Three dot-products and a subtraction. */
- factor = 4;
- record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
- stmt_info, 0, vect_body);
- }
+ vect_transform_reduction for non-lane-reducing operation. Otherwise
+ this is costed by the separate vectorizable_* routines. */
+ if (single_defuse_cycle)
+ record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
if (dump_enabled_p ()
&& reduction_type == FOLD_LEFT_REDUCTION)
dump_printf_loc (MSG_NOTE, vect_location,
"using an in-order (fold-left) reduction.\n");
STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
- /* All but single defuse-cycle optimized, lane-reducing and fold-left
- reductions go through their own vectorizable_* routines. */
- if (!single_defuse_cycle
- && !lane_reducing
- && reduction_type != FOLD_LEFT_REDUCTION)
+
+ /* All but single defuse-cycle optimized and fold-left reductions go
+ through their own vectorizable_* routines. */
+ if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
{
stmt_vec_info tem
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
And vector reduction PHIs are always generated to the full extent, no
matter lane-reducing op exists or not. If some copies or PHIs are
actually superfluous, they would be cleaned up by passes after
- vectorization. An example for single-lane slp is given as below.
+ vectorization. An example for single-lane slp, lane-reducing ops
+ with mixed input vectypes in a reduction chain, is given as below.
Similarly, this handling is applicable for multiple-lane slp as well.
int sum = 1;
for (i)
{
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += w[i]; // widen-sum <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
}
The vector size is 128-bit,vectorization factor is 16. Reduction
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
+
+ sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
+ sum_v1 = sum_v1; // copy
+ sum_v2 = sum_v2; // copy
+ sum_v3 = sum_v3; // copy
+
+ sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+ sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+ sum_v2 = sum_v2; // copy
+ sum_v3 = sum_v3; // copy
}
- sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0
+ sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
*/
unsigned effec_ncopies = vec_oprnds[0].length ();
unsigned total_ncopies = vec_oprnds[reduc_index].length ();
NULL, NULL, node, cost_vec)
|| vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
|| vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+ || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+ stmt_info, node, cost_vec)
|| vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
node, node_instance, cost_vec)
|| vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
slp_tree, slp_instance, int,
bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+ slp_tree, stmt_vector_for_cost *);
extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
slp_tree, slp_instance,
stmt_vector_for_cost *);