The example
float *e;
/* Motivating example: accumulate f[i] * m for i in [0, n) and store the
   sum through the global pointer e.  g is read at two indices per
   iteration: j = b + i and k = c + i * d.  */
void f (float *f, float *g, char *h, int n,
int b, int c, int d)
{
float a = 0;
for (int i = 0; i < n; ++i) {
/* j steps by 1 starting at b; k steps by d starting at c.  */
int j = b + i, k = c + i * d;
/* g[j] is always loaded into l; m takes g[k] only when h[i] is nonzero,
   otherwise it reuses l.  */
float l = g[j], m = h[i] ? g[k] : l;
a += f[i] * m;
}
*e = a;
}
gets vectorized using gathers for the access to g:
.L5:
ld1b z4.s, p7/z, [x2, x6]
cmpne p6.b, p7/z, z4.b, #0
ld1w z2.s, p7/z, [x0, x6, lsl 2]
add z7.s, z30.s, z16.s
add z6.s, z16.s, z18.s
add x6, x6, x7
ld1w z5.s, p7/z, [x1, z6.s, sxtw 2]
ld1w z3.s, p6/z, [x1, z7.s, sxtw 2]
incw z16.s
sel z3.s, p6, z3.s, z5.s
fmla z17.s, p7/m, z2.s, z3.s
whilelo p7.s, w6, w3
b.any .L5
However, the first access to g is g[b + i] and the second is g[c + i * d].
Since b is loop invariant, the access to g[b + i] is actually linear, and
since c is loop invariant, the base of the second access g[c + i * d] can be
simplified by recognizing the base as g + c.
Today however SCEV fails to analyze these accesses as affine and as a
consequence we end up with gathers:
: missed: failed: evolution of base is not affine.
base_address:
offset from base address:
constant offset from base address:
step:
base alignment: 0
base misalignment: 0
offset alignment: 0
step alignment: 0
base_object: *_63
Looking at SCEV this is because of an outer cast around the CHREC:
)
(set_scalar_evolution
instantiated_below = 25
(scalar = _65)
(scalar_evolution = (long unsigned int) {b_22(D), +, 1}_2))
)
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = (long unsigned int) {b_22(D), +, 1}_2)
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = g_27(D))
(res = g_27(D)))
which corresponds to
j_66 = b_22(D) + i_67;
_65 = (long unsigned int) j_66;
_64 = _65 * 4;
_63 = g_27(D) + _64;
l_62 = *_63;
and the _64 is deemed to not be affine:
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = _64)
(analyze_scalar_evolution
(loop_nb = 2)
(scalar = _64)
(get_scalar_evolution
(scalar = _64)
(scalar_evolution = _64))
)
(res = scev_not_known))
This patch fixes it by (very carefully) folding the conversion to an unsigned
type into the affine CHREC itself, so that the subsequent multiply can be
folded into the CHREC as well,
which results in
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = 4)
(res = 4))
(set_scalar_evolution
instantiated_below = 25
(scalar = _64)
(scalar_evolution = {(long unsigned int) b_22(D) * 4, +, 4}_2))
)
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = g_27(D))
(res = g_27(D)))
(instantiate_scev
(instantiate_below = 25 -> 12)
(evolution_loop = 2)
(chrec = {(long unsigned int) b_22(D) * 4, +, 4}_2)
(res = {(long unsigned int) b_22(D) * 4, +, 4}_2))
(set_scalar_evolution
instantiated_below = 25
(scalar = _63)
(scalar_evolution = {g_27(D) + (long unsigned int) b_22(D) * 4, +, 4}_2))
)
and dataref now correctly analyzes the base
base_address: g_27(D) + (sizetype) b_22(D) * 4
offset from base address: 0
constant offset from base address: 0
step: 4
base alignment: 4
base misalignment: 0
offset alignment: 128
step alignment: 4
base_object: *g_27(D) + (sizetype) b_22(D) * 4
Access function 0: {0B, +, 4}_2
producing the final codegen:
.L7:
ld1b z4.s, p7/z, [x2, x6]
cmpne p6.b, p7/z, z4.b, #0
ld1w z29.s, p7/z, [x4, x6, lsl 2]
ld1w z2.s, p7/z, [x0, x6, lsl 2]
ld1w z3.s, p6/z, [x5]
add x6, x6, x7
sel z3.s, p6, z3.s, z29.s
add x5, x5, x1
fmla z30.s, p7/m, z2.s, z3.s
whilelo p7.s, w6, w3
b.any .L7
faddv s31, p5, z30.s
gcc/ChangeLog:
* tree-chrec.cc (chrec_convert_1): Fold unsigned CHREC converts.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-scev-affine_1.c: New test.
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float *e;
+void f (float *f, float *g, char *h, int n,
+ int b, int c, int d)
+{
+ float a = 0;
+ for (int i = 0; i < n; ++i) {
+ int j = b + i, k = c + i * d;
+ float l = g[j], m = h[i] ? g[k] : l;
+ a += f[i] * m;
+ }
+ *e = a;
+}
+
+/* { dg-final { scan-tree-dump-not {failed: evolution of base is not affine} "vect" { target aarch64*-*-* } } } */
CHREC_RIGHT (chrec)));
res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from);
}
+ /* Similarly, perform the trick that (unsigned T)(base + step) can be
+ folded to ((unsigned T)base + (unsigned T)step). */
+ else if (use_overflow_semantics
+ && TREE_CODE (chrec) == POLYNOMIAL_CHREC
+ && INTEGRAL_TYPE_P (ct)
+ && INTEGRAL_TYPE_P (type)
+ && TYPE_OVERFLOW_UNDEFINED (type)
+ /* Must be unsigned so we don't introduce any UB. */
+ && TYPE_UNSIGNED (type)
+ /* The outer type must be at least as wide as the inner type so we
+ don't truncate when we fold, and the inner CHREC must be
+ non-wrapping so we don't change the behavior when folding to
+ a wider type. */
+ && TYPE_PRECISION (type) >= TYPE_PRECISION (ct)
+ && (!TYPE_UNSIGNED (ct)
+ || TYPE_PRECISION (type) == TYPE_PRECISION (ct)
+ || nonwrapping_chrec_p (chrec)))
+ {
+ res = build_polynomial_chrec (CHREC_VARIABLE (chrec),
+ fold_convert (type,
+ CHREC_LEFT (chrec)),
+ fold_convert (type,
+ CHREC_RIGHT (chrec)));
+ res = chrec_convert_1 (type, res, at_stmt, use_overflow_semantics, from);
+ }
else
res = fold_convert (type, chrec);