--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 0
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (int start)
+{
+ for (unsigned int i = start; i < END; ++i)
+   {
+     if (x[i] == 0)
+       return i;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_10.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo (START);
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 2
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+ for (signed int i = START; i < END; ++i)
+   {
+     if (x[i] == 0)
+       return i;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_5.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo ();
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (int start)
+{
+ for (unsigned int i = start; i < END; ++i)
+   {
+     if (x[i] == 0)
+       return i;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_6.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo (START);
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+ for (unsigned int i = START; i < END; ++i)
+   {
+     if (x[i] == 0)
+       return i;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_7.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo ();
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+ for (unsigned int i = START; i < END; i *= 2)
+   {
+     if (x[i] == 0)
+       return i;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_8.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo ();
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+
+#define N 512
+#define START 1
+#define END 505
+
+int x[N] __attribute__((aligned(32)));
+
+int __attribute__((noipa))
+foo (void)
+{
+ for (int *p = x + START; p < x + END; p++)
+   {
+     if (*p == 0)
+       return START;
+   }
+ return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* Peels using a scalar loop. */
+/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
--- /dev/null
+/* Fix for PR119351 alignment peeling with vectors and VLS. */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "peel_ind_9.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ int res = foo ();
+ asm volatile ("");
+ if (res != START)
+   __builtin_abort ();
+ return 0;
+}
if (dump_enabled_p ())
  dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
+
+ /* Mark if we have a non-linear IV. */
+ LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
+   = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
}
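
For readers unfamiliar with the evolution-type classification used above: only additive IVs are classified as vect_step_op_add, so any other step shape (multiplied, shifted, negated) now sets LOOP_VINFO_NON_LINEAR_IV. A minimal sketch of the two loop shapes, mirroring peel_ind_7.c and peel_ind_8.c above; the function names are illustrative only and not part of the patch:

/* Additive step: classified as vect_step_op_add, so the new flag stays
   false and masked peeling for alignment remains available.  */
int
search_linear (int *x, int n)
{
  for (int i = 1; i < n; i += 1)
    if (x[i] == 0)
      return i;
  return -1;
}

/* Multiplicative step: not vect_step_op_add, so LOOP_VINFO_NON_LINEAR_IV is
   set and the early-break + masked-peeling combination is refused later in
   vect_use_loop_mask_for_alignment_p.  */
int
search_nonlinear (int *x, int n)
{
  for (int i = 1; i < n; i *= 2)
    if (x[i] == 0)
      return i;
  return -1;
}
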
suggested_unroll_factor (1),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
+ mask_skip_niters_pfa_offset (NULL_TREE),
rgroup_compare_type (NULL_TREE),
simd_if_cond (NULL_TREE),
partial_vector_style (vect_partial_vectors_none),
unaligned_dr (NULL),
peeling_for_alignment (0),
ptr_mask (0),
+ nonlinear_iv (false),
ivexpr_map (NULL),
scan_map (NULL),
slp_unrolling_factor (1),
LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
peel_mul = gimple_build_vector_from_val (&init_stmts,
step_vectype, peel_mul);
+
+ /* If early break then we have to create a new PHI which we can use as
+    an offset to adjust the induction reduction in early exits.
+
+    This is because when peeling for alignment using masking, the first
+    few elements of the vector can be inactive.  As such, if we find the
+    entry in the first iteration we have to adjust the starting point of
+    the scalar code.
+
+    We do this by creating a new scalar PHI that keeps track of whether
+    we are in the first iteration of the loop (with the additional
+    masking) or whether we have taken a loop iteration already.
+
+    The generated sequence:
+
+    pre-header:
+    bb1:
+      i_1 = <number of leading inactive elements>
+
+    header:
+    bb2:
+      i_2 = PHI <i_1(bb1), 0(latch)>
+      ...
+
+    early-exit:
+    bb3:
+      i_3 = iv_step * i_2 + PHI<vector-iv>
+
+    The first part of the adjustment, creating i_1 and i_2, is done here
+    and the last part, creating i_3, is done in vectorizable_live_operation
+    when the induction extraction is materialized.  */
+ if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+     && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
+   {
+     auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+     tree ty_skip_niters = TREE_TYPE (skip_niters);
+     tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
+                                                 vect_scalar_var,
+                                                 "pfa_iv_offset");
+     gphi *nphi = create_phi_node (break_lhs_phi, bb);
+     add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
+     add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
+                  loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
+
+     LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)
+       = PHI_RESULT (nphi);
+   }
}
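
As a cross-check of the arithmetic described in the comment above, here is a tiny standalone model of the correction with made-up numbers. The assumption that lane 0 of the first vector IV is biased down by skip_niters * iv_step reflects how masked peeling skips the leading lanes; all names and values below are illustrative rather than taken from the patch:

#include <assert.h>

int
main (void)
{
  /* Assume a linear IV starting at 5 with step 1, and 3 leading lanes
     masked off to reach alignment (skip_niters = 3).  The vector IV for
     the first iteration then starts 3 elements early, so lane 0 holds 2.  */
  int start = 5, iv_step = 1, skip_niters = 3;
  int lane0_iv = start - skip_niters * iv_step;

  /* i_2 (pfa_iv_offset) is skip_niters in the first iteration...  */
  int i_2 = skip_niters;
  int i_3 = iv_step * i_2 + lane0_iv;  /* i_3 = iv_step * i_2 + PHI<vector-iv> */
  assert (i_3 == start);

  /* ... and 0 from the latch onwards, where the correction is a no-op.  */
  i_2 = 0;
  assert (iv_step * i_2 + lane0_iv == lane0_iv);
  return 0;
}
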
tree step_mul = NULL_TREE;
unsigned ivn;
/* For early exit where the exit is not in the BB that leads
   to the latch then we're restarting the iteration in the
   scalar loop.  So get the first live value.  */
- if ((all_exits_as_early_p || !main_exit_edge)
-     && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+ bool early_break_first_element_p
+   = (all_exits_as_early_p || !main_exit_edge)
+      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
+ if (early_break_first_element_p)
  {
    tmp_vec_lhs = vec_lhs0;
    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
lhs_type, &exit_gsi);
auto gsi = gsi_for_stmt (use_stmt);
+ if (early_break_first_element_p
+     && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
+   {
+     tree step_expr
+       = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+     tree break_lhs_phi
+       = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
+     tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
+     gimple_seq iv_stmts = NULL;
+
+     /* Now create the PHI for the outside loop usage to
+        retrieve the value for the offset counter.  */
+     tree rphi_step
+       = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
+     tree tmp2
+       = gimple_build (&iv_stmts, MULT_EXPR,
+                       ty_skip_niters, rphi_step,
+                       break_lhs_phi);
+
+     if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
+       tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
+                            TREE_TYPE (new_tree), new_tree, tmp2);
+     else
+       {
+         tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
+                                tmp2);
+         tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
+                              TREE_TYPE (new_tree), new_tree,
+                              tmp2);
+       }
+
+     new_tree = tmp2;
+     gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
+   }
+
tree lhs_phi = gimple_phi_result (use_stmt);
remove_phi_node (&gsi, false);
gimple *copy = gimple_build_assign (lhs_phi, new_tree);
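
The branch above exists because GIMPLE distinguishes integer addition from pointer offsetting. A rough C-level analogue of the two forms of new_tree built here; the function and parameter names, and the byte-based pointer step, are assumptions for illustration rather than code from the patch:

#include <stddef.h>

/* PLUS_EXPR form: integer induction, the correction is a plain add.  */
int
adjust_int_iv (int lane0, int iv_step, int pfa_off)
{
  return lane0 + iv_step * pfa_off;
}

/* POINTER_PLUS_EXPR form: pointer induction, the step is a byte offset
   applied with pointer arithmetic.  */
void *
adjust_ptr_iv (void *lane0, ptrdiff_t byte_step, ptrdiff_t pfa_off)
{
  return (char *) lane0 + byte_step * pfa_off;
}
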
elements that should be false in the first mask). */
tree mask_skip_niters;
+ /* If we are using a loop mask to align memory addresses and we're in an
+    early break loop, then this variable contains the number of elements that
+    were skipped during the initial iteration of the loop.  */
+ tree mask_skip_niters_pfa_offset;
+
/* The type that the loop control IV should be converted to before
testing which of the VF scalars are active and inactive.
Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
/* The mask used to check the alignment of pointers or arrays. */
int ptr_mask;
+ /* Indicates whether the loop has any non-linear IV. */
+ bool nonlinear_iv;
+
/* Data Dependence Relations defining address ranges that are candidates
for a run-time aliasing check. */
auto_vec<ddr_p> may_alias_ddrs;
#define LOOP_VINFO_MASKS(L) (L)->masks
#define LOOP_VINFO_LENS(L) (L)->lens
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
+#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
#define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
#define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
#define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
#define LOOP_VINFO_DDRS(L) (L)->shared->ddrs
#define LOOP_VINFO_INT_NITERS(L) (TREE_INT_CST_LOW ((L)->num_iters))
#define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
+#define LOOP_VINFO_NON_LINEAR_IV(L) (L)->nonlinear_iv
#define LOOP_VINFO_UNALIGNED_DR(L) (L)->unaligned_dr
#define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts
#define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs
inline bool
vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
{
+ /* With early break vectorization we don't know whether the accesses will
+    stay inside the loop or not.  TODO: The early break adjustment code can
+    be implemented the same way as for linear inductions in
+    vectorizable_induction.  However, we can't test this today, so reject
+    it.  */
  return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+         && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
+              && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
}
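
Restated as plain booleans, the predicate now behaves as in the sketch below; the helper name and parameters are illustrative, and only the shape of the condition comes from the change above:

#include <stdbool.h>

static bool
use_loop_mask_for_alignment_p (bool fully_masked, bool peeling_for_alignment,
                               bool nonlinear_iv, bool early_breaks)
{
  /* Masked peeling for alignment is used only when the loop is fully
     masked, peeling for alignment was requested, and we are not in the
     nonlinear-IV + early-break combination that the adjustment code
     cannot handle yet.  */
  return fully_masked
         && peeling_for_alignment
         && !(nonlinear_iv && early_breaks);
}
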
/* Return the number of vectors of type VECTYPE that are needed to get