--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details -Wno-psabi" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f1 (vec *p_v_in, vec *p_v_out_1, vec *p_v_out_2)
+{
+ vec sel00 = { 2, 3, 2, 2 };
+ vec sel01 = { 1, 0, 1, 1 };
+ vec sel10 = { 3, 2, 3, 3 };
+ vec sel11 = { 0, 1, 0, 0 };
+ vec sel = { 0, 5, 2, 7 };
+ vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+ vec v_in = *p_v_in;
+
+ /* First vec perm sequence. */
+ v_1 = __builtin_shuffle (v_in, v_in, sel00);
+ v_2 = __builtin_shuffle (v_in, v_in, sel01);
+ v_x = v_2 - v_1;
+ v_y = v_1 + v_2;
+ v_out_1 = __builtin_shuffle (v_y, v_x, sel);
+
+ /* Second vec perm sequence. */
+ v_1 = __builtin_shuffle (v_in, v_in, sel10);
+ v_2 = __builtin_shuffle (v_in, v_in, sel11);
+ v_x = v_2 - v_1;
+ v_y = v_1 + v_2;
+ v_out_2 = __builtin_shuffle (v_y, v_x, sel);
+
+ /* Won't blend because the narrowed sequence
+ utilizes three of the four lanes. */
+
+ *p_v_out_1 = v_out_1;
+ *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 7, 2, 6 }" "forwprop1" { target { aarch64*-*-* i?86-*-* x86_64-*-* } } } } */
return name;
}
-/* Get an index map from the provided vector permute selector
- and return the number of unique indices.
- E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
- { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
- { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4. */
-
-static unsigned int
-get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
-{
- gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
- unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
- unsigned int n = 0;
-
- for (unsigned int i = 0; i < nelts; i++)
- {
- /* Extract the i-th value from the selector. */
- tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
- unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
-
- unsigned int j = 0;
- for (; j <= i; j++)
- {
- tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
- unsigned int prev_sel_cst
- = TREE_INT_CST_LOW (prev_sel_cst_tree);
- if (prev_sel_cst == sel_cst)
- break;
- }
- index_map->quick_push (j);
- n += (i == j) ? 1 : 0;
- }
-
- return n;
-}
-
/* Search for opportunities to free half of the lanes in the following pattern:
v_in = {e0, e1, e2, e3}
unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
+ /* Don't analyse sequences with many lanes. */
+ if (nelts > 4)
+ return false;
+
/* Lookup the definition of v_x and v_y. */
gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
|| nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
return false;
- /* Now check permutation selection operands. */
- auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
- v_1_lane_map.reserve (nelts);
- v_2_lane_map.reserve (nelts);
- unsigned int v_1_lanes, v_2_lanes;
- v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
- v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
-
- /* Check if we could free up half of the lanes. */
- if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
- return false;
-
/* Create the new selector. */
vec_perm_builder new_sel_perm (nelts, nelts, 1);
+ auto_vec<unsigned int> lanes (nelts);
+ lanes.quick_grow_cleared (nelts);
for (unsigned int i = 0; i < nelts; i++)
{
/* Extract the i-th value from the selector. */
- tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
- unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
-
- unsigned int j;
- if (sel_cst < nelts)
- j = v_1_lane_map[sel_cst];
- else
- j = v_2_lane_map[sel_cst - nelts] + nelts;
+ unsigned int sel_cst = TREE_INT_CST_LOW (VECTOR_CST_ELT (sel, i));
+ unsigned int lane = sel_cst % nelts;
+ unsigned int offs = sel_cst / nelts;
- new_sel_perm.quick_push (j);
+ /* Check what's in the lane. */
+ unsigned int e_1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, lane));
+ unsigned int e_2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, lane));
- if (dump_file && (dump_flags & TDF_DETAILS))
+ /* Reuse previous lane (if any). */
+ unsigned int l = 0;
+ for (; l < lane; l++)
{
- fprintf (dump_file, "%u", j);
- if (i != (nelts -1))
- fprintf (dump_file, ", ");
+ if ((TREE_INT_CST_LOW (VECTOR_CST_ELT (v_1_sel, l)) == e_1)
+ && (TREE_INT_CST_LOW (VECTOR_CST_ELT (v_2_sel, l)) == e_2))
+ break;
}
+
+ /* Add to narrowed selector. */
+ new_sel_perm.quick_push (l + offs * nelts);
+
+ /* Mark lane as used. */
+ lanes[l] = 1;
}
+ /* Count how many lanes are need. */
+ unsigned int cnt = 0;
+ for (unsigned int i = 0; i < nelts; i++)
+ cnt += lanes[i];
+
+ /* If more than (nelts/2) lanes are needed, skip the sequence. */
+ if (cnt > nelts / 2)
+ return false;
+
+ /* Check if the resulting permuation is cheap. */
vec_perm_indices new_indices (new_sel_perm, 2, nelts);
tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
machine_mode vmode = TYPE_MODE (vectype);
if (dump_file)
{
- fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
+ fprintf (dump_file, "Found vec perm simplify sequence ending with:\n\t");
print_gimple_stmt (dump_file, stmt, 0);
+
+ if (dump_flags & TDF_DETAILS)
+ {
+ fprintf (dump_file, "\tNarrowed vec_perm selector: ");
+ print_generic_expr (dump_file, (*seq)->new_sel);
+ fprintf (dump_file, "\n");
+ }
}
return true;
unsigned int i;
unsigned int nelts = seq1->nelts;
auto_vec<int> lane_assignment;
- lane_assignment.create (2 * nelts);
+ lane_assignment.create (nelts);
/* Mark all lanes as free. */
- lane_assignment.quick_grow_cleared (2 * nelts);
+ lane_assignment.quick_grow_cleared (nelts);
- /* Reserve lanes for seq1. */
+ /* Allocate lanes for seq1. */
for (i = 0; i < nelts; i++)
{
unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i));
+ l %= nelts;
lane_assignment[l] = 1;
- }
+}
- /* Reserve lanes for seq2 and calculate selector for seq2->stmt. */
+ /* Allocate lanes for seq2 and calculate selector for seq2->stmt. */
vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1);
for (i = 0; i < nelts; i++)
{
- unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
- while (lane_assignment[l] != 0)
- l++;
- lane_assignment[l] = 2;
- seq2_stmt_sel_perm.quick_push (l);
+ unsigned int sel = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i));
+ unsigned int lane = sel % nelts;
+ unsigned int offs = sel / nelts;
+ unsigned int new_sel;
+
+ /* Check if we already allocated the lane for seq2. */
+ unsigned int j = 0;
+ for (; j < i; j++)
+ {
+ unsigned int sel_old;
+ sel_old = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, j));
+ unsigned int lane_old = sel_old % nelts;
+ if (lane == lane_old)
+ {
+ new_sel = seq2_stmt_sel_perm[j].to_constant ();
+ new_sel = (new_sel % nelts) + offs * nelts;
+ break;
+ }
+ }
+
+ /* If the lane is not allocated, we need to do that now. */
+ if (j == i)
+ {
+ unsigned int l_orig = lane;
+ while (lane_assignment[lane] != 0)
+ {
+ lane = (lane + 1) % nelts;
+
+ /* This should not happen if both sequences utilize no more than
+ half of the lanes. Test anyway to guarantee termination. */
+ if (lane == l_orig)
+ return false;
+ }
+
+ /* Allocate lane. */
+ lane_assignment[lane] = 2;
+ new_sel = lane + offs * nelts;
+ }
+
+ seq2_stmt_sel_perm.quick_push (new_sel);
}
+ /* Check if the resulting permuation is cheap. */
seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts);
tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
machine_mode vmode = TYPE_MODE (vectype);
vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1);
for (i = 0; i < nelts; i++)
{
- bool use_seq1 = lane_assignment[i] == 1;
- tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt
- : seq2->v_1_stmt);
- tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt
- : seq2->v_2_stmt);
- unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts;
- unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts;
+ bool use_seq1 = lane_assignment[i] != 2;
+ unsigned int l1, l2;
+
+ if (use_seq1)
+ {
+ /* Just reuse the selector indices. */
+ tree s1 = gimple_assign_rhs3 (seq1->v_1_stmt);
+ tree s2 = gimple_assign_rhs3 (seq1->v_2_stmt);
+ l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i));
+ l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i));
+ }
+ else
+ {
+ /* We moved the lanes for seq2, so we need to adjust for that. */
+ tree s1 = gimple_assign_rhs3 (seq2->v_1_stmt);
+ tree s2 = gimple_assign_rhs3 (seq2->v_2_stmt);
+
+ unsigned int j = 0;
+ for (; j < i; j++)
+ {
+ unsigned int sel_new;
+ sel_new = seq2_stmt_sel_perm[j].to_constant ();
+ sel_new %= nelts;
+ if (sel_new == i)
+ break;
+ }
+
+ /* This should not happen. Test anyway to guarantee correctness. */
+ if (j == i)
+ return false;
+
+ l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, j));
+ l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, j));
+ }
seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts));
seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 0 : nelts));
return;
if (dump_file && (dump_flags & TDF_DETAILS))
- fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
+ fprintf (dump_file, "\nProcessing %u vec perm simplify sequences.\n",
l->length ());
FOR_EACH_VEC_ELT (*l, i, seq1)