unsigned res_npatterns, res_nelts_per_pattern;
unsigned HOST_WIDE_INT res_nelts;
- /* (1) If SEL is a suitable mask as determined by
- valid_mask_for_fold_vec_perm_cst_p, then:
- res_npatterns = max of npatterns between ARG0, ARG1, and SEL
- res_nelts_per_pattern = max of nelts_per_pattern between
- ARG0, ARG1 and SEL.
- (2) If SEL is not a suitable mask, and TYPE is VLS then:
- res_npatterns = nelts in result vector.
- res_nelts_per_pattern = 1.
- This exception is made so that VLS ARG0, ARG1 and SEL work as before. */
- if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
- {
- res_npatterns
- = std::max (VECTOR_CST_NPATTERNS (arg0),
- std::max (VECTOR_CST_NPATTERNS (arg1),
- sel.encoding ().npatterns ()));
+ /* First try to implement the fold in a VLA-friendly way.
+
+ (1) If the selector is simply a duplication of N elements, the
+ result is likewise a duplication of N elements.
+
+ (2) If the selector is N elements followed by a duplication
+ of N elements, the result is too.
+
+ (3) If the selector is N elements followed by an interleaving
+ of N linear series, the situation is more complex.
+
+ valid_mask_for_fold_vec_perm_cst_p detects whether we
+ can handle this case. If we can, then each of the N linear
+ series either (a) selects the same element each time or
+ (b) selects a linear series from one of the input patterns.
- res_nelts_per_pattern
- = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
- std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
- sel.encoding ().nelts_per_pattern ()));
+ If (b) holds for one of the linear series, the result
+ will contain a linear series, and so the result will have
+ the same shape as the selector. If (a) holds for all of
+ the linear series, the result will be the same as (2) above.
+ (b) can only hold if one of the input patterns has a
+ stepped encoding. */
+
+ if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
+ {
+ res_npatterns = sel.encoding ().npatterns ();
+ res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
+ if (res_nelts_per_pattern == 3
+ && VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
+ && VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
+ res_nelts_per_pattern = 2;
res_nelts = res_npatterns * res_nelts_per_pattern;
}
else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
tree expected_res[] = { ARG0(0), ARG1(0), ARG1(1) };
validate_res (1, 3, res, expected_res);
}
+
+ /* Case 8: Same as aarch64/sve/slp_3.c:
+ arg0, arg1 are dup vectors.
+ sel = { 0, len, 1, len+1, 2, len+2, ... } // (2, 3)
+ So res = { arg0[0], arg1[0], ... } // (2, 1)
+
+ In this case, since the input vectors are dup, only the first two
+ elements per pattern in sel are considered significant. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 1);
+ tree arg1 = build_vec_cst_rand (vmode, 1, 1);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 2, 3);
+ poly_uint64 mask_elems[] = { 0, len, 1, len + 1, 2, len + 2 };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 2, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+ tree expected_res[] = { ARG0(0), ARG1(0) };
+ validate_res (2, 1, res, expected_res);
+ }
}
}
ASSERT_TRUE (res == NULL_TREE);
ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
}
+
+ /* Case 8: PR111754: When input vector is not a stepped sequence,
+ check that the result is not a stepped sequence either, even
+ if sel has a stepped sequence. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 2);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 1, 3);
+ poly_uint64 mask_elems[] = { 0, 1, 2 };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 1, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg0, sel);
+
+ tree expected_res[] = { ARG0(0), ARG0(1) };
+ validate_res (sel.encoding ().npatterns (), 2, res, expected_res);
+ }
+
+ /* Case 9: If sel doesn't contain a stepped sequence,
+ check that the result has the same encoding as sel, irrespective
+ of the shape of the input vectors. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 3, 1);
+ tree arg1 = build_vec_cst_rand (vmode, 1, 3, 1);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 1, 2);
+ poly_uint64 mask_elems[] = { 0, len };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 2, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+ tree expected_res[] = { ARG0(0), ARG1(0) };
+ validate_res (sel.encoding ().npatterns (),
+ sel.encoding ().nelts_per_pattern (), res, expected_res);
+ }
}
}
/* 1 for each 8-bit type. */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
-/* 1 for each 16-bit type plus 1 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
+/* 1 for each 16-bit type. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
/* 1 for each 32-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
-/* 3 for double. */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 6 } } */
/* The 64-bit types need:
-
- ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2. */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* The loop should be fully-masked. The 64-bit types need two loads
TEST_ALL (VEC_PERM)
-/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
+/* 1 for each 8-bit type. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 2 } } */
/* 1 for each 16-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #80\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #63\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
-/* 4 for double. */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 18 } } */
/* The 32-bit types need:
- ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2
and the 64-bit types need:
- ZIP1 ZIP1 ZIP1 ZIP1 (4 ZIP2s optimized away)
ZIP1 ZIP2 ZIP1 ZIP2
ZIP1 ZIP2 ZIP1 ZIP2. */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 33 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* The loop should be fully-masked. The 32-bit types need two loads