 }
 /* Emit an optimized sequence to perform a vector rotate
-   of REG by the vector constant amount AMNT and place the result
+   of REG by the vector constant amount AMNT_VEC and place the result
    in DST.  Return true iff successful.  */
 bool
-aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
 {
+  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
-  /* Attempt to expand the rotate as a vector permute.
-     For some rotate amounts they can be single instructions and
-     even the general single-vector TBL permute has good throughput.  */
-  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+  /* Rotates by half the element width map down to REV* instructions and
+     should always be preferred when possible.  */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
+      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  /* 64 and 128-bit vector modes can use the XAR instruction
+     when available.  */
+  else if (can_create_pseudo_p ()
+           && ((TARGET_SHA3 && mode == V2DImode)
+               || (TARGET_SVE2
+                   && (known_eq (GET_MODE_SIZE (mode), 8)
+                       || known_eq (GET_MODE_SIZE (mode), 16)))))
+    {
+      rtx zeroes = aarch64_gen_shareable_zero (mode);
+      rtx xar_op
+        = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
+                          amnt_vec);
+      emit_set_insn (dst, xar_op);
+      return true;
+    }
+  /* If none of the above, try to expand rotates by any byte amount as
+     permutes.  */
+  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
     return true;
   return false;
 }
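
For illustration only, not part of the patch: the first branch covers rotates by half the element width, which are just a swap of the element's two halves and which expand_rotate_as_vec_perm can emit as a single REV-style lane permute; the XAR branch works because XAR computes a rotate of the XOR of its two source operands, so pairing REG with a shared zero vector leaves a plain rotate. A minimal sketch of the REV case, assuming a plain -O2 AdvSIMD compile (the typedef and function names below are made up for the example); the rotate by 16 of each 32-bit lane is expected to take the first branch and become a REV32 of 16-bit lanes rather than an XAR:

/* Sketch only: rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2 for V4SI,
   so this should be expanded as a lane permute (expected to be a
   rev32 of .h lanes), not through the XAR path.  */
typedef unsigned int __attribute__ ((vector_size (16))) v4si_sketch;

v4si_sketch
rot_half (v4si_sketch r)
{
  return (r >> 16) | (r << 16);
}
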
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+#pragma GCC target "+sve2+sha3"
+
+/*
+** G1:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar v0\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 39
+** ret
+*/
+v2di
+G1 (v2di r) {
+ return (r >> 39) | (r << 25);
+}
+
+/*
+** G2:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #23
+** ret
+*/
+v4si
+G2 (v4si r) {
+ return (r >> 23) | (r << 9);
+}
+
+/*
+** G3:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #5
+** ret
+*/
+v8hi
+G3 (v8hi r) {
+ return (r >> 5) | (r << 11);
+}
+
+/*
+** G4:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #6
+** ret
+*/
+v16qi
+G4 (v16qi r)
+{
+ return (r << 2) | (r >> 6);
+}
+
+/*
+** G5:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #22
+** ret
+*/
+v2si
+G5 (v2si r) {
+ return (r >> 22) | (r << 10);
+}
+
+/*
+** G6:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #7
+** ret
+*/
+v4hi
+G6 (v4hi r) {
+ return (r >> 7) | (r << 9);
+}
+
+/*
+** G7:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #5
+** ret
+*/
+v8qi
+G7 (v8qi r)
+{
+ return (r << 3) | (r >> 5);
+}
+
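
Also for illustration, not part of the patch: when neither +sha3 nor +sve2 is available the XAR branch is skipped, but the final expand_rotate_as_vec_perm call can still handle rotate amounts that are a whole number of bytes, as its comment says. A minimal sketch under that assumption (made-up names, no target pragma, plain -O2); the 8-bit rotate of each 32-bit lane is expected to be expanded as a byte permute (likely a TBL with a constant index vector) instead of two shifts and an ORR:

/* Sketch only: an 8-bit rotate of 32-bit lanes is neither half the element
   width nor an XAR candidate without +sve2/+sha3, so it should reach the
   final expand_rotate_as_vec_perm fallback and become a byte permute.  */
typedef unsigned int __attribute__ ((vector_size (16))) v4si_perm_sketch;

v4si_perm_sketch
rot_by_byte (v4si_perm_sketch r)
{
  return (r >> 8) | (r << 24);
}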