bool aarch64_constant_address_p (rtx);
bool aarch64_emit_approx_div (rtx, rtx, rtx);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
tree aarch64_vector_load_decl (tree);
rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
void aarch64_expand_call (rtx, rtx, rtx, bool);
(match_dup 4))
(match_dup 3)))]
{
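+ /* Prefer a permute-based expansion (e.g. a single REV* instruction) when possible. */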
+ if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+ DONE;
+
operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
return true;
}
+/* Emit an optimized sequence to perform a vector rotate
+ of REG by the vector constant amount AMNT and place the result
+ in DST. Return true iff successful. */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+ machine_mode mode = GET_MODE (reg);
+ /* Attempt to expand the rotate as a vector permute.
+ For some rotate amounts the permute maps to a single instruction
+ (such as a REV64/REV32/REV16 reversal) and even the general
+ single-vector TBL permute has good throughput. */
+ if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+ return true;
+ return false;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
return target;
}
+/* Expand a vector (left) rotate of X, which has mode MODE, by the immediate
+ AMT as a vector permute operation. Emit code to put the result in DST and
+ return DST if successful; otherwise return NULL. This is intended to
+ implement vector rotates by byte amounts using vector permutes when the
+ target does not offer native vector rotate operations. */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+ rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+ /* For now handle only rotate by the same integer constant in all lanes.
+ In principle rotates by any constant vector are representable through
+ permutes as long as the individual rotate amounts are multiples of
+ BITS_PER_UNIT. */
+ if (!CONST_INT_P (amt_unwrap))
+ return NULL_RTX;
+
+ int rotamnt = INTVAL (amt_unwrap);
+ if (rotamnt % BITS_PER_UNIT != 0)
+ return NULL_RTX;
+ machine_mode qimode;
+ if (!qimode_for_vec_perm (mode).exists (&qimode))
+ return NULL_RTX;
+
+ vec_perm_builder builder;
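+ /* NUNITS is the number of bytes per element of MODE; TOTAL_UNITS is the
+ number of bytes in the whole vector. */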
+ unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+ poly_uint64 total_units = GET_MODE_SIZE (mode);
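+ /* Encode the selector as NUNITS patterns of three elements each; every
+ pattern steps by NUNITS, so it extends to the full byte vector. */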
+ builder.new_vector (total_units, nunits, 3);
+ unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+ unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
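+ /* Result byte I of each element is source byte (ROT_TO_PERM + I) % NUNITS
+ of the same element; adding J keeps each element selecting from itself. */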
+ for (unsigned j = 0; j < 3 * nunits; j += nunits)
+ for (unsigned i = 0; i < nunits; i++)
+ builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+ rtx perm_src = lowpart_subreg (qimode, x, mode);
+ rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+ rtx res
+ = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+ qimode, perm_dst);
+ if (!res)
+ return NULL_RTX;
+ emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+ return dst;
+}
+
/* Helper function for canonicalize_cmp_for_target. Swap between inclusive
and exclusive ranges in order to create an equivalent comparison. See
canonicalize_cmp_for_target for the possible cases. */
rtx, int);
extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
#endif // EXPMED_H
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
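+
+/* Rotates by half the element width should become single REV64/REV32/REV16
+   permute instructions. */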
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+/*
+** G1:
+** rev64 v0\.4s, v0\.4s
+** ret
+*/
+v2di
+G1 (v2di r)
+{
+ return (r >> 32) | (r << 32);
+}
+
+/*
+** G2:
+** rev32 v0\.8h, v0\.8h
+** ret
+*/
+v4si
+G2 (v4si r)
+{
+ return (r >> 16) | (r << 16);
+}
+
+/*
+** G3:
+** rev16 v0\.16b, v0\.16b
+** ret
+*/
+v8hi
+G3 (v8hi r)
+{
+ return (r >> 8) | (r << 8);
+}
+
+/*
+** G4:
+** rev32 v0\.4h, v0\.4h
+** ret
+*/
+v2si
+G4 (v2si r)
+{
+ return (r >> 16) | (r << 16);
+}
+
+/*
+** G5:
+** rev16 v0\.8b, v0\.8b
+** ret
+*/
+v4hi
+G5 (v4hi r)
+{
+ return (r >> 8) | (r << 8);
+}
+
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2" } */
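+
+/* Vector rotates by whole-byte amounts must give the same results as the
+   equivalent scalar rotates. */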
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))
+
+static const char str[] __attribute__ ((aligned (16))) = "abcdefghijklmnopqrstuvwxyz";
+
+unsigned long long
+__attribute__((noipa,noinline))
+rot_64_one (unsigned long long x, unsigned amt)
+{
+ return (x << amt) | (x >> (64 - amt));
+}
+unsigned int
+__attribute__((noipa,noinline))
+rot_32_one (unsigned int x, unsigned amt)
+{
+ return (x << amt) | (x >> (32 - amt));
+}
+
+unsigned short
+__attribute__((noipa,noinline))
+rot_16_one (unsigned short x, unsigned short amt)
+{
+ return (x << amt) | (x >> (16 - amt));
+}
+
+
+#define ROTFUNC(M,S,A) \
+M \
+__attribute__((noipa,noinline)) \
+rot_##M##_##S##_##A (M x) \
+{ \
+ return (x << A) | (x >> (S - A)); \
+} \
+ \
+void \
+test_rot_##M##_##S##_##A (void) \
+{ \
+ M vec = *(M *)str; \
+ M res = rot_##M##_##S##_##A (vec); \
+ for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++) \
+ if (res[i] != rot_##S##_one (vec[i], A)) \
+ __builtin_abort (); \
+}
+
+ROTFUNC (v2di, 64, 56)
+ROTFUNC (v2di, 64, 48)
+ROTFUNC (v2di, 64, 40)
+ROTFUNC (v2di, 64, 32)
+ROTFUNC (v2di, 64, 24)
+ROTFUNC (v2di, 64, 16)
+ROTFUNC (v2di, 64, 8)
+
+ROTFUNC (v4si, 32, 24)
+ROTFUNC (v4si, 32, 16)
+ROTFUNC (v4si, 32, 8)
+
+ROTFUNC (v8hi, 16, 8)
+
+ROTFUNC (v2si, 32, 24)
+ROTFUNC (v2si, 32, 16)
+ROTFUNC (v2si, 32, 8)
+
+ROTFUNC (v4hi, 16, 8)
+
+#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()
+
+int
+main (void)
+{
+ CALL_TEST (v2di, 64, 56);
+ CALL_TEST (v2di, 64, 48);
+ CALL_TEST (v2di, 64, 40);
+ CALL_TEST (v2di, 64, 32);
+ CALL_TEST (v2di, 64, 24);
+ CALL_TEST (v2di, 64, 16);
+ CALL_TEST (v2di, 64, 8);
+
+ CALL_TEST (v4si, 32, 24);
+ CALL_TEST (v4si, 32, 16);
+ CALL_TEST (v4si, 32, 8);
+
+ CALL_TEST (v8hi, 16, 8);
+
+ CALL_TEST (v2si, 32, 24);
+ CALL_TEST (v2si, 32, 16);
+ CALL_TEST (v2si, 32, 8);
+
+ CALL_TEST (v4hi, 16, 8);
+
+ return 0;
+}
+