}
};
+class svxar_impl : public function_base
+{
+public:
+  rtx
+  expand (function_expander &e) const override
+  {
+    /* aarch64_sve2_xar represents this operation with a left-rotate RTX.
+       Convert the right-rotate amount from the intrinsic to fit this.  */
+    machine_mode mode = e.vector_mode (0);
+    HOST_WIDE_INT rot = GET_MODE_UNIT_BITSIZE (mode)
+			- INTVAL (e.args[2]);
+    e.args[2] = aarch64_simd_gen_const_vector_dup (mode, rot);
+    return e.use_exact_insn (code_for_aarch64_sve2_xar (mode));
+  }
+};
+
class svcdot_impl : public function_base
{
public:
FUNCTION (svwhilegt, while_comparison, (UNSPEC_WHILEGT, UNSPEC_WHILEHI))
FUNCTION (svwhilerw, svwhilerw_svwhilewr_impl, (UNSPEC_WHILERW))
FUNCTION (svwhilewr, svwhilerw_svwhilewr_impl, (UNSPEC_WHILEWR))
-FUNCTION (svxar, CODE_FOR_MODE0 (aarch64_sve2_xar),)
+FUNCTION (svxar, svxar_impl,)
} /* end namespace aarch64_sve */
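For context (not part of the patch), a minimal sketch of what the rotate-amount conversion in svxar_impl::expand means at the source level, assuming SVE2 is enabled; the function name xar_right_rotate_2 is purely illustrative. The ACLE svxar intrinsics take a right-rotate amount, so for 16-bit elements a right rotate of 2 is represented internally as a left rotate of 16 - 2 = 14, while the emitted XAR instruction still encodes the right-rotate immediate:

#pragma GCC target "+sve2"
#include <arm_sve.h>

/* Illustration only: XOR a and b, then rotate each 16-bit element right
   by 2.  The expander rewrites the amount as a left rotate by 14, and the
   expected output is still "xar z0.h, z0.h, z1.h, #2".  */
svint16_t
xar_right_rotate_2 (svint16_t a, svint16_t b)
{
  return svxar_n_s16 (a, b, 2);
}
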
;; - XAR
;; -------------------------------------------------------------------------
+;; Also allow the Advanced SIMD modes, as the SVE2 XAR instruction can
+;; handle more element sizes than the TARGET_SHA3 one from Advanced SIMD.
+;; Don't allow V2DImode here when TARGET_SHA3, as the Advanced SIMD
+;; version should be preferred when it is available: it is non-destructive
+;; on its inputs.
(define_insn "@aarch64_sve2_xar<mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand")
-	(rotatert:SVE_FULL_I
-	  (xor:SVE_FULL_I
-	    (match_operand:SVE_FULL_I 1 "register_operand")
-	    (match_operand:SVE_FULL_I 2 "register_operand"))
-	  (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")))]
-  "TARGET_SVE2"
-  {@ [ cons: =0 , 1  , 2 ; attrs: movprfx ]
-     [ w        , %0 , w ; *              ] xar\t%0.<Vetype>, %0.<Vetype>, %2.<Vetype>, #%3
-     [ ?&w      , w  , w ; yes            ] movprfx\t%0, %1\;xar\t%0.<Vetype>, %0.<Vetype>, %2.<Vetype>, #%3
+  [(set (match_operand:SVE_ASIMD_FULL_I 0 "register_operand" "=w,?&w")
+	(rotate:SVE_ASIMD_FULL_I
+	  (xor:SVE_ASIMD_FULL_I
+	    (match_operand:SVE_ASIMD_FULL_I 1 "register_operand" "%0,w")
+	    (match_operand:SVE_ASIMD_FULL_I 2 "register_operand" "w,w"))
+	  (match_operand:SVE_ASIMD_FULL_I 3 "aarch64_simd_lshift_imm")))]
+  "TARGET_SVE2 && !(<MODE>mode == V2DImode && TARGET_SHA3)"
+  {
+    operands[3]
+      = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode)
+		 - INTVAL (unwrap_const_vec_duplicate (operands[3])));
+    if (which_alternative == 0)
+      return "xar\t%Z0.<Vetype>, %Z0.<Vetype>, %Z2.<Vetype>, #%3";
+    return "movprfx\t%Z0, %Z1\;xar\t%Z0.<Vetype>, %Z0.<Vetype>, %Z2.<Vetype>, #%3";
  }
+  [(set_attr "movprfx" "*,yes")]
)
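As a worked instance of the output block above (illustrative only, mirroring one of the new tests added below): operands[3] carries the left-rotate amount, so for 16-bit elements a left rotate by 13 prints the XAR right-rotate immediate 16 - 13 = 3. With the iterator widened to SVE_ASIMD_FULL_I, the same pattern can now match a GNU-vector rotate idiom on a 128-bit Advanced SIMD mode when SVE2 is available:

#pragma GCC target "+sve2+nosha3"

typedef unsigned short __attribute__ ((vector_size (16))) v8hi;

/* (c << 13) ^ (c >> 3) is a rotate left by 13 on 16-bit elements; the
   expected instruction is an SVE2 XAR with right-rotate immediate
   16 - 13 = 3, i.e. "xar z0.h, z0.h, z1.h, #3" (register numbers may
   differ).  */
v8hi
xar_rot13 (v8hi a, v8hi b)
{
  v8hi c = a ^ b;
  return (c << 13) ^ (c >> 3);
}
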
;; -------------------------------------------------------------------------
;; All fully-packed SVE integer vector modes.
(define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI])
+;; All fully-packed SVE integer and Advanced SIMD integer modes.
+(define_mode_iterator SVE_ASIMD_FULL_I [SVE_FULL_I VDQ_I])
+
;; All fully-packed SVE floating-point vector modes.
(define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
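The ACLE test updates that follow reflect a behavioural consequence of the new representation (an illustrative note, not part of the patch): rotating by the full element width is the identity, so an svxar whose rotate amount equals the element size degenerates to a plain XOR, and the tests now accept an EOR in either operand order instead of an XAR with a full-width immediate. A minimal sketch, assuming SVE2 is enabled; the function name is hypothetical:

#pragma GCC target "+sve2"
#include <arm_sve.h>

/* A right rotate by the full element width (16 here) is a no-op, so the
   whole operation reduces to an EOR rather than an XAR.  */
svint16_t
xar_full_width (svint16_t a, svint16_t b)
{
  return svxar_n_s16 (a, b, 16);
}
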
/*
** xar_16_s16_tied1:
-** xar z0\.h, z0\.h, z1\.h, #16
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_16_s16_tied1, svint16_t,
/*
** xar_16_s16_tied2:
-** xar z0\.h, z0\.h, z1\.h, #16
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_16_s16_tied2, svint16_t,
/*
** xar_16_s16_untied:
** (
-** movprfx z0, z1
-** xar z0\.h, z0\.h, z2\.h, #16
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.h, z0\.h, z1\.h, #16
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_32_s32_tied1:
-** xar z0\.s, z0\.s, z1\.s, #32
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_32_s32_tied1, svint32_t,
/*
** xar_32_s32_tied2:
-** xar z0\.s, z0\.s, z1\.s, #32
+** (
+** eor z0\.d, z0\.d, z1\.d
+** |
+** eor z0\.d, z1\.d, z0\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_32_s32_tied2, svint32_t,
/*
** xar_32_s32_untied:
** (
-** movprfx z0, z1
-** xar z0\.s, z0\.s, z2\.s, #32
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.s, z0\.s, z1\.s, #32
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_64_s64_tied1:
-** xar z0\.d, z0\.d, z1\.d, #64
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_64_s64_tied1, svint64_t,
/*
** xar_64_s64_tied2:
-** xar z0\.d, z0\.d, z1\.d, #64
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_64_s64_tied2, svint64_t,
/*
** xar_64_s64_untied:
** (
-** movprfx z0, z1
-** xar z0\.d, z0\.d, z2\.d, #64
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.d, z0\.d, z1\.d, #64
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_8_s8_tied1:
-** xar z0\.b, z0\.b, z1\.b, #8
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_8_s8_tied1, svint8_t,
/*
** xar_8_s8_tied2:
-** xar z0\.b, z0\.b, z1\.b, #8
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_8_s8_tied2, svint8_t,
/*
** xar_8_s8_untied:
** (
-** movprfx z0, z1
-** xar z0\.b, z0\.b, z2\.b, #8
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.b, z0\.b, z1\.b, #8
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_16_u16_tied1:
-** xar z0\.h, z0\.h, z1\.h, #16
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_16_u16_tied1, svuint16_t,
/*
** xar_16_u16_tied2:
-** xar z0\.h, z0\.h, z1\.h, #16
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_16_u16_tied2, svuint16_t,
/*
** xar_16_u16_untied:
** (
-** movprfx z0, z1
-** xar z0\.h, z0\.h, z2\.h, #16
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.h, z0\.h, z1\.h, #16
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_32_u32_tied1:
-** xar z0\.s, z0\.s, z1\.s, #32
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_32_u32_tied1, svuint32_t,
/*
** xar_32_u32_tied2:
-** xar z0\.s, z0\.s, z1\.s, #32
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_32_u32_tied2, svuint32_t,
/*
** xar_32_u32_untied:
** (
-** movprfx z0, z1
-** xar z0\.s, z0\.s, z2\.s, #32
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.s, z0\.s, z1\.s, #32
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_64_u64_tied1:
-** xar z0\.d, z0\.d, z1\.d, #64
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_64_u64_tied1, svuint64_t,
/*
** xar_64_u64_tied2:
-** xar z0\.d, z0\.d, z1\.d, #64
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_64_u64_tied2, svuint64_t,
/*
** xar_64_u64_untied:
** (
-** movprfx z0, z1
-** xar z0\.d, z0\.d, z2\.d, #64
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.d, z0\.d, z1\.d, #64
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
/*
** xar_8_u8_tied1:
-** xar z0\.b, z0\.b, z1\.b, #8
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_8_u8_tied1, svuint8_t,
/*
** xar_8_u8_tied2:
-** xar z0\.b, z0\.b, z1\.b, #8
+** (
+** eor z0\.d, z1\.d, z0\.d
+** |
+** eor z0\.d, z0\.d, z1\.d
+** )
** ret
*/
TEST_UNIFORM_Z (xar_8_u8_tied2, svuint8_t,
/*
** xar_8_u8_untied:
** (
-** movprfx z0, z1
-** xar z0\.b, z0\.b, z2\.b, #8
+** eor z0\.d, z1\.d, z2\.d
** |
-** movprfx z0, z2
-** xar z0\.b, z0\.b, z1\.b, #8
+** eor z0\.d, z2\.d, z1\.d
** )
** ret
*/
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#pragma GCC target "+sve2+nosha3"
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+
+v16qi
+xar_v16qi (v16qi a, v16qi b) {
+ v16qi c = a ^ b;
+ return (c << 2) ^ (c >> 6);
+}
+/* { dg-final { scan-assembler {\txar\tz0.b, z[0-9]+.b, z[0-9]+.b, #6} } } */
+
+v8hi
+xar_v8hi (v8hi a, v8hi b) {
+ v8hi c = a ^ b;
+ return (c << 13) ^ (c >> 3);
+}
+/* { dg-final { scan-assembler {\txar\tz0.h, z[0-9]+.h, z[0-9]+.h, #3} } } */
+
+v4si
+xar_v4si (v4si a, v4si b) {
+ v4si c = a ^ b;
+ return (c << 9) ^ (c >> 23);
+}
+/* { dg-final { scan-assembler {\txar\tz0.s, z[0-9]+.s, z[0-9]+.s, #23} } } */
+
+/* When +sha3 for Advanced SIMD is not available, we should still use the
+   SVE2 form of XAR.  */
+v2di
+xar_v2di (v2di a, v2di b) {
+ v2di c = a ^ b;
+ return (c << 22) ^ (c >> 42);
+}
+/* { dg-final { scan-assembler {\txar\tz0.d, z[0-9]+.d, z[0-9]+.d, #42} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#pragma GCC target "+sve2+sha3"
+
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+
+/* Both +sve2 and +sha3 have V2DImode XAR instructions, but we should
+ prefer the Advanced SIMD one when both are available. */
+v2di
+xar_v2di (v2di a, v2di b) {
+ v2di c = a ^ b;
+ return (c << 22) ^ (c >> 42);
+}
+/* { dg-final { scan-assembler {\txar\tv0.2d, v[0-9]+.2d, v[0-9]+.2d, 42} } } */
+