From a53b8229e64c78256449005929e599b2eab83fbd Mon Sep 17 00:00:00 2001
From: Jonathan Wright
Date: Mon, 8 Feb 2021 11:37:29 +0000
Subject: [PATCH] aarch64: Use RTL builtins for vq[r]dmulh[q]_n intrinsics

Rewrite vq[r]dmulh[q]_n Neon intrinsics to use RTL builtins rather than
inline assembly code, allowing for better scheduling and optimization.

gcc/ChangeLog:

2021-02-08  Jonathan Wright

	* config/aarch64/aarch64-simd-builtins.def: Add sq[r]dmulh_n
	builtin generator macros.
	* config/aarch64/aarch64-simd.md (aarch64_sq<r>dmulh_n<mode>):
	Define.
	* config/aarch64/arm_neon.h (vqdmulh_n_s16): Use RTL builtin
	instead of inline asm.
	(vqdmulh_n_s32): Likewise.
	(vqdmulhq_n_s16): Likewise.
	(vqdmulhq_n_s32): Likewise.
	(vqrdmulh_n_s16): Likewise.
	(vqrdmulh_n_s32): Likewise.
	(vqrdmulhq_n_s16): Likewise.
	(vqrdmulhq_n_s32): Likewise.
---
 gcc/config/aarch64/aarch64-simd-builtins.def |  3 ++
 gcc/config/aarch64/aarch64-simd.md           | 12 +++++
 gcc/config/aarch64/arm_neon.h                | 56 +++-----------------
 3 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index b885bd5b38bf..f79e71682934 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -348,6 +348,9 @@
   /* Implemented by aarch64_sq<r>dmulh<mode>.  */
   BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0, NONE)
   BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0, NONE)
+  /* Implemented by aarch64_sq<r>dmulh_n<mode>.  */
+  BUILTIN_VDQHS (BINOP, sqdmulh_n, 0, NONE)
+  BUILTIN_VDQHS (BINOP, sqrdmulh_n, 0, NONE)
   /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>.  */
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0, NONE)
   BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4edee99051c4..5245cf01ba34 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4639,6 +4639,18 @@
   [(set_attr "type" "neon_sat_mul_<Vetype><q>")]
 )
 
+(define_insn "aarch64_sq<r>dmulh_n<mode>"
+  [(set (match_operand:VDQHS 0 "register_operand" "=w")
+	(unspec:VDQHS
+	  [(match_operand:VDQHS 1 "register_operand" "w")
+	   (vec_duplicate:VDQHS
+	     (match_operand:<VEL> 2 "register_operand" "<h_con>"))]
+	 VQDMULH))]
+  "TARGET_SIMD"
+  "sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]"
+  [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
+)
+
 ;; sq<r>dmulh_lane
 
 (define_insn "aarch64_sq<r>dmulh_lane<mode>"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index baa30bd5a9d9..5fb2b3d0d456 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8769,48 +8769,28 @@ __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
 {
-  int16x4_t __result;
-  __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]"
-           : "=w"(__result)
-           : "w"(__a), "x"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqdmulh_nv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
 {
-  int32x2_t __result;
-  __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]"
-           : "=w"(__result)
-           : "w"(__a), "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqdmulh_nv2si (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  int16x8_t __result;
-  __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]"
-           : "=w"(__result)
-           : "w"(__a), "x"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqdmulh_nv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  int32x4_t __result;
-  __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]"
-           : "=w"(__result)
-           : "w"(__a), "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqdmulh_nv4si (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
@@ -8880,48 +8860,28 @@ __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
 {
-  int16x4_t __result;
-  __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]"
-           : "=w"(__result)
-           : "w"(__a), "x"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqrdmulh_nv4hi (__a, __b);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
 {
-  int32x2_t __result;
-  __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]"
-           : "=w"(__result)
-           : "w"(__a), "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqrdmulh_nv2si (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  int16x8_t __result;
-  __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]"
-           : "=w"(__result)
-           : "w"(__a), "x"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqrdmulh_nv8hi (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  int32x4_t __result;
-  __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]"
-           : "=w"(__result)
-           : "w"(__a), "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_sqrdmulh_nv4si (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
-- 
2.47.2
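
Not part of the patch: below is a minimal, hypothetical usage sketch showing one
of the affected intrinsics in context. The function name scale_q15, the sample
values and the scalar tail are illustrative only, and it assumes an aarch64
target where arm_neon.h is available (built with e.g. -O2). The point of the
rewrite is that vqdmulhq_n_s16 now expands through an RTL builtin rather than
an opaque asm block, so the compiler can schedule the SQDMULH instruction among
the surrounding loads and stores instead of treating it as a barrier.

/* Illustrative example only -- not part of the patch.  */
#include <arm_neon.h>
#include <stdio.h>

/* Scale a block of Q15 samples by a Q15 gain using the signed saturating
   doubling multiply returning high half (SQDMULH).  */
static void
scale_q15 (int16_t *dst, const int16_t *src, int16_t gain, int n)
{
  int i;
  for (i = 0; i + 8 <= n; i += 8)
    {
      int16x8_t v = vld1q_s16 (src + i);
      /* Each lane: saturate ((2 * src * gain) >> 16).  */
      vst1q_s16 (dst + i, vqdmulhq_n_s16 (v, gain));
    }
  /* Scalar tail; omits the INT16_MIN * INT16_MIN saturation case for
     brevity.  */
  for (; i < n; i++)
    dst[i] = (int16_t) (((int32_t) src[i] * gain * 2) >> 16);
}

int
main (void)
{
  int16_t in[8] = { 16384, -16384, 32767, -32768, 1000, -1000, 1, 0 };
  int16_t out[8];

  scale_q15 (out, in, 16384 /* 0.5 in Q15 */, 8);

  for (int i = 0; i < 8; i++)
    printf ("%6d -> %6d\n", in[i], out[i]);
  return 0;
}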