From 6372b05e5b14f27ddce11c28654956c1ad715dac Mon Sep 17 00:00:00 2001 From: Jonathan Wright Date: Wed, 10 Feb 2021 11:39:39 +0000 Subject: [PATCH] aarch64: Use RTL builtins for polynomial vsli[q]_n intrinsics Rewrite vsli[q]_n_p* Neon intrinsics to use RTL builtins rather than inline assembly code, allowing for better scheduling and optimization. gcc/ChangeLog: 2021-02-10 Jonathan Wright * config/aarch64/aarch64-simd-builtins.def: Use VALLP mode iterator for polynomial ssli_n builtin generator macro. * config/aarch64/arm_neon.h (vsli_n_p8): Use RTL builtin instead of inline asm. (vsli_n_p16): Likewise. (vsliq_n_p8): Likewise. (vsliq_n_p16): Likewise. * config/aarch64/iterators.md: Define VALLP mode iterator. --- gcc/config/aarch64/aarch64-simd-builtins.def | 2 +- gcc/config/aarch64/arm_neon.h | 72 +++++++------------- gcc/config/aarch64/iterators.md | 3 + 3 files changed, 28 insertions(+), 49 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 202f69005e58..534979133f44 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -436,7 +436,7 @@ BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE) BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE) BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE) - VAR2 (SHIFTINSERTP, ssli_n, 0, NONE, di, v2di) + BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE) BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, NONE) /* Implemented by aarch64_qshl_n. */ BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, NONE) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 164c76d3f7bf..38a3a3ff01e1 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9050,57 +9050,33 @@ vshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) __builtin_aarch64_shrn2v2di ((int32x2_t) __a, (int64x2_t) __b, __c); } -#define vsli_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("sli %0.8b,%2.8b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv8qi_ppps (__a, __b, __c); +} -#define vsli_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("sli %0.4h,%2.4h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv4hi_ppps (__a, __b, __c); +} -#define vsliq_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("sli %0.16b,%2.16b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv16qi_ppps (__a, __b, __c); +} -#define vsliq_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("sli %0.8h,%2.8h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c) +{ + return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c); +} #define vsri_n_p8(a, b, c) \ __extension__ \ diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 8a765ea8a32b..fe2c51cebf13 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -203,6 +203,9 @@ (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) +;; All Advanced SIMD polynomial modes and DI. +(define_mode_iterator VALLP [V8QI V16QI V4HI V8HI V2DI DI]) + ;; Advanced SIMD modes for Integer reduction across lanes. (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI]) -- 2.47.2