]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Use RTL builtins for polynomial vsli[q]_n intrinsics
authorJonathan Wright <jonathan.wright@arm.com>
Wed, 10 Feb 2021 11:39:39 +0000 (11:39 +0000)
committerJonathan Wright <jonathan.wright@arm.com>
Wed, 28 Apr 2021 20:11:58 +0000 (21:11 +0100)
Rewrite vsli[q]_n_p* Neon intrinsics to use RTL builtins rather than
inline assembly code, allowing for better scheduling and
optimization.

gcc/ChangeLog:

2021-02-10  Jonathan Wright  <jonathan.wright@arm.com>

* config/aarch64/aarch64-simd-builtins.def: Use VALLP mode
iterator for polynomial ssli_n builtin generator macro.
* config/aarch64/arm_neon.h (vsli_n_p8): Use RTL builtin
instead of inline asm.
(vsli_n_p16): Likewise.
(vsliq_n_p8): Likewise.
(vsliq_n_p16): Likewise.
* config/aarch64/iterators.md: Define VALLP mode iterator.

gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/arm_neon.h
gcc/config/aarch64/iterators.md

index 202f69005e58feec2e552e8b0486cdffeb0b647d..534979133f449bd8c5ceba3ae2f06287ee4cf572 100644 (file)
   BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0, NONE)
   BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0, NONE)
   BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0, NONE)
-  VAR2 (SHIFTINSERTP, ssli_n, 0, NONE, di, v2di)
+  BUILTIN_VALLP (SHIFTINSERTP, ssli_n, 0, NONE)
   BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0, NONE)
   /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
   BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0, NONE)
index 164c76d3f7bffb8ba287959400d21405bd0239e0..38a3a3ff01e1567d9a7726c6fe4ed856299bbce4 100644 (file)
@@ -9050,57 +9050,33 @@ vshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
     __builtin_aarch64_shrn2v2di ((int32x2_t) __a, (int64x2_t) __b, __c);
 }
 
-#define vsli_n_p8(a, b, c)                                              \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x8_t b_ = (b);                                              \
-       poly8x8_t a_ = (a);                                              \
-       poly8x8_t result;                                                \
-       __asm__ ("sli %0.8b,%2.8b,%3"                                    \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+  return __builtin_aarch64_ssli_nv8qi_ppps (__a, __b, __c);
+}
 
-#define vsli_n_p16(a, b, c)                                             \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x4_t b_ = (b);                                             \
-       poly16x4_t a_ = (a);                                             \
-       poly16x4_t result;                                               \
-       __asm__ ("sli %0.4h,%2.4h,%3"                                    \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+  return __builtin_aarch64_ssli_nv4hi_ppps (__a, __b, __c);
+}
 
-#define vsliq_n_p8(a, b, c)                                             \
-  __extension__                                                         \
-    ({                                                                  \
-       poly8x16_t b_ = (b);                                             \
-       poly8x16_t a_ = (a);                                             \
-       poly8x16_t result;                                               \
-       __asm__ ("sli %0.16b,%2.16b,%3"                                  \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+  return __builtin_aarch64_ssli_nv16qi_ppps (__a, __b, __c);
+}
 
-#define vsliq_n_p16(a, b, c)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       poly16x8_t b_ = (b);                                             \
-       poly16x8_t a_ = (a);                                             \
-       poly16x8_t result;                                               \
-       __asm__ ("sli %0.8h,%2.8h,%3"                                    \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+  return __builtin_aarch64_ssli_nv8hi_ppps (__a, __b, __c);
+}
 
 #define vsri_n_p8(a, b, c)                                              \
   __extension__                                                         \
index 8a765ea8a32b6e1febbeeafc726b316dc01055ae..fe2c51cebf13d6bff96d3a41e7b331560068f64d 100644 (file)
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
                               V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
 
+;; All Advanced SIMD polynomial modes and DI.
+(define_mode_iterator VALLP [V8QI V16QI V4HI V8HI V2DI DI])
+
 ;; Advanced SIMD modes for Integer reduction across lanes.
 (define_mode_iterator VDQV [V8QI V16QI V4HI V8HI V4SI V2DI])