From: Alan Lawrence
Date: Wed, 7 May 2014 10:40:41 +0000 (+0000)
Subject: Reimplement AArch64 TRN intrinsics with __builtin_shuffle.
X-Git-Tag: releases/gcc-5.1.0~7737
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1d175503d523c7c2404393bbdc08f4dd4cdb9054;p=thirdparty%2Fgcc.git

Reimplement AArch64 TRN intrinsics with __builtin_shuffle.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vtrns32.c: Expect zip[12] insn rather than trn[12].
	* gcc.target/aarch64/vtrnu32.c: Likewise.
	* gcc.target/aarch64/vtrnf32.c: Likewise.

gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8,
	vtrn1_s16, vtrn1_s32, vtrn1_u8, vtrn1_u16, vtrn1_u32, vtrn1q_f32,
	vtrn1q_f64, vtrn1q_p8, vtrn1q_p16, vtrn1q_s8, vtrn1q_s16, vtrn1q_s32,
	vtrn1q_s64, vtrn1q_u8, vtrn1q_u16, vtrn1q_u32, vtrn1q_u64, vtrn2_f32,
	vtrn2_p8, vtrn2_p16, vtrn2_s8, vtrn2_s16, vtrn2_s32, vtrn2_u8,
	vtrn2_u16, vtrn2_u32, vtrn2q_f32, vtrn2q_f64, vtrn2q_p8, vtrn2q_p16,
	vtrn2q_s8, vtrn2q_s16, vtrn2q_s32, vtrn2q_s64, vtrn2q_u8, vtrn2q_u16,
	vtrn2q_u32, vtrn2q_u64): Replace temporary asm with __builtin_shuffle.

From-SVN: r210151
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e5033a04f268..76339eeede1e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2014-05-07  Alan Lawrence
+
+	* config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8,
+	vtrn1_s16, vtrn1_s32, vtrn1_u8, vtrn1_u16, vtrn1_u32, vtrn1q_f32,
+	vtrn1q_f64, vtrn1q_p8, vtrn1q_p16, vtrn1q_s8, vtrn1q_s16, vtrn1q_s32,
+	vtrn1q_s64, vtrn1q_u8, vtrn1q_u16, vtrn1q_u32, vtrn1q_u64, vtrn2_f32,
+	vtrn2_p8, vtrn2_p16, vtrn2_s8, vtrn2_s16, vtrn2_s32, vtrn2_u8,
+	vtrn2_u16, vtrn2_u32, vtrn2q_f32, vtrn2q_f64, vtrn2q_p8, vtrn2q_p16,
+	vtrn2q_s8, vtrn2q_s16, vtrn2q_s32, vtrn2q_s64, vtrn2q_u8, vtrn2q_u16,
+	vtrn2q_u32, vtrn2q_u64): Replace temporary asm with __builtin_shuffle.
+
 2014-05-07  Thomas Schwinge
 
 	* loop-unswitch.c: Delete.
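[Annotation: the __builtin_shuffle bodies that replace the asm live in a part of the patch beyond this excerpt. A minimal sketch of the pattern, assuming little-endian lane numbering (big-endian builds need a different index mask):

__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vtrn1_s32 (int32x2_t a, int32x2_t b)
{
  /* Mask indices 0-1 select lanes of a, 2-3 select lanes of b, so this
     picks {a[0], b[0]}.  On a two-element vector that permute is
     identical to the one zip1 performs, which is why the vtrn*32 tests
     now expect zip1/zip2 rather than trn1/trn2.  */
  return __builtin_shuffle (a, b, (uint32x2_t) {0, 2});
}
]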
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f6213ce2aeac..5ef189129c40 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -12694,468 +12694,6 @@ vsubhn_u64 (uint64x2_t a, uint64x2_t b) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vtrn1_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("trn1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtrn1_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("trn1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vtrn1_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("trn1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtrn1_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("trn1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vtrn1_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("trn1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vtrn1_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("trn1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtrn1_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("trn1 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtrn1_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("trn1 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtrn1_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("trn1 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vtrn1q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("trn1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vtrn1q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("trn1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vtrn1q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("trn1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vtrn1q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("trn1 
%0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vtrn1q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("trn1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vtrn1q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("trn1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vtrn1q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("trn1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vtrn1q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("trn1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtrn1q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("trn1 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtrn1q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("trn1 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtrn1q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("trn1 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtrn1q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("trn1 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vtrn2_f32 (float32x2_t a, float32x2_t b) -{ - float32x2_t result; - __asm__ ("trn2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtrn2_p8 (poly8x8_t a, poly8x8_t b) -{ - poly8x8_t result; - __asm__ ("trn2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vtrn2_p16 (poly16x4_t a, poly16x4_t b) -{ - poly16x4_t result; - __asm__ ("trn2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtrn2_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("trn2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vtrn2_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("trn2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vtrn2_s32 
(int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("trn2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtrn2_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("trn2 %0.8b,%1.8b,%2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vtrn2_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("trn2 %0.4h,%1.4h,%2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vtrn2_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("trn2 %0.2s,%1.2s,%2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vtrn2q_f32 (float32x4_t a, float32x4_t b) -{ - float32x4_t result; - __asm__ ("trn2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vtrn2q_f64 (float64x2_t a, float64x2_t b) -{ - float64x2_t result; - __asm__ ("trn2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vtrn2q_p8 (poly8x16_t a, poly8x16_t b) -{ - poly8x16_t result; - __asm__ ("trn2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vtrn2q_p16 (poly16x8_t a, poly16x8_t b) -{ - poly16x8_t result; - __asm__ ("trn2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vtrn2q_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("trn2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vtrn2q_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("trn2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vtrn2q_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("trn2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vtrn2q_s64 (int64x2_t a, int64x2_t b) -{ - int64x2_t result; - __asm__ ("trn2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vtrn2q_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("trn2 %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vtrn2q_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("trn2 %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - 
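[Annotation: every trn1/trn2 body deleted here is the same one-instruction asm template; the shuffle form makes the lane selection explicit instead of opaque to the optimizer. A hypothetical, little-endian-only sketch of the trn2 pattern for a 16-lane vector:

__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vtrn2q_s8 (int8x16_t a, int8x16_t b)
{
  /* trn2 interleaves the odd lanes of the two inputs:
     {a[1], b[1], a[3], b[3], ...}; indices 16-31 refer to b.  */
  return __builtin_shuffle (a, b,
			    (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23,
					  9, 25, 11, 27, 13, 29, 15, 31});
}
]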
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vtrn2q_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("trn2 %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vtrn2q_u64 (uint64x2_t a, uint64x2_t b) -{ - uint64x2_t result; - __asm__ ("trn2 %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vtst_p8 (poly8x8_t a, poly8x8_t b) { @@ -23150,1309 +22688,1741 @@ vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) (int8x8_t) __b, __c); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t) __builtin_aarch64_usra_nv4hi ((int16x4_t) __a, + (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t) __builtin_aarch64_usra_nv2si ((int32x2_t) __a, + (int32x2_t) __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t) __builtin_aarch64_usra_ndi ((int64x1_t) __a, + (int64x1_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t) __builtin_aarch64_usra_nv16qi ((int8x16_t) __a, + (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t) __builtin_aarch64_usra_nv8hi ((int16x8_t) __a, + (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_usra_nv4si ((int32x4_t) __a, + (int32x4_t) __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_usra_nv2di ((int64x2_t) __a, + (int64x2_t) __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsrad_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + 
return (int64x1_t) __builtin_aarch64_ssra_ndi (__a, __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsrad_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t) __builtin_aarch64_usra_ndi (__a, __b, __c); +} + +/* vsri */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +{ + return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t) __builtin_aarch64_ssri_ndi (__a, __b, __c); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +{ + return (uint8x8_t) __builtin_aarch64_usri_nv8qi ((int8x8_t) __a, + (int8x8_t) __b, __c); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return (uint16x4_t) __builtin_aarch64_usri_nv4hi ((int16x4_t) __a, + (int16x4_t) __b, __c); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return (uint32x2_t) __builtin_aarch64_usri_nv2si ((int32x2_t) __a, + (int32x2_t) __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t) __builtin_aarch64_usri_ndi ((int64x1_t) __a, + (int64x1_t) __b, __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +{ + return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +{ + return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +{ + return (uint8x16_t) __builtin_aarch64_usri_nv16qi ((int8x16_t) __a, + (int8x16_t) __b, __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint16x8_t) __builtin_aarch64_usri_nv8hi ((int16x8_t) __a, + (int16x8_t) __b, __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +{ + 
return (uint32x4_t) __builtin_aarch64_usri_nv4si ((int32x4_t) __a, + (int32x4_t) __b, __c); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint64x2_t) __builtin_aarch64_usri_nv2di ((int64x2_t) __a, + (int64x2_t) __b, __c); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsrid_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +{ + return (int64x1_t) __builtin_aarch64_ssri_ndi (__a, __b, __c); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsrid_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +{ + return (uint64x1_t) __builtin_aarch64_usri_ndi (__a, __b, __c); +} + +/* vst1 */ + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_f32 (float32_t *a, float32x2_t b) +{ + __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_f64 (float64_t *a, float64x1_t b) +{ + *a = b; +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_p8 (poly8_t *a, poly8x8_t b) +{ + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, + (int8x8_t) b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_p16 (poly16_t *a, poly16x4_t b) +{ + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, + (int16x4_t) b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s8 (int8_t *a, int8x8_t b) +{ + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s16 (int16_t *a, int16x4_t b) +{ + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s32 (int32_t *a, int32x2_t b) +{ + __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_s64 (int64_t *a, int64x1_t b) +{ + *a = b; +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u8 (uint8_t *a, uint8x8_t b) { - return (uint16x4_t) __builtin_aarch64_usra_nv4hi ((int16x4_t) __a, - (int16x4_t) __b, __c); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, + (int8x8_t) b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u16 (uint16_t *a, uint16x4_t b) { - return (uint32x2_t) __builtin_aarch64_usra_nv2si ((int32x2_t) __a, - (int32x2_t) __b, __c); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, + (int16x4_t) b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u32 (uint32_t *a, uint32x2_t b) { - return (uint64x1_t) __builtin_aarch64_usra_ndi ((int64x1_t) __a, - (int64x1_t) __b, __c); + __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, + (int32x2_t) b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_u64 (uint64_t *a, uint64x1_t b) { - return (int8x16_t) 
__builtin_aarch64_ssra_nv16qi (__a, __b, __c); + *a = b; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_f32 (float32_t *a, float32x4_t b) { - return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); + __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_f64 (float64_t *a, float64x2_t b) { - return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); + __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +/* vst1q */ + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_p8 (poly8_t *a, poly8x16_t b) { - return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, + (int8x16_t) b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_p16 (poly16_t *a, poly16x8_t b) { - return (uint8x16_t) __builtin_aarch64_usra_nv16qi ((int8x16_t) __a, - (int8x16_t) __b, __c); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, + (int16x8_t) b); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s8 (int8_t *a, int8x16_t b) { - return (uint16x8_t) __builtin_aarch64_usra_nv8hi ((int16x8_t) __a, - (int16x8_t) __b, __c); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s16 (int16_t *a, int16x8_t b) { - return (uint32x4_t) __builtin_aarch64_usra_nv4si ((int32x4_t) __a, - (int32x4_t) __b, __c); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s32 (int32_t *a, int32x4_t b) { - return (uint64x2_t) __builtin_aarch64_usra_nv2di ((int64x2_t) __a, - (int64x2_t) __b, __c); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsrad_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_s64 (int64_t *a, int64x2_t b) { - return (int64x1_t) __builtin_aarch64_ssra_ndi (__a, __b, __c); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsrad_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u8 (uint8_t *a, uint8x16_t b) { - return 
(uint64x1_t) __builtin_aarch64_usra_ndi (__a, __b, __c); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, + (int8x16_t) b); } -/* vsri */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u16 (uint16_t *a, uint16x8_t b) { - return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, + (int16x8_t) b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u32 (uint32_t *a, uint32x4_t b) { - return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, + (int32x4_t) b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_u64 (uint64_t *a, uint64x2_t b) { - return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, + (int64x2_t) b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +/* vstn */ + +__extension__ static __inline void +vst2_s64 (int64_t * __a, int64x1x2_t val) { - return (int64x1_t) __builtin_aarch64_ssri_ndi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + int64x2x2_t temp; + temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +__extension__ static __inline void +vst2_u64 (uint64_t * __a, uint64x1x2_t val) { - return (uint8x8_t) __builtin_aarch64_usri_nv8qi ((int8x8_t) __a, - (int8x8_t) __b, __c); + __builtin_aarch64_simd_oi __o; + uint64x2x2_t temp; + temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +__extension__ static __inline void +vst2_f64 (float64_t * __a, float64x1x2_t val) { - return (uint16x4_t) __builtin_aarch64_usri_nv4hi ((int16x4_t) __a, - (int16x4_t) __b, __c); + __builtin_aarch64_simd_oi __o; + float64x2x2_t temp; + temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); + __builtin_aarch64_st2df 
((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +__extension__ static __inline void +vst2_s8 (int8_t * __a, int8x8x2_t val) { - return (uint32x2_t) __builtin_aarch64_usri_nv2si ((int32x2_t) __a, - (int32x2_t) __b, __c); + __builtin_aarch64_simd_oi __o; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_p8 (poly8_t * __a, poly8x8x2_t val) { - return (uint64x1_t) __builtin_aarch64_usri_ndi ((int64x1_t) __a, - (int64x1_t) __b, __c); + __builtin_aarch64_simd_oi __o; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s16 (int16_t * __a, int16x4x2_t val) { - return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + int16x8x2_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_p16 (poly16_t * __a, poly16x4x2_t val) { - return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + poly16x8x2_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_s32 (int32_t * __a, int32x2x2_t val) { - return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + int32x4x2_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + 
temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u8 (uint8_t * __a, uint8x8x2_t val) { - return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u16 (uint16_t * __a, uint16x4x2_t val) { - return (uint8x16_t) __builtin_aarch64_usri_nv16qi ((int8x16_t) __a, - (int8x16_t) __b, __c); + __builtin_aarch64_simd_oi __o; + uint16x8x2_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_u32 (uint32_t * __a, uint32x2x2_t val) { - return (uint16x8_t) __builtin_aarch64_usri_nv8hi ((int16x8_t) __a, - (int16x8_t) __b, __c); + __builtin_aarch64_simd_oi __o; + uint32x4x2_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_f32 (float32_t * __a, float32x2x2_t val) { - return (uint32x4_t) __builtin_aarch64_usri_nv4si ((int32x4_t) __a, - (int32x4_t) __b, __c); + __builtin_aarch64_simd_oi __o; + float32x4x2_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); + __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vsriq_n_u64 (uint64x2_t __a, 
uint64x2_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s8 (int8_t * __a, int8x16x2_t val) { - return (uint64x2_t) __builtin_aarch64_usri_nv2di ((int64x2_t) __a, - (int64x2_t) __b, __c); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsrid_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_p8 (poly8_t * __a, poly8x16x2_t val) { - return (int64x1_t) __builtin_aarch64_ssri_ndi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsrid_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_s16 (int16_t * __a, int16x8x2_t val) { - return (uint64x1_t) __builtin_aarch64_usri_ndi (__a, __b, __c); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -/* vst1 */ - __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_f32 (float32_t *a, float32x2_t b) +vst2q_p16 (poly16_t * __a, poly16x8x2_t val) { - __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_f64 (float64_t *a, float64x1_t b) +vst2q_s32 (int32_t * __a, int32x4x2_t val) { - *a = b; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_p8 (poly8_t *a, poly8x8_t b) +vst2q_s64 (int64_t * __a, int64x2x2_t val) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_p16 (poly16_t *a, poly16x4_t b) +vst2q_u8 (uint8_t * __a, uint8x16x2_t val) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } 
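[Annotation: the vst2q_* bodies above all marshal the vector pair into a two-q-register tuple (__builtin_aarch64_simd_oi) and hand it to a single ST2 store builtin. A usage sketch, with an invented buffer and values:

/* After the call, out holds the two vectors interleaved element by
   element: {1, 2, 1, 2, ..., 1, 2} (32 bytes, one ST2 instruction).  */
uint8_t out[32];
uint8x16x2_t pair;
pair.val[0] = vdupq_n_u8 (1);
pair.val[1] = vdupq_n_u8 (2);
vst2q_u8 (out, pair);
]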
__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s8 (int8_t *a, int8x8_t b) +vst2q_u16 (uint16_t * __a, uint16x8x2_t val) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s16 (int16_t *a, int16x4_t b) +vst2q_u32 (uint32_t * __a, uint32x4x2_t val) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s32 (int32_t *a, int32x2_t b) +vst2q_u64 (uint64_t * __a, uint64x2x2_t val) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_s64 (int64_t *a, int64x1_t b) +vst2q_f32 (float32_t * __a, float32x4x2_t val) { - *a = b; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); + __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u8 (uint8_t *a, uint8x8_t b) +vst2q_f64 (float64_t * __a, float64x2x2_t val) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); + __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u16 (uint16_t *a, uint16x4_t b) +__extension__ static __inline void +vst3_s64 (int64_t * __a, int64x1x3_t val) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); + __builtin_aarch64_simd_ci __o; + int64x2x3_t temp; + temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u32 (uint32_t *a, uint32x2_t b) +__extension__ static __inline void +vst3_u64 (uint64_t * __a, uint64x1x3_t val) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, - (int32x2_t) b); + __builtin_aarch64_simd_ci __o; + uint64x2x3_t temp; + temp.val[0] = 
vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1_u64 (uint64_t *a, uint64x1_t b) +__extension__ static __inline void +vst3_f64 (float64_t * __a, float64x1x3_t val) { - *a = b; + __builtin_aarch64_simd_ci __o; + float64x2x3_t temp; + temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); + __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_f32 (float32_t *a, float32x4_t b) +__extension__ static __inline void +vst3_s8 (int8_t * __a, int8x8x3_t val) { - __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); + __builtin_aarch64_simd_ci __o; + int8x16x3_t temp; + temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_f64 (float64_t *a, float64x2_t b) +vst3_p8 (poly8_t * __a, poly8x8x3_t val) { - __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); + __builtin_aarch64_simd_ci __o; + poly8x16x3_t temp; + temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vst1q */ - __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_p8 (poly8_t *a, poly8x16_t b) +vst3_s16 (int16_t * __a, int16x4x3_t val) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); + __builtin_aarch64_simd_ci __o; + int16x8x3_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = 
__builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_p16 (poly16_t *a, poly16x8_t b) +vst3_p16 (poly16_t * __a, poly16x4x3_t val) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); + __builtin_aarch64_simd_ci __o; + poly16x8x3_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s8 (int8_t *a, int8x16_t b) +vst3_s32 (int32_t * __a, int32x2x3_t val) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); + __builtin_aarch64_simd_ci __o; + int32x4x3_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s16 (int16_t *a, int16x8_t b) +vst3_u8 (uint8_t * __a, uint8x8x3_t val) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); + __builtin_aarch64_simd_ci __o; + uint8x16x3_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s32 (int32_t *a, int32x4_t b) +vst3_u16 (uint16_t * __a, uint16x4x3_t val) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); + __builtin_aarch64_simd_ci __o; + uint16x8x3_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) 
__a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_s64 (int64_t *a, int64x2_t b) +vst3_u32 (uint32_t * __a, uint32x2x3_t val) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); + __builtin_aarch64_simd_ci __o; + uint32x4x3_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u8 (uint8_t *a, uint8x16_t b) +vst3_f32 (float32_t * __a, float32x2x3_t val) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); + __builtin_aarch64_simd_ci __o; + float32x4x3_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); + __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u16 (uint16_t *a, uint16x8_t b) +vst3q_s8 (int8_t * __a, int8x16x3_t val) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u32 (uint32_t *a, uint32x4_t b) +vst3q_p8 (poly8_t * __a, poly8x16x3_t val) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, - (int32x4_t) b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst1q_u64 (uint64_t *a, uint64x2_t b) -{ - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, - (int64x2_t) b); -} - -/* vstn */ - -__extension__ static __inline void -vst2_s64 (int64_t * __a, int64x1x2_t val) -{ - __builtin_aarch64_simd_oi __o; - int64x2x2_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); -} - 
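[Annotation: in the 64-bit (d-register) vstN bodies, old and new alike, each half-width value is vcombine'd with a zero upper half; the set_qregoi*/set_qregci* tuple builtins take full q registers, and the store builtin then writes only the live low halves. A vst3 usage sketch with invented pixel data:

/* Stores 8 RGB pixels as {r0, g0, b0, r1, g1, b1, ...} in 24 bytes,
   emitting a single ST3.  */
uint8_t rgb[24];
uint8x8x3_t px;
px.val[0] = vdup_n_u8 (0x10);  /* R plane */
px.val[1] = vdup_n_u8 (0x20);  /* G plane */
px.val[2] = vdup_n_u8 (0x30);  /* B plane */
vst3_u8 (rgb, px);
]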
-__extension__ static __inline void -vst2_u64 (uint64_t * __a, uint64x1x2_t val) +vst3q_s16 (int16_t * __a, int16x8x3_t val) { - __builtin_aarch64_simd_oi __o; - uint64x2x2_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline void -vst2_f64 (float64_t * __a, float64x1x2_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_p16 (poly16_t * __a, poly16x8x3_t val) { - __builtin_aarch64_simd_oi __o; - float64x2x2_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); - __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline void -vst2_s8 (int8_t * __a, int8x8x2_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_s32 (int32_t * __a, int32x4x3_t val) { - __builtin_aarch64_simd_oi __o; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_p8 (poly8_t * __a, poly8x8x2_t val) +vst3q_s64 (int64_t * __a, int64x2x3_t val) { - __builtin_aarch64_simd_oi __o; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); + __o = 
__builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_s16 (int16_t * __a, int16x4x2_t val) +vst3q_u8 (uint8_t * __a, uint8x16x3_t val) { - __builtin_aarch64_simd_oi __o; - int16x8x2_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_p16 (poly16_t * __a, poly16x4x2_t val) +vst3q_u16 (uint16_t * __a, uint16x8x3_t val) { - __builtin_aarch64_simd_oi __o; - poly16x8x2_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_s32 (int32_t * __a, int32x2x2_t val) +vst3q_u32 (uint32_t * __a, uint32x4x3_t val) { - __builtin_aarch64_simd_oi __o; - int32x4x2_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u8 (uint8_t * __a, uint8x8x2_t val) +vst3q_u64 (uint64_t * __a, uint64x2x3_t val) { - __builtin_aarch64_simd_oi __o; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - 
__builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u16 (uint16_t * __a, uint16x4x2_t val) +vst3q_f32 (float32_t * __a, float32x4x3_t val) { - __builtin_aarch64_simd_oi __o; - uint16x8x2_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); + __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_u32 (uint32_t * __a, uint32x2x2_t val) +vst3q_f64 (float64_t * __a, float64x2x3_t val) { - __builtin_aarch64_simd_oi __o; - uint32x4x2_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); + __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2_f32 (float32_t * __a, float32x2x2_t val) +__extension__ static __inline void +vst4_s64 (int64_t * __a, int64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - float32x4x2_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + __builtin_aarch64_simd_xi __o; + int64x2x4_t temp; + temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); + __o = 
__builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s8 (int8_t * __a, int8x16x2_t val) +__extension__ static __inline void +vst4_u64 (uint64_t * __a, uint64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + uint64x2x4_t temp; + temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_p8 (poly8_t * __a, poly8x16x2_t val) +__extension__ static __inline void +vst4_f64 (float64_t * __a, float64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + float64x2x4_t temp; + temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); + __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s16 (int16_t * __a, int16x8x2_t val) +__extension__ static __inline void +vst4_s8 (int8_t * __a, int8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + int8x16x4_t temp; + temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_p16 (poly16_t * __a, poly16x8x2_t val) +vst4_p8 (poly8_t * __a, poly8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + poly8x16x4_t temp; + temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s32 (int32_t * __a, int32x4x2_t val) +vst4_s16 (int16_t * __a, int16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_xi __o; + int16x8x4_t temp; + temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_s64 (int64_t * __a, int64x2x2_t val) +vst4_p16 (poly16_t * __a, poly16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_xi __o; + poly16x8x4_t temp; + temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) 
temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_u8 (uint8_t * __a, uint8x16x2_t val) +vst4_s32 (int32_t * __a, int32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + int32x4x4_t temp; + temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_u16 (uint16_t * __a, uint16x8x2_t val) +vst4_u8 (uint8_t * __a, uint8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + uint8x16x4_t temp; + temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_u32 (uint32_t * __a, uint32x4x2_t val) +vst4_u16 (uint16_t * __a, uint16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_xi __o; + uint16x8x4_t temp; + temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); + __o = 
__builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_u64 (uint64_t * __a, uint64x2x2_t val) +vst4_u32 (uint32_t * __a, uint32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_xi __o; + uint32x4x4_t temp; + temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_f32 (float32_t * __a, float32x4x2_t val) +vst4_f32 (float32_t * __a, float32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + __builtin_aarch64_simd_xi __o; + float32x4x4_t temp; + temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); + __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst2q_f64 (float64_t * __a, float64x2x2_t val) +vst4q_s8 (int8_t * __a, int8x16x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline void -vst3_s64 (int64_t * __a, int64x1x3_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_p8 (poly8_t * __a, poly8x16x4_t val) { - __builtin_aarch64_simd_ci 
__o; - int64x2x3_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ static __inline void -vst3_u64 (uint64_t * __a, uint64x1x3_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_s16 (int16_t * __a, int16x8x4_t val) { - __builtin_aarch64_simd_ci __o; - uint64x2x3_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline void -vst3_f64 (float64_t * __a, float64x1x3_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_p16 (poly16_t * __a, poly16x8x4_t val) { - __builtin_aarch64_simd_ci __o; - float64x2x3_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); - __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ static __inline void -vst3_s8 (int8_t * __a, int8x8x3_t val) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_s32 (int32_t * __a, 
int32x4x4_t val) { - __builtin_aarch64_simd_ci __o; - int8x16x3_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_p8 (poly8_t * __a, poly8x8x3_t val) +vst4q_s64 (int64_t * __a, int64x2x4_t val) { - __builtin_aarch64_simd_ci __o; - poly8x16x3_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_s16 (int16_t * __a, int16x4x3_t val) +vst4q_u8 (uint8_t * __a, uint8x16x4_t val) { - __builtin_aarch64_simd_ci __o; - int16x8x3_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_p16 (poly16_t * __a, poly16x4x3_t val) +vst4q_u16 (uint16_t * __a, uint16x8x4_t val) { - __builtin_aarch64_simd_ci __o; - poly16x8x3_t temp; - 
temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_s32 (int32_t * __a, int32x2x3_t val) +vst4q_u32 (uint32_t * __a, uint32x4x4_t val) { - __builtin_aarch64_simd_ci __o; - int32x4x3_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u8 (uint8_t * __a, uint8x8x3_t val) +vst4q_u64 (uint64_t * __a, uint64x2x4_t val) { - __builtin_aarch64_simd_ci __o; - uint8x16x3_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u16 (uint16_t * __a, uint16x4x3_t val) +vst4q_f32 (float32_t * __a, float32x4x4_t val) { - __builtin_aarch64_simd_ci __o; - uint16x8x3_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C 
(0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); + __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_u32 (uint32_t * __a, uint32x2x3_t val) +vst4q_f64 (float64_t * __a, float64x2x4_t val) { - __builtin_aarch64_simd_ci __o; - uint32x4x3_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); + __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3_f32 (float32_t * __a, float32x2x3_t val) +/* vsub */ + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vsubd_s64 (int64x1_t __a, int64x1_t __b) { - __builtin_aarch64_simd_ci __o; - float32x4x3_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __a - __b; } -__extension__ static __inline void __attribute__ ((__always_inline__)) -vst3q_s8 (int8_t * __a, int8x16x3_t val) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vsubd_u64 (uint64x1_t __a, uint64x1_t __b) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __a - __b; } -__extension__ static __inline 
void __attribute__ ((__always_inline__))
-vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
+/* vtbx1 */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+                              vmov_n_u8 (8));
+  int8x8_t __tbl = vtbl1_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s16 (int16_t * __a, int16x8x3_t val)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
+  uint8x8_t __tbl = vtbl1_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8));
+  poly8x8_t __tbl = vtbl1_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s32 (int32_t * __a, int32x4x3_t val)
+/* vtbx3 */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx),
+                              vmov_n_u8 (24));
+  int8x8_t __tbl = vtbl3_s8 (__tab, __idx);
+
+  return vbsl_s8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s64 (int64_t * __a, int64x2x3_t val)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
+  uint8x8_t __tbl = vtbl3_u8 (__tab, __idx);
+
+  return vbsl_u8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
-  __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+  uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24));
+  poly8x8_t __tbl = vtbl3_p8 (__tab, __idx);
+
+  return vbsl_p8 (__mask, __tbl, __r);
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
+/* vtrn */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vtrn1_f32 (float32x2_t __a, float32x2_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
-  __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtrn1_p8 (poly8x8_t __a, poly8x8_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vtrn1_p16 (poly16x4_t __a, poly16x4_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtrn1_s8 (int8x8_t __a, int8x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_f32 (float32_t * __a, float32x4x3_t val)
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vtrn1_s16 (int16x4_t __a, int16x4_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
-  __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_f64 (float64_t * __a, float64x2x3_t val)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vtrn1_s32 (int32x2_t __a, int32x2_t __b)
 {
-  __builtin_aarch64_simd_ci __o;
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
-  __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
 }

-__extension__ static __inline void
-vst4_s64 (int64_t * __a, int64x1x4_t val)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtrn1_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  int64x2x4_t temp;
-  temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0)));
-  temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0)));
-  temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0)));
-  temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-  __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void
-vst4_u64 (uint64_t * __a, uint64x1x4_t val)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtrn1_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  uint64x2x4_t temp;
-  temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3);
-  __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
 }

-__extension__ static __inline void
-vst4_f64 (float64_t * __a, float64x1x4_t val)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtrn1_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  float64x2x4_t temp;
-  temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3);
-  __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
 }

-__extension__ static __inline void
-vst4_s8 (int8_t * __a, int8x8x4_t val)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  int8x16x4_t temp;
-  temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0)));
-  temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0)));
-  temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0)));
-  temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_p8 (poly8_t * __a, poly8x8x4_t val)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vtrn1q_f64 (float64x2_t __a, float64x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  poly8x16x4_t temp;
-  temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s16 (int16_t * __a, int16x4x4_t val)
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  int16x8x4_t temp;
-  temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0)));
-  temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0)));
-  temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0)));
-  temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_p16 (poly16_t * __a, poly16x4x4_t val)
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  poly16x8x4_t temp;
-  temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s32 (int32_t * __a, int32x2x4_t val)
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vtrn1q_s8 (int8x16_t __a, int8x16_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  int32x4x4_t temp;
-  temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0)));
-  temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0)));
-  temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0)));
-  temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-  __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u8 (uint8_t * __a, uint8x8x4_t val)
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vtrn1q_s16 (int16x8_t __a, int16x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  uint8x16x4_t temp;
-  temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3);
-  __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u16 (uint16_t * __a, uint16x4x4_t val)
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vtrn1q_s32 (int32x4_t __a, int32x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  uint16x8x4_t temp;
-  temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3);
-  __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u32 (uint32_t * __a, uint32x2x4_t val)
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vtrn1q_s64 (int64x2_t __a, int64x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  uint32x4x4_t temp;
-  temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3);
-  __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
 }
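Every vtrn1 above follows the same pattern, which a standalone sketch may make clearer (illustrative only, using plain GCC vector extensions rather than the arm_neon.h types; the function name is made up). In a two-operand __builtin_shuffle, mask indices 0..N-1 select lanes of the first input and N..2N-1 select lanes of the second, so the mask {0, 4, 2, 6} interleaves the even lanes of both inputs, which is exactly the TRN1 permutation; expressing it this way lets the vectoriser and the permute-matching code see through the intrinsic instead of hitting an opaque asm block:

typedef unsigned int u32x4 __attribute__ ((vector_size (16)));

/* Little-endian equivalent of vtrn1q_u32: interleave the even-numbered
   lanes of a and b.  */
u32x4
trn1_by_shuffle (u32x4 a, u32x4 b)
{
  return __builtin_shuffle (a, b, (u32x4) {0, 4, 2, 6});
}

/* trn1_by_shuffle ({a0, a1, a2, a3}, {b0, b1, b2, b3}) == {a0, b0, a2, b2},
   which the AArch64 backend can emit as a single trn1 v0.4s, v1.4s, v2.4s.  */

The __AARCH64EB__ variants mirror the indices because GCC numbers vector lanes in the opposite order on big-endian AArch64. Note also that for two-lane vectors the trn1 mask {0, 2} and the zip1 permutation coincide, so the compiler is free to emit either instruction for the 32-bit d-register variants.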

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_f32 (float32_t * __a, float32x2x4_t val)
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  float32x4x4_t temp;
-  temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0)));
-  temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0)));
-  temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0)));
-  temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0)));
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3);
-  __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15});
+#else
+  return __builtin_shuffle (__a, __b,
+      (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s8 (int8_t * __a, int8x16x4_t val)
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s16 (int16_t * __a, int16x8x4_t val)
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vtrn2_f32 (float32x2_t __a, float32x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s32 (int32_t * __a, int32x4x4_t val)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtrn2_p8 (poly8x8_t __a, poly8x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s64 (int64_t * __a, int64x2x4_t val)
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vtrn2_p16 (poly16x4_t __a, poly16x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtrn2_s8 (int8x8_t __a, int8x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
-  __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vtrn2_s16 (int16x4_t __a, int16x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
-  __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vtrn2_s32 (int32x2_t __a, int32x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtrn2_u8 (uint8x8_t __a, uint8x8_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
-  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_f32 (float32_t * __a, float32x4x4_t val)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtrn2_u16 (uint16x4_t __a, uint16x4_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
-  __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
-  __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
 }

-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_f64 (float64_t * __a, float64x2x4_t val)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtrn2_u32 (uint32x2_t __a, uint32x2_t __b)
 {
-  __builtin_aarch64_simd_xi __o;
-  __o = __builtin_aarch64_set_qregxiv2df (__o,
(float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -/* vsub */ - -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vsubd_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vtrn2q_f32 (float32x4_t __a, float32x4_t __b) { - return __a - __b; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vsubd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vtrn2q_f64 (float64x2_t __a, float64x2_t __b) { - return __a - __b; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -/* vtbx1 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (8)); - int8x8_t __tbl = vtbl1_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vtrn2q_s8 (int8x16_t __a, int8x16_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -/* vtbx3 */ - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +__extension__ static __inline int16x8_t __attribute__ 
((__always_inline__)) +vtrn2q_s16 (int16x8_t __a, int16x8_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (24)); - int8x8_t __tbl = vtbl3_s8 (__tab, __idx); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif +} - return vbsl_s8 (__mask, __tbl, __r); +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vtrn2q_s32 (int32x4_t __a, int32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vtrn2q_s64 (int64x2_t __a, int64x2_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} - return vbsl_u8 (__mask, __tbl, __r); +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif +} - return vbsl_p8 (__mask, __tbl, __r); +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -/* vtrn */ +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif +} __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vtrn_f32 (float32x2_t a, float32x2_t b) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 936a5b21c8b5..c38d66375172 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2014-05-07 Alan Lawrence + + * gcc.target/aarch64/vtrns32.c: Expect zip[12] insn rather than trn[12]. + * gcc.target/aarch64/vtrnu32.c: Likewise. + * gcc.target/aarch64/vtrnf32.c: Likewise. + 2014-05-07 Alan Lawrence * gcc.target/aarch64/simd/vtrnf32_1.c: New file. 
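A note on reading the masks above: __builtin_shuffle selects lanes from the
concatenation of its two vector arguments, so in the little-endian int32x4_t
case the trn1 mask {0, 4, 2, 6} interleaves the even-numbered lanes of the two
inputs and the trn2 mask {1, 5, 3, 7} the odd-numbered ones; the __AARCH64EB__
masks encode the same architectural permutation under GCC's reversed big-endian
lane numbering. The following is a minimal standalone sketch of that behaviour,
not part of the patch; it uses generic GCC vector types instead of arm_neon.h
so it compiles and runs on any GCC target, and the test values are arbitrary.

#include <stdio.h>
#include <stdint.h>

/* Generic GCC vector types standing in for int32x4_t/uint32x4_t.  */
typedef int32_t v4si __attribute__ ((vector_size (16)));
typedef uint32_t v4usi __attribute__ ((vector_size (16)));

int
main (void)
{
  v4si a = {10, 11, 12, 13};
  v4si b = {20, 21, 22, 23};
  /* Little-endian trn1 mask: even lanes of a and b, interleaved.  */
  v4si t1 = __builtin_shuffle (a, b, (v4usi) {0, 4, 2, 6});
  /* Little-endian trn2 mask: odd lanes of a and b, interleaved.  */
  v4si t2 = __builtin_shuffle (a, b, (v4usi) {1, 5, 3, 7});
  /* Prints "10 20 12 22" then "11 21 13 23": the two output rows of
     transposing each adjacent pair of lanes across the inputs, which
     is the documented TRN1/TRN2 lane selection.  */
  printf ("%d %d %d %d\n", t1[0], t1[1], t1[2], t1[3]);
  printf ("%d %d %d %d\n", t2[0], t2[1], t2[2], t2[3]);
  return 0;
}

The same reading applies to every variant in the hunk above; only the lane
count and element type change.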
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vtrnf32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vtrnf32_1.c
index 24c30a34977e..c345798745fb 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vtrnf32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vtrnf32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vtrnf32.x"
 
-/* { dg-final { scan-assembler-times "trn1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "trn2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vtrns32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vtrns32_1.c
index 0a9256c2c8d3..1de20a87b241 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vtrns32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vtrns32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vtrns32.x"
 
-/* { dg-final { scan-assembler-times "trn1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "trn2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vtrnu32_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vtrnu32_1.c
index 722a1d721c4d..53ecab887d72 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/vtrnu32_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vtrnu32_1.c
@@ -6,6 +6,6 @@
 #include <arm_neon.h>
 #include "vtrnu32.x"
 
-/* { dg-final { scan-assembler-times "trn1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
-/* { dg-final { scan-assembler-times "trn2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip1\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
+/* { dg-final { scan-assembler-times "zip2\[ \t\]+v\[0-9\]+\.2s, ?v\[0-9\]+\.2s, ?v\[0-9\]+\.2s!?\(?:\[ \t\]+@\[a-zA-Z0-9 \]+\)?\n" 1 } } */
 /* { dg-final { cleanup-saved-temps } } */
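The expectation change in these three tests reflects a degeneracy of the
two-lane case rather than any change in semantics: on two-lane vectors the
trn1 selection {0, 2} picks the same lanes as zip1 (the low lane of each
input), and trn2's {1, 3} the same as zip2, so once the intrinsics are
expressed as __builtin_shuffle the backend's permute expander may legitimately
emit zip1/zip2 for the .2s variants. A minimal sketch of the coincidence,
again with generic GCC vector types and arbitrary test values:

#include <stdio.h>
#include <stdint.h>

typedef float v2sf __attribute__ ((vector_size (8)));
typedef uint32_t v2usi __attribute__ ((vector_size (8)));

int
main (void)
{
  v2sf a = {1.0f, 2.0f};
  v2sf b = {3.0f, 4.0f};
  /* With only two lanes per input, the trn1 mask {0, 2} yields
     {a[0], b[0]}, which is exactly what zip1 produces ...  */
  v2sf t1 = __builtin_shuffle (a, b, (v2usi) {0, 2});
  /* ... and the trn2 mask {1, 3} yields {a[1], b[1]}, exactly zip2's
     result -- hence the tests now expect zip[12] for .2s.  */
  v2sf t2 = __builtin_shuffle (a, b, (v2usi) {1, 3});
  printf ("%g %g / %g %g\n", t1[0], t1[1], t2[0], t2[1]);  /* 1 3 / 2 4 */
  return 0;
}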