From: Ezra Sitorus Date: Tue, 2 Jan 2024 09:23:41 +0000 (+0000) Subject: arm: vst1q_types_x3 ACLE intrinsics X-Git-Tag: basepoints/gcc-15~2931 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2a0e91d6c1a6c93a248fd947115302b668d97e8e;p=thirdparty%2Fgcc.git arm: vst1q_types_x3 ACLE intrinsics This patch is part of a series of patches implementing the _xN variants of the vst1q intrinsic for the arm port. This patch adds the _x3 variants of the vst1q intrinsic. ACLE documents: https://developer.arm.com/documentation/ihi0053/latest/ ISA documents: https://developer.arm.com/documentation/ddi0487/latest/ gcc/ChangeLog: * config/arm/arm_neon.h (vst1q_u8_x3, vst1q_u16_x3, vst1q_u32_x3, vst1q_u64_x3): New. (vst1q_s8_x3, vst1q_s16_x3, vst1q_s32_x3, vst1q_s64_x3): New. (vst1q_f16_x3, vst1q_f32_x3): New. (vst1q_p8_x3, vst1q_p16_x3, vst1q_p64_x3): New. (vst1q_bf16_x3): New. * config/arm/arm_neon_builtins.def (vst1q_x3): New entries. * config/arm/neon.md (neon_vst1q_x3): New. (neon_vld1x3qa, neon_vst1x3qb): New. * config/arm/unspecs.md (UNSPEC_VST1X3A, UNSPEC_VST1X3B): New. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/vst1q_base_xN_1.c: Add new tests. * gcc.target/arm/simd/vst1q_bf16_xN_1.c: Add new tests. * gcc.target/arm/simd/vst1q_fp16_xN_1.c: Add new tests. * gcc.target/arm/simd/vst1q_p64_xN_1.c: Add new tests. --- diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 9d7331885c56..319a65d9d7c0 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -11359,6 +11359,38 @@ vst1q_s64_x2 (int64_t * __a, int64x2x2_t __b) __builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_s8_x3 (int8_t * __a, int8x16x3_t __b) +{ + union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_s16_x3 (int16_t * __a, int16x8x3_t __b) +{ + union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_s32_x3 (int32_t * __a, int32x4x3_t __b) +{ + union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_s64_x3 (int64_t * __a, int64x2x3_t __b) +{ + union { int64x2x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o); +} + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_s8_x3 (int8_t * __a, int8x8x3_t __b) @@ -11696,6 +11728,14 @@ vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __b) __builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __b) +{ + union { poly64x2x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o); +} + #pragma GCC pop_options __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -11759,6 +11799,24 @@ vst1q_f32_x2 (float32_t * __a, float32x4x2_t __b) __builtin_neon_vst1q_x2v4sf (__a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_f16_x3 (float16_t * __a, float16x8x3_t __b) +{ + union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v8hf (__a, __bu.__o); +} +#endif + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_f32_x3 (float32_t * __a, float32x4x3_t __b) +{ + union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v4sf (__a, __bu.__o); +} + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_u8 (uint8_t * __a, uint8x16_t __b) @@ -11819,6 +11877,38 @@ vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __b) __builtin_neon_vst1q_x2v2di ((__builtin_neon_di *) __a, __bu.__o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t __b) +{ + union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t __b) +{ + union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t __b) +{ + union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v4si ((__builtin_neon_si *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __b) +{ + union { uint64x2x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v2di ((__builtin_neon_di *) __a, __bu.__o); +} + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_p8 (poly8_t * __a, poly8x16_t __b) @@ -11849,6 +11939,22 @@ vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t __b) __builtin_neon_vst1q_x2v8hi ((__builtin_neon_hi *) __a, __bu.__o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t __b) +{ + union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v16qi ((__builtin_neon_qi *) __a, __bu.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t __b) +{ + union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v8hi ((__builtin_neon_hi *) __a, __bu.__o); +} + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c) @@ -20533,6 +20639,14 @@ vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __b) __builtin_neon_vst1q_x2v8bf (__a, __bu.__o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __b) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst1q_x3v8bf (__a, __bu.__o); +} + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2_bf16 (bfloat16_t * __ptr, bfloat16x4x2_t __val) diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index da6cebc99881..903076b92b91 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -314,6 +314,7 @@ VAR14 (STORE1, vst1, VAR7 (STORE1, vst1_x2, v8qi, v4hi, v2si, di, v4hf, v2sf, v4bf) VAR7 (STORE1, vst1q_x2, v16qi, v8hi, v4si, v2di, v8hf, v4sf, v8bf) VAR7 (STORE1, vst1_x3, v8qi, v4hi, v2si, di, v4hf, v2sf, v4bf) +VAR7 (STORE1, vst1q_x3, v16qi, v8hi, v4si, v2di, v8hf, v4sf, v8bf) VAR7 (STORE1, vst1_x4, v8qi, v4hi, v2si, di, v4hf, v2sf, v4bf) VAR14 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di, v4bf, v8bf) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 3769105bb26e..dd1b5ba4c9b8 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5241,6 +5241,53 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_store1_3reg")] ) +(define_expand "neon_vst1q_x3" + [(match_operand:CI 0 "neon_struct_operand") + (match_operand:CI 1 "s_register_operand") + (unspec:VDQX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_NEON" +{ + rtx mem = adjust_address (operands[0], EImode, 0); + emit_insn (gen_neon_vst1x3qa (mem, operands[1])); + mem = adjust_address (mem, EImode, GET_MODE_SIZE (EImode)); + emit_insn (gen_neon_vst1x3qb (mem, operands[1])); + DONE; +}) + +(define_insn "neon_vst1x3qa" + [(set (match_operand:EI 0 "neon_struct_operand" "=Um") + (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") + (unspec:VDQX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST1X3A))] + "TARGET_NEON" +{ + rtx ops[2]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (EImode, REGNO (operands[1])); + + output_asm_insn ("vst1.\t%h1, %A0", ops); + return ""; +} + [(set_attr "type" "neon_store1_3reg")] +) + +(define_insn "neon_vst1x3qb" + [(set (match_operand:EI 0 "neon_struct_operand" "=Um") + (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") + (unspec:VDQX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VST1X3B))] + "TARGET_NEON" +{ + rtx ops[2]; + ops[0] = operands[0]; + ops[1] = gen_rtx_REG (EImode, REGNO (operands[1]) + 6); + + output_asm_insn ("vst1.\t%h1, %A0", ops); + return ""; +} + [(set_attr "type" "neon_store1_3reg")] +) + (define_insn "neon_vst1_x4" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 25609dcccb67..bf8cbfe3aa87 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -463,6 +463,8 @@ UNSPEC_VRSRA_U_N UNSPEC_VSRI UNSPEC_VST1 + UNSPEC_VST1X3A + UNSPEC_VST1X3B UNSPEC_VST1_LANE UNSPEC_VST2 UNSPEC_VST2_LANE diff --git a/gcc/testsuite/gcc.target/arm/simd/vst1q_base_xN_1.c b/gcc/testsuite/gcc.target/arm/simd/vst1q_base_xN_1.c index 232feafade09..e9b7d67f06ab 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vst1q_base_xN_1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vst1q_base_xN_1.c @@ -61,10 +61,70 @@ void test_vst1q_p16_x2 (poly16_t * ptr, poly16x8x2_t val) vst1q_p16_x2 (ptr, val); } -/* { dg-final { scan-assembler-times {vst1.8\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 3 } } */ +void test_vst1q_u8_x3 (uint8_t * ptr, uint8x16x3_t val) +{ + vst1q_u8_x3 (ptr, val); +} + +void test_vst1q_u16_x3 (uint16_t * ptr, uint16x8x3_t val) +{ + vst1q_u16_x3 (ptr, val); +} + +void test_vst1q_u32_x3 (uint32_t * ptr, uint32x4x3_t val) +{ + vst1q_u32_x3 (ptr, val); +} + +void test_vst1q_u64_x3 (uint64_t * ptr, uint64x2x3_t val) +{ + vst1q_u64_x3 (ptr, val); +} + +void test_vst1q_s8_x3 (int8_t * ptr, int8x16x3_t val) +{ + vst1q_s8_x3 (ptr, val); +} + +void test_vst1q_s16_x3 (int16_t * ptr, int16x8x3_t val) +{ + vst1q_s16_x3 (ptr, val); +} + +void test_vst1q_s32_x3 (int32_t * ptr, int32x4x3_t val) +{ + vst1q_s32_x3 (ptr, val); +} + +void test_vst1q_s64_x3 (int64_t * ptr, int64x2x3_t val) +{ + vst1q_s64_x3 (ptr, val); +} + +void test_vst1q_f32_x3 (float32_t * ptr, float32x4x3_t val) +{ + vst1q_f32_x3 (ptr, val); +} + +void test_vst1q_p8_x3 (poly8_t * ptr, poly8x16x3_t val) +{ + vst1q_p8_x3 (ptr, val); +} + +void test_vst1q_p16_x3 (poly16_t * ptr, poly16x8x3_t val) +{ + vst1q_p16_x3 (ptr, val); +} + + +/* { dg-final { scan-assembler-times {vst1.8\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {vst1.8\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]!\n} 3 } } */ -/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]!\n} 3 } } */ -/* { dg-final { scan-assembler-times {vst1.32\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {vst1.32\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {vst1.32\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]!\n} 3 } } */ -/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]\n} 4 } } */ +/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]!\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vst1q_bf16_xN_1.c b/gcc/testsuite/gcc.target/arm/simd/vst1q_bf16_xN_1.c index 2a4579f0aaef..44d69522dffc 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vst1q_bf16_xN_1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vst1q_bf16_xN_1.c @@ -10,4 +10,10 @@ void test_vst1q_bf16_x2 (bfloat16_t * ptr, bfloat16x8x2_t val) vst1q_bf16_x2 (ptr, val); } -/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 1 } } */ +void test_vst1q_bf16_x3 (bfloat16_t * ptr, bfloat16x8x3_t val) +{ + vst1q_bf16_x3 (ptr, val); +} + +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]!\n} 1 } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.target/arm/simd/vst1q_fp16_xN_1.c b/gcc/testsuite/gcc.target/arm/simd/vst1q_fp16_xN_1.c index 61a7e558c48d..f0ac2d35c1c5 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vst1q_fp16_xN_1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vst1q_fp16_xN_1.c @@ -10,4 +10,10 @@ void test_vst1q_f16_x2 (float16_t * ptr, float16x8x2_t val) vst1q_f16_x2 (ptr, val); } -/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 1 } } */ +void test_vst1q_f16_x3 (float16_t * ptr, float16x8x3_t val) +{ + vst1q_f16_x3 (ptr, val); +} + +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {vst1.16\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+\]!\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vst1q_p64_xN_1.c b/gcc/testsuite/gcc.target/arm/simd/vst1q_p64_xN_1.c index 82f3dad293c6..24fdf3c0c1b3 100644 --- a/gcc/testsuite/gcc.target/arm/simd/vst1q_p64_xN_1.c +++ b/gcc/testsuite/gcc.target/arm/simd/vst1q_p64_xN_1.c @@ -10,4 +10,10 @@ void test_vst1q_p64_x2 (poly64_t * ptr, poly64x2x2_t val) vst1q_p64_x2 (ptr, val); } -/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]\n} 1 } } */ +void test_vst1q_p64_x3 (poly64_t * ptr, poly64x2x3_t val) +{ + vst1q_p64_x3 (ptr, val); +} + +/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {vst1.64\t\{d[0-9]+-d[0-9]+\}, \[r[0-9]+:64\]!\n} 1 } } */