]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Extend SVE2 bit-select instructions for Neon modes.
authorSoumya AR <soumyaa@nvidia.com>
Wed, 11 Dec 2024 04:02:35 +0000 (09:32 +0530)
committerSoumya AR <soumyaa@nvidia.com>
Wed, 11 Dec 2024 04:02:35 +0000 (09:32 +0530)
NBSL, BSL1N, and BSL2N are bit-select intructions on SVE2 with certain operands
inverted. These can be extended to work with Neon modes.

Since these instructions are unpredicated, duplicate patterns were added with
the predicate removed to generate these instructions for Neon modes.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.

Signed-off-by: Soumya AR <soumyaa@nvidia.com>
gcc/ChangeLog:

* config/aarch64/aarch64-sve2.md
(*aarch64_sve2_nbsl_unpred<mode>): New pattern to match unpredicated
form.
(*aarch64_sve2_bsl1n_unpred<mode>): Likewise.
(*aarch64_sve2_bsl2n_unpred<mode>): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/bitsel.c: New test.

gcc/config/aarch64/aarch64-sve2.md
gcc/testsuite/gcc.target/aarch64/sve/bitsel.c [new file with mode: 0644]

index 7e8a505f707cdc71c3d2d0b54ec3d16c3dedf67d..5f41df7cf6ec031d18a7d148d16f83ba67ab4d99 100644 (file)
   }
 )
 
+(define_insn "*aarch64_sve2_nbsl_unpred<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (not:VDQ_I
+         (xor:VDQ_I
+           (and:VDQ_I
+             (xor:VDQ_I
+               (match_operand:VDQ_I 1 "register_operand")
+               (match_operand:VDQ_I 2 "register_operand"))
+             (match_operand:VDQ_I 3 "register_operand"))
+           (match_dup BSL_DUP))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , 1         , 2         , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;nbsl\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+  }
+)
+
 ;; Unpredicated bitwise select with inverted first operand.
 ;; (op3 ? ~bsl_mov : bsl_dup) == ((~(bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)
 (define_expand "@aarch64_sve2_bsl1n<mode>"
   }
 )
 
+(define_insn "*aarch64_sve2_bsl1n_unpred<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (xor:VDQ_I
+         (and:VDQ_I
+           (not:VDQ_I
+             (xor:VDQ_I
+               (match_operand:VDQ_I 1 "register_operand")
+               (match_operand:VDQ_I 2 "register_operand")))
+           (match_operand:VDQ_I 3 "register_operand"))
+         (match_dup BSL_DUP)))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , 1         , 2         , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] bsl1n\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;bsl1n\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+  }
+)
+
 ;; Unpredicated bitwise select with inverted second operand.
 ;; (bsl_dup ? bsl_mov : ~op3) == ((bsl_dup & bsl_mov) | (~op3 & ~bsl_dup))
 (define_expand "@aarch64_sve2_bsl2n<mode>"
   }
 )
 
+(define_insn "*aarch64_sve2_bsl2n_unpred<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (ior:VDQ_I
+         (and:VDQ_I
+           (match_operand:VDQ_I 1 "register_operand")
+           (match_operand:VDQ_I 2 "register_operand"))
+         (and:VDQ_I
+           (not:VDQ_I (match_operand:VDQ_I 3 "register_operand"))
+           (not:VDQ_I (match_dup BSL_DUP)))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , 1         , 2         , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+  }
+)
+
+(define_insn "*aarch64_sve2_bsl2n_unpred<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand")
+       (ior:VDQ_I
+         (and:VDQ_I
+           (match_operand:VDQ_I 1 "register_operand")
+           (match_operand:VDQ_I 2 "register_operand"))
+         (and:VDQ_I
+           (not:VDQ_I (match_dup BSL_DUP))
+           (not:VDQ_I (match_operand:VDQ_I 3 "register_operand")))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , 1         , 2         , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Shift-and-accumulate operations
 ;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/bitsel.c b/gcc/testsuite/gcc.target/aarch64/sve/bitsel.c
new file mode 100644 (file)
index 0000000..635bfef
--- /dev/null
@@ -0,0 +1,35 @@
+/* { dg-options "-O2 -mcpu=neoverse-v2 --param aarch64-autovec-preference=asimd-only" } */
+
+#include <stdint.h>
+
+#define OPNBSL(x,y,z) (~(((x) & (z)) | ((y) & ~(z))))
+#define OPBSL1N(x,y,z) ((~(x) & (z)) | ((y) & ~(z)))
+#define OPBSL2N(x,y,z) (((x) & (z)) | (~(y) & ~(z)))
+
+#define N 1024
+
+#define TYPE(N) int##N##_t
+
+#define TEST(SIZE, OP, SUFFIX)                                  \
+void __attribute__ ((noinline, noclone))                        \
+f_##SIZE##_##SUFFIX                                             \
+  (TYPE(SIZE) *restrict a, TYPE(SIZE) *restrict b,              \
+   TYPE(SIZE) *restrict c, TYPE(SIZE) *restrict d)              \
+{                                                               \
+  for (int i = 0; i < N; i++)                                   \
+    a[i] = OP (b[i], c[i], d[i]);                               \
+}
+
+#define TEST_ALL(SIZE)                                          \
+  TEST(SIZE, OPNBSL, nbsl)                                      \
+  TEST(SIZE, OPBSL1N, bsl1n)                                    \
+  TEST(SIZE, OPBSL2N, bsl2n)
+
+TEST_ALL(8);
+TEST_ALL(16);
+TEST_ALL(32);
+TEST_ALL(64);
+
+/* { dg-final { scan-assembler-times {\tnbsl\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl1n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tbsl2n\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */
\ No newline at end of file