aarch64: Use SVE ASRD instruction with Neon modes.
author    Soumya AR <soumyaa@nvidia.com>
          Wed, 11 Dec 2024 04:15:09 +0000 (09:45 +0530)
committer Soumya AR <soumyaa@nvidia.com>
          Wed, 11 Dec 2024 04:20:02 +0000 (09:50 +0530)
The SVE ASRD instruction performs an arithmetic shift right by an immediate
for divide, i.e. a signed division by a power of two that rounds towards zero.

This patch enables the use of ASRD with Neon modes.

For example:

int in[N], out[N];

void
foo (void)
{
  for (int i = 0; i < N; i++)
    out[i] = in[i] / 4;
}

compiles to:

ldr q31, [x1, x0]
cmlt v30.16b, v31.16b, #0
and z30.b, z30.b, 3
add v30.16b, v30.16b, v31.16b
sshr v30.16b, v30.16b, 2
str q30, [x0, x2]
add x0, x0, 16
cmp x0, 1024

but can just be:

ldp q30, q31, [x0], 32
asrd z31.b, p7/m, z31.b, #2
asrd z30.b, p7/m, z30.b, #2
stp q30, q31, [x1], 32
cmp x0, x2

This patch also adds the following overload:
aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
Depending on the data mode, the function returns a predicate with the
appropriate bits set: all bits for an SVE mode, and only the first N bits
for an N-byte Advanced SIMD or scalar mode.
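
For illustration, a sketch of intended usage (not code from the patch; it
assumes VNx16BImode is the governing predicate mode in these cases):

aarch64_ptrue_reg (VNx16BImode, V16QImode);   /* 128-bit Neon data: low 16 bits active, as "ptrue p<n>.b, vl16".  */
aarch64_ptrue_reg (VNx16BImode, V8QImode);    /* 64-bit Neon data: low 8 bits active, as "ptrue p<n>.b, vl8".  */
aarch64_ptrue_reg (VNx16BImode, VNx16QImode); /* SVE data: every bit active.  */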

The patch was bootstrapped and regtested on aarch64-linux-gnu with no regressions.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_ptrue_reg): New overload.
* config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): Likewise.
* config/aarch64/aarch64-sve.md (sdiv_pow2<mode>3, *sdiv_pow2<mode>3):
Extend to support Neon modes.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/sve-asrd.c: New test.

Co-authored-by: Richard Sandiford <richard.sandiford@arm.com>
Signed-off-by: Soumya AR <soumyaa@nvidia.com>
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.cc
gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c [new file with mode: 0644]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index db2baca58665d6a0c9f3ebb99f5fe780f6882cd3..bd17486e9128a21bd205ef1fb3ec3e323408ec59 100644
@@ -1018,6 +1018,7 @@ void aarch64_expand_mov_immediate (rtx, rtx);
 rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
 rtx aarch64_ptrue_reg (machine_mode);
 rtx aarch64_ptrue_reg (machine_mode, unsigned int);
+rtx aarch64_ptrue_reg (machine_mode, machine_mode);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 04326bca0e7415e736156eb8db44f6142138480e..a72ca2a500d394598268c6adfe717eed94a304b3 100644
 
 ;; Unpredicated ASRD.
 (define_expand "sdiv_pow2<mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand")
-       (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+       (unspec:SVE_VDQ_I
          [(match_dup 3)
-          (unspec:SVE_I
-            [(match_operand:SVE_I 1 "register_operand")
+          (unspec:SVE_VDQ_I
+            [(match_operand:SVE_VDQ_I 1 "register_operand")
              (match_operand 2 "aarch64_simd_rshift_imm")]
             UNSPEC_ASRD)]
         UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
-    operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+    operands[3] = aarch64_ptrue_reg (<VPRED>mode, <MODE>mode);
   }
 )
 
 ;; Predicated ASRD.
 (define_insn "*sdiv_pow2<mode>3"
-  [(set (match_operand:SVE_I 0 "register_operand")
-       (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+       (unspec:SVE_VDQ_I
          [(match_operand:<VPRED> 1 "register_operand")
-          (unspec:SVE_I
-            [(match_operand:SVE_I 2 "register_operand")
-             (match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
+          (unspec:SVE_VDQ_I
+            [(match_operand:SVE_VDQ_I 2 "register_operand")
+             (match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
             UNSPEC_ASRD)]
          UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
+     [ w        , Upl , 0 ; *              ] asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
   }
 )
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3606dc174c2f5ff941bf119457af5816f2a90cce..4d1b3cca0c42e053764933391be7b0e21f79999e 100644
@@ -3778,6 +3778,22 @@ aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
   return gen_lowpart (mode, reg);
 }
 
+/* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.
+
+   DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
+   If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
+   of the predicate will be active and the rest will be inactive.
+   If DATA_MODE is an SVE mode, every bit of the predicate will be active.  */
+rtx
+aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
+{
+  if (aarch64_sve_mode_p (data_mode))
+    return aarch64_ptrue_reg (pred_mode);
+
+  auto size = GET_MODE_SIZE (data_mode).to_constant ();
+  return aarch64_ptrue_reg (pred_mode, size);
+}
+
 /* Return an all-false predicate register of mode MODE.  */
 
 rtx
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c b/gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c
new file mode 100644
index 0000000..341baae
--- /dev/null
@@ -0,0 +1,86 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+#define FUNC(TYPE, I)                                                          \
+  TYPE M_##TYPE##_##I[I];                                                      \
+  void asrd_##TYPE##_##I ()                                                    \
+  {                                                                            \
+    for (int i = 0; i < I; i++)                                                \
+      {                                                                        \
+       M_##TYPE##_##I[i] /= 4;                                                \
+      }                                                                        \
+  }
+
+/*
+** asrd_int8_t_8:
+**     ...
+**     ptrue   (p[0-7]).b, vl8
+**     ...
+**     asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
+**     ...
+*/
+FUNC(int8_t, 8);
+
+/*
+** asrd_int8_t_16:
+**     ...
+**     ptrue   (p[0-7]).b, vl16
+**     ...
+**     asrd    z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
+**     ...
+*/
+FUNC(int8_t, 16);
+
+/*
+** asrd_int16_t_4:
+**     ...
+**     ptrue   (p[0-7]).b, vl8
+**     ...
+**     asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
+**     ...
+*/
+FUNC(int16_t, 4);
+
+/*
+** asrd_int16_t_8:
+**     ...
+**     ptrue   (p[0-7]).b, vl16
+**     ...
+**     asrd    z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
+**     ...
+*/
+FUNC(int16_t, 8);
+
+/*
+** asrd_int32_t_2:
+**     ...
+**     ptrue   (p[0-7]).b, vl8
+**     ...
+**     asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
+**     ...
+*/
+FUNC(int32_t, 2);
+
+/*
+** asrd_int32_t_4:
+**     ...
+**     ptrue   (p[0-7]).b, vl16
+**     ...
+**     asrd    z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
+**     ...
+*/
+FUNC(int32_t, 4);
+
+/*
+** asrd_int64_t_2:
+**     ...
+**     ptrue   (p[0-7]).b, vl16
+**     ...
+**     asrd    z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
+**     ...
+*/
+FUNC(int64_t, 2);
+