git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]
author: Pengxuan Zheng <quic_pzheng@quicinc.com>
Thu, 1 Aug 2024 00:00:01 +0000 (17:00 -0700)
committer: Pengxuan Zheng <quic_pzheng@quicinc.com>
Fri, 2 Aug 2024 00:48:36 +0000 (17:48 -0700)
This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt     v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt     z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
support.
* config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use new
iterator SVE_VDQ_I.
* config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
(VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/iterators.md
gcc/testsuite/gcc.target/aarch64/popcnt-sve.c [new file with mode: 0644]

index 459e11b09a19cdc97a5153cfd8c4e0e07a7ffb0c..816f499e9634989c41da13ab7ca8a6294bb09f08 100644 (file)
        (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+       rtx p = aarch64_ptrue_reg (<VPRED>mode);
+       emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+                                                   p,
+                                                   operands[1]));
+       DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
     rtx tmp = gen_reg_rtx (mode);
index c3ed5075c4ed20983d6aa22d287d8433929aa418..a5cd42be9d5cfad431e0742f537b8bf22b2ed961 100644 (file)
 
 ;; Integer unary arithmetic predicated with a PTRUE.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-       (unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+       (unspec:SVE_VDQ_I
          [(match_operand:<VPRED> 1 "register_operand")
-          (SVE_INT_UNARY:SVE_I
-            (match_operand:SVE_I 2 "register_operand"))]
+          (SVE_INT_UNARY:SVE_VDQ_I
+            (match_operand:SVE_VDQ_I 2 "register_operand"))]
          UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
+     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
   }
 )
 
   }
 )
 
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] General unary arithmetic corresponding to unspecs
 ;; -------------------------------------------------------------------------
index 95fe8f070f4c3f5770e4424162bf13b712adedf3..aaa4afefe2ce95b92dc2a3cff51ad4bdd1ee33b7 100644 (file)
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
                                VNx4SI VNx2SI
                         (VNx32BF "VNx8BI")
                         (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
                         (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+                        (V8QI "VNx8BI") (V16QI "VNx16BI")
+                        (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
                         (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644 (file)
index 0000000..8e349ef
--- /dev/null
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f_v4hi:
+**     ptrue   (p[0-7]).b, all
+**     ldr     d([0-9]+), \[x0\]
+**     cnt     z\2.h, \1/m, z\2.h
+**     str     d\2, \[x1\]
+**     ret
+*/
+void /* Four unsigned short lanes (V4HI case): expect one cnt on .h.  */
+f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v8hi:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.h, \1/m, z\2.h
+**     str     q\2, \[x1\]
+**     ret
+*/
+void /* Eight unsigned short lanes (V8HI case): expect one cnt on .h.  */
+f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+  d[4] = __builtin_popcount (b[4]);
+  d[5] = __builtin_popcount (b[5]);
+  d[6] = __builtin_popcount (b[6]);
+  d[7] = __builtin_popcount (b[7]);
+}
+
+/*
+** f_v2si:
+**     ptrue   (p[0-7]).b, all
+**     ldr     d([0-9]+), \[x0\]
+**     cnt     z\2.s, \1/m, z\2.s
+**     str     d\2, \[x1\]
+**     ret
+*/
+void /* Two unsigned int lanes (V2SI case): expect one cnt on .s.  */
+f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+}
+
+/*
+** f_v4si:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.s, \1/m, z\2.s
+**     str     q\2, \[x1\]
+**     ret
+*/
+void /* Four unsigned int lanes (V4SI case): expect one cnt on .s.  */
+f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v2di:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.d, \1/m, z\2.d
+**     str     q\2, \[x1\]
+**     ret
+*/
+void /* Two unsigned long lanes (V2DI case): expect one cnt on .d.  */
+f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
+{ /* NOTE(review): unsigned long with popcountll presumably assumes LP64.  */
+  d[0] = __builtin_popcountll (b[0]);
+  d[1] = __builtin_popcountll (b[1]);
+}