 }
 /* Emit an optimized sequence to perform a vector rotate
-   of REG by the vector constant amount AMNT and place the result
+   of REG by the vector constant amount AMNT_VEC and place the result
    in DST.  Return true iff successful.  */
 bool
-aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec)
 {
+  rtx amnt = unwrap_const_vec_duplicate (amnt_vec);
+  gcc_assert (CONST_INT_P (amnt));
+  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
   machine_mode mode = GET_MODE (reg);
-  /* Attempt to expand the rotate as a vector permute.
-     For some rotate amounts they can be single instructions and
-     even the general single-vector TBL permute has good throughput.  */
-  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+  /* Rotates by half the element width map down to REV* instructions and
+     should always be preferred when possible.  */
+  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2
+      && expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  /* 64 and 128-bit vector modes can use the XAR instruction
+     when available.  */
+  else if (can_create_pseudo_p ()
+           && ((TARGET_SHA3 && mode == V2DImode)
+               || (TARGET_SVE2
+                   && (known_eq (GET_MODE_SIZE (mode), 8)
+                       || known_eq (GET_MODE_SIZE (mode), 16)))))
+    {
+      rtx zeroes = aarch64_gen_shareable_zero (mode);
+      rtx xar_op
+        = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes),
+                          amnt_vec);
+      emit_set_insn (dst, xar_op);
+      return true;
+    }
+  /* If none of the above, try to expand rotates by any byte amount as
+     permutes.  */
+  else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
     return true;
   return false;
 }
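
For illustration only, not part of the patch: the first branch covers rotates by half the element width, which are just a swap of the element's two halves and which expand_rotate_as_vec_perm can emit as a single REV-style lane permute; the XAR branch works because XAR computes a rotate of the XOR of its two source operands, so pairing REG with a shared zero vector leaves a plain rotate. A minimal sketch of the REV case, assuming a plain -O2 AdvSIMD compile (the typedef and function names below are made up for the example); the rotate by 16 of each 32-bit lane is expected to take the first branch and become a REV32 of 16-bit lanes rather than an XAR:

/* Sketch only: rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2 for V4SI,
   so this should be expanded as a lane permute (expected to be a
   rev32 of .h lanes), not through the XAR path.  */
typedef unsigned int __attribute__ ((vector_size (16))) v4si_sketch;

v4si_sketch
rot_half (v4si_sketch r)
{
  return (r >> 16) | (r << 16);
}
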
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef char __attribute__ ((vector_size (16))) v16qi;
+typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
+typedef unsigned int __attribute__ ((vector_size (16))) v4si;
+typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
+typedef char __attribute__ ((vector_size (8))) v8qi;
+typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
+typedef unsigned int __attribute__ ((vector_size (8))) v2si;
+
+#pragma GCC target "+sve2+sha3"
+
+/*
+** G1:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar v0\.2d, v[0-9]+\.2d, v[0-9]+\.2d, 39
+** ret
+*/
+v2di
+G1 (v2di r) {
+ return (r >> 39) | (r << 25);
+}
+
+/*
+** G2:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #23
+** ret
+*/
+v4si
+G2 (v4si r) {
+ return (r >> 23) | (r << 9);
+}
+
+/*
+** G3:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #5
+** ret
+*/
+v8hi
+G3 (v8hi r) {
+ return (r >> 5) | (r << 11);
+}
+
+/*
+** G4:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #6
+** ret
+*/
+v16qi
+G4 (v16qi r)
+{
+ return (r << 2) | (r >> 6);
+}
+
+/*
+** G5:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.s, z[0-9]+\.s, z[0-9]+\.s, #22
+** ret
+*/
+v2si
+G5 (v2si r) {
+ return (r >> 22) | (r << 10);
+}
+
+/*
+** G6:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.h, z[0-9]+\.h, z[0-9]+\.h, #7
+** ret
+*/
+v4hi
+G6 (v4hi r) {
+ return (r >> 7) | (r << 9);
+}
+
+/*
+** G7:
+** movi? [vdz][0-9]+\.?(?:[0-9]*[bhsd])?, #?0
+** xar z0\.b, z[0-9]+\.b, z[0-9]+\.b, #5
+** ret
+*/
+v8qi
+G7 (v8qi r)
+{
+ return (r << 3) | (r >> 5);
+}
+
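
Also for illustration, not part of the patch: when neither +sha3 nor +sve2 is available the XAR branch is skipped, but the final expand_rotate_as_vec_perm call can still handle rotate amounts that are a whole number of bytes, as its comment says. A minimal sketch under that assumption (made-up names, no target pragma, plain -O2); the 8-bit rotate of each 32-bit lane is expected to be expanded as a byte permute (likely a TBL with a constant index vector) instead of two shifts and an ORR:

/* Sketch only: an 8-bit rotate of 32-bit lanes is neither half the element
   width nor an XAR candidate without +sve2/+sha3, so it should reach the
   final expand_rotate_as_vec_perm fallback and become a byte permute.  */
typedef unsigned int __attribute__ ((vector_size (16))) v4si_perm_sketch;

v4si_perm_sketch
rot_by_byte (v4si_perm_sketch r)
{
  return (r >> 8) | (r << 24);
}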