AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

author Tamar Christina <tamar.christina@arm.com>

Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)

committer Tamar Christina <tamar.christina@arm.com>

Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)
author Tamar Christina <tamar.christina@arm.com>
Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)
committer Tamar Christina <tamar.christina@arm.com>
Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..7f0cc47d0f071de9297068baa85c6d5fc4d7fa5b 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25413,6 +25413,7 @@ struct expand_vec_perm_d
    unsigned int vec_flags;
    unsigned int op_vec_flags;
    bool one_vector_p;
+  bool zero_op0_p, zero_op1_p;
    bool testing_p;
  };
  
@@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
    /* to_constant is safe since this routine is specific to Advanced SIMD
       vectors.  */
    unsigned int nelt = d->perm.length ().to_constant ();
+
+  /* If one register is the constant vector of 0 then we only need
+     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
+     do this earlier since vec_perm_indices clamps elements to within range so
+     we can only do it during codegen.  */
+  if (d->zero_op0_p)
+    d->op0 = d->op1;
+  else if (d->zero_op1_p)
+    d->op1 = d->op0;
+
    for (unsigned int i = 0; i < nelt; ++i)
-    /* If big-endian and two vectors we end up with a weird mixed-endian
-       mode on NEON.  Reverse the index within each word but not the word
-       itself.  to_constant is safe because we checked is_constant above.  */
-    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
-                       ? d->perm[i].to_constant () ^ (nelt - 1)
-                       : d->perm[i].to_constant ());
+    {
+      auto val = d->perm[i].to_constant ();
+
+      /* If we're selecting from a 0 vector, we can just use an out of range
+        index instead.  */
+      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
+       rperm[i] = constm1_rtx;
+      else
+       {
+         /* If we are remapping a zero register as the first parameter we need
+            to adjust the indices of the non-zero register.  */
+         if (d->zero_op0_p)
+           val = val % nelt;
+
+         /* If big-endian and two vectors we end up with a weird mixed-endian
+            mode on NEON.  Reverse the index within each word but not the word
+            itself.  to_constant is safe because we checked is_constant
+            above.  */
+         rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
+       }
+    }
  
    sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
    sel = force_reg (vmode, sel);
@@ -26179,6 +26205,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
    else
      d.one_vector_p = false;
  
+  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
+  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
    d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
                      sel.nelts_per_input ());
    d.vmode = vmode;
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c

new file mode 100644 (file)

index 0000000..5595127
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+typedef unsigned short v8hi __attribute__ ((vector_size (16)));
+
+v8hi f2a (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
+}
+
+v8hi f2b (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
+}
+
+typedef unsigned char v16qi __attribute__ ((vector_size (16)));
+
+v16qi f3a (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24);
+}
+
+v16qi f3b (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c

new file mode 100644 (file)

index 0000000..e7d5a67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target le } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (zeros, a, 0, 5, 1, 6);
+}
+
+v4si f2 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */
+/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */
author	Tamar Christina <tamar.christina@arm.com>
	Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)
committer	Tamar Christina <tamar.christina@arm.com>
	Fri, 5 Jul 2024 11:10:39 +0000 (12:10 +0100)
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c	[new file with mode: 0644]	patch \| blob