Certain permutes that blend a vector with zero can be interpreted as an AND
with a mask. This idea was suggested by Richard Sandiford while reviewing my
patch that optimizes certain vector permutes with the FMOV instruction on the
aarch64 target.
For example, for the aarch64 target, at present:
v4hi
f_v4hi (v4hi x)
{
return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}
generates:
f_v4hi:
uzp1 v0.2d, v0.2d, v0.2d
adrp x0, .LC0
ldr d31, [x0, #:lo12:.LC0]
tbl v0.8b, {v0.16b}, v31.8b
ret
.LC0:
.byte -1
.byte -1
.byte 2
.byte 3
.byte -1
.byte -1
.byte 6
.byte 7
With this patch, it generates:
f_v4hi:
mvni v31.2s, 0xff, msl 8
and v0.8b, v0.8b, v31.8b
ret
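The two forms are equivalent at the source level, which can be checked with a
small self-contained program (illustration only, not part of the patch; the
helper names are invented for the example): the shuffle keeps lanes 1 and 3 of
x and zeros the rest, which is exactly a bitwise AND with a constant lane mask.
#include <assert.h>

typedef short v4hi __attribute__ ((vector_size (8)));

/* Lanes 0 and 2 of the selector pick elements of the zero vector; lanes 1
   and 3 pick the corresponding elements of x.  */
static v4hi
perm_with_zero (v4hi x)
{
  return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
}

/* The equivalent AND: all-ones for the kept lanes, zero for the rest.  */
static v4hi
and_with_mask (v4hi x)
{
  return x & (v4hi){ 0, -1, 0, -1 };
}

int
main (void)
{
  v4hi x = { 1, 2, 3, 4 };
  v4hi a = perm_with_zero (x);
  v4hi b = and_with_mask (x);
  for (int i = 0; i < 4; i++)
    assert (a[i] == b[i]);
  return 0;
}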
This patch also provides a target-independent routine for detecting vector
permute patterns that can be interpreted as an AND.
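Roughly speaking, the routine walks the permute selector and, assuming the
other operand is the zero vector, keeps a lane when the index selects the
matching element of the non-zero operand and clears it when the index points
into the zero vector; any other index means the permute is not a plain AND.
A scalar sketch of that mapping for the zero-second-operand case (illustrative
only; the function below is not the one in the patch, which works on
vec_perm_indices and builds an RTL constant vector):

/* Fill KEEP[0..NELT-1] with the AND lane mask (1 = keep, 0 = clear) and
   return 1 if selector SEL blends operand 0 with a zero operand 1;
   return 0 otherwise.  */
static int
perm_to_and_mask (const unsigned *sel, unsigned nelt, unsigned char *keep)
{
  for (unsigned i = 0; i < nelt; i++)
    {
      if (sel[i] == i)
	keep[i] = 1;	/* Lane i comes from operand 0: keep it.  */
      else if (sel[i] >= nelt)
	keep[i] = 0;	/* Lane comes from the zero vector: clear it.  */
      else
	return 0;	/* Element taken from a different lane: not an AND.  */
    }
  return 1;
}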
Changes since v1:
* v2: Rework the patch to only perform the optimization for aarch64 by calling
the target-independent routine vec_perm_and_mask.
PR target/100165
gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_evpc_and): New.
(aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and.
* optabs.cc (vec_perm_and_mask): New.
* optabs.h (vec_perm_and_mask): New prototype.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/and-be.c: New test.
* gcc.target/aarch64/and-le.c: New test.
Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
return true;
}
+/* Recognize patterns suitable for the AND instruction.  */
+static bool
+aarch64_evpc_and (struct expand_vec_perm_d *d)
+{
+ /* Either d->op0 or d->op1 should be a vector of all zeros. */
+ if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p))
+ return false;
+
+ machine_mode mode = d->vmode;
+ machine_mode sel_mode;
+ if (!related_int_vector_mode (mode).exists (&sel_mode))
+ return false;
+
+ insn_code and_code = optab_handler (and_optab, sel_mode);
+ rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p);
+ if (and_code == CODE_FOR_nothing || !and_mask)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ class expand_operand ops[3];
+ rtx in = d->zero_op0_p ? d->op1 : d->op0;
+ create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode);
+ create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode);
+ create_input_operand (&ops[2], and_mask, sel_mode);
+ expand_insn (and_code, 3, ops);
+ rtx result = gen_lowpart (mode, ops[0].value);
+ if (!rtx_equal_p (d->target, result))
+ emit_move_insn (d->target, result);
+
+ return true;
+}
+
static bool
aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
return true;
else if (aarch64_evpc_uzp (d))
return true;
+ else if (aarch64_evpc_and (d))
+ return true;
else if (aarch64_evpc_trn (d))
return true;
else if (aarch64_evpc_sel (d))
return NULL_RTX;
}
+/* Check if vec_perm mask SEL is a constant equivalent to an and operation of
+ the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s,
+ assuming the other vec_perm operand is a constant vector of zeros. Return
+ the mask for the equivalent and operation, or NULL_RTX if the vec_perm
+ cannot be modeled as an and. MODE is the mode of the value being anded.
+ ZERO_OP0_P is true if the first operand of the vec_perm is a constant vector
+ of zeros or false if the second operand of the vec_perm is a constant vector
+ of zeros. */
+rtx
+vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+ bool zero_op0_p)
+{
+ unsigned int nelt;
+ if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
+ return NULL_RTX;
+
+ rtx_vector_builder builder (mode, nelt, 1);
+ machine_mode emode = GET_MODE_INNER (mode);
+
+ for (unsigned int i = 0; i < nelt; i++)
+ {
+ if (zero_op0_p)
+ {
+ if (known_eq (sel[i], nelt + i))
+ builder.quick_push (CONSTM1_RTX (emode));
+ else if (known_lt (sel[i], nelt))
+ builder.quick_push (CONST0_RTX (emode));
+ else
+ return NULL_RTX;
+ }
+ else
+ {
+ if (known_eq (sel[i], i))
+ builder.quick_push (CONSTM1_RTX (emode));
+ else if (known_ge (sel[i], nelt))
+ builder.quick_push (CONST0_RTX (emode));
+ else
+ return NULL_RTX;
+ }
+ }
+
+ return builder.build ();
+}
+
/* Implement a permutation of vectors v0 and v1 using the permutation
vector in SEL and return the result. Use TARGET to hold the result
if nonnull and convenient.
/* Generate a conditional trap instruction. */
extern rtx_insn *gen_cond_trap (enum rtx_code, rtx, rtx, rtx);
+/* Check whether the vec_perm can be interpreted as an and operation. */
+extern rtx vec_perm_and_mask (machine_mode mode, const vec_perm_indices &sel,
+ bool zero_op0_p);
+
/* Generate code for VEC_PERM_EXPR. */
extern rtx expand_vec_perm_var (machine_mode, rtx, rtx, rtx, rtx);
extern rtx expand_vec_perm_const (machine_mode, rtx, rtx,
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+** movi v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+** mvni v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+** movi d([0-9]+), 0xff00ff00ff000000
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+ return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+ return __builtin_shuffle (
+ x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+
+/*
+** f_v4hi:
+** mvni v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 });
+}
+
+/*
+** g_v4hi:
+** movi v([0-9]+).2s, 0xff, msl 8
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+ return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 5, 2, 7 });
+}
+
+/*
+** f_v8hi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+ return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8hi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v4si:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+f_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 2, 5 });
+}
+
+/*
+** g_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+g_v4si (v4si x)
+{
+ return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 1, 5, 3, 7 });
+}
+
+/*
+** h_v4si:
+** movi v([0-9]+).2d, 0xffffffff00000000
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4si
+h_v4si (v4si x)
+{
+ return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 7, 1, 6, 3 });
+}
+
+/*
+** f_v4sf:
+** movi v([0-9]+).2d, 0xffffffff
+** and v0.16b, (?:v0.16b, v\1.16b|v\1.16b, v0.16b)
+** ret
+*/
+v4sf
+f_v4sf (v4sf x)
+{
+ return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 0, 6, 2, 7 });
+}
+
+/*
+** f_v8qi:
+** movi d([0-9]+), 0xff00ff00ff
+** and v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+** ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+ return __builtin_shuffle (x, (v8qi){ 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v8qi){ 0, 8, 2, 9, 4, 10, 12, 11 });
+}
+
+/*
+** f_v16qi:
+** ...
+** and v0.16b, (?:v0.16b, v[0-9]+.16b|v[0-9]+.16b, v0.16b)
+** ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+ return __builtin_shuffle (
+ x, (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ (v16qi){ 16, 1, 17, 3, 18, 5, 19, 7, 20, 9, 21, 11, 22, 13, 23, 24 });
+}