PR96463: Optimise svld1rq from vectors for little endian AArch64 targets.

author Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>

Sun, 12 Jun 2022 03:20:16 +0000 (08:50 +0530)

committer Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>

Sun, 12 Jun 2022 03:25:04 +0000 (08:55 +0530)
author Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Sun, 12 Jun 2022 03:20:16 +0000 (08:50 +0530)
committer Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Sun, 12 Jun 2022 03:25:04 +0000 (08:55 +0530)
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc

index bee410929bd39d6d9883945ed1069dc785b6ea8b..82f9eba5c397af04924bdebdc684a1d77682d3fd 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -44,6 +44,7 @@
  #include "aarch64-sve-builtins-shapes.h"
  #include "aarch64-sve-builtins-base.h"
  #include "aarch64-sve-builtins-functions.h"
+#include "ssa.h"
  
  using namespace aarch64_sve;
  
@@ -1207,6 +1208,64 @@ public:
      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
      return e.use_contiguous_load_insn (icode);
    }
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    tree arg0 = gimple_call_arg (f.call, 0);
+    tree arg1 = gimple_call_arg (f.call, 1);
+
+    /* Transform:
+       lhs = svld1rq ({-1, -1, ... }, arg1)
+       into:
+       tmp = mem_ref<vectype> [(elem * {ref-all}) arg1]
+       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
+       on little endian target.
+       vectype is the corresponding ADVSIMD type.  */
+
+    if (!BYTES_BIG_ENDIAN
+       && integer_all_onesp (arg0))
+      {
+       tree lhs = gimple_call_lhs (f.call);
+       tree lhs_type = TREE_TYPE (lhs);
+       poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+       tree eltype = TREE_TYPE (lhs_type);
+
+       scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
+       machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
+       tree vectype = build_vector_type_for_mode (eltype, vq_mode);
+
+       tree elt_ptr_type
+         = build_pointer_type_for_mode (eltype, VOIDmode, true);
+       tree zero = build_zero_cst (elt_ptr_type);
+
+       /* Use element type alignment.  */
+       tree access_type
+         = build_aligned_type (vectype, TYPE_ALIGN (eltype));
+
+       tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
+       tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
+       gimple *mem_ref_stmt
+         = gimple_build_assign (mem_ref_lhs, mem_ref_op);
+       gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
+
+       int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+       vec_perm_builder sel (lhs_len, source_nelts, 1);
+       for (int i = 0; i < source_nelts; i++)
+         sel.quick_push (i);
+
+       vec_perm_indices indices (sel, 1, source_nelts);
+       gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
+                                                  TYPE_MODE (access_type),
+                                                  indices));
+       tree mask_type = build_vector_type (ssizetype, lhs_len);
+       tree mask = vec_perm_indices_to_tree (mask_type, indices);
+       return gimple_build_assign (lhs, VEC_PERM_EXPR,
+                                   mem_ref_lhs, mem_ref_lhs, mask);
+      }
+
+    return NULL;
+  }
  };
  
  class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index 5969d1f56c2b3f3e13e38bf9537a51d6379c9ca8..d21e041eccbc755b73703e144cd71559f86dc241 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23342,7 +23342,9 @@ struct expand_vec_perm_d
    rtx target, op0, op1;
    vec_perm_indices perm;
    machine_mode vmode;
+  machine_mode op_mode;
    unsigned int vec_flags;
+  unsigned int op_vec_flags;
    bool one_vector_p;
    bool testing_p;
  };
@@ -23577,6 +23579,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
  
    newd.vmode = new_mode;
    newd.vec_flags = VEC_ADVSIMD;
+  newd.op_mode = newd.vmode;
+  newd.op_vec_flags = newd.vec_flags;
    newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
    newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
    newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
@@ -23891,6 +23895,33 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
    return true;
  }
  
+/* Try to implement D using SVE dup instruction.  */
+
+static bool
+aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
+{
+  if (BYTES_BIG_ENDIAN
+      || !d->one_vector_p
+      || d->vec_flags != VEC_SVE_DATA
+      || d->op_vec_flags != VEC_ADVSIMD
+      || d->perm.encoding ().nelts_per_pattern () != 1
+      || !known_eq (d->perm.encoding ().npatterns (),
+                   GET_MODE_NUNITS (d->op_mode))
+      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+    return false;
+
+  int npatterns = d->perm.encoding ().npatterns ();
+  for (int i = 0; i < npatterns; i++)
+    if (!known_eq (d->perm[i], i))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  return true;
+}
+
  /* Try to implement D using SVE SEL instruction.  */
  
  static bool
@@ -24014,6 +24045,8 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d)
  static bool
  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
  {
+  gcc_assert (d->op_mode != E_VOIDmode);
+
    /* The pattern matching functions above are written to look for a small
       number to begin the sequence (0, 1, N/2).  If we begin with an index
       from the second operand, we can swap the operands.  */
@@ -24030,30 +24063,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
         || d->vec_flags == VEC_SVE_PRED)
        && known_gt (nelt, 1))
      {
-      if (aarch64_evpc_rev_local (d))
-       return true;
-      else if (aarch64_evpc_rev_global (d))
-       return true;
-      else if (aarch64_evpc_ext (d))
-       return true;
-      else if (aarch64_evpc_dup (d))
-       return true;
-      else if (aarch64_evpc_zip (d))
-       return true;
-      else if (aarch64_evpc_uzp (d))
-       return true;
-      else if (aarch64_evpc_trn (d))
-       return true;
-      else if (aarch64_evpc_sel (d))
-       return true;
-      else if (aarch64_evpc_ins (d))
-       return true;
-      else if (aarch64_evpc_reencode (d))
-       return true;
-      if (d->vec_flags == VEC_SVE_DATA)
-       return aarch64_evpc_sve_tbl (d);
-      else if (d->vec_flags == VEC_ADVSIMD)
-       return aarch64_evpc_tbl (d);
+      if (d->vmode == d->op_mode)
+       {
+         if (aarch64_evpc_rev_local (d))
+           return true;
+         else if (aarch64_evpc_rev_global (d))
+           return true;
+         else if (aarch64_evpc_ext (d))
+           return true;
+         else if (aarch64_evpc_dup (d))
+           return true;
+         else if (aarch64_evpc_zip (d))
+           return true;
+         else if (aarch64_evpc_uzp (d))
+           return true;
+         else if (aarch64_evpc_trn (d))
+           return true;
+         else if (aarch64_evpc_sel (d))
+           return true;
+         else if (aarch64_evpc_ins (d))
+           return true;
+         else if (aarch64_evpc_reencode (d))
+           return true;
+
+         if (d->vec_flags == VEC_SVE_DATA)
+           return aarch64_evpc_sve_tbl (d);
+         else if (d->vec_flags == VEC_ADVSIMD)
+           return aarch64_evpc_tbl (d);
+       }
+      else
+       {
+         if (aarch64_evpc_sve_dup (d))
+           return true;
+       }
      }
    return false;
  }
@@ -24065,9 +24107,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
                                   rtx target, rtx op0, rtx op1,
                                   const vec_perm_indices &sel)
  {
-  if (vmode != op_mode)
-    return false;
-
    struct expand_vec_perm_d d;
  
    /* Check whether the mask can be applied to a single vector.  */
@@ -24091,6 +24130,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
                      sel.nelts_per_input ());
    d.vmode = vmode;
    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
+  d.op_mode = op_mode;
+  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
    d.target = target;
    d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
    if (op0 == op1)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c

new file mode 100644 (file)

index 0000000..b68f43c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8x16_t, s8)
+TEST(svint16_t, int16x8_t, s16)
+TEST(svint32_t, int32x4_t, s32)
+TEST(svint64_t, int64x2_t, s64)
+
+TEST(svuint8_t, uint8x16_t, u8)
+TEST(svuint16_t, uint16x8_t, u16)
+TEST(svuint32_t, uint32x4_t, u32)
+TEST(svuint64_t, uint64x2_t, u64)
+
+TEST(svfloat16_t, float16x8_t, f16)
+TEST(svfloat32_t, float32x4_t, f32)
+TEST(svfloat64_t, float64x2_t, f64)
+
+TEST(svbfloat16_t, bfloat16x8_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c

new file mode 100644 (file)

index 0000000..196de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+#include "arm_sve.h"
+
+#define TEST(ret_type, param_type, suffix) \
+ret_type test_##suffix(param_type *x) \
+{ \
+  return svld1rq_##suffix (svptrue_b8 (), &x[0]); \
+}
+
+TEST(svint8_t, int8_t, s8)
+TEST(svint16_t, int16_t, s16)
+TEST(svint32_t, int32_t, s32)
+TEST(svint64_t, int64_t, s64)
+
+TEST(svuint8_t, uint8_t, u8)
+TEST(svuint16_t, uint16_t, u16)
+TEST(svuint32_t, uint32_t, u32)
+TEST(svuint64_t, uint64_t, u64)
+
+TEST(svfloat16_t, float16_t, f16)
+TEST(svfloat32_t, float32_t, f32)
+TEST(svfloat64_t, float64_t, f64)
+
+TEST(svbfloat16_t, bfloat16_t, bf16)
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]} 12 { target aarch64_little_endian } } } */
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc

index 8de1b144a426776bf464765477c71ee8f2e52b81..9e5d84a980552667dc7dce48519fb45c8e36449c 100644 (file)
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -4297,18 +4297,14 @@ verify_gimple_assign_ternary (gassign *stmt)
        break;
  
      case VEC_PERM_EXPR:
-      if (!useless_type_conversion_p (lhs_type, rhs1_type)
-         || !useless_type_conversion_p (lhs_type, rhs2_type))
-       {
-         error ("type mismatch in %qs", code_name);
-         debug_generic_expr (lhs_type);
-         debug_generic_expr (rhs1_type);
-         debug_generic_expr (rhs2_type);
-         debug_generic_expr (rhs3_type);
-         return true;
-       }
+      /* If permute is constant, then we allow for lhs and rhs
+        to have different vector types, provided:
+        (1) lhs, rhs1, rhs2 have same element type.
+        (2) rhs3 vector is constant and has integer element type.
+        (3) len(lhs) == len(rhs3) && len(rhs1) == len(rhs2).  */
  
-      if (TREE_CODE (rhs1_type) != VECTOR_TYPE
+      if (TREE_CODE (lhs_type) != VECTOR_TYPE
+         || TREE_CODE (rhs1_type) != VECTOR_TYPE
           || TREE_CODE (rhs2_type) != VECTOR_TYPE
           || TREE_CODE (rhs3_type) != VECTOR_TYPE)
         {
@@ -4320,10 +4316,28 @@ verify_gimple_assign_ternary (gassign *stmt)
           return true;
         }
  
+      /* If rhs3 is constant, we allow lhs, rhs1 and rhs2 to be different vector types,
+        as long as lhs, rhs1 and rhs2 have same element type.  */
+      if (TREE_CONSTANT (rhs3)
+         ? (!useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs1_type))
+            || !useless_type_conversion_p (TREE_TYPE (lhs_type), TREE_TYPE (rhs2_type)))
+         : (!useless_type_conversion_p (lhs_type, rhs1_type)
+            || !useless_type_conversion_p (lhs_type, rhs2_type)))
+       {
+           error ("type mismatch in %qs", code_name);
+           debug_generic_expr (lhs_type);
+           debug_generic_expr (rhs1_type);
+           debug_generic_expr (rhs2_type);
+           debug_generic_expr (rhs3_type);
+           return true;
+       }
+
+      /* If rhs3 is constant, relax the check len(rhs2) == len(rhs3).  */
        if (maybe_ne (TYPE_VECTOR_SUBPARTS (rhs1_type),
                     TYPE_VECTOR_SUBPARTS (rhs2_type))
-         || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
-                      TYPE_VECTOR_SUBPARTS (rhs3_type))
+         || (!TREE_CONSTANT(rhs3)
+             && maybe_ne (TYPE_VECTOR_SUBPARTS (rhs2_type),
+                          TYPE_VECTOR_SUBPARTS (rhs3_type)))
           || maybe_ne (TYPE_VECTOR_SUBPARTS (rhs3_type),
                        TYPE_VECTOR_SUBPARTS (lhs_type)))
         {
author	Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
	Sun, 12 Jun 2022 03:20:16 +0000 (08:50 +0530)
committer	Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
	Sun, 12 Jun 2022 03:25:04 +0000 (08:55 +0530)
gcc/config/aarch64/aarch64-sve-builtins-base.cc		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr96463-2.c	[new file with mode: 0644]	patch \| blob
gcc/tree-cfg.cc		patch \| blob \| blame \| history