vect: Fix SLP layout handling of masked loads [PR106794]

author Richard Sandiford <richard.sandiford@arm.com>

Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)

committer Richard Sandiford <richard.sandiford@arm.com>

Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)
author Richard Sandiford <richard.sandiford@arm.com>
Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)
committer Richard Sandiford <richard.sandiford@arm.com>
Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)
diff --git a/gcc/testsuite/g++.dg/vect/pr106794.cc b/gcc/testsuite/g++.dg/vect/pr106794.cc

new file mode 100644 (file)

index 0000000..f056563
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr106794.cc
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-additional-options "-march=bdver2" { target x86_64-*-* i?86-*-* } } */
+
+template <class T> struct Vector3 {
+  Vector3();
+  Vector3(T, T, T);
+  T length() const;
+  T x, y, z;
+};
+template <class T>
+Vector3<T>::Vector3(T _x, T _y, T _z) : x(_x), y(_y), z(_z) {}
+Vector3<float> cross(Vector3<float> a, Vector3<float> b) {
+  return Vector3<float>(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z,
+                        a.x * b.y - a.y * b.x);
+}
+template <class T> T Vector3<T>::length() const { return z; }
+int generateNormals_i;
+float generateNormals_p2_0, generateNormals_p0_0;
+struct SphereMesh {
+  void generateNormals();
+  float vertices;
+};
+void SphereMesh::generateNormals() {
+  Vector3<float> *faceNormals = new Vector3<float>;
+  for (int j; j; j++) {
+    float *p0 = &vertices + 3, *p1 = &vertices + j * 3, *p2 = &vertices + 3,
+          *p3 = &vertices + generateNormals_i + j * 3;
+    Vector3<float> v0(p1[0] - generateNormals_p0_0, p1[1] - 1, p1[2] - 2),
+        v1(0, 1, 2);
+    if (v0.length())
+      v1 = Vector3<float>(p3[0] - generateNormals_p2_0, p3[1] - p2[1],
+                          p3[2] - p2[2]);
+    else
+      v1 = Vector3<float>(generateNormals_p0_0 - p3[0], p0[1] - p3[1],
+                          p0[2] - p3[2]);
+    Vector3<float> faceNormal = cross(v0, v1);
+    faceNormals[j] = faceNormal;
+  }
+}
diff --git a/gcc/testsuite/gcc.dg/vect/pr106914.c b/gcc/testsuite/gcc.dg/vect/pr106914.c

new file mode 100644 (file)

index 0000000..9d9b3e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr106914.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fprofile-generate" } */
+/* { dg-additional-options "-mavx512vl" { target x86_64-*-* i?86-*-* } } */
+
+int *mask_slp_int64_t_8_2_x, *mask_slp_int64_t_8_2_y, *mask_slp_int64_t_8_2_z;
+
+void
+__attribute__mask_slp_int64_t_8_2() {
+  for (int i; i; i += 8) {
+    mask_slp_int64_t_8_2_x[i + 6] =
+        mask_slp_int64_t_8_2_y[i + 6] ? mask_slp_int64_t_8_2_z[i] : 1;
+    mask_slp_int64_t_8_2_x[i + 7] =
+        mask_slp_int64_t_8_2_y[i + 7] ? mask_slp_int64_t_8_2_z[i + 7] : 2;
+  }
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc

index ca3422c2a1e972867f0c7e318edf53c47048ac51..229f2663ebcc6161e9294381a9a857308fe4c713 100644 (file)
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4494,7 +4494,8 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    if (rep
        && STMT_VINFO_DATA_REF (rep)
-      && DR_IS_READ (STMT_VINFO_DATA_REF (rep)))
+      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
+      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
      {
        auto_load_permutation_t tmp_perm;
        tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
@@ -4569,8 +4570,12 @@ vect_optimize_slp_pass::start_choosing_layouts ()
        if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
         {
           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
-            unpermuted, record a layout that reverses this permutation.  */
-         gcc_assert (partition.layout == 0);
+            unpermuted, record a layout that reverses this permutation.
+
+            We would need more work to cope with loads that are internally
+            permuted and also have inputs (such as masks for
+            IFN_MASK_LOADs).  */
+         gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
             continue;
           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
@@ -4684,12 +4689,21 @@ vect_optimize_slp_pass::start_choosing_layouts ()
           vertex.weight = vect_slp_node_weight (node);
  
           /* We do not handle stores with a permutation, so all
-            incoming permutations must have been materialized.  */
+            incoming permutations must have been materialized.
+
+            We also don't handle masked grouped loads, which lack a
+            permutation vector.  In this case the memory locations
+            form an implicit second input to the loads, on top of the
+            explicit mask input, and the memory input's layout cannot
+            be changed.
+
+            On the other hand, we do support permuting gather loads and
+            masked gather loads, where each scalar load is independent
+            of the others.  This can be useful if the address/index input
+            benefits from permutation.  */
           if (STMT_VINFO_DATA_REF (rep)
-             && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
-           /* ???  We're forcing materialization in place
-              of the child here, we'd need special handling
-              in materialization to leave layout -1 here.  */
+             && STMT_VINFO_GROUPED_ACCESS (rep)
+             && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
             partition.layout = 0;
  
           /* We cannot change the layout of an operation that is
author	Richard Sandiford <richard.sandiford@arm.com>
	Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)
committer	Richard Sandiford <richard.sandiford@arm.com>
	Tue, 20 Sep 2022 14:27:47 +0000 (15:27 +0100)
gcc/testsuite/g++.dg/vect/pr106794.cc	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/vect/pr106914.c	[new file with mode: 0644]	patch \| blob
gcc/tree-vect-slp.cc		patch \| blob \| blame \| history