unsigned res_npatterns, res_nelts_per_pattern;
unsigned HOST_WIDE_INT res_nelts;
- /* (1) If SEL is a suitable mask as determined by
- valid_mask_for_fold_vec_perm_cst_p, then:
- res_npatterns = max of npatterns between ARG0, ARG1, and SEL
- res_nelts_per_pattern = max of nelts_per_pattern between
- ARG0, ARG1 and SEL.
- (2) If SEL is not a suitable mask, and TYPE is VLS then:
- res_npatterns = nelts in result vector.
- res_nelts_per_pattern = 1.
- This exception is made so that VLS ARG0, ARG1 and SEL work as before. */
- if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
- {
- res_npatterns
- = std::max (VECTOR_CST_NPATTERNS (arg0),
- std::max (VECTOR_CST_NPATTERNS (arg1),
- sel.encoding ().npatterns ()));
+ /* First try to implement the fold in a VLA-friendly way.
+
+ (1) If the selector is simply a duplication of N elements, the
+ result is likewise a duplication of N elements.
+
+ (2) If the selector is N elements followed by a duplication
+ of N elements, the result is too.
+
+ (3) If the selector is N elements followed by an interleaving
+ of N linear series, the situation is more complex.
+
+ valid_mask_for_fold_vec_perm_cst_p detects whether we
+ can handle this case. If we can, then each of the N linear
+ series either (a) selects the same element each time or
+ (b) selects a linear series from one of the input patterns.
- res_nelts_per_pattern
- = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
- std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
- sel.encoding ().nelts_per_pattern ()));
+ If (b) holds for one of the linear series, the result
+ will contain a linear series, and so the result will have
+ the same shape as the selector. If (a) holds for all of
+ the linear series, the result will be the same as (2) above.
+ (b) can only hold if one of the input patterns has a
+ stepped encoding. */
+
+ if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
+ {
+ res_npatterns = sel.encoding ().npatterns ();
+ res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
+ if (res_nelts_per_pattern == 3
+ && VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
+ && VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
+ res_nelts_per_pattern = 2;
res_nelts = res_npatterns * res_nelts_per_pattern;
}
else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
tree expected_res[] = { ARG0(0), ARG1(0), ARG1(1) };
validate_res (1, 3, res, expected_res);
}
+
+ /* Case 8: Same as aarch64/sve/slp_3.c:
+ arg0, arg1 are dup vectors.
+ sel = { 0, len, 1, len+1, 2, len+2, ... } // (2, 3)
+ So res = { arg0[0], arg1[0], ... } // (2, 1)
+
+ In this case, since the input vectors are dup, only the first two
+ elements per pattern in sel are considered significant. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 1);
+ tree arg1 = build_vec_cst_rand (vmode, 1, 1);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 2, 3);
+ poly_uint64 mask_elems[] = { 0, len, 1, len + 1, 2, len + 2 };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 2, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+ tree expected_res[] = { ARG0(0), ARG1(0) };
+ validate_res (2, 1, res, expected_res);
+ }
}
}
ASSERT_TRUE (res == NULL_TREE);
ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
}
+
+ /* Case 8: PR111754: When input vector is not a stepped sequence,
+ check that the result is not a stepped sequence either, even
+ if sel has a stepped sequence. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 2);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 1, 3);
+ poly_uint64 mask_elems[] = { 0, 1, 2 };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 1, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg0, sel);
+
+ tree expected_res[] = { ARG0(0), ARG0(1) };
+ validate_res (sel.encoding ().npatterns (), 2, res, expected_res);
+ }
+
+ /* Case 9: If sel doesn't contain a stepped sequence,
+ check that the result has the same encoding as sel, irrespective
+ of the shape of the input vectors. */
+ {
+ tree arg0 = build_vec_cst_rand (vmode, 1, 3, 1);
+ tree arg1 = build_vec_cst_rand (vmode, 1, 3, 1);
+ poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+ vec_perm_builder builder (len, 1, 2);
+ poly_uint64 mask_elems[] = { 0, len };
+ builder_push_elems (builder, mask_elems);
+
+ vec_perm_indices sel (builder, 2, len);
+ tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+ tree expected_res[] = { ARG0(0), ARG1(0) };
+ validate_res (sel.encoding ().npatterns (),
+ sel.encoding ().nelts_per_pattern (), res, expected_res);
+ }
}
}
/* 1 for each 8-bit type. */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
-/* 1 for each 16-bit type plus 1 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
+/* 1 for each 16-bit type. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
/* 1 for each 32-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
-/* 3 for double. */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 6 } } */
/* The 64-bit types need:
-
- ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2. */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
/* The loop should be fully-masked. The 64-bit types need two loads
TEST_ALL (VEC_PERM)
-/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
+/* 1 for each 8-bit type. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 2 } } */
/* 1 for each 16-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #80\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #63\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
-/* 4 for double. */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 18 } } */
/* The 32-bit types need:
- ZIP1 ZIP1 (2 ZIP2s optimized away)
ZIP1 ZIP2
and the 64-bit types need:
- ZIP1 ZIP1 ZIP1 ZIP1 (4 ZIP2s optimized away)
ZIP1 ZIP2 ZIP1 ZIP2
ZIP1 ZIP2 ZIP1 ZIP2. */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 33 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
/* The loop should be fully-masked. The 32-bit types need two loads