git.ipfire.org Git - thirdparty/gcc.git/commitdiff
load and store-lanes with SLP
author    Richard Biener <rguenther@suse.de>
          Fri, 5 Jul 2024 08:35:08 +0000 (10:35 +0200)
committer Richard Biener <rguenth@gcc.gnu.org>
          Mon, 2 Sep 2024 06:50:32 +0000 (08:50 +0200)
The following is a prototype for how to represent load/store-lanes
within SLP.  I've settled for now on having a single load node with
multiple permute nodes acting as selection, one for each loaded lane,
and a single store node fed from all stored lanes.  For

  for (int i = 0; i < 1024; ++i)
    {
      a[2*i] = b[2*i] + 7;
      a[2*i+1] = b[2*i+1] * 3;
    }

you get the following SLP graph; below I explain how things are set
up and code-generated:

t.c:23:21: note:   SLP graph after lowering permutations:
t.c:23:21: note:   node 0x50dc8b0 (max_nunits=1, refcnt=1) vector(4) int
t.c:23:21: note:   op template: *_6 = _7;
t.c:23:21: note:        stmt 0 *_6 = _7;
t.c:23:21: note:        stmt 1 *_12 = _13;
t.c:23:21: note:        children 0x50dc488 0x50dc6e8

This is the store node; it is marked with ldst_lanes = true during
SLP discovery.  This node code-generates

  vect_array.65[0] = vect__7.61_29;
  vect_array.65[1] = vect__13.62_28;
  MEM <int[8]> [(int *)vectp_a.63_27] = .STORE_LANES (vect_array.65);

...
t.c:23:21: note:   node 0x50dc520 (max_nunits=4, refcnt=2) vector(4) int
t.c:23:21: note:   op: VEC_PERM_EXPR
t.c:23:21: note:        stmt 0 _5 = *_4;
t.c:23:21: note:        lane permutation { 0[0] }
t.c:23:21: note:        children 0x50dc948
t.c:23:21: note:   node 0x50dc780 (max_nunits=4, refcnt=2) vector(4) int
t.c:23:21: note:   op: VEC_PERM_EXPR
t.c:23:21: note:        stmt 0 _11 = *_10;
t.c:23:21: note:        lane permutation { 0[1] }
t.c:23:21: note:        children 0x50dc948

These are the selection nodes, marked with ldst_lanes = true.
They generate no code themselves.

t.c:23:21: note:   node 0x50dc948 (max_nunits=4, refcnt=3) vector(4) int
t.c:23:21: note:   op template: _5 = *_4;
t.c:23:21: note:        stmt 0 _5 = *_4;
t.c:23:21: note:        stmt 1 _11 = *_10;
t.c:23:21: note:        load permutation { 0 1 }

This is the load node, marked with ldst_lanes = true (the load
permutation is only accurate when taking the lane permutes in the
selection nodes into account).  It code-generates

  vect_array.58 = .LOAD_LANES (MEM <int[8]> [(int *)vectp_b.56_33]);
  vect__5.59_31 = vect_array.58[0];
  vect__5.60_30 = vect_array.58[1];

This scheme allows code generation in vectorizable_load/store to stay
mostly as-is.

While this should support both load-lanes and (masked) store-lanes,
the decision to use either is made at SLP discovery time and cannot
be reversed without altering the SLP tree.  As-is, the SLP tree is
not usable for non-store-lanes on the store side; the load side is
OK representation-wise, but will very likely fail permute handling
because the lowering to deal with the two-input-vector restriction
isn't done - and since the permute node is marked as to be ignored,
that doesn't work out.  So I've put restrictions in place that fail
vectorization if a load/store-lanes SLP tree is later classified
differently by get_load_store_type.

I'll note that, for example, gcc.target/aarch64/sve/mask_struct_store_3.c
will not get SLP store-lanes used because the full store SLPs just
fine; we then fail to handle the "splat" load-permutation

t2.c:5:21: note:   node 0x4db2630 (max_nunits=4, refcnt=2) vector([4,4]) int
t2.c:5:21: note:   op template: _6 = *_5;
t2.c:5:21: note:        stmt 0 _6 = *_5;
t2.c:5:21: note:        stmt 1 _6 = *_5;
t2.c:5:21: note:        stmt 2 _6 = *_5;
t2.c:5:21: note:        stmt 3 _6 = *_5;
t2.c:5:21: note:        load permutation { 0 0 0 0 }

since the load permute lowering code currently doesn't consider it
worth lowering single loads from a group (or, in this case,
non-grouped loads).  The expectation is that the target can handle
this with two interleaves of the vector with itself.

So what we see here is that while the explicit SLP representation is
helpful in some cases, in cases like this it would require changing
the representation when we decide how to vectorize.  My idea is that
this will all change a lot when we re-do SLP discovery (for loops)
and when we get rid of non-SLP, as I think vectorizable_* should be
allowed to alter the SLP graph during analysis.

The patch also removes, from the main loop vector analysis code, the
code cancelling SLP when load/store-lanes can be used, and
re-implements it by re-discovering the SLP instance with forced
single-lane splits so that the SLP load/store-lanes scheme can be
used.

This is now done after SLP discovery and SLP pattern recognition are
complete, so as not to disturb the latter, and it is a per-SLP-instance
decision instead of a global decision on the whole loop.

This is a behavioral change that shows, for example, in
gcc.dg/vect/slp-perm-6.c on ARM, where we formerly used SLP permutes
but now use a mix of SLP without permutes and load/store-lanes.  The
previous flaky heuristic is now flaky in a different way.

Testing on RISC-V and aarch64 reveals several testcases that require
adjustment to now expect SLP even when load/store-lanes are being
used.  If in doubt I've adjusted them to the final expectation, which
will lead to one or two new FAILs where we still do the SLP
cancelling.  I have a followup in final testing that implements that
while remaining in SLP.

Note that gcc.dg/vect/slp-42.c and gcc.dg/vect/pr68445.c will FAIL
on aarch64 with SVE because, for some odd reason, vect_stridedN is
true for any N on check_effective_target_vect_fully_masked targets,
but SVE cannot do ld8 while RISC-V can.

I have not bothered to adjust target tests that now fail assembly-scan.

* tree-vectorizer.h (_slp_tree::ldst_lanes): New flag to mark
load, store and permute nodes.
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize ldst_lanes.
(vect_build_slp_instance): For stores, iff the target prefers
store-lanes, discover single-lane sub-groups; do not perform
interleaving lowering but mark the node with ldst_lanes.
Also allow i == 0 (fatal failure) for splitting up a store group
when we're not doing single-lane discovery already.
(vect_lower_load_permutations): When the target supports
load-lanes and the loads all fit the pattern, split out
only a single level of permutes and mark the load and
permute nodes with ldst_lanes.
(vectorizable_slp_permutation_1): Handle the load-lane permute
forwarding of vector defs.
(vect_analyze_slp): After SLP pattern recog is finished see if
there are any SLP instances that would benefit from using
load/store-lanes and re-discover those with forced single lanes.
* tree-vect-stmts.cc (get_group_load_store_type): Support
load/store-lanes for SLP.
(vectorizable_store): Support SLP code generation for store-lanes.
(vectorizable_load): Support SLP code generation for load-lanes.
* tree-vect-loop.cc (vect_analyze_loop_2): Do not cancel SLP
when store-lanes can be used.

* gcc.dg/vect/slp-55.c: New testcase.
* gcc.dg/vect/slp-56.c: Likewise.
* gcc.dg/vect/slp-11c.c: Adjust.
* gcc.dg/vect/slp-53.c: Likewise.
* gcc.dg/vect/slp-cond-1.c: Likewise.
* gcc.dg/vect/vect-complex-5.c: Likewise.
* gcc.dg/vect/slp-1.c: Likewise.
* gcc.dg/vect/slp-54.c: Remove riscv XFAIL.
* gcc.dg/vect/slp-perm-5.c: Adjust.
* gcc.dg/vect/slp-perm-7.c: Likewise.
* gcc.dg/vect/slp-perm-8.c: Likewise.
* gcc.dg/vect/slp-multitypes-11.c: Likewise.
* gcc.dg/vect/slp-multitypes-11-big-array.c: Likewise.
* gcc.dg/vect/slp-perm-9.c: Remove expected SLP fail due to
three-vector permute.
* gcc.dg/vect/slp-perm-6.c: Remove XFAIL.
* gcc.dg/vect/slp-perm-1.c: Adjust.
* gcc.dg/vect/slp-perm-2.c: Likewise.
* gcc.dg/vect/slp-perm-3.c: Likewise.
* gcc.dg/vect/slp-perm-4.c: Likewise.
* gcc.dg/vect/pr68445.c: Likewise.
* gcc.dg/vect/slp-11b.c: Likewise.
* gcc.dg/vect/slp-2.c: Likewise.
* gcc.dg/vect/slp-23.c: Likewise.
* gcc.dg/vect/slp-33.c: Likewise.
* gcc.dg/vect/slp-42.c: Likewise.
* gcc.dg/vect/slp-46.c: Likewise.
* gcc.dg/vect/slp-perm-10.c: Likewise.

31 files changed:
gcc/testsuite/gcc.dg/vect/pr68445.c
gcc/testsuite/gcc.dg/vect/slp-1.c
gcc/testsuite/gcc.dg/vect/slp-11b.c
gcc/testsuite/gcc.dg/vect/slp-11c.c
gcc/testsuite/gcc.dg/vect/slp-2.c
gcc/testsuite/gcc.dg/vect/slp-23.c
gcc/testsuite/gcc.dg/vect/slp-33.c
gcc/testsuite/gcc.dg/vect/slp-42.c
gcc/testsuite/gcc.dg/vect/slp-46.c
gcc/testsuite/gcc.dg/vect/slp-53.c
gcc/testsuite/gcc.dg/vect/slp-54.c
gcc/testsuite/gcc.dg/vect/slp-55.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/slp-56.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/slp-cond-1.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-11-big-array.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-11.c
gcc/testsuite/gcc.dg/vect/slp-perm-1.c
gcc/testsuite/gcc.dg/vect/slp-perm-10.c
gcc/testsuite/gcc.dg/vect/slp-perm-2.c
gcc/testsuite/gcc.dg/vect/slp-perm-3.c
gcc/testsuite/gcc.dg/vect/slp-perm-4.c
gcc/testsuite/gcc.dg/vect/slp-perm-5.c
gcc/testsuite/gcc.dg/vect/slp-perm-6.c
gcc/testsuite/gcc.dg/vect/slp-perm-7.c
gcc/testsuite/gcc.dg/vect/slp-perm-8.c
gcc/testsuite/gcc.dg/vect/slp-perm-9.c
gcc/testsuite/gcc.dg/vect/vect-complex-5.c
gcc/tree-vect-loop.cc
gcc/tree-vect-slp.cc
gcc/tree-vect-stmts.cc
gcc/tree-vectorizer.h

index 15bffdc7e05f5f5237d905d2bbf07d7231938d27..71d61b93bf662f1ca8a7be37769a62990863dc86 100644 (file)
@@ -16,4 +16,4 @@ void IMB_double_fast_x (int *destf, int *dest, int y, int *p1f)
     }
 }
 
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */
index d4a13f12df664f76cd69c158df844ed3a202f350..e1a45e1f1a7a7b29d4f5bd65f21686d88a38aa54 100644 (file)
@@ -122,5 +122,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target {! vect_strided5 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target vect_strided5 } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
index df64c8db350dbb12295c61e84d32d5a5c20a1ebe..0208f03dafb482d08d748df52ad75e3f5c4b4e85 100644 (file)
@@ -45,4 +45,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_strided4 || vect_perm } && vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && vect_int_mult } } } } */
index 2e70fca39ba1bc0a9b1a15d67612a1dd1c88374a..25d7f2ce3832f579d6a603da220bcf9a67346375 100644 (file)
@@ -45,5 +45,4 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { { vect_uintfloat_cvt && vect_strided2 } && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */
index d0de3577eb6a1b8219e8a79a1a684f6b1b7baf52..08d2116c3bee49bf684e3eb7e31e5c485e844894 100644 (file)
@@ -144,5 +144,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
   
index 8836acf03306742605734342aa09bb2d7893694d..d32ee5ba73becb9e0b53bfc2af27a64571c56899 100644 (file)
@@ -114,5 +114,5 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */
 /* SLP fails for the second loop with variable-length SVE because
    the load size is greater than the minimum vector size.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */
   
index c382093c2329b09d3ef9e78abadd1f7ffe22dfda..9c6c1e4cbecf06dde53f6a2240c268ec3e01d3da 100644 (file)
@@ -108,7 +108,7 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  {target {vect_uintfloat_cvt && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  {target {{! { vect_uintfloat_cvt}} && {! {vect_int_mult}}} } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target {vect_uintfloat_cvt && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  {target {{! { vect_uintfloat_cvt}} && {! {vect_int_mult}}} } } } */
   
index 6b78246c2dff027db93e9572182a041722618955..53eca6b6648ddbda6fa60a398f9875e18fab5106 100644 (file)
@@ -15,5 +15,5 @@ void foo (int n)
     }
 }
 
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail { vect_variable_length && { ! vect_strided8 } } } } } */
 /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
index bf445473657c39cdf4c01f271aa02f4d11fcc4f8..b44a673f7dec50cdb32e5da0ba1360922296c699 100644 (file)
@@ -98,4 +98,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */
index d8cd5f85b3c48b876261ea1c4725a3457ce70173..50b3e9d3cee511984fd652091167d84815dfa824 100644 (file)
@@ -12,4 +12,5 @@ void foo (int * __restrict x, int *y)
     }
 }
 
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target { vect_load_lanes } } } } */
index ab66b349d1f405b20444858e4d2ba230a8ee197f..57268ab50b7b9fd36be4491206032fae97307823 100644 (file)
@@ -15,4 +15,4 @@ void foo (int * __restrict x, int *y)
     }
 }
 
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } xfail riscv*-*-* } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-55.c b/gcc/testsuite/gcc.dg/vect/slp-55.c
new file mode 100644 (file)
index 0000000..0bf65ef
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_int_mult } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+void foo (int * __restrict a, int *b, int *c)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[2*i] = b[i] + 7;
+      a[2*i+1] = c[i] * 3;
+    }
+}
+
+int bar (int *b)
+{
+  int res = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      res += b[2*i] + 7;
+      res += b[2*i+1] * 3;
+    }
+  return res;
+}
+
+void baz (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[2*i] = b[2*i] + 7;
+      a[2*i+1] = b[2*i+1] * 3;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-56.c b/gcc/testsuite/gcc.dg/vect/slp-56.c
new file mode 100644 (file)
index 0000000..0b985ea
--- /dev/null
@@ -0,0 +1,51 @@
+#include "tree-vect.h"
+
+/* This is a load-lane / masked-store-lane test that more reliably
+   triggers SLP than SVE's mask_struct_store_*.c.  */
+
+void __attribute__ ((noipa))
+test4 (int *__restrict dest, int *__restrict src,
+       int *__restrict cond, int bias, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      int value0 = src[i * 4] + bias;
+      int value1 = src[i * 4 + 1] * bias;
+      int value2 = src[i * 4 + 2] + bias;
+      int value3 = src[i * 4 + 3] * bias;
+      if (cond[i])
+        {
+          dest[i * 4] = value0;
+          dest[i * 4 + 1] = value1;
+          dest[i * 4 + 2] = value2;
+          dest[i * 4 + 3] = value3;
+        }
+    }
+}
+
+int dest[16*4];
+int src[16*4];
+int cond[16];
+const int dest_chk[16*4] = {0, 0, 0, 0, 9, 25, 11, 35, 0, 0, 0, 0, 17, 65, 19,
+    75, 0, 0, 0, 0, 25, 105, 27, 115, 0, 0, 0, 0, 33, 145, 35, 155, 0, 0, 0,
+    0, 41, 185, 43, 195, 0, 0, 0, 0, 49, 225, 51, 235, 0, 0, 0, 0, 57, 265, 59,
+    275, 0, 0, 0, 0, 65, 305, 67, 315};
+
+int main()
+{
+  check_vect ();
+#pragma GCC novector
+  for (int i = 0; i < 16; ++i)
+    cond[i] = i & 1;
+#pragma GCC novector
+  for (int i = 0; i < 16 * 4; ++i)
+    src[i] = i;
+  test4 (dest, src, cond, 5, 16);
+#pragma GCC novector
+  for (int i = 0; i < 16 * 4; ++i)
+    if (dest[i] != dest_chk[i])
+      abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target { vect_variable_length && vect_load_lanes } } } } */
index c76ea5d17efa938d61a2b967e1392cd9860b1bbe..16ab0cc76059720f124ea126941b9252d122cf96 100644 (file)
@@ -125,5 +125,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target { ! vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { target { vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
index 2792b932734a7a8ad4958454de56956081753d7c..07f871c897265855f31edc368685fe1eeb28d8ea 100644 (file)
@@ -56,5 +56,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_unpack } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack xfail { vect_variable_length && vect_load_lanes } } } } */
-
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack } } } */
index 5c75dc12b695785405b7d56891e7e71ac24e2539..0f7b479ce594bf5c045062cf0a6e8d14cfb61066 100644 (file)
@@ -51,5 +51,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_unpack } } } */
 /* The epilogues are vectorized using partial vectors.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target { vect_unpack && { { ! vect_partial_vectors_usage_1 } || s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target { vect_unpack && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  { target { { vect_unpack && vect_partial_vectors_usage_1 } && { ! s390_vx } } } } } */
index dbb107f95fec3338b135ff965e8be2b514cc1fe6..93b59075bce37550e2c6f8af5e617f44edc82d0e 100644 (file)
@@ -81,9 +81,8 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump "can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
 
index 03de4c61b503db3ab0d48b09e18dfc74364da66d..2cce30c2444323ba6166ceee6a768fbd9d881a47 100644 (file)
@@ -53,4 +53,4 @@ int main ()
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
 /* SLP fails for variable-length SVE because the load size is greater
    than the minimum vector size.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { { aarch64_sve || riscv_v } && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm xfail { aarch64_sve && vect_variable_length } } } } */
index 41fd159adce8395dd805f089e94aacfe7eeba09f..6ac29e731226542abcca9a8b142451d5eb64cc43 100644 (file)
@@ -55,8 +55,6 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes  } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
index 9ea35ba5afca2db0033150e35fca6b961b389c03..d1953054892550b16b45b954b6531826c4f8002b 100644 (file)
@@ -68,9 +68,7 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm && {! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm || vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
 
index f4bda39c837dbe17985caa7880f00b1d4d4e357d..107968f1f7ce65c53bf0280e700f659f625d8c1e 100644 (file)
@@ -115,4 +115,4 @@ int main (int argc, const char* argv[])
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! { vect_load_lanes && vect_strided5 } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
index 7128cf471555d5f589b11e1e58a65b0211e7d6fd..0dedd4a9b86c547c265cf4f10e62e81e6782ecc2 100644 (file)
@@ -105,9 +105,6 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
-
index 5cc6261d69a15d2a3f6b691c13544c27dc8f9941..000848c587c311fcdb0a1294a4d586cb3e66b02c 100644 (file)
@@ -106,5 +106,5 @@ int main (int argc, const char* argv[])
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm3_int } } } */
 /* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
-/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes xfail vect_perm3_int } } } */
+/* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
index df13c37bc75d43173d4e1b9d0daf533ba5829c7f..f15736ef729ddc379d8d32f69920849c2b3d986a 100644 (file)
@@ -97,8 +97,6 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_int || vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
index 029be5485b62ffef915f3b6b28306501852733d7..7610524f0bf587702b20d1ea3425324d8ba081bf 100644 (file)
@@ -61,10 +61,8 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_byte } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_byte && { { ! vect_partial_vectors_usage_1 } || s390_vx } } } } } */
 /* The epilogues are vectorized using partial vectors.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && { { ! vect_load_lanes } && { vect_partial_vectors_usage_1 && { ! s390_vx } } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump "Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_byte && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_byte && {  vect_partial_vectors_usage_1 && { ! s390_vx } } } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
index 89400fb4565920940b710f8c3f4d41a308679fa7..c9468d81a9de55076e473ca3019e5faf731f36d1 100644 (file)
@@ -58,7 +58,5 @@ int main (int argc, const char* argv[])
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */
 /* We don't try permutes with a group size of 3 for variable-length
    vectors.  */
-/* { dg-final { scan-tree-dump "permutation requires at least three vectors" "vect" { target { vect_perm_short && { ! vect_perm3_short } } xfail vect_variable_length } } } */
-/* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_perm3_short || { vect32 || vect_load_lanes } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short || { vect32 || vect_load_lanes } } } } } */
index ac562dc475c7bf73cf49a662fa406d51b6bddf69..0d850720d63151d596e8f99bbef58e23222ab6db 100644 (file)
@@ -40,5 +40,4 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target vect_load_lanes } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { ! vect_hw_misalign } } } } */
index 1fb7bbd4d258190c4deb99f01ca2cc7d4939e5d3..242d5e2d9169387e44b426fa22186a3f1850d594 100644 (file)
@@ -2958,82 +2958,6 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
-
-      /* Check whether any load in ALL SLP instances is possibly permuted.  */
-      slp_tree load_node, slp_root;
-      unsigned i, x;
-      slp_instance instance;
-      bool can_use_lanes = true;
-      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
-       {
-         slp_root = SLP_INSTANCE_TREE (instance);
-         int group_size = SLP_TREE_LANES (slp_root);
-         tree vectype = SLP_TREE_VECTYPE (slp_root);
-         bool loads_permuted = false;
-         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
-           {
-             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
-               continue;
-             unsigned j;
-             stmt_vec_info load_info;
-             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
-               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
-                 {
-                   loads_permuted = true;
-                   break;
-                 }
-           }
-
-         /* If the loads and stores can be handled with load/store-lane
-            instructions record it and move on to the next instance.  */
-         if (loads_permuted
-             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
-             && vect_store_lanes_supported (vectype, group_size, false)
-                  != IFN_LAST)
-           {
-             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
-               if (STMT_VINFO_GROUPED_ACCESS
-                     (SLP_TREE_REPRESENTATIVE (load_node)))
-                 {
-                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
-                       (SLP_TREE_REPRESENTATIVE (load_node));
-                   /* Use SLP for strided accesses (or if we can't
-                      load-lanes).  */
-                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
-                       || vect_load_lanes_supported
-                            (STMT_VINFO_VECTYPE (stmt_vinfo),
-                             DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
-                     break;
-                 }
-
-             can_use_lanes
-               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
-
-             if (can_use_lanes && dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location,
-                                "SLP instance %p can use load/store-lanes\n",
-                                (void *) instance);
-           }
-         else
-           {
-             can_use_lanes = false;
-             break;
-           }
-       }
-
-      /* If all SLP instances can use load/store-lanes abort SLP and try again
-        with SLP disabled.  */
-      if (can_use_lanes)
-       {
-         ok = opt_result::failure_at (vect_location,
-                                      "Built SLP cancelled: can use "
-                                      "load/store-lanes\n");
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Built SLP cancelled: all SLP instances support "
-                            "load/store-lanes\n");
-         goto again;
-       }
     }
 
   /* Dissolve SLP-only groups.  */
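The loop-wide cancel-SLP check removed above is superseded by the per-instance re-discovery added later in this patch. For orientation, the store side the patch targets, a st2-style `.STORE_LANES`, can be sketched in scalar C; the two-lane case and width 4 are illustrative assumptions, not the GCC representation:

```c
#include <assert.h>

/* Illustrative scalar model of a two-lane .STORE_LANES (AArch64
   st2-style store): the two vectors of the array argument are
   interleaved element-wise into memory, matching the a[2*i] /
   a[2*i+1] stores.  VL = 4 is an assumption for this sketch.  */
#define VL 4

void store_lanes2 (int *dst, const int v0[VL], const int v1[VL])
{
  for (int i = 0; i < VL; ++i)
    {
      dst[2 * i]     = v0[i];   /* lane 0 of group element i */
      dst[2 * i + 1] = v1[i];   /* lane 1 of group element i */
    }
}
```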
index 2304cdac583862b9be57681550b0455dad233f68..5a65a99d61ed9600d7793a68899f4d6089c35ad8 100644 (file)
@@ -121,6 +121,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
+  this->ldst_lanes = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_REF_COUNT (this) = 1;
@@ -3483,7 +3484,8 @@ static bool
 vect_analyze_slp_instance (vec_info *vinfo,
                           scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info, slp_instance_kind kind,
-                          unsigned max_tree_size, unsigned *limit);
+                          unsigned max_tree_size, unsigned *limit,
+                          bool force_single_lane = false);
 
 /* Build an interleaving scheme for the store sources RHS_NODES from
    SCALAR_STMTS.  */
@@ -3678,7 +3680,8 @@ vect_build_slp_instance (vec_info *vinfo,
                         unsigned max_tree_size, unsigned *limit,
                         scalar_stmts_to_slp_tree_map_t *bst_map,
                         /* ???  We need stmt_info for group splitting.  */
-                        stmt_vec_info stmt_info_)
+                        stmt_vec_info stmt_info_,
+                        bool force_single_lane = false)
 {
   /* If there's no budget left bail out early.  */
   if (*limit == 0)
@@ -3707,9 +3710,17 @@ vect_build_slp_instance (vec_info *vinfo,
   poly_uint64 max_nunits = 1;
   unsigned tree_size = 0;
   unsigned i;
-  slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
-                                      &max_nunits, matches, limit,
-                                      &tree_size, bst_map);
+
+  slp_tree node = NULL;
+  if (force_single_lane)
+    {
+      matches[0] = true;
+      matches[1] = false;
+    }
+  else
+    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+                               &max_nunits, matches, limit,
+                               &tree_size, bst_map);
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -3905,10 +3916,33 @@ vect_build_slp_instance (vec_info *vinfo,
       /* For loop vectorization split the RHS into arbitrary pieces of
         size >= 1.  */
       else if (is_a <loop_vec_info> (vinfo)
-              && (i > 0 && i < group_size)
-              && !vect_slp_prefer_store_lanes_p (vinfo,
-                                                 stmt_info, group_size, i))
-       {
+              && (group_size != 1 && i < group_size))
+       {
+         /* There are targets that cannot do even/odd interleaving schemes
+            and so absolutely need to use load/store-lanes.  For now
+            force single-lane SLP for them - they would also be happy
+            with uniform power-of-two lanes (depending on element size),
+            but even if we could use 'i' as an indicator we would need to
+            backtrack when later lanes fail to discover with the same
+            granularity.  We cannot turn strided or scatter stores
+            into store-lanes.  */
+         /* ???  If this is not in sync with what get_load_store_type
+            later decides, the SLP representation is not good for other
+            store vectorization methods.  */
+         bool want_store_lanes
+           = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+              && ! STMT_VINFO_STRIDED_P (stmt_info)
+              && compare_step_with_zero (vinfo, stmt_info) > 0
+              && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
+                                                group_size, 1));
+         if (want_store_lanes || force_single_lane)
+           i = 1;
+
+         /* A fatal discovery failure doesn't always rule out
+            single-lane SLP, so try that.  */
+         if (i == 0)
+           i = 1;
+
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Splitting SLP group at stmt %u\n", i);
@@ -3942,7 +3976,10 @@ vect_build_slp_instance (vec_info *vinfo,
                                               (max_nunits, end - start));
                  rhs_nodes.safe_push (node);
                  start = end;
-                 end = group_size;
+                 if (want_store_lanes || force_single_lane)
+                   end = start + 1;
+                 else
+                   end = group_size;
                }
              else
                {
@@ -3976,7 +4013,31 @@ vect_build_slp_instance (vec_info *vinfo,
            }
 
          /* Now we assume we can build the root SLP node from all stores.  */
-         node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
+         if (want_store_lanes)
+           {
+             /* For store-lanes feed the store node with all RHS nodes
+                in order.  */
+             node = vect_create_new_slp_node (scalar_stmts,
+                                              SLP_TREE_CHILDREN
+                                                (rhs_nodes[0]).length ());
+             SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+             node->ldst_lanes = true;
+             SLP_TREE_CHILDREN (node)
+               .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
+                               + rhs_nodes.length () - 1);
+             /* First store value and possibly mask.  */
+             SLP_TREE_CHILDREN (node)
+               .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
+             /* Rest of the store values.  All mask nodes are the same;
+                this should be guaranteed by dataref group discovery.  */
+             for (unsigned j = 1; j < rhs_nodes.length (); ++j)
+               SLP_TREE_CHILDREN (node)
+                 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+             for (slp_tree child : SLP_TREE_CHILDREN (node))
+               child->refcnt++;
+           }
+         else
+           node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
 
          while (!rhs_nodes.is_empty ())
            vect_free_slp_tree (rhs_nodes.pop ());
@@ -4043,7 +4104,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
                           scalar_stmts_to_slp_tree_map_t *bst_map,
                           stmt_vec_info stmt_info,
                           slp_instance_kind kind,
-                          unsigned max_tree_size, unsigned *limit)
+                          unsigned max_tree_size, unsigned *limit,
+                          bool force_single_lane)
 {
   vec<stmt_vec_info> scalar_stmts;
 
@@ -4088,7 +4150,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
                                      roots, remain,
                                      max_tree_size, limit, bst_map,
                                      kind == slp_inst_kind_store
-                                     ? stmt_info : NULL);
+                                     ? stmt_info : NULL, force_single_lane);
 
   /* ???  If this is slp_inst_kind_store and the above succeeded here's
      where we should do store group splitting.  */
@@ -4184,12 +4246,50 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
      lower.  */
   stmt_vec_info first
     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
+  unsigned group_lanes = DR_GROUP_SIZE (first);
+
+  /* Verify if all load permutations can be implemented with a suitably
+     large element load-lanes operation.  */
+  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+  if (STMT_VINFO_STRIDED_P (first)
+      || compare_step_with_zero (loop_vinfo, first) <= 0
+      || exact_log2 (ld_lanes_lanes) == -1
+      /* ???  For now only support the single-lane case as there is
+        missing support on the store-lane side and code generation
+        isn't up to the task yet.  */
+      || ld_lanes_lanes != 1
+      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+                                   group_lanes / ld_lanes_lanes,
+                                   false) == IFN_LAST)
+    ld_lanes_lanes = 0;
+  else
+    /* Verify the loads access the same number of lanes aligned to
+       ld_lanes_lanes.  */
+    for (slp_tree load : loads)
+      {
+       if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+         {
+           ld_lanes_lanes = 0;
+           break;
+         }
+       unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+       if (first % ld_lanes_lanes != 0)
+         {
+           ld_lanes_lanes = 0;
+           break;
+         }
+       for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+         if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+           {
+             ld_lanes_lanes = 0;
+             break;
+           }
+      }
 
   /* Only a power-of-two number of lanes matches interleaving with N levels.
      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
      at each step.  */
-  unsigned group_lanes = DR_GROUP_SIZE (first);
-  if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
+  if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
     return;
 
   for (slp_tree load : loads)
@@ -4206,7 +4306,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
         with a non-1:1 load permutation around instead of canonicalizing
         those into a load and a permute node.  Removing this early
         check would do such canonicalization.  */
-      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+         && ld_lanes_lanes == 0)
        continue;
 
       /* First build (and possibly re-use) a load node for the
@@ -4239,10 +4340,20 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
        final_perm.quick_push
          (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
 
+      if (ld_lanes_lanes != 0)
+       {
+         /* ???  If this is not in sync with what get_load_store_type
+            later decides, the SLP representation is not good for other
+            store vectorization methods.  */
+         l0->ldst_lanes = true;
+         load->ldst_lanes = true;
+       }
+
       while (1)
        {
          unsigned group_lanes = SLP_TREE_LANES (l0);
-         if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+         if (ld_lanes_lanes != 0
+             || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
            break;
 
          /* Try to lower by reducing the group to half its size using an
@@ -4570,6 +4681,94 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
        }
     }
 
+  /* Check whether we should force some SLP instances to use load/store-lanes
+     and do so by forcing SLP re-discovery with single lanes.  We used
+     to cancel SLP when this applied to all instances in a loop but now
+     we decide this per SLP instance.  It's important to do this only
+     after SLP pattern recognition.  */
+  if (is_a <loop_vec_info> (vinfo))
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+         && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
+       {
+         slp_tree slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+
+         auto_vec<slp_tree> loads;
+         hash_set<slp_tree> visited;
+         vect_gather_slp_loads (loads, slp_root, visited);
+
+         /* Check whether any load in the SLP instance is possibly
+            permuted.  */
+         bool loads_permuted = false;
+         slp_tree load_node;
+         unsigned j;
+         FOR_EACH_VEC_ELT (loads, j, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned k;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can use load/store-lanes force re-discovery
+            with single lanes.  */
+         if (loads_permuted
+             && !slp_root->ldst_lanes
+             && vect_store_lanes_supported (vectype, group_size, false)
+             != IFN_LAST)
+           {
+             bool can_use_lanes = true;
+             FOR_EACH_VEC_ELT (loads, j, load_node)
+               if (STMT_VINFO_GROUPED_ACCESS
+                     (SLP_TREE_REPRESENTATIVE (load_node)))
+                 {
+                   stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                       (SLP_TREE_REPRESENTATIVE (load_node));
+                   /* Use SLP for strided accesses (or if we can't
+                      load-lanes).  */
+                   if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                       || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
+                       || vect_load_lanes_supported
+                            (STMT_VINFO_VECTYPE (stmt_vinfo),
+                             DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+                     {
+                       can_use_lanes = false;
+                       break;
+                     }
+                 }
+
+             if (can_use_lanes)
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "SLP instance %p can use load/store-lanes,"
+                                    " re-discovering with single-lanes\n",
+                                    (void *) instance);
+
+                 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
+
+                 vect_free_slp_instance (instance);
+                 limit = max_tree_size;
+                 bool res = vect_analyze_slp_instance (vinfo, bst_map,
+                                                       stmt_info,
+                                                       slp_inst_kind_store,
+                                                       max_tree_size, &limit,
+                                                       true);
+                 gcc_assert (res);
+                 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
+                 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
+               }
+           }
+       }
+
   /* When we end up with load permutations that we cannot possibly handle,
      like those requiring three vector inputs, lower them using interleaving
      like schemes.  */
@@ -9877,6 +10076,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 
   gcc_assert (perm.length () == SLP_TREE_LANES (node));
 
+  /* Load-lanes permute.  This permute only acts as a forwarder to
+     select the correct vector def of the load-lanes load which
+     has the permuted vectors in its vector defs like
+     { v0, w0, r0, v1, w1, r1 ... } for a ld3.  */
+  if (node->ldst_lanes)
+    {
+      gcc_assert (children.length () == 1);
+      if (!gsi)
+       /* This is a trivial op always supported.  */
+       return 1;
+      slp_tree child = children[0];
+      unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+                         / SLP_TREE_LANES (node));
+      unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
+      for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+       {
+         tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
+         node->push_vec_def (def);
+       }
+      return 1;
+    }
+
   /* REPEATING_P is true if every output vector is guaranteed to use the
      same permute vector.  We can handle that case for both variable-length
      and constant-length vectors, but we only handle other cases for
index 72a29c0584b949a30bc44f20b108aeef950b3bb4..d2282c0dc4fd0490944d2f9db22510718dc773b3 100644 (file)
@@ -1509,7 +1509,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 
   unsigned int nvectors;
   if (slp_node)
-    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+    /* ???  Incorrect for multi-lane lanes.  */
+    nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
   else
     nvectors = vect_get_num_copies (loop_vinfo, vectype);
 
@@ -1795,7 +1796,7 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
    elements with a known constant step.  Return -1 if that step
    is negative, 0 if it is zero, and 1 if it is greater than zero.  */
 
-static int
+int
 compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
@@ -2070,6 +2071,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                 is irrelevant for them.  */
              *alignment_support_scheme = dr_unaligned_supported;
            }
+         /* Try using LOAD/STORE_LANES.  */
+         else if (slp_node->ldst_lanes
+                  && (*lanes_ifn
+                        = (vls_type == VLS_LOAD
+                           ? vect_load_lanes_supported (vectype, group_size, masked_p)
+                           : vect_store_lanes_supported (vectype, group_size,
+                                                         masked_p))) != IFN_LAST)
+           *memory_access_type = VMAT_LOAD_STORE_LANES;
          else
            *memory_access_type = VMAT_CONTIGUOUS;
 
@@ -8201,6 +8210,16 @@ vectorizable_store (vec_info *vinfo,
                            &lanes_ifn))
     return false;
 
+  if (slp_node
+      && slp_node->ldst_lanes
+      && memory_access_type != VMAT_LOAD_STORE_LANES)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "discovered store-lane but cannot use it.\n");
+      return false;
+    }
+
   if (mask)
     {
       if (memory_access_type == VMAT_CONTIGUOUS)
@@ -8717,7 +8736,7 @@ vectorizable_store (vec_info *vinfo,
   else
     {
       if (memory_access_type == VMAT_LOAD_STORE_LANES)
-       aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+       aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
       else
        aggr_type = vectype;
       bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -8774,11 +8793,24 @@ vectorizable_store (vec_info *vinfo,
 
   if (memory_access_type == VMAT_LOAD_STORE_LANES)
     {
-      gcc_assert (!slp && grouped_store);
+      if (costing_p && slp_node)
+       /* Update all incoming store operand nodes, the general handling
+          above only handles the mask and the first store operand node.  */
+       for (slp_tree child : SLP_TREE_CHILDREN (slp_node))
+         if (child != mask_node
+             && !vect_maybe_update_slp_op_vectype (child, vectype))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "incompatible vector types for invariants\n");
+             return false;
+           }
       unsigned inside_cost = 0, prologue_cost = 0;
       /* For costing some adjacent vector stores, we'd like to cost with
         the total number of them once instead of cost each one by one. */
       unsigned int n_adjacent_stores = 0;
+      if (slp)
+       ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
       for (j = 0; j < ncopies; j++)
        {
          gimple *new_stmt;
@@ -8796,7 +8828,7 @@ vectorizable_store (vec_info *vinfo,
                  op = vect_get_store_rhs (next_stmt_info);
                  if (costing_p)
                    update_prologue_cost (&prologue_cost, op);
-                 else
+                 else if (!slp)
                    {
                      vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
                                                     ncopies, op,
@@ -8811,15 +8843,15 @@ vectorizable_store (vec_info *vinfo,
                {
                  if (mask)
                    {
-                     vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
-                                                    mask, &vec_masks,
-                                                    mask_vectype);
+                     if (slp_node)
+                       vect_get_slp_defs (mask_node, &vec_masks);
+                     else
+                       vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+                                                      mask, &vec_masks,
+                                                      mask_vectype);
                      vec_mask = vec_masks[0];
                    }
 
-                 /* We should have catched mismatched types earlier.  */
-                 gcc_assert (
-                   useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
                  dataref_ptr
                    = vect_create_data_ref_ptr (vinfo, first_stmt_info,
                                                aggr_type, NULL, offset, &dummy,
@@ -8831,10 +8863,16 @@ vectorizable_store (vec_info *vinfo,
              gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
              /* DR_CHAIN is then used as an input to
                 vect_permute_store_chain().  */
-             for (i = 0; i < group_size; i++)
+             if (!slp)
                {
-                 vec_oprnd = (*gvec_oprnds[i])[j];
-                 dr_chain[i] = vec_oprnd;
+                 /* We should have caught mismatched types earlier.  */
+                 gcc_assert (
+                   useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
+                 for (i = 0; i < group_size; i++)
+                   {
+                     vec_oprnd = (*gvec_oprnds[i])[j];
+                     dr_chain[i] = vec_oprnd;
+                   }
                }
              if (mask)
                vec_mask = vec_masks[j];
@@ -8844,12 +8882,12 @@ vectorizable_store (vec_info *vinfo,
 
          if (costing_p)
            {
-             n_adjacent_stores += vec_num;
+             n_adjacent_stores += group_size;
              continue;
            }
 
          /* Get an array into which we can store the individual vectors.  */
-         tree vec_array = create_vector_array (vectype, vec_num);
+         tree vec_array = create_vector_array (vectype, group_size);
 
          /* Invalidate the current contents of VEC_ARRAY.  This should
             become an RTL clobber too, which prevents the vector registers
@@ -8857,9 +8895,19 @@ vectorizable_store (vec_info *vinfo,
          vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
 
          /* Store the individual vectors into the array.  */
-         for (i = 0; i < vec_num; i++)
+         for (i = 0; i < group_size; i++)
            {
-             vec_oprnd = dr_chain[i];
+             if (slp)
+               {
+                 slp_tree child;
+                 if (i == 0 || !mask_node)
+                   child = SLP_TREE_CHILDREN (slp_node)[i];
+                 else
+                   child = SLP_TREE_CHILDREN (slp_node)[i + 1];
+                 vec_oprnd = SLP_TREE_VEC_DEFS (child)[j];
+               }
+             else
+               vec_oprnd = dr_chain[i];
              write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
                                  i);
            }
@@ -8929,9 +8977,10 @@ vectorizable_store (vec_info *vinfo,
 
          /* Record that VEC_ARRAY is now dead.  */
          vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
-         if (j == 0)
+         if (j == 0 && !slp)
            *vec_stmt = new_stmt;
-         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+         if (!slp)
+           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
        }
 
       if (costing_p)
@@ -10035,6 +10084,16 @@ vectorizable_load (vec_info *vinfo,
                            &lanes_ifn))
     return false;
 
+  if (slp_node
+      && slp_node->ldst_lanes
+      && memory_access_type != VMAT_LOAD_STORE_LANES)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "discovered load-lane but cannot use it.\n");
+      return false;
+    }
+
   if (mask)
     {
       if (memory_access_type == VMAT_CONTIGUOUS)
@@ -10753,7 +10812,7 @@ vectorizable_load (vec_info *vinfo,
   else
     {
       if (memory_access_type == VMAT_LOAD_STORE_LANES)
-       aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+       aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
       else
        aggr_type = vectype;
       bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -10777,12 +10836,13 @@ vectorizable_load (vec_info *vinfo,
     {
       gcc_assert (alignment_support_scheme == dr_aligned
                  || alignment_support_scheme == dr_unaligned_supported);
-      gcc_assert (grouped_load && !slp);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
       /* For costing some adjacent vector loads, we'd like to cost with
         the total number of them once instead of cost each one by one. */
       unsigned int n_adjacent_loads = 0;
+      if (slp_node)
+       ncopies = slp_node->vec_stmts_size / group_size;
       for (j = 0; j < ncopies; j++)
        {
          if (costing_p)
@@ -10833,7 +10893,7 @@ vectorizable_load (vec_info *vinfo,
          if (mask)
            vec_mask = vec_masks[j];
 
-         tree vec_array = create_vector_array (vectype, vec_num);
+         tree vec_array = create_vector_array (vectype, group_size);
 
          tree final_mask = NULL_TREE;
          tree final_len = NULL_TREE;
@@ -10896,24 +10956,31 @@ vectorizable_load (vec_info *vinfo,
          gimple_call_set_nothrow (call, true);
          vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 
-         dr_chain.create (vec_num);
+         if (!slp)
+           dr_chain.create (group_size);
          /* Extract each vector into an SSA_NAME.  */
-         for (i = 0; i < vec_num; i++)
+         for (unsigned i = 0; i < group_size; i++)
            {
              new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
                                            vec_array, i);
-             dr_chain.quick_push (new_temp);
+             if (slp)
+               slp_node->push_vec_def (new_temp);
+             else
+               dr_chain.quick_push (new_temp);
            }
 
-         /* Record the mapping between SSA_NAMEs and statements.  */
-         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+         if (!slp)
+           /* Record the mapping between SSA_NAMEs and statements.  */
+           vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
 
          /* Record that VEC_ARRAY is now dead.  */
          vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
 
-         dr_chain.release ();
+         if (!slp)
+           dr_chain.release ();
 
-         *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+         if (!slp_node)
+           *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
        }
 
       if (costing_p)
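In the SLP paths above, each LOAD/STORE_LANES call covers a whole group, which is why the patch derives the copy count from the total vector stmt count. A trivial sketch of that relation (an illustrative helper, not a GCC function):

```c
#include <assert.h>

/* With SLP, SLP_TREE_NUMBER_OF_VEC_STMTS counts one vector def per
   lane per copy, so one ld/st-lanes operation per group gives
   ncopies = num_vec_stmts / group_size.  */
unsigned lanes_ncopies (unsigned num_vec_stmts, unsigned group_size)
{
  return num_vec_stmts / group_size;
}
```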
index df6c8ada2f7814ac1ea89913e881dd659bd2da62..699ae9e33ba93a8b9e80932dcb43caebab1806ed 100644 (file)
@@ -222,6 +222,9 @@ struct _slp_tree {
   unsigned int lanes;
   /* The operation of this node.  */
   enum tree_code code;
+  /* Whether uses of this load or feeders of this store are suitable
+     for load/store-lanes.  */
+  bool ldst_lanes;
 
   int vertex;
 
@@ -2313,6 +2316,7 @@ extern bool supportable_indirect_convert_operation (code_helper,
                                                    tree, tree,
                                                    vec<std::pair<tree, tree_code> > *,
                                                    tree = NULL_TREE);
+extern int compare_step_with_zero (vec_info *, stmt_vec_info);
 
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
                                  enum vect_cost_for_stmt, stmt_vec_info,