[PATCH] Fix SLP when ifcvt versioned loop is not vectorized

author Kugan Vivekanandarajah <kvivekananda@nvidia.com>

Wed, 30 Oct 2024 20:23:10 +0000 (07:23 +1100)

committer Kugan Vivekanandarajah <kvivekananda@nvidia.com>

Wed, 30 Oct 2024 20:23:16 +0000 (07:23 +1100)
author Kugan Vivekanandarajah <kvivekananda@nvidia.com>
Wed, 30 Oct 2024 20:23:10 +0000 (07:23 +1100)
committer Kugan Vivekanandarajah <kvivekananda@nvidia.com>
Wed, 30 Oct 2024 20:23:16 +0000 (07:23 +1100)
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c

new file mode 100644 (file)

index 0000000..b2cc1d1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c
@@ -0,0 +1,74 @@
+
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+#include <stdint.h>
+#include <string.h>
+
+/* Block format read through x: d and m are 16-bit indices passed to foo, val1 is loaded as one 32-bit word, and each val2 byte is split into two nibbles. */
+typedef struct {
+    uint16_t d;
+    uint16_t m;
+    uint8_t val1[4];
+    uint8_t val2[16];
+} st1;
+/* Block format read through y: float factors d and s plus 32 signed bytes that are multiplied with the values derived from st1. */
+typedef struct {
+    float d;
+    float s;
+    int8_t val2[32];
+} st2;
+/* Lookup table with 1 << 16 floats, one for each possible uint16_t index used by foo. */
+float table[1 << 16];
+/* Return the table entry selected by f; the 16-bit value is copied bit-for-bit with memcpy and the copy is used as the index. */
+inline static float foo(uint16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table[s];
+}
+
+/* For each of the n / 32 blocks, form two 16-term integer dot products of x's nibbles with y's signed bytes, scale them with table lookups and y's float factors, and store the accumulated float total in *s. */
+void test(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / 32;
+
+    /* View the untyped inputs as arrays of st1 (x) and st2 (y) blocks. */
+    const st1 * restrict x = vx;
+    const st2 * restrict y = vy;
+
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        uint32_t val1;
+        memcpy(&val1, x[i].val1, sizeof(val1));
+        /* sumi0 pairs low nibbles with y val2[0..15]; sumi1 pairs high nibbles with y val2[16..31]. When val1 is non-zero, bit j (resp. bit j + 16) of val1 is OR-ed in as bit 4 of each operand. */
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        if (val1) {
+            for (int j = 0; j < 16; ++j) {
+                const uint8_t xh_0 = ((val1 >> (j)) << 4) & 0x10;
+                const uint8_t xh_1 = ((val1 >> (j + 12)) ) & 0x10;
+
+                const int32_t x0 = (x[i].val2[j] & 0xF) | xh_0;
+                const int32_t x1 = (x[i].val2[j] >> 4) | xh_1;
+
+                sumi0 += (x0 * y[i].val2[j]);
+                sumi1 += (x1 * y[i].val2[j + 16]);
+            }
+        } else {
+            for (int j = 0; j < 16; ++j) {
+                const int32_t x0 = (x[i].val2[j] & 0xF);
+                const int32_t x1 = (x[i].val2[j] >> 4);
+
+                sumi0 += (x0 * y[i].val2[j]);
+                sumi1 += (x1 * y[i].val2[j + 16]);
+            }
+        }
+        /* Merge both partial sums, then fold the scaled block result into the running float total. */
+        int sumi = sumi0 + sumi1;
+        sumf += (foo(x[i].d)*y[i].d)*sumi + foo(x[i].m)*y[i].s;
+    }
+
+    *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1"  { target { { vect_int_mult && vect_element_align } && { ! powerpc*-*-* } } } } } */
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc

index af112f212fe2bb06ab7315ce0c9fcc187ca11466..16fa0ec1bb77765b5f27760c1affcc8853dcaedb 100644 (file)
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1326,6 +1326,7 @@ pass_vectorize::execute (function *fun)
             if (g)
               {
                 fold_loop_internal_call (g, boolean_false_node);
+               loop->dont_vectorize = false;
                 ret |= TODO_cleanup_cfg;
                 g = NULL;
               }
@@ -1335,6 +1336,7 @@ pass_vectorize::execute (function *fun)
             if (g)
               {
                 fold_loop_internal_call (g, boolean_false_node);
+               loop->dont_vectorize = false;
                 ret |= TODO_cleanup_cfg;
               }
           }
author	Kugan Vivekanandarajah <kvivekananda@nvidia.com>
	Wed, 30 Oct 2024 20:23:10 +0000 (07:23 +1100)
committer	Kugan Vivekanandarajah <kvivekananda@nvidia.com>
	Wed, 30 Oct 2024 20:23:16 +0000 (07:23 +1100)
gcc/testsuite/gcc.dg/vect/bb-slp-77.c	[new file with mode: 0644]	patch | blob
gcc/tree-vectorizer.cc		patch | blob | blame | history