> ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
m_costs[vect_body] = INT_MAX;
+ /* We'd like to avoid using masking here if there's an in-order
+    reduction to vectorize, because masking would also perform in-order
+    adds of the masked-out elements (as the neutral value, of course),
+    but there is currently no way to request that the unmasked epilogue
+    with the same mode be tried instead.  */
+
bool any_reduc_p = false;
for (int i = 0; i != X86_REDUC_LAST; i++)
if (m_num_reduc[i])
}
}
}
+ /* Avoid using masking if there's an in-order reduction to vectorize,
+    because that would also perform in-order adds of the masked-out
+    elements (as the neutral value, of course).  */
+ if (!avoid)
+ {
+ for (auto inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
+ if (SLP_INSTANCE_KIND (inst) == slp_inst_kind_reduc_group
+ && (vect_reduc_type (loop_vinfo, SLP_INSTANCE_TREE (inst))
+ == FOLD_LEFT_REDUCTION))
+ {
+ avoid = true;
+ break;
+ }
+ }
if (!avoid)
{
m_suggested_epilogue_mode = loop_vinfo->vector_mode;
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-mavx512bw -mtune-ctrl=avx512_masked_epilogues" } */
+
+void test (const unsigned char * __restrict__ pi,
+ const float * __restrict__ blk,
+ int texel_count,
+ float *pp_avg_rgb)
+{
+ float pp_avg_rgb_0 = 0.0f;
+ float pp_avg_rgb_1 = 0.0f;
+ float pp_avg_rgb_2 = 0.0f;
+ float pp_avg_rgb_3 = 0.0f;
+ for (int lane_id = 0; lane_id < texel_count; lane_id++) {
+ unsigned char r_byte = pi[lane_id * 4 + 0];
+ unsigned char g_byte = pi[lane_id * 4 + 1];
+ unsigned char b_byte = pi[lane_id * 4 + 2];
+ unsigned char a_byte = pi[lane_id * 4 + 3];
+
+ float r_float = blk[lane_id * 4 + 0];
+ float g_float = blk[lane_id * 4 + 1];
+ float b_float = blk[lane_id * 4 + 2];
+ float a_float = blk[lane_id * 4 + 3];
+
+ int r_is_zero = (r_byte == 0) ? 1 : 0;
+ int r_in_bounds = (texel_count > lane_id) ? 1 : 0;
+ int r_mask = r_is_zero * (-r_in_bounds);
+ if (r_mask != 0) {
+ pp_avg_rgb_0 += r_float;
+ }
+ int g_is_zero = (g_byte == 0) ? 1 : 0;
+ int g_in_bounds = (texel_count > lane_id) ? 1 : 0;
+ int g_mask = g_is_zero * (-g_in_bounds);
+ if (g_mask != 0) {
+ pp_avg_rgb_1 += g_float;
+ }
+ int b_is_zero = (b_byte == 0) ? 1 : 0;
+ int b_in_bounds = (texel_count > lane_id) ? 1 : 0;
+ int b_mask = b_is_zero * (-b_in_bounds);
+ if (b_mask != 0) {
+ pp_avg_rgb_2 += b_float;
+ }
+ int a_is_zero = (a_byte == 0) ? 1 : 0;
+ int a_in_bounds = (texel_count > lane_id) ? 1 : 0;
+ int a_mask = a_is_zero * (-a_in_bounds);
+ if (a_mask != 0) {
+ pp_avg_rgb_3 += a_float;
+ }
+ }
+ pp_avg_rgb[0] = pp_avg_rgb_0;
+ pp_avg_rgb[1] = pp_avg_rgb_1;
+ pp_avg_rgb[2] = pp_avg_rgb_2;
+ pp_avg_rgb[3] = pp_avg_rgb_3;
+}
+
+/* Even though there's an SLP opportunity, in-order reductions should never
+   use masked epilogues.  */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */