if (loop_vinfo
&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+ /* Avoid a masked epilogue if cascaded epilogues eventually get us
+ to one with VF 1, as that means no scalar epilogue at all. */
+ && !((GET_MODE_SIZE (loop_vinfo->vector_mode)
+ / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () == 16)
+ && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
&& ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
&& !OPTION_SET_P (param_vect_partial_vector_usage))
{
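The new condition is just byte arithmetic: GET_MODE_SIZE (loop_vinfo->vector_mode)
divided by the vectorization factor is the number of bytes a single scalar
iteration covers, and when that equals 16 the cascade of epilogues enabled by
X86_TUNE_AVX512_TWO_EPILOGUES ends in a 16-byte epilogue with VF 1, which
already leaves no scalar epilogue to mask.  A minimal standalone sketch of that
arithmetic, not part of the patch and using 64-byte vectors with VF 4 purely as
example values:

#include <stdio.h>

int
main (void)
{
  unsigned mode_size = 64;   /* GET_MODE_SIZE of the 64-byte main-loop mode.  */
  unsigned vf = 4;           /* LOOP_VINFO_VECT_FACTOR of the main loop.  */
  unsigned iter_bytes = mode_size / vf;   /* Bytes per scalar iteration.  */

  printf ("bytes per scalar iteration: %u\n", iter_bytes);
  /* Cascaded unmasked epilogues: a 32-byte (AVX2) one, then a 16-byte
     (SSE) one.  */
  for (unsigned size = 32; size >= 16; size /= 2)
    printf ("%u-byte epilogue VF: %u\n", size, size / iter_bytes);
  /* The last epilogue has VF 1, i.e. it can consume any remaining
     iterations one at a time, so no scalar epilogue is left and a masked
     epilogue would not save anything.  That is the iter_bytes == 16
     case the new condition skips.  */
  return 0;
}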
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-march=znver5" } */
+
+struct S {
+ float m_col1[4];
+ float m_col2[4];
+ float m_col3[4];
+ float m_col4[4];
+};
+
+void apply(struct S *s, const float *in, float *out, long numPixels)
+{
+ for (long idx = 0; idx < numPixels; ++idx)
+ {
+ const float r = in[0];
+ const float g = in[1];
+ const float b = in[2];
+ const float a = in[3];
+ out[0] = r*s->m_col1[0] + g*s->m_col2[0] + b*s->m_col3[0] + a*s->m_col4[0];
+ out[1] = r*s->m_col1[1] + g*s->m_col2[1] + b*s->m_col3[1] + a*s->m_col4[1];
+ out[2] = r*s->m_col1[2] + g*s->m_col2[2] + b*s->m_col3[2] + a*s->m_col4[2];
+ out[3] = r*s->m_col1[3] + g*s->m_col2[3] + b*s->m_col3[3] + a*s->m_col4[3];
+ in += 4;
+ out += 4;
+ }
+}
+
+/* Check that we do not use a masked epilogue but an SSE one with VF 1
+ (and possibly an AVX2 one as well). */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
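
In the loop above a single scalar iteration reads and writes four floats,
i.e. 16 bytes, so a 16-byte (SSE) vector epilogue has VF 1 and no scalar
epilogue is left over, while a 64-byte main loop and a 32-byte AVX2 epilogue
would have VF 4 and VF 2 respectively.  Those main-loop numbers are
illustrative only; the dump scan just checks that a 16-byte vector epilogue,
rather than a masked one, is produced.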