tree-optimization/123190 - allow VF == 1 epilog vectorization
author Richard Biener <rguenther@suse.de>
Wed, 14 Jan 2026 09:53:05 +0000 (10:53 +0100)
committer Richard Biener <rguenth@gcc.gnu.org>
Wed, 14 Jan 2026 13:44:00 +0000 (14:44 +0100)
The following adjusts the condition under which we reject vectorization
because the scalar loop runs for only a single iteration (or two, in
case we need to peel for gaps).  That check is over-eager for the case
of VF == 1, where the cost model should instead decide whether
vectorization is worthwhile.  I'm being conservative here and still
exclude the case of two iterations, as I do not have benchmark evidence
for it.
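
As a stand-alone sketch (hypothetical function and parameter names, not
GCC internals; the real check lives in vect_analyze_loop_costing in
gcc/tree-vect-loop.cc, see the hunk below), the adjusted decision looks
roughly like this:

#include <stdbool.h>

/* Hedged model of the adjusted check: reject tiny trip counts unless
   the vectorization factor is 1 and no peeling for gaps is needed, in
   which case the cost model gets to decide.  */
static bool
reject_for_tiny_trip_count (unsigned scalar_niters, unsigned assumed_vf,
                            unsigned peeling_gap)
{
  return scalar_niters <= peeling_gap + 1
         && (assumed_vf > 1 || peeling_gap != 0);
}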

This helps fix a regression observed with improved SLP handling; not
exactly for the options used in the PR, but with the more common
-O3 -march=x86-64-v3 this speeds up 433.milc by 6%.
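
To sketch why such kernels profit (a hedged illustration with invented
names, not the milc source): short fixed-trip-count loops over
real/imag pairs can leave a one-iteration epilogue behind the 32-byte
main loop, and with SLP that epilogue can still be covered by a single
16-byte vector at VF == 1:

typedef struct { double re, im; } cplx_t;  /* hypothetical example type */

void
scale3 (cplx_t *restrict dst, const cplx_t *restrict src, double s)
{
  /* Three scalar iterations of two doubles each.  One plausible shape:
     a 32-byte (4-double) main loop covers two iterations and leaves a
     single-iteration epilogue that a 16-byte vector still handles at
     VF == 1, which is exactly the case the patch no longer rejects.  */
  for (int j = 0; j < 3; j++)
    {
      dst[j].re = src[j].re * s;
      dst[j].im = src[j].im * s;
    }
}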

gcc/ChangeLog:

	PR tree-optimization/123190
	* tree-vect-loop.cc (vect_analyze_loop_costing): Allow
	vectorizing loops with a single scalar iteration iff the
	vectorization factor is 1.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: New testcase.
	* gcc.dg/vect/slp-28.c: Avoid epilogue vectorization for
	simplicity.

gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/slp-28.c
gcc/tree-vect-loop.cc

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
new file mode 100644
index 0000000..4265ac8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */
+
+typedef struct {
+   double real;
+   double imag;
+} complex;
+
+typedef struct { complex e[3][3]; } su3_matrix;
+
+void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){
+int i,j;
+register double t,ar,ai,br,bi,cr,ci;
+    for(i=0;i<3;i++)
+      for(j=0;j<3;j++){
+
+        ar=a->e[i][0].real; ai=a->e[i][0].imag;
+        br=b->e[j][0].real; bi=b->e[j][0].imag;
+        cr=ar*br; t=ai*bi; cr += t;
+        ci=ai*br; t=ar*bi; ci -= t;
+
+        ar=a->e[i][1].real; ai=a->e[i][1].imag;
+        br=b->e[j][1].real; bi=b->e[j][1].imag;
+        t=ar*br; cr += t; t=ai*bi; cr += t;
+        t=ar*bi; ci -= t; t=ai*br; ci += t;
+
+        ar=a->e[i][2].real; ai=a->e[i][2].imag;
+        br=b->e[j][2].real; bi=b->e[j][2].imag;
+        t=ar*br; cr += t; t=ai*bi; cr += t;
+        t=ar*bi; ci -= t; t=ai*br; ci += t;
+
+        c->e[i][j].real=cr;
+        c->e[i][j].imag=ci;
+    }
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-28.c b/gcc/testsuite/gcc.dg/vect/slp-28.c
index 1f987874f0df53f009cf535a8559632cd4ccb8a3..bf6271eed25301b28690e571d1d529645fb08118 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-28.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-28.c
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 74eecb832e6f4a049be9dd6f46af577f57e567d0..fdf544fa47b043afe68e0d5f8a65f04014b7836f 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1792,9 +1792,13 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
            }
        }
       /* Reject vectorizing for a single scalar iteration, even if
-        we could in principle implement that using partial vectors.  */
+        we could in principle implement that using partial vectors.
+        But allow such vectorization if VF == 1 and we do not need
+        to peel for gaps (if we do, avoid vectorization to limit
+        code footprint).  */
       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
-      if (scalar_niters <= peeling_gap + 1)
+      if (scalar_niters <= peeling_gap + 1
+         && (assumed_vf > 1 || peeling_gap != 0))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,