--- /dev/null
+/* For some targets we end up vectorizing the below loop such that each
+   single integer read from `sp` is loaded into a 4 integer vector.
+   While the writes are all safe, without 2 scalar iterations being peeled
+   into the epilogue we would read past the end of the 31 integer array.
+   This happens because we load a 4 integer chunk but use only the first
+   integer and increment by 2 integers at a time, hence the last load needs
+   sp[30-33] and the penultimate load needs sp[28-31].
+   This testcase ensures that we do not crash due to that behaviour.  */
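+/* Worked example of the access pattern described above: j runs from 0 to
+   MB_BLOCK_SIZE-1 (15) and each iteration reads sp[j*2], so the scalar
+   accesses are sp[0], sp[2], ..., sp[30].  A 4 integer vector load of the
+   final access touches sp[30-33] and the one before it touches sp[28-31],
+   while sp[30] is the last element of the 31 integer array, hence the two
+   peeled scalar iterations.  */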
+/* { dg-do run } */
+/* { dg-options "-std=gnu17 -O2 -ftree-vectorize -fno-vect-cost-model --param vect-partial-vector-usage=2 -mavx512bw -mprefer-vector-width=512" } */
+/* { dg-require-effective-target mmap } */
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+
+#define MB_BLOCK_SIZE 16
+#define VERT_PRED_16 0
+#define HOR_PRED_16 1
+#define DC_PRED_16 2
+int *sptr;
+extern void intrapred_luma_16x16();
+unsigned short mprr_2[5][16][16];
+void initialise_s(int *s) { }
+int main_1() {
+ void *s_mapping;
+ void *end_s;
+ s_mapping = mmap ((void *)ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (s_mapping == MAP_FAILED)
+ {
+ perror ("mmap");
+ return 1;
+ }
+ end_s = (s_mapping + MMAP_SIZE);
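+  /* Place sptr so that the 31 integer array ends exactly at the end of the
+     mapping: any read past sptr[30] then runs off the end of the mapping,
+     which is the crash this test guards against.  */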
+ sptr = (int*)(end_s - sizeof(int[31]));
+ intrapred_luma_16x16(sptr);
+ return 0;
+}
+
+void intrapred_luma_16x16(int * restrict sp) {
+ for (int j=0; j < MB_BLOCK_SIZE; j++)
+ {
+ mprr_2[VERT_PRED_16][j][0]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][1]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][2]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][3]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][4]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][5]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][6]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][7]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][8]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][9]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][10]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][11]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][12]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][13]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][14]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][15]=sp[j*2];
+ }
+}
+
+#define DO_TEST main_1
+#include "avx512-check.h"
If there is a combination of the access not covering the full
vector and a gap recorded then we may need to peel twice. */
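+      /* Records the case where the overrunning access uses vectors too big
+         for niter masking to cope with, in which case partial vectors must
+         not be used (see below).  */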
+ bool large_vector_overrun_p = false;
if (loop_vinfo
&& (*memory_access_type == VMAT_CONTIGUOUS
|| *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
&& SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
&& !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
nunits))
- overrun_p = true;
+ large_vector_overrun_p = overrun_p = true;
      /* If the gap splits the vector in half and the target
	 can do half-vector operations avoid the epilogue peeling
	 by simply loading half of the vector only.  */
	  /* But peeling a single scalar iteration is enough if
	     we can use the next power-of-two sized partial
	     access and that is sufficiently small to be covered
	     by the single scalar iteration.  */
unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
- if (!nunits.is_constant (&cnunits)
+ if (masked_p
+ || !nunits.is_constant (&cnunits)
|| !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
|| (((cremain = (group_size * cvf - gap) % cnunits), true)
&& ((cpart_size = (1 << ceil_log2 (cremain))), true)
		  && (cpart_size != cnunits
		      && vector_vector_composition_type
			   (vectype, cnunits / cpart_size,
			    &half_vtype) == NULL_TREE)))
{
- /* If all fails we can still resort to niter masking, so
- enforce the use of partial vectors. */
- if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ /* If all fails we can still resort to niter masking unless
+ the vectors used are too big, so enforce the use of
+ partial vectors. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !large_vector_overrun_p)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
return false;
}
}
+ else if (large_vector_overrun_p)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because "
+ "only unmasked loads handle access "
+ "shortening required because of gaps at "
+ "the end of the access\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
}
}
}