--- /dev/null
+/* For some targets we end up vectorizing the below loop such that each
+   single integer read from `sp` is loaded into a 4 integer vector.
+   While the writes are all safe, without 2 scalar iterations being peeled
+   into the epilogue we would read past the end of the 31 integer array.
+   This happens because we load a 4 integer chunk but use only the first
+   integer and increment by 2 integers at a time, hence the last load needs
+   sp[30-33] and the penultimate load needs sp[28-31].
+   This testcase ensures that we do not crash due to that behaviour.  */
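+/* Worked example of the access pattern described above: j runs from 0 to
+   MB_BLOCK_SIZE-1 (15) and each iteration reads sp[j*2], so the scalar
+   accesses are sp[0], sp[2], ..., sp[30].  A 4 integer vector load of the
+   final access touches sp[30-33] and the one before it touches sp[28-31],
+   while sp[30] is the last element of the 31 integer array, hence the two
+   peeled scalar iterations.  */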
+/* { dg-do run } */
+/* { dg-options "-std=gnu17 -O2 -ftree-vectorize -fno-vect-cost-model --param vect-partial-vector-usage=2 -mavx512bw -mprefer-vector-width=512" } */
+/* { dg-require-effective-target mmap } */
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+
+#define MB_BLOCK_SIZE 16
+#define VERT_PRED_16 0
+#define HOR_PRED_16 1
+#define DC_PRED_16 2
+int *sptr;
+extern void intrapred_luma_16x16();
+unsigned short mprr_2[5][16][16];
+void initialise_s(int *s) { }
+int main_1() {
+ void *s_mapping;
+ void *end_s;
+ s_mapping = mmap ((void *)ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (s_mapping == MAP_FAILED)
+ {
+ perror ("mmap");
+ return 1;
+ }
+ end_s = (s_mapping + MMAP_SIZE);
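+  /* Place sptr so that the 31 integer array ends exactly at the end of the
+     mapping: any read past sptr[30] then runs off the end of the mapping,
+     which is the crash this test guards against.  */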
+ sptr = (int*)(end_s - sizeof(int[31]));
+ intrapred_luma_16x16(sptr);
+ return 0;
+}
+
+void intrapred_luma_16x16(int * restrict sp) {
+ for (int j=0; j < MB_BLOCK_SIZE; j++)
+ {
+ mprr_2[VERT_PRED_16][j][0]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][1]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][2]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][3]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][4]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][5]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][6]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][7]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][8]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][9]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][10]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][11]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][12]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][13]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][14]=sp[j*2];
+ mprr_2[VERT_PRED_16][j][15]=sp[j*2];
+ }
+}
+
+#define DO_TEST main_1
+#include "avx512-check.h"
If there is a combination of the access not covering the full
vector and a gap recorded then we may need to peel twice. */
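+      /* Records the case where the overrunning access uses vectors too big
+         for niter masking to cope with, in which case partial vectors must
+         not be used (see below).  */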
+ bool large_vector_overrun_p = false;
if (loop_vinfo
&& (*memory_access_type == VMAT_CONTIGUOUS
|| *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
&& SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
&& !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
nunits))
- overrun_p = true;
+ large_vector_overrun_p = overrun_p = true;
      /* If the gap splits the vector in half and the target
	 can do half-vector operations avoid the epilogue peeling
	 by simply loading half of the vector only.  */
	  /* But peeling a single scalar iteration is enough if
	     we can use the next power-of-two sized partial
	     access and that is sufficiently small to be covered
	     by the single scalar iteration.  */
unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
- if (!nunits.is_constant (&cnunits)
+ if (masked_p
+ || !nunits.is_constant (&cnunits)
|| !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
|| (((cremain = (group_size * cvf - gap) % cnunits), true)
&& ((cpart_size = (1 << ceil_log2 (cremain))), true)
		  && (cpart_size != cnunits
		      && vector_vector_composition_type
			   (vectype, cnunits / cpart_size,
			    &half_vtype) == NULL_TREE)))
{
- /* If all fails we can still resort to niter masking, so
- enforce the use of partial vectors. */
- if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ /* If all fails we can still resort to niter masking unless
+ the vectors used are too big, so enforce the use of
+ partial vectors. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !large_vector_overrun_p)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
return false;
}
}
+ else if (large_vector_overrun_p)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because "
+ "only unmasked loads handle access "
+ "shortening required because of gaps at "
+ "the end of the access\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
}
}
}