&& LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16)
m_suggested_epilogue_mode = V8QImode;
+ /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use
+ a masked epilogue if that doesn't seem detrimental. */
+ if (loop_vinfo
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+ && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
+ && !OPTION_SET_P (param_vect_partial_vector_usage))
+ {
+ bool avoid = false;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ unsigned int peel_niter
+ = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ peel_niter += 1;
+	  /* When we know the number of scalar iterations of the epilogue,
+	     avoid masking when a single vector epilog iteration handles
+	     it in full; since vector modes come in power-of-two sizes
+	     this is the case when the remainder is a power of two.  */
+	  if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter)
+			 % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()))
+	    avoid = true;
+ }
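+      /* Only when the loop is nested inside another loop can a masked
+	 epilogue store be read again by the next outer iteration, so
+	 only then scan the data dependences for the forwarding hazard
+	 described below.  */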
+ if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo))))
+ for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo))
+ {
+	    if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+	      /* Known to be independent.  */;
+	    else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+	      /* Dependence not analyzable.  */;
+ else
+ {
+ int loop_depth
+ = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
+ DDR_LOOP_NEST (ddr));
+ if (DDR_NUM_DIST_VECTS (ddr) == 1
+ && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0)
+ {
+ /* Avoid the case when there's an outer loop that might
+ traverse a multi-dimensional array with the inner
+ loop just executing the masked epilogue with a
+ read-write where the next outer iteration might
+ read from the masked part of the previous write,
+		       with 'n' only filling half a vector, as in
+ for (j = 0; j < m; ++j)
+ for (i = 0; i < n; ++i)
+ a[j][i] = c * a[j][i]; */
+ avoid = true;
+ break;
+ }
+ }
+ }
+ if (!avoid)
+ {
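+	  /* Suggest the main loop's vector mode for the epilogue and
+	     request it to be masked.  */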
+ m_suggested_epilogue_mode = loop_vinfo->vector_mode;
+ m_masked_epilogue = 1;
+ }
+ }
+
vector_costs::finish_cost (scalar_costs);
}
DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
m_ZNVER4 | m_ZNVER5)
+/* X86_TUNE_AVX512_MASKED_EPILOGUES: Use masked vector epilogues
+   when that does not appear detrimental.  */
+DEF_TUNE (X86_TUNE_AVX512_MASKED_EPILOGUES, "avx512_masked_epilogues",
+ m_ZNVER4 | m_ZNVER5)
+
/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that helps a specific old CPU designs */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void bar (double *a, double *b, double c, int n, int m)
+{
+ for (int j = 0; j < m; ++j)
+ for (int i = 0; i < n; ++i)
+ a[j*n + i] = b[j*n + i] + c;
+}
+
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver5 -fdump-tree-vect-optimized" } */
+
+void foo (double *a, double b, double c, int n, int m)
+{
+ for (int j = 0; j < m; ++j)
+ for (int i = 0; i < n; ++i)
+ a[j*n + i] = a[j*n + i] * b + c;
+}
+
+/* We do not want to use a masked epilogue for the inner loop as the next
+   outer iteration would possibly read immediately from elements stored by
+   the masked epilogue of the previous inner loop, and a masked store never
+   forwards.  */
+/* { dg-final { scan-tree-dump "epilogue loop vectorized using 32 byte vectors" "vect" } } */