int blas_limit, blas_call gemm);
export_proto(matmul_r16);
-#if defined(HAVE_AVX) && defined(HAVE_AVX2)
-/* REAL types generate identical code for AVX and AVX2. Only generate
- an AVX2 function if we are dealing with integer. */
-#undef HAVE_AVX2
-#endif
-
-
/* Put exhaustive list of possible architectures here here, ORed together. */
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
static void
matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
- int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
+ int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static void
matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
#endif /* HAVE_AVX512F */
#ifdef HAVE_AVX2
- if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+ if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
+ && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_p = matmul_r16_avx2;
goto tailcall;