While these are technically different instruction set extensions, no CPU
exists that supports AVX2 but lacks BMI2. Enabling BMI2 lets us eliminate
several flag stalls by using the flagless variants of the shift
instructions, and reduces how much scalar code has to clobber and shuffle
GPRs around.
There's usually a sizeable benefit from enabling it. Since we're building
the AVX2 functions with BMI2 enabled, also verify at runtime that the CPU
claims BMI2 support before dispatching to them (just to cover our bases).
AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2
AVX512VNNIFLAG=-mavx512vnni -mbmi2
-AVX2FLAG=-mavx2
+AVX2FLAG=-mavx2 -mbmi2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
SSE42FLAG=-msse4.2
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
- set(AVX2FLAG "-mavx2")
+ set(AVX2FLAG "-mavx2 -mbmi2")
else()
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
- set(AVX2FLAG "-mavx2")
+ set(AVX2FLAG "-mavx2 -mbmi2")
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
# instruction scheduling unless you specify a reasonable -mtune= target
avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
avx512vnniflag="${avx512flag} -mavx512vnni"
-avx2flag="-mavx2"
+avx2flag="-mavx2 -mbmi2"
sse2flag="-msse2"
ssse3flag="-mssse3"
sse42flag="-msse4.2"
#endif
// X86 - AVX
#ifdef X86_AVX2
- if (cf.x86.has_avx2) {
+ /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for
+ * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers
+ * for the shift results as an operand, eliminating several register-register moves when the original value needs
+ * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */
+ if (cf.x86.has_avx2 && cf.x86.has_bmi2) {
ft.adler32 = &adler32_avx2;
ft.adler32_fold_copy = &adler32_fold_copy_avx2;
ft.chunkmemset_safe = &chunkmemset_safe_avx2;