From: Nathan Moin Vaziri
Date: Thu, 11 Jun 2026 23:38:57 +0000 (-0700)
Subject: Add -mbmi to AVX2 and AVX512 compile flags
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;ds=sidebyside;p=thirdparty%2Fzlib-ng.git

Add -mbmi to AVX2 and AVX512 compile flags

The AVX2 and AVX512 flags enable BMI2 but not BMI1, and TZCNT is a BMI1
instruction. GCC emits the rep bsf encoding that executes as TZCNT on
BMI hardware regardless, but clang gates on the feature bit and emits
plain BSF, which is slower on AMD. Every CPU with AVX2 also has BMI1, so
the flag only affects code already behind AVX2 runtime detection.

Assisted-By: Claude Opus 4.8 (1M context)
---

diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 56bd7ca7b..3b9acfa13 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -143,14 +143,14 @@ macro(check_avx512_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             else()
                 set(AVX512FLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
             # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
             # instruction scheduling unless you specify a reasonable -mtune= target
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -182,12 +182,12 @@ macro(check_avx512vnni_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
-                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             else()
                 set(AVX512VNNIFLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -223,12 +223,12 @@ macro(check_avx2_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX2FLAG "-mavx2 -mbmi2")
+                set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
             else()
                 set(AVX2FLAG "/arch:AVX2")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX2FLAG "-mavx2 -mbmi2")
+            set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
         elseif(MSVC)
             set(AVX2FLAG "/arch:AVX2")
         endif()
diff --git a/configure b/configure
index a1a208aa7..fec16c029 100755
--- a/configure
+++ b/configure
@@ -114,9 +114,9 @@ buildcrc32la=1
 floatabi=
 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
 # instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2"
 avx512vnniflag="${avx512flag} -mavx512vnni"
-avx2flag="-mavx2 -mbmi2"
+avx2flag="-mavx2 -mbmi -mbmi2"
 sse2flag="-msse2"
 ssse3flag="-mssse3"
 sse41flag="-msse4.1"