git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add -mbmi to AVX2 and AVX512 compile flags develop
author: Nathan Moin Vaziri <nathan@nathanm.com>
Thu, 11 Jun 2026 23:38:57 +0000 (16:38 -0700)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Sat, 13 Jun 2026 10:03:26 +0000 (12:03 +0200)
The AVX2 and AVX512 flags enable BMI2 but not BMI1, and TZCNT is a
BMI1 instruction. GCC emits the rep bsf encoding that executes as
TZCNT on BMI hardware regardless, but clang gates on the feature bit
and emits plain BSF, which is slower on AMD. Every CPU with AVX2 also
has BMI1, so the flag only affects code already behind AVX2 runtime
detection.

Assisted-By: Claude Opus 4.8 (1M context)
cmake/detect-intrinsics.cmake
configure

index 56bd7ca7b77a219888d972d41b33875b12af3107..3b9acfa13a3631f24ad51d60eec09aca55943952 100644 (file)
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -143,14 +143,14 @@ macro(check_avx512_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             else()
                 set(AVX512FLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
             # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
             # instruction scheduling unless you specify a reasonable -mtune= target
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -182,12 +182,12 @@ macro(check_avx512vnni_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
-                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             else()
                 set(AVX512VNNIFLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -223,12 +223,12 @@ macro(check_avx2_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX2FLAG "-mavx2 -mbmi2")
+                set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
             else()
                 set(AVX2FLAG "/arch:AVX2")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX2FLAG "-mavx2 -mbmi2")
+            set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
         elseif(MSVC)
             set(AVX2FLAG "/arch:AVX2")
         endif()
index a1a208aa7a0380c1b7acfe1a17a7ee1dc8043c5e..fec16c029c84460c27341a4a1435be7b5a0bdf5c 100755 (executable)
--- a/configure
+++ b/configure
@@ -114,9 +114,9 @@ buildcrc32la=1
 floatabi=
 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
 # instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2"
 avx512vnniflag="${avx512flag} -mavx512vnni"
-avx2flag="-mavx2 -mbmi2"
+avx2flag="-mavx2 -mbmi -mbmi2"
 sse2flag="-msse2"
 ssse3flag="-mssse3"
 sse41flag="-msse4.1"