From bd6b67944486855e944e00e5a1cbaafd828e52b2 Mon Sep 17 00:00:00 2001
From: Nathan Moin Vaziri
Date: Thu, 11 Jun 2026 16:38:57 -0700
Subject: [PATCH] Add -mbmi to AVX2 and AVX512 compile flags

The AVX2 and AVX512 flags enable BMI2 but not BMI1, and TZCNT is a BMI1
instruction. GCC emits the rep bsf encoding that executes as TZCNT on
BMI hardware regardless, but clang gates on the feature bit and emits
plain BSF, which is slower on AMD. Every CPU with AVX2 also has BMI1, so
the flag only affects code already behind AVX2 runtime detection.

Assisted-By: Claude Opus 4.8 (1M context)
---
 cmake/detect-intrinsics.cmake | 12 ++++++------
 configure                     |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 56bd7ca7b..3b9acfa13 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -143,14 +143,14 @@ macro(check_avx512_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+                set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             else()
                 set(AVX512FLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
             # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
             # instruction scheduling unless you specify a reasonable -mtune= target
-            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+            set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -182,12 +182,12 @@ macro(check_avx512vnni_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
-                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+                set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             else()
                 set(AVX512VNNIFLAG "/arch:AVX512")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+            set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
             if(NOT MSVC)
                 check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
                 if(HAVE_CASCADE_LAKE)
@@ -223,12 +223,12 @@ macro(check_avx2_intrinsics)
     if(NOT NATIVEFLAG)
         if(CMAKE_C_COMPILER_ID MATCHES "Intel")
             if(CMAKE_HOST_UNIX OR APPLE)
-                set(AVX2FLAG "-mavx2 -mbmi2")
+                set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
             else()
                 set(AVX2FLAG "/arch:AVX2")
             endif()
         elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
-            set(AVX2FLAG "-mavx2 -mbmi2")
+            set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
         elseif(MSVC)
             set(AVX2FLAG "/arch:AVX2")
         endif()
diff --git a/configure b/configure
index a1a208aa7..fec16c029 100755
--- a/configure
+++ b/configure
@@ -114,9 +114,9 @@ buildcrc32la=1
 floatabi=
 # For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
 # instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2"
 avx512vnniflag="${avx512flag} -mavx512vnni"
-avx2flag="-mavx2 -mbmi2"
+avx2flag="-mavx2 -mbmi -mbmi2"
 sse2flag="-msse2"
 ssse3flag="-mssse3"
 sse41flag="-msse4.1"
-- 
2.47.3