The AVX2 and AVX512 flags enable BMI2 but not BMI1, and TZCNT is a
BMI1 instruction. GCC emits the rep bsf encoding, which executes as
TZCNT on BMI1 hardware, whether or not -mbmi is given, but clang gates
on the feature bit and emits plain BSF, which is slower on AMD. Every
CPU with AVX2 also has BMI1, so adding -mbmi only affects code already
behind AVX2 runtime detection.
Assisted-By: Claude Opus 4.8 (1M context)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
- set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
else()
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
else()
set(AVX512FLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
- set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2")
+ set(AVX512FLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE OR CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
- set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+ set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
else()
set(AVX512VNNIFLAG "/arch:AVX512")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
- set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi2")
+ set(AVX512VNNIFLAG "-mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mbmi -mbmi2")
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
if(NOT MSVC)
check_c_compiler_flag("-mtune=cascadelake" HAVE_CASCADE_LAKE)
if(HAVE_CASCADE_LAKE)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
if(NOT NATIVEFLAG)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
- set(AVX2FLAG "-mavx2 -mbmi2")
+ set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
else()
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
else()
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID MATCHES "NVHPC")
- set(AVX2FLAG "-mavx2 -mbmi2")
+ set(AVX2FLAG "-mavx2 -mbmi -mbmi2")
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
elseif(MSVC)
set(AVX2FLAG "/arch:AVX2")
endif()
floatabi=
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
floatabi=
# For CPUs that can benefit from AVX512, it seems GCC generates suboptimal
# instruction scheduling unless you specify a reasonable -mtune= target
-avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2"
+avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi -mbmi2"
avx512vnniflag="${avx512flag} -mavx512vnni"
avx512vnniflag="${avx512flag} -mavx512vnni"
-avx2flag="-mavx2 -mbmi2"
+avx2flag="-mavx2 -mbmi -mbmi2"
sse2flag="-msse2"
ssse3flag="-mssse3"
sse41flag="-msse4.1"
sse2flag="-msse2"
ssse3flag="-mssse3"
sse41flag="-msse4.1"