From 7020cb3f74382c04b7d291a4d99108f088090b1a Mon Sep 17 00:00:00 2001 From: Adam Stylinski Date: Wed, 27 Nov 2024 19:00:52 -0500 Subject: [PATCH] Enable AVX2 functions to be built with BMI2 instructions While AVX2 and BMI2 are technically separate instruction set extensions, no CPU exists that has AVX2 but lacks BMI2. Enabling BMI2 allows us to eliminate several flag stalls by having flagless versions of shifts, and allows us to not clobber and move around GPRs so much in scalar code. There's usually a sizeable benefit to enabling it. Since we're building with BMI2 for AVX2 functions, let's also make sure the CPU claims to support it (just to cover our bases). --- arch/x86/Makefile.in | 2 +- cmake/detect-intrinsics.cmake | 4 ++-- configure | 2 +- functable.c | 6 +++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index a012e61e..a797517d 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -10,7 +10,7 @@ SUFFIX= AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2 AVX512VNNIFLAG=-mavx512vnni -mbmi2 -AVX2FLAG=-mavx2 +AVX2FLAG=-mavx2 -mbmi2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 SSE42FLAG=-msse4.2 diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index b8eabe8e..b96ac0a4 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -151,12 +151,12 @@ macro(check_avx2_intrinsics) if(NOT NATIVEFLAG) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR APPLE) - set(AVX2FLAG "-mavx2") + set(AVX2FLAG "-mavx2 -mbmi2") else() set(AVX2FLAG "/arch:AVX2") endif() elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(AVX2FLAG "-mavx2") + set(AVX2FLAG "-mavx2 -mbmi2") elseif(MSVC) set(AVX2FLAG "/arch:AVX2") endif() diff --git a/configure b/configure index 738e5f92..e6270ae0 100755 --- a/configure +++ b/configure @@ -108,7 +108,7 @@ forcesse2=0 # instruction scheduling unless you specify a reasonable -mtune= target 
avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2" avx512vnniflag="${avx512flag} -mavx512vnni" -avx2flag="-mavx2" +avx2flag="-mavx2 -mbmi2" sse2flag="-msse2" ssse3flag="-mssse3" sse42flag="-msse4.2" diff --git a/functable.c b/functable.c index c8b11b5f..9c114568 100644 --- a/functable.c +++ b/functable.c @@ -110,7 +110,11 @@ static void init_functable(void) { #endif // X86 - AVX #ifdef X86_AVX2 - if (cf.x86.has_avx2) { + /* BMI2 support is all but implicit with AVX2, but let's sanity check it just in case. Enabling BMI2 allows for + * flagless shifts, resulting in fewer flag stalls in the pipeline, and lets the shift result go to a destination + * register given as an operand, eliminating several register-register moves when the original value needs + * to remain intact. These shifts also accept a count operand that isn't the CL register, avoiding contention there. */ + if (cf.x86.has_avx2 && cf.x86.has_bmi2) { ft.adler32 = &adler32_avx2; ft.adler32_fold_copy = &adler32_fold_copy_avx2; ft.chunkmemset_safe = &chunkmemset_safe_avx2; -- 2.47.2