From: Adam Stylinski Date: Thu, 28 Nov 2024 00:00:52 +0000 (-0500) Subject: Enable AVX2 functions to be built with BMI2 instructions X-Git-Tag: 2.2.3~18 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7020cb3f74382c04b7d291a4d99108f088090b1a;p=thirdparty%2Fzlib-ng.git Enable AVX2 functions to be built with BMI2 instructions While these are technically separate instruction set extensions, no CPU exists that has AVX2 but lacks BMI2. Enabling BMI2 allows us to eliminate several flag stalls by having flagless versions of shifts, and allows us to not clobber and move around GPRs so much in scalar code. There's usually a sizeable benefit to enabling it. Since we're building with BMI2 for AVX2 functions, let's also just make sure the CPU claims to support it (just to cover our bases). --- diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index a012e61ea..a797517df 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -10,7 +10,7 @@ SUFFIX= AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw -mbmi2 AVX512VNNIFLAG=-mavx512vnni -mbmi2 -AVX2FLAG=-mavx2 +AVX2FLAG=-mavx2 -mbmi2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 SSE42FLAG=-msse4.2 diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index b8eabe8e2..b96ac0a44 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -151,12 +151,12 @@ macro(check_avx2_intrinsics) if(NOT NATIVEFLAG) if(CMAKE_C_COMPILER_ID MATCHES "Intel") if(CMAKE_HOST_UNIX OR APPLE) - set(AVX2FLAG "-mavx2") + set(AVX2FLAG "-mavx2 -mbmi2") else() set(AVX2FLAG "/arch:AVX2") endif() elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(AVX2FLAG "-mavx2") + set(AVX2FLAG "-mavx2 -mbmi2") elseif(MSVC) set(AVX2FLAG "/arch:AVX2") endif() diff --git a/configure b/configure index 738e5f928..e6270ae03 100755 --- a/configure +++ b/configure @@ -108,7 +108,7 @@ forcesse2=0 # instruction scheduling unless you specify a reasonable -mtune= target 
avx512flag="-mavx512f -mavx512dq -mavx512bw -mavx512vl -mbmi2" avx512vnniflag="${avx512flag} -mavx512vnni" -avx2flag="-mavx2" +avx2flag="-mavx2 -mbmi2" sse2flag="-msse2" ssse3flag="-mssse3" sse42flag="-msse4.2" diff --git a/functable.c b/functable.c index c8b11b5fa..9c114568b 100644 --- a/functable.c +++ b/functable.c @@ -110,7 +110,11 @@ static void init_functable(void) { #endif // X86 - AVX #ifdef X86_AVX2 - if (cf.x86.has_avx2) { + /* BMI2 support is all but implicit with AVX2 but let's sanity check this just in case. Enabling BMI2 allows for + * flagless shifts, resulting in fewer flag stalls for the pipeline, and allows us to set destination registers + * for the shift results as an operand, eliminating several register-register moves when the original value needs + * to remain intact. They also allow for a count operand that isn't the CL register, avoiding contention there */ + if (cf.x86.has_avx2 && cf.x86.has_bmi2) { ft.adler32 = &adler32_avx2; ft.adler32_fold_copy = &adler32_fold_copy_avx2; ft.chunkmemset_safe = &chunkmemset_safe_avx2;