From: Hans Kristian Rosbach
Date: Thu, 15 Jan 2026 22:42:19 +0000 (+0100)
Subject: Unify baseline benchmarking for both adler32 and crc32.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2aeaa6418172660d65e6a4f64d881a5aa68b8dd8;p=thirdparty%2Fzlib-ng.git

Unify baseline benchmarking for both adler32 and crc32.

Fix missing benchmarks of _copy functions for some platforms.
---

diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index fbfb85602..9d2994f4a 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -78,75 +78,99 @@ public:
     } \
     BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
-// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
-    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
-    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
-// Adler32 + memcpy benchmark for reference
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
-    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 0); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
         Bench(state, [](uint32_t init_sum, unsigned char *dst, \
                         const uint8_t *buf, size_t len) -> uint32_t { \
             memcpy(dst, buf, (size_t)len); \
-            return copyfunc(init_sum, buf, len); \
+            return hashfunc(init_sum, buf, len); \
         }, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
+    BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
-BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
-BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1);
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
 #else
 #ifdef ARM_NEON
-/* If we inline this copy for neon, the function would go here */
-BENCHMARK_ADLER32_COPY(neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
-BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon);
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
 #endif
 #ifdef PPC_VMX
-BENCHMARK_ADLER32_COPY(vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
 #endif
 #ifdef POWER8_VSX
-BENCHMARK_ADLER32_COPY(power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
 #endif
 #ifdef RISCV_RVV
-//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
-BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
 #endif
+
 #ifdef X86_SSSE3
-BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
 #endif
 #ifdef X86_SSE42
-BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
-BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
 #endif
 #ifdef X86_AVX2
-BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
-BENCHMARK_ADLER32_COPY(avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
 #endif
 #ifdef X86_AVX512
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
-BENCHMARK_ADLER32_COPY(avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
 #endif
 #ifdef X86_AVX512VNNI
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
-BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
 #endif
 #ifdef LOONGARCH_LSX
-BENCHMARK_ADLER32_BASELINE_COPY(lsx_baseline, adler32_lsx, test_cpu_features.loongarch.has_lsx);
-BENCHMARK_ADLER32_COPY(lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
 #endif
 #ifdef LOONGARCH_LASX
-BENCHMARK_ADLER32_BASELINE_COPY(lasx_baseline, adler32_lasx, test_cpu_features.loongarch.has_lasx);
-BENCHMARK_ADLER32_COPY(lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
 #endif
 #endif

diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index 71497e9ac..aa5c3f9e2 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -76,55 +76,99 @@ public:
         } \
         Bench(state, copyfunc, 1); \
     } \
-    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 0); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
 // Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
     BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
     BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
 // Base test
-BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
     // Native
-    BENCHMARK_CRC32_COPY(native, native_crc32_copy, 1)
+    BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
 #else
     // Optimized functions
 # ifndef WITHOUT_CHORBA
-    BENCHMARK_CRC32_COPY(chorba, crc32_copy_chorba, 1)
+    BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
 # endif
 # ifndef WITHOUT_CHORBA_SSE
 # ifdef X86_SSE2
-    BENCHMARK_CRC32_COPY(chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+    BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
 # endif
 # ifdef X86_SSE41
-    BENCHMARK_CRC32_COPY(chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+    BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
 # endif
 # endif
 # ifdef ARM_CRC32
-    BENCHMARK_CRC32_COPY(armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+    BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
 # endif
 # ifdef ARM_PMULL_EOR3
-    BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+    BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
 # endif
 # ifdef LOONGARCH_CRC
-    BENCHMARK_CRC32_COPY(loongarch, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+    BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
 # endif
 # ifdef POWER8_VSX_CRC32
-    BENCHMARK_CRC32_COPY(power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+    BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
 # endif
 # ifdef RISCV_CRC32_ZBC
-    BENCHMARK_CRC32_COPY(riscv, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+    BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
 # endif
 # ifdef S390_CRC32_VX
-    BENCHMARK_CRC32_COPY(vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+    BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
 # endif
 # ifdef X86_PCLMULQDQ_CRC
-    BENCHMARK_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+    BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 # endif
 # ifdef X86_VPCLMULQDQ_CRC
-    BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+    BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 # endif
 #endif
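Note: both files now share the same pattern for the optional baseline: a lambda that does a plain memcpy and then calls the regular (non-copy) hash function, so the fused _copy implementations can be compared against a naive copy-then-hash sequence. Below is a minimal standalone sketch of that composition; hash32 and the other names are illustrative stand-ins, not zlib-ng or Google Benchmark APIs.

```cpp
// Standalone illustration of the memcpy+hash baseline pattern used above.
// hash32() is a toy stand-in for adler32_c/crc32_braid; the real benchmarks
// route the zlib-ng functions through the BENCHMARK_*_MEMCPY_* macros instead.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static uint32_t hash32(uint32_t sum, const uint8_t *buf, size_t len) {
    for (size_t i = 0; i < len; i++)        // trivial hash, not adler32/crc32
        sum = sum * 31 + buf[i];
    return sum;
}

// Same call shape as the _copy functions: produce dst and the checksum of src.
static uint32_t hash32_memcpy_baseline(uint32_t sum, uint8_t *dst,
                                       const uint8_t *src, size_t len) {
    std::memcpy(dst, src, len);             // separate copy...
    return hash32(sum, src, len);           // ...followed by a plain hash pass
}

int main() {
    std::vector<uint8_t> src(64 << 10, 0xA5), dst(src.size());
    uint32_t s = hash32_memcpy_baseline(1, dst.data(), src.data(), src.size());
    std::printf("baseline sum: %u\n", s);
    return 0;
}
```

The extra *_memcpy and *_memcpy_aligned registrations only exist when the benchmark translation units are compiled with HASH_BASELINE defined (for example by adding -DHASH_BASELINE to the compiler flags); how that define is wired into the build is not part of this diff.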