From: Hans Kristian Rosbach
Date: Thu, 15 Jan 2026 22:42:19 +0000 (+0100)
Subject: Unify baseline benchmarking for both adler32 and crc32.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2aeaa6418172660d65e6a4f64d881a5aa68b8dd8;p=thirdparty%2Fzlib-ng.git

Unify baseline benchmarking for both adler32 and crc32.

Fix missing benchmarks of _copy functions for some platforms.
---

diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index fbfb85602..9d2994f4a 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -78,75 +78,99 @@ public:
     } \
     BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
-// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
-    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
-    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
-// Adler32 + memcpy benchmark for reference
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
-    BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 0); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
         Bench(state, [](uint32_t init_sum, unsigned char *dst, \
                         const uint8_t *buf, size_t len) -> uint32_t { \
             memcpy(dst, buf, (size_t)len); \
-            return copyfunc(init_sum, buf, len); \
+            return hashfunc(init_sum, buf, len); \
         }, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
+    BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
-BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
-BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1);
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
 #else
 #ifdef ARM_NEON
-/* If we inline this copy for neon, the function would go here */
-BENCHMARK_ADLER32_COPY(neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
-BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon);
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
 #endif
 #ifdef PPC_VMX
-BENCHMARK_ADLER32_COPY(vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
 #endif
 #ifdef POWER8_VSX
-BENCHMARK_ADLER32_COPY(power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
 #endif
 #ifdef RISCV_RVV
-//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
-BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
 #endif
+
 #ifdef X86_SSSE3
-BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
 #endif
 #ifdef X86_SSE42
-BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
-BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
 #endif
 #ifdef X86_AVX2
-BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
-BENCHMARK_ADLER32_COPY(avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
 #endif
 #ifdef X86_AVX512
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
-BENCHMARK_ADLER32_COPY(avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
 #endif
 #ifdef X86_AVX512VNNI
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
-BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
 #endif
 #ifdef LOONGARCH_LSX
-BENCHMARK_ADLER32_BASELINE_COPY(lsx_baseline, adler32_lsx, test_cpu_features.loongarch.has_lsx);
-BENCHMARK_ADLER32_COPY(lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
 #endif
 #ifdef LOONGARCH_LASX
-BENCHMARK_ADLER32_BASELINE_COPY(lasx_baseline, adler32_lasx, test_cpu_features.loongarch.has_lasx);
-BENCHMARK_ADLER32_COPY(lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
 #endif
 #endif

diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index 71497e9ac..aa5c3f9e2 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -76,55 +76,99 @@ public:
         } \
         Bench(state, copyfunc, 1); \
     } \
-    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 0); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+                        const uint8_t *buf, size_t len) -> uint32_t { \
+            memcpy(dst, buf, (size_t)len); \
+            return hashfunc(init_sum, buf, len); \
+        }, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
 // Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
     BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
     BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+// Optionally also benchmark using memcpy with normal hash function for baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
 // Base test
-BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
 #ifdef DISABLE_RUNTIME_CPU_DETECTION
     // Native
-    BENCHMARK_CRC32_COPY(native, native_crc32_copy, 1)
+    BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
 #else
     // Optimized functions
 # ifndef WITHOUT_CHORBA
-    BENCHMARK_CRC32_COPY(chorba, crc32_copy_chorba, 1)
+    BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
 # endif
 # ifndef WITHOUT_CHORBA_SSE
 # ifdef X86_SSE2
-    BENCHMARK_CRC32_COPY(chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+    BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
 # endif
 # ifdef X86_SSE41
-    BENCHMARK_CRC32_COPY(chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+    BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
 # endif
 # endif
 # ifdef ARM_CRC32
-    BENCHMARK_CRC32_COPY(armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+    BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
 # endif
 # ifdef ARM_PMULL_EOR3
-    BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+    BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
 # endif
 # ifdef LOONGARCH_CRC
-    BENCHMARK_CRC32_COPY(loongarch, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+    BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
 # endif
 # ifdef POWER8_VSX_CRC32
-    BENCHMARK_CRC32_COPY(power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+    BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
 # endif
 # ifdef RISCV_CRC32_ZBC
-    BENCHMARK_CRC32_COPY(riscv, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+    BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
 # endif
 # ifdef S390_CRC32_VX
-    BENCHMARK_CRC32_COPY(vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+    BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
 # endif
 # ifdef X86_PCLMULQDQ_CRC
-    BENCHMARK_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+    BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
 # endif
 # ifdef X86_VPCLMULQDQ_CRC
-    BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+    BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
 # endif
 #endif
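Note: both files now share the same pattern for the optional baseline: a lambda that does a plain memcpy and then calls the regular (non-copy) hash function, so the fused _copy implementations can be compared against a naive copy-then-hash sequence. Below is a minimal standalone sketch of that composition; hash32 and the other names are illustrative stand-ins, not zlib-ng or Google Benchmark APIs.

```cpp
// Standalone illustration of the memcpy+hash baseline pattern used above.
// hash32() is a toy stand-in for adler32_c/crc32_braid; the real benchmarks
// route the zlib-ng functions through the BENCHMARK_*_MEMCPY_* macros instead.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static uint32_t hash32(uint32_t sum, const uint8_t *buf, size_t len) {
    for (size_t i = 0; i < len; i++)        // trivial hash, not adler32/crc32
        sum = sum * 31 + buf[i];
    return sum;
}

// Same call shape as the _copy functions: produce dst and the checksum of src.
static uint32_t hash32_memcpy_baseline(uint32_t sum, uint8_t *dst,
                                       const uint8_t *src, size_t len) {
    std::memcpy(dst, src, len);             // separate copy...
    return hash32(sum, src, len);           // ...followed by a plain hash pass
}

int main() {
    std::vector<uint8_t> src(64 << 10, 0xA5), dst(src.size());
    uint32_t s = hash32_memcpy_baseline(1, dst.data(), src.data(), src.size());
    std::printf("baseline sum: %u\n", s);
    return 0;
}
```

The extra *_memcpy and *_memcpy_aligned registrations only exist when the benchmark translation units are compiled with HASH_BASELINE defined (for example by adding -DHASH_BASELINE to the compiler flags); how that define is wired into the build is not part of this diff.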