} \
BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
-// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
- BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
- BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
-// Adler32 + memcpy benchmark for reference
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
- BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
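+ /* Wrap memcpy plus the plain (non-fused) hash function in a lambda matching the */ \
+ /* copy-style signature Bench() expects; this is the baseline measurement. */ \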
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, [](uint32_t init_sum, unsigned char *dst, \
const uint8_t *buf, size_t len) -> uint32_t { \
memcpy(dst, buf, (size_t)len); \
- return copyfunc(init_sum, buf, len); \
+ return hashfunc(init_sum, buf, len); \
}, 1); \
} \
- BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
-BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy followed by the plain hash function as a baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
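+// hashfunc is accepted but unused here, so call sites are identical with or without HASH_BASELINE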
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
-BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1);
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
#else
#ifdef ARM_NEON
-/* If we inline this copy for neon, the function would go here */
-BENCHMARK_ADLER32_COPY(neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
-BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon);
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef PPC_VMX
-BENCHMARK_ADLER32_COPY(vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef POWER8_VSX
-BENCHMARK_ADLER32_COPY(power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef RISCV_RVV
-//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
-BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
#endif
+
#ifdef X86_SSSE3
-BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
#endif
#ifdef X86_SSE42
-BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
-BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
#endif
#ifdef X86_AVX2
-BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
-BENCHMARK_ADLER32_COPY(avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
#endif
#ifdef X86_AVX512
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
-BENCHMARK_ADLER32_COPY(avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#ifdef X86_AVX512VNNI
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
-BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
#endif
#ifdef LOONGARCH_LSX
-BENCHMARK_ADLER32_BASELINE_COPY(lsx_baseline, adler32_lsx, test_cpu_features.loongarch.has_lsx);
-BENCHMARK_ADLER32_COPY(lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
#endif
#ifdef LOONGARCH_LASX
-BENCHMARK_ADLER32_BASELINE_COPY(lasx_baseline, adler32_lasx, test_cpu_features.loongarch.has_lasx);
-BENCHMARK_ADLER32_COPY(lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
#endif
#endif
} \
Bench(state, copyfunc, 1); \
} \
- BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+ BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
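+ /* Wrap memcpy plus the plain (non-fused) CRC function in a lambda matching the */ \
+ /* copy-style signature Bench() expects; this is the baseline measurement. */ \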
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy followed by the plain hash function as a baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
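+// hashfunc is accepted but unused here, so call sites are identical with or without HASH_BASELINE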
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
// Base test
-BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Native
- BENCHMARK_CRC32_COPY(native, native_crc32_copy, 1)
+ BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
#else
// Optimized functions
# ifndef WITHOUT_CHORBA
- BENCHMARK_CRC32_COPY(chorba, crc32_copy_chorba, 1)
+ BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
# endif
# ifndef WITHOUT_CHORBA_SSE
# ifdef X86_SSE2
- BENCHMARK_CRC32_COPY(chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+ BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
# endif
# ifdef X86_SSE41
- BENCHMARK_CRC32_COPY(chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+ BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
# endif
# ifdef ARM_CRC32
- BENCHMARK_CRC32_COPY(armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+ BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
# endif
# ifdef ARM_PMULL_EOR3
- BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+ BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
# endif
# ifdef LOONGARCH_CRC
- BENCHMARK_CRC32_COPY(loongarch, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+ BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
# endif
# ifdef POWER8_VSX_CRC32
- BENCHMARK_CRC32_COPY(power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+ BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
# endif
# ifdef RISCV_CRC32_ZBC
- BENCHMARK_CRC32_COPY(riscv, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+ BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
# endif
# ifdef S390_CRC32_VX
- BENCHMARK_CRC32_COPY(vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+ BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
# endif
# ifdef X86_PCLMULQDQ_CRC
- BENCHMARK_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+ BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
# endif
# ifdef X86_VPCLMULQDQ_CRC
- BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+ BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
# endif
#endif