} \
BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
-// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
- BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
- BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
-// Adler32 + memcpy benchmark for reference
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
- BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+// Adler32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
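+ /* Wrap memcpy plus the plain (non-fused) hash function in a lambda matching the */ \
+ /* copy-style signature Bench() expects; this is the baseline measurement. */ \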
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, [](uint32_t init_sum, unsigned char *dst, \
const uint8_t *buf, size_t len) -> uint32_t { \
memcpy(dst, buf, (size_t)len); \
- return copyfunc(init_sum, buf, len); \
+ return hashfunc(init_sum, buf, len); \
}, 1); \
} \
- BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
+ BENCHMARK_REGISTER_F(adler32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
-BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy followed by the plain hash function as a baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_ADLER32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
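+// hashfunc is accepted but unused here, so call sites are identical with or without HASH_BASELINE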
+#define BENCHMARK_ADLER32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
+BENCHMARK_ADLER32_COPY(c, adler32_c, adler32_copy_c, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
-BENCHMARK_ADLER32_BASELINE_COPY(native, native_adler32, 1);
+BENCHMARK_ADLER32_COPY(native, native_adler32, native_adler32_copy, 1);
#else
#ifdef ARM_NEON
-/* If we inline this copy for neon, the function would go here */
-BENCHMARK_ADLER32_COPY(neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
-BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, test_cpu_features.arm.has_neon);
+BENCHMARK_ADLER32_COPY(neon, adler32_neon, adler32_copy_neon, test_cpu_features.arm.has_neon);
#endif
#ifdef PPC_VMX
-BENCHMARK_ADLER32_COPY(vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
+BENCHMARK_ADLER32_COPY(vmx, adler32_vmx, adler32_copy_vmx, test_cpu_features.power.has_altivec);
#endif
#ifdef POWER8_VSX
-BENCHMARK_ADLER32_COPY(power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
+BENCHMARK_ADLER32_COPY(power8, adler32_power8, adler32_copy_power8, test_cpu_features.power.has_arch_2_07);
#endif
#ifdef RISCV_RVV
-//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
-BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, adler32_copy_rvv, test_cpu_features.riscv.has_rvv);
#endif
+
#ifdef X86_SSSE3
-BENCHMARK_ADLER32_COPY(ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
+BENCHMARK_ADLER32_COPY(ssse3, adler32_ssse3, adler32_copy_ssse3, test_cpu_features.x86.has_ssse3);
#endif
#ifdef X86_SSE42
-BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
-BENCHMARK_ADLER32_COPY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
+// There is no adler32_sse42, so only test the copy variant
+BENCHMARK_ADLER32_COPY_ONLY(sse42, adler32_copy_sse42, test_cpu_features.x86.has_sse42);
#endif
#ifdef X86_AVX2
-BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, test_cpu_features.x86.has_avx2);
-BENCHMARK_ADLER32_COPY(avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_avx2, adler32_copy_avx2, test_cpu_features.x86.has_avx2);
#endif
#ifdef X86_AVX512
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, test_cpu_features.x86.has_avx512_common);
-BENCHMARK_ADLER32_COPY(avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
+BENCHMARK_ADLER32_COPY(avx512, adler32_avx512, adler32_copy_avx512, test_cpu_features.x86.has_avx512_common);
#endif
#ifdef X86_AVX512VNNI
-BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
-BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_avx512_vnni, adler32_copy_avx512_vnni, test_cpu_features.x86.has_avx512vnni);
#endif
#ifdef LOONGARCH_LSX
-BENCHMARK_ADLER32_BASELINE_COPY(lsx_baseline, adler32_lsx, test_cpu_features.loongarch.has_lsx);
-BENCHMARK_ADLER32_COPY(lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
+BENCHMARK_ADLER32_COPY(lsx, adler32_lsx, adler32_copy_lsx, test_cpu_features.loongarch.has_lsx);
#endif
#ifdef LOONGARCH_LASX
-BENCHMARK_ADLER32_BASELINE_COPY(lasx_baseline, adler32_lasx, test_cpu_features.loongarch.has_lasx);
-BENCHMARK_ADLER32_COPY(lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
+BENCHMARK_ADLER32_COPY(lasx, adler32_lasx, adler32_copy_lasx, test_cpu_features.loongarch.has_lasx);
#endif
#endif
} \
Bench(state, copyfunc, 1); \
} \
- BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+ BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// CRC32 + memcpy benchmarks for reference
+#ifdef HASH_BASELINE
+#define MEMCPY_NAME(name) name##_memcpy
+#define BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
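+ /* Wrap memcpy plus the plain (non-fused) CRC function in a lambda matching the */ \
+ /* copy-style signature Bench() expects; this is the baseline measurement. */ \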
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 0); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+#define MEMCPY_ALIGNED_NAME(name) name##_memcpy_aligned
+#define BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const uint8_t *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, (size_t)len); \
+ return hashfunc(init_sum, buf, len); \
+ }, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, MEMCPY_ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+#endif
+
// Queue both misaligned and aligned for each benchmark
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+#define BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag) \
BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
+
+// Optionally also benchmark memcpy followed by the plain hash function as a baseline
+#ifdef HASH_BASELINE
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_CRC32_MEMCPY_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_CRC32_MEMCPY_ALIGNED(name, hashfunc, support_flag);
+#else
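+// hashfunc is accepted but unused here, so call sites are identical with or without HASH_BASELINE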
+#define BENCHMARK_CRC32_COPY(name, hashfunc, copyfunc, support_flag) \
+ BENCHMARK_CRC32_COPY_ONLY(name, copyfunc, support_flag)
+#endif
+
// Base test
-BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
+BENCHMARK_CRC32_COPY(braid, crc32_braid, crc32_copy_braid, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
// Native
- BENCHMARK_CRC32_COPY(native, native_crc32_copy, 1)
+ BENCHMARK_CRC32_COPY(native, native_crc32, native_crc32_copy, 1)
#else
// Optimized functions
# ifndef WITHOUT_CHORBA
- BENCHMARK_CRC32_COPY(chorba, crc32_copy_chorba, 1)
+ BENCHMARK_CRC32_COPY(chorba, crc32_chorba, crc32_copy_chorba, 1)
# endif
# ifndef WITHOUT_CHORBA_SSE
# ifdef X86_SSE2
- BENCHMARK_CRC32_COPY(chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
+ BENCHMARK_CRC32_COPY(chorba_sse2, crc32_chorba_sse2, crc32_copy_chorba_sse2, test_cpu_features.x86.has_sse2);
# endif
# ifdef X86_SSE41
- BENCHMARK_CRC32_COPY(chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
+ BENCHMARK_CRC32_COPY(chorba_sse41, crc32_chorba_sse41, crc32_copy_chorba_sse41, test_cpu_features.x86.has_sse41);
# endif
# endif
# ifdef ARM_CRC32
- BENCHMARK_CRC32_COPY(armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
+ BENCHMARK_CRC32_COPY(armv8, crc32_armv8, crc32_copy_armv8, test_cpu_features.arm.has_crc32)
# endif
# ifdef ARM_PMULL_EOR3
- BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
+ BENCHMARK_CRC32_COPY(armv8_pmull_eor3, crc32_armv8_pmull_eor3, crc32_copy_armv8_pmull_eor3, test_cpu_features.arm.has_crc32 && test_cpu_features.arm.has_pmull && test_cpu_features.arm.has_eor3)
# endif
# ifdef LOONGARCH_CRC
- BENCHMARK_CRC32_COPY(loongarch, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
+ BENCHMARK_CRC32_COPY(loongarch, crc32_loongarch64, crc32_copy_loongarch64, test_cpu_features.loongarch.has_crc)
# endif
# ifdef POWER8_VSX_CRC32
- BENCHMARK_CRC32_COPY(power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
+ BENCHMARK_CRC32_COPY(power8, crc32_power8, crc32_copy_power8, test_cpu_features.power.has_arch_2_07)
# endif
# ifdef RISCV_CRC32_ZBC
- BENCHMARK_CRC32_COPY(riscv, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
+ BENCHMARK_CRC32_COPY(riscv, crc32_riscv64_zbc, crc32_copy_riscv64_zbc, test_cpu_features.riscv.has_zbc)
# endif
# ifdef S390_CRC32_VX
- BENCHMARK_CRC32_COPY(vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
+ BENCHMARK_CRC32_COPY(vx, crc32_s390_vx, crc32_copy_s390_vx, test_cpu_features.s390.has_vx)
# endif
# ifdef X86_PCLMULQDQ_CRC
- BENCHMARK_CRC32_COPY(pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
+ BENCHMARK_CRC32_COPY(pclmulqdq, crc32_pclmulqdq, crc32_copy_pclmulqdq, test_cpu_features.x86.has_pclmulqdq)
# endif
# ifdef X86_VPCLMULQDQ_CRC
- BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+ BENCHMARK_CRC32_COPY(vpclmulqdq, crc32_vpclmulqdq, crc32_copy_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
# endif
#endif