From: Hans Kristian Rosbach Date: Fri, 14 Nov 2025 14:33:32 +0000 (+0100) Subject: Add benchmark for crc32 fold copy implementations X-Git-Tag: 2.3.0-rc2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bb3be3084572f396001f7bfbb3ce1124df94aa25;p=thirdparty%2Fzlib-ng.git Add benchmark for crc32 fold copy implementations Uses local functions for benchmarking some of the run-time selected variants. --- diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt index d7549c3aa..886f20e75 100644 --- a/test/benchmarks/CMakeLists.txt +++ b/test/benchmarks/CMakeLists.txt @@ -44,6 +44,7 @@ add_executable(benchmark_zlib benchmark_compare256_rle.cc benchmark_compress.cc benchmark_crc32.cc + benchmark_crc32_fold_copy.cc benchmark_insert_string.cc benchmark_main.cc benchmark_slidehash.cc diff --git a/test/benchmarks/benchmark_crc32_fold_copy.cc b/test/benchmarks/benchmark_crc32_fold_copy.cc new file mode 100644 index 000000000..eb6b93ba9 --- /dev/null +++ b/test/benchmarks/benchmark_crc32_fold_copy.cc @@ -0,0 +1,131 @@ +/* benchmark_crc32_fold_copy.cc -- benchmark for crc32 implementations doing folded copying + * Copyright (C) 2025 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include +#include + +extern "C" { +# include "zbuild.h" +# include "arch_functions.h" +# include "../test_cpu_features.h" +} + +#define BUFSIZE (32768 + 16 + 16) + +// We have no function that gives us direct access to these, so we have a local implementation for benchmarks +static void crc32_fold_copy_braid(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_braid(crc->value, src, len); + memcpy(dst, src, len); +} +#ifndef WITHOUT_CHORBA +static void crc32_fold_copy_chorba(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba(crc->value, src, len); + memcpy(dst, src, len); +} +#endif +#ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + static void crc32_fold_copy_chorba_sse2(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba_sse2(crc->value, src, len); + memcpy(dst, src, len); + } +# endif +# ifdef X86_SSE41 + static void crc32_fold_copy_chorba_sse41(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = crc32_chorba_sse41(crc->value, src, len); + memcpy(dst, src, len); + } +# endif +#endif + +class crc32_fc: public benchmark::Fixture { +protected: + uint32_t *testdata; + uint8_t *dstbuf; + uint32_t crc; + +public: + void SetUp(const ::benchmark::State& state) { + testdata = (uint32_t *)malloc(BUFSIZE); + dstbuf = (uint8_t *)malloc(BUFSIZE); + assert((testdata != NULL) && (dstbuf != NULL)); + + for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) { + testdata[i] = rand(); + } + } + + void Bench(benchmark::State& state, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy, + crc32_fold_final_func fold_final) { + ALIGNED_(16) crc32_fold crc_st; + int misalign = 0; + // Prepare an initial crc state + fold_reset(&crc_st); + crc = 0; + + // Benchmark the CRC32 fold copy operation + for (auto _ : state) { + fold_copy(&crc_st, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0)); + misalign++; + if (misalign > 14) + misalign = 0; + } + + // Finalize the CRC32 calculation + crc = fold_final(&crc_st); + + // Prevent the result from being optimized away + benchmark::DoNotOptimize(crc); + } + + void TearDown(const ::benchmark::State& state) { + free(testdata); + free(dstbuf); + } +}; + +#define BENCHMARK_CRC32_FOLD(name, resfunc, copyfunc, finfunc, support_flag) \ + BENCHMARK_DEFINE_F(crc32_fc, name)(benchmark::State& state) { \ + if (!support_flag) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, resfunc, copyfunc, finfunc); \ + } \ + BENCHMARK_REGISTER_F(crc32_fc, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10); + +// Generic +BENCHMARK_CRC32_FOLD(braid_c, crc32_fold_reset_c, crc32_fold_copy_braid, crc32_fold_final_c, 1) + +#ifdef DISABLE_RUNTIME_CPU_DETECTION + // Native + BENCHMARK_CRC32_FOLD(native, native_crc32_fold_reset, native_crc32_fold_copy, native_crc32_fold_final, 1) +#else + + // Optimized functions +# ifndef WITHOUT_CHORBA + BENCHMARK_CRC32_FOLD(chorba_c, crc32_fold_reset_c, crc32_fold_copy_chorba, crc32_fold_final_c, 1) +# endif +# ifdef ARM_CRC32 + BENCHMARK_CRC32_FOLD(armv8, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32) +# endif +# ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + BENCHMARK_CRC32_FOLD(chorba_sse2, crc32_fold_reset_c, crc32_fold_copy_chorba_sse2, crc32_fold_final_c, test_cpu_features.x86.has_sse2) +# endif +# ifdef X86_SSE41 + BENCHMARK_CRC32_FOLD(chorba_sse41, crc32_fold_reset_c, crc32_fold_copy_chorba_sse41, crc32_fold_final_c, test_cpu_features.x86.has_sse41) +# endif +# endif +# ifdef X86_PCLMULQDQ_CRC + BENCHMARK_CRC32_FOLD(pclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq) +# endif +# ifdef X86_VPCLMULQDQ_CRC + BENCHMARK_CRC32_FOLD(vpclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) +# endif +# ifdef LOONGARCH_CRC + BENCHMARK_CRC32_FOLD(loongarch64, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc) +# endif + +#endif