From: Hans Kristian Rosbach <hk-git@circlestorm.org>
Date: Fri, 14 Nov 2025 14:33:32 +0000 (+0100)
Subject: Add benchmark for crc32 fold copy implementations
X-Git-Tag: 2.3.0-rc2~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bb3be3084572f396001f7bfbb3ce1124df94aa25;p=thirdparty%2Fzlib-ng.git

Add benchmark for crc32 fold copy implementations
Uses local functions for benchmarking some of the run-time selected variants.
---

diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index d7549c3aa..886f20e75 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -44,6 +44,7 @@ add_executable(benchmark_zlib
     benchmark_compare256_rle.cc
     benchmark_compress.cc
     benchmark_crc32.cc
+    benchmark_crc32_fold_copy.cc
     benchmark_insert_string.cc
     benchmark_main.cc
     benchmark_slidehash.cc
diff --git a/test/benchmarks/benchmark_crc32_fold_copy.cc b/test/benchmarks/benchmark_crc32_fold_copy.cc
new file mode 100644
index 000000000..eb6b93ba9
--- /dev/null
+++ b/test/benchmarks/benchmark_crc32_fold_copy.cc
@@ -0,0 +1,131 @@
+/* benchmark_crc32_fold_copy.cc -- benchmark for crc32 implementations doing folded copying
+ * Copyright (C) 2025 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <benchmark/benchmark.h>
+#include <assert.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "arch_functions.h"
+#  include "../test_cpu_features.h"
+}
+
+#define BUFSIZE (32768 + 16 + 16)
+
+// We have no function that gives us direct access to these, so we have a local implementation for benchmarks
+static void crc32_fold_copy_braid(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc->value = crc32_braid(crc->value, src, len);
+    memcpy(dst, src, len);
+}
+#ifndef WITHOUT_CHORBA
+static void crc32_fold_copy_chorba(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc->value = crc32_chorba(crc->value, src, len);
+    memcpy(dst, src, len);
+}
+#endif
+#ifndef WITHOUT_CHORBA_SSE
+#  ifdef X86_SSE2
+    static void crc32_fold_copy_chorba_sse2(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+        crc->value = crc32_chorba_sse2(crc->value, src, len);
+        memcpy(dst, src, len);
+    }
+#  endif
+#  ifdef X86_SSE41
+    static void crc32_fold_copy_chorba_sse41(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+        crc->value = crc32_chorba_sse41(crc->value, src, len);
+        memcpy(dst, src, len);
+    }
+#  endif
+#endif
+
+class crc32_fc: public benchmark::Fixture {
+protected:
+    uint32_t *testdata;
+    uint8_t *dstbuf;
+    uint32_t crc;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        testdata = (uint32_t *)malloc(BUFSIZE);
+        dstbuf = (uint8_t *)malloc(BUFSIZE);
+        assert((testdata != NULL) && (dstbuf != NULL));
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, crc32_fold_reset_func fold_reset, crc32_fold_copy_func fold_copy,
+               crc32_fold_final_func fold_final) {
+        ALIGNED_(16) crc32_fold crc_st;
+        int misalign = 0;
+        // Prepare an initial crc state
+        fold_reset(&crc_st);
+        crc = 0;
+
+        // Benchmark the CRC32 fold copy operation
+        for (auto _ : state) {
+            fold_copy(&crc_st, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            misalign++;
+            if (misalign > 14)
+                misalign = 0;
+        }
+
+        // Finalize the CRC32 calculation
+        crc = fold_final(&crc_st);
+
+        // Prevent the result from being optimized away
+        benchmark::DoNotOptimize(crc);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        free(testdata);
+        free(dstbuf);
+    }
+};
+
+#define BENCHMARK_CRC32_FOLD(name, resfunc, copyfunc, finfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_fc, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, resfunc, copyfunc, finfunc); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_fc, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10);
+
+// Generic
+BENCHMARK_CRC32_FOLD(braid_c, crc32_fold_reset_c, crc32_fold_copy_braid, crc32_fold_final_c, 1)
+
+#ifdef DISABLE_RUNTIME_CPU_DETECTION
+    // Native
+    BENCHMARK_CRC32_FOLD(native, native_crc32_fold_reset, native_crc32_fold_copy, native_crc32_fold_final, 1)
+#else
+
+    // Optimized functions
+#  ifndef WITHOUT_CHORBA
+    BENCHMARK_CRC32_FOLD(chorba_c, crc32_fold_reset_c, crc32_fold_copy_chorba, crc32_fold_final_c, 1)
+#  endif
+#  ifdef ARM_CRC32
+    BENCHMARK_CRC32_FOLD(armv8, crc32_fold_reset_c, crc32_fold_copy_armv8, crc32_fold_final_c, test_cpu_features.arm.has_crc32)
+#  endif
+#  ifndef WITHOUT_CHORBA_SSE
+#    ifdef X86_SSE2
+        BENCHMARK_CRC32_FOLD(chorba_sse2, crc32_fold_reset_c, crc32_fold_copy_chorba_sse2, crc32_fold_final_c, test_cpu_features.x86.has_sse2)
+#    endif
+#    ifdef X86_SSE41
+        BENCHMARK_CRC32_FOLD(chorba_sse41, crc32_fold_reset_c, crc32_fold_copy_chorba_sse41, crc32_fold_final_c, test_cpu_features.x86.has_sse41)
+#     endif
+#  endif
+#  ifdef X86_PCLMULQDQ_CRC
+    BENCHMARK_CRC32_FOLD(pclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_pclmulqdq_copy, crc32_fold_pclmulqdq_final, test_cpu_features.x86.has_pclmulqdq)
+#  endif
+#  ifdef X86_VPCLMULQDQ_CRC
+    BENCHMARK_CRC32_FOLD(vpclmulqdq, crc32_fold_pclmulqdq_reset, crc32_fold_vpclmulqdq_copy, crc32_fold_pclmulqdq_final, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq))
+#  endif
+#  ifdef LOONGARCH_CRC
+    BENCHMARK_CRC32_FOLD(loongarch64, crc32_fold_reset_c, crc32_fold_copy_loongarch64, crc32_fold_final_c, test_cpu_features.loongarch.has_crc)
+#  endif
+
+#endif