From: Hans Kristian Rosbach
Date: Wed, 14 Jan 2026 16:18:02 +0000 (+0100)
Subject: Unify adler32/crc32 benchmarks and add rotating misalignment
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b233f8675b08220aa0db9207cb4295df871299fd;p=thirdparty%2Fzlib-ng.git

Unify adler32/crc32 benchmarks and add rotating misalignment

Add aligned benchmarks for adler32/crc32
---

diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc
index 48121db42..8fb213004 100644
--- a/test/benchmarks/benchmark_adler32.cc
+++ b/test/benchmarks/benchmark_adler32.cc
@@ -3,65 +3,79 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)

 class adler32: public benchmark::Fixture {
 private:
-    uint32_t *random_ints;
+    uint32_t *testdata;

 public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, adler32_func adler32) {
+    // Benchmark Adler32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = adler32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
     }
 };

-#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
     } \
-    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
+    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+    BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);

 BENCHMARK_ADLER32(c, adler32_c, 1);
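
The rotating misalignment used in Bench() above is the heart of this change: the misaligned variants advance the buffer offset one byte per iteration, the aligned variants step 16 bytes at a time, and both wrap once the offset reaches 63 (the buffers are 64-byte aligned with 64 bytes of slack, so every offset stays in bounds). A minimal standalone sketch of just that stepping rule, assuming nothing from the zlib-ng tree:

    // Standalone sketch of the rotating-offset scheme from Bench() above.
    // Misaligned runs step +1 so every byte offset 0..63 is visited;
    // aligned runs step +16 so only 16-byte boundaries (0,16,32,48,64) are.
    #include <cstdio>

    static int next_offset(int misalign, bool aligned) {
        if (misalign >= 63)
            return 0;
        return misalign + (aligned ? 16 : 1);
    }

    int main() {
        for (bool aligned : {false, true}) {
            printf(aligned ? "aligned:    " : "misaligned: ");
            int off = 0;
            for (int i = 0; i < 8; i++) {
                printf("%d ", off);
                off = next_offset(off, aligned);
            }
            printf("...\n");
        }
        return 0;
    }

Averaging over all offsets is what makes consecutive runs comparable: no run can get lucky with a perfectly aligned allocation while another crosses a cache line on every load.
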
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index d8efa0d22..fbfb85602 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -3,75 +3,88 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)

 class adler32_copy: public benchmark::Fixture {
 private:
-    uint32_t *random_ints_src;
-    uint32_t *random_ints_dst;
+    uint32_t *testdata;
+    uint8_t *dstbuf;

 public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints_src != NULL);
-        assert(random_ints_dst != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints_src[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+    // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
-                (const unsigned char*)random_ints_src, (size_t)state.range(0));
+            hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints_src);
-        zng_free(random_ints_dst);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
     }
 };

-#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, copyfunc, 0); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);

-#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+// Adler32 + memcpy benchmark for reference
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
@@ -79,10 +92,10 @@ public:
         Bench(state, [](uint32_t init_sum, unsigned char *dst, \
                         const uint8_t *buf, size_t len) -> uint32_t { \
             memcpy(dst, buf, (size_t)len); \
-            return fptr(init_sum, buf, len); \
-        }); \
+            return copyfunc(init_sum, buf, len); \
+        }, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);

 BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
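
BENCHMARK_ADLER32_BASELINE_COPY above exploits the fact that a captureless lambda converts implicitly to a plain function pointer, so a memcpy-plus-plain-adler32 composite can be driven through the same Bench() path as the fused copy variants. A self-contained sketch of that idiom; copy_hash_func and hash_bytes are hypothetical stand-ins, not zlib-ng APIs:

    // Sketch: fuse memcpy with a non-copying hash behind the same
    // function-pointer type used for dedicated copy+hash routines, so
    // both can be timed by identical driver code.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    typedef uint32_t (*copy_hash_func)(uint32_t sum, unsigned char *dst,
                                       const uint8_t *buf, size_t len);

    // Placeholder hash standing in for adler32_c.
    static uint32_t hash_bytes(uint32_t sum, const uint8_t *buf, size_t len) {
        for (size_t i = 0; i < len; i++)
            sum = sum * 31 + buf[i];
        return sum;
    }

    static uint32_t drive(copy_hash_func fn, unsigned char *dst,
                          const uint8_t *src, size_t len) {
        return fn(0, dst, src, len);
    }

    int main() {
        uint8_t src[256] = {1, 2, 3}, dst[256];
        // The captureless lambda decays to copy_hash_func.
        uint32_t h = drive([](uint32_t sum, unsigned char *d,
                              const uint8_t *s, size_t len) -> uint32_t {
            memcpy(d, s, len);
            return hash_bytes(sum, s, len);
        }, dst, src, sizeof(src));
        return (int)(h & 0xff);
    }

Note that the baseline registers with DO_ALIGNED set to 1, so it measures only the aligned offset rotation.
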
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
index e3c4d9e2c..df7eaec3e 100644
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -3,59 +3,80 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)

 class crc32: public benchmark::Fixture {
 private:
-    uint32_t *random_ints;
+    uint32_t *testdata;

 public:
-    void SetUp(const ::benchmark::State&) {
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }

-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, crc32_func crc32) {
+    // Benchmark CRC32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = crc32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
     }
 };

-#define BENCHMARK_CRC32(name, fptr, support_flag) \
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
     BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
     } \
     BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);

+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+    BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
 BENCHMARK_CRC32(braid, crc32_braid, 1);

 #ifdef DISABLE_RUNTIME_CPU_DETECTION
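
The ALIGNED_NAME helper relies on preprocessor token pasting: name##_aligned glues the suffix onto the benchmark name before BENCHMARK_DEFINE_F and BENCHMARK_REGISTER_F see it, which is how one BENCHMARK_CRC32 invocation stamps out two distinctly named fixtures. A small sketch of the mechanics; REGISTER_PAIR is purely illustrative, not part of the patch:

    // Token pasting: ALIGNED_NAME(avx2) expands to the identifier avx2_aligned.
    #include <cstdio>

    #define ALIGNED_NAME(name) name##_aligned

    // Hypothetical macro mirroring the define+register pattern in the patch.
    #define REGISTER_PAIR(name) \
        static const char *name = #name; \
        static const char *ALIGNED_NAME(name) = #name "_aligned";

    REGISTER_PAIR(avx2)  // defines both `avx2` and `avx2_aligned`

    int main() {
        printf("%s / %s\n", avx2, avx2_aligned);  // prints: avx2 / avx2_aligned
        return 0;
    }

Keeping the pasting in a named helper lets the same expansion be reused in both the fixture definition and its registration.
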
diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index e2de0f5a5..71497e9ac 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -4,7 +4,6 @@
  */

 #include <benchmark/benchmark.h>
-#include

 extern "C" {
 #  include "zbuild.h"
@@ -12,7 +11,9 @@ extern "C" {
 #  include "../test_cpu_features.h"
 }

-#define BUFSIZE (32768 + 16 + 16)
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)

 class crc32_copy: public benchmark::Fixture {
 protected:
@@ -20,46 +21,67 @@ protected:
     uint32_t *testdata;
     uint8_t *dstbuf;

 public:
-    void SetUp(const ::benchmark::State&) {
-        testdata = (uint32_t *)malloc(BUFSIZE);
-        dstbuf = (uint8_t *)malloc(BUFSIZE);
-        assert((testdata != NULL) && (dstbuf != NULL));
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }

         for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
             testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, crc32_copy_func crc32_copy) {
+    // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
         int misalign = 0;
-        uint32_t crc = 0;
+        uint32_t hash = 0;

-        // Benchmark the CRC32 copy operation
         for (auto _ : state) {
-            crc = crc32_copy(crc, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
-            misalign++;
-            if (misalign > 14)
+            hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
                 misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

         // Prevent the result from being optimized away
-        benchmark::DoNotOptimize(crc);
+        benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        free(testdata);
-        free(dstbuf);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
     }
 };

-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, copyfunc); \
+        Bench(state, copyfunc, 0); \
     } \
-    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10);
+    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);

 // Base test
 BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
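
All four fixtures now allocate through zng_alloc_aligned/zng_free_aligned, so the offset rotation always starts from a known 64-byte boundary instead of whatever malloc happens to return. A portable sketch of how such a pair can be built on plain malloc; this is an assumed illustration of the technique, not zlib-ng's actual implementation:

    // Aligned allocation sketch: over-allocate, round up to the requested
    // power-of-two alignment, and stash the raw pointer just below the
    // returned block so it can be recovered on free.
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    static void *alloc_aligned(size_t size, size_t align) {
        void *raw = malloc(size + align + sizeof(void *));
        if (raw == NULL)
            return NULL;
        uintptr_t aligned = ((uintptr_t)raw + sizeof(void *) + align - 1)
                            & ~(uintptr_t)(align - 1);
        ((void **)aligned)[-1] = raw;  // remember the original allocation
        return (void *)aligned;
    }

    static void free_aligned(void *ptr) {
        if (ptr != NULL)
            free(((void **)ptr)[-1]);
    }

    int main() {
        void *buf = alloc_aligned((4 * 1024 * 1024) + 64, 64);  // like BUFSIZE
        free_aligned(buf);
        return 0;
    }
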