Unify adler32/crc32 benchmarks and add rotating misalignment

author Hans Kristian Rosbach <hk-git@circlestorm.org>
Wed, 14 Jan 2026 16:18:02 +0000 (17:18 +0100)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 20 Jan 2026 21:59:40 +0000 (22:59 +0100)
diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc

index 48121db428623a7656a7fefec36d7c448be32f52..8fb21300414bca1a964736ce8d09db3db5c2673b 100644 (file)
--- a/test/benchmarks/benchmark_adler32.cc
+++ b/test/benchmarks/benchmark_adler32.cc
@@ -3,65 +3,79 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#include <stdio.h>
-#include <assert.h>
-
  #include <benchmark/benchmark.h>
  
  extern "C" {
  #  include "zbuild.h"
-#  include "zutil_p.h"
  #  include "arch_functions.h"
  #  include "../test_cpu_features.h"
  }
  
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
  
  class adler32: public benchmark::Fixture {
  private:
-    uint32_t *random_ints;
+    uint32_t *testdata;
  
  public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
          }
      }
  
-    void Bench(benchmark::State& state, adler32_func adler32) {
+    // Benchmark Adler32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+        int misalign = 0;
          uint32_t hash = 0;
  
          for (auto _ : state) {
-            hash = adler32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
          }
  
+        // Prevent the result from being optimized away
          benchmark::DoNotOptimize(hash);
      }
  
      void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
      }
  };
  
-#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
      BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
          if (!(support_flag)) { \
              state.SkipWithError("CPU does not support " #name); \
          } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
      } \
-    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
+    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+    BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);
  
  BENCHMARK_ADLER32(c, adler32_c, 1);
  
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc

index d8efa0d22eb91b510236bd75f0b35b4593965814..fbfb85602e5bc98ca6a7c6c38000aa1bed282156 100644 (file)
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -3,75 +3,88 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#include <stdio.h>
-#include <assert.h>
-#include <string.h>
-
  #include <benchmark/benchmark.h>
  
  extern "C" {
  #  include "zbuild.h"
-#  include "zutil_p.h"
  #  include "arch_functions.h"
  #  include "../test_cpu_features.h"
  }
  
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
  
  class adler32_copy: public benchmark::Fixture {
  private:
-    uint32_t *random_ints_src;
-    uint32_t *random_ints_dst;
+    uint32_t *testdata;
+    uint8_t *dstbuf;
  
  public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints_src != NULL);
-        assert(random_ints_dst != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints_src[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
          }
      }
  
-    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+    // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+        int misalign = 0;
          uint32_t hash = 0;
  
          for (auto _ : state) {
-            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
-                                (const unsigned char*)random_ints_src, (size_t)state.range(0));
+            hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
          }
  
+        // Prevent the result from being optimized away
          benchmark::DoNotOptimize(hash);
      }
  
      void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints_src);
-        zng_free(random_ints_dst);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
      }
  };
  
-#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
      BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
          if (!(support_flag)) { \
              state.SkipWithError("CPU does not support " #name); \
          } \
-        Bench(state, fptr); \
+        Bench(state, copyfunc, 0); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
      } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
  
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+// Adler32 + memcpy benchmark for reference
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
      BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
          if (!(support_flag)) { \
              state.SkipWithError("CPU does not support " #name); \
@@ -79,10 +92,10 @@ public:
          Bench(state, [](uint32_t init_sum, unsigned char *dst, \
                          const uint8_t *buf, size_t len) -> uint32_t { \
              memcpy(dst, buf, (size_t)len); \
-            return fptr(init_sum, buf, len); \
-        }); \
+            return copyfunc(init_sum, buf, len); \
+        }, 1); \
      } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
  
  BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
  
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc

index e3c4d9e2c2d5fc0864df80e7f71ca857baac12e5..df7eaec3e667e0c4e049c715fbafd01cc7d4c44c 100644 (file)
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -3,59 +3,80 @@
   * For conditions of distribution and use, see copyright notice in zlib.h
   */
  
-#include <stdio.h>
-#include <assert.h>
-
  #include <benchmark/benchmark.h>
  
  extern "C" {
  #  include "zbuild.h"
-#  include "zutil_p.h"
  #  include "arch_functions.h"
  #  include "../test_cpu_features.h"
  }
  
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
  
  class crc32: public benchmark::Fixture {
  private:
-    uint32_t *random_ints;
+    uint32_t *testdata;
  
  public:
-    void SetUp(const ::benchmark::State&) {
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
  
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
          }
      }
  
-    void Bench(benchmark::State& state, crc32_func crc32) {
+    // Benchmark CRC32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+        int misalign = 0;
          uint32_t hash = 0;
  
          for (auto _ : state) {
-            hash = crc32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
          }
  
+        // Prevent the result from being optimized away
          benchmark::DoNotOptimize(hash);
      }
  
      void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
      }
  };
  
-#define BENCHMARK_CRC32(name, fptr, support_flag) \
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
      BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
          if (!(support_flag)) { \
              state.SkipWithError("CPU does not support " #name); \
          } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
      } \
      BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
  
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+    BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
  BENCHMARK_CRC32(braid, crc32_braid, 1);
  
  #ifdef DISABLE_RUNTIME_CPU_DETECTION
diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc

index e2de0f5a597f8f9c6f1fa62a69fe4d5de7341786..71497e9aca9f4c514933e48292f0be9344f39c8d 100644 (file)
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -4,7 +4,6 @@
   */
  
  #include <benchmark/benchmark.h>
-#include <assert.h>
  
  extern "C" {
  #  include "zbuild.h"
@@ -12,7 +11,9 @@ extern "C" {
  #  include "../test_cpu_features.h"
  }
  
-#define BUFSIZE (32768 + 16 + 16)
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)
  
  class crc32_copy: public benchmark::Fixture {
  protected:
@@ -20,46 +21,67 @@ protected:
      uint8_t *dstbuf;
  
  public:
-    void SetUp(const ::benchmark::State&) {
-        testdata = (uint32_t *)malloc(BUFSIZE);
-        dstbuf = (uint8_t *)malloc(BUFSIZE);
-        assert((testdata != NULL) && (dstbuf != NULL));
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
  
          for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
              testdata[i] = rand();
          }
      }
  
-    void Bench(benchmark::State& state, crc32_copy_func crc32_copy) {
+    // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
          int misalign = 0;
-        uint32_t crc = 0;
+        uint32_t hash = 0;
  
-        // Benchmark the CRC32 copy operation
          for (auto _ : state) {
-            crc = crc32_copy(crc, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
-            misalign++;
-            if (misalign > 14)
+            hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
                  misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
          }
  
          // Prevent the result from being optimized away
-        benchmark::DoNotOptimize(crc);
+        benchmark::DoNotOptimize(hash);
      }
  
      void TearDown(const ::benchmark::State&) {
-        free(testdata);
-        free(dstbuf);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
      }
  };
  
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
      BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
          if (!(support_flag)) { \
              state.SkipWithError("CPU does not support " #name); \
          } \
-        Bench(state, copyfunc); \
+        Bench(state, copyfunc, 0); \
      } \
-    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10);
+    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
  
  // Base test
  BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
author	Hans Kristian Rosbach <hk-git@circlestorm.org>
	Wed, 14 Jan 2026 16:18:02 +0000 (17:18 +0100)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Tue, 20 Jan 2026 21:59:40 +0000 (22:59 +0100)
test/benchmarks/benchmark_adler32.cc		patch | blob | blame | history
test/benchmarks/benchmark_adler32_copy.cc		patch | blob | blame | history
test/benchmarks/benchmark_crc32.cc		patch | blob | blame | history
test/benchmarks/benchmark_crc32_copy.cc		patch | blob | blame | history