From: Hans Kristian Rosbach
Date: Wed, 14 Jan 2026 16:18:02 +0000 (+0100)
Subject: Unify adler32/crc32 benchmarks and add rotating misalignment
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b233f8675b08220aa0db9207cb4295df871299fd;p=thirdparty%2Fzlib-ng.git

Unify adler32/crc32 benchmarks and add rotating misalignment

Add aligned benchmarks for adler32/crc32
---

diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc
index 48121db42..8fb213004 100644
--- a/test/benchmarks/benchmark_adler32.cc
+++ b/test/benchmarks/benchmark_adler32.cc
@@ -3,65 +3,79 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)

 class adler32: public benchmark::Fixture {
 private:
-    uint32_t *random_ints;
+    uint32_t *testdata;

 public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, adler32_func adler32) {
+    // Benchmark Adler32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = adler32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
     }
 };

-#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
     } \
-    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
+    BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+    BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);

 BENCHMARK_ADLER32(c, adler32_c, 1);
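
The rotating misalignment used in Bench() above is the heart of this change: the misaligned variants advance the buffer offset one byte per iteration, the aligned variants step 16 bytes at a time, and both wrap once the offset reaches 63 (the buffers are 64-byte aligned with 64 bytes of slack, so every offset stays in bounds). A minimal standalone sketch of just that stepping rule, assuming nothing from the zlib-ng tree:

    // Standalone sketch of the rotating-offset scheme from Bench() above.
    // Misaligned runs step +1 so every byte offset 0..63 is visited;
    // aligned runs step +16 so only 16-byte boundaries (0,16,32,48,64) are.
    #include <cstdio>

    static int next_offset(int misalign, bool aligned) {
        if (misalign >= 63)
            return 0;
        return misalign + (aligned ? 16 : 1);
    }

    int main() {
        for (bool aligned : {false, true}) {
            printf(aligned ? "aligned:    " : "misaligned: ");
            int off = 0;
            for (int i = 0; i < 8; i++) {
                printf("%d ", off);
                off = next_offset(off, aligned);
            }
            printf("...\n");
        }
        return 0;
    }

Averaging over all offsets is what makes consecutive runs comparable: no run can get lucky with a perfectly aligned allocation while another crosses a cache line on every load.
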
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index d8efa0d22..fbfb85602 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -3,75 +3,88 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)

 class adler32_copy: public benchmark::Fixture {
 private:
-    uint32_t *random_ints_src;
-    uint32_t *random_ints_dst;
+    uint32_t *testdata;
+    uint8_t *dstbuf;

 public:
-    void SetUp(const ::benchmark::State&) {
-        /* Control the alignment so that we have the best case scenario for loads. With
-         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
-         * And while this is a realistic scenario, it makes it difficult to compare benchmark
-         * to benchmark because one allocation could have been aligned perfectly for the loads
-         * while the subsequent one happened to not be. This is not to be advantageous to AVX512
-         * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
-         * control the _consistency_ of the results */
-        random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints_src != NULL);
-        assert(random_ints_dst != NULL);
-
-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints_src[i] = rand();
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }
+
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+    // Benchmark Adler32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = adler32_func(hash, (unsigned char *)random_ints_dst,
-                (const unsigned char*)random_ints_src, (size_t)state.range(0));
+            hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints_src);
-        zng_free(random_ints_dst);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
     }
 };

-#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, copyfunc, 0); \
+    } \
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);

-#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+// Adler32 + memcpy benchmark for reference
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
@@ -79,10 +92,10 @@ public:
         Bench(state, [](uint32_t init_sum, unsigned char *dst, \
                         const uint8_t *buf, size_t len) -> uint32_t { \
             memcpy(dst, buf, (size_t)len); \
-            return fptr(init_sum, buf, len); \
-        }); \
+            return copyfunc(init_sum, buf, len); \
+        }, 1); \
     } \
-    BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+    BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);

 BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
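
BENCHMARK_ADLER32_BASELINE_COPY above exploits the fact that a captureless lambda converts implicitly to a plain function pointer, so a memcpy-plus-plain-adler32 composite can be driven through the same Bench() path as the fused copy variants. A self-contained sketch of that idiom; copy_hash_func and hash_bytes are hypothetical stand-ins, not zlib-ng APIs:

    // Sketch: fuse memcpy with a non-copying hash behind the same
    // function-pointer type used for dedicated copy+hash routines, so
    // both can be timed by identical driver code.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    typedef uint32_t (*copy_hash_func)(uint32_t sum, unsigned char *dst,
                                       const uint8_t *buf, size_t len);

    // Placeholder hash standing in for adler32_c.
    static uint32_t hash_bytes(uint32_t sum, const uint8_t *buf, size_t len) {
        for (size_t i = 0; i < len; i++)
            sum = sum * 31 + buf[i];
        return sum;
    }

    static uint32_t drive(copy_hash_func fn, unsigned char *dst,
                          const uint8_t *src, size_t len) {
        return fn(0, dst, src, len);
    }

    int main() {
        uint8_t src[256] = {1, 2, 3}, dst[256];
        // The captureless lambda decays to copy_hash_func.
        uint32_t h = drive([](uint32_t sum, unsigned char *d,
                              const uint8_t *s, size_t len) -> uint32_t {
            memcpy(d, s, len);
            return hash_bytes(sum, s, len);
        }, dst, src, sizeof(src));
        return (int)(h & 0xff);
    }

Note that the baseline registers with DO_ALIGNED set to 1, so it measures only the aligned offset rotation.
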
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
index e3c4d9e2c..df7eaec3e 100644
--- a/test/benchmarks/benchmark_crc32.cc
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -3,59 +3,80 @@
  * For conditions of distribution and use, see copyright notice in zlib.h
  */

-#include
-#include
-
 #include <benchmark/benchmark.h>

 extern "C" {
 #  include "zbuild.h"
-#  include "zutil_p.h"
 #  include "arch_functions.h"
 #  include "../test_cpu_features.h"
 }

-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+#define BUFSIZE ((4 * 1024 * 1024) + 64)

 class crc32: public benchmark::Fixture {
 private:
-    uint32_t *random_ints;
+    uint32_t *testdata;

 public:
-    void SetUp(const ::benchmark::State&) {
-        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
-        assert(random_ints != NULL);
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }

-        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
-            random_ints[i] = rand();
+        for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+            testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, crc32_func crc32) {
+    // Benchmark CRC32, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+        int misalign = 0;
         uint32_t hash = 0;

         for (auto _ : state) {
-            hash = crc32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+            hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
+                misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

+        // Prevent the result from being optimized away
         benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        zng_free(random_ints);
+        zng_free_aligned(testdata);
     }
 };

-#define BENCHMARK_CRC32(name, fptr, support_flag) \
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
     BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, fptr); \
+        Bench(state, hashfunc, 0); \
     } \
     BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);

+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, hashfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+    BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+    BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
 BENCHMARK_CRC32(braid, crc32_braid, 1);

 #ifdef DISABLE_RUNTIME_CPU_DETECTION
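
The ALIGNED_NAME helper relies on preprocessor token pasting: name##_aligned glues the suffix onto the benchmark name before BENCHMARK_DEFINE_F and BENCHMARK_REGISTER_F see it, which is how one BENCHMARK_CRC32 invocation stamps out two distinctly named fixtures. A small sketch of the mechanics; REGISTER_PAIR is purely illustrative, not part of the patch:

    // Token pasting: ALIGNED_NAME(avx2) expands to the identifier avx2_aligned.
    #include <cstdio>

    #define ALIGNED_NAME(name) name##_aligned

    // Hypothetical macro mirroring the define+register pattern in the patch.
    #define REGISTER_PAIR(name) \
        static const char *name = #name; \
        static const char *ALIGNED_NAME(name) = #name "_aligned";

    REGISTER_PAIR(avx2)  // defines both `avx2` and `avx2_aligned`

    int main() {
        printf("%s / %s\n", avx2, avx2_aligned);  // prints: avx2 / avx2_aligned
        return 0;
    }

Keeping the pasting in a named helper lets the same expansion be reused in both the fixture definition and its registration.
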
diff --git a/test/benchmarks/benchmark_crc32_copy.cc b/test/benchmarks/benchmark_crc32_copy.cc
index e2de0f5a5..71497e9ac 100644
--- a/test/benchmarks/benchmark_crc32_copy.cc
+++ b/test/benchmarks/benchmark_crc32_copy.cc
@@ -4,7 +4,6 @@
  */

 #include <benchmark/benchmark.h>
-#include

 extern "C" {
 #  include "zbuild.h"
@@ -12,7 +11,9 @@ extern "C" {
 #  include "../test_cpu_features.h"
 }

-#define BUFSIZE (32768 + 16 + 16)
+// Hash copy functions are used on strm->next_in buffers, we process
+// 512-32k sizes (x2 for initial fill) at a time if enough data is available.
+#define BUFSIZE (65536 + 64)

 class crc32_copy: public benchmark::Fixture {
 protected:
@@ -20,46 +21,67 @@ protected:
     uint32_t *testdata;
     uint8_t *dstbuf;

 public:
-    void SetUp(const ::benchmark::State&) {
-        testdata = (uint32_t *)malloc(BUFSIZE);
-        dstbuf = (uint8_t *)malloc(BUFSIZE);
-        assert((testdata != NULL) && (dstbuf != NULL));
+    void SetUp(::benchmark::State& state) {
+        testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+        dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+        if (testdata == NULL || dstbuf == NULL) {
+            state.SkipWithError("malloc failed");
+            return;
+        }

         for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
             testdata[i] = rand();
         }
     }

-    void Bench(benchmark::State& state, crc32_copy_func crc32_copy) {
+    // Benchmark CRC32_copy, with rolling buffer misalignment for consistent results
+    void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
         int misalign = 0;
-        uint32_t crc = 0;
+        uint32_t hash = 0;

-        // Benchmark the CRC32 copy operation
         for (auto _ : state) {
-            crc = crc32_copy(crc, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
-            misalign++;
-            if (misalign > 14)
+            hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
+            if (misalign >= 63)
                 misalign = 0;
+            else
+                misalign += (DO_ALIGNED) ? 16 : 1;
         }

         // Prevent the result from being optimized away
-        benchmark::DoNotOptimize(crc);
+        benchmark::DoNotOptimize(hash);
     }

     void TearDown(const ::benchmark::State&) {
-        free(testdata);
-        free(dstbuf);
+        zng_free_aligned(testdata);
+        zng_free_aligned(dstbuf);
     }
 };

-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
     BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
         if (!(support_flag)) { \
             state.SkipWithError("CPU does not support " #name); \
         } \
-        Bench(state, copyfunc); \
+        Bench(state, copyfunc, 0); \
     } \
-    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10);
+    BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+    BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+        if (!(support_flag)) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, copyfunc, 1); \
+    } \
+    BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+
+// Queue both misaligned and aligned for each benchmark
+#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+    BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+    BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);

 // Base test
 BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);
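
All four fixtures now allocate through zng_alloc_aligned/zng_free_aligned, so the offset rotation always starts from a known 64-byte boundary instead of whatever malloc happens to return. A portable sketch of how such a pair can be built on plain malloc; this is an assumed illustration of the technique, not zlib-ng's actual implementation:

    // Aligned allocation sketch: over-allocate, round up to the requested
    // power-of-two alignment, and stash the raw pointer just below the
    // returned block so it can be recovered on free.
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    static void *alloc_aligned(size_t size, size_t align) {
        void *raw = malloc(size + align + sizeof(void *));
        if (raw == NULL)
            return NULL;
        uintptr_t aligned = ((uintptr_t)raw + sizeof(void *) + align - 1)
                            & ~(uintptr_t)(align - 1);
        ((void **)aligned)[-1] = raw;  // remember the original allocation
        return (void *)aligned;
    }

    static void free_aligned(void *ptr) {
        if (ptr != NULL)
            free(((void **)ptr)[-1]);
    }

    int main() {
        void *buf = alloc_aligned((4 * 1024 * 1024) + 64, 64);  // like BUFSIZE
        free_aligned(buf);
        return 0;
    }
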