* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include <stdio.h>
-#include <assert.h>
-
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
-# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
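+// Largest benchmarked length is 4 MiB (Arg(4096<<10)); the extra 64 bytes of
+// slack keep the rolling buffer offset in Bench() in bounds at every length.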
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
class adler32: public benchmark::Fixture {
private:
- uint32_t *random_ints;
+ uint32_t *testdata;
public:
- void SetUp(const ::benchmark::State&) {
- /* Control the alignment so that we have the best case scenario for loads. With
- * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
- * And while this is a realistic scenario, it makes it difficult to compare benchmark
- * to benchmark because one allocation could have been aligned perfectly for the loads
- * while the subsequent one happened to not be. This is not to be advantageous to AVX512
- * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
- * control the _consistency_ of the results */
- random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
- assert(random_ints != NULL);
-
- for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
- random_ints[i] = rand();
+ void SetUp(::benchmark::State& state) {
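+ // 64-byte aligned base address, so the rolling offset in Bench() fully
+ // controls alignment and results stay comparable from run to run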
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("allocation failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
}
}
- void Bench(benchmark::State& state, adler32_func adler32) {
+ // Benchmark Adler32, rolling the buffer offset each iteration so alignment effects stay consistent between runs
+ void Bench(benchmark::State& state, adler32_func adler32, const int DO_ALIGNED) {
+ int misalign = 0;
uint32_t hash = 0;
for (auto _ : state) {
- hash = adler32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+ hash = adler32(hash, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
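+ // Advance the offset by 1 byte (misaligned) or 16 bytes (aligned),
+ // wrapping within the 64 bytes of slack at the end of the buffer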
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
}
+ // Prevent the result from being optimized away
benchmark::DoNotOptimize(hash);
}
void TearDown(const ::benchmark::State&) {
- zng_free(random_ints);
+ zng_free_aligned(testdata);
}
};
-#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag) \
BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
- Bench(state, fptr); \
+ Bench(state, hashfunc, 0); \
} \
- BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10)
+ BENCHMARK_REGISTER_F(adler32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Aligned
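+// ALIGNED_NAME pastes _aligned onto the benchmark name so the aligned variant
+// registers separately from the misaligned one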
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Register both the misaligned and aligned variants of each benchmark
+#define BENCHMARK_ADLER32(name, hashfunc, support_flag) \
+ BENCHMARK_ADLER32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_ADLER32_ALIGNED(name, hashfunc, support_flag);
BENCHMARK_ADLER32(c, adler32_c, 1);
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include <stdio.h>
-#include <assert.h>
-#include <string.h>
-
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
-# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
+// Hash copy functions are used on strm->next_in buffers; we process 512-32k
+// byte chunks at a time (x2 for the initial fill) when enough data is
+// available, plus 64 bytes of slack for the rolling offset.
+#define BUFSIZE (65536 + 64)
class adler32_copy: public benchmark::Fixture {
private:
- uint32_t *random_ints_src;
- uint32_t *random_ints_dst;
+ uint32_t *testdata;
+ uint8_t *dstbuf;
public:
- void SetUp(const ::benchmark::State&) {
- /* Control the alignment so that we have the best case scenario for loads. With
- * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
- * And while this is a realistic scenario, it makes it difficult to compare benchmark
- * to benchmark because one allocation could have been aligned perfectly for the loads
- * while the subsequent one happened to not be. This is not to be advantageous to AVX512
- * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to
- * control the _consistency_ of the results */
- random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
- random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
- assert(random_ints_src != NULL);
- assert(random_ints_dst != NULL);
-
- for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
- random_ints_src[i] = rand();
+ void SetUp(::benchmark::State& state) {
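+ // 64-byte aligned source and destination, so the rolling offset in Bench()
+ // fully controls alignment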
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL || dstbuf == NULL) {
+ state.SkipWithError("allocation failed");
+ return;
+ }
+
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
}
}
- void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+ // Benchmark adler32_copy, rolling the buffer offsets each iteration so alignment effects stay consistent between runs
+ void Bench(benchmark::State& state, adler32_copy_func adler32_copy, const int DO_ALIGNED) {
+ int misalign = 0;
uint32_t hash = 0;
for (auto _ : state) {
- hash = adler32_func(hash, (unsigned char *)random_ints_dst,
- (const unsigned char*)random_ints_src, (size_t)state.range(0));
+ hash = adler32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
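+ // Both src and dst advance by 1 byte (misaligned) or 16 bytes (aligned),
+ // wrapping within the 64 bytes of slack at the end of the buffers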
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
}
+ // Prevent the result from being optimized away
benchmark::DoNotOptimize(hash);
}
void TearDown(const ::benchmark::State&) {
- zng_free(random_ints_src);
- zng_free(random_ints_dst);
+ zng_free_aligned(testdata);
+ zng_free_aligned(dstbuf);
}
};
-#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag) \
BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
- Bench(state, fptr); \
+ Bench(state, copyfunc, 0); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
} \
- BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+ BENCHMARK_REGISTER_F(adler32_copy, ALIGNED_NAME(name))->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Register both the misaligned and aligned variants of each benchmark
+#define BENCHMARK_ADLER32_COPY(name, copyfunc, support_flag) \
+ BENCHMARK_ADLER32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_ADLER32_COPY_ALIGNED(name, copyfunc, support_flag);
-#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+// Adler32 plus a separate memcpy, as a reference baseline for the fused copy functions
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, copyfunc, support_flag) \
BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
if (!(support_flag)) { \
            state.SkipWithError("CPU does not support " #name); \
        } \
        Bench(state, [](uint32_t init_sum, unsigned char *dst, \
const uint8_t *buf, size_t len) -> uint32_t { \
memcpy(dst, buf, (size_t)len); \
- return fptr(init_sum, buf, len); \
- }); \
+ return copyfunc(init_sum, buf, len); \
+ }, 1); \
} \
- BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Arg(3)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10)->Arg(64<<10);
BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include <stdio.h>
-#include <assert.h>
-
#include <benchmark/benchmark.h>
extern "C" {
# include "zbuild.h"
-# include "zutil_p.h"
# include "arch_functions.h"
# include "../test_cpu_features.h"
}
-#define MAX_RANDOM_INTS (1024 * 1024)
-#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
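+// Largest benchmarked length is 4 MiB (Arg(4096<<10)); the extra 64 bytes of
+// slack keep the rolling buffer offset in Bench() in bounds at every length.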
+#define BUFSIZE ((4 * 1024 * 1024) + 64)
class crc32: public benchmark::Fixture {
private:
- uint32_t *random_ints;
+ uint32_t *testdata;
public:
- void SetUp(const ::benchmark::State&) {
- random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
- assert(random_ints != NULL);
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL) {
+ state.SkipWithError("allocation failed");
+ return;
+ }
- for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
- random_ints[i] = rand();
+ for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
+ testdata[i] = rand();
}
}
- void Bench(benchmark::State& state, crc32_func crc32) {
+ // Benchmark CRC32, rolling the buffer offset each iteration so alignment effects stay consistent between runs
+ void Bench(benchmark::State& state, crc32_func crc32, const int DO_ALIGNED) {
+ int misalign = 0;
uint32_t hash = 0;
for (auto _ : state) {
- hash = crc32(hash, (const unsigned char *)random_ints, (size_t)state.range(0));
+ hash = crc32(hash, (const unsigned char *)testdata + misalign, (size_t)state.range(0));
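+ // Advance the offset by 1 byte (misaligned) or 16 bytes (aligned),
+ // wrapping within the 64 bytes of slack at the end of the buffer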
+ if (misalign >= 63)
+ misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
}
+ // Prevent the result from being optimized away
benchmark::DoNotOptimize(hash);
}
void TearDown(const ::benchmark::State&) {
- zng_free(random_ints);
+ zng_free_aligned(testdata);
}
};
-#define BENCHMARK_CRC32(name, fptr, support_flag) \
+// Misaligned
+#define BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag) \
BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
- Bench(state, fptr); \
+ Bench(state, hashfunc, 0); \
} \
BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, hashfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, ALIGNED_NAME(name))->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10);
+
+// Register both the misaligned and aligned variants of each benchmark
+#define BENCHMARK_CRC32(name, hashfunc, support_flag) \
+ BENCHMARK_CRC32_MISALIGNED(name, hashfunc, support_flag); \
+ BENCHMARK_CRC32_ALIGNED(name, hashfunc, support_flag);
+
BENCHMARK_CRC32(braid, crc32_braid, 1);
#ifdef DISABLE_RUNTIME_CPU_DETECTION
*/
#include <benchmark/benchmark.h>
-#include <assert.h>
extern "C" {
# include "zbuild.h"
# include "../test_cpu_features.h"
}
-#define BUFSIZE (32768 + 16 + 16)
+// Hash copy functions are used on strm->next_in buffers; we process 512-32k
+// byte chunks at a time (x2 for the initial fill) when enough data is
+// available, plus 64 bytes of slack for the rolling offset.
+#define BUFSIZE (65536 + 64)
class crc32_copy: public benchmark::Fixture {
protected:
uint8_t *dstbuf;
public:
- void SetUp(const ::benchmark::State&) {
- testdata = (uint32_t *)malloc(BUFSIZE);
- dstbuf = (uint8_t *)malloc(BUFSIZE);
- assert((testdata != NULL) && (dstbuf != NULL));
+ void SetUp(::benchmark::State& state) {
+ testdata = (uint32_t *)zng_alloc_aligned(BUFSIZE, 64);
+ dstbuf = (uint8_t *)zng_alloc_aligned(BUFSIZE, 64);
+ if (testdata == NULL || dstbuf == NULL) {
+ state.SkipWithError("allocation failed");
+ return;
+ }
for (uint32_t i = 0; i < BUFSIZE/sizeof(uint32_t); i++) {
testdata[i] = rand();
}
}
- void Bench(benchmark::State& state, crc32_copy_func crc32_copy) {
+ // Benchmark crc32_copy, rolling the buffer offsets each iteration so alignment effects stay consistent between runs
+ void Bench(benchmark::State& state, crc32_copy_func crc32_copy, const int DO_ALIGNED) {
int misalign = 0;
- uint32_t crc = 0;
+ uint32_t hash = 0;
- // Benchmark the CRC32 copy operation
for (auto _ : state) {
- crc = crc32_copy(crc, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
- misalign++;
- if (misalign > 14)
+ hash = crc32_copy(hash, dstbuf + misalign, (const unsigned char*)testdata + misalign, (size_t)state.range(0));
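+ // Both src and dst advance by 1 byte (misaligned) or 16 bytes (aligned),
+ // wrapping within the 64 bytes of slack at the end of the buffers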
+ if (misalign >= 63)
misalign = 0;
+ else
+ misalign += (DO_ALIGNED) ? 16 : 1;
}
// Prevent the result from being optimized away
- benchmark::DoNotOptimize(crc);
+ benchmark::DoNotOptimize(hash);
}
void TearDown(const ::benchmark::State&) {
- free(testdata);
- free(dstbuf);
+ zng_free_aligned(testdata);
+ zng_free_aligned(dstbuf);
}
};
-#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+// Misaligned
+#define BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag) \
BENCHMARK_DEFINE_F(crc32_copy, name)(benchmark::State& state) { \
if (!(support_flag)) { \
state.SkipWithError("CPU does not support " #name); \
} \
- Bench(state, copyfunc); \
+ Bench(state, copyfunc, 0); \
} \
- BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(16)->Arg(48)->Arg(192)->Arg(512)->Arg(4<<10)->Arg(16<<10)->Arg(32<<10);
+ BENCHMARK_REGISTER_F(crc32_copy, name)->Arg(32)->Arg(512)->Arg(8<<10)->Arg(32<<10)->Arg(64<<10);
+
+// Aligned
+#define ALIGNED_NAME(name) name##_aligned
+#define BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag) \
+ BENCHMARK_DEFINE_F(crc32_copy, ALIGNED_NAME(name))(benchmark::State& state) { \
+ if (!(support_flag)) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, copyfunc, 1); \
+ } \
+ BENCHMARK_REGISTER_F(crc32_copy, ALIGNED_NAME(name))->Arg(16)->Arg(32)->Arg(64)->Arg(512);
+
+// Register both the misaligned and aligned variants of each benchmark
+#define BENCHMARK_CRC32_COPY(name, copyfunc, support_flag) \
+ BENCHMARK_CRC32_COPY_MISALIGNED(name, copyfunc, support_flag); \
+ BENCHMARK_CRC32_COPY_ALIGNED(name, copyfunc, support_flag);
// Base test
BENCHMARK_CRC32_COPY(braid, crc32_copy_braid, 1);