From: Adam Stylinski
Date: Mon, 21 Feb 2022 05:17:07 +0000 (-0500)
Subject: Adding some application-specific benchmarks
X-Git-Tag: 2.1.0-beta1~361
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4c63726c022c66cc2541c6f87f66837dd162a019;p=thirdparty%2Fzlib-ng.git

Adding some application-specific benchmarks

So far there's only added png encode and decode with predictably
compressible bytes. This gives us a rough idea of more holistic
impacts of performance improvements (and regressions).

An interesting thing found with this, when compared with stock zlib,
we're slower for png decoding at levels 8 & 9. When we are slower, we
are spending a fair amount of time in the chunk copy function. This
probably merits a closer look.

This code creates optionally an alternative benchmark binary that links
with an alternative static zlib implementation. This can be used to
quickly compare between different forks.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 000f3ad9b..5d453a9be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,7 @@ option(ZLIB_ENABLE_TESTS "Build test binaries" ON)
 option(ZLIB_DUAL_LINK "Dual link tests against system zlib" OFF)
 option(WITH_FUZZERS "Build test/fuzz" OFF)
 option(WITH_BENCHMARKS "Build test/benchmarks" OFF)
+option(WITH_BENCHMARK_APPS "Build application benchmarks" OFF)
 option(WITH_OPTIM "Build with optimisation" ON)
 option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF)
 option(WITH_NEW_STRATEGIES "Use new strategies" ON)
@@ -1456,6 +1457,7 @@ add_feature_info(ZLIB_DUAL_LINK ZLIB_DUAL_LINK "Dual link tests against system z
 add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support")
 add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz")
 add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks")
+add_feature_info(WITH_BENCHMARK_APPS WITH_BENCHMARK_APPS "Build application benchmarks")
 add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation")
 add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
 add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index e68f54259..df1df4973 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -44,3 +44,50 @@ if(ZLIB_ENABLE_TESTS)
     add_test(NAME benchmark_zlib
         COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib>)
 endif()
+
+if(WITH_BENCHMARK_APPS)
+    option(BUILD_ALT_BENCH "Link against alternative zlib implementation" OFF)
+
+    # Search for libpng package
+    find_package(PNG QUIET)
+
+    if(NOT PNG_FOUND)
+        # FetchContent_MakeAvailable() populates the content and adds it to the
+        # build; calling FetchContent_Populate() again afterwards is an error
+        FetchContent_Declare(PNG
+            GIT_REPOSITORY https://github.com/glennrp/libpng.git)
+        FetchContent_MakeAvailable(PNG)
+    endif()
+
+    set(BENCH_APP_SRCS
+        benchmark_png_encode.cc
+        benchmark_png_decode.cc
+        benchmark_main.cc
+    )
+
+    add_executable(benchmark_zlib_apps ${BENCH_APP_SRCS})
+
+    # option() always defines the variable, so test its value rather than DEFINED
+    if(BUILD_ALT_BENCH)
+        set(ZLIB_ALT_LIB "libz.a" CACHE FILEPATH "Optional alternative zlib implementation (defaults to stock zlib)")
+        add_executable(benchmark_zlib_apps_alt ${BENCH_APP_SRCS})
+        target_link_libraries(benchmark_zlib_apps_alt libpng.a ${ZLIB_ALT_LIB} benchmark::benchmark)
+        target_compile_definitions(benchmark_zlib_apps_alt PRIVATE BUILD_ALT=1)
+        target_include_directories(benchmark_zlib_apps_alt PRIVATE
+            ${CMAKE_SOURCE_DIR}
+            ${CMAKE_BINARY_DIR}
+            ${PNG_INCLUDE_DIR}
+            ${benchmark_SOURCE_DIR}/benchmark/include)
+    endif()
+
+    target_include_directories(benchmark_zlib_apps PRIVATE
+        ${CMAKE_SOURCE_DIR}
+        ${CMAKE_BINARY_DIR}
+        ${PNG_INCLUDE_DIR}
+        ${benchmark_SOURCE_DIR}/benchmark/include)
+
+    # We need the static png library if we're statically linking to zlib,
+    # otherwise it will resolve these things in the system provided dynamic
+    # libraries (likely linked to stock zlib)
+    target_link_libraries(benchmark_zlib_apps libpng.a zlibstatic benchmark::benchmark)
+endif()
diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md
index b005027e0..5dce7f51b 100644
--- a/test/benchmarks/README.md
+++ b/test/benchmarks/README.md
@@ -1,5 +1,4 @@
 ## Benchmarks
-
 These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
 
 *Repetitions*
@@ -17,3 +16,32 @@ To filter out which benchmarks are performed use:
 ```
 --benchmark_filter="adler32*"
 ```
+
+There are two different benchmarks, micro and macro.
+
+### Benchmark benchmark_zlib
+These are microbenchmarks intended to test lower level subfunctions of the library.
+
+Benchmarks include implementations of:
+- Adler32
+- CRC
+- 256 byte comparisons
+- SIMD accelerated "slide hash" routine
+
+By default these benchmarks report things on the nanosecond scale and are small enough
+to measure very minute differences.
+
+### Benchmark benchmark_zlib_apps
+These benchmarks measure applications of zlib as a whole. Currently the only examples
+are PNG encoding and decoding. The PNG encode and decode tests leverage procedurally
+generated and highly compressible image data.
+
+Additionally, there is a test called `png_decode_realistic` that will decode any RGB 8 BPP
+encoded set of PNGs in the working directory under a directory named "test_pngs" with files
+named {0..9}.png. If these images do not exist, the test will error out and the benchmark
+will move on to the next set of benchmarks.
+
+*benchmark_zlib_apps_alt*
+
+The user can compile a comparison benchmark application linking to any zlib-compatible
+implementation of his or her choosing.
diff --git a/test/benchmarks/benchmark_main.cc b/test/benchmarks/benchmark_main.cc
index 600ac2281..ee8b61489 100644
--- a/test/benchmarks/benchmark_main.cc
+++ b/test/benchmarks/benchmark_main.cc
@@ -7,13 +7,17 @@
 
 #include <benchmark/benchmark.h>
 
+#ifndef BUILD_ALT
 extern "C" {
 #  include "zbuild.h"
 #  include "cpu_features.h"
 }
+#endif
 
 int main(int argc, char** argv) {
+#ifndef BUILD_ALT
     cpu_check_features();
+#endif
 
     ::benchmark::Initialize(&argc, argv);
     ::benchmark::RunSpecifiedBenchmarks();
diff --git a/test/benchmarks/benchmark_png_decode.cc b/test/benchmarks/benchmark_png_decode.cc
new file mode 100644
index 000000000..c037976c8
--- /dev/null
+++ b/test/benchmarks/benchmark_png_decode.cc
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <assert.h>
+#include "benchmark_png_shared.h"
+#include <benchmark/benchmark.h>
+
+/* Fixture that decodes in-memory PNGs: one pre-encoded image per zlib compression level (0-9) */
+class png_decode: public benchmark::Fixture {
+protected:
+    png_dat inpng[10];
+
+    /* Backing this on the heap is a more realistic benchmark */
+    uint8_t *output_img_buf = NULL;
+
+public:
+    /* Let's make the vanilla version have something extremely compressible */
+    virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+        init_compressible(img_bytes, width*height);
+    }
+
+    void SetUp(const ::benchmark::State& state) {
+        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        assert(output_img_buf != NULL);
+        init_img(output_img_buf, IMWIDTH, IMHEIGHT);
+
+        /* First we need to author the png bytes to be decoded */
+        for (int i = 0; i < 10; ++i) {
+            inpng[i] = {NULL, 0, 0};
+            encode_png(output_img_buf, &inpng[i], i, IMWIDTH, IMHEIGHT);
+        }
+    }
+
+    /* State in this circumstance will convey the compression level */
+    void Bench(benchmark::State &state) {
+        for (auto _ : state) {
+            int compress_lvl = state.range(0);
+            png_parse_dat in = { inpng[compress_lvl].buf };
+            uint32_t width, height;
+            decode_png(&in, (png_bytepp)&output_img_buf, IMWIDTH * IMHEIGHT * 3, width, height);
+        }
+    }
+
+    void TearDown(const ::benchmark::State &state) {
+        free(output_img_buf);
+        for (int i = 0; i < 10; ++i) {
+            free(inpng[i].buf);
+        }
+    }
+};
+
+/* Same decode loop, fed from the files test_pngs/0.png .. test_pngs/9.png instead */
+class png_decode_realistic: public png_decode {
+private:
+    bool test_files_found = false;
+
+public:
+    void SetUp(const ::benchmark::State &state) {
+        test_files_found = false;
+
+        /* Start every slot out empty so that TearDown never frees an indeterminate
+         * pointer, no matter how far we get below */
+        for (size_t i = 0; i < 10; ++i) {
+            inpng[i] = { NULL, 0, 0 };
+        }
+
+        output_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        if (output_img_buf == NULL)
+            abort();
+
+        /* Let's take all the images at different compression levels and jam their bytes into buffers */
+        char test_fname[25];
+        FILE *files[10];
+
+        /* Set all to NULL */
+        memset(files, 0, sizeof(files));
+
+        for (size_t i = 0; i < 10; ++i) {
+            snprintf(test_fname, sizeof(test_fname), "test_pngs/%zu.png", i);
+            /* PNG is binary data, "rb" keeps Windows from translating line endings */
+            FILE *in_img = fopen(test_fname, "rb");
+            if (in_img == NULL) {
+                /* Missing imagery is not fatal: Bench() reports it and skips */
+                for (size_t j = 0; j < i; ++j) {
+                    fclose(files[j]);
+                }
+
+                return;
+            }
+            files[i] = in_img;
+        }
+
+        test_files_found = true;
+        /* Now that we've established we have all the png files, let's read all of their bytes into buffers */
+        for (size_t i = 0; i < 10; ++i) {
+            FILE *in_file = files[i];
+            long file_len = -1;
+            if (fseek(in_file, 0, SEEK_END) == 0)
+                file_len = ftell(in_file);
+            if (file_len <= 0) {
+                fprintf(stderr, "file test_pngs/%zu.png is empty or its size couldn't be determined\n", i);
+                abort();
+            }
+            rewind(in_file);
+
+            size_t num_bytes = (size_t)file_len;
+            uint8_t *raw_file = (uint8_t*)malloc(num_bytes);
+            if (raw_file == NULL)
+                abort();
+
+            inpng[i].buf = raw_file;
+            inpng[i].len = num_bytes;
+            inpng[i].buf_rem = 0;
+
+            size_t bytes_read = fread(raw_file, 1, num_bytes, in_file);
+            if (bytes_read != num_bytes) {
+                fprintf(stderr, "couldn't read all of the bytes for file test_pngs/%zu.png\n", i);
+                abort();
+            }
+
+            fclose(in_file);
+        }
+    }
+
+    void Bench(benchmark::State &state) {
+        if (!test_files_found) {
+            state.SkipWithError("Test imagery in test_pngs not found");
+            return;
+        }
+
+        png_decode::Bench(state);
+    }
+};
+
+BENCHMARK_DEFINE_F(png_decode, png_decode)(benchmark::State &state) {
+    Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode, png_decode)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK_DEFINE_F(png_decode_realistic, png_decode_realistic)(benchmark::State &state) {
+    Bench(state);
+}
+BENCHMARK_REGISTER_F(png_decode_realistic, png_decode_realistic)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/test/benchmarks/benchmark_png_encode.cc b/test/benchmarks/benchmark_png_encode.cc
new file mode 100644
index 000000000..f1c597d36
--- /dev/null
+++ b/test/benchmarks/benchmark_png_encode.cc
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <assert.h>
+#include <benchmark/benchmark.h>
+#include "benchmark_png_shared.h"
+
+/* Fixture that encodes a procedurally generated RGB image into an in-memory PNG */
+class png_encode: public benchmark::Fixture {
+private:
+    png_dat outpng;
+
+    /* Backing this on the heap is a more realistic benchmark */
+    uint8_t *input_img_buf = NULL;
+
+public:
+    /* Let's make the vanilla version have something extremely compressible */
+    virtual void init_img(png_bytep img_bytes, size_t width, size_t height) {
+        init_compressible(img_bytes, width * height);
+    }
+
+    void SetUp(const ::benchmark::State& state) {
+        input_img_buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        outpng.buf = (uint8_t*)malloc(IMWIDTH * IMHEIGHT * 3);
+        /* Using malloc rather than zng_alloc so that we can call realloc.
+         * IMWIDTH * IMHEIGHT is likely to be more than enough bytes, though,
+         * given that a simple run length encoding already pretty much can
+         * reduce to this */
+        outpng.len = 0;
+        outpng.buf_rem = IMWIDTH * IMHEIGHT * 3;
+        assert(input_img_buf != NULL);
+        assert(outpng.buf != NULL);
+        init_img(input_img_buf, IMWIDTH, IMHEIGHT);
+    }
+
+    /* State in this circumstance will convey the compression level */
+    void Bench(benchmark::State &state) {
+        for (auto _ : state) {
+            encode_png((png_bytep)input_img_buf, &outpng, state.range(0), IMWIDTH, IMHEIGHT);
+            /* Rewind the output buffer for the next pass: everything just written
+             * becomes free space again on top of whatever was still unused */
+            outpng.buf_rem += outpng.len;
+            outpng.len = 0;
+        }
+    }
+
+    void TearDown(const ::benchmark::State &state) {
+        free(input_img_buf);
+        free(outpng.buf);
+    }
+};
+
+BENCHMARK_DEFINE_F(png_encode, encode_compressible)(benchmark::State &state) {
+    Bench(state);
+}
+BENCHMARK_REGISTER_F(png_encode, encode_compressible)->DenseRange(0, 9, 1)->Unit(benchmark::kMicrosecond);
diff --git a/test/benchmarks/benchmark_png_shared.h b/test/benchmarks/benchmark_png_shared.h
new file mode 100644
index 000000000..1b29d3be9
--- /dev/null
+++ b/test/benchmarks/benchmark_png_shared.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#define IMWIDTH 1024
+#define IMHEIGHT 1024
+
+extern "C" {
+#  include <png.h>
+}
+
+typedef struct _png_dat {
+    uint8_t *buf;
+    int64_t len;
+    size_t buf_rem;
+} png_dat;
+
+typedef struct _png_parse_dat {
+    uint8_t *cur_pos;
+} png_parse_dat;
+
+/* Write a customized write callback so that we write back to an in-memory buffer.
+ * This allows the testing to not involve disk IO */
+static void png_write_cb(png_structp pngp, png_bytep data, png_size_t len) {
+    png_dat *dat = (png_dat*)png_get_io_ptr(pngp);
+
+    /* realloc double the requested buffer size to prevent excessive reallocs */
+    if (dat->buf_rem < len) {
+        /* Keep the old pointer until realloc is known to have succeeded */
+        uint8_t *grown = (uint8_t*)realloc(dat->buf, dat->len + dat->buf_rem + 2 * len);
+
+        if (!grown) {
+            /* Pretty unlikely but we'll put it here just in case */
+            fprintf(stderr, "realloc failed, exiting\n");
+            exit(1);
+        }
+
+        dat->buf = grown;
+        dat->buf_rem += 2 * len;
+    }
+
+    memcpy(dat->buf + dat->len, data, len);
+    dat->len += len;
+    dat->buf_rem -= len;
+}
+
+/* Fill buf with num_pix RGB pixels (3 bytes each, no alpha channel). It doesn't
+ * actually matter what we make this, but for the sake of a reasonable test image
+ * it is three equal bands of pure red, then pure green, then pure blue */
+static void init_compressible(png_bytep buf, size_t num_pix) {
+    size_t red_stop = num_pix / 3;
+    size_t green_stop = 2 * num_pix / 3;
+
+    for (size_t i = 0; i < num_pix; ++i) {
+        png_bytep px = buf + 3 * i;
+        px[0] = (i < red_stop) ? 255 : 0;
+        px[1] = (i >= red_stop && i < green_stop) ? 255 : 0;
+        px[2] = (i >= green_stop) ? 255 : 0;
+    }
+}
+
+/* Encode the width x height 8-bit RGB image in buf (3 * width bytes per row) as a PNG
+ * at zlib level comp_level, appending the stream to outpng through png_write_cb */
+static inline void encode_png(png_bytep buf, png_dat *outpng, int32_t comp_level, uint32_t width, uint32_t height) {
+    png_structp png = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+
+    /* Most of this error handling is _likely_ not necessary. Likewise it's likely
+     * a lot of this stuff can be done in the setup function to avoid measuring this
+     * fixed setup time, but for now we'll do it here */
+    if (!png) abort();
+
+    png_infop info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_write_fn(png, outpng, png_write_cb, NULL);
+    png_bytep *png_row_ptrs = new png_bytep[height];
+    for (size_t i = 0; i < height; ++i) {
+        png_row_ptrs[i] = buf + 3 * i * width;
+    }
+
+    png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
+                 PNG_FILTER_TYPE_DEFAULT);
+
+    png_write_info(png, info);
+    png_set_compression_level(png, comp_level);
+    png_set_filter(png, 0, PNG_FILTER_NONE);
+    png_write_image(png, (png_bytepp)png_row_ptrs);
+    png_write_end(png, NULL);
+    png_destroy_write_struct(&png, &info);
+    delete[] png_row_ptrs;
+}
+
+/* libpng read callback: hand out the next bytes_to_read bytes of the in-memory PNG */
+static void read_from_pngdat(png_structp png, png_bytep out, png_size_t bytes_to_read) {
+    png_parse_dat *io = (png_parse_dat*)png_get_io_ptr(png);
+    memcpy(out, io->cur_pos, bytes_to_read);
+    io->cur_pos += bytes_to_read;
+}
+
+/* Decode the RGB PNG at dat->cur_pos into *out_bytes, which holds in_size bytes and is
+ * grown with realloc when the image needs more. width and height receive the image
+ * dimensions; the return value is the decoded size in bytes */
+static inline int decode_png(png_parse_dat *dat, png_bytepp out_bytes, size_t in_size, uint32_t &width, uint32_t &height) {
+    png_structp png = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+    if (!png) abort();
+
+    png_infop info = png_create_info_struct(png);
+    if (!info) abort();
+
+    png_set_read_fn(png, dat, read_from_pngdat);
+    png_read_info(png, info);
+
+    int bit_depth = 0, color_type = -1;
+    png_get_IHDR(png, info, &width, &height, &bit_depth, &color_type, NULL, NULL, NULL);
+
+    if (color_type != PNG_COLOR_TYPE_RGB) {
+        fprintf(stderr, "expected an 8 bpp RGB image\n");
+        abort();
+    }
+
+    /* Let libpng report the row stride so all three channels are accounted for */
+    size_t row_bytes = png_get_rowbytes(png, info);
+    size_t im_size = row_bytes * height;
+
+    if (im_size > in_size) {
+        png_bytep grown = (png_bytep)realloc(*out_bytes, im_size);
+        if (!grown) {
+            fprintf(stderr, "realloc failed, exiting\n");
+            exit(1);
+        }
+        *out_bytes = grown;
+    }
+
+    png_bytep *out_rows = new png_bytep[height];
+    for (size_t i = 0; i < height; ++i)
+        out_rows[i] = *out_bytes + row_bytes * i;
+
+    /* Read as many rows as the image really has, not the synthetic IMHEIGHT */
+    png_read_rows(png, out_rows, NULL, height);
+    png_destroy_read_struct(&png, &info, NULL);
+    delete[] out_rows;
+
+    return (int)im_size;
+}