From: Nathan Moinvaziri Date: Sat, 8 Jan 2022 21:28:41 +0000 (-0800) Subject: Added adler32, compare256, crc32, and slide_hash benchmarks using Google Benchmark. X-Git-Tag: 2.1.0-beta1~441 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6f179fd3014b61e77bc22245026f6630b05a863d;p=thirdparty%2Fzlib-ng.git Added adler32, compare256, crc32, and slide_hash benchmarks using Google Benchmark. Co-authored-by: Adam Stylinski --- diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 5662ed5d8..918be18b3 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -18,13 +18,19 @@ jobs: cmake-args: -DWITH_SANITIZER=Address codecov: ubuntu_gcc + - name: Ubuntu GCC Benchmark + os: ubuntu-latest + compiler: gcc + cmake-args: -DWITH_BENCHMARKS=ON + codecov: ubuntu_gcc_benchmark + - name: Ubuntu GCC Symbol Prefix os: ubuntu-latest compiler: gcc cmake-args: -DZLIB_SYMBOL_PREFIX=zTest_ codecov: ubuntu_gcc_sprefix - - name: Ubuntu GCC Compat SPrefix + - name: Ubuntu GCC Compat Symbol Prefix os: ubuntu-latest compiler: gcc cmake-args: -DZLIB_COMPAT=ON -DZLIB_SYMBOL_PREFIX=zTest_ diff --git a/.github/workflows/configure.yml b/.github/workflows/configure.yml index 6458673a1..2c72afb8a 100644 --- a/.github/workflows/configure.yml +++ b/.github/workflows/configure.yml @@ -176,12 +176,12 @@ jobs: compiler: gcc configure-args: --warn --zlib-compat --static --with-dfltcc-deflate --with-dfltcc-inflate - - name: macOS GCC symbol prefix + - name: macOS GCC Symbol Prefix os: macOS-latest compiler: gcc configure-args: --sprefix=zTest_ - - name: macOS GCC symbol prefix & compat + - name: macOS GCC Symbol Prefix & Compat os: macOS-latest compiler: gcc configure-args: --zlib-compat --sprefix=zTest_ diff --git a/CMakeLists.txt b/CMakeLists.txt index 104ff163c..5391fd5c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ option(ZLIB_COMPAT "Compile with zlib compatible API" OFF) option(ZLIB_ENABLE_TESTS "Build test 
binaries" ON) option(ZLIB_DUAL_LINK "Dual link tests against system zlib" OFF) option(WITH_FUZZERS "Build test/fuzz" OFF) +option(WITH_BENCHMARKS "Build test/benchmarks" OFF) option(WITH_OPTIM "Build with optimisation" ON) option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF) option(WITH_NEW_STRATEGIES "Use new strategies" ON) @@ -922,6 +923,7 @@ set(ZLIB_PUBLIC_HDRS set(ZLIB_PRIVATE_HDRS adler32_p.h chunkset_tpl.h + cpu_features.h crc32_p.h crc32_tbl.h crc32_comb_tbl.h @@ -948,6 +950,7 @@ set(ZLIB_SRCS chunkset.c compare256.c compress.c + cpu_features.c crc32.c crc32_comb.c crc32_fold.c @@ -1288,6 +1291,10 @@ if(ZLIB_ENABLE_TESTS) endforeach() endif() + if(WITH_BENCHMARKS) + add_subdirectory(test/benchmarks) + endif() + macro(test_minigzip name path) # Construct compression arguments for minigzip set(compress_args -k -c) @@ -1446,6 +1453,7 @@ add_feature_info(ZLIB_ENABLE_TESTS ZLIB_ENABLE_TESTS "Build test binaries") add_feature_info(ZLIB_DUAL_LINK ZLIB_DUAL_LINK "Dual link tests against system zlib") add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support") add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz") +add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks") add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation") add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies") add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS diff --git a/Makefile.in b/Makefile.in index 8f89d0a9c..ca2b1127f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -78,6 +78,7 @@ OBJZ = \ chunkset.o \ compare256.o \ compress.o \ + cpu_features.o \ crc32.o \ crc32_comb.o \ crc32_fold.o \ @@ -114,6 +115,7 @@ PIC_OBJZ = \ chunkset.lo \ compare256.lo \ compress.lo \ + cpu_features.lo \ crc32.lo \ crc32_comb.lo \ crc32_fold.lo \ diff --git a/README.md b/README.md index 599fdd957..5b4834d7b 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ Build Options | 
WITH_NATIVE_INSTRUCTIONS | --native | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | | WITH_SANITIZER | --with-sanitizer | Build with sanitizer (memory, address, undefined) | OFF | | WITH_FUZZERS | --with-fuzzers | Build test/fuzz | OFF | +| WITH_BENCHMARKS | | Build test/benchmarks | OFF | | WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | | WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | diff --git a/arch/x86/x86.c b/arch/x86/x86.c index 32baf8a74..66bfa4e28 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -1,5 +1,4 @@ -/* - * x86 feature check +/* x86.c - x86 feature check * * Copyright (C) 2013 Intel Corporation. All rights reserved. * Author: @@ -119,7 +118,7 @@ void Z_INTERNAL x86_check_features(void) { x86_cpu_well_suited_avx512 = 1; } else if (model == 0xa && extended_model == 0x6) { /* Icelake server */ - x86_cpu_well_suited_avx512 = 1; + x86_cpu_well_suited_avx512 = 1; } else if (model == 0xf && extended_model == 0x8) { /* Saphire rapids */ x86_cpu_well_suited_avx512 = 1; diff --git a/arch/x86/x86.h b/arch/x86/x86.h index 00f8d9efc..0cb79e69a 100644 --- a/arch/x86/x86.h +++ b/arch/x86/x86.h @@ -1,4 +1,4 @@ -/* cpu.h -- check for CPU features +/* x86.h -- check for CPU features * Copyright (C) 2013 Intel Corporation Jim Kukunas * For conditions of distribution and use, see copyright notice in zlib.h */ diff --git a/cpu_features.c b/cpu_features.c new file mode 100644 index 000000000..70bfcb698 --- /dev/null +++ b/cpu_features.c @@ -0,0 +1,23 @@ +/* cpu_features.c -- CPU architecture feature check + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "zutil.h" + +Z_INTERNAL void cpu_check_features(void) { + static int features_checked = 0; + if (features_checked) + return; +#if defined(X86_FEATURES) + x86_check_features(); +#elif defined(ARM_FEATURES) + 
arm_check_features(); +#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) + power_check_features(); +#elif defined(S390_FEATURES) + s390_check_features(); +#endif + features_checked = 1; +} diff --git a/cpu_features.h b/cpu_features.h new file mode 100644 index 000000000..4cd1ed7b2 --- /dev/null +++ b/cpu_features.h @@ -0,0 +1,190 @@ +/* cpu_features.h -- CPU architecture feature check + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CPU_FEATURES +#define CPU_FEATURES + +#include "deflate.h" +#include "crc32_fold.h" + +#ifdef X86_FEATURES +# include "fallback_builtins.h" +#endif + +extern void cpu_check_features(void); /* idempotent: detection runs only on the first call */ + +/* update_hash */ +extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val); +#ifdef X86_SSE42_CRC_HASH +extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val); +#elif defined(ARM_ACLE_CRC_HASH) +extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val); +#endif + +/* insert_string */ +extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); +#ifdef X86_SSE42_CRC_HASH +extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count); +#elif defined(ARM_ACLE_CRC_HASH) +extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); +#endif + +/* quick_insert_string */ +extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); +#ifdef X86_SSE42_CRC_HASH +extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str); +#elif defined(ARM_ACLE_CRC_HASH) +extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); +#endif + +/* slide_hash */ +#ifdef X86_SSE2 +void slide_hash_sse2(deflate_state *s); +#elif defined(ARM_NEON_SLIDEHASH) +void slide_hash_neon(deflate_state *s); +#endif +#if defined(PPC_VMX_SLIDEHASH) +void slide_hash_vmx(deflate_state *s); 
+#endif +#if defined(POWER8_VSX_SLIDEHASH) +void slide_hash_power8(deflate_state *s); +#endif +#ifdef X86_AVX2 +void slide_hash_avx2(deflate_state *s); +#endif + +/* adler32 */ +extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); +#ifdef ARM_NEON_ADLER32 +extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef PPC_VMX_ADLER32 +extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_SSE41_ADLER32 +extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_SSSE3_ADLER32 +extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_AVX2_ADLER32 +extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_AVX512_ADLER32 +extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); +#endif +#ifdef POWER8_VSX_ADLER32 +extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); +#endif + +/* CRC32 folding */ +#ifdef X86_PCLMULQDQ_CRC +extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc); +extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc); +#endif + +/* memory chunking */ +extern uint32_t chunksize_c(void); +extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); 
+#ifdef X86_SSE2_CHUNKSET +extern uint32_t chunksize_sse2(void); +extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif +#ifdef X86_AVX_CHUNKSET +extern uint32_t chunksize_avx(void); +extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif +#ifdef ARM_NEON_CHUNKSET +extern uint32_t chunksize_neon(void); +extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif +#ifdef POWER8_VSX_CHUNKSET +extern uint32_t chunksize_power8(void); +extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_power8(uint8_t 
*out, unsigned dist, unsigned len, unsigned left); +#endif + +/* CRC32 */ +extern uint32_t crc32_byfour(uint32_t crc, const unsigned char *buf, uint64_t len); +#ifdef ARM_ACLE_CRC_HASH +extern uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len); +#elif defined(POWER8_VSX_CRC32) +extern uint32_t crc32_power8(uint32_t crc, const unsigned char *buf, uint64_t len); +#elif defined(S390_CRC32_VX) +extern uint32_t s390_crc32_vx(uint32_t crc, const unsigned char *buf, uint64_t len); +#endif + +/* compare256 */ +extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1); +#ifdef UNALIGNED_OK +extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1); +extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1); +#ifdef UNALIGNED64_OK +extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1); +#endif +#ifdef X86_SSE42_CMP_STR +extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1); +#endif +#endif + +/* longest_match */ +extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED_OK +extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match); +extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED64_OK +extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match); +#endif +#ifdef X86_SSE42_CMP_STR +extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match); +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match); +#endif +#endif + +/* longest_match_slow */ +extern uint32_t 
longest_match_slow_c(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED_OK +extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match); +extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED64_OK +extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); +#endif +#ifdef X86_SSE42_CMP_STR +extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match); +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match); +#endif +#endif + +#endif diff --git a/functable.c b/functable.c index 3cfa13ade..96fbc9831 100644 --- a/functable.c +++ b/functable.c @@ -11,206 +11,10 @@ #include "functable.h" -#ifdef X86_FEATURES -# include "fallback_builtins.h" -#endif - -/* update_hash */ -extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val); -#ifdef X86_SSE42_CRC_HASH -extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val); -#elif defined(ARM_ACLE_CRC_HASH) -extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val); -#endif - -/* insert_string */ -extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count); -#ifdef X86_SSE42_CRC_HASH -extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count); -#elif defined(ARM_ACLE_CRC_HASH) -extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count); -#endif - -/* quick_insert_string */ -extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); -#ifdef X86_SSE42_CRC_HASH -extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str); -#elif defined(ARM_ACLE_CRC_HASH) -extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str); -#endif - -/* slide_hash */ -#ifdef X86_SSE2 -void 
slide_hash_sse2(deflate_state *s); -#elif defined(ARM_NEON_SLIDEHASH) -void slide_hash_neon(deflate_state *s); -#endif -#if defined(PPC_VMX_SLIDEHASH) -void slide_hash_vmx(deflate_state *s); -#endif -#if defined(POWER8_VSX_SLIDEHASH) -void slide_hash_power8(deflate_state *s); -#endif -#ifdef X86_AVX2 -void slide_hash_avx2(deflate_state *s); -#endif - -/* adler32 */ -extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); -#ifdef ARM_NEON_ADLER32 -extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef PPC_VMX_ADLER32 -extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef X86_SSE41_ADLER32 -extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef X86_SSSE3_ADLER32 -extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef X86_AVX2_ADLER32 -extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef X86_AVX512_ADLER32 -extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef X86_AVX512VNNI_ADLER32 -extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#ifdef POWER8_VSX_ADLER32 -extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); -#endif - -/* CRC32 folding */ -extern uint32_t crc32_fold_reset_c(crc32_fold *crc); -extern void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern uint32_t crc32_fold_final_c(crc32_fold *crc); - -#ifdef X86_PCLMULQDQ_CRC -extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc); -extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); -extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc); -#endif - -/* memory chunking */ -extern uint32_t chunksize_c(void); -extern uint8_t* 
chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); -extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len); -extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len); -extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#ifdef X86_SSE2_CHUNKSET -extern uint32_t chunksize_sse2(void); -extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); -extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); -extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); -extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef X86_AVX_CHUNKSET -extern uint32_t chunksize_avx(void); -extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); -extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len); -extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len); -extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef ARM_NEON_CHUNKSET -extern uint32_t chunksize_neon(void); -extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); -extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len); -extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len); -extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif -#ifdef POWER8_VSX_CHUNKSET -extern uint32_t 
chunksize_power8(void); -extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len); -extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); -extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len); -extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len); -extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); -#endif - -/* CRC32 */ -Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t); - -#ifdef ARM_ACLE_CRC_HASH -extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t); -#elif defined(POWER8_VSX_CRC32) -extern uint32_t crc32_power8(uint32_t, const unsigned char *, uint64_t); -#elif defined(S390_CRC32_VX) -extern uint32_t s390_crc32_vx(uint32_t, const unsigned char *, uint64_t); -#endif - -/* compare256 */ -extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1); -#ifdef UNALIGNED_OK -extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1); -extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1); -#ifdef UNALIGNED64_OK -extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1); -#endif -#ifdef X86_SSE42_CMP_STR -extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1); -#endif -#endif - -/* longest_match */ -extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED_OK -extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match); -extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED64_OK -extern uint32_t 
longest_match_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#ifdef X86_SSE42_CMP_STR -extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match); -#endif -#endif - -/* longest_match_slow */ -extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED_OK -extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match); -extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); -#ifdef UNALIGNED64_OK -extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); -#endif -#ifdef X86_SSE42_CMP_STR -extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match); -#endif -#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match); -#endif -#endif +#include "cpu_features.h" Z_INTERNAL Z_TLS struct functable_s functable; -Z_INTERNAL void cpu_check_features(void) -{ - static int features_checked = 0; - if (features_checked) - return; -#if defined(X86_FEATURES) - x86_check_features(); -#elif defined(ARM_FEATURES) - arm_check_features(); -#elif defined(PPC_FEATURES) || defined(POWER_FEATURES) - power_check_features(); -#elif defined(S390_FEATURES) - s390_check_features(); -#endif - features_checked = 1; -} - /* stub functions */ Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) { // Initialize default diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt new file mode 100644 index 000000000..c3d841023 --- /dev/null +++ b/test/benchmarks/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.12) + +include(CheckCCompilerFlag) +include(FeatureSummary) +include(FetchContent) + +enable_language(CXX) + +# 
Search for Google benchmark package +find_package(benchmark QUIET) +if(NOT benchmark_FOUND) + # Fetch google benchmark source code from official repository + set(BENCHMARK_ENABLE_TESTING OFF) + FetchContent_Declare(benchmark + GIT_REPOSITORY https://github.com/google/benchmark.git) + FetchContent_MakeAvailable(benchmark) + FetchContent_GetProperties(benchmark) + + if(NOT benchmark_POPULATED) + FetchContent_Populate(benchmark) + endif() +endif() + +add_executable(benchmark_zlib + benchmark_adler32.cc + benchmark_compare256.cc + benchmark_crc32.cc + benchmark_main.cc + benchmark_slidehash.cc + ) + +target_include_directories(benchmark_zlib PRIVATE + ${CMAKE_SOURCE_DIR} + ${CMAKE_BINARY_DIR} + ${benchmark_SOURCE_DIR}/benchmark/include) + +target_link_libraries(benchmark_zlib zlibstatic benchmark::benchmark) +if(WIN32) + target_link_libraries(benchmark_zlib shlwapi) +endif() + +if(ZLIB_ENABLE_TESTS) + add_test(NAME benchmark_zlib + COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib>) +endif() diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md new file mode 100644 index 000000000..b005027e0 --- /dev/null +++ b/test/benchmarks/README.md @@ -0,0 +1,19 @@ +## Benchmarks + +These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark). 
+ +*Repetitions* + +To increase the number of times each benchmark iteration is run use: + +``` +--benchmark_repetitions=20 +``` + +*Filters* + +To filter out which benchmarks are performed use: + +``` +--benchmark_filter="adler32*" +``` diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc new file mode 100644 index 000000000..682908880 --- /dev/null +++ b/test/benchmarks/benchmark_adler32.cc @@ -0,0 +1,94 @@ +/* benchmark_adler32.cc -- benchmark adler32 variants + * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include +#include +#include +#include + +#include + +extern "C" { +# include "zbuild.h" +# include "zutil.h" +# include "zutil_p.h" +# include "cpu_features.h" +} + +#define MAX_RANDOM_INTS (1024 * 1024) +#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t)) + +typedef uint32_t (*adler32_func)(uint32_t adler, const unsigned char *buf, size_t len); + +class adler32: public benchmark::Fixture { +private: + uint32_t *random_ints; + +public: + void SetUp(const ::benchmark::State& state) { + /* Control the alignment so that we have the best case scenario for loads. With + * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load. + * And while this is a realistic scenario, it makes it difficult to compare benchmark + * to benchmark because one allocation could have been aligned perfectly for the loads + * while the subsequent one happened to not be. 
This is not to be advantageous to AVX512 + * (indeed, all lesser SIMD implementations benefit from this aligned allocation), but to + * control the _consistency_ of the results */ + random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE); + assert(random_ints != NULL); + + for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) { + random_ints[i] = rand(); + } + } + + void Bench(benchmark::State& state, adler32_func adler32) { + uint32_t hash = 0; + + for (auto _ : state) { + hash = adler32(hash, (const unsigned char *)random_ints, state.range(0)); + } + + benchmark::DoNotOptimize(hash); + } + + void TearDown(const ::benchmark::State& state) { + zng_free(random_ints); + } +}; + +#define BENCHMARK_ADLER32(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \ + if (!support_flag) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(adler32, name)->Range(2048, MAX_RANDOM_INTS_SIZE); + +BENCHMARK_ADLER32(c, adler32_c, 1); + +#ifdef ARM_NEON_ADLER32 +BENCHMARK_ADLER32(neon, adler32_neon, arm_cpu_has_neon); +#elif defined(POWER8_VSX_ADLER32) +BENCHMARK_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07); +#elif defined(PPC_VMX_ADLER32) +BENCHMARK_ADLER32(vmx, adler32_vmx, power_cpu_has_altivec); +#endif + +#ifdef X86_SSSE3_ADLER32 +BENCHMARK_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3); +#endif +#ifdef X86_SSE41_ADLER32 +BENCHMARK_ADLER32(sse41, adler32_sse41, x86_cpu_has_sse41); +#endif +#ifdef X86_AVX2_ADLER32 +BENCHMARK_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2); +#endif +#ifdef X86_AVX512_ADLER32 +BENCHMARK_ADLER32(avx512, adler32_avx512, x86_cpu_has_avx512); +#endif +#ifdef X86_AVX512VNNI_ADLER32 +BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, x86_cpu_has_avx512vnni); +#endif diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc new file mode 100644 index 000000000..063d110fa --- /dev/null +++ 
b/test/benchmarks/benchmark_compare256.cc @@ -0,0 +1,84 @@ +/* benchmark_compare256.cc -- benchmark compare256 variants + * Copyright (C) 2022 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include +#include +#include + +#include + +extern "C" { +# include "zbuild.h" +# include "zutil.h" +# include "zutil_p.h" +# include "cpu_features.h" +} + +#define MAX_COMPARE_SIZE (256) + +typedef uint32_t (*compare256_func)(const unsigned char *src0, const unsigned char *src1); + +class compare256: public benchmark::Fixture { +private: + uint8_t *str1; + uint8_t *str2; + +public: + void SetUp(const ::benchmark::State& state) { + str1 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE); + assert(str1 != NULL); + memset(str1, 'a', MAX_COMPARE_SIZE); + + str2 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE); + assert(str2 != NULL); + memset(str2, 'a', MAX_COMPARE_SIZE); + } + + void Bench(benchmark::State& state, compare256_func compare256) { + int32_t match_len = (int32_t)state.range(0); /* bytes that match before the planted mismatch */ + uint32_t len = 0; /* stays defined even when the loop below is skipped */ + + if (match_len < MAX_COMPARE_SIZE) str2[match_len] = 0; /* Range() reaches MAX_COMPARE_SIZE: full match, nothing to plant */ + for (auto _ : state) { + len = compare256((const uint8_t *)str1, (const uint8_t *)str2); + } + if (match_len < MAX_COMPARE_SIZE) str2[match_len] = 'a'; /* restore the buffer for the next run */ + + benchmark::DoNotOptimize(len); + } + + void TearDown(const ::benchmark::State& state) { + zng_free(str1); + zng_free(str2); + } +}; + +#define BENCHMARK_COMPARE256(name, fptr, support_flag) \ + BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \ + if (!support_flag) { \ + state.SkipWithError("CPU does not support " #name); \ + } \ + Bench(state, fptr); \ + } \ + BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE); + +BENCHMARK_COMPARE256(c, compare256_c, 1); + +#ifdef UNALIGNED_OK +BENCHMARK_COMPARE256(unaligned_16, compare256_unaligned_16, 1); +#ifdef HAVE_BUILTIN_CTZ +BENCHMARK_COMPARE256(unaligned_32, compare256_unaligned_32, 1); +#endif +#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) +BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1); 
+#endif
+#endif
+
+#ifdef X86_SSE42_CMP_STR
+BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
+#endif
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
new file mode 100644
index 000000000..69d11a4d1
--- /dev/null
+++ b/test/benchmarks/benchmark_crc32.cc
@@ -0,0 +1,85 @@
+/* benchmark_crc32.cc -- benchmark crc32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*crc32_func)(uint32_t crc32, const unsigned char * buf, uint64_t len);
+
+/* Fixture owning a buffer of MAX_RANDOM_INTS rand() values; Bench() hashes its
+ * first state.range(0) bytes with the crc32 variant passed in. */
+class crc32: public benchmark::Fixture {
+private:
+    uint32_t *random_ints;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, crc32_func crc32) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            /* Each call is seeded with the result of the previous one. */
+            hash = crc32(hash, (const unsigned char *)random_ints, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints);
+    }
+};
+
+/* Define and register benchmark `name` running `fptr`. state.SkipWithError()
+ * is called when `support_flag` is false; Bench() is still invoked afterwards. */
+#define BENCHMARK_CRC32(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(crc32, name)->Range(1, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_CRC32(byfour, crc32_byfour, 1);
+
+#ifdef ARM_ACLE_CRC_HASH
+BENCHMARK_CRC32(acle, crc32_acle, arm_cpu_has_crc32);
+#elif defined(POWER8_VSX_CRC32)
+BENCHMARK_CRC32(power8, crc32_power8, power_cpu_has_arch_2_07);
+#elif defined(S390_CRC32_VX)
+BENCHMARK_CRC32(vx, s390_crc32_vx, s390_cpu_has_vx);
+#elif defined(X86_PCLMULQDQ_CRC)
+/* CRC32 fold does a memory copy while hashing */
+uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
+    crc32_fold ALIGNED_(16) crc_state;
+    crc32_fold_reset_pclmulqdq(&crc_state);
+    crc32_fold_copy_pclmulqdq(&crc_state, (uint8_t *)buf, buf, len);
+    return crc32_fold_final_pclmulqdq(&crc_state);
+}
+BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, x86_cpu_has_pclmulqdq);
+#endif
diff --git a/test/benchmarks/benchmark_main.cc b/test/benchmarks/benchmark_main.cc
new file mode 100644
index 000000000..81582c353
--- /dev/null
+++ b/test/benchmarks/benchmark_main.cc
@@ -0,0 +1,26 @@
+/* benchmark_main.cc -- benchmark suite main entry point
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "cpu_features.h"
+}
+
+int main(int argc, char** argv) {
+    /* NOTE(review): presumably sets the *_cpu_has_* flags the benchmark macros test -- confirm in cpu_features.c. */
+    cpu_check_features();
+
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+
+    return EXIT_SUCCESS;
+}
diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc
new file mode 100644
index 000000000..174ea0931
--- /dev/null
+++ b/test/benchmarks/benchmark_slidehash.cc
@@ -0,0 +1,98 @@
+/* benchmark_slidehash.cc -- benchmark slide_hash variants
+ * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+#  include "deflate.h"
+}
+
+#include <benchmark/benchmark.h>
+
+#define MAX_RANDOM_INTS 32768
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+/* Fixture owning a deflate_state whose head (HASH_SIZE entries) and prev
+ * (MAX_RANDOM_INTS entries) tables are filled with rand() values. */
+class slide_hash: public benchmark::Fixture {
+private:
+    uint16_t *l0;
+    uint16_t *l1;
+    deflate_state *s_g;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        l0 = (uint16_t *)zng_alloc(HASH_SIZE * sizeof(uint16_t));
+
+        for (int32_t i = 0; i < HASH_SIZE; i++) {
+            l0[i] = rand();
+        }
+
+        l1 = (uint16_t *)zng_alloc(MAX_RANDOM_INTS * sizeof(uint16_t));
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            l1[i] = rand();
+        }
+
+        /* Only head and prev are set here; Bench() fills in w_size. */
+        deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
+        s->head = l0;
+        s->prev = l1;
+        s_g = s;
+    }
+
+    void Bench(benchmark::State& state, slide_hash_func slide_hash) {
+        s_g->w_size = (uint32_t)state.range(0);
+
+        for (auto _ : state) {
+            slide_hash(s_g);
+            benchmark::DoNotOptimize(s_g);
+        }
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(l0);
+        zng_free(l1);
+        /* s_g was obtained from malloc() in SetUp(), so release it with free(). */
+        free(s_g);
+    }
+};
+
+/* Define and register benchmark `name` running `fptr`. state.SkipWithError()
+ * is called when `support_flag` is false; Bench() is still invoked afterwards. */
+#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(1024, MAX_RANDOM_INTS);
+
+BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+
+#ifdef ARM_NEON_SLIDEHASH
+BENCHMARK_SLIDEHASH(neon, slide_hash_neon, arm_cpu_has_neon);
+#endif
+#ifdef POWER8_VSX_SLIDEHASH
+BENCHMARK_SLIDEHASH(power8, slide_hash_power8, power_cpu_has_arch_2_07);
+#endif
+#ifdef PPC_VMX_SLIDEHASH
+BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, power_cpu_has_altivec);
+#endif
+
+#ifdef X86_SSE2
+BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, x86_cpu_has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, x86_cpu_has_avx2);
+#endif
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64
index 82ca6c2dc..e65123360 100644
--- a/win32/Makefile.a64
+++ b/win32/Makefile.a64
@@ -48,6 +48,7 @@ OBJS = \
chunkset.obj \ compare256.obj \ compress.obj \ + cpu_features.obj \ crc32.obj \ crc32_comb.obj \ crc32_fold.obj \ @@ -185,6 +186,7 @@ gzread.obj: $(SRCDIR)/gzread.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/z gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h +cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index ace50794e..29ce95581 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -51,6 +51,7 @@ OBJS = \ chunkset.obj \ compare256.obj \ compress.obj \ + cpu_features.obj \ crc32.obj \ crc32_comb.obj \ crc32_fold.obj \ @@ -197,6 +198,7 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR) compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h +cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index cfdfcca74..0059606c5 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -57,6 +57,7 @@ OBJS = \ compare256_avx2.obj \ 
compare256_sse42.obj \ compress.obj \ + cpu_features.obj \ crc32.obj \ crc32_comb.obj \ crc32_fold.obj \ @@ -191,6 +192,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h +cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h