cmake-args: -DWITH_SANITIZER=Address
codecov: ubuntu_gcc
+ - name: Ubuntu GCC Benchmark
+ os: ubuntu-latest
+ compiler: gcc
+ cmake-args: -DWITH_BENCHMARKS=ON
+ codecov: ubuntu_gcc_benchmark
+
- name: Ubuntu GCC Symbol Prefix
os: ubuntu-latest
compiler: gcc
cmake-args: -DZLIB_SYMBOL_PREFIX=zTest_
codecov: ubuntu_gcc_sprefix
- - name: Ubuntu GCC Compat SPrefix
+ - name: Ubuntu GCC Compat Symbol Prefix
os: ubuntu-latest
compiler: gcc
cmake-args: -DZLIB_COMPAT=ON -DZLIB_SYMBOL_PREFIX=zTest_
compiler: gcc
configure-args: --warn --zlib-compat --static --with-dfltcc-deflate --with-dfltcc-inflate
- - name: macOS GCC symbol prefix
+ - name: macOS GCC Symbol Prefix
os: macOS-latest
compiler: gcc
configure-args: --sprefix=zTest_
- - name: macOS GCC symbol prefix & compat
+ - name: macOS GCC Symbol Prefix & Compat
os: macOS-latest
compiler: gcc
configure-args: --zlib-compat --sprefix=zTest_
option(ZLIB_ENABLE_TESTS "Build test binaries" ON)
option(ZLIB_DUAL_LINK "Dual link tests against system zlib" OFF)
option(WITH_FUZZERS "Build test/fuzz" OFF)
+option(WITH_BENCHMARKS "Build test/benchmarks" OFF)
option(WITH_OPTIM "Build with optimisation" ON)
option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF)
option(WITH_NEW_STRATEGIES "Use new strategies" ON)
set(ZLIB_PRIVATE_HDRS
adler32_p.h
chunkset_tpl.h
+ cpu_features.h
crc32_p.h
crc32_tbl.h
crc32_comb_tbl.h
chunkset.c
compare256.c
compress.c
+ cpu_features.c
crc32.c
crc32_comb.c
crc32_fold.c
endforeach()
endif()
+ if(WITH_BENCHMARKS)
+ add_subdirectory(test/benchmarks)
+ endif()
+
macro(test_minigzip name path)
# Construct compression arguments for minigzip
set(compress_args -k -c)
add_feature_info(ZLIB_DUAL_LINK ZLIB_DUAL_LINK "Dual link tests against system zlib")
add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support")
add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz")
+add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks")
add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation")
add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS
chunkset.o \
compare256.o \
compress.o \
+ cpu_features.o \
crc32.o \
crc32_comb.o \
crc32_fold.o \
chunkset.lo \
compare256.lo \
compress.lo \
+ cpu_features.lo \
crc32.lo \
crc32_comb.lo \
crc32_fold.lo \
| WITH_NATIVE_INSTRUCTIONS | --native | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
| WITH_SANITIZER | --with-sanitizer | Build with sanitizer (memory, address, undefined) | OFF |
| WITH_FUZZERS | --with-fuzzers | Build test/fuzz | OFF |
+| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |
-/*
- * x86 feature check
+/* x86.c -- x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
x86_cpu_well_suited_avx512 = 1;
} else if (model == 0xa && extended_model == 0x6) {
/* Icelake server */
- x86_cpu_well_suited_avx512 = 1;
+ x86_cpu_well_suited_avx512 = 1;
} else if (model == 0xf && extended_model == 0x8) {
            /* Sapphire Rapids */
x86_cpu_well_suited_avx512 = 1;
-/* cpu.h -- check for CPU features
+/* x86.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
--- /dev/null
+/* cpu_features.c -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+Z_INTERNAL void cpu_check_features(void) {
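+    /* Run the architecture-specific feature detection only once;
+     * subsequent calls are no-ops. */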
+ static int features_checked = 0;
+ if (features_checked)
+ return;
+#if defined(X86_FEATURES)
+ x86_check_features();
+#elif defined(ARM_FEATURES)
+ arm_check_features();
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+ power_check_features();
+#elif defined(S390_FEATURES)
+ s390_check_features();
+#endif
+ features_checked = 1;
+}
--- /dev/null
+/* cpu_features.h -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CPU_FEATURES
+#define CPU_FEATURES
+
+#include "deflate.h"
+#include "crc32_fold.h"
+
+#ifdef X86_FEATURES
+# include "fallback_builtins.h"
+#endif
+
+extern void cpu_check_features(void);
+
+/* update_hash */
+extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
+#ifdef X86_SSE42_CRC_HASH
+extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
+#endif
+
+/* insert_string */
+extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
+#ifdef X86_SSE42_CRC_HASH
+extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
+#endif
+
+/* quick_insert_string */
+extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
+#ifdef X86_SSE42_CRC_HASH
+extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
+#endif
+
+/* slide_hash */
+#ifdef X86_SSE2
+void slide_hash_sse2(deflate_state *s);
+#elif defined(ARM_NEON_SLIDEHASH)
+void slide_hash_neon(deflate_state *s);
+#endif
+#if defined(PPC_VMX_SLIDEHASH)
+void slide_hash_vmx(deflate_state *s);
+#endif
+#if defined(POWER8_VSX_SLIDEHASH)
+void slide_hash_power8(deflate_state *s);
+#endif
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
+#endif
+
+/* adler32 */
+extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
+#ifdef ARM_NEON_ADLER32
+extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef PPC_VMX_ADLER32
+extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_SSE41_ADLER32
+extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_SSSE3_ADLER32
+extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef POWER8_VSX_ADLER32
+extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
+#endif
+
+/* CRC32 folding */
+#ifdef X86_PCLMULQDQ_CRC
+extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
+extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
+#endif
+
+/* memory chunking */
+extern uint32_t chunksize_c(void);
+extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSE2_CHUNKSET
+extern uint32_t chunksize_sse2(void);
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef X86_AVX_CHUNKSET
+extern uint32_t chunksize_avx(void);
+extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef ARM_NEON_CHUNKSET
+extern uint32_t chunksize_neon(void);
+extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+
+/* CRC32 */
+extern uint32_t crc32_byfour(uint32_t crc, const unsigned char *buf, uint64_t len);
+#ifdef ARM_ACLE_CRC_HASH
+extern uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len);
+#elif defined(POWER8_VSX_CRC32)
+extern uint32_t crc32_power8(uint32_t crc, const unsigned char *buf, uint64_t len);
+#elif defined(S390_CRC32_VX)
+extern uint32_t s390_crc32_vx(uint32_t crc, const unsigned char *buf, uint64_t len);
+#endif
+
+/* compare256 */
+extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1);
+#ifdef UNALIGNED_OK
+extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1);
+#ifdef UNALIGNED64_OK
+extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
+#endif
+#endif
+
+/* longest_match */
+extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED_OK
+extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED64_OK
+extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+
+/* longest_match_slow */
+extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED_OK
+extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED64_OK
+extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+
+#endif
#include "functable.h"
-#ifdef X86_FEATURES
-# include "fallback_builtins.h"
-#endif
-
-/* update_hash */
-extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
-#ifdef X86_SSE42_CRC_HASH
-extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
-#endif
-
-/* insert_string */
-extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
-#ifdef X86_SSE42_CRC_HASH
-extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
-#endif
-
-/* quick_insert_string */
-extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
-#ifdef X86_SSE42_CRC_HASH
-extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
-#endif
-
-/* slide_hash */
-#ifdef X86_SSE2
-void slide_hash_sse2(deflate_state *s);
-#elif defined(ARM_NEON_SLIDEHASH)
-void slide_hash_neon(deflate_state *s);
-#endif
-#if defined(PPC_VMX_SLIDEHASH)
-void slide_hash_vmx(deflate_state *s);
-#endif
-#if defined(POWER8_VSX_SLIDEHASH)
-void slide_hash_power8(deflate_state *s);
-#endif
-#ifdef X86_AVX2
-void slide_hash_avx2(deflate_state *s);
-#endif
-
-/* adler32 */
-extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
-#ifdef ARM_NEON_ADLER32
-extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef PPC_VMX_ADLER32
-extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_SSE41_ADLER32
-extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_SSSE3_ADLER32
-extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef POWER8_VSX_ADLER32
-extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
-#endif
-
-/* CRC32 folding */
-extern uint32_t crc32_fold_reset_c(crc32_fold *crc);
-extern void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern uint32_t crc32_fold_final_c(crc32_fold *crc);
-
-#ifdef X86_PCLMULQDQ_CRC
-extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
-extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
-#endif
-
-/* memory chunking */
-extern uint32_t chunksize_c(void);
-extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#ifdef X86_SSE2_CHUNKSET
-extern uint32_t chunksize_sse2(void);
-extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef X86_AVX_CHUNKSET
-extern uint32_t chunksize_avx(void);
-extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef ARM_NEON_CHUNKSET
-extern uint32_t chunksize_neon(void);
-extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef POWER8_VSX_CHUNKSET
-extern uint32_t chunksize_power8(void);
-extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-
-/* CRC32 */
-Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
-
-#ifdef ARM_ACLE_CRC_HASH
-extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
-#elif defined(POWER8_VSX_CRC32)
-extern uint32_t crc32_power8(uint32_t, const unsigned char *, uint64_t);
-#elif defined(S390_CRC32_VX)
-extern uint32_t s390_crc32_vx(uint32_t, const unsigned char *, uint64_t);
-#endif
-
-/* compare256 */
-extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1);
-#ifdef UNALIGNED_OK
-extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1);
-extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1);
-#ifdef UNALIGNED64_OK
-extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#endif
-
-/* longest_match */
-extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED_OK
-extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
-extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED64_OK
-extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#endif
-
-/* longest_match_slow */
-extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED_OK
-extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
-extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED64_OK
-extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#endif
+#include "cpu_features.h"
Z_INTERNAL Z_TLS struct functable_s functable;
-Z_INTERNAL void cpu_check_features(void)
-{
- static int features_checked = 0;
- if (features_checked)
- return;
-#if defined(X86_FEATURES)
- x86_check_features();
-#elif defined(ARM_FEATURES)
- arm_check_features();
-#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
- power_check_features();
-#elif defined(S390_FEATURES)
- s390_check_features();
-#endif
- features_checked = 1;
-}
-
/* stub functions */
Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
// Initialize default
--- /dev/null
+cmake_minimum_required(VERSION 3.14)
+
+include(CheckCCompilerFlag)
+include(FeatureSummary)
+include(FetchContent)
+
+enable_language(CXX)
+
+# Search for Google benchmark package
+find_package(benchmark QUIET)
+if(NOT benchmark_FOUND)
+ # Fetch google benchmark source code from official repository
+ set(BENCHMARK_ENABLE_TESTING OFF)
+ FetchContent_Declare(benchmark
+ GIT_REPOSITORY https://github.com/google/benchmark.git)
+ FetchContent_MakeAvailable(benchmark)
+ FetchContent_GetProperties(benchmark)
+endif()
+
+add_executable(benchmark_zlib
+ benchmark_adler32.cc
+ benchmark_compare256.cc
+ benchmark_crc32.cc
+ benchmark_main.cc
+ benchmark_slidehash.cc
+ )
+
+target_include_directories(benchmark_zlib PRIVATE
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_BINARY_DIR}
+    ${benchmark_SOURCE_DIR}/include)
+
+target_link_libraries(benchmark_zlib zlibstatic benchmark::benchmark)
+if(WIN32)
+ target_link_libraries(benchmark_zlib shlwapi)
+endif()
+
+if(ZLIB_ENABLE_TESTS)
+ add_test(NAME benchmark_zlib
+ COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib>)
+endif()
--- /dev/null
+## Benchmarks
+
+These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
+
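+To build them, enable the `WITH_BENCHMARKS` CMake option (a minimal sketch, assuming an out-of-source build directory):
+
+```
+cmake -DWITH_BENCHMARKS=ON ..
+cmake --build .
+```
+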
+*Repetitions*
+
+To increase the number of times each benchmark is repeated, use:
+
+```
+--benchmark_repetitions=20
+```
+
+*Filters*
+
+To run only a subset of the benchmarks, use:
+
+```
+--benchmark_filter="adler32*"
+```
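+
+Flags can be combined when invoking the benchmark binary (built as `benchmark_zlib` by the CMake target above), for example:
+
+```
+./benchmark_zlib --benchmark_filter="adler32*" --benchmark_repetitions=20
+```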
--- /dev/null
+/* benchmark_adler32.cc -- benchmark adler32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil.h"
+# include "zutil_p.h"
+# include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const unsigned char *buf, size_t len);
+
+class adler32: public benchmark::Fixture {
+private:
+ uint32_t *random_ints;
+
+public:
+ void SetUp(const ::benchmark::State& state) {
+ /* Control the alignment so that we have the best case scenario for loads. With
+ * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+ * And while this is a realistic scenario, it makes it difficult to compare benchmark
+ * to benchmark because one allocation could have been aligned perfectly for the loads
+         * while the subsequent one happened not to be. This is not done to give AVX512 an
+         * advantage (indeed, all lesser SIMD implementations benefit from the aligned
+         * allocation), but to control the _consistency_ of the results. */
+ random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+ assert(random_ints != NULL);
+
+ for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+ random_ints[i] = rand();
+ }
+ }
+
+ void Bench(benchmark::State& state, adler32_func adler32) {
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32(hash, (const unsigned char *)random_ints, state.range(0));
+ }
+
+ benchmark::DoNotOptimize(hash);
+ }
+
+ void TearDown(const ::benchmark::State& state) {
+ zng_free(random_ints);
+ }
+};
+
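+/* Define and register a benchmark for one adler32 implementation; at runtime
+ * the benchmark is skipped with an error if the CPU lacks the needed feature. */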
+#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(adler32, name)->Range(2048, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+BENCHMARK_ADLER32(neon, adler32_neon, arm_cpu_has_neon);
+#elif defined(POWER8_VSX_ADLER32)
+BENCHMARK_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07);
+#elif defined(PPC_VMX_ADLER32)
+BENCHMARK_ADLER32(vmx, adler32_vmx, power_cpu_has_altivec);
+#endif
+
+#ifdef X86_SSSE3_ADLER32
+BENCHMARK_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3);
+#endif
+#ifdef X86_SSE41_ADLER32
+BENCHMARK_ADLER32(sse41, adler32_sse41, x86_cpu_has_sse41);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32(avx512, adler32_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
--- /dev/null
+/* benchmark_compare256.cc -- benchmark compare256 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil.h"
+# include "zutil_p.h"
+# include "cpu_features.h"
+}
+
+#define MAX_COMPARE_SIZE (256)
+
+typedef uint32_t (*compare256_func)(const unsigned char *src0, const unsigned char *src1);
+
+class compare256: public benchmark::Fixture {
+private:
+ uint8_t *str1;
+ uint8_t *str2;
+
+public:
+ void SetUp(const ::benchmark::State& state) {
+ str1 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
+ assert(str1 != NULL);
+ memset(str1, 'a', MAX_COMPARE_SIZE);
+
+ str2 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
+ assert(str2 != NULL);
+ memset(str2, 'a', MAX_COMPARE_SIZE);
+ }
+
+ void Bench(benchmark::State& state, compare256_func compare256) {
+ int32_t match_len = (int32_t)state.range(0);
+        uint32_t len = 0;
+
+ str2[match_len] = 0;
+ for (auto _ : state) {
+ len = compare256((const uint8_t *)str1, (const uint8_t *)str2);
+ }
+ str2[match_len] = 'a';
+
+ benchmark::DoNotOptimize(len);
+ }
+
+ void TearDown(const ::benchmark::State& state) {
+ zng_free(str1);
+ zng_free(str2);
+ }
+};
+
+#define BENCHMARK_COMPARE256(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
+
+BENCHMARK_COMPARE256(c, compare256_c, 1);
+
+#ifdef UNALIGNED_OK
+BENCHMARK_COMPARE256(unaligned_16, compare256_unaligned_16, 1);
+#ifdef HAVE_BUILTIN_CTZ
+BENCHMARK_COMPARE256(unaligned_32, compare256_unaligned_32, 1);
+#endif
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
+#endif
+#endif
+
+#ifdef X86_SSE42_CMP_STR
+BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
+#endif
--- /dev/null
+/* benchmark_crc32.cc -- benchmark crc32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil.h"
+# include "zutil_p.h"
+# include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*crc32_func)(uint32_t crc32, const unsigned char * buf, uint64_t len);
+
+class crc32: public benchmark::Fixture {
+private:
+ uint32_t *random_ints;
+
+public:
+ void SetUp(const ::benchmark::State& state) {
+ random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+ assert(random_ints != NULL);
+
+ for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+ random_ints[i] = rand();
+ }
+ }
+
+ void Bench(benchmark::State& state, crc32_func crc32) {
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = crc32(hash, (const unsigned char *)random_ints, state.range(0));
+ }
+
+ benchmark::DoNotOptimize(hash);
+ }
+
+ void TearDown(const ::benchmark::State& state) {
+ zng_free(random_ints);
+ }
+};
+
+#define BENCHMARK_CRC32(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(crc32, name)->Range(1, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_CRC32(byfour, crc32_byfour, 1);
+
+#ifdef ARM_ACLE_CRC_HASH
+BENCHMARK_CRC32(acle, crc32_acle, arm_cpu_has_crc32);
+#elif defined(POWER8_VSX_CRC32)
+BENCHMARK_CRC32(power8, crc32_power8, power_cpu_has_arch_2_07);
+#elif defined(S390_CRC32_VX)
+BENCHMARK_CRC32(vx, s390_crc32_vx, s390_cpu_has_vx);
+#elif defined(X86_PCLMULQDQ_CRC)
+/* CRC32 fold does a memory copy while hashing */
+uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
+ crc32_fold ALIGNED_(16) crc_state;
+ crc32_fold_reset_pclmulqdq(&crc_state);
+ crc32_fold_copy_pclmulqdq(&crc_state, (uint8_t *)buf, buf, len);
+ return crc32_fold_final_pclmulqdq(&crc_state);
+}
+BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, x86_cpu_has_pclmulqdq);
+#endif
\ No newline at end of file
--- /dev/null
+/* benchmark_main.cc -- benchmark suite main entry point
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil.h"
+# include "cpu_features.h"
+}
+
+int main(int argc, char** argv) {
+ cpu_check_features();
+
+ ::benchmark::Initialize(&argc, argv);
+ ::benchmark::RunSpecifiedBenchmarks();
+
+ return EXIT_SUCCESS;
+}
--- /dev/null
+/* benchmark_slidehash.cc -- benchmark slide_hash variants
+ * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <limits.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil.h"
+# include "zutil_p.h"
+# include "cpu_features.h"
+# include "deflate.h"
+}
+
+#include <benchmark/benchmark.h>
+
+#define MAX_RANDOM_INTS 32768
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+class slide_hash: public benchmark::Fixture {
+private:
+ uint16_t *l0;
+ uint16_t *l1;
+ deflate_state *s_g;
+
+public:
+ void SetUp(const ::benchmark::State& state) {
+ l0 = (uint16_t *)zng_alloc(HASH_SIZE * sizeof(uint16_t));
+
+ for (int32_t i = 0; i < HASH_SIZE; i++) {
+ l0[i] = rand();
+ }
+
+ l1 = (uint16_t *)zng_alloc(MAX_RANDOM_INTS * sizeof(uint16_t));
+
+ for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+ l1[i] = rand();
+ }
+
+ deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
+ s->head = l0;
+ s->prev = l1;
+ s_g = s;
+ }
+
+ void Bench(benchmark::State& state, slide_hash_func slide_hash) {
+ s_g->w_size = (uint32_t)state.range(0);
+
+ for (auto _ : state) {
+ slide_hash(s_g);
+ benchmark::DoNotOptimize(s_g);
+ }
+ }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(l0);
+        zng_free(l1);
+        free(s_g);
+    }
+};
+
+#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(1024, MAX_RANDOM_INTS);
+
+BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+
+#ifdef ARM_NEON_SLIDEHASH
+BENCHMARK_SLIDEHASH(neon, slide_hash_neon, arm_cpu_has_neon);
+#endif
+#ifdef POWER8_VSX_SLIDEHASH
+BENCHMARK_SLIDEHASH(power8, slide_hash_power8, power_cpu_has_arch_2_07);
+#endif
+#ifdef PPC_VMX_SLIDEHASH
+BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, power_cpu_has_altivec);
+#endif
+
+#ifdef X86_SSE2
+BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, x86_cpu_has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, x86_cpu_has_avx2);
+#endif
chunkset.obj \
compare256.obj \
compress.obj \
+ cpu_features.obj \
crc32.obj \
crc32_comb.obj \
crc32_fold.obj \
gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h
compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
chunkset.obj \
compare256.obj \
compress.obj \
+ cpu_features.obj \
crc32.obj \
crc32_comb.obj \
crc32_fold.obj \
compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
compare256_avx2.obj \
compare256_sse42.obj \
compress.obj \
+ cpu_features.obj \
crc32.obj \
crc32_comb.obj \
crc32_fold.obj \
chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h