git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Added adler32, compare256, crc32, and slide_hash benchmarks using Google Benchmark.
author     Nathan Moinvaziri <nathan@nathanm.com>
           Sat, 8 Jan 2022 21:28:41 +0000 (13:28 -0800)
committer  Hans Kristian Rosbach <hk-github@circlestorm.org>
           Mon, 17 Jan 2022 08:10:02 +0000 (09:10 +0100)
Co-authored-by: Adam Stylinski <kungfujesus06@gmail.com>
20 files changed:
.github/workflows/cmake.yml
.github/workflows/configure.yml
CMakeLists.txt
Makefile.in
README.md
arch/x86/x86.c
arch/x86/x86.h
cpu_features.c [new file with mode: 0644]
cpu_features.h [new file with mode: 0644]
functable.c
test/benchmarks/CMakeLists.txt [new file with mode: 0644]
test/benchmarks/README.md [new file with mode: 0644]
test/benchmarks/benchmark_adler32.cc [new file with mode: 0644]
test/benchmarks/benchmark_compare256.cc [new file with mode: 0644]
test/benchmarks/benchmark_crc32.cc [new file with mode: 0644]
test/benchmarks/benchmark_main.cc [new file with mode: 0644]
test/benchmarks/benchmark_slidehash.cc [new file with mode: 0644]
win32/Makefile.a64
win32/Makefile.arm
win32/Makefile.msc

diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 5662ed5d8f469a4747a38a0d858c5b85750a1637..918be18b334de329bbc3f91f81b50262bae70a09 100644 (file)
@@ -18,13 +18,19 @@ jobs:
             cmake-args: -DWITH_SANITIZER=Address
             codecov: ubuntu_gcc
 
+          - name: Ubuntu GCC Benchmark
+            os: ubuntu-latest
+            compiler: gcc
+            cmake-args: -DWITH_BENCHMARKS=ON
+            codecov: ubuntu_gcc_benchmark
+
           - name: Ubuntu GCC Symbol Prefix
             os: ubuntu-latest
             compiler: gcc
             cmake-args: -DZLIB_SYMBOL_PREFIX=zTest_
             codecov: ubuntu_gcc_sprefix
 
-          - name: Ubuntu GCC Compat SPrefix
+          - name: Ubuntu GCC Compat Symbol Prefix
             os: ubuntu-latest
             compiler: gcc
             cmake-args: -DZLIB_COMPAT=ON -DZLIB_SYMBOL_PREFIX=zTest_
diff --git a/.github/workflows/configure.yml b/.github/workflows/configure.yml
index 6458673a1ea3e88dfd97a0b8cce39341d9c61bc5..2c72afb8aaac0e0f9f1a605e32401d8cb8ae8901 100644 (file)
@@ -176,12 +176,12 @@ jobs:
             compiler: gcc
             configure-args: --warn --zlib-compat --static --with-dfltcc-deflate --with-dfltcc-inflate
 
-          - name: macOS GCC symbol prefix
+          - name: macOS GCC Symbol Prefix
             os: macOS-latest
             compiler: gcc
             configure-args: --sprefix=zTest_
 
-          - name: macOS GCC symbol prefix & compat
+          - name: macOS GCC Symbol Prefix & Compat
             os: macOS-latest
             compiler: gcc
             configure-args: --zlib-compat --sprefix=zTest_
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 104ff163c31abcd49ffca0f0e786ed9ec5b6e6ed..5391fd5c596660ff8f7c8eb0ddcdef875efeba4a 100644 (file)
@@ -75,6 +75,7 @@ option(ZLIB_COMPAT "Compile with zlib compatible API" OFF)
 option(ZLIB_ENABLE_TESTS "Build test binaries" ON)
 option(ZLIB_DUAL_LINK "Dual link tests against system zlib" OFF)
 option(WITH_FUZZERS "Build test/fuzz" OFF)
+option(WITH_BENCHMARKS "Build test/benchmarks" OFF)
 option(WITH_OPTIM "Build with optimisation" ON)
 option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces performance)" OFF)
 option(WITH_NEW_STRATEGIES "Use new strategies" ON)
@@ -922,6 +923,7 @@ set(ZLIB_PUBLIC_HDRS
 set(ZLIB_PRIVATE_HDRS
     adler32_p.h
     chunkset_tpl.h
+    cpu_features.h
     crc32_p.h
     crc32_tbl.h
     crc32_comb_tbl.h
@@ -948,6 +950,7 @@ set(ZLIB_SRCS
     chunkset.c
     compare256.c
     compress.c
+    cpu_features.c
     crc32.c
     crc32_comb.c
     crc32_fold.c
@@ -1288,6 +1291,10 @@ if(ZLIB_ENABLE_TESTS)
         endforeach()
     endif()
 
+    if(WITH_BENCHMARKS)
+        add_subdirectory(test/benchmarks)
+    endif()
+
     macro(test_minigzip name path)
         # Construct compression arguments for minigzip
         set(compress_args -k -c)
@@ -1446,6 +1453,7 @@ add_feature_info(ZLIB_ENABLE_TESTS ZLIB_ENABLE_TESTS "Build test binaries")
 add_feature_info(ZLIB_DUAL_LINK ZLIB_DUAL_LINK "Dual link tests against system zlib")
 add_feature_info(WITH_SANITIZER WITH_SANITIZER "Enable sanitizer support")
 add_feature_info(WITH_FUZZERS WITH_FUZZERS "Build test/fuzz")
+add_feature_info(WITH_BENCHMARKS WITH_BENCHMARKS "Build test/benchmarks")
 add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation")
 add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
 add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS
diff --git a/Makefile.in b/Makefile.in
index 8f89d0a9c59252cdaedb0de965ef2c13bee26d2f..ca2b1127f7178e9416f4e9d61dde02b28f25ef23 100644 (file)
@@ -78,6 +78,7 @@ OBJZ = \
        chunkset.o \
        compare256.o \
        compress.o \
+       cpu_features.o \
        crc32.o \
        crc32_comb.o \
        crc32_fold.o \
@@ -114,6 +115,7 @@ PIC_OBJZ = \
        chunkset.lo \
        compare256.lo \
        compress.lo \
+       cpu_features.lo \
        crc32.lo \
        crc32_comb.lo \
        crc32_fold.lo \
diff --git a/README.md b/README.md
index 599fdd9571e6acd83a5a34898e979ce71b5482e8..5b4834d7b0d58f1fc0e5751aa1aad03c9c9bc667 100644 (file)
--- a/README.md
+++ b/README.md
@@ -122,6 +122,7 @@ Build Options
 | WITH_NATIVE_INSTRUCTIONS | --native                 | Compiles with full instruction set supported on this host (gcc/clang -march=native)   | OFF     |
 | WITH_SANITIZER           | --with-sanitizer         | Build with sanitizer (memory, address, undefined)                                     | OFF     |
 | WITH_FUZZERS             | --with-fuzzers           | Build test/fuzz                                                                       | OFF     |
+| WITH_BENCHMARKS          |                          | Build test/benchmarks                                                                 | OFF     |
 | WITH_MAINTAINER_WARNINGS |                          | Build with project maintainer warnings                                                | OFF     |
 | WITH_CODE_COVERAGE       |                          | Enable code coverage reporting                                                        | OFF     |
 
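The WITH_BENCHMARKS option documented above has no configure-script equivalent (its configure column is empty), so it has to be enabled through CMake. A minimal sketch of a configure-and-build run, assuming an out-of-source build directory named `build` and a CMake new enough (3.13+) for the -S/-B shorthand:

```
cmake -S . -B build -DWITH_BENCHMARKS=ON
cmake --build build
```
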
diff --git a/arch/x86/x86.c b/arch/x86/x86.c
index 32baf8a749494793ab8d8a2fd7e2f8f6c1bc27d0..66bfa4e284017eaa01390ceabe3fc457ba723b14 100644 (file)
@@ -1,5 +1,4 @@
-/*
- * x86 feature check
+/* x86.c - x86 feature check
  *
  * Copyright (C) 2013 Intel Corporation. All rights reserved.
  * Author:
@@ -119,7 +118,7 @@ void Z_INTERNAL x86_check_features(void) {
                 x86_cpu_well_suited_avx512 = 1;
             } else if (model == 0xa && extended_model == 0x6) {
                 /* Icelake server */
-                x86_cpu_well_suited_avx512 = 1; 
+                x86_cpu_well_suited_avx512 = 1;
             } else if (model == 0xf && extended_model == 0x8) {
                 /* Saphire rapids */
                 x86_cpu_well_suited_avx512 = 1;
diff --git a/arch/x86/x86.h b/arch/x86/x86.h
index 00f8d9efc7878cccecf6612e4f9447e6fd96497f..0cb79e69a1b0f721381ebaa4dd695299b3cb77ca 100644 (file)
@@ -1,4 +1,4 @@
-/* cpu.h -- check for CPU features
+/* x86.h -- check for CPU features
 * Copyright (C) 2013 Intel Corporation Jim Kukunas
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
diff --git a/cpu_features.c b/cpu_features.c
new file mode 100644 (file)
index 0000000..70bfcb6
--- /dev/null
@@ -0,0 +1,23 @@
+/* cpu_features.c -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+#include "zutil.h"
+
+Z_INTERNAL void cpu_check_features(void) {
+    static int features_checked = 0;
+    if (features_checked)
+        return;
+#if defined(X86_FEATURES)
+    x86_check_features();
+#elif defined(ARM_FEATURES)
+    arm_check_features();
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
+    power_check_features();
+#elif defined(S390_FEATURES)
+    s390_check_features();
+#endif
+    features_checked = 1;
+}
diff --git a/cpu_features.h b/cpu_features.h
new file mode 100644 (file)
index 0000000..4cd1ed7
--- /dev/null
@@ -0,0 +1,190 @@
+/* cpu_features.h -- CPU architecture feature check
+ * Copyright (C) 2017 Hans Kristian Rosbach
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CPU_FEATURES
+#define CPU_FEATURES
+
+#include "deflate.h"
+#include "crc32_fold.h"
+
+#ifdef X86_FEATURES
+#  include "fallback_builtins.h"
+#endif
+
+extern void cpu_check_features();
+
+/* update_hash */
+extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
+#ifdef X86_SSE42_CRC_HASH
+extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
+#endif
+
+/* insert_string */
+extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
+#ifdef X86_SSE42_CRC_HASH
+extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
+#endif
+
+/* quick_insert_string */
+extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
+#ifdef X86_SSE42_CRC_HASH
+extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
+#elif defined(ARM_ACLE_CRC_HASH)
+extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
+#endif
+
+/* slide_hash */
+#ifdef X86_SSE2
+void slide_hash_sse2(deflate_state *s);
+#elif defined(ARM_NEON_SLIDEHASH)
+void slide_hash_neon(deflate_state *s);
+#endif
+#if defined(PPC_VMX_SLIDEHASH)
+void slide_hash_vmx(deflate_state *s);
+#endif
+#if defined(POWER8_VSX_SLIDEHASH)
+void slide_hash_power8(deflate_state *s);
+#endif
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
+#endif
+
+/* adler32 */
+extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
+#ifdef ARM_NEON_ADLER32
+extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef PPC_VMX_ADLER32
+extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_SSE41_ADLER32
+extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_SSSE3_ADLER32
+extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef POWER8_VSX_ADLER32
+extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
+#endif
+
+/* CRC32 folding */
+#ifdef X86_PCLMULQDQ_CRC
+extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
+extern void     crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
+#endif
+
+/* memory chunking */
+extern uint32_t chunksize_c(void);
+extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSE2_CHUNKSET
+extern uint32_t chunksize_sse2(void);
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef X86_AVX_CHUNKSET
+extern uint32_t chunksize_avx(void);
+extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef ARM_NEON_CHUNKSET
+extern uint32_t chunksize_neon(void);
+extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef POWER8_VSX_CHUNKSET
+extern uint32_t chunksize_power8(void);
+extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+
+/* CRC32 */
+extern uint32_t crc32_byfour(uint32_t crc, const unsigned char *buf, uint64_t len);
+#ifdef ARM_ACLE_CRC_HASH
+extern uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len);
+#elif defined(POWER8_VSX_CRC32)
+extern uint32_t crc32_power8(uint32_t crc, const unsigned char *buf, uint64_t len);
+#elif defined(S390_CRC32_VX)
+extern uint32_t s390_crc32_vx(uint32_t crc, const unsigned char *buf, uint64_t len);
+#endif
+
+/* compare256 */
+extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1);
+#ifdef UNALIGNED_OK
+extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1);
+extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1);
+#ifdef UNALIGNED64_OK
+extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
+#endif
+#endif
+
+/* longest_match */
+extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED_OK
+extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED64_OK
+extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+
+/* longest_match_slow */
+extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED_OK
+extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
+#ifdef UNALIGNED64_OK
+extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
+#endif
+#ifdef X86_SSE42_CMP_STR
+extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
+#endif
+#endif
+
+#endif
diff --git a/functable.c b/functable.c
index 3cfa13ade396cc46fc4f0a8b6f87559aedbe1ae7..96fbc98315908f7defbfdf4d935db7d316b396be 100644 (file)
 
 #include "functable.h"
 
-#ifdef X86_FEATURES
-#  include "fallback_builtins.h"
-#endif
-
-/* update_hash */
-extern uint32_t update_hash_c(deflate_state *const s, uint32_t h, uint32_t val);
-#ifdef X86_SSE42_CRC_HASH
-extern uint32_t update_hash_sse4(deflate_state *const s, uint32_t h, uint32_t val);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern uint32_t update_hash_acle(deflate_state *const s, uint32_t h, uint32_t val);
-#endif
-
-/* insert_string */
-extern void insert_string_c(deflate_state *const s, const uint32_t str, uint32_t count);
-#ifdef X86_SSE42_CRC_HASH
-extern void insert_string_sse4(deflate_state *const s, const uint32_t str, uint32_t count);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern void insert_string_acle(deflate_state *const s, const uint32_t str, uint32_t count);
-#endif
-
-/* quick_insert_string */
-extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
-#ifdef X86_SSE42_CRC_HASH
-extern Pos quick_insert_string_sse4(deflate_state *const s, const uint32_t str);
-#elif defined(ARM_ACLE_CRC_HASH)
-extern Pos quick_insert_string_acle(deflate_state *const s, const uint32_t str);
-#endif
-
-/* slide_hash */
-#ifdef X86_SSE2
-void slide_hash_sse2(deflate_state *s);
-#elif defined(ARM_NEON_SLIDEHASH)
-void slide_hash_neon(deflate_state *s);
-#endif
-#if defined(PPC_VMX_SLIDEHASH)
-void slide_hash_vmx(deflate_state *s);
-#endif
-#if defined(POWER8_VSX_SLIDEHASH)
-void slide_hash_power8(deflate_state *s);
-#endif
-#ifdef X86_AVX2
-void slide_hash_avx2(deflate_state *s);
-#endif
-
-/* adler32 */
-extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
-#ifdef ARM_NEON_ADLER32
-extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef PPC_VMX_ADLER32
-extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_SSE41_ADLER32
-extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_SSSE3_ADLER32
-extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
-#ifdef POWER8_VSX_ADLER32
-extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
-#endif
-
-/* CRC32 folding */
-extern uint32_t crc32_fold_reset_c(crc32_fold *crc);
-extern void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern uint32_t crc32_fold_final_c(crc32_fold *crc);
-
-#ifdef X86_PCLMULQDQ_CRC
-extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
-extern void     crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
-extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
-#endif
-
-/* memory chunking */
-extern uint32_t chunksize_c(void);
-extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#ifdef X86_SSE2_CHUNKSET
-extern uint32_t chunksize_sse2(void);
-extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef X86_AVX_CHUNKSET
-extern uint32_t chunksize_avx(void);
-extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef ARM_NEON_CHUNKSET
-extern uint32_t chunksize_neon(void);
-extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-#ifdef POWER8_VSX_CHUNKSET
-extern uint32_t chunksize_power8(void);
-extern uint8_t* chunkcopy_power8(uint8_t *out, uint8_t const *from, unsigned len);
-extern uint8_t* chunkcopy_safe_power8(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
-extern uint8_t* chunkunroll_power8(uint8_t *out, unsigned *dist, unsigned *len);
-extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len);
-extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
-#endif
-
-/* CRC32 */
-Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
-
-#ifdef ARM_ACLE_CRC_HASH
-extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t);
-#elif defined(POWER8_VSX_CRC32)
-extern uint32_t crc32_power8(uint32_t, const unsigned char *, uint64_t);
-#elif defined(S390_CRC32_VX)
-extern uint32_t s390_crc32_vx(uint32_t, const unsigned char *, uint64_t);
-#endif
-
-/* compare256 */
-extern uint32_t compare256_c(const unsigned char *src0, const unsigned char *src1);
-#ifdef UNALIGNED_OK
-extern uint32_t compare256_unaligned_16(const unsigned char *src0, const unsigned char *src1);
-extern uint32_t compare256_unaligned_32(const unsigned char *src0, const unsigned char *src1);
-#ifdef UNALIGNED64_OK
-extern uint32_t compare256_unaligned_64(const unsigned char *src0, const unsigned char *src1);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t compare256_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
-#endif
-#endif
-
-/* longest_match */
-extern uint32_t longest_match_c(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED_OK
-extern uint32_t longest_match_unaligned_16(deflate_state *const s, Pos cur_match);
-extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED64_OK
-extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#endif
-
-/* longest_match_slow */
-extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED_OK
-extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match);
-extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match);
-#ifdef UNALIGNED64_OK
-extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
-#endif
-#ifdef X86_SSE42_CMP_STR
-extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match);
-#endif
-#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
-#endif
+#include "cpu_features.h"
 
 Z_INTERNAL Z_TLS struct functable_s functable;
 
-Z_INTERNAL void cpu_check_features(void)
-{
-    static int features_checked = 0;
-    if (features_checked)
-        return;
-#if defined(X86_FEATURES)
-    x86_check_features();
-#elif defined(ARM_FEATURES)
-    arm_check_features();
-#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
-    power_check_features();
-#elif defined(S390_FEATURES)
-    s390_check_features();
-#endif
-    features_checked = 1;
-}
-
 /* stub functions */
 Z_INTERNAL uint32_t update_hash_stub(deflate_state *const s, uint32_t h, uint32_t val) {
     // Initialize default
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
new file mode 100644 (file)
index 0000000..c3d8410
--- /dev/null
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.12)
+
+include(CheckCCompilerFlag)
+include(FeatureSummary)
+include(FetchContent)
+
+enable_language(CXX)
+
+# Search for Google benchmark package
+find_package(benchmark QUIET)
+if(NOT benchmark_FOUND)
+    # Fetch google benchmark source code from official repository
+    set(BENCHMARK_ENABLE_TESTING OFF)
+    FetchContent_Declare(benchmark
+        GIT_REPOSITORY https://github.com/google/benchmark.git)
+    FetchContent_MakeAvailable(benchmark)
+    FetchContent_GetProperties(benchmark)
+
+    if(NOT benchmark_POPULATED)
+        FetchContent_Populate(benchmark)
+    endif()
+endif()
+
+add_executable(benchmark_zlib
+    benchmark_adler32.cc
+    benchmark_compare256.cc
+    benchmark_crc32.cc
+    benchmark_main.cc
+    benchmark_slidehash.cc
+    )
+
+target_include_directories(benchmark_zlib PRIVATE
+    ${CMAKE_SOURCE_DIR}
+    ${CMAKE_BINARY_DIR}
+    ${benchmark_SOURCE_DIR}/benchmark/include)
+
+target_link_libraries(benchmark_zlib zlibstatic benchmark::benchmark)
+if(WIN32)
+    target_link_libraries(benchmark_zlib shlwapi)
+endif()
+
+if(ZLIB_ENABLE_TESTS)
+    add_test(NAME benchmark_zlib
+        COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:benchmark_zlib>)
+endif()
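
Since the executable is registered with add_test() whenever ZLIB_ENABLE_TESTS is on, it can be driven through CTest as well as run directly. A sketch under the assumption that the build directory is `build` and that the chosen generator places the binary under test/benchmarks:

```
cd build
ctest -R benchmark_zlib --verbose
# or run the binary directly; the exact path depends on the generator
./test/benchmarks/benchmark_zlib
```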
diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md
new file mode 100644 (file)
index 0000000..b005027
--- /dev/null
@@ -0,0 +1,19 @@
+## Benchmarks
+
+These benchmarks are written using [Google Benchmark](https://github.com/google/benchmark).
+
+*Repetitions*
+
+To increase the number of times each benchmark is repeated, use:
+
+```
+--benchmark_repetitions=20
+```
+
+*Filters*
+
+To run only the benchmarks whose names match a pattern, use:
+
+```
+--benchmark_filter="adler32*"
+```
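
The two flags shown in the new README can be combined on a single command line; for example, assuming the benchmark_zlib binary produced by the CMakeLists.txt above:

```
./benchmark_zlib --benchmark_filter="adler32*" --benchmark_repetitions=20
```

Any other flag understood by Google Benchmark (for example --benchmark_list_tests, which enumerates the registered fixtures) can be passed the same way.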
diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc
new file mode 100644 (file)
index 0000000..6829088
--- /dev/null
@@ -0,0 +1,94 @@
+/* benchmark_adler32.cc -- benchmark adler32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*adler32_func)(uint32_t adler, const unsigned char *buf, size_t len);
+
+class adler32: public benchmark::Fixture {
+private:
+    uint32_t *random_ints;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        /* Control the alignment so that we have the best case scenario for loads. With
+         * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+         * And while this is a realistic scenario, it makes it difficult to compare benchmark
+         * to benchmark because one allocation could have been aligned perfectly for the loads
+         * while the subsequent one happened to not be. This is not meant to give AVX512 an
+         * advantage (all lesser SIMD implementations benefit from the aligned allocation too);
+         * it is meant to control the _consistency_ of the results. */
+        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, adler32_func adler32) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = adler32(hash, (const unsigned char *)random_ints, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints);
+    }
+};
+
+#define BENCHMARK_ADLER32(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(adler32, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(adler32, name)->Range(2048, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_ADLER32(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+BENCHMARK_ADLER32(neon, adler32_neon, arm_has_neon);
+#elif defined(POWER8_VSX_ADLER32)
+BENCHMARK_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07);
+#elif defined(PPC_VMX_ADLER32)
+BENCHMARK_ADLER32(vmx, adler32_vmx, power_cpu_has_altivec);
+#endif
+
+#ifdef X86_SSSE3_ADLER32
+BENCHMARK_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3);
+#endif
+#ifdef X86_SSE41_ADLER32
+BENCHMARK_ADLER32(sse41, adler32_sse41, x86_cpu_has_sse41);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32(avx512, adler32_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32(avx512_vnni, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc
new file mode 100644 (file)
index 0000000..063d110
--- /dev/null
@@ -0,0 +1,84 @@
+/* benchmark_compare256.cc -- benchmark compare256 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_COMPARE_SIZE (256)
+
+typedef uint32_t (*compare256_func)(const unsigned char *src0, const unsigned char *src1);
+
+class compare256: public benchmark::Fixture {
+private:
+    uint8_t *str1;
+    uint8_t *str2;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        str1 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
+        assert(str1 != NULL);
+        memset(str1, 'a', MAX_COMPARE_SIZE);
+
+        str2 = (uint8_t *)zng_alloc(MAX_COMPARE_SIZE);
+        assert(str2 != NULL);
+        memset(str2, 'a', MAX_COMPARE_SIZE);
+    }
+
+    void Bench(benchmark::State& state, compare256_func compare256) {
+        int32_t match_len = (int32_t)state.range(0);
+        uint32_t len;
+
+        str2[match_len] = 0;
+        for (auto _ : state) {
+            len = compare256((const uint8_t *)str1, (const uint8_t *)str2);
+        }
+        str2[match_len] = 'a';
+
+        benchmark::DoNotOptimize(len);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(str1);
+        zng_free(str2);
+    }
+};
+
+#define BENCHMARK_COMPARE256(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(compare256, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(compare256, name)->Range(1, MAX_COMPARE_SIZE);
+
+BENCHMARK_COMPARE256(c, compare256_c, 1);
+
+#ifdef UNALIGNED_OK
+BENCHMARK_COMPARE256(unaligned_16, compare256_unaligned_16, 1);
+#ifdef HAVE_BUILTIN_CTZ
+BENCHMARK_COMPARE256(unaligned_32, compare256_unaligned_32, 1);
+#endif
+#if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
+#endif
+#endif
+
+#ifdef X86_SSE42_CMP_STR
+BENCHMARK_COMPARE256(unaligned_sse4, compare256_unaligned_sse4, x86_cpu_has_sse42);
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
+#endif
diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc
new file mode 100644 (file)
index 0000000..69d11a4
--- /dev/null
@@ -0,0 +1,80 @@
+/* benchmark_crc32.cc -- benchmark crc32 variants
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*crc32_func)(uint32_t crc32, const unsigned char * buf, uint64_t len);
+
+class crc32: public benchmark::Fixture {
+private:
+    uint32_t *random_ints;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        random_ints = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+        assert(random_ints != NULL);
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            random_ints[i] = rand();
+        }
+    }
+
+    void Bench(benchmark::State& state, crc32_func crc32) {
+        uint32_t hash = 0;
+
+        for (auto _ : state) {
+            hash = crc32(hash, (const unsigned char *)random_ints, state.range(0));
+        }
+
+        benchmark::DoNotOptimize(hash);
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(random_ints);
+    }
+};
+
+#define BENCHMARK_CRC32(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(crc32, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(crc32, name)->Range(1, MAX_RANDOM_INTS_SIZE);
+
+BENCHMARK_CRC32(byfour, crc32_byfour, 1);
+
+#ifdef ARM_ACLE_CRC_HASH
+BENCHMARK_CRC32(acle, crc32_acle, arm_cpu_has_crc32);
+#elif defined(POWER8_VSX_CRC32)
+BENCHMARK_CRC32(power8, crc32_power8, power_cpu_has_arch_2_07);
+#elif defined(S390_CRC32_VX)
+BENCHMARK_CRC32(vx, s390_crc32_vx, s390_cpu_has_vx);
+#elif defined(X86_PCLMULQDQ_CRC)
+/* CRC32 fold does a memory copy while hashing */
+uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
+    crc32_fold ALIGNED_(16) crc_state;
+    crc32_fold_reset_pclmulqdq(&crc_state);
+    crc32_fold_copy_pclmulqdq(&crc_state, (uint8_t *)buf, buf, len);
+    return crc32_fold_final_pclmulqdq(&crc_state);
+}
+BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, x86_cpu_has_pclmulqdq);
+#endif
\ No newline at end of file
diff --git a/test/benchmarks/benchmark_main.cc b/test/benchmarks/benchmark_main.cc
new file mode 100644 (file)
index 0000000..81582c3
--- /dev/null
@@ -0,0 +1,25 @@
+/* benchmark_main.cc -- benchmark suite main entry point
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "cpu_features.h"
+}
+
+int main(int argc, char** argv) {
+    cpu_check_features();
+
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+
+    return EXIT_SUCCESS;
+}
diff --git a/test/benchmarks/benchmark_slidehash.cc b/test/benchmarks/benchmark_slidehash.cc
new file mode 100644 (file)
index 0000000..174ea09
--- /dev/null
@@ -0,0 +1,91 @@
+/* benchmark_slidehash.cc -- benchmark slide_hash variants
+ * Copyright (C) 2022 Adam Stylinski, Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdint.h>
+#include <stdint.h>
+#include <limits.h>
+
+extern "C" {
+#  include "zbuild.h"
+#  include "zutil.h"
+#  include "zutil_p.h"
+#  include "cpu_features.h"
+#  include "deflate.h"
+}
+
+#include <benchmark/benchmark.h>
+
+#define MAX_RANDOM_INTS 32768
+
+typedef void (*slide_hash_func)(deflate_state *s);
+
+class slide_hash: public benchmark::Fixture {
+private:
+    uint16_t *l0;
+    uint16_t *l1;
+    deflate_state *s_g;
+
+public:
+    void SetUp(const ::benchmark::State& state) {
+        l0 = (uint16_t *)zng_alloc(HASH_SIZE * sizeof(uint16_t));
+
+        for (int32_t i = 0; i < HASH_SIZE; i++) {
+            l0[i] = rand();
+        }
+
+        l1 = (uint16_t *)zng_alloc(MAX_RANDOM_INTS * sizeof(uint16_t));
+
+        for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+            l1[i] = rand();
+        }
+
+        deflate_state *s = (deflate_state*)malloc(sizeof(deflate_state));
+        s->head = l0;
+        s->prev = l1;
+        s_g = s;
+    }
+
+    void Bench(benchmark::State& state, slide_hash_func slide_hash) {
+        s_g->w_size = (uint32_t)state.range(0);
+
+        for (auto _ : state) {
+            slide_hash(s_g);
+            benchmark::DoNotOptimize(s_g);
+        }
+    }
+
+    void TearDown(const ::benchmark::State& state) {
+        zng_free(l0);
+        zng_free(l1);
+    }
+};
+
+#define BENCHMARK_SLIDEHASH(name, fptr, support_flag) \
+    BENCHMARK_DEFINE_F(slide_hash, name)(benchmark::State& state) { \
+        if (!support_flag) { \
+            state.SkipWithError("CPU does not support " #name); \
+        } \
+        Bench(state, fptr); \
+    } \
+    BENCHMARK_REGISTER_F(slide_hash, name)->RangeMultiplier(2)->Range(1024, MAX_RANDOM_INTS);
+
+BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+
+#ifdef ARM_NEON_SLIDEHASH
+BENCHMARK_SLIDEHASH(neon, slide_hash_neon, arm_cpu_has_neon);
+#endif
+#ifdef POWER8_VSX_SLIDEHASH
+BENCHMARK_SLIDEHASH(power8, slide_hash_power8, power_cpu_has_arch_2_07);
+#endif
+#ifdef PPC_VMX_SLIDEHASH
+BENCHMARK_SLIDEHASH(vmx, slide_hash_vmx, power_cpu_has_altivec);
+#endif
+
+#ifdef X86_SSE2
+BENCHMARK_SLIDEHASH(sse2, slide_hash_sse2, x86_cpu_has_sse2);
+#endif
+#ifdef X86_AVX2
+BENCHMARK_SLIDEHASH(avx2, slide_hash_avx2, x86_cpu_has_avx2);
+#endif
diff --git a/win32/Makefile.a64 b/win32/Makefile.a64
index 82ca6c2dc1e37433f9afd87e113f41994a4e8f2c..e65123360992338561bc5779cdf8dad58e2b4158 100644 (file)
@@ -48,6 +48,7 @@ OBJS = \
        chunkset.obj \
        compare256.obj \
        compress.obj \
+       cpu_features.obj \
        crc32.obj \
        crc32_comb.obj \
        crc32_fold.obj \
@@ -185,6 +186,7 @@ gzread.obj: $(SRCDIR)/gzread.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/z
 gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h
 compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
 crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
diff --git a/win32/Makefile.arm b/win32/Makefile.arm
index ace50794e255a239cd04f967c20dcb6e0e477d48..29ce955811b68fd7cb07d2d56456468e1675634f 100644 (file)
@@ -51,6 +51,7 @@ OBJS = \
        chunkset.obj \
        compare256.obj \
        compress.obj \
+       cpu_features.obj \
        crc32.obj \
        crc32_comb.obj \
        crc32_fold.obj \
@@ -197,6 +198,7 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)
 compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
 crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index cfdfcca749736693d61d2dae9f7ac92e4c5cb0c4..0059606c5a761eaaf26f9245ea4a71d5c56f9103 100644 (file)
@@ -57,6 +57,7 @@ OBJS = \
        compare256_avx2.obj \
        compare256_sse42.obj \
        compress.obj \
+       cpu_features.obj \
        crc32.obj \
        crc32_comb.obj \
        crc32_fold.obj \
@@ -191,6 +192,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 chunkset_sse2.obj: $(SRCDIR)/arch/x86/chunkset_sse2.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
 crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h