From: Hans Kristian Rosbach
Date: Fri, 23 Aug 2019 18:25:26 +0000 (+0200)
Subject: Add slide_hash to functable, and enable the sse2-optimized version.
X-Git-Tag: 1.9.9-b1~443
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4cee5dcdfea03ef7c590d60c0f618e2dbcf573df;p=thirdparty%2Fzlib-ng.git

Add slide_hash to functable, and enable the sse2-optimized version.
Add necessary code to cmake and configure.
Fix slide_hash_sse2 to compile with zlib-ng.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49a5f927..7f709f92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -592,6 +592,7 @@ if(WITH_OPTIM)
         if(HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2)
             set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/fill_window_sse.c)
+            set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/slide_sse.c)
             if(NOT ${ARCH} MATCHES "x86_64")
                 add_intrinsics_option("${SSE2FLAG}")
                 add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
diff --git a/arch/x86/slide_sse.c b/arch/x86/slide_sse.c
index 342fd562..9d2ca2aa 100644
--- a/arch/x86/slide_sse.c
+++ b/arch/x86/slide_sse.c
@@ -8,45 +8,39 @@
  *
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
-#include "deflate.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
 
-#ifdef USE_SSE_SLIDE
 #include <immintrin.h>
 
-void slide_hash_sse(deflate_state *s)
-{
+ZLIB_INTERNAL void slide_hash_sse2(deflate_state *s) {
+    Pos *p;
     unsigned n;
-    Posf *p;
-    uInt wsize = s->w_size;
-    z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+    unsigned wsize = s->w_size;
+    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
 
     n = s->hash_size;
     p = &s->head[n] - 8;
     do {
         __m128i value, result;
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
-        p -= 8;
-        n -= 8;
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
+        p -= 8;
+        n -= 8;
     } while (n > 0);
 
-#ifndef FASTEST
     n = wsize;
     p = &s->prev[n] - 8;
     do {
         __m128i value, result;
 
-        value = _mm_loadu_si128((__m128i *)p);
-        result= _mm_subs_epu16(value, xmm_wsize);
-        _mm_storeu_si128((__m128i *)p, result);
+        value = _mm_loadu_si128((__m128i *)p);
+        result= _mm_subs_epu16(value, xmm_wsize);
+        _mm_storeu_si128((__m128i *)p, result);
 
-        p -= 8;
-        n -= 8;
+        p -= 8;
+        n -= 8;
     } while (n > 0);
-#endif
 }
-
-#endif
-
diff --git a/configure b/configure
index a6c7aa07..a00e7cf1 100755
--- a/configure
+++ b/configure
@@ -994,8 +994,8 @@ case "${ARCH}" in
         if test ${HAVE_SSE2_INTRIN} -eq 1; then
             CFLAGS="${CFLAGS} -DX86_SSE2"
             SFLAGS="${SFLAGS} -DX86_SSE2"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo"
 
             if test $forcesse2 -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
@@ -1045,8 +1045,8 @@ case "${ARCH}" in
         CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
         SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
 
-        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o"
-        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo"
+        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o"
+        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo"
 
         if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
            CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
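For reference, this is roughly the scalar loop that slide_hash_sse2 above vectorizes
(a sketch modelled on the generic slide_hash_c in deflate.c; the helper name is
illustrative and not part of the patch). Every head/prev entry is rebased by w_size,
and entries smaller than w_size clamp to zero; the SSE2 version gets that clamp for
free from the saturating _mm_subs_epu16, eight 16-bit entries per iteration.

/* Illustrative sketch only, not part of this commit. */
static void slide_hash_scalar_sketch(deflate_state *s) {
    unsigned int wsize = s->w_size;
    unsigned n;
    Pos *p;

    n = s->hash_size;
    p = &s->head[n];
    do {
        unsigned m = *--p;
        *p = (Pos)(m >= wsize ? m - wsize : 0);  /* clamp instead of wrapping */
    } while (--n);

    n = wsize;
    p = &s->prev[n];
    do {
        unsigned m = *--p;
        *p = (Pos)(m >= wsize ? m - wsize : 0);
    } while (--n);
}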
-DX86_SSE42_CRC_INTRIN" diff --git a/deflate.c b/deflate.c index 5380b962..8ee4f1be 100644 --- a/deflate.c +++ b/deflate.c @@ -105,7 +105,6 @@ typedef block_state (*compress_func) (deflate_state *s, int flush); /* Compression function. Returns the block state after the call. */ static int deflateStateCheck (PREFIX3(stream) *strm); -static void slide_hash (deflate_state *s); static block_state deflate_stored (deflate_state *s, int flush); ZLIB_INTERNAL block_state deflate_fast (deflate_state *s, int flush); ZLIB_INTERNAL block_state deflate_quick (deflate_state *s, int flush); @@ -196,7 +195,7 @@ static const config configuration_table[10] = { * bit values at the expense of memory usage). We slide even when level == 0 to * keep the hash table consistent if we switch back to level > 0 later. */ -static void slide_hash(deflate_state *s) { +ZLIB_INTERNAL void slide_hash_c(deflate_state *s) { unsigned n; Pos *p; unsigned int wsize = s->w_size; @@ -639,7 +638,7 @@ int ZEXPORT PREFIX(deflateParams)(PREFIX3(stream) *strm, int level, int strategy if (s->level != level) { if (s->level == 0 && s->matches != 0) { if (s->matches == 1) { - slide_hash(s); + functable.slide_hash(s); } else { CLEAR_HASH(s); } @@ -1297,7 +1296,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) { s->block_start -= (long) wsize; if (s->insert > s->strstart) s->insert = s->strstart; - slide_hash(s); + functable.slide_hash(s); more += wsize; } if (s->strm->avail_in == 0) diff --git a/deflate.h b/deflate.h index 99a4f5ca..a47cb72b 100644 --- a/deflate.h +++ b/deflate.h @@ -330,6 +330,7 @@ static inline void put_short(deflate_state *s, uint16_t w) { void ZLIB_INTERNAL fill_window_c(deflate_state *s); +void ZLIB_INTERNAL slide_hash_c(deflate_state *s); /* in trees.c */ void ZLIB_INTERNAL zng_tr_init(deflate_state *s); diff --git a/functable.c b/functable.c index b3020e00..8ae69605 100644 --- a/functable.c +++ b/functable.c @@ -23,12 +23,18 @@ extern void fill_window_sse(deflate_state *s); extern void fill_window_arm(deflate_state *s); #endif +/* slide_hash */ +#ifdef X86_SSE2 +void slide_hash_sse2(deflate_state *s); +#endif + /* adler32 */ extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32) extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); #endif +/* CRC32 */ ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t); #ifdef DYNAMIC_CRC_TABLE @@ -46,14 +52,22 @@ extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t); extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); #endif + /* stub definitions */ ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count); ZLIB_INTERNAL void fill_window_stub(deflate_state *s); ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len); ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len); +ZLIB_INTERNAL void slide_hash_stub(deflate_state *s); /* functable init */ -ZLIB_INTERNAL __thread struct functable_s functable = {fill_window_stub,insert_string_stub,adler32_stub,crc32_stub}; +ZLIB_INTERNAL __thread struct functable_s functable = { + fill_window_stub, + insert_string_stub, + adler32_stub, + crc32_stub, + slide_hash_stub + }; /* stub functions */ @@ -88,6 +102,20 @@ ZLIB_INTERNAL void fill_window_stub(deflate_state *s) { functable.fill_window(s); } +ZLIB_INTERNAL void 
@@ -88,6 +102,20 @@ ZLIB_INTERNAL void fill_window_stub(deflate_state *s) {
     functable.fill_window(s);
 }
 
+ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
+    // Initialize default
+    functable.slide_hash=&slide_hash_c;
+
+    #ifdef X86_SSE2
+    # if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+    # endif
+        functable.slide_hash=&slide_hash_sse2;
+    #endif
+
+    functable.slide_hash(s);
+}
+
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
     // Initialize default
     functable.adler32=&adler32_c;
diff --git a/functable.h b/functable.h
index 280651c3..a9c8e9b5 100644
--- a/functable.h
+++ b/functable.h
@@ -13,6 +13,7 @@ struct functable_s {
     Pos (* insert_string) (deflate_state *const s, const Pos str, unsigned int count);
     uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len);
     uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len);
+    void (* slide_hash) (deflate_state *s);
 };
 
 ZLIB_INTERNAL extern __thread struct functable_s functable;
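As a closing sanity check on the SSE2 path: the saturating subtract is what lets
slide_hash_sse2 drop the per-entry "clamp to zero" branch. _mm_subs_epu16 rebases
eight 16-bit head/prev entries at once, and entries smaller than w_size come out
as 0 instead of wrapping. A small standalone sketch, not part of the patch (build
with SSE2 enabled, e.g. -msse2 on 32-bit x86):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Pretend w_size is 32768: entries below it clamp to 0, the rest are rebased. */
    uint16_t head[8] = { 0, 100, 32767, 32768, 32769, 40000, 65535, 1 };
    uint16_t out[8];

    __m128i value  = _mm_loadu_si128((const __m128i *)head);
    __m128i result = _mm_subs_epu16(value, _mm_set1_epi16((short)0x8000));  /* 0x8000 == 32768 */
    _mm_storeu_si128((__m128i *)out, result);

    for (int i = 0; i < 8; i++)
        printf("%5u -> %5u\n", head[i], out[i]);
    return 0;
}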