From: Jim Kukunas Date: Tue, 2 Jul 2013 19:09:37 +0000 (-0700) Subject: Adds SSE2 optimized hash shifting to fill_window. X-Git-Tag: v1.2.8-jtkv4~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5640481288aaf91efcc27622ae973d373bfc2cf3;p=thirdparty%2Fzlib-ng.git Adds SSE2 optimized hash shifting to fill_window. Uses SSE2 subtraction with saturation to shift the hash in 16B chunks. Renames the old fill_window implementation to fill_window_c(), and adds a new fill_window_sse() implementation in fill_window_sse.c. Moves UPDATE_HASH into deflate.h and changes the scope of read_buf from local to ZLIB_INTERNAL for sharing between the two implementations. Updates the configure script to check for SSE2 intrinsics and enables this optimization by default on x86. The runtime check for SSE2 support only occurs on 32-bit, as x86_64 requires SSE2. Adds an explicit rule in Makefile.in to build fill_window_sse.c with the -msse2 compiler flag, which is required for SSE2 intrinsics. 
--- diff --git a/Makefile.in b/Makefile.in index c61aa3008..4774810f2 100644 --- a/Makefile.in +++ b/Makefile.in @@ -45,6 +45,9 @@ TAR=tar SHELL=/bin/sh EXE= +FILL_WINDOW_SSE_o= +FILL_WINDOW_SSE_lo= + prefix = /usr/local exec_prefix = ${prefix} libdir = ${exec_prefix}/lib @@ -54,11 +57,11 @@ mandir = ${prefix}/share/man man3dir = ${mandir}/man3 pkgconfigdir = ${libdir}/pkgconfig -OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o +OBJZ = adler32.o crc32.o ${FILL_WINDOW_SSE_o} deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o OBJC = $(OBJZ) $(OBJG) -PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo +PIC_OBJZ = adler32.lo crc32.lo ${FILL_WINDOW_SSE_lo} deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) @@ -113,6 +116,14 @@ test64: all64 fi; \ rm -f $$TMP64 +fill_window_sse.lo: fill_window_sse.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) -msse2 -DPIC -c -o objs/$*.o $< + -@mv objs/$*.o $@ + +fill_window_sse.o: fill_window_sse.c + ${CC} ${CFLAGS} -msse2 -I. -c -o $@ fill_window_sse.c + infcover.o: test/infcover.c zlib.h zconf.h $(CC) $(CFLAGS) -I. -c -o $@ test/infcover.c diff --git a/configure b/configure index ff66ab32e..9755cbeba 100755 --- a/configure +++ b/configure @@ -760,6 +760,23 @@ EOF fi fi +# Check for SSE2 intrinsics +cat > $test.c << EOF +#include +int main(void) +{ + __m128i zero = _mm_setzero_si128(); + return 0; +} +EOF +if try ${CC} ${CFLAGS} -msse2 $test.c; then + echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log + HAVE_SSE2_INTRIN=1 +else + echo "Checking for SSE2 intrinsics ... No." 
| tee -a configure.log + HAVE_SSE2_INTRIN=0 +fi + # Set ARCH specific FLAGS case "${ARCH}" in x86_64) @@ -774,6 +791,18 @@ case "${ARCH}" in CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS" SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS" + + if test ${HAVE_SSE2_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -UCHECK_SSE2 -DHAVE_SSE2" + SFLAGS="${SFLAGS} -UCHECK_SSE2 -DHAVE_SSE2" + FILL_WINDOW_SSE_o="fill_window_sse.o" + FILL_WINDOW_SSE_lo="fill_window_sse.lo" + OBJS="${OBJS} ${FILL_WINDOW_SSE_o}" + PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}" + else + FILL_WINDOW_SSE_o="" + FILL_WINDOW_SSE_lo="" + fi ;; i386 | i486 | i586 | i686) OBJC="${OBJC} x86.o" @@ -787,6 +816,18 @@ case "${ARCH}" in CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS" SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS" + + if test ${HAVE_SSE2_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DCHECK_SSE2 -DHAVE_SSE2" + SFLAGS="${SFLAGS} -DCHECK_SSE2 -DHAVE_SSE2" + FILL_WINDOW_SSE_o="fill_window_sse.o" + FILL_WINDOW_SSE_lo="fill_window_sse.lo" + OBJS="${OBJS} ${FILL_WINDOW_SSE_o}" + PIC_OBJS="${PIC_OBJS} ${FILL_WINDOW_SSE_lo}" + else + FILL_WINDOW_SSE_o="" + FILL_WINDOW_SSE_lo="" + fi ;; esac @@ -821,6 +862,8 @@ echo mandir = $mandir >> configure.log echo prefix = $prefix >> configure.log echo sharedlibdir = $sharedlibdir >> configure.log echo uname = $uname >> configure.log +echo FILL_WINDOW_SSE_o = ${FILL_WINDOW_SSE_o} >> configure.log +echo FILL_WINDOW_SSE_lo= ${FILL_WINDOW_SSE_lo} >> configure.log # udpate Makefile with the configure results sed < Makefile.in " @@ -850,6 +893,8 @@ sed < Makefile.in " /^PIC_OBJC *=/s#=.*#= $PIC_OBJC# /^all: */s#:.*#: $ALL# /^test: */s#:.*#: $TEST# +/^FILL_WINDOW_SSE_o *=/s#=.*#=$FILL_WINDOW_SSE_o# +/^FILL_WINDOW_SSE_lo *=/s#=.*#=$FILL_WINDOW_SSE_lo# " > Makefile # create zlib.pc with the configure results diff --git a/deflate.c b/deflate.c index 96f555b01..32df2119b 100644 --- a/deflate.c +++ b/deflate.c @@ -84,7 +84,7 @@ local 
block_state deflate_huff OF((deflate_state *s, int flush)); local void lm_init OF((deflate_state *s)); local void putShortMSB OF((deflate_state *s, uInt b)); local void flush_pending OF((z_streamp strm)); -local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size)); +ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size)); #ifdef ASMV void match_init OF((void)); /* asm code initialization */ uInt longest_match OF((deflate_state *s, IPos cur_match)); @@ -158,14 +158,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ /* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */ #define RANK(f) (((f) << 1) - ((f) > 4 ? 9 : 0)) -/* =========================================================================== - * Update a hash value with the given input byte - * IN assertion: all calls to to UPDATE_HASH are made with consecutive - * input characters, so that a running hash key can be computed from the - * previous key instead of complete recalculation each time. 
- */ -#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) - /* =========================================================================== * Insert string str in the dictionary and set match_head to the previous head @@ -179,12 +171,12 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ */ #ifdef FASTEST #define INSERT_STRING(s, str, match_head) \ - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ + (UPDATE_HASH(s, s->ins_h, (str)), \ match_head = s->head[s->ins_h], \ s->head[s->ins_h] = (Pos)(str)) #else #define INSERT_STRING(s, str, match_head) \ - (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ + (UPDATE_HASH(s, s->ins_h, (str)), \ match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \ s->head[s->ins_h] = (Pos)(str)) #endif @@ -197,6 +189,10 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */ s->head[s->hash_size-1] = NIL; \ zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head)); +#ifdef CHECK_SSE2 +#include "x86.h" +#endif + /* ========================================================================= */ int ZEXPORT deflateInit_(strm, level, version, stream_size) z_streamp strm; @@ -230,6 +226,10 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy, * output size for (length,distance) codes is <= 24 bits. */ +#ifdef CHECK_SSE2 + x86_check_features(); +#endif + if (version == Z_NULL || version[0] != my_version[0] || stream_size != sizeof(z_stream)) { return Z_VERSION_ERROR; } @@ -365,7 +365,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength) str = s->strstart; n = s->lookahead - (MIN_MATCH-1); do { - UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]); + UPDATE_HASH(s, s->ins_h, str); #ifndef FASTEST s->prev[str & s->w_mask] = s->head[s->ins_h]; #endif @@ -1073,7 +1073,7 @@ int ZEXPORT deflateCopy (dest, source) * allocating a large strm->next_in buffer and copying from it. * (See also flush_pending()). 
*/ -local int read_buf(strm, buf, size) +ZLIB_INTERNAL int read_buf(strm, buf, size) z_streamp strm; Bytef *buf; unsigned size; @@ -1171,10 +1171,31 @@ local void check_match(s, start, match, length) * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ -local void fill_window(s) +#ifdef HAVE_SSE2 +extern void fill_window_sse(deflate_state *s); +#endif +local void fill_window_c(deflate_state *s); + +local void fill_window(deflate_state *s) +{ +#ifdef HAVE_SSE2 +#ifdef CHECK_SSE2 + if (x86_cpu_has_sse2) { +#endif + fill_window_sse(s); + return; +#ifdef CHECK_SSE2 + } +#endif +#endif + + fill_window_c(s); +} + +local void fill_window_c(s) deflate_state *s; { - register unsigned n, m; + register unsigned n; register Posf *p; unsigned more; /* Amount of free space at the end of the window. */ uInt wsize = s->w_size; @@ -1216,6 +1237,7 @@ local void fill_window(s) n = s->hash_size; p = &s->head[n]; do { + unsigned m; m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); } while (--n); @@ -1224,6 +1246,7 @@ local void fill_window(s) #ifndef FASTEST p = &s->prev[n]; do { + unsigned m; m = *--p; *p = (Pos)(m >= wsize ? 
m-wsize : NIL); /* If n is not on any hash chain, prev[n] is garbage but @@ -1255,12 +1278,12 @@ local void fill_window(s) if (s->lookahead + s->insert >= MIN_MATCH) { uInt str = s->strstart - s->insert; s->ins_h = s->window[str]; - UPDATE_HASH(s, s->ins_h, s->window[str + 1]); + UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1)); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif while (s->insert) { - UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]); + UPDATE_HASH(s, s->ins_h, str); #ifndef FASTEST s->prev[str & s->w_mask] = s->head[s->ins_h]; #endif @@ -1478,7 +1501,7 @@ local block_state deflate_fast(s, flush) s->strstart += s->match_length; s->match_length = 0; s->ins_h = s->window[s->strstart]; - UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); + UPDATE_HASH(s, s->ins_h, s->strstart+1 - (MIN_MATCH-1)); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif diff --git a/deflate.h b/deflate.h index ce0299edd..f1c1ed9ba 100644 --- a/deflate.h +++ b/deflate.h @@ -343,4 +343,12 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf, flush = _tr_tally(s, distance, length) #endif +/* =========================================================================== + * Update a hash value with the given input byte + * IN assertion: all calls to to UPDATE_HASH are made with consecutive + * input characters, so that a running hash key can be computed from the + * previous key instead of complete recalculation each time. 
+ */ +#define UPDATE_HASH(s,h,i) (h = (((h)<<s->hash_shift) ^ (s->window[i + (MIN_MATCH-1)])) & s->hash_mask) + +#endif /* DEFLATE_H */ diff --git a/fill_window_sse.c b/fill_window_sse.c new file mode 100644 index 000000000..e07fd1c43 --- /dev/null +++ b/fill_window_sse.c @@ -0,0 +1,172 @@ +/* + * Fill Window with SSE2-optimized hash shifting + * + * Copyright (C) 2013 Intel Corporation + * Authors: + * Arjan van de Ven + * Jim Kukunas + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifdef HAVE_SSE2 + +#include <immintrin.h> +#include "deflate.h" + +void fill_window_sse(deflate_state *s) +{ + z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size); + + register unsigned n; + register Posf *p; + unsigned more; /* Amount of free space at the end of the window. */ + uInt wsize = s->w_size; + + Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); + + do { + more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); + + /* Deal with !@#$% 64K limit: */ + if (sizeof(int) <= 2) { + if (more == 0 && s->strstart == 0 && s->lookahead == 0) { + more = wsize; + + } else if (more == (unsigned)(-1)) { + /* Very unlikely, but possible on 16 bit machine if + * strstart == 0 && lookahead == 1 (input done a byte at time) + */ + more--; + } + } + + /* If the window is almost full and there is insufficient lookahead, + * move the upper half to the lower one to make room in the upper half. + */ + if (s->strstart >= wsize+MAX_DIST(s)) { + + zmemcpy(s->window, s->window+wsize, (unsigned)wsize); + s->match_start -= wsize; + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ + s->block_start -= (long) wsize; + + /* Slide the hash table (could be avoided with 32 bit values + at the expense of memory usage). We slide even when level == 0 + to keep the hash table consistent if we switch back to level > 0 + later. (Using level 0 permanently is not an optimal usage of + zlib, so we don't care about this pathological case.) 
+ */ + n = s->hash_size; + p = &s->head[n]; + p -= 8; + do { + __m128i value, result; + + value = _mm_loadu_si128((__m128i *)p); + result = _mm_subs_epu16(value, xmm_wsize); + _mm_storeu_si128((__m128i *)p, result); + + p -= 8; + n -= 8; + } while (n > 0); + + n = wsize; +#ifndef FASTEST + p = &s->prev[n]; + p -= 8; + do { + __m128i value, result; + + value = _mm_loadu_si128((__m128i *)p); + result = _mm_subs_epu16(value, xmm_wsize); + _mm_storeu_si128((__m128i *)p, result); + + p -= 8; + n -= 8; + } while (n > 0); +#endif + more += wsize; + } + if (s->strm->avail_in == 0) break; + + /* If there was no sliding: + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && + * more == window_size - lookahead - strstart + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) + * => more >= window_size - 2*WSIZE + 2 + * In the BIG_MEM or MMAP case (not yet supported), + * window_size == input_size + MIN_LOOKAHEAD && + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. + * Otherwise, window_size == 2*WSIZE so more >= 2. + * If there was sliding, more >= WSIZE. So in all cases, more >= 2. + */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead + s->insert >= MIN_MATCH) { + uInt str = s->strstart - s->insert; + s->ins_h = s->window[str]; + if (str >= 1) + UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1)); +#if MIN_MATCH != 3 + Call UPDATE_HASH() MIN_MATCH-3 more times +#endif + while (s->insert) { + UPDATE_HASH(s, s->ins_h, str); +#ifndef FASTEST + s->prev[str & s->w_mask] = s->head[s->ins_h]; +#endif + s->head[s->ins_h] = (Pos)str; + str++; + s->insert--; + if (s->lookahead + s->insert < MIN_MATCH) + break; + } + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. 
+ */ + + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); + + /* If the WIN_INIT bytes after the end of the current data have never been + * written, then zero those bytes in order to avoid memory check reports of + * the use of uninitialized (or uninitialised as Julian writes) bytes by + * the longest match routines. Update the high water mark for the next + * time through here. WIN_INIT is set to MAX_MATCH since the longest match + * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. + */ + if (s->high_water < s->window_size) { + ulg curr = s->strstart + (ulg)(s->lookahead); + ulg init; + + if (s->high_water < curr) { + /* Previous high water mark below current data -- zero WIN_INIT + * bytes or up to end of window, whichever is less. + */ + init = s->window_size - curr; + if (init > WIN_INIT) + init = WIN_INIT; + zmemzero(s->window + curr, (unsigned)init); + s->high_water = curr + init; + } + else if (s->high_water < (ulg)curr + WIN_INIT) { + /* High water mark at or above current data, but below current data + * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up + * to end of window, whichever is less. + */ + init = (ulg)curr + WIN_INIT - s->high_water; + if (init > s->window_size - s->high_water) + init = s->window_size - s->high_water; + zmemzero(s->window + s->high_water, (unsigned)init); + s->high_water += init; + } + } + + Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD, + "not enough room for search"); +} +#endif