SHELL=/bin/sh
EXE=
+FILL_WINDOW_SSE_o=
+FILL_WINDOW_SSE_lo=
+
prefix = /usr/local
exec_prefix = ${prefix}
libdir = ${exec_prefix}/lib
man3dir = ${mandir}/man3
pkgconfigdir = ${libdir}/pkgconfig
-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
+OBJZ = adler32.o crc32.o ${FILL_WINDOW_SSE_o} deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
OBJC = $(OBJZ) $(OBJG)
-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
+PIC_OBJZ = adler32.lo crc32.lo ${FILL_WINDOW_SSE_lo} deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
fi; \
rm -f $$TMP64
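+# fill_window_sse.o/.lo are the only objects compiled with -msse2, so the
+# SSE2 code stays isolated in a single translation unit.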
+fill_window_sse.lo: fill_window_sse.c
+	-@mkdir objs 2>/dev/null || test -d objs
+	$(CC) $(SFLAGS) -msse2 -DPIC -c -o objs/fill_window_sse.o fill_window_sse.c
+	-@mv objs/fill_window_sse.o $@
+
+fill_window_sse.o: fill_window_sse.c
+	$(CC) $(CFLAGS) -msse2 -I. -c -o $@ fill_window_sse.c
+
infcover.o: test/infcover.c zlib.h zconf.h
$(CC) $(CFLAGS) -I. -c -o $@ test/infcover.c
fi
fi
+# Check for SSE2 intrinsics
+cat > $test.c << EOF
+#include <immintrin.h>
+int main(void)
+{
+ __m128i zero = _mm_setzero_si128();
+ return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} -msse2 $test.c; then
+ echo "Checking for SSE2 intrinsics ... Yes." | tee -a configure.log
+ HAVE_SSE2_INTRIN=1
+else
+ echo "Checking for SSE2 intrinsics ... No." | tee -a configure.log
+ HAVE_SSE2_INTRIN=0
+fi
+
# Set ARCH specific FLAGS
case "${ARCH}" in
x86_64)
CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
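+        # x86_64 always provides SSE2, so the runtime check (CHECK_SSE2) is
+        # dropped and fill_window_sse() is used unconditionally.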
+        if test ${HAVE_SSE2_INTRIN} -eq 1; then
+            CFLAGS="${CFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+            SFLAGS="${SFLAGS} -UCHECK_SSE2 -DHAVE_SSE2"
+            FILL_WINDOW_SSE_o="fill_window_sse.o"
+            FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+        else
+            FILL_WINDOW_SSE_o=""
+            FILL_WINDOW_SSE_lo=""
+        fi
;;
i386 | i486 | i586 | i686)
OBJC="${OBJC} x86.o"
CFLAGS="${CFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
SFLAGS="${SFLAGS} -DADLER32_UNROLL_LESS -DCRC32_UNROLL_LESS"
+
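+        # 32-bit x86 may lack SSE2, so keep CHECK_SSE2 and decide at run time
+        # via x86_check_features() / x86_cpu_has_sse2.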
+        if test ${HAVE_SSE2_INTRIN} -eq 1; then
+            CFLAGS="${CFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+            SFLAGS="${SFLAGS} -DCHECK_SSE2 -DHAVE_SSE2"
+            FILL_WINDOW_SSE_o="fill_window_sse.o"
+            FILL_WINDOW_SSE_lo="fill_window_sse.lo"
+        else
+            FILL_WINDOW_SSE_o=""
+            FILL_WINDOW_SSE_lo=""
+        fi
;;
esac
echo prefix = $prefix >> configure.log
echo sharedlibdir = $sharedlibdir >> configure.log
echo uname = $uname >> configure.log
+echo FILL_WINDOW_SSE_o = ${FILL_WINDOW_SSE_o} >> configure.log
+echo FILL_WINDOW_SSE_lo = ${FILL_WINDOW_SSE_lo} >> configure.log
# update Makefile with the configure results
sed < Makefile.in "
/^PIC_OBJC *=/s#=.*#= $PIC_OBJC#
/^all: */s#:.*#: $ALL#
/^test: */s#:.*#: $TEST#
+/^FILL_WINDOW_SSE_o *=/s#=.*#= $FILL_WINDOW_SSE_o#
+/^FILL_WINDOW_SSE_lo *=/s#=.*#= $FILL_WINDOW_SSE_lo#
" > Makefile
# create zlib.pc with the configure results
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
-local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match));
/* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */
#define RANK(f) (((f) << 1) - ((f) > 4 ? 9 : 0))
-/* ===========================================================================
- * Update a hash value with the given input byte
- * IN assertion: all calls to to UPDATE_HASH are made with consecutive
- * input characters, so that a running hash key can be computed from the
- * previous key instead of complete recalculation each time.
- */
-#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
*/
#ifdef FASTEST
#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ (UPDATE_HASH(s, s->ins_h, (str)), \
match_head = s->head[s->ins_h], \
s->head[s->ins_h] = (Pos)(str))
#else
#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
+ (UPDATE_HASH(s, s->ins_h, (str)), \
match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
s->head[s->ins_h] = (Pos)(str))
#endif
s->head[s->hash_size-1] = NIL; \
zmemzero((Bytef *)s->head, (unsigned)(s->hash_size-1)*sizeof(*s->head));
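+/* x86.h declares x86_check_features() and the x86_cpu_has_sse2 flag used for
+ * runtime SSE2 detection when CHECK_SSE2 is defined.
+ */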
+#ifdef CHECK_SSE2
+#include "x86.h"
+#endif
+
/* ========================================================================= */
int ZEXPORT deflateInit_(strm, level, version, stream_size)
z_streamp strm;
* output size for (length,distance) codes is <= 24 bits.
*/
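+    /* With CHECK_SSE2, detect the CPU's capabilities before any deflate work
+     * is done, so fill_window() can pick the right implementation.
+     */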
+#ifdef CHECK_SSE2
+ x86_check_features();
+#endif
+
if (version == Z_NULL || version[0] != my_version[0] ||
stream_size != sizeof(z_stream)) {
return Z_VERSION_ERROR;
str = s->strstart;
n = s->lookahead - (MIN_MATCH-1);
do {
- UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+ UPDATE_HASH(s, s->ins_h, str);
#ifndef FASTEST
s->prev[str & s->w_mask] = s->head[s->ins_h];
#endif
* allocating a large strm->next_in buffer and copying from it.
* (See also flush_pending()).
*/
-local int read_buf(strm, buf, size)
+ZLIB_INTERNAL int read_buf(strm, buf, size)
z_streamp strm;
Bytef *buf;
unsigned size;
* performed for at least two bytes (required for the zip translate_eol
* option -- not supported here).
*/
-local void fill_window(s)
+#ifdef HAVE_SSE2
+extern void fill_window_sse OF((deflate_state *s));
+#endif
+local void fill_window_c OF((deflate_state *s));
+
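+/* fill_window() dispatches to the SSE2-optimized implementation when it is
+ * compiled in (HAVE_SSE2) and, for CHECK_SSE2 builds, when the CPU reports
+ * SSE2 support at run time; otherwise the portable fill_window_c() is used.
+ */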
+local void fill_window(deflate_state *s)
+{
+#ifdef HAVE_SSE2
+#ifdef CHECK_SSE2
+ if (x86_cpu_has_sse2) {
+#endif
+ fill_window_sse(s);
+ return;
+#ifdef CHECK_SSE2
+ }
+#endif
+#endif
+
+ fill_window_c(s);
+}
+
+local void fill_window_c(s)
deflate_state *s;
{
- register unsigned n, m;
+ register unsigned n;
register Posf *p;
unsigned more; /* Amount of free space at the end of the window. */
uInt wsize = s->w_size;
n = s->hash_size;
p = &s->head[n];
do {
+ unsigned m;
m = *--p;
*p = (Pos)(m >= wsize ? m-wsize : NIL);
} while (--n);
#ifndef FASTEST
p = &s->prev[n];
do {
+ unsigned m;
m = *--p;
*p = (Pos)(m >= wsize ? m-wsize : NIL);
/* If n is not on any hash chain, prev[n] is garbage but
if (s->lookahead + s->insert >= MIN_MATCH) {
uInt str = s->strstart - s->insert;
s->ins_h = s->window[str];
- UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
+ UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
while (s->insert) {
- UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+ UPDATE_HASH(s, s->ins_h, str);
#ifndef FASTEST
s->prev[str & s->w_mask] = s->head[s->ins_h];
#endif
s->strstart += s->match_length;
s->match_length = 0;
s->ins_h = s->window[s->strstart];
- UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+ UPDATE_HASH(s, s->ins_h, s->strstart+1 - (MIN_MATCH-1));
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
flush = _tr_tally(s, distance, length)
#endif
+/* ===========================================================================
+ * Update a hash value with the input byte at window index i + MIN_MATCH-1.
+ * IN assertion: all calls to UPDATE_HASH are made with consecutive input
+ * characters, so that a running hash key can be computed from the previous
+ * key instead of complete recalculation each time.
+ */
+#define UPDATE_HASH(s,h,i) (h = (((h)<<s->hash_shift) ^ (s->window[(i) + (MIN_MATCH-1)])) & s->hash_mask)
+
#endif /* DEFLATE_H */
--- /dev/null
+/*
+ * Fill Window with SSE2-optimized hash shifting
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef HAVE_SSE2
+
+#include <immintrin.h>
+#include "deflate.h"
+
+void fill_window_sse(deflate_state *s)
+{
+ z_const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
+
+ register unsigned n;
+ register Posf *p;
+ unsigned more; /* Amount of free space at the end of the window. */
+ uInt wsize = s->w_size;
+
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+ do {
+ more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
+
+ /* Deal with !@#$% 64K limit: */
+ if (sizeof(int) <= 2) {
+ if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
+ more = wsize;
+
+ } else if (more == (unsigned)(-1)) {
+ /* Very unlikely, but possible on 16 bit machine if
+ * strstart == 0 && lookahead == 1 (input done a byte at time)
+ */
+ more--;
+ }
+ }
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+
+ zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= (long) wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
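+            /* Slide eight 16-bit Pos entries per iteration: _mm_subs_epu16 is
+             * an unsigned saturating subtract, so entries smaller than wsize
+             * clamp to zero (== NIL), matching the scalar update
+             * (m >= wsize ? m-wsize : NIL).
+             */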
+ n = s->hash_size;
+ p = &s->head[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+
+ n = wsize;
+#ifndef FASTEST
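+            /* Slide the prev[] chain with the same saturating subtraction. */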
+ p = &s->prev[n];
+ p -= 8;
+ do {
+ __m128i value, result;
+
+ value = _mm_loadu_si128((__m128i *)p);
+ result = _mm_subs_epu16(value, xmm_wsize);
+ _mm_storeu_si128((__m128i *)p, result);
+
+ p -= 8;
+ n -= 8;
+ } while (n > 0);
+#endif
+ more += wsize;
+ }
+ if (s->strm->avail_in == 0) break;
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead + s->insert >= MIN_MATCH) {
+ uInt str = s->strstart - s->insert;
+ s->ins_h = s->window[str];
+ if (str >= 1)
+ UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
+#if MIN_MATCH != 3
+ Call UPDATE_HASH() MIN_MATCH-3 more times
+#endif
+ while (s->insert) {
+ UPDATE_HASH(s, s->ins_h, str);
+#ifndef FASTEST
+ s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+ s->head[s->ins_h] = (Pos)str;
+ str++;
+ s->insert--;
+ if (s->lookahead + s->insert < MIN_MATCH)
+ break;
+ }
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ ulg curr = s->strstart + (ulg)(s->lookahead);
+ ulg init;
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ zmemzero(s->window + curr, (unsigned)init);
+ s->high_water = curr + init;
+ }
+ else if (s->high_water < (ulg)curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = (ulg)curr + WIN_INIT - s->high_water;
+ if (init > s->window_size - s->high_water)
+ init = s->window_size - s->high_water;
+ zmemzero(s->window + s->high_water, (unsigned)init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
+}
+#endif /* HAVE_SSE2 */