]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Standardize fill_window implementations and abstract out slide_hash_neon for ARM.
authorNathan Moinvaziri <nathan@nathanm.com>
Mon, 10 Feb 2020 03:59:01 +0000 (19:59 -0800)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Thu, 30 Apr 2020 22:21:18 +0000 (00:21 +0200)
23 files changed:
CMakeLists.txt
arch/arm/Makefile.in
arch/arm/fill_window_arm.c [deleted file]
arch/arm/slide_neon.c [new file with mode: 0644]
arch/x86/Makefile.in
arch/x86/README.md
arch/x86/deflate_quick.c
arch/x86/fill_window_sse.c [deleted file]
configure
crc32.c
deflate.c
deflate.h
deflate_fast.c
deflate_medium.c
deflate_slow.c
fallback_builtins.h
functable.c
functable.h
inflate.c
win32/Makefile.a64
win32/Makefile.arm
win32/Makefile.msc
zutil.h

index 6fb57e11b2e172ce0df40afe15923c64b0e20b51..32eb662c0af637c867c490bb25a918fac9cd0662 100644 (file)
@@ -624,16 +624,19 @@ endif()
 
 if(WITH_OPTIM)
     if(BASEARCH_ARM_FOUND)
-        add_definitions(-DARM_GETAUXVAL)
-        list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c ${ARCHDIR}/fill_window_arm.c)
+        add_definitions(-DARM_CPUID)
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h)
+        list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c)
         if(WITH_NEON)
-            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c)
-            add_definitions(-DARM_NEON_ADLER32)
+            list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/adler32_neon.h)
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c)
+            add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH)
             add_intrinsics_option("${NEONFLAG}")
             if(MSVC)
                 add_definitions(-D__ARM_NEON__)
             endif()
-            add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+            add_feature_info(NEON_ALDER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
+            add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
         endif()
         if(WITH_ACLE)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
@@ -659,6 +662,7 @@ if(WITH_OPTIM)
         endif()
     elseif(BASEARCH_X86_FOUND)
         add_definitions(-DX86_CPUID)
+        list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86.h)
         list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/x86.c)
         if(MSVC)
             list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
@@ -685,7 +689,7 @@ if(WITH_OPTIM)
         endif()
         if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
             add_definitions(-DX86_SSE2)
-            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/fill_window_sse.c ${ARCHDIR}/slide_sse.c)
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_sse.c)
             if(NOT ${ARCH} MATCHES "x86_64")
                 add_intrinsics_option("${SSE2FLAG}")
                 add_feature_info(FORCE_SSE2 FORCE_SSE2 "Assume CPU is SSE2 capable")
index a64d591e53041fd4b27fdf1c4faa61805c698812..9a25482ce4bfac6ace9e2ba41e9e09a1c0f28627 100644 (file)
@@ -12,7 +12,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo slide_neon.o slide_neon.lo insert_string_acle.o insert_string_acle.lo
 
 adler32_neon.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
@@ -32,11 +32,11 @@ crc32_acle.o:
 crc32_acle.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
 
-fill_window_arm.o:
-       $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.o:
+       $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
 
-fill_window_arm.lo:
-       $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+slide_neon.lo:
+       $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_neon.c
 
 insert_string_acle.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c
deleted file mode 100644 (file)
index 4367451..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
- * Copyright (C) 2017 Mika T. Lindqvist
- *
- * Authors:
- * Mika T. Lindqvist <postmaster@raasu.org>
- * Jun He <jun.he@arm.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../deflate.h"
-#include "../../deflate_p.h"
-#include "../../functable.h"
-
-extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#include <arm_neon.h>
-
-/* SIMD version of hash_chain rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
-    register uint16x8_t v, *p;
-    register size_t n;
-
-    size_t size = entries*sizeof(table[0]);
-    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
-
-    Assert(sizeof(Pos) == 2, "Wrong Pos size");
-    v = vdupq_n_u16(window_size);
-
-    p = (uint16x8_t *)table;
-    n = size / (sizeof(uint16x8_t) * 8);
-    do {
-        p[0] = vqsubq_u16(p[0], v);
-        p[1] = vqsubq_u16(p[1], v);
-        p[2] = vqsubq_u16(p[2], v);
-        p[3] = vqsubq_u16(p[3], v);
-        p[4] = vqsubq_u16(p[4], v);
-        p[5] = vqsubq_u16(p[5], v);
-        p[6] = vqsubq_u16(p[6], v);
-        p[7] = vqsubq_u16(p[7], v);
-        p += 8;
-    } while (--n);
-}
-#else
-/* generic version for hash rebase */
-static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
-    unsigned int i;
-    for (i = 0; i < entries; i++) {
-        table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL;
-    }
-}
-#endif
-
-void fill_window_arm(deflate_state *s) {
-    register unsigned n;
-    unsigned long more;  /* Amount of free space at the end of the window. */
-    unsigned int wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = s->window_size - s->lookahead - s->strstart;
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, wsize);
-            s->match_start -= wsize;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-
-            slide_hash_chain(s->head, s->hash_size, wsize);
-            slide_hash_chain(s->prev, wsize, wsize);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0)
-            break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            unsigned int str = s->strstart - s->insert;
-            unsigned int insert_cnt = s->insert;
-            unsigned int slen;
-
-            s->ins_h = s->window[str];
-
-            if (UNLIKELY(s->lookahead < MIN_MATCH))
-                insert_cnt += s->lookahead - MIN_MATCH;
-            slen = insert_cnt;
-            if (str >= (MIN_MATCH - 2))
-            {
-                str += 2 - MIN_MATCH;
-                insert_cnt += MIN_MATCH - 2;
-            }
-            if (insert_cnt > 0)
-            {
-                functable.insert_string(s, str, insert_cnt);
-                s->insert -= slen;
-            }
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)s->lookahead;
-        unsigned long init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            memset(s->window + curr, 0, init);
-            s->high_water = curr + init;
-        } else if (s->high_water < curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = curr + WIN_INIT;
-            if (init > s->window_size)
-                init = s->window_size;
-            init -= s->high_water;
-            memset(s->window + s->high_water, 0, init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
diff --git a/arch/arm/slide_neon.c b/arch/arm/slide_neon.c
new file mode 100644 (file)
index 0000000..352d5a6
--- /dev/null
@@ -0,0 +1,48 @@
+/* slide_neon.c -- Optimized hash table shifting for ARM with support for NEON instructions
+ * Copyright (C) 2017 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ * Jun He <jun.he@arm.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_NEON_SLIDEHASH)
+#include <arm_neon.h>
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) {
+    register uint16x8_t v, *p;
+    register size_t n;
+
+    size_t size = entries*sizeof(table[0]);
+    Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err");
+
+    Assert(sizeof(Pos) == 2, "Wrong Pos size");
+    v = vdupq_n_u16(window_size);
+
+    p = (uint16x8_t *)table;
+    n = size / (sizeof(uint16x8_t) * 8);
+    do {
+        p[0] = vqsubq_u16(p[0], v);
+        p[1] = vqsubq_u16(p[1], v);
+        p[2] = vqsubq_u16(p[2], v);
+        p[3] = vqsubq_u16(p[3], v);
+        p[4] = vqsubq_u16(p[4], v);
+        p[5] = vqsubq_u16(p[5], v);
+        p[6] = vqsubq_u16(p[6], v);
+        p[7] = vqsubq_u16(p[7], v);
+        p += 8;
+    } while (--n);
+}
+
+ZLIB_INTERNAL void slide_hash_neon(deflate_state *s) {
+    unsigned int wsize = s->w_size;
+
+    slide_hash_chain(s->head, s->hash_size, wsize);
+    slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
index 187d06fdaef67e42cc3809c5839582093bd1b073..8da40bf7c3de3cd37255b93af6be63f72485388c 100644 (file)
@@ -17,7 +17,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
+all: x86.o x86.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
 
 x86.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -25,12 +25,6 @@ x86.o:
 x86.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
 
-fill_window_sse.o:
-       $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
-
-fill_window_sse.lo:
-       $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_sse.c
-
 deflate_quick.o:
        $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/deflate_quick.c
 
index 6d23945ff752490208d35d8d76941d6fc998d104..8bf6d08e5df050193f8cae74a567af75dba0172d 100644 (file)
@@ -3,6 +3,6 @@ Contents
 
 |Name|Description|
 |:-|:-|
-|fill_window_sse.c|SSE2 optimized fill_window|
 |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
 |crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|slide_sse2.c|SSE2 optimized slide_hash|
index 5cbc653dfd8edda07b014b5ac5bedadd88c100e1..809c08163069b4ef25f1c5adb1e4ce0057669124 100644 (file)
@@ -30,7 +30,6 @@
 #  include <ctype.h>
 #endif
 
-extern void fill_window_sse(deflate_state *s);
 extern void flush_pending(PREFIX3(stream) *strm);
 
 static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) {
@@ -209,7 +208,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
         }
 
         if (s->lookahead < MIN_LOOKAHEAD) {
-            fill_window_sse(s);
+            fill_window(s);
             if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                 static_emit_end_block(s, 0);
                 return need_more;
diff --git a/arch/x86/fill_window_sse.c b/arch/x86/fill_window_sse.c
deleted file mode 100644 (file)
index be35f9f..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Fill Window with SSE2-optimized hash shifting
- *
- * Copyright (C) 2013 Intel Corporation
- * Authors:
- *  Arjan van de Ven    <arjan@linux.intel.com>
- *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifdef X86_SSE2
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../deflate.h"
-#include "../../deflate_p.h"
-#include "../../functable.h"
-
-extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
-void slide_hash_sse2(deflate_state *s);
-#ifdef X86_AVX2
-void slide_hash_avx2(deflate_state *s);
-#endif
-
-ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
-    register unsigned n;
-    unsigned more;    /* Amount of free space at the end of the window. */
-    unsigned int wsize = s->w_size;
-
-    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
-
-    do {
-        more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
-
-        /* Deal with !@#$% 64K limit: */
-        if (sizeof(int) <= 2) {
-            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
-                more = wsize;
-
-            } else if (more == (unsigned)(-1)) {
-                /* Very unlikely, but possible on 16 bit machine if
-                 * strstart == 0 && lookahead == 1 (input done a byte at time)
-                 */
-                more--;
-            }
-        }
-
-        /* If the window is almost full and there is insufficient lookahead,
-         * move the upper half to the lower one to make room in the upper half.
-         */
-        if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, (unsigned)wsize);
-            s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
-            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
-            s->block_start -= (long) wsize;
-
-            /* Slide the hash table (could be avoided with 32 bit values
-               at the expense of memory usage). We slide even when level == 0
-               to keep the hash table consistent if we switch back to level > 0
-               later. (Using level 0 permanently is not an optimal usage of
-               zlib, so we don't care about this pathological case.)
-             */
-#ifdef X86_AVX2
-            if (x86_cpu_has_avx2) {
-                slide_hash_avx2(s);
-            } else
-#endif
-            slide_hash_sse2(s);
-            more += wsize;
-        }
-        if (s->strm->avail_in == 0) break;
-
-        /* If there was no sliding:
-         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
-         *    more == window_size - lookahead - strstart
-         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
-         * => more >= window_size - 2*WSIZE + 2
-         * In the BIG_MEM or MMAP case (not yet supported),
-         *   window_size == input_size + MIN_LOOKAHEAD  &&
-         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
-         * Otherwise, window_size == 2*WSIZE so more >= 2.
-         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
-         */
-        Assert(more >= 2, "more < 2");
-
-        n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
-        s->lookahead += n;
-
-        /* Initialize the hash value now that we have some input: */
-        if (s->lookahead + s->insert >= MIN_MATCH) {
-            unsigned int str = s->strstart - s->insert;
-            s->ins_h = s->window[str];
-            if (str >= 1)
-                functable.quick_insert_string(s, str + 2 - MIN_MATCH);
-#if MIN_MATCH != 3
-#error Call insert_string() MIN_MATCH-3 more times
-            while (s->insert) {
-                functable.quick_insert_string(s, str);
-                str++;
-                s->insert--;
-                if (s->lookahead + s->insert < MIN_MATCH)
-                    break;
-            }
-#else
-            unsigned int count;
-            if (UNLIKELY(s->lookahead == 1)) {
-                count = s->insert - 1;
-            } else {
-                count = s->insert;
-            }
-            functable.insert_string(s, str, count);
-            s->insert -= count;
-#endif
-        }
-        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
-         * but this is not important since only literal bytes will be emitted.
-         */
-    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
-
-    /* If the WIN_INIT bytes after the end of the current data have never been
-     * written, then zero those bytes in order to avoid memory check reports of
-     * the use of uninitialized (or uninitialised as Julian writes) bytes by
-     * the longest match routines.  Update the high water mark for the next
-     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
-     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
-     */
-    if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
-        unsigned long init;
-
-        if (s->high_water < curr) {
-            /* Previous high water mark below current data -- zero WIN_INIT
-             * bytes or up to end of window, whichever is less.
-             */
-            init = s->window_size - curr;
-            if (init > WIN_INIT)
-                init = WIN_INIT;
-            memset(s->window + curr, 0, (unsigned)init);
-            s->high_water = curr + init;
-        } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
-            /* High water mark at or above current data, but below current data
-             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
-             * to end of window, whichever is less.
-             */
-            init = (unsigned long)curr + WIN_INIT - s->high_water;
-            if (init > s->window_size - s->high_water)
-                init = s->window_size - s->high_water;
-            memset(s->window + s->high_water, 0, (unsigned)init);
-            s->high_water += init;
-        }
-    }
-
-    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search");
-}
-#endif
index b865684da21b5842195bef67ccf033680db96e11..5177d1715364239b31f39b10a03d2583e4d6a854 100755 (executable)
--- a/configure
+++ b/configure
@@ -1019,8 +1019,8 @@ case "${ARCH}" in
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE2"
                 SFLAGS="${SFLAGS} -DX86_SSE2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o slide_sse.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo slide_sse.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo"
 
                 if test $forcesse2 -eq 1; then
                     CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
@@ -1028,7 +1028,6 @@ case "${ARCH}" in
                 fi
 
                 # Enable deflate_quick at level 1?
-                # requires SSE2: code uses fill_window_sse
                 if test $without_new_strategies -eq 0; then
                     CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
                     SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY"
@@ -1077,8 +1076,8 @@ case "${ARCH}" in
             CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
             SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o slide_sse.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo slide_sse.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo"
 
             if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
@@ -1116,10 +1115,10 @@ case "${ARCH}" in
         ARCHDIR=arch/arm
 
         if test $without_optimizations -eq 0; then
-            CFLAGS="${CFLAGS} -DARM_GETAUXVAL"
-            SFLAGS="${SFLAGS} -DARM_GETAUXVAL"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo"
+            CFLAGS="${CFLAGS} -DARM_CPUID"
+            SFLAGS="${SFLAGS} -DARM_CPUID"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo"
         fi
 
 
@@ -1169,11 +1168,11 @@ case "${ARCH}" in
                     fi
 
                     if test $buildneon -eq 1; then
-                        CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32"
-                        SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32"
+                        CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1192,11 +1191,11 @@ case "${ARCH}" in
                             SFLAGS="${SFLAGS} -mfpu=neon"
                         fi
 
-                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32"
-                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32"
+                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1216,11 +1215,11 @@ case "${ARCH}" in
                             SFLAGS="${SFLAGS} -mfpu=neon"
                         fi
 
-                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32"
-                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32"
+                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1239,10 +1238,10 @@ case "${ARCH}" in
         fi
 
         if test $without_optimizations -eq 0; then
-            CFLAGS="${CFLAGS} -DARM_GETAUXVAL"
-            SFLAGS="${SFLAGS} -DARM_GETAUXVAL"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo"
+            CFLAGS="${CFLAGS} -DARM_CPUID"
+            SFLAGS="${SFLAGS} -DARM_CPUID"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo"
 
             if test $buildacle -eq 1; then
                 if test $native -eq 0; then
diff --git a/crc32.c b/crc32.c
index 6d9d6a69371b8a711de29c67a78c8d7e37a83c36..7939d491d977c8685ecf198676897ad36d61dee0 100644 (file)
--- a/crc32.c
+++ b/crc32.c
@@ -201,6 +201,7 @@ ZLIB_INTERNAL void crc_finalize(deflate_state *const s) {
 
 ZLIB_INTERNAL void crc_reset(deflate_state *const s) {
 #ifdef X86_PCLMULQDQ_CRC
+    x86_check_features();
     if (x86_cpu_has_pclmulqdq) {
         crc_fold_init(s);
         return;
index 00e2b5624480c362a8c64146716b7978448d4b56..40997827b2331b57677971bca99e243bf2e62306 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -265,7 +265,7 @@ int ZEXPORT PREFIX(deflateInit2_)(PREFIX3(stream) *strm, int level, int method,
 
 #if defined(X86_CPUID)
     x86_check_features();
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
     arm_check_features();
 #endif
 
@@ -473,14 +473,14 @@ int ZEXPORT PREFIX(deflateSetDictionary)(PREFIX3(stream) *strm, const unsigned c
     next = strm->next_in;
     strm->avail_in = dictLength;
     strm->next_in = (const unsigned char *)dictionary;
-    functable.fill_window(s);
+    fill_window(s);
     while (s->lookahead >= MIN_MATCH) {
         str = s->strstart;
         n = s->lookahead - (MIN_MATCH-1);
         functable.insert_string(s, str, n);
         s->strstart = str + n;
         s->lookahead = MIN_MATCH-1;
-        functable.fill_window(s);
+        fill_window(s);
     }
     s->strstart += s->lookahead;
     s->block_start = (long)s->strstart;
@@ -1246,22 +1246,22 @@ void check_match(deflate_state *s, IPos start, IPos match, int length) {
  *    option -- not supported here).
  */
 
-void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
+void ZLIB_INTERNAL fill_window(deflate_state *s) {
     unsigned n;
-    unsigned more;    /* Amount of free space at the end of the window. */
+    unsigned long more;    /* Amount of free space at the end of the window. */
     unsigned int wsize = s->w_size;
 
     Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
 
     do {
-        more = (unsigned)(s->window_size -(unsigned long)s->lookahead -(unsigned long)s->strstart);
+        more = s->window_size - s->lookahead - s->strstart;
 
         /* If the window is almost full and there is insufficient lookahead,
          * move the upper half to the lower one to make room in the upper half.
          */
         if (s->strstart >= wsize+MAX_DIST(s)) {
-            memcpy(s->window, s->window+wsize, (unsigned)wsize - more);
-            s->match_start -= wsize;
+            memcpy(s->window, s->window+wsize, (unsigned)wsize);
+            s->match_start = (s->match_start >= wsize) ? s->match_start - wsize : 0;
             s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
             s->block_start -= (long) wsize;
             if (s->insert > s->strstart)
@@ -1310,7 +1310,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
             } else {
                 count = s->insert;
             }
-            functable.insert_string(s,str,count);
+            functable.insert_string(s, str, count);
             s->insert -= count;
 #endif
         }
@@ -1327,7 +1327,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
      * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
      */
     if (s->high_water < s->window_size) {
-        unsigned long curr = s->strstart + (unsigned long)(s->lookahead);
+        unsigned long curr = s->strstart + (unsigned long)s->lookahead;
         unsigned long init;
 
         if (s->high_water < curr) {
@@ -1337,9 +1337,9 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
             init = s->window_size - curr;
             if (init > WIN_INIT)
                 init = WIN_INIT;
-            memset(s->window + curr, 0, (unsigned)init);
+            memset(s->window + curr, 0, init);
             s->high_water = curr + init;
-        } else if (s->high_water < (unsigned long)curr + WIN_INIT) {
+        } else if (s->high_water < curr + WIN_INIT) {
             /* High water mark at or above current data, but below current data
              * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
              * to end of window, whichever is less.
@@ -1347,7 +1347,7 @@ void ZLIB_INTERNAL fill_window_c(deflate_state *s) {
             init = (unsigned long)curr + WIN_INIT - s->high_water;
             if (init > s->window_size - s->high_water)
                 init = s->window_size - s->high_water;
-            memset(s->window + s->high_water, 0, (unsigned)init);
+            memset(s->window + s->high_water, 0, init);
             s->high_water += init;
         }
     }
@@ -1562,7 +1562,7 @@ static block_state deflate_rle(deflate_state *s, int flush) {
          * for the longest run, plus one for the unrolled loop.
          */
         if (s->lookahead <= MAX_MATCH) {
-            functable.fill_window(s);
+            fill_window(s);
             if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) {
                 return need_more;
             }
@@ -1629,7 +1629,7 @@ static block_state deflate_huff(deflate_state *s, int flush) {
     for (;;) {
         /* Make sure that we have a literal to write. */
         if (s->lookahead == 0) {
-            functable.fill_window(s);
+            fill_window(s);
             if (s->lookahead == 0) {
                 if (flush == Z_NO_FLUSH)
                     return need_more;
index 07c2587e2933196c732872c9ebd92794d5284cc3..70f98120f113e92e27f7c8348bf922986eb24ff1 100644 (file)
--- a/deflate.h
+++ b/deflate.h
@@ -372,7 +372,7 @@ static inline void put_uint32_msb(deflate_state *s, uint32_t dw) {
    memory checker errors from longest match routines */
 
 
-void ZLIB_INTERNAL fill_window_c(deflate_state *s);
+void ZLIB_INTERNAL fill_window(deflate_state *s);
 void ZLIB_INTERNAL slide_hash_c(deflate_state *s);
 
         /* in trees.c */
index 56599261278694fe49755f1d09e7ee489d4fa048..9efda47816e88e8d95951792d2d39dc1590de0df 100644 (file)
@@ -28,7 +28,7 @@ ZLIB_INTERNAL block_state deflate_fast(deflate_state *s, int flush) {
          * string following the next match.
          */
         if (s->lookahead < MIN_LOOKAHEAD) {
-            functable.fill_window(s);
+            fill_window(s);
             if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                 return need_more;
             }
index 31f837232786c6b564c0ca8a2e90f6df4d692155..4e5d0949dbe1543cf9741db6f870d5e048b3d9a1 100644 (file)
@@ -207,7 +207,7 @@ ZLIB_INTERNAL block_state deflate_medium(deflate_state *s, int flush) {
          * string following the next current_match.
          */
         if (s->lookahead < MIN_LOOKAHEAD) {
-            functable.fill_window(s);
+            fill_window(s);
             if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                 return need_more;
             }
index fb217ccd74e44c3b0363994dd4023c4d2371b7ac..b8b15982c6b4898a245bd7104b2087e8e4b63a2e 100644 (file)
@@ -36,7 +36,7 @@ ZLIB_INTERNAL block_state deflate_slow(deflate_state *s, int flush) {
          * string following the next match.
          */
         if (s->lookahead < MIN_LOOKAHEAD) {
-            functable.fill_window(s);
+            fill_window(s);
             if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
                 return need_more;
             }
index 3598b7c913d3f946293abaafc82f4895f74ea349..8bd16ed874319ff828966741dc9227d419a1b7a1 100644 (file)
@@ -12,6 +12,7 @@
  */
 static __forceinline unsigned long __builtin_ctzl(unsigned long value) {
 #ifdef X86_CPUID
+    x86_check_features();
     if (x86_cpu_has_tzcnt)
         return _tzcnt_u32(value);
 #endif
index c462a921209226d9279735b722ad268def2bd85a..8f634500ffc41a73700ded7cb593fa3fc50191c3 100644 (file)
@@ -24,17 +24,14 @@ extern Pos quick_insert_string_sse4(deflate_state *const s, const Pos str);
 #elif defined(ARM_ACLE_CRC_HASH)
 extern Pos quick_insert_string_acle(deflate_state *const s, const Pos str);
 #endif
-
-/* fill_window */
-#if defined(X86_SSE2)
-extern void fill_window_sse(deflate_state *s);
-#elif defined(ARM_GETAUXVAL)
-extern void fill_window_arm(deflate_state *s);
-#endif
-
 /* slide_hash */
 #ifdef X86_SSE2
 void slide_hash_sse2(deflate_state *s);
+#elif defined(ARM_NEON_SLIDEHASH)
+void slide_hash_neon(deflate_state *s);
+#endif
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
 #endif
 
 /* adler32 */
@@ -65,14 +62,12 @@ extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t);
 /* stub definitions */
 ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count);
 ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str);
-ZLIB_INTERNAL void fill_window_stub(deflate_state *s);
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len);
 ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len);
 ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
 
 /* functable init */
 ZLIB_INTERNAL __thread struct functable_s functable = {
-    fill_window_stub,
     insert_string_stub,
     quick_insert_string_stub,
     adler32_stub,
@@ -80,11 +75,25 @@ ZLIB_INTERNAL __thread struct functable_s functable = {
     slide_hash_stub
 };
 
+ZLIB_INTERNAL void cpu_check_features(void)
+{
+    static int features_checked = 0;
+    if (features_checked)
+        return;
+#ifdef X86_CPUID
+    x86_check_features();
+#elif ARM_CPUID
+    arm_check_features();
+#endif
+    features_checked = 1;
+}
 
 /* stub functions */
 ZLIB_INTERNAL Pos insert_string_stub(deflate_state *const s, const Pos str, unsigned int count) {
     // Initialize default
+
     functable.insert_string = &insert_string_c;
+    cpu_check_features();
 
 #ifdef X86_SSE42_CRC_HASH
     if (x86_cpu_has_sse42)
@@ -111,31 +120,23 @@ ZLIB_INTERNAL Pos quick_insert_string_stub(deflate_state *const s, const Pos str
     return functable.quick_insert_string(s, str);
 }
 
-ZLIB_INTERNAL void fill_window_stub(deflate_state *s) {
-    // Initialize default
-    functable.fill_window = &fill_window_c;
-
-#if defined(X86_SSE2)
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
-    if (x86_cpu_has_sse2)
-# endif
-        functable.fill_window = &fill_window_sse;
-#elif defined(ARM_GETAUXVAL)
-    functable.fill_window = &fill_window_arm;
-#endif
-
-    functable.fill_window(s);
-}
-
 ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
-    // Initialize default
+
     functable.slide_hash = &slide_hash_c;
+    cpu_check_features();
 
 #ifdef X86_SSE2
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+#  if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
     if (x86_cpu_has_sse2)
-# endif
+#  endif
         functable.slide_hash = &slide_hash_sse2;
+#elif defined(ARM_NEON_SLIDEHASH)
+    if (arm_cpu_has_neon)
+        functable.slide_hash = &slide_hash_neon;
+#endif
+#ifdef X86_AVX2
+    if (x86_cpu_has_avx2)
+        functable.slide_hash = &slide_hash_avx2;
 #endif
 
     functable.slide_hash(s);
@@ -144,6 +145,7 @@ ZLIB_INTERNAL void slide_hash_stub(deflate_state *s) {
 ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
     // Initialize default
     functable.adler32 = &adler32_c;
+    cpu_check_features();
 
 #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32)
     if (arm_cpu_has_neon)
@@ -163,6 +165,7 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64
     if (crc_table_empty)
         make_crc_table();
 #endif /* DYNAMIC_CRC_TABLE */
+    cpu_check_features();
 
     if (sizeof(void *) == sizeof(ptrdiff_t)) {
 #if BYTE_ORDER == LITTLE_ENDIAN
index 98e068a20eab35422e8036e233e3d2c8c91c51e6..a03c1e40c39af0d82b7bf10d0aa08b47f7c7cc92 100644 (file)
@@ -9,7 +9,6 @@
 #include "deflate.h"
 
 struct functable_s {
-    void     (* fill_window)        (deflate_state *s);
     Pos      (* insert_string)      (deflate_state *const s, const Pos str, unsigned int count);
     Pos      (* quick_insert_string)(deflate_state *const s, const Pos str);
     uint32_t (* adler32)            (uint32_t adler, const unsigned char *buf, size_t len);
index bfaf85fa4efdd74065c18a0b40234613fca4aecb..9ccb0de92b61cdead9a6d98307de36924983fcea 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -133,7 +133,7 @@ int ZEXPORT PREFIX(inflateInit2_)(PREFIX3(stream) *strm, int windowBits, const c
 
 #if defined(X86_CPUID)
     x86_check_features();
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
     arm_check_features();
 #endif
 
index 28a41d8769e3ac75e78d2aedd4eb97a5836e0046..94e7f57da62a1ae64c22847f4d26f2fa9efd1143 100644 (file)
@@ -50,8 +50,8 @@ RCFILE = zlib-ng1.rc
 RESFILE = zlib-ng1.res
 SUFFIX = -ng
 !endif
-WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj
+WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj slide_neon.obj .adler32_neon.obj
 
 # targets
 all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
index bd04f0f8efc5b1f7eff108a9c9e6bdd8b8436c84..2c3a82d4e82321adf62a5dedadddff2d65c8926f 100644 (file)
@@ -63,8 +63,8 @@ NEON_ARCH = /arch:VFPv3
 !endif
 !if "$(WITH_NEON)" != ""
 CFLAGS = $(CFLAGS) $(NEON_ARCH)
-WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NOCHECK_NEON
-OBJS = $(OBJS) adler32_neon.obj
+WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH -DARM_NOCHECK_NEON
+OBJS = $(OBJS) adler32_neon.obj slide_neon.obj
 !endif
 
 # targets
index 687b0831ad9cf152421270e5997c3a7d898d6960..9f9cd31dd8ce0bfec4f880158589dbb5a1aee78d 100644 (file)
@@ -38,7 +38,7 @@ OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_q
        deflate_medium.obj \
        functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \
        slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
-       x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
+       x86.obj insert_string_sse.obj crc_folding.obj
 !if "$(ZLIB_COMPAT)" != ""
 WITH_GZFILEOP = yes
 WFLAGS = $(WFLAGS) -DZLIB_COMPAT
diff --git a/zutil.h b/zutil.h
index 8e9d056f4e4c6c60900e293e71d1125bd7d06e3c..9da665273fca66b6c7d847023423c0a520be60fc 100644 (file)
--- a/zutil.h
+++ b/zutil.h
@@ -246,7 +246,7 @@ void ZLIB_INTERNAL   zng_cfree(void *opaque, void *ptr);
 
 #if defined(X86_CPUID)
 #  include "arch/x86/x86.h"
-#elif defined(ARM_GETAUXVAL)
+#elif defined(ARM_CPUID)
 #  include "arch/arm/arm.h"
 #endif