Use uint8_t[8] struct on big-endian machines for speed.
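In practice the choice is a two-way typedef gated on UNALIGNED_OK (see the memchunk.c hunk further down); a minimal sketch of that selection, with the memcpy-based load as an illustrative simplification rather than the file's exact code:

    #include <stdint.h>
    #include <string.h>

    /* When unaligned 64-bit accesses are not known to be safe, fall back to an
       8-byte array and let memcpy pick whatever load/store sequence is legal. */
    #ifdef UNALIGNED_OK
    typedef uint64_t memchunk_t;
    #else
    typedef uint8_t memchunk_t[8];
    #endif

    static inline void loadchunk(const uint8_t *s, memchunk_t *chunk) {
        memcpy(chunk, s, sizeof(memchunk_t));   /* valid for either typedef */
    }
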
list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c)
if(WITH_NEON)
- add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH)
- set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c)
+ add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH)
+ set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/memchunk_neon.c ${ARCHDIR}/slide_neon.c)
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
add_intrinsics_option("${NEONFLAG}" ${NEON_SRCS})
if(MSVC)
add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
endif()
- if(WITH_ACLE)
+ if(WITH_ACLE AND NOT MSVC)
add_definitions(-DARM_ACLE_CRC_HASH)
set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
- # For ARM aarch64, we need to check WITH_NEON first
- if("${ARCH}" MATCHES "arm" OR NOT WITH_NEON)
- add_intrinsics_option("${ACLEFLAG}" ${ACLE_SRCS})
- endif()
+ add_intrinsics_option("${ACLEFLAG}" ${ACLE_SRCS})
list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"")
endif()
add_intrinsics_option("${SSE4FLAG}" ${SSE42_SRCS})
endif()
if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
- add_definitions(-DX86_SSE2)
- set(SSE2_SRCS ${ARCHDIR}/slide_sse.c)
+ add_definitions(-DX86_SSE2 -DX86_SSE2_MEMCHUNK)
+ set(SSE2_SRCS ${ARCHDIR}/memchunk_sse.c ${ARCHDIR}/slide_sse.c)
list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
if(NOT ${ARCH} MATCHES "x86_64")
add_intrinsics_option("${SSE2FLAG}" ${SSE2_SRCS})
inftrees.h
insert_string_tpl.h
match_tpl.h
- memcopy.h
+ memchunk_tpl.h
trees.h
trees_emit.h
trees_p.h
inftrees.c
inffast.c
insert_string.c
+ memchunk.c
trees.c
uncompr.c
zutil.c
| inffast.* | Decompress data with speed optimizations |
| inffixed.h | Table for decoding fixed codes |
| inftrees.h | Generate Huffman trees for efficient decoding |
-| memcopy.h | Inline functions to copy small data chunks |
+| memchunk.* | Inline functions to copy small data chunks |
| trees.* | Output deflated data using Huffman coding |
| uncompr.c | Decompress a memory buffer |
| zconf.h.cmakein | zconf.h template for cmake |
inflate.o \
inftrees.o \
insert_string.o \
+ memchunk.o \
trees.o \
uncompr.o \
zutil.o \
inflate.lo \
inftrees.lo \
insert_string.lo \
+ memchunk.lo \
trees.lo \
uncompr.lo \
zutil.lo \
adler32_neon.o adler32_neon.lo \
armfeature.o armfeature.lo \
crc32_acle.o crc32_acle.lo \
+ memchunk_neon.o memchunk_neon.lo \
slide_neon.o slide_neon.lo \
insert_string_acle.o insert_string_acle.lo
insert_string_acle.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
+memchunk_neon.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_neon.c
+
+memchunk_neon.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_neon.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
c = ~c;
return c;
}
-#endif /* __ARM_FEATURE_CRC32 */
+#endif
--- /dev/null
+/* memchunk_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef MEMCHUNK_NEON_H_
+#define MEMCHUNK_NEON_H_
+
+#ifdef ARM_NEON_MEMCHUNK
+#include "zbuild.h"
+#include "zutil.h"
+
+#include <arm_neon.h>
+
+typedef uint8x16_t memchunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_3
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+ *chunk = vld1q_dup_u8(from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, memchunk_t *chunk) {
+ *chunk = vreinterpretq_u8_s16(vdupq_n_s16(*(int16_t *)from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+ *chunk = vreinterpretq_u8_s32(vdupq_n_s32(*(int32_t *)from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+ *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
+}
+
+#define CHUNKSIZE chunksize_neon
+#define CHUNKCOPY chunkcopy_neon
+#define CHUNKCOPY_SAFE chunkcopy_safe_neon
+#define CHUNKUNROLL chunkunroll_neon
+#define CHUNKMEMSET chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len);
+uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len);
+
+static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
+ uint8x8x3_t chunks;
+ unsigned sz = sizeof(chunks);
+ if (len < sz) {
+ out = CHUNKUNROLL(out, &dist, &len);
+ return CHUNKCOPY(out, out - dist, len);
+ }
+
+ /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
+ chunks[0] = {a,a,a,a,a,a,a,a}
+ chunks[1] = {b,b,b,b,b,b,b,b}
+ chunks[2] = {c,c,c,c,c,c,c,c}. */
+ chunks = vld3_dup_u8(from);
+
+ unsigned rem = len % sz;
+ len -= rem;
+ while (len) {
+ /* Store "a,b,c, ..., a,b,c". */
+ vst3_u8(out, chunks);
+ out += sz;
+ len -= sz;
+ }
+
+ if (!rem)
+ return out;
+
+ /* Last, deal with the case when LEN is not a multiple of SZ. */
+ out = CHUNKUNROLL(out, &dist, &rem);
+ return CHUNKCOPY(out, out - dist, rem);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define HAVE_CHUNKMEMSET_6
+
+static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
+ uint16x8x3_t chunks;
+ unsigned sz = sizeof(chunks);
+ if (len < sz) {
+ out = CHUNKUNROLL(out, &dist, &len);
+ return CHUNKCOPY(out, out - dist, len);
+ }
+
+ /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
+ chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
+ chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
+ chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
+ chunks = vld3q_dup_u16((unsigned short *)from);
+
+ unsigned rem = len % sz;
+ len -= rem;
+ while (len) {
+ /* Store "ab,cd,ef, ..., ab,cd,ef". */
+ vst3q_u16((unsigned short *)out, chunks);
+ out += sz;
+ len -= sz;
+ }
+
+ if (!rem)
+ return out;
+
+ /* Last, deal with the case when LEN is not a multiple of SZ. */
+ out = CHUNKUNROLL(out, &dist, &rem);
+ return CHUNKCOPY(out, out - dist, rem);
+}
+
+#endif
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+ *chunk = *(memchunk_t *)s;
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+ memcpy(out, chunk, sizeof(memchunk_t));
+}
+
+#include "memchunk_tpl.h"
+
+#endif
+#endif
compare258_sse.o compare258_sse.lo \
insert_string_sse.o insert_string_sse.lo \
crc_folding.o crc_folding.lo \
- slide_avx.o slide_avx.lo
+ memchunk_sse.o memchunk_sse.lo \
+ slide_avx.o slide_avx.lo \
slide_sse.o slide_sse.lo
x86.o:
crc_folding.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
+memchunk_sse.o:
+ $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_sse.c
+
+memchunk_sse.lo:
+ $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_sse.c
+
slide_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
--- /dev/null
+/* memchunk_sse.c -- SSE inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef MEMCHUNK_SSE_H_
+#define MEMCHUNK_SSE_H_
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i memchunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+ *chunk = _mm_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, memchunk_t *chunk) {
+ *chunk = _mm_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+ *chunk = _mm_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+ *chunk = _mm_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE chunksize_sse2
+#define CHUNKCOPY chunkcopy_sse2
+#define CHUNKCOPY_SAFE chunkcopy_safe_sse2
+#define CHUNKUNROLL chunkunroll_sse2
+#define CHUNKMEMSET chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "memchunk_tpl.h"
+
+#endif
+#endif
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo"
if test ${HAVE_SSE2_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_SSE2"
- SFLAGS="${SFLAGS} -DX86_SSE2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo"
+ CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_MEMCHUNK"
+ SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_MEMCHUNK"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o memchunk_sse.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo memchunk_sse.lo"
if test $forcesse2 -eq 1; then
CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
# Enable arch-specific optimizations?
if test $without_optimizations -eq 0; then
- CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
- SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
+ CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE2_MEMCHUNK -DX86_SSE42_CRC_HASH"
+ SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE2_MEMCHUNK -DX86_SSE42_CRC_HASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o memchunk_sse.o slide_sse.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo memchunk_sse.lo slide_sse.lo"
if test ${HAVE_SSSE3_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
fi
if test $buildneon -eq 1; then
- CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
fi
fi
;;
SFLAGS="${SFLAGS} -mfpu=neon"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
fi
fi
;;
SFLAGS="${SFLAGS} -mfpu=neon"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
fi
fi
;;
if test $native -eq 0; then
ARCH="${ARCH}+simd"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+ CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
fi
fi
extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
#endif
+/* memory chunking */
+extern uint32_t chunksize_c(void);
+extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSE2_MEMCHUNK
+extern uint32_t chunksize_sse2(void);
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+extern uint32_t chunksize_neon(void);
+extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+
/* CRC32 */
ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1);
ZLIB_INTERNAL int32_t longest_match_stub(deflate_state *const s, Pos cur_match);
+ZLIB_INTERNAL uint32_t chunksize_stub(void);
+ZLIB_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len);
+ZLIB_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+ZLIB_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len);
+ZLIB_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len);
+ZLIB_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left);
/* functable init */
ZLIB_INTERNAL __thread struct functable_s functable = {
crc32_stub,
slide_hash_stub,
compare258_stub,
- longest_match_stub
+ longest_match_stub,
+ chunksize_stub,
+ chunkcopy_stub,
+ chunkcopy_safe_stub,
+ chunkunroll_stub,
+ chunkmemset_stub,
+ chunkmemset_safe_stub
};
ZLIB_INTERNAL void cpu_check_features(void)
return functable.adler32(adler, buf, len);
}
+ZLIB_INTERNAL uint32_t chunksize_stub(void) {
+ // Initialize default
+ functable.chunksize = &chunksize_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunksize = &chunksize_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunksize = &chunksize_neon;
+#endif
+
+ return functable.chunksize();
+}
+
+ZLIB_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
+ // Initialize default
+ functable.chunkcopy = &chunkcopy_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunkcopy = &chunkcopy_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunkcopy = &chunkcopy_neon;
+#endif
+
+ return functable.chunkcopy(out, from, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
+ // Initialize default
+ functable.chunkcopy_safe = &chunkcopy_safe_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunkcopy_safe = &chunkcopy_safe_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunkcopy_safe = &chunkcopy_safe_neon;
+#endif
+
+ return functable.chunkcopy_safe(out, from, len, safe);
+}
+
+ZLIB_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
+ // Initialize default
+ functable.chunkunroll = &chunkunroll_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunkunroll = &chunkunroll_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunkunroll = &chunkunroll_neon;
+#endif
+
+ return functable.chunkunroll(out, dist, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
+ // Initialize default
+ functable.chunkmemset = &chunkmemset_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunkmemset = &chunkmemset_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunkmemset = &chunkmemset_neon;
+#endif
+
+ return functable.chunkmemset(out, dist, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+ // Initialize default
+ functable.chunkmemset_safe = &chunkmemset_safe_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+ if (x86_cpu_has_sse2)
+# endif
+ functable.chunkmemset_safe = &chunkmemset_safe_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+ if (arm_cpu_has_neon)
+ functable.chunkmemset_safe = &chunkmemset_safe_neon;
+#endif
+
+ return functable.chunkmemset_safe(out, dist, len, left);
+}
+
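Each of the new stubs above follows the same lazy-dispatch pattern already used by crc32_stub and the other existing entries: the table slot initially points at the stub, the first call detects CPU features, rewrites the slot with the best implementation, and forwards the call, so later calls go straight through the updated pointer. A minimal self-contained sketch of that pattern with hypothetical names (not the project's actual code):

    #include <stdint.h>

    static int cpu_has_sse2 = 0;                 /* stand-in for the real feature flag */

    static uint32_t chunksize_portable(void)  { return 8; }
    static uint32_t chunksize_sse2_like(void) { return 16; }

    static uint32_t chunksize_stub_sketch(void);
    static struct { uint32_t (*chunksize)(void); } table = { chunksize_stub_sketch };

    static uint32_t chunksize_stub_sketch(void) {
        table.chunksize = chunksize_portable;        /* portable default */
        if (cpu_has_sse2)
            table.chunksize = chunksize_sse2_like;   /* upgrade when supported */
        return table.chunksize();                    /* forward this first call */
    }

Because the real functable is declared __thread, the feature check runs at most once per thread for each entry.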
ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
Assert(sizeof(uint64_t) >= sizeof(size_t),
void (* slide_hash) (deflate_state *s);
int32_t (* compare258) (const unsigned char *src0, const unsigned char *src1);
int32_t (* longest_match) (deflate_state *const s, Pos cur_match);
+ uint32_t (* chunksize) (void);
+ uint8_t* (* chunkcopy) (uint8_t *out, uint8_t const *from, unsigned len);
+ uint8_t* (* chunkcopy_safe) (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+ uint8_t* (* chunkunroll) (uint8_t *out, unsigned *dist, unsigned *len);
+ uint8_t* (* chunkmemset) (uint8_t *out, unsigned dist, unsigned len);
+ uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
};
ZLIB_INTERNAL extern __thread struct functable_s functable;
-
#endif
#include "inflate.h"
#include "inffast.h"
#include "inflate_p.h"
-#include "memcopy.h"
+#include "functable.h"
+
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+ uint64_t chunk;
+ memcpy(&chunk, in, sizeof(chunk));
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+ return chunk << bits;
+#else
+ return ZSWAP64(chunk) << bits;
+#endif
+}
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
unsigned char *out; /* local strm->next_out */
unsigned char *beg; /* inflate()'s initial strm->next_out */
unsigned char *end; /* while out < end, enough space available */
-#ifdef INFFAST_CHUNKSIZE
unsigned char *safe; /* can use chunkcopy provided out < safe */
-#endif
#ifdef INFLATE_STRICT
unsigned dmax; /* maximum distance from zlib header */
#endif
out = strm->next_out;
beg = out - (start - strm->avail_out);
end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
-
-#ifdef INFFAST_CHUNKSIZE
safe = out + strm->avail_out;
-#endif
#ifdef INFLATE_STRICT
dmax = state->dmax;
#endif
}
#endif
}
-#ifdef INFFAST_CHUNKSIZE
from = window;
if (wnext == 0) { /* very common case */
from += wsize - op;
from += wsize - op;
if (op < len) { /* some from end of window */
len -= op;
- out = chunkcopysafe(out, from, op, safe);
+ out = functable.chunkcopy_safe(out, from, op, safe);
from = window; /* more from start of window */
op = wnext;
/* This (rare) case can create a situation where
}
if (op < len) { /* still need some from output */
len -= op;
- out = chunkcopysafe(out, from, op, safe);
- out = chunkunroll(out, &dist, &len);
- out = chunkcopysafe(out, out - dist, len, safe);
+ out = functable.chunkcopy_safe(out, from, op, safe);
+ out = functable.chunkunroll(out, &dist, &len);
+ out = functable.chunkcopy_safe(out, out - dist, len, safe);
} else {
- out = chunkcopysafe(out, from, len, safe);
+ out = functable.chunkcopy_safe(out, from, len, safe);
}
-#else
- from = window;
- if (wnext == 0) { /* very common case */
- from += wsize - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = out - dist; /* rest from output */
- }
- } else if (wnext < op) { /* wrap around window */
- from += wsize + wnext - op;
- op -= wnext;
- if (op < len) { /* some from end of window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = window;
- if (wnext < len) { /* some from start of window */
- op = wnext;
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- } else { /* contiguous in window */
- from += wnext - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
-
- out = chunk_copy(out, from, (int) (out - from), len);
-#endif
} else {
-#ifdef INFFAST_CHUNKSIZE
/* Whole reference is in range of current output. No
range checks are necessary because we start with room
for at least 258 bytes of output, so unroll and roundoff
operations can write beyond `out+len` so long as they
stay within 258 bytes of `out`.
*/
- if (dist >= len || dist >= INFFAST_CHUNKSIZE)
- out = chunkcopy(out, out - dist, len);
- else
- out = chunkmemset(out, dist, len);
-#else
- if (len < sizeof(uint64_t))
- out = set_bytes(out, out - dist, dist, len);
- else if (dist == 1)
- out = byte_memset(out, len);
+ if (dist >= len || dist >= functable.chunksize())
+ out = functable.chunkcopy(out, out - dist, len);
else
- out = chunk_memset(out, out - dist, dist, len);
-#endif
+ out = functable.chunkmemset(out, dist, len);
}
} else if ((op & 64) == 0) { /* 2nd level distance code */
here = dcode + here->val + BITS(op);
#include "inffast.h"
#include "inflate_p.h"
#include "inffixed.h"
-#include "memcopy.h"
#include "functable.h"
/* Architecture-specific hooks. */
int ZLIB_INTERNAL inflate_ensure_window(struct inflate_state *state) {
/* if it hasn't been done already, allocate space for the window */
if (state->window == NULL) {
-#ifdef INFFAST_CHUNKSIZE
unsigned wsize = 1U << state->wbits;
- state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, wsize + INFFAST_CHUNKSIZE, sizeof(unsigned char));
+ state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, wsize + functable.chunksize(), sizeof(unsigned char));
if (state->window == Z_NULL)
return 1;
- memset(state->window + wsize, 0, INFFAST_CHUNKSIZE);
-#else
- state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, 1U << state->wbits, sizeof(unsigned char));
- if (state->window == NULL)
- return 1;
-#endif
+ memset(state->window + wsize, 0, functable.chunksize());
}
/* if window not in use yet, initialize */
copy = state->length;
if (copy > left)
copy = left;
-#if defined(INFFAST_CHUNKSIZE)
- put = chunkcopysafe(put, from, copy, put + left);
-#else
- if (copy >= sizeof(uint64_t))
- put = chunk_memcpy(put, from, copy);
- else
- put = copy_bytes(put, from, copy);
-#endif
+
+ put = functable.chunkcopy_safe(put, from, copy, put + left);
} else { /* copy from output */
copy = state->length;
if (copy > left)
copy = left;
-#if defined(INFFAST_CHUNKSIZE)
- put = chunkmemsetsafe(put, state->offset, copy, left);
-#else
- if (copy >= sizeof(uint64_t))
- put = chunk_memset(put, put - state->offset, state->offset, copy);
- else
- put = set_bytes(put, put - state->offset, state->offset, copy);
-#endif
+
+ put = functable.chunkmemset_safe(put, state->offset, copy, left);
}
left -= copy;
state->length -= copy;
--- /dev/null
+/* memchunk.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef MEMCHUNK_H_
+#define MEMCHUNK_H_
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef UNALIGNED_OK
+typedef uint64_t memchunk_t;
+#else
+typedef uint8_t memchunk_t[8];
+#endif
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+ *chunk = 0x0101010101010101 * (uint8_t)*from;
+#else
+ memset(chunk, *from, sizeof(memchunk_t));
+#endif
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+ uint32_t half_chunk;
+ half_chunk = *(uint32_t *)from;
+ *chunk = 0x0000000100000001 * (uint64_t)half_chunk;
+#else
+ uint8_t *chunkptr = (uint8_t *)chunk;
+ memcpy(chunkptr, from, 4);
+ memcpy(chunkptr+4, from, 4);
+#endif
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+ *chunk = *(uint64_t *)from;
+#else
+ memcpy(chunk, from, sizeof(memchunk_t));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+ chunkmemset_8((uint8_t *)s, chunk);
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+ *(uint64_t *)out = *chunk;
+#elif defined(_MSC_VER)
+ /* Cast to memchunk_t pointer to avoid compiler error on MSVC ARM */
+    memchunk_t *target = (memchunk_t *)out;
+    memcpy(target, chunk, sizeof(memchunk_t));
+#else
+ memcpy(out, chunk, sizeof(memchunk_t));
+#endif
+}
+
+#define CHUNKSIZE chunksize_c
+#define CHUNKCOPY chunkcopy_c
+#define CHUNKCOPY_SAFE chunkcopy_safe_c
+#define CHUNKUNROLL chunkunroll_c
+#define CHUNKMEMSET chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "memchunk_tpl.h"
+
+#endif
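memchunk.c, like the SSE2 and NEON variants added earlier in the change, ends by defining the CHUNK* name macros and including memchunk_tpl.h (added next), so the generic chunk routines are compiled once per backend against that backend's memchunk_t, loadchunk/storechunk and chunkmemset_N helpers. A rough self-contained sketch of the mechanism, with hypothetical file and function names rather than the real template:

    /* chunkcopy_tpl_sketch.h -- hypothetical mini template; the includer must
       define memchunk_t, loadchunk(), storechunk() and the CHUNKCOPY name. */
    uint8_t *CHUNKCOPY(uint8_t *out, const uint8_t *from, unsigned len) {
        memchunk_t chunk;
        while (len >= sizeof(memchunk_t)) {
            loadchunk(from, &chunk);
            storechunk(out, &chunk);
            out += sizeof(memchunk_t);
            from += sizeof(memchunk_t);
            len -= sizeof(memchunk_t);
        }
        while (len--)
            *out++ = *from++;
        return out;
    }

    /* backend_generic.c -- hypothetical backend that instantiates the template. */
    #include <stdint.h>
    #include <string.h>
    typedef uint64_t memchunk_t;
    static inline void loadchunk(const uint8_t *s, memchunk_t *chunk) { memcpy(chunk, s, sizeof(*chunk)); }
    static inline void storechunk(uint8_t *out, memchunk_t *chunk)    { memcpy(out, chunk, sizeof(*chunk)); }
    #define CHUNKCOPY chunkcopy_generic
    #include "chunkcopy_tpl_sketch.h"   /* emits uint8_t *chunkcopy_generic(...) */

The real memchunk_tpl.h below additionally relies on the over-copy guarantees spelled out in its comments, which the sketch omits.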
--- /dev/null
+/* memchunk_tpl.h -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* Returns the chunk size */
+uint32_t CHUNKSIZE(void) {
+ return sizeof(memchunk_t);
+}
+
+/* Behave like memcpy, but assume that it's OK to overwrite at least
+   sizeof(memchunk_t) bytes of output even if the length is shorter than this,
+   that the length is non-zero, and that `from` lags `out` by at least
+   sizeof(memchunk_t) bytes (or that they don't overlap at all or simply that
+   the distance is less than the length of the copy).
+
+   Aside from better memory bus utilisation, this means that short copies
+   (sizeof(memchunk_t) bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable. */
+uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+ memchunk_t chunk;
+ --len;
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += (len % sizeof(memchunk_t)) + 1;
+ from += (len % sizeof(memchunk_t)) + 1;
+ len /= sizeof(memchunk_t);
+ while (len > 0) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += sizeof(memchunk_t);
+ from += sizeof(memchunk_t);
+ --len;
+ }
+ return out;
+}
+
+/* Behave like chunkcopy, but avoid writing beyond the end of the legal output. */
+uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
+ if ((safe - out) < (ptrdiff_t)sizeof(memchunk_t)) {
+ if (len & 8) {
+ memcpy(out, from, 8);
+ out += 8;
+ from += 8;
+ }
+ if (len & 4) {
+ memcpy(out, from, 4);
+ out += 4;
+ from += 4;
+ }
+ if (len & 2) {
+ memcpy(out, from, 2);
+ out += 2;
+ from += 2;
+ }
+ if (len & 1) {
+ *out++ = *from++;
+ }
+ return out;
+ }
+ return CHUNKCOPY(out, from, len);
+}
+
+/* Perform short copies until distance can be rewritten as being at least
+ sizeof memchunk_t.
+
+ This assumes that it's OK to overwrite at least the first
+ 2*sizeof(memchunk_t) bytes of output even if the copy is shorter than this.
+ This assumption holds because inflate_fast() starts every iteration with at
+ least 258 bytes of output space available (258 being the maximum length
+ output from a single token; see inflate_fast()'s assumptions below). */
+uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+ unsigned char const *from = out - *dist;
+ memchunk_t chunk;
+ while (*dist < *len && *dist < sizeof(memchunk_t)) {
+ loadchunk(from, &chunk);
+ storechunk(out, &chunk);
+ out += *dist;
+ *len -= *dist;
+ *dist += *dist;
+ }
+ return out;
+}
+
+/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
+ Return OUT + LEN. */
+uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+ /* Debug performance related issues when len < sizeof(uint64_t):
+ Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
+ Assert(dist > 0, "cannot have a distance 0");
+
+ unsigned char *from = out - dist;
+ memchunk_t chunk;
+ unsigned sz = sizeof(chunk);
+ if (len < sz) {
+ do {
+ *out++ = *from++;
+ --len;
+ } while (len != 0);
+ return out;
+ }
+
+#ifdef HAVE_CHUNKMEMSET_1
+ if (dist == 1) {
+ chunkmemset_1(from, &chunk);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_2
+ if (dist == 2) {
+ chunkmemset_2(from, &chunk);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_3
+ if (dist == 3) {
+ return chunkmemset_3(out, from, dist, len);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_4
+ if (dist == 4) {
+ chunkmemset_4(from, &chunk);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_6
+ if (dist == 6) {
+ return chunkmemset_6(out, from, dist, len);
+ } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_8
+ if (dist == 8) {
+ chunkmemset_8(from, &chunk);
+ } else
+#endif
+ if (dist == sz) {
+ loadchunk(from, &chunk);
+ } else {
+ out = CHUNKUNROLL(out, &dist, &len);
+ return CHUNKCOPY(out, out - dist, len);
+ }
+
+ unsigned rem = len % sz;
+ len -= rem;
+ while (len) {
+ storechunk(out, &chunk);
+ out += sz;
+ len -= sz;
+ }
+
+ /* Last, deal with the case when LEN is not a multiple of SZ. */
+ if (rem)
+ memcpy(out, from, rem);
+ out += rem;
+
+ return out;
+}
+
+uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+ if (left < (unsigned)(3 * sizeof(memchunk_t))) {
+ while (len > 0) {
+ *out = *(out - dist);
+ out++;
+ --len;
+ }
+ return out;
+ }
+ return CHUNKMEMSET(out, dist, len);
+}
+++ /dev/null
-/* memcopy.h -- inline functions to copy small data chunks.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifndef MEMCOPY_H_
-#define MEMCOPY_H_
-
-#include "zendian.h"
-
-/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
-static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
- uint64_t chunk;
- memcpy(&chunk, in, sizeof(chunk));
-
-#if BYTE_ORDER == LITTLE_ENDIAN
- return chunk << bits;
-#else
- return ZSWAP64(chunk) << bits;
-#endif
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#ifdef _M_ARM64
-# include <arm64_neon.h>
-#else
-# include <arm_neon.h>
-#endif
-typedef uint8x16_t inffast_chunk_t;
-#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
-#endif
-
-#if defined(X86_SSE2)
-#include <immintrin.h>
-typedef __m128i inffast_chunk_t;
-#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
-#endif
-
-#ifdef INFFAST_CHUNKSIZE
-/*
- Ask the compiler to perform a wide, unaligned load with an machine
- instruction appropriate for the inffast_chunk_t type.
- */
-static inline inffast_chunk_t loadchunk(unsigned char const* s) {
- inffast_chunk_t c;
- memcpy(&c, s, sizeof(c));
- return c;
-}
-
-/*
- Ask the compiler to perform a wide, unaligned store with an machine
- instruction appropriate for the inffast_chunk_t type.
- */
-static inline void storechunk(unsigned char* d, inffast_chunk_t c) {
-#ifdef _MSC_VER
- /* Cast to inffast_chunk_t pointer to avoid compiler error on MSVC ARM */
- inffast_chunk_t *dst_chunk = (inffast_chunk_t *)d;
- memcpy(dst_chunk, &c, sizeof(c));
-#else
- memcpy(d, &c, sizeof(c));
-#endif
-}
-
-/*
- Behave like memcpy, but assume that it's OK to overwrite at least
- INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this,
- that the length is non-zero, and that `from` lags `out` by at least
- INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that
- the distance is less than the length of the copy).
-
- Aside from better memory bus utilisation, this means that short copies
- (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop
- without iteration, which will hopefully make the branch prediction more
- reliable.
- */
-static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) {
- --len;
- storechunk(out, loadchunk(from));
- out += (len % INFFAST_CHUNKSIZE) + 1;
- from += (len % INFFAST_CHUNKSIZE) + 1;
- len /= INFFAST_CHUNKSIZE;
- while (len > 0) {
- storechunk(out, loadchunk(from));
- out += INFFAST_CHUNKSIZE;
- from += INFFAST_CHUNKSIZE;
- --len;
- }
- return out;
-}
-
-/*
- Behave like chunkcopy, but avoid writing beyond of legal output.
- */
-static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len,
- unsigned char *safe) {
- if ((safe - out) < (ptrdiff_t)INFFAST_CHUNKSIZE) {
- if (len & 8) {
- memcpy(out, from, 8);
- out += 8;
- from += 8;
- }
- if (len & 4) {
- memcpy(out, from, 4);
- out += 4;
- from += 4;
- }
- if (len & 2) {
- memcpy(out, from, 2);
- out += 2;
- from += 2;
- }
- if (len & 1) {
- *out++ = *from++;
- }
- return out;
- }
- return chunkcopy(out, from, len);
-}
-
-/*
- Perform short copies until distance can be rewritten as being at least
- INFFAST_CHUNKSIZE.
-
- This assumes that it's OK to overwrite at least the first
- 2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this.
- This assumption holds because inflate_fast() starts every iteration with at
- least 258 bytes of output space available (258 being the maximum length
- output from a single token; see inflate_fast()'s assumptions below).
- */
-static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) {
- unsigned char const *from = out - *dist;
- while (*dist < *len && *dist < INFFAST_CHUNKSIZE) {
- storechunk(out, loadchunk(from));
- out += *dist;
- *len -= *dist;
- *dist += *dist;
- }
- return out;
-}
-
-static inline inffast_chunk_t chunkmemset_1(unsigned char *from) {
-#if defined(X86_SSE2)
- int8_t c;
- memcpy(&c, from, sizeof(c));
- return _mm_set1_epi8(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
- return vld1q_dup_u8(from);
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_2(unsigned char *from) {
- int16_t c;
- memcpy(&c, from, sizeof(c));
-#if defined(X86_SSE2)
- return _mm_set1_epi16(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
- return vreinterpretq_u8_s16(vdupq_n_s16(c));
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_4(unsigned char *from) {
- int32_t c;
- memcpy(&c, from, sizeof(c));
-#if defined(X86_SSE2)
- return _mm_set1_epi32(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
- return vreinterpretq_u8_s32(vdupq_n_s32(c));
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_8(unsigned char *from) {
-#if defined(X86_SSE2)
- int64_t c;
- memcpy(&c, from, sizeof(c));
- return _mm_set1_epi64x(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
- return vcombine_u8(vld1_u8(from), vld1_u8(from));
-#endif
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-static inline unsigned char *chunkmemset_3(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
- uint8x8x3_t chunks;
- unsigned sz = sizeof(chunks);
- if (len < sz) {
- out = chunkunroll(out, &dist, &len);
- return chunkcopy(out, out - dist, len);
- }
-
- /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
- chunks[0] = {a,a,a,a,a,a,a,a}
- chunks[1] = {b,b,b,b,b,b,b,b}
- chunks[2] = {c,c,c,c,c,c,c,c}. */
- chunks = vld3_dup_u8(from);
-
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- /* Store "a,b,c, ..., a,b,c". */
- vst3_u8(out, chunks);
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out;
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = chunkunroll(out, &dist, &rem);
- return chunkcopy(out, out - dist, rem);
-}
-#endif
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-static inline unsigned char *chunkmemset_6(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
- uint16x8x3_t chunks;
- unsigned sz = sizeof(chunks);
- if (len < sz) {
- out = chunkunroll(out, &dist, &len);
- return chunkcopy(out, out - dist, len);
- }
-
- /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
- chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
- chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
- chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
- chunks = vld3q_dup_u16((unsigned short *)from);
-
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- /* Store "ab,cd,ef, ..., ab,cd,ef". */
- vst3q_u16((unsigned short *)out, chunks);
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out;
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = chunkunroll(out, &dist, &rem);
- return chunkcopy(out, out - dist, rem);
-}
-#endif
-
-/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
-static inline unsigned char *chunkmemset(unsigned char *out, unsigned dist, unsigned len) {
- /* Debug performance related issues when len < sizeof(uint64_t):
- Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
- Assert(dist > 0, "cannot have a distance 0");
-
- unsigned char *from = out - dist;
- inffast_chunk_t chunk;
- unsigned sz = sizeof(chunk);
- if (len < sz) {
- do {
- *out++ = *from++;
- --len;
- } while (len != 0);
- return out;
- }
-
- switch (dist) {
- case 1: {
- chunk = chunkmemset_1(from);
- break;
- }
- case 2: {
- chunk = chunkmemset_2(from);
- break;
- }
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
- case 3:
- return chunkmemset_3(out, from, dist, len);
-#endif
- case 4: {
- chunk = chunkmemset_4(from);
- break;
- }
-#if defined(__aarch64__) || defined(_M_ARM64)
- case 6:
- return chunkmemset_6(out, from, dist, len);
-#endif
- case 8: {
- chunk = chunkmemset_8(from);
- break;
- }
- case 16:
- memcpy(&chunk, from, sz);
- break;
-
- default:
- out = chunkunroll(out, &dist, &len);
- return chunkcopy(out, out - dist, len);
- }
-
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- memcpy(out, &chunk, sz);
- out += sz;
- len -= sz;
- }
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- if (rem)
- memcpy(out, &chunk, rem);
- out += rem;
- return out;
-}
-
-static inline unsigned char* chunkmemsetsafe(unsigned char *out, unsigned dist, unsigned len, unsigned left) {
- if (left < (unsigned)(3 * INFFAST_CHUNKSIZE)) {
- while (len > 0) {
- *out = *(out - dist);
- out++;
- --len;
- }
- return out;
- }
-
- return chunkmemset(out, dist, len);
-}
-
-#else /* INFFAST_CHUNKSIZE */
-
-static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) {
- *out++ = *from;
- return out;
-}
-
-static inline unsigned char *copy_2_bytes(unsigned char *out, unsigned char *from) {
- uint16_t chunk;
- unsigned sz = sizeof(chunk);
- memcpy(&chunk, from, sz);
- memcpy(out, &chunk, sz);
- return out + sz;
-}
-
-static inline unsigned char *copy_3_bytes(unsigned char *out, unsigned char *from) {
- out = copy_1_bytes(out, from);
- return copy_2_bytes(out, from + 1);
-}
-
-static inline unsigned char *copy_4_bytes(unsigned char *out, unsigned char *from) {
- uint32_t chunk;
- unsigned sz = sizeof(chunk);
- memcpy(&chunk, from, sz);
- memcpy(out, &chunk, sz);
- return out + sz;
-}
-
-static inline unsigned char *copy_5_bytes(unsigned char *out, unsigned char *from) {
- out = copy_1_bytes(out, from);
- return copy_4_bytes(out, from + 1);
-}
-
-static inline unsigned char *copy_6_bytes(unsigned char *out, unsigned char *from) {
- out = copy_2_bytes(out, from);
- return copy_4_bytes(out, from + 2);
-}
-
-static inline unsigned char *copy_7_bytes(unsigned char *out, unsigned char *from) {
- out = copy_3_bytes(out, from);
- return copy_4_bytes(out, from + 3);
-}
-
-static inline unsigned char *copy_8_bytes(unsigned char *out, unsigned char *from) {
- uint64_t chunk;
- unsigned sz = sizeof(chunk);
- memcpy(&chunk, from, sz);
- memcpy(out, &chunk, sz);
- return out + sz;
-}
-
-/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
-static inline unsigned char *copy_bytes(unsigned char *out, unsigned char *from, unsigned len) {
- Assert(len < 8, "copy_bytes should be called with less than 8 bytes");
-
-#ifndef UNALIGNED_OK
- while (len--) {
- *out++ = *from++;
- }
- return out;
-#else
- switch (len) {
- case 7:
- return copy_7_bytes(out, from);
- case 6:
- return copy_6_bytes(out, from);
- case 5:
- return copy_5_bytes(out, from);
- case 4:
- return copy_4_bytes(out, from);
- case 3:
- return copy_3_bytes(out, from);
- case 2:
- return copy_2_bytes(out, from);
- case 1:
- return copy_1_bytes(out, from);
- case 0:
- return out;
- default:
- Assert(0, "should not happen");
- }
-
- return out;
-#endif /* UNALIGNED_OK */
-}
-
-/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
-static inline unsigned char *set_bytes(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
- Assert(len < 8, "set_bytes should be called with less than 8 bytes");
-
-#ifndef UNALIGNED_OK
- (void)dist;
- while (len--) {
- *out++ = *from++;
- }
- return out;
-#else
- if (dist >= len)
- return copy_bytes(out, from, len);
-
- switch (dist) {
- case 6:
- Assert(len == 7, "len should be exactly 7");
- out = copy_6_bytes(out, from);
- return copy_1_bytes(out, from);
-
- case 5:
- Assert(len == 6 || len == 7, "len should be either 6 or 7");
- out = copy_5_bytes(out, from);
- return copy_bytes(out, from, len - 5);
-
- case 4:
- Assert(len == 5 || len == 6 || len == 7, "len should be either 5, 6, or 7");
- out = copy_4_bytes(out, from);
- return copy_bytes(out, from, len - 4);
-
- case 3:
- Assert(4 <= len && len <= 7, "len should be between 4 and 7");
- out = copy_3_bytes(out, from);
- switch (len) {
- case 7:
- return copy_4_bytes(out, from);
- case 6:
- return copy_3_bytes(out, from);
- case 5:
- return copy_2_bytes(out, from);
- case 4:
- return copy_1_bytes(out, from);
- default:
- Assert(0, "should not happen");
- break;
- }
-
- case 2:
- Assert(3 <= len && len <= 7, "len should be between 3 and 7");
- out = copy_2_bytes(out, from);
- switch (len) {
- case 7:
- out = copy_4_bytes(out, from);
- out = copy_1_bytes(out, from);
- return out;
- case 6:
- out = copy_4_bytes(out, from);
- return out;
- case 5:
- out = copy_2_bytes(out, from);
- out = copy_1_bytes(out, from);
- return out;
- case 4:
- out = copy_2_bytes(out, from);
- return out;
- case 3:
- out = copy_1_bytes(out, from);
- return out;
- default:
- Assert(0, "should not happen");
- break;
- }
-
- case 1:
- Assert(2 <= len && len <= 7, "len should be between 2 and 7");
- unsigned char c = *from;
- switch (len) {
- case 7:
- memset(out, c, 7);
- return out + 7;
- case 6:
- memset(out, c, 6);
- return out + 6;
- case 5:
- memset(out, c, 5);
- return out + 5;
- case 4:
- memset(out, c, 4);
- return out + 4;
- case 3:
- memset(out, c, 3);
- return out + 3;
- case 2:
- memset(out, c, 2);
- return out + 2;
- default:
- Assert(0, "should not happen");
- break;
- }
- }
- return out;
-#endif /* UNALIGNED_OK */
-}
-
-/* Byte by byte semantics: copy LEN bytes from OUT + DIST and write them to OUT. Return OUT + LEN. */
-static inline unsigned char *chunk_memcpy(unsigned char *out, unsigned char *from, unsigned len) {
- unsigned sz = sizeof(uint64_t);
- Assert(len >= sz, "chunk_memcpy should be called on larger chunks");
-
- /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
- copy_8_bytes(out, from);
-
- unsigned rem = len % sz;
- len /= sz;
- out += rem;
- from += rem;
-
- unsigned by8 = len % sz;
- len -= by8;
- switch (by8) {
- case 7:
- out = copy_8_bytes(out, from);
- from += sz;
- case 6:
- out = copy_8_bytes(out, from);
- from += sz;
- case 5:
- out = copy_8_bytes(out, from);
- from += sz;
- case 4:
- out = copy_8_bytes(out, from);
- from += sz;
- case 3:
- out = copy_8_bytes(out, from);
- from += sz;
- case 2:
- out = copy_8_bytes(out, from);
- from += sz;
- case 1:
- out = copy_8_bytes(out, from);
- from += sz;
- }
-
- while (len) {
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
- out = copy_8_bytes(out, from);
- from += sz;
-
- len -= 8;
- }
-
- return out;
-}
-
-/* Memset LEN bytes in OUT with the value at OUT - 1. Return OUT + LEN. */
-static inline unsigned char *byte_memset(unsigned char *out, unsigned len) {
- unsigned sz = sizeof(uint64_t);
- Assert(len >= sz, "byte_memset should be called on larger chunks");
-
- unsigned char *from = out - 1;
- unsigned char c = *from;
-
- /* First, deal with the case when LEN is not a multiple of SZ. */
- memset(out, c, sz);
- unsigned rem = len % sz;
- len /= sz;
- out += rem;
-
- unsigned by8 = len % 8;
- len -= by8;
- switch (by8) {
- case 7:
- memset(out, c, sz);
- out += sz;
- case 6:
- memset(out, c, sz);
- out += sz;
- case 5:
- memset(out, c, sz);
- out += sz;
- case 4:
- memset(out, c, sz);
- out += sz;
- case 3:
- memset(out, c, sz);
- out += sz;
- case 2:
- memset(out, c, sz);
- out += sz;
- case 1:
- memset(out, c, sz);
- out += sz;
- }
-
- while (len) {
- /* When sz is a constant, the compiler replaces __builtin_memset with an
- inline version that does not incur a function call overhead. */
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- memset(out, c, sz);
- out += sz;
- len -= 8;
- }
-
- return out;
-}
-
-/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
-static inline unsigned char *chunk_memset(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
- if (dist >= len)
- return chunk_memcpy(out, from, len);
-
- Assert(len >= sizeof(uint64_t), "chunk_memset should be called on larger chunks");
-
- /* Double up the size of the memset pattern until reaching the largest pattern of size less than SZ. */
- unsigned sz = sizeof(uint64_t);
- while (dist < len && dist < sz) {
- copy_8_bytes(out, from);
-
- out += dist;
- len -= dist;
- dist += dist;
-
- /* Make sure the next memcpy has at least SZ bytes to be copied. */
- if (len < sz)
- /* Finish up byte by byte when there are not enough bytes left. */
- return set_bytes(out, from, dist, len);
- }
-
- return chunk_memcpy(out, from, len);
-}
-
-/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
-static inline unsigned char *chunk_copy(unsigned char *out, unsigned char *from, int dist, unsigned len) {
- if (len < sizeof(uint64_t)) {
- if (dist > 0)
- return set_bytes(out, from, dist, len);
-
- return copy_bytes(out, from, len);
- }
-
- if (dist == 1)
- return byte_memset(out, len);
-
- if (dist > 0)
- return chunk_memset(out, from, dist, len);
-
- return chunk_memcpy(out, from, len);
-}
-#endif /* INFFAST_CHUNKSIZE */
-#endif /* MEMCOPY_H_ */
-DUNALIGNED_OK \
-DUNALIGNED64_OK \
-D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
+ -DARM_CPUID \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
inftrees.obj \
inffast.obj \
insert_string.obj \
+ memchunk.obj \
trees.obj \
uncompr.obj \
zutil.obj \
-DARM_ACLE_CRC_HASH \
-D__ARM_NEON__=1 \
-DARM_NEON_ADLER32 \
+ -DARM_NEON_MEMCHUNK \
-DARM_NEON_SLIDEHASH \
-DARM_NOCHECK_NEON \
#
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj slide_neon.obj adler32_neon.obj
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj memchunk_neon.obj slide_neon.obj
# targets
all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
deflate_medium.obj: $(SRCDIR)/deflate_medium.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h
-D_CRT_NONSTDC_NO_DEPRECATE \
-DUNALIGNED_OK \
-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
+ -DARM_CPUID \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
inftrees.obj \
inffast.obj \
insert_string.obj \
+ memchunk.obj \
trees.obj \
uncompr.obj \
zutil.obj \
WFLAGS = $(WFLAGS) \
-D__ARM_NEON__=1 \
-DARM_NEON_ADLER32 \
+ -DARM_NEON_MEMCHUNK \
-DARM_NEON_SLIDEHASH \
-DARM_NOCHECK_NEON \
#
-OBJS = $(OBJS) adler32_neon.obj slide_neon.obj
+OBJS = $(OBJS) adler32_neon.obj memchunk_neon.obj slide_neon.obj
!endif
# targets
deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/trees_emit.h
deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h
-DX86_SSE42_CRC_INTRIN \
-DX86_SSE42_CRC_HASH \
-DX86_AVX2 \
+ -DX86_SSE2_MEMCHUNK \
-DUNALIGNED_OK \
-DUNALIGNED64_OK \
#
inffast.obj \
insert_string.obj \
insert_string_sse.obj \
+ memchunk.obj \
+ memchunk_sse.obj \
slide_avx.obj \
slide_sse.obj \
trees.obj \
deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/trees_emit.h
deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+memchunk_sse.obj: $(SRCDIR)/arch/x86/memchunk_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
slide_sse.obj: $(SRCDIR)/arch/x86/slide_sse.c $(SRCDIR)/deflate.h
trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h