git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Split memcopy by architecture.
author: Nathan Moinvaziri <nathan@solidstatenetworks.com>
Wed, 18 Mar 2020 01:03:15 +0000 (18:03 -0700)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 28 Jun 2020 09:16:05 +0000 (11:16 +0200)
Use uint8_t[8] struct on big-endian machines for speed.

19 files changed:
CMakeLists.txt
INDEX.md
Makefile.in
arch/arm/Makefile.in
arch/arm/crc32_acle.c
arch/arm/memchunk_neon.c [new file with mode: 0644]
arch/x86/Makefile.in
arch/x86/memchunk_sse.c [new file with mode: 0644]
configure
functable.c
functable.h
inffast.c
inflate.c
memchunk.c [new file with mode: 0644]
memchunk_tpl.h [new file with mode: 0644]
memcopy.h [deleted file]
win32/Makefile.a64
win32/Makefile.arm
win32/Makefile.msc

index 9510a235f93c3135cac93199176a7a4ceb3bb935..3504ce51c0ca5d69fc1c0175142ba42ba6c2c6be 100644 (file)
@@ -675,8 +675,8 @@ if(WITH_OPTIM)
         list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/arm.h)
         list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/armfeature.c)
         if(WITH_NEON)
-            add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH)
-            set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/slide_neon.c)
+            add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH)
+            set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/memchunk_neon.c ${ARCHDIR}/slide_neon.c)
             list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
             add_intrinsics_option("${NEONFLAG}" ${NEON_SRCS})
             if(MSVC)
@@ -685,13 +685,10 @@ if(WITH_OPTIM)
             add_feature_info(NEON_ALDER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
             add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
         endif()
-        if(WITH_ACLE)
+        if(WITH_ACLE AND NOT MSVC)
             add_definitions(-DARM_ACLE_CRC_HASH)
             set(ACLE_SRCS ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
-            # For ARM aarch64, we need to check WITH_NEON first
-            if("${ARCH}" MATCHES "arm" OR NOT WITH_NEON)
-                add_intrinsics_option("${ACLEFLAG}" ${ACLE_SRCS})
-            endif()
+            add_intrinsics_option("${ACLEFLAG}" ${ACLE_SRCS})
             list(APPEND ZLIB_ARCH_SRCS ${ACLE_SRCS})
             add_feature_info(ACLE_CRC 1 "Support ACLE optimized CRC hash generation, using \"${ACLEFLAG}\"")
         endif()
@@ -756,8 +753,8 @@ if(WITH_OPTIM)
             add_intrinsics_option("${SSE4FLAG}" ${SSE42_SRCS})
         endif()
         if(WITH_SSE2 AND HAVE_SSE2_INTRIN)
-            add_definitions(-DX86_SSE2)
-            set(SSE2_SRCS ${ARCHDIR}/slide_sse.c)
+            add_definitions(-DX86_SSE2 -DX86_SSE2_MEMCHUNK)
+            set(SSE2_SRCS ${ARCHDIR}/memchunk_sse.c ${ARCHDIR}/slide_sse.c)
             list(APPEND ZLIB_ARCH_SRCS ${SSE2_SRCS})
             if(NOT ${ARCH} MATCHES "x86_64")
                 add_intrinsics_option("${SSE2FLAG}" ${SSE2_SRCS})
@@ -867,7 +864,7 @@ set(ZLIB_PRIVATE_HDRS
     inftrees.h
     insert_string_tpl.h
     match_tpl.h
-    memcopy.h
+    memchunk_tpl.h
     trees.h
     trees_emit.h
     trees_p.h
@@ -891,6 +888,7 @@ set(ZLIB_SRCS
     inftrees.c
     inffast.c
     insert_string.c
+    memchunk.c
     trees.c
     uncompr.c
     zutil.c
index 341df3fb31ccbfba3b7dd06255f337a263cce4c3..c5816024fe1cb6b77a37945761317ebcf45260f1 100644 (file)
--- a/INDEX.md
+++ b/INDEX.md
@@ -28,7 +28,7 @@ Contents
 | inffast.*        | Decompress data with speed optimizations                       |
 | inffixed.h       | Table for decoding fixed codes                                 |
 | inftrees.h       | Generate Huffman trees for efficient decoding                  |
-| memcopy.h        | Inline functions to copy small data chunks                     |
+| memchunk.*       | Inline functions to copy small data chunks                     |
 | trees.*          | Output deflated data using Huffman coding                      |
 | uncompr.c        | Decompress a memory buffer                                     |
 | zconf.h.cmakein  | zconf.h template for cmake                                     |
index 27ffa14c2c14923f0cf968fad3abb47b6a5e6869..9433bb400a4a2b9bac36c8c83542cda0301ac287 100644 (file)
@@ -87,6 +87,7 @@ OBJZ = \
        inflate.o \
        inftrees.o \
        insert_string.o \
+       memchunk.o \
        trees.o \
        uncompr.o \
        zutil.o \
@@ -116,6 +117,7 @@ PIC_OBJZ = \
        inflate.lo \
        inftrees.lo \
        insert_string.lo \
+       memchunk.lo \
        trees.lo \
        uncompr.lo \
        zutil.lo \
index 486cdfd52cfcef0b346c7354dec5e02433b93d5c..db4bab3ffd25d18e76fd2bfb01ad3de4eb261188 100644 (file)
@@ -16,6 +16,7 @@ all: \
        adler32_neon.o adler32_neon.lo \
        armfeature.o armfeature.lo \
        crc32_acle.o crc32_acle.lo \
+       memchunk_neon.o memchunk_neon.lo \
        slide_neon.o slide_neon.lo \
        insert_string_acle.o insert_string_acle.lo
 
@@ -49,6 +50,12 @@ insert_string_acle.o:
 insert_string_acle.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
 
+memchunk_neon.o:
+       $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_neon.c
+
+memchunk_neon.lo:
+       $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_neon.c
+
 mostlyclean: clean
 clean:
        rm -f *.o *.lo *~
index 768c22ba21b721fee576e6c7a9fb70aadd462f93..581bad160e4283a76133a52a5c0023ecace3e676 100644 (file)
@@ -112,4 +112,4 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
     c = ~c;
     return c;
 }
-#endif /* __ARM_FEATURE_CRC32 */
+#endif
diff --git a/arch/arm/memchunk_neon.c b/arch/arm/memchunk_neon.c
new file mode 100644 (file)
index 0000000..17c484b
--- /dev/null
@@ -0,0 +1,127 @@
+/* memchunk_neon.c -- NEON inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef MEMCHUNK_NEON_H_
+#define MEMCHUNK_NEON_H_
+
+#ifdef ARM_NEON_MEMCHUNK
+#include "zbuild.h"
+#include "zutil.h"
+
+#include <arm_neon.h>
+
+typedef uint8x16_t memchunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_3
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+    *chunk = vld1q_dup_u8(from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, memchunk_t *chunk) {
+    *chunk = vreinterpretq_u8_s16(vdupq_n_s16(*(int16_t *)from));
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+    *chunk = vreinterpretq_u8_s32(vdupq_n_s32(*(int32_t *)from));
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+    *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
+}
+
+#define CHUNKSIZE        chunksize_neon
+#define CHUNKCOPY        chunkcopy_neon
+#define CHUNKCOPY_SAFE   chunkcopy_safe_neon
+#define CHUNKUNROLL      chunkunroll_neon
+#define CHUNKMEMSET      chunkmemset_neon
+#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
+
+uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len);
+uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len);
+
+static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
+    uint8x8x3_t chunks;
+    unsigned sz = sizeof(chunks);
+    if (len < sz) {
+        out = CHUNKUNROLL(out, &dist, &len);
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
+       chunks[0] = {a,a,a,a,a,a,a,a}
+       chunks[1] = {b,b,b,b,b,b,b,b}
+       chunks[2] = {c,c,c,c,c,c,c,c}. */
+    chunks = vld3_dup_u8(from);
+
+    unsigned rem = len % sz;
+    len -= rem;
+    while (len) {
+        /* Store "a,b,c, ..., a,b,c". */
+        vst3_u8(out, chunks);
+        out += sz;
+        len -= sz;
+    }
+
+    if (!rem)
+        return out;
+
+    /* Last, deal with the case when LEN is not a multiple of SZ. */
+    out = CHUNKUNROLL(out, &dist, &rem);
+    return CHUNKCOPY(out, out - dist, rem);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define HAVE_CHUNKMEMSET_6
+
+static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) {
+    uint16x8x3_t chunks;
+    unsigned sz = sizeof(chunks);
+    if (len < sz) {
+        out = CHUNKUNROLL(out, &dist, &len);
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
+       chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
+       chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
+       chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
+    chunks = vld3q_dup_u16((unsigned short *)from);
+
+    unsigned rem = len % sz;
+    len -= rem;
+    while (len) {
+        /* Store "ab,cd,ef, ..., ab,cd,ef". */
+        vst3q_u16((unsigned short *)out, chunks);
+        out += sz;
+        len -= sz;
+    }
+
+    if (!rem)
+        return out;
+
+    /* Last, deal with the case when LEN is not a multiple of SZ. */
+    out = CHUNKUNROLL(out, &dist, &rem);
+    return CHUNKCOPY(out, out - dist, rem);
+}
+
+#endif
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+    *chunk = *(memchunk_t *)s;
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+    memcpy(out, chunk, sizeof(memchunk_t));
+}
+
+#include "memchunk_tpl.h"
+
+#endif
+#endif
index f0f4452607fee47b912559a95654ea9249f32544..f14f2520d39feb298e31a29eafec0091afdbd576 100644 (file)
@@ -26,7 +26,8 @@ all: \
        compare258_sse.o compare258_sse.lo \
        insert_string_sse.o insert_string_sse.lo \
        crc_folding.o crc_folding.lo \
-       slide_avx.o slide_avx.lo
+       memchunk_sse.o memchunk_sse.lo \
+       slide_avx.o slide_avx.lo \
        slide_sse.o slide_sse.lo
 
 x86.o:
@@ -59,6 +60,12 @@ crc_folding.o:
 crc_folding.lo:
        $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
 
+memchunk_sse.o:
+       $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_sse.c
+
+memchunk_sse.lo:
+       $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/memchunk_sse.c
+
 slide_avx.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
 
diff --git a/arch/x86/memchunk_sse.c b/arch/x86/memchunk_sse.c
new file mode 100644 (file)
index 0000000..4955e29
--- /dev/null
@@ -0,0 +1,54 @@
+/* memchunk_sse.c -- SSE inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef MEMCHUNK_SSE_H_
+#define MEMCHUNK_SSE_H_
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_SSE2
+#include <immintrin.h>
+
+typedef __m128i memchunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+    *chunk = _mm_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, memchunk_t *chunk) {
+    *chunk = _mm_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+    *chunk = _mm_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+    *chunk = _mm_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+    *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+    _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_sse2
+#define CHUNKCOPY        chunkcopy_sse2
+#define CHUNKCOPY_SAFE   chunkcopy_safe_sse2
+#define CHUNKUNROLL      chunkunroll_sse2
+#define CHUNKMEMSET      chunkmemset_sse2
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse2
+
+#include "memchunk_tpl.h"
+
+#endif
+#endif
index 16fbadcf791f7554cabab480933a528ca11285d2..e2d3209e69d71b7d69fb74312fe28fa254f00933 100755 (executable)
--- a/configure
+++ b/configure
@@ -1111,10 +1111,10 @@ case "${ARCH}" in
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo"
 
             if test ${HAVE_SSE2_INTRIN} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_SSE2"
-                SFLAGS="${SFLAGS} -DX86_SSE2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo"
+                CFLAGS="${CFLAGS} -DX86_SSE2 -DX86_SSE2_MEMCHUNK"
+                SFLAGS="${SFLAGS} -DX86_SSE2 -DX86_SSE2_MEMCHUNK"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_sse.o memchunk_sse.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_sse.lo memchunk_sse.lo"
 
                 if test $forcesse2 -eq 1; then
                     CFLAGS="${CFLAGS} -DX86_NOCHECK_SSE2"
@@ -1173,11 +1173,11 @@ case "${ARCH}" in
 
         # Enable arch-specific optimizations?
         if test $without_optimizations -eq 0; then
-            CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
-            SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE42_CRC_HASH"
+            CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE2_MEMCHUNK -DX86_SSE42_CRC_HASH"
+            SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2 -DX86_SSE2_MEMCHUNK -DX86_SSE42_CRC_HASH"
 
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo"
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o memchunk_sse.o slide_sse.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo memchunk_sse.lo slide_sse.lo"
 
             if test ${HAVE_SSSE3_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
@@ -1274,11 +1274,11 @@ case "${ARCH}" in
                     fi
 
                     if test $buildneon -eq 1; then
-                        CFLAGS="${CFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
-                        SFLAGS="${SFLAGS} -mfpu=neon -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1297,11 +1297,11 @@ case "${ARCH}" in
                             SFLAGS="${SFLAGS} -mfpu=neon"
                         fi
 
-                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
-                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1321,11 +1321,11 @@ case "${ARCH}" in
                             SFLAGS="${SFLAGS} -mfpu=neon"
                         fi
 
-                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
-                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
+                        CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+                        SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
                     fi
                 fi
             ;;
@@ -1363,10 +1363,10 @@ case "${ARCH}" in
                 if test $native -eq 0; then
                     ARCH="${ARCH}+simd"
                 fi
-                CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
-                SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_SLIDEHASH"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o slide_neon.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo slide_neon.lo"
+                CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+                SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_MEMCHUNK -DARM_NEON_SLIDEHASH"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o memchunk_neon.o slide_neon.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo memchunk_neon.lo slide_neon.lo"
             fi
         fi
 
index d8fcd3129241879bb17ef6302fce640cb660e3ba..81bd786f9a9957513377bbb25508242ad58fc0d0 100644 (file)
@@ -57,6 +57,30 @@ extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t le
 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
 #endif
 
+/* memory chunking */
+extern uint32_t chunksize_c(void);
+extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_c(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_c(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_c(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#ifdef X86_SSE2_MEMCHUNK
+extern uint32_t chunksize_sse2(void);
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_sse2(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+extern uint32_t chunksize_neon(void);
+extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_neon(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_neon(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_neon(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
+
 /* CRC32 */
 ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
 
@@ -110,6 +134,12 @@ ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64
 ZLIB_INTERNAL void slide_hash_stub(deflate_state *s);
 ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned char *src1);
 ZLIB_INTERNAL int32_t longest_match_stub(deflate_state *const s, Pos cur_match);
+ZLIB_INTERNAL uint32_t chunksize_stub(void);
+ZLIB_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len);
+ZLIB_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+ZLIB_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len);
+ZLIB_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len);
+ZLIB_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left);
 
 /* functable init */
 ZLIB_INTERNAL __thread struct functable_s functable = {
@@ -119,7 +149,13 @@ ZLIB_INTERNAL __thread struct functable_s functable = {
     crc32_stub,
     slide_hash_stub,
     compare258_stub,
-    longest_match_stub
+    longest_match_stub,
+    chunksize_stub,
+    chunkcopy_stub,
+    chunkcopy_safe_stub,
+    chunkunroll_stub,
+    chunkmemset_stub,
+    chunkmemset_safe_stub
 };
 
 ZLIB_INTERNAL void cpu_check_features(void)
@@ -224,6 +260,114 @@ ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, si
     return functable.adler32(adler, buf, len);
 }
 
+ZLIB_INTERNAL uint32_t chunksize_stub(void) {
+    // Initialize default
+    functable.chunksize = &chunksize_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunksize = &chunksize_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunksize = &chunksize_neon;
+#endif
+
+    return functable.chunksize();
+}
+
+ZLIB_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned len) {
+    // Initialize default
+    functable.chunkcopy = &chunkcopy_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunkcopy = &chunkcopy_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunkcopy = &chunkcopy_neon;
+#endif
+
+    return functable.chunkcopy(out, from, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
+    // Initialize default
+    functable.chunkcopy_safe = &chunkcopy_safe_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunkcopy_safe = &chunkcopy_safe_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunkcopy_safe = &chunkcopy_safe_neon;
+#endif
+
+    return functable.chunkcopy_safe(out, from, len, safe);
+}
+
+ZLIB_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
+    // Initialize default
+    functable.chunkunroll = &chunkunroll_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunkunroll = &chunkunroll_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunkunroll = &chunkunroll_neon;
+#endif
+
+    return functable.chunkunroll(out, dist, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) {
+    // Initialize default
+    functable.chunkmemset = &chunkmemset_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunkmemset = &chunkmemset_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunkmemset = &chunkmemset_neon;
+#endif
+
+    return functable.chunkmemset(out, dist, len);
+}
+
+ZLIB_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+    // Initialize default
+    functable.chunkmemset_safe = &chunkmemset_safe_c;
+
+#ifdef X86_SSE2_MEMCHUNK
+# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
+    if (x86_cpu_has_sse2)
+# endif
+        functable.chunkmemset_safe = &chunkmemset_safe_sse2;
+#endif
+#ifdef ARM_NEON_MEMCHUNK
+    if (arm_cpu_has_neon)
+        functable.chunkmemset_safe = &chunkmemset_safe_neon;
+#endif
+
+    return functable.chunkmemset_safe(out, dist, len, left);
+}
+
 ZLIB_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t len) {
 
     Assert(sizeof(uint64_t) >= sizeof(size_t),
index d9babe57711cc638cd53dd8ef09960268989c6a2..45712424edcbab605376a48e0c253fe6566c695e 100644 (file)
@@ -16,9 +16,14 @@ struct functable_s {
     void     (* slide_hash)         (deflate_state *s);
     int32_t  (* compare258)         (const unsigned char *src0, const unsigned char *src1);
     int32_t  (* longest_match)      (deflate_state *const s, Pos cur_match);
+    uint32_t (* chunksize)          (void);
+    uint8_t* (* chunkcopy)          (uint8_t *out, uint8_t const *from, unsigned len);
+    uint8_t* (* chunkcopy_safe)     (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+    uint8_t* (* chunkunroll)        (uint8_t *out, unsigned *dist, unsigned *len);
+    uint8_t* (* chunkmemset)        (uint8_t *out, unsigned dist, unsigned len);
+    uint8_t* (* chunkmemset_safe)   (uint8_t *out, unsigned dist, unsigned len, unsigned left);
 };
 
 ZLIB_INTERNAL extern __thread struct functable_s functable;
 
-
 #endif
index 1f74967f5e1b133693257287b53b06208cfd17f9..57d3ce4ea0aeb6dac5e88a18ec4fd36f936c8c12 100644 (file)
--- a/inffast.c
+++ b/inffast.c
@@ -9,8 +9,20 @@
 #include "inflate.h"
 #include "inffast.h"
 #include "inflate_p.h"
-#include "memcopy.h"
+#include "functable.h"
 
+
+/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
+static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
+    uint64_t chunk;
+    memcpy(&chunk, in, sizeof(chunk));
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+    return chunk << bits;
+#else
+    return ZSWAP64(chunk) << bits;
+#endif
+}
 /*
    Decode literal, length, and distance codes and write out the resulting
    literal and match bytes until either not enough input or output is
@@ -58,9 +70,7 @@ void ZLIB_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start)
     unsigned char *out;         /* local strm->next_out */
     unsigned char *beg;         /* inflate()'s initial strm->next_out */
     unsigned char *end;         /* while out < end, enough space available */
-#ifdef INFFAST_CHUNKSIZE
     unsigned char *safe;        /* can use chunkcopy provided out < safe */
-#endif
 #ifdef INFLATE_STRICT
     unsigned dmax;              /* maximum distance from zlib header */
 #endif
@@ -126,10 +136,7 @@ void ZLIB_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start)
     out = strm->next_out;
     beg = out - (start - strm->avail_out);
     end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1));
-
-#ifdef INFFAST_CHUNKSIZE
     safe = out + strm->avail_out;
-#endif
 #ifdef INFLATE_STRICT
     dmax = state->dmax;
 #endif
@@ -230,7 +237,6 @@ void ZLIB_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start)
                         }
 #endif
                     }
-#ifdef INFFAST_CHUNKSIZE
                     from = window;
                     if (wnext == 0) {           /* very common case */
                         from += wsize - op;
@@ -241,7 +247,7 @@ void ZLIB_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start)
                         from += wsize - op;
                         if (op < len) {         /* some from end of window */
                             len -= op;
-                            out = chunkcopysafe(out, from, op, safe);
+                            out = functable.chunkcopy_safe(out, from, op, safe);
                             from = window;      /* more from start of window */
                             op = wnext;
                             /* This (rare) case can create a situation where
@@ -251,74 +257,23 @@ void ZLIB_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start)
                     }
                     if (op < len) {             /* still need some from output */
                         len -= op;
-                        out = chunkcopysafe(out, from, op, safe);
-                        out = chunkunroll(out, &dist, &len);
-                        out = chunkcopysafe(out, out - dist, len, safe);
+                        out = functable.chunkcopy_safe(out, from, op, safe);
+                        out = functable.chunkunroll(out, &dist, &len);
+                        out = functable.chunkcopy_safe(out, out - dist, len, safe);
                     } else {
-                        out = chunkcopysafe(out, from, len, safe);
+                        out = functable.chunkcopy_safe(out, from, len, safe);
                     }
-#else
-                    from = window;
-                    if (wnext == 0) {           /* very common case */
-                        from += wsize - op;
-                        if (op < len) {         /* some from window */
-                            len -= op;
-                            do {
-                                *out++ = *from++;
-                            } while (--op);
-                            from = out - dist;  /* rest from output */
-                        }
-                    } else if (wnext < op) {      /* wrap around window */
-                        from += wsize + wnext - op;
-                        op -= wnext;
-                        if (op < len) {         /* some from end of window */
-                            len -= op;
-                            do {
-                                *out++ = *from++;
-                            } while (--op);
-                            from = window;
-                            if (wnext < len) {  /* some from start of window */
-                                op = wnext;
-                                len -= op;
-                                do {
-                                    *out++ = *from++;
-                                } while (--op);
-                                from = out - dist;      /* rest from output */
-                            }
-                        }
-                    } else {                      /* contiguous in window */
-                        from += wnext - op;
-                        if (op < len) {         /* some from window */
-                            len -= op;
-                            do {
-                                *out++ = *from++;
-                            } while (--op);
-                            from = out - dist;  /* rest from output */
-                        }
-                    }
-
-                    out = chunk_copy(out, from, (int) (out - from), len);
-#endif
                 } else {
-#ifdef INFFAST_CHUNKSIZE
                     /* Whole reference is in range of current output.  No
                        range checks are necessary because we start with room
                        for at least 258 bytes of output, so unroll and roundoff
                        operations can write beyond `out+len` so long as they
                        stay within 258 bytes of `out`.
                     */
-                    if (dist >= len || dist >= INFFAST_CHUNKSIZE)
-                        out = chunkcopy(out, out - dist, len);
-                    else
-                        out = chunkmemset(out, dist, len);
-#else
-                    if (len < sizeof(uint64_t))
-                        out = set_bytes(out, out - dist, dist, len);
-                    else if (dist == 1)
-                        out = byte_memset(out, len);
+                    if (dist >= len || dist >= functable.chunksize())
+                        out = functable.chunkcopy(out, out - dist, len);
                     else
-                        out = chunk_memset(out, out - dist, dist, len);
-#endif
+                        out = functable.chunkmemset(out, dist, len);
                 }
             } else if ((op & 64) == 0) {          /* 2nd level distance code */
                 here = dcode + here->val + BITS(op);
index bce468096e0204187bb9c11612cb6f97176ae254..69fc0e1f7023dbfcee0e3b61a1ac51567e320eba 100644 (file)
--- a/inflate.c
+++ b/inflate.c
@@ -10,7 +10,6 @@
 #include "inffast.h"
 #include "inflate_p.h"
 #include "inffixed.h"
-#include "memcopy.h"
 #include "functable.h"
 
 /* Architecture-specific hooks. */
@@ -203,17 +202,11 @@ void ZLIB_INTERNAL fixedtables(struct inflate_state *state) {
 int ZLIB_INTERNAL inflate_ensure_window(struct inflate_state *state) {
     /* if it hasn't been done already, allocate space for the window */
     if (state->window == NULL) {
-#ifdef INFFAST_CHUNKSIZE
         unsigned wsize = 1U << state->wbits;
-        state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, wsize + INFFAST_CHUNKSIZE, sizeof(unsigned char));
+        state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, wsize + functable.chunksize(), sizeof(unsigned char));
         if (state->window == Z_NULL)
             return 1;
-        memset(state->window + wsize, 0, INFFAST_CHUNKSIZE);
-#else
-        state->window = (unsigned char *) ZALLOC_WINDOW(state->strm, 1U << state->wbits, sizeof(unsigned char));
-        if (state->window == NULL)
-            return 1;
-#endif
+        memset(state->window + wsize, 0, functable.chunksize());
     }
 
     /* if window not in use yet, initialize */
@@ -977,26 +970,14 @@ int32_t ZEXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) {
                     copy = state->length;
                 if (copy > left)
                     copy = left;
-#if defined(INFFAST_CHUNKSIZE)
-                put = chunkcopysafe(put, from, copy, put + left);
-#else
-                if (copy >= sizeof(uint64_t))
-                    put = chunk_memcpy(put, from, copy);
-                else
-                    put = copy_bytes(put, from, copy);
-#endif
+
+                put = functable.chunkcopy_safe(put, from, copy, put + left);
             } else {                             /* copy from output */
                 copy = state->length;
                 if (copy > left)
                     copy = left;
-#if defined(INFFAST_CHUNKSIZE)
-                put = chunkmemsetsafe(put, state->offset, copy, left);
-#else
-                if (copy >= sizeof(uint64_t))
-                    put = chunk_memset(put, put - state->offset, state->offset, copy);
-                else
-                    put = set_bytes(put, put - state->offset, state->offset, copy);
-#endif
+
+                put = functable.chunkmemset_safe(put, state->offset, copy, left);
             }
             left -= copy;
             state->length -= copy;
diff --git a/memchunk.c b/memchunk.c
new file mode 100644 (file)
index 0000000..f470df8
--- /dev/null
@@ -0,0 +1,73 @@
+/* memchunk.c -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef MEMCHUNK_H_
+#define MEMCHUNK_H_
+
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef UNALIGNED_OK
+typedef uint64_t memchunk_t;
+#else
+typedef uint8_t memchunk_t[8];
+#endif
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+    *chunk = 0x0101010101010101 * (uint8_t)*from;
+#else
+    memset(chunk, *from, sizeof(memchunk_t));
+#endif
+}
+
+static inline void chunkmemset_4(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+    uint32_t half_chunk;
+    half_chunk = *(uint32_t *)from;
+    *chunk = 0x0000000100000001 * (uint64_t)half_chunk;
+#else
+    uint8_t *chunkptr = (uint8_t *)chunk;
+    memcpy(chunkptr, from, 4);
+    memcpy(chunkptr+4, from, 4);
+#endif
+}
+
+static inline void chunkmemset_8(uint8_t *from, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+    *chunk = *(uint64_t *)from;
+#else
+    memcpy(chunk, from, sizeof(memchunk_t));
+#endif
+}
+
+static inline void loadchunk(uint8_t const *s, memchunk_t *chunk) {
+    chunkmemset_8((uint8_t *)s, chunk);
+}
+
+static inline void storechunk(uint8_t *out, memchunk_t *chunk) {
+#ifdef UNALIGNED_OK
+    *(uint64_t *)out = *chunk;
+#elif defined(_MSC_VER)
+    /* Cast to memchunk_t pointer to avoid compiler error on MSVC ARM */
+    memchunk_t *target = (memchunk_t *)chunk;
+    memcpy(target, &chunk, sizeof(chunk));
+#else
+    memcpy(out, chunk, sizeof(memchunk_t));
+#endif
+}
+
+#define CHUNKSIZE        chunksize_c
+#define CHUNKCOPY        chunkcopy_c
+#define CHUNKCOPY_SAFE   chunkcopy_safe_c
+#define CHUNKUNROLL      chunkunroll_c
+#define CHUNKMEMSET      chunkmemset_c
+#define CHUNKMEMSET_SAFE chunkmemset_safe_c
+
+#include "memchunk_tpl.h"
+
+#endif
diff --git a/memchunk_tpl.h b/memchunk_tpl.h
new file mode 100644 (file)
index 0000000..507e5ed
--- /dev/null
@@ -0,0 +1,166 @@
+/* memchunk_tpl.h -- inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* Returns the chunk size */
+uint32_t CHUNKSIZE(void) {
+    return sizeof(memchunk_t);
+}
+
+/* Behave like memcpy, but assume that it's OK to overwrite at least
+   memchunk_t bytes of output even if the length is shorter than this,
+   that the length is non-zero, and that `from` lags `out` by at least
+   sizeof memchunk_t bytes (or that they don't overlap at all or simply that
+   the distance is less than the length of the copy).
+
+   Aside from better memory bus utilisation, this means that short copies
+   (memchunk_t bytes or fewer) will fall straight through the loop
+   without iteration, which will hopefully make the branch prediction more
+   reliable. */
+uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+    memchunk_t chunk;
+    --len;
+    loadchunk(from, &chunk);
+    storechunk(out, &chunk);
+    out += (len % sizeof(memchunk_t)) + 1;
+    from += (len % sizeof(memchunk_t)) + 1;
+    len /= sizeof(memchunk_t);
+    while (len > 0) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += sizeof(memchunk_t);
+        from += sizeof(memchunk_t);
+        --len;
+    }
+    return out;
+}
+
+/* Behave like chunkcopy, but avoid writing beyond of legal output. */
+uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
+    if ((safe - out) < (ptrdiff_t)sizeof(memchunk_t)) {
+        if (len & 8) {
+            memcpy(out, from, 8);
+            out += 8;
+            from += 8;
+        }
+        if (len & 4) {
+            memcpy(out, from, 4);
+            out += 4;
+            from += 4;
+        }
+        if (len & 2) {
+            memcpy(out, from, 2);
+            out += 2;
+            from += 2;
+        }
+        if (len & 1) {
+            *out++ = *from++;
+        }
+        return out;
+    }
+    return CHUNKCOPY(out, from, len);
+}
+
+/* Perform short copies until distance can be rewritten as being at least
+   sizeof memchunk_t.
+
+   This assumes that it's OK to overwrite at least the first
+   2*sizeof(memchunk_t) bytes of output even if the copy is shorter than this.
+   This assumption holds because inflate_fast() starts every iteration with at
+   least 258 bytes of output space available (258 being the maximum length
+   output from a single token; see inflate_fast()'s assumptions below). */
+uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
+    unsigned char const *from = out - *dist;
+    memchunk_t chunk;
+    while (*dist < *len && *dist < sizeof(memchunk_t)) {
+        loadchunk(from, &chunk);
+        storechunk(out, &chunk);
+        out += *dist;
+        *len -= *dist;
+        *dist += *dist;
+    }
+    return out;
+}
+
+/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
+   Return OUT + LEN. */
+uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+    /* Debug performance related issues when len < sizeof(uint64_t):
+       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
+    Assert(dist > 0, "cannot have a distance 0");
+
+    unsigned char *from = out - dist;
+    memchunk_t chunk;
+    unsigned sz = sizeof(chunk);
+    if (len < sz) {
+        do {
+            *out++ = *from++;
+            --len;
+        } while (len != 0);
+        return out;
+    }
+
+#ifdef HAVE_CHUNKMEMSET_1
+    if (dist == 1) {
+        chunkmemset_1(from, &chunk);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_2
+    if (dist == 2) {
+        chunkmemset_2(from, &chunk);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_3
+    if (dist == 3) {
+        return chunkmemset_3(out, from, dist, len);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_4
+    if (dist == 4) {
+        chunkmemset_4(from, &chunk);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_6
+    if (dist == 6) {
+        return chunkmemset_6(out, from, dist, len);
+    } else
+#endif
+#ifdef HAVE_CHUNKMEMSET_8
+    if (dist == 8) {
+        chunkmemset_8(from, &chunk);
+    } else
+#endif
+    if (dist == sz) {
+        loadchunk(from, &chunk);
+    } else {
+        out = CHUNKUNROLL(out, &dist, &len);
+        return CHUNKCOPY(out, out - dist, len);
+    }
+
+    unsigned rem = len % sz;
+    len -= rem;
+    while (len) {
+        storechunk(out, &chunk);
+        out += sz;
+        len -= sz;
+    }
+
+    /* Last, deal with the case when LEN is not a multiple of SZ. */
+    if (rem)
+        memcpy(out, from, rem);
+    out += rem;
+
+    return out;
+}
+
+uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+    if (left < (unsigned)(3 * sizeof(memchunk_t))) {
+        while (len > 0) {
+            *out = *(out - dist);
+            out++;
+            --len;
+        }
+        return out;
+    }
+    return CHUNKMEMSET(out, dist, len);
+}
diff --git a/memcopy.h b/memcopy.h
deleted file mode 100644 (file)
index b3742c9..0000000
--- a/memcopy.h
+++ /dev/null
@@ -1,685 +0,0 @@
-/* memcopy.h -- inline functions to copy small data chunks.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#ifndef MEMCOPY_H_
-#define MEMCOPY_H_
-
-#include "zendian.h"
-
-/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
-static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
-    uint64_t chunk;
-    memcpy(&chunk, in, sizeof(chunk));
-
-#if BYTE_ORDER == LITTLE_ENDIAN
-    return chunk << bits;
-#else
-    return ZSWAP64(chunk) << bits;
-#endif
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#ifdef _M_ARM64
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
-typedef uint8x16_t inffast_chunk_t;
-#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
-#endif
-
-#if defined(X86_SSE2)
-#include <immintrin.h>
-typedef __m128i inffast_chunk_t;
-#define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t)
-#endif
-
-#ifdef INFFAST_CHUNKSIZE
-/*
-   Ask the compiler to perform a wide, unaligned load with an machine
-   instruction appropriate for the inffast_chunk_t type.
- */
-static inline inffast_chunk_t loadchunk(unsigned char const* s) {
-    inffast_chunk_t c;
-    memcpy(&c, s, sizeof(c));
-    return c;
-}
-
-/*
-   Ask the compiler to perform a wide, unaligned store with an machine
-   instruction appropriate for the inffast_chunk_t type.
- */
-static inline void storechunk(unsigned char* d, inffast_chunk_t c) {
-#ifdef _MSC_VER
-    /* Cast to inffast_chunk_t pointer to avoid compiler error on MSVC ARM */
-    inffast_chunk_t *dst_chunk = (inffast_chunk_t *)d;
-    memcpy(dst_chunk, &c, sizeof(c));
-#else
-    memcpy(d, &c, sizeof(c));
-#endif
-}
-
-/*
-   Behave like memcpy, but assume that it's OK to overwrite at least
-   INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this,
-   that the length is non-zero, and that `from` lags `out` by at least
-   INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that
-   the distance is less than the length of the copy).
-
-   Aside from better memory bus utilisation, this means that short copies
-   (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop
-   without iteration, which will hopefully make the branch prediction more
-   reliable.
- */
-static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) {
-    --len;
-    storechunk(out, loadchunk(from));
-    out += (len % INFFAST_CHUNKSIZE) + 1;
-    from += (len % INFFAST_CHUNKSIZE) + 1;
-    len /= INFFAST_CHUNKSIZE;
-    while (len > 0) {
-        storechunk(out, loadchunk(from));
-        out += INFFAST_CHUNKSIZE;
-        from += INFFAST_CHUNKSIZE;
-        --len;
-    }
-    return out;
-}
-
-/*
-   Behave like chunkcopy, but avoid writing beyond of legal output.
- */
-static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len,
-                                           unsigned char *safe) {
-    if ((safe - out) < (ptrdiff_t)INFFAST_CHUNKSIZE) {
-        if (len & 8) {
-            memcpy(out, from, 8);
-            out += 8;
-            from += 8;
-        }
-        if (len & 4) {
-            memcpy(out, from, 4);
-            out += 4;
-            from += 4;
-        }
-        if (len & 2) {
-            memcpy(out, from, 2);
-            out += 2;
-            from += 2;
-        }
-        if (len & 1) {
-            *out++ = *from++;
-        }
-        return out;
-    }
-    return chunkcopy(out, from, len);
-}
-
-/*
-   Perform short copies until distance can be rewritten as being at least
-   INFFAST_CHUNKSIZE.
-
-   This assumes that it's OK to overwrite at least the first
-   2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this.
-   This assumption holds because inflate_fast() starts every iteration with at
-   least 258 bytes of output space available (258 being the maximum length
-   output from a single token; see inflate_fast()'s assumptions below).
- */
-static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) {
-    unsigned char const *from = out - *dist;
-    while (*dist < *len && *dist < INFFAST_CHUNKSIZE) {
-        storechunk(out, loadchunk(from));
-        out += *dist;
-        *len -= *dist;
-        *dist += *dist;
-    }
-    return out;
-}
-
-static inline inffast_chunk_t chunkmemset_1(unsigned char *from) {
-#if defined(X86_SSE2)
-    int8_t c;
-    memcpy(&c, from, sizeof(c));
-    return _mm_set1_epi8(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-    return vld1q_dup_u8(from);
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_2(unsigned char *from) {
-    int16_t c;
-    memcpy(&c, from, sizeof(c));
-#if defined(X86_SSE2)
-    return _mm_set1_epi16(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-    return vreinterpretq_u8_s16(vdupq_n_s16(c));
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_4(unsigned char *from) {
-    int32_t c;
-    memcpy(&c, from, sizeof(c));
-#if defined(X86_SSE2)
-    return _mm_set1_epi32(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-    return vreinterpretq_u8_s32(vdupq_n_s32(c));
-#endif
-}
-
-static inline inffast_chunk_t chunkmemset_8(unsigned char *from) {
-#if defined(X86_SSE2)
-    int64_t c;
-    memcpy(&c, from, sizeof(c));
-    return _mm_set1_epi64x(c);
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
-    return vcombine_u8(vld1_u8(from), vld1_u8(from));
-#endif
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-static inline unsigned char *chunkmemset_3(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
-    uint8x8x3_t chunks;
-    unsigned sz = sizeof(chunks);
-    if (len < sz) {
-        out = chunkunroll(out, &dist, &len);
-        return chunkcopy(out, out - dist, len);
-    }
-
-    /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
-       chunks[0] = {a,a,a,a,a,a,a,a}
-       chunks[1] = {b,b,b,b,b,b,b,b}
-       chunks[2] = {c,c,c,c,c,c,c,c}. */
-    chunks = vld3_dup_u8(from);
-
-    unsigned rem = len % sz;
-    len -= rem;
-    while (len) {
-        /* Store "a,b,c, ..., a,b,c". */
-        vst3_u8(out, chunks);
-        out += sz;
-        len -= sz;
-    }
-
-    if (!rem)
-        return out;
-
-    /* Last, deal with the case when LEN is not a multiple of SZ. */
-    out = chunkunroll(out, &dist, &rem);
-    return chunkcopy(out, out - dist, rem);
-}
-#endif
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-static inline unsigned char *chunkmemset_6(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
-    uint16x8x3_t chunks;
-    unsigned sz = sizeof(chunks);
-    if (len < sz) {
-        out = chunkunroll(out, &dist, &len);
-        return chunkcopy(out, out - dist, len);
-    }
-
-    /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
-       chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
-       chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
-       chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
-    chunks = vld3q_dup_u16((unsigned short *)from);
-
-    unsigned rem = len % sz;
-    len -= rem;
-    while (len) {
-        /* Store "ab,cd,ef, ..., ab,cd,ef". */
-        vst3q_u16((unsigned short *)out, chunks);
-        out += sz;
-        len -= sz;
-    }
-
-    if (!rem)
-        return out;
-
-    /* Last, deal with the case when LEN is not a multiple of SZ. */
-    out = chunkunroll(out, &dist, &rem);
-    return chunkcopy(out, out - dist, rem);
-}
-#endif
-
-/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
-static inline unsigned char *chunkmemset(unsigned char *out, unsigned dist, unsigned len) {
-    /* Debug performance related issues when len < sizeof(uint64_t):
-       Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
-    Assert(dist > 0, "cannot have a distance 0");
-
-    unsigned char *from = out - dist;
-    inffast_chunk_t chunk;
-    unsigned sz = sizeof(chunk);
-    if (len < sz) {
-        do {
-            *out++ = *from++;
-            --len;
-        } while (len != 0);
-        return out;
-    }
-
-    switch (dist) {
-    case 1: {
-        chunk = chunkmemset_1(from);
-        break;
-    }
-    case 2: {
-        chunk = chunkmemset_2(from);
-        break;
-    }
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-    case 3:
-        return chunkmemset_3(out, from, dist, len);
-#endif
-    case 4: {
-        chunk = chunkmemset_4(from);
-        break;
-    }
-#if defined(__aarch64__) || defined(_M_ARM64)
-    case 6:
-        return chunkmemset_6(out, from, dist, len);
-#endif
-    case 8: {
-        chunk = chunkmemset_8(from);
-        break;
-    }
-    case 16:
-        memcpy(&chunk, from, sz);
-        break;
-
-    default:
-        out = chunkunroll(out, &dist, &len);
-        return chunkcopy(out, out - dist, len);
-    }
-
-    unsigned rem = len % sz;
-    len -= rem;
-    while (len) {
-        memcpy(out, &chunk, sz);
-        out += sz;
-        len -= sz;
-    }
-
-    /* Last, deal with the case when LEN is not a multiple of SZ. */
-    if (rem)
-        memcpy(out, &chunk, rem);
-    out += rem;
-    return out;
-}
-
-static inline unsigned char* chunkmemsetsafe(unsigned char *out, unsigned dist, unsigned len, unsigned left) {
-    if (left < (unsigned)(3 * INFFAST_CHUNKSIZE)) {
-        while (len > 0) {
-            *out = *(out - dist);
-            out++;
-            --len;
-        }
-        return out;
-    }
-
-    return chunkmemset(out, dist, len);
-}
-
-#else /* INFFAST_CHUNKSIZE */
-
-static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) {
-    *out++ = *from;
-    return out;
-}
-
-static inline unsigned char *copy_2_bytes(unsigned char *out, unsigned char *from) {
-    uint16_t chunk;
-    unsigned sz = sizeof(chunk);
-    memcpy(&chunk, from, sz);
-    memcpy(out, &chunk, sz);
-    return out + sz;
-}
-
-static inline unsigned char *copy_3_bytes(unsigned char *out, unsigned char *from) {
-    out = copy_1_bytes(out, from);
-    return copy_2_bytes(out, from + 1);
-}
-
-static inline unsigned char *copy_4_bytes(unsigned char *out, unsigned char *from) {
-    uint32_t chunk;
-    unsigned sz = sizeof(chunk);
-    memcpy(&chunk, from, sz);
-    memcpy(out, &chunk, sz);
-    return out + sz;
-}
-
-static inline unsigned char *copy_5_bytes(unsigned char *out, unsigned char *from) {
-    out = copy_1_bytes(out, from);
-    return copy_4_bytes(out, from + 1);
-}
-
-static inline unsigned char *copy_6_bytes(unsigned char *out, unsigned char *from) {
-    out = copy_2_bytes(out, from);
-    return copy_4_bytes(out, from + 2);
-}
-
-static inline unsigned char *copy_7_bytes(unsigned char *out, unsigned char *from) {
-    out = copy_3_bytes(out, from);
-    return copy_4_bytes(out, from + 3);
-}
-
-static inline unsigned char *copy_8_bytes(unsigned char *out, unsigned char *from) {
-    uint64_t chunk;
-    unsigned sz = sizeof(chunk);
-    memcpy(&chunk, from, sz);
-    memcpy(out, &chunk, sz);
-    return out + sz;
-}
-
-/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
-static inline unsigned char *copy_bytes(unsigned char *out, unsigned char *from, unsigned len) {
-    Assert(len < 8, "copy_bytes should be called with less than 8 bytes");
-
-#ifndef UNALIGNED_OK
-    while (len--) {
-        *out++ = *from++;
-    }
-    return out;
-#else
-    switch (len) {
-    case 7:
-        return copy_7_bytes(out, from);
-    case 6:
-        return copy_6_bytes(out, from);
-    case 5:
-        return copy_5_bytes(out, from);
-    case 4:
-        return copy_4_bytes(out, from);
-    case 3:
-        return copy_3_bytes(out, from);
-    case 2:
-        return copy_2_bytes(out, from);
-    case 1:
-        return copy_1_bytes(out, from);
-    case 0:
-        return out;
-    default:
-        Assert(0, "should not happen");
-    }
-
-    return out;
-#endif /* UNALIGNED_OK */
-}
-
-/* Copy LEN bytes (7 or fewer) from FROM into OUT. Return OUT + LEN. */
-static inline unsigned char *set_bytes(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
-    Assert(len < 8, "set_bytes should be called with less than 8 bytes");
-
-#ifndef UNALIGNED_OK
-    (void)dist;
-    while (len--) {
-        *out++ = *from++;
-    }
-    return out;
-#else
-    if (dist >= len)
-        return copy_bytes(out, from, len);
-
-    switch (dist) {
-    case 6:
-        Assert(len == 7, "len should be exactly 7");
-        out = copy_6_bytes(out, from);
-        return copy_1_bytes(out, from);
-
-    case 5:
-        Assert(len == 6 || len == 7, "len should be either 6 or 7");
-        out = copy_5_bytes(out, from);
-        return copy_bytes(out, from, len - 5);
-
-    case 4:
-        Assert(len == 5 || len == 6 || len == 7, "len should be either 5, 6, or 7");
-        out = copy_4_bytes(out, from);
-        return copy_bytes(out, from, len - 4);
-
-    case 3:
-        Assert(4 <= len && len <= 7, "len should be between 4 and 7");
-        out = copy_3_bytes(out, from);
-        switch (len) {
-        case 7:
-            return copy_4_bytes(out, from);
-        case 6:
-            return copy_3_bytes(out, from);
-        case 5:
-            return copy_2_bytes(out, from);
-        case 4:
-            return copy_1_bytes(out, from);
-        default:
-            Assert(0, "should not happen");
-            break;
-        }
-
-    case 2:
-        Assert(3 <= len && len <= 7, "len should be between 3 and 7");
-        out = copy_2_bytes(out, from);
-        switch (len) {
-        case 7:
-            out = copy_4_bytes(out, from);
-            out = copy_1_bytes(out, from);
-            return out;
-        case 6:
-            out = copy_4_bytes(out, from);
-            return out;
-        case 5:
-            out = copy_2_bytes(out, from);
-            out = copy_1_bytes(out, from);
-            return out;
-        case 4:
-            out = copy_2_bytes(out, from);
-            return out;
-        case 3:
-            out = copy_1_bytes(out, from);
-            return out;
-        default:
-            Assert(0, "should not happen");
-            break;
-        }
-
-    case 1:
-        Assert(2 <= len && len <= 7, "len should be between 2 and 7");
-        unsigned char c = *from;
-        switch (len) {
-        case 7:
-            memset(out, c, 7);
-            return out + 7;
-        case 6:
-            memset(out, c, 6);
-            return out + 6;
-        case 5:
-            memset(out, c, 5);
-            return out + 5;
-        case 4:
-            memset(out, c, 4);
-            return out + 4;
-        case 3:
-            memset(out, c, 3);
-            return out + 3;
-        case 2:
-            memset(out, c, 2);
-            return out + 2;
-        default:
-            Assert(0, "should not happen");
-            break;
-        }
-    }
-    return out;
-#endif /* UNALIGNED_OK */
-}
-
-/* Byte by byte semantics: copy LEN bytes from OUT + DIST and write them to OUT. Return OUT + LEN. */
-static inline unsigned char *chunk_memcpy(unsigned char *out, unsigned char *from, unsigned len) {
-    unsigned sz = sizeof(uint64_t);
-    Assert(len >= sz, "chunk_memcpy should be called on larger chunks");
-
-    /* Copy a few bytes to make sure the loop below has a multiple of SZ bytes to be copied. */
-    copy_8_bytes(out, from);
-
-    unsigned rem = len % sz;
-    len /= sz;
-    out += rem;
-    from += rem;
-
-    unsigned by8 = len % sz;
-    len -= by8;
-    switch (by8) {
-    case 7:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 6:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 5:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 4:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 3:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 2:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    case 1:
-        out = copy_8_bytes(out, from);
-        from += sz;
-    }
-
-    while (len) {
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-        out = copy_8_bytes(out, from);
-        from += sz;
-
-        len -= 8;
-    }
-
-    return out;
-}
-
-/* Memset LEN bytes in OUT with the value at OUT - 1. Return OUT + LEN. */
-static inline unsigned char *byte_memset(unsigned char *out, unsigned len) {
-    unsigned sz = sizeof(uint64_t);
-    Assert(len >= sz, "byte_memset should be called on larger chunks");
-
-    unsigned char *from = out - 1;
-    unsigned char c = *from;
-
-    /* First, deal with the case when LEN is not a multiple of SZ. */
-    memset(out, c, sz);
-    unsigned rem = len % sz;
-    len /= sz;
-    out += rem;
-
-    unsigned by8 = len % 8;
-    len -= by8;
-    switch (by8) {
-    case 7:
-        memset(out, c, sz);
-        out += sz;
-    case 6:
-        memset(out, c, sz);
-        out += sz;
-    case 5:
-        memset(out, c, sz);
-        out += sz;
-    case 4:
-        memset(out, c, sz);
-        out += sz;
-    case 3:
-        memset(out, c, sz);
-        out += sz;
-    case 2:
-        memset(out, c, sz);
-        out += sz;
-    case 1:
-        memset(out, c, sz);
-        out += sz;
-    }
-
-    while (len) {
-        /* When sz is a constant, the compiler replaces __builtin_memset with an
-           inline version that does not incur a function call overhead. */
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        memset(out, c, sz);
-        out += sz;
-        len -= 8;
-    }
-
-    return out;
-}
-
-/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */
-static inline unsigned char *chunk_memset(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) {
-    if (dist >= len)
-        return chunk_memcpy(out, from, len);
-
-    Assert(len >= sizeof(uint64_t), "chunk_memset should be called on larger chunks");
-
-    /* Double up the size of the memset pattern until reaching the largest pattern of size less than SZ. */
-    unsigned sz = sizeof(uint64_t);
-    while (dist < len && dist < sz) {
-        copy_8_bytes(out, from);
-
-        out += dist;
-        len -= dist;
-        dist += dist;
-
-        /* Make sure the next memcpy has at least SZ bytes to be copied.  */
-        if (len < sz)
-            /* Finish up byte by byte when there are not enough bytes left. */
-            return set_bytes(out, from, dist, len);
-    }
-
-    return chunk_memcpy(out, from, len);
-}
-
-/* Byte by byte semantics: copy LEN bytes from FROM and write them to OUT. Return OUT + LEN. */
-static inline unsigned char *chunk_copy(unsigned char *out, unsigned char *from, int dist, unsigned len) {
-    if (len < sizeof(uint64_t)) {
-        if (dist > 0)
-            return set_bytes(out, from, dist, len);
-
-        return copy_bytes(out, from, len);
-    }
-
-    if (dist == 1)
-        return byte_memset(out, len);
-
-    if (dist > 0)
-        return chunk_memset(out, from, dist, len);
-
-    return chunk_memcpy(out, from, len);
-}
-#endif /* INFFAST_CHUNKSIZE */
-#endif /* MEMCOPY_H_ */
index 6b6a8f5b2865d35415d90cd0f4305f11bc0c68ee..bd12e071980f0d3514e431b176e6715a7eabd980 100644 (file)
@@ -29,6 +29,7 @@ WFLAGS  = \
        -DUNALIGNED_OK \
        -DUNALIGNED64_OK \
        -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
+       -DARM_CPUID \
        #
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
@@ -56,6 +57,7 @@ OBJS = \
        inftrees.obj \
        inffast.obj \
        insert_string.obj \
+       memchunk.obj \
        trees.obj \
        uncompr.obj \
        zutil.obj \
@@ -77,10 +79,11 @@ WFLAGS = $(WFLAGS) \
        -DARM_ACLE_CRC_HASH \
        -D__ARM_NEON__=1 \
        -DARM_NEON_ADLER32 \
+       -DARM_NEON_MEMCHUNK \
        -DARM_NEON_SLIDEHASH \
        -DARM_NOCHECK_NEON \
        #
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj slide_neon.obj adler32_neon.obj
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj memchunk_neon.obj slide_neon.obj
 
 # targets
 all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
@@ -160,9 +163,10 @@ deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.
 deflate_medium.obj: $(SRCDIR)/deflate_medium.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h
 inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
 zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h
 
index 95933e247c7187573edc30936190a07a2b72e3c0..69f694a82aa9635632455b0bba9d0ba5f68ee4bb 100644 (file)
@@ -28,6 +28,7 @@ WFLAGS  = \
        -D_CRT_NONSTDC_NO_DEPRECATE \
        -DUNALIGNED_OK \
        -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
+       -DARM_CPUID \
        #
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
@@ -59,6 +60,7 @@ OBJS = \
        inftrees.obj \
        inffast.obj \
        insert_string.obj \
+       memchunk.obj \
        trees.obj \
        uncompr.obj \
        zutil.obj \
@@ -88,10 +90,11 @@ CFLAGS = $(CFLAGS) $(NEON_ARCH)
 WFLAGS = $(WFLAGS) \
        -D__ARM_NEON__=1 \
        -DARM_NEON_ADLER32 \
+       -DARM_NEON_MEMCHUNK \
        -DARM_NEON_SLIDEHASH \
        -DARM_NOCHECK_NEON \
        #
-OBJS = $(OBJS) adler32_neon.obj slide_neon.obj
+OBJS = $(OBJS) adler32_neon.obj memchunk_neon.obj slide_neon.obj
 !endif
 
 # targets
@@ -172,9 +175,10 @@ deflate_medium.obj: $(SRCDIR)/deflate_medium.c $(SRCDIR)/zbuild.h $(SRCDIR)/defl
 deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/trees_emit.h
 deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
 inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
 zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h
 
index 8bfa91bb39bff0419cf643c2ee6bfde8cac3081f..f7b0bac7885bc44ea461393e69c359548ca14ae2 100644 (file)
@@ -32,6 +32,7 @@ WFLAGS  = \
        -DX86_SSE42_CRC_INTRIN \
        -DX86_SSE42_CRC_HASH \
        -DX86_AVX2 \
+       -DX86_SSE2_MEMCHUNK \
        -DUNALIGNED_OK \
        -DUNALIGNED64_OK \
        #
@@ -65,6 +66,8 @@ OBJS = \
        inffast.obj \
        insert_string.obj \
        insert_string_sse.obj \
+       memchunk.obj \
+       memchunk_sse.obj \
        slide_avx.obj \
        slide_sse.obj \
        trees.obj \
@@ -169,9 +172,11 @@ deflate_medium.obj: $(SRCDIR)/deflate_medium.c $(SRCDIR)/zbuild.h $(SRCDIR)/defl
 deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/trees_emit.h
 deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h
-inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h
-inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h $(SRCDIR)/functable.h
+inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
+inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h
 inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h
+memchunk.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+memchunk_sse.obj: $(SRCDIR)/arch/x86/memchunk_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 slide_sse.obj: $(SRCDIR)/arch/x86/slide_sse.c $(SRCDIR)/deflate.h
 trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees.h
 zutil.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h