]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Move crc32 folding functions into functable.
authorNathan Moinvaziri <nathan@nathanm.com>
Sat, 3 Jul 2021 00:44:08 +0000 (17:44 -0700)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 13 Aug 2021 13:05:34 +0000 (15:05 +0200)
18 files changed:
CMakeLists.txt
Makefile.in
arch/x86/INDEX.md
arch/x86/Makefile.in
arch/x86/crc32_fold_pclmulqdq.c [moved from arch/x86/crc_folding.c with 89% similarity]
arch/x86/crc_folding.h [deleted file]
configure
crc32.c
crc32_fold.c [new file with mode: 0644]
crc32_fold.h [new file with mode: 0644]
deflate.c
deflate.h
functable.c
functable.h
win32/Makefile.a64
win32/Makefile.arm
win32/Makefile.msc
zutil.h

index 6c148222d04e49cff23dc8140431123e2adfc8cd..dcd07ad6023b34c96abb529b3a4531446d2eff0c 100644 (file)
@@ -766,7 +766,7 @@ if(WITH_OPTIM)
             check_pclmulqdq_intrinsics()
             if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
                 add_definitions(-DX86_PCLMULQDQ_CRC)
-                set(PCLMULQDQ_SRCS ${ARCHDIR}/crc_folding.c)
+                set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c)
                 add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG}\"")
                 list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
                 set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
@@ -849,6 +849,7 @@ set(ZLIB_PRIVATE_HDRS
     crc32_p.h
     crc32_tbl.h
     crc32_comb_tbl.h
+    crc32_fold.h
     deflate.h
     deflate_p.h
     functable.h
@@ -873,6 +874,7 @@ set(ZLIB_SRCS
     compress.c
     crc32.c
     crc32_comb.c
+    crc32_fold.c
     deflate.c
     deflate_fast.c
     deflate_huff.c
index 19901ef4793dacdee1b89bf18ab358785beeb002..fb6a30c24790167faeef610a66eebe1a51f6731d 100644 (file)
@@ -78,6 +78,7 @@ OBJZ = \
        compress.o \
        crc32.o \
        crc32_comb.o \
+       crc32_fold.o \
        deflate.o \
        deflate_fast.o \
        deflate_huff.o \
@@ -113,6 +114,7 @@ PIC_OBJZ = \
        compress.lo \
        crc32.lo \
        crc32_comb.lo \
+       crc32_fold.lo \
        deflate.lo \
        deflate_fast.lo \
        deflate_huff.lo \
index d32bfe8b1d17a4c70b45b268b6e0cc2c0ee6945d..e20245a5e1df41396dcce233bb0514cc9a938d43 100644 (file)
@@ -4,5 +4,5 @@ Contents
 |Name|Description|
 |:-|:-|
 |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
-|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
 |slide_hash_sse2.c|SSE2 optimized slide_hash|
index fa153592a50a628a74e6a29747f145c597e11527..c5e588e70d9cd44ce9d4dcb48031ec5816e09768 100644 (file)
@@ -28,7 +28,7 @@ all: \
        compare258_avx.o compare258_avx.lo \
        compare258_sse.o compare258_sse.lo \
        insert_string_sse.o insert_string_sse.lo \
-       crc_folding.o crc_folding.lo \
+       crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
        slide_hash_avx.o slide_hash_avx.lo \
        slide_hash_sse.o slide_hash_sse.lo
 
@@ -68,11 +68,11 @@ insert_string_sse.o:
 insert_string_sse.lo:
        $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
 
-crc_folding.o:
-       $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
+crc32_fold_pclmulqdq.o:
+       $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
 
-crc_folding.lo:
-       $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
+crc32_fold_pclmulqdq.lo:
+       $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
 
 slide_hash_avx.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c
similarity index 89%
rename from arch/x86/crc_folding.c
rename to arch/x86/crc32_fold_pclmulqdq.c
index de087e34409e5390f432b528bca6dbbe77b3d7c6..30a24ea869eb6b50c5883fa94bc17738a1a4f787 100644 (file)
  */
 
 #ifdef X86_PCLMULQDQ_CRC
+#include "../../zutil.h"
 
 #include <inttypes.h>
 #include <immintrin.h>
 #include <wmmintrin.h>
 
-#include "crc_folding.h"
+#include "../../crc32_fold.h"
 
-Z_INTERNAL void crc_fold_init(unsigned int crc0[4 * 5]) {
+Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
     /* CRC_SAVE */
-    _mm_storeu_si128((__m128i *)crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
-    _mm_storeu_si128((__m128i *)crc0 + 1, _mm_setzero_si128());
-    _mm_storeu_si128((__m128i *)crc0 + 2, _mm_setzero_si128());
-    _mm_storeu_si128((__m128i *)crc0 + 3, _mm_setzero_si128());
+    _mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487));
+    _mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128());
+    _mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128());
+    _mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128());
+
+    return 0;
 }
 
 static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
@@ -224,16 +227,16 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
     *xmm_crc3 = _mm_castps_si128(ps_res);
 }
 
-Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, const unsigned char *src, long len) {
+Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
     unsigned long algn_diff;
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
     char ALIGNED_(16) partial_buf[16] = { 0 };
 
     /* CRC_LOAD */
-    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
-    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
-    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
-    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
+    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
+    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
+    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
+    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
     __m128i xmm_crc_part;
 
     if (len < 16) {
@@ -260,7 +263,7 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
         xmm_crc_part = _mm_setzero_si128();
     }
 
-    while ((len -= 64) >= 0) {
+    while (len >= 64) {
         /* CRC_LOAD */
         xmm_t0 = _mm_load_si128((__m128i *)src);
         xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -282,14 +285,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
 
         src += 64;
         dst += 64;
+        len -= 64;
     }
 
     /*
      * len = num bytes left - 64
      */
-    if (len + 16 >= 0) {
-        len += 16;
-
+    if (len >= 48) {
         xmm_t0 = _mm_load_si128((__m128i *)src);
         xmm_t1 = _mm_load_si128((__m128i *)src + 1);
         xmm_t2 = _mm_load_si128((__m128i *)src + 2);
@@ -303,15 +305,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
-
+        len -= 48;
         if (len == 0)
             goto done;
 
         dst += 48;
         memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
-    } else if (len + 32 >= 0) {
-        len += 32;
-
+    } else if (len >= 32) {
         xmm_t0 = _mm_load_si128((__m128i *)src);
         xmm_t1 = _mm_load_si128((__m128i *)src + 1);
 
@@ -323,14 +323,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
 
+        len -= 32;
         if (len == 0)
             goto done;
 
         dst += 32;
         memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
-    } else if (len + 48 >= 0) {
-        len += 48;
-
+    } else if (len >= 16) {
         xmm_t0 = _mm_load_si128((__m128i *)src);
 
         fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
@@ -339,13 +338,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons
 
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
 
+        len -= 16;
         if (len == 0)
             goto done;
 
         dst += 16;
         memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
     } else {
-        len += 64;
         if (len == 0)
             goto done;
         memcpy(&xmm_crc_part, src, len);
@@ -358,11 +357,11 @@ partial:
     partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
 done:
     /* CRC_SAVE */
-    _mm_storeu_si128((__m128i *)crc0 + 0, xmm_crc0);
-    _mm_storeu_si128((__m128i *)crc0 + 1, xmm_crc1);
-    _mm_storeu_si128((__m128i *)crc0 + 2, xmm_crc2);
-    _mm_storeu_si128((__m128i *)crc0 + 3, xmm_crc3);
-    _mm_storeu_si128((__m128i *)crc0 + 4, xmm_crc_part);
+    _mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0);
+    _mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1);
+    _mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2);
+    _mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3);
+    _mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part);
 }
 
 static const unsigned ALIGNED_(16) crc_k[] = {
@@ -382,18 +381,17 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) {
+Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
     const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
     const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
 
-    uint32_t crc;
     __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
 
     /* CRC_LOAD */
-    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
-    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
-    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
-    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
+    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
+    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
+    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
+    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
 
     /*
      * k1
@@ -447,8 +445,9 @@ uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) {
     xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
     xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
 
-    crc = (uint32_t)_mm_extract_epi32(xmm_crc3, 2);
-    return ~crc;
+    crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
+
+    return crc->value;
 }
 
 #endif
diff --git a/arch/x86/crc_folding.h b/arch/x86/crc_folding.h
deleted file mode 100644 (file)
index 3af7f07..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/* crc_folding.h
- *
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
- * instruction.
- *
- * Copyright (C) 2013 Intel Corporation Jim Kukunas
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef CRC_FOLDING_H_
-#define CRC_FOLDING_H_
-
-#include "../../zutil.h"
-
-Z_INTERNAL void crc_fold_init(unsigned int crc0[4 * 5]);
-Z_INTERNAL uint32_t crc_fold_512to32(unsigned int crc0[4 * 5]);
-Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *, const unsigned char *, long);
-
-#endif
index 596ce6046a5823ea98b8ba95b0ee849800ada5e7..765ea443596ea0aa61a4d3f16ac394665cda3f3e 100755 (executable)
--- a/configure
+++ b/configure
@@ -1353,8 +1353,8 @@ case "${ARCH}" in
             if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC"
                 SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc_folding.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
             fi
         fi
     ;;
diff --git a/crc32.c b/crc32.c
index 3cb066ce5ea74279eff8604253c7cdc2974053a2..c519d874aa86a5c931f8c42e709b391f9009ab78 100644 (file)
--- a/crc32.c
+++ b/crc32.c
@@ -168,34 +168,3 @@ Z_INTERNAL uint32_t crc32_big(uint32_t crc, const unsigned char *buf, uint64_t l
     return ZSWAP32(c);
 }
 #endif /* BYTE_ORDER == BIG_ENDIAN */
-
-#ifdef X86_PCLMULQDQ_CRC
-#include "arch/x86/x86.h"
-#include "arch/x86/crc_folding.h"
-
-Z_INTERNAL void crc_finalize(deflate_state *const s) {
-    if (x86_cpu_has_pclmulqdq)
-        s->strm->adler = crc_fold_512to32(s->crc0);
-}
-#endif
-
-Z_INTERNAL void crc_reset(deflate_state *const s) {
-#ifdef X86_PCLMULQDQ_CRC
-    x86_check_features();
-    if (x86_cpu_has_pclmulqdq) {
-        crc_fold_init(s->crc0);
-    }
-#endif
-    s->strm->adler = CRC32_INITIAL_VALUE;
-}
-
-Z_INTERNAL void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size) {
-#ifdef X86_PCLMULQDQ_CRC
-    if (x86_cpu_has_pclmulqdq) {
-        crc_fold_copy(strm->state->crc0, dst, strm->next_in, size);
-        return;
-    }
-#endif
-    memcpy(dst, strm->next_in, size);
-    strm->adler = PREFIX(crc32)(strm->adler, dst, size);
-}
diff --git a/crc32_fold.c b/crc32_fold.c
new file mode 100644 (file)
index 0000000..7771c36
--- /dev/null
@@ -0,0 +1,23 @@
+/* crc32_fold.c -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zutil.h"
+#include "functable.h"
+
+#include "crc32_fold.h"
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) {
+    crc->value = CRC32_INITIAL_VALUE;
+    return crc->value;
+}
+
+Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    crc->value = functable.crc32(crc->value, src, len);
+    memcpy(dst, src, len);
+}
+
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) {
+    return crc->value;
+}
diff --git a/crc32_fold.h b/crc32_fold.h
new file mode 100644 (file)
index 0000000..ec6d13d
--- /dev/null
@@ -0,0 +1,17 @@
+/* crc32_fold.h -- crc32 folding interface
+ * Copyright (C) 2021 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifndef CRC32_FOLD_H_
+#define CRC32_FOLD_H_
+
+typedef struct crc32_fold_s {
+    uint32_t ALIGNED_(16) fold[4 * 5];
+    uint32_t value;
+} crc32_fold;
+
+Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc);
+Z_INTERNAL void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+#endif
index c1ee6d41043269a58737c173b16271b604bd9b6a..79fa85dbc49acec97ceef1940468d0faa2353bc8 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -114,12 +114,6 @@ static void lm_set_level         (deflate_state *s, int level);
 static void lm_init              (deflate_state *s);
 Z_INTERNAL unsigned read_buf  (PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
 
-extern void crc_reset(deflate_state *const s);
-#ifdef X86_PCLMULQDQ_CRC
-extern void crc_finalize(deflate_state *const s);
-#endif
-extern void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size);
-
 extern uint32_t update_hash_roll        (deflate_state *const s, uint32_t h, uint32_t val);
 extern void     insert_string_roll      (deflate_state *const s, uint32_t str, uint32_t count);
 extern Pos      quick_insert_string_roll(deflate_state *const s, uint32_t str);
@@ -454,7 +448,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) {
 
 #ifdef GZIP
     if (s->wrap == 2)
-        crc_reset(s);
+        strm->adler = functable.crc32_fold_reset(&s->crc_fold);
     else
 #endif
         strm->adler = ADLER32_INITIAL_VALUE;
@@ -780,7 +774,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
 #ifdef GZIP
     if (s->status == GZIP_STATE) {
         /* gzip header */
-        crc_reset(s);
+        functable.crc32_fold_reset(&s->crc_fold);
         put_byte(s, 31);
         put_byte(s, 139);
         put_byte(s, 8);
@@ -897,7 +891,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
                 }
             }
             put_short(s, (uint16_t)strm->adler);
-            crc_reset(s);
+            functable.crc32_fold_reset(&s->crc_fold);
         }
         s->status = BUSY_STATE;
 
@@ -968,9 +962,8 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) {
     /* Write the trailer */
 #ifdef GZIP
     if (s->wrap == 2) {
-#  ifdef X86_PCLMULQDQ_CRC
-        crc_finalize(s);
-#  endif
+        strm->adler = functable.crc32_fold_final(&s->crc_fold);
+
         put_uint32(s, strm->adler);
         put_uint32(s, (uint32_t)strm->total_in);
     } else
@@ -1082,7 +1075,7 @@ Z_INTERNAL unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned
         memcpy(buf, strm->next_in, len);
 #ifdef GZIP
     } else if (strm->state->wrap == 2) {
-        copy_with_crc(strm, buf, len);
+        functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len);
 #endif
     } else {
         memcpy(buf, strm->next_in, len);
index 8c443d0c1de318f0bdd10e95865aeced82b0894c..94ff239ce4bbc1894744cdc9291fe2826b8fec42 100644 (file)
--- a/deflate.h
+++ b/deflate.h
@@ -12,6 +12,7 @@
 
 #include "zutil.h"
 #include "zendian.h"
+#include "crc32_fold.h"
 
 /* define NO_GZIP when compiling if you want to disable gzip header and
    trailer creation by deflate().  NO_GZIP would be used to avoid linking in
@@ -210,10 +211,7 @@ struct internal_state {
 
     int nice_match; /* Stop searching when current match exceeds this */
 
-#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
-    /* Only used if X86_PCLMULQDQ_CRC is defined */
-    unsigned crc0[4 * 5];
-#endif
+    crc32_fold ALIGNED_(16) crc_fold;
 
                 /* used by trees.c: */
     /* Didn't use ct_data typedef below to suppress compiler warning */
index 36eec102314f83f22fb15be5b14db4a0c125ec68..d8b561b5a9b65724414ecb4c603212264e364af6 100644 (file)
@@ -73,6 +73,17 @@ extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t le
 extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
 #endif
 
+/* CRC32 folding */
+extern uint32_t crc32_fold_reset_c(crc32_fold *crc);
+extern void     crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t crc32_fold_final_c(crc32_fold *crc);
+
+#ifdef X86_PCLMULQDQ_CRC
+extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
+extern void     crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
+#endif
+
 /* memory chunking */
 extern uint32_t chunksize_c(void);
 extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len);
@@ -304,6 +315,36 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_
     return functable.adler32(adler, buf, len);
 }
 
+Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
+    functable.crc32_fold_reset = crc32_fold_reset_c;
+    cpu_check_features();
+#ifdef X86_PCLMULQDQ_CRC
+    if (x86_cpu_has_pclmulqdq)
+        functable.crc32_fold_reset = crc32_fold_reset_pclmulqdq;
+#endif
+    return functable.crc32_fold_reset(crc);
+}
+
+Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+    functable.crc32_fold_copy = crc32_fold_copy_c;
+    cpu_check_features();
+#ifdef X86_PCLMULQDQ_CRC
+    if (x86_cpu_has_pclmulqdq)
+        functable.crc32_fold_copy = crc32_fold_copy_pclmulqdq;
+#endif
+    functable.crc32_fold_copy(crc, dst, src, len);
+}
+
+Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) {
+    functable.crc32_fold_final = crc32_fold_final_c;
+    cpu_check_features();
+#ifdef X86_PCLMULQDQ_CRC
+    if (x86_cpu_has_pclmulqdq)
+        functable.crc32_fold_final = crc32_fold_final_pclmulqdq;
+#endif
+    return functable.crc32_fold_final(crc);
+}
+
 Z_INTERNAL uint32_t chunksize_stub(void) {
     // Initialize default
     functable.chunksize = &chunksize_c;
@@ -579,6 +620,9 @@ Z_INTERNAL Z_TLS struct functable_s functable = {
     quick_insert_string_stub,
     adler32_stub,
     crc32_stub,
+    crc32_fold_reset_stub,
+    crc32_fold_copy_stub,
+    crc32_fold_final_stub,
     slide_hash_stub,
     compare258_stub,
     longest_match_stub,
index f4b17569ac2f7bac00fe39fc6c18c9fcf02b3b41..36039fcb27ff792cab618ce6393a4761477362a0 100644 (file)
@@ -7,6 +7,7 @@
 #define FUNCTABLE_H_
 
 #include "deflate.h"
+#include "crc32_fold.h"
 
 struct functable_s {
     uint32_t (* update_hash)        (deflate_state *const s, uint32_t h, uint32_t val);
@@ -14,6 +15,9 @@ struct functable_s {
     Pos      (* quick_insert_string)(deflate_state *const s, uint32_t str);
     uint32_t (* adler32)            (uint32_t adler, const unsigned char *buf, size_t len);
     uint32_t (* crc32)              (uint32_t crc, const unsigned char *buf, uint64_t len);
+    uint32_t (* crc32_fold_reset)   (crc32_fold *crc);
+    void     (* crc32_fold_copy)    (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+    uint32_t (* crc32_fold_final)   (crc32_fold *crc);
     void     (* slide_hash)         (deflate_state *s);
     uint32_t (* compare258)         (const unsigned char *src0, const unsigned char *src1);
     uint32_t (* longest_match)      (deflate_state *const s, Pos cur_match);
index 8746e5cbfd0fe44ecc811d1e9c9b52fb7ea561b0..418558b1377b251def5e420ddde9023b8fe102f8 100644 (file)
@@ -49,6 +49,7 @@ OBJS = \
        compress.obj \
        crc32.obj \
        crc32_comb.obj \
+       crc32_fold.obj \
        deflate.obj \
        deflate_fast.obj \
        deflate_huff.obj \
@@ -168,6 +169,7 @@ compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
+crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
index 8403043996214bc827d6371579c9bb91def774a8..c44c4d8c865e3c444140879a9900545bc5048299 100644 (file)
@@ -52,6 +52,7 @@ OBJS = \
        compress.obj \
        crc32.obj \
        crc32_comb.obj \
+       crc32_fold.obj \
        deflate.obj \
        deflate_fast.obj \
        deflate_huff.obj \
@@ -180,6 +181,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
+crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
index 3110f7fe989792067bb4588e8b8733b97c23fe77..eec878f249acecff0c34d5df740bbc2aee4bee6e 100644 (file)
@@ -58,7 +58,8 @@ OBJS = \
        compress.obj \
        crc32.obj \
        crc32_comb.obj \
-       crc_folding.obj \
+       crc32_fold.obj \
+       crc32_fold_pclmulqdq.obj \
        deflate.obj \
        deflate_fast.obj \
        deflate_huff.obj \
@@ -174,6 +175,8 @@ chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)
 chunkset_sse.obj: $(SRCDIR)/arch/x86/chunkset_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h
+crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
+crc32_fold_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq.c $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
diff --git a/zutil.h b/zutil.h
index 30bd6394ea9f3d9afc256d4e9e1de50cb38477c7..0a44f775c371384398751b7521d18de971554093 100644 (file)
--- a/zutil.h
+++ b/zutil.h
@@ -76,7 +76,7 @@ extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */
 #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
 
 #define ADLER32_INITIAL_VALUE 1 /* initial adler-32 hash value */
-#define CRC32_INITIAL_VALUE 0 /* initial crc-32 hash value */
+#define CRC32_INITIAL_VALUE   0 /* initial crc-32 hash value */
 
         /* target dependencies */