From: Nathan Moinvaziri Date: Sat, 3 Jul 2021 00:44:08 +0000 (-0700) Subject: Move crc32 folding functions into functable. X-Git-Tag: 2.1.0-beta1~519 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d802e8900f2a09b5a2d0710bfee3e7b3adc87567;p=thirdparty%2Fzlib-ng.git Move crc32 folding functions into functable. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c148222d..dcd07ad60 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -766,7 +766,7 @@ if(WITH_OPTIM) check_pclmulqdq_intrinsics() if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) add_definitions(-DX86_PCLMULQDQ_CRC) - set(PCLMULQDQ_SRCS ${ARCHDIR}/crc_folding.c) + set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c) add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE4FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") @@ -849,6 +849,7 @@ set(ZLIB_PRIVATE_HDRS crc32_p.h crc32_tbl.h crc32_comb_tbl.h + crc32_fold.h deflate.h deflate_p.h functable.h @@ -873,6 +874,7 @@ set(ZLIB_SRCS compress.c crc32.c crc32_comb.c + crc32_fold.c deflate.c deflate_fast.c deflate_huff.c diff --git a/Makefile.in b/Makefile.in index 19901ef47..fb6a30c24 100644 --- a/Makefile.in +++ b/Makefile.in @@ -78,6 +78,7 @@ OBJZ = \ compress.o \ crc32.o \ crc32_comb.o \ + crc32_fold.o \ deflate.o \ deflate_fast.o \ deflate_huff.o \ @@ -113,6 +114,7 @@ PIC_OBJZ = \ compress.lo \ crc32.lo \ crc32_comb.lo \ + crc32_fold.lo \ deflate.lo \ deflate_fast.lo \ deflate_huff.lo \ diff --git a/arch/x86/INDEX.md b/arch/x86/INDEX.md index d32bfe8b1..e20245a5e 100644 --- a/arch/x86/INDEX.md +++ b/arch/x86/INDEX.md @@ -4,5 +4,5 @@ Contents |Name|Description| |:-|:-| |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1| -|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation| +|crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation| |slide_hash_sse2.c|SSE2 optimized slide_hash| diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index fa153592a..c5e588e70 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -28,7 +28,7 @@ all: \ compare258_avx.o compare258_avx.lo \ compare258_sse.o compare258_sse.lo \ insert_string_sse.o insert_string_sse.lo \ - crc_folding.o crc_folding.lo \ + crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \ slide_hash_avx.o slide_hash_avx.lo \ slide_hash_sse.o slide_hash_sse.lo @@ -68,11 +68,11 @@ insert_string_sse.o: insert_string_sse.lo: $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c -crc_folding.o: - $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c +crc32_fold_pclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c -crc_folding.lo: - $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c +crc32_fold_pclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c slide_hash_avx.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c diff --git a/arch/x86/crc_folding.c b/arch/x86/crc32_fold_pclmulqdq.c similarity index 89% rename from arch/x86/crc_folding.c rename to arch/x86/crc32_fold_pclmulqdq.c index de087e344..30a24ea86 100644 --- a/arch/x86/crc_folding.c +++ b/arch/x86/crc32_fold_pclmulqdq.c @@ -17,19 +17,22 @@ */ #ifdef X86_PCLMULQDQ_CRC +#include "../../zutil.h" #include #include #include -#include "crc_folding.h" +#include "../../crc32_fold.h" -Z_INTERNAL void crc_fold_init(unsigned int crc0[4 * 5]) { +Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) { /* CRC_SAVE */ - _mm_storeu_si128((__m128i *)crc0 + 0, _mm_cvtsi32_si128(0x9db42487)); - _mm_storeu_si128((__m128i *)crc0 + 1, _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)crc0 + 2, _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)crc0 + 3, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487)); + _mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128()); + _mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128()); + + return 0; } static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { @@ -224,16 +227,16 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, *xmm_crc3 = _mm_castps_si128(ps_res); } -Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, const unsigned char *src, long len) { +Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { unsigned long algn_diff; __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; char ALIGNED_(16) partial_buf[16] = { 0 }; /* CRC_LOAD */ - __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0); - __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1); - __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2); - __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3); + __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0); + __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1); + __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2); + __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3); __m128i xmm_crc_part; if (len < 16) { @@ -260,7 +263,7 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons xmm_crc_part = _mm_setzero_si128(); } - while ((len -= 64) >= 0) { + while (len >= 64) { /* CRC_LOAD */ xmm_t0 = _mm_load_si128((__m128i *)src); xmm_t1 = _mm_load_si128((__m128i *)src + 1); @@ -282,14 +285,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons src += 64; dst += 64; + len -= 64; } /* * len = num bytes left - 64 */ - if (len + 16 >= 0) { - len += 16; - + if (len >= 48) { xmm_t0 = _mm_load_si128((__m128i *)src); xmm_t1 = _mm_load_si128((__m128i *)src + 1); xmm_t2 = _mm_load_si128((__m128i *)src + 2); @@ -303,15 +305,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); - + len -= 48; if (len == 0) goto done; dst += 48; memcpy(&xmm_crc_part, (__m128i *)src + 3, len); - } else if (len + 32 >= 0) { - len += 32; - + } else if (len >= 32) { xmm_t0 = _mm_load_si128((__m128i *)src); xmm_t1 = _mm_load_si128((__m128i *)src + 1); @@ -323,14 +323,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + len -= 32; if (len == 0) goto done; dst += 32; memcpy(&xmm_crc_part, (__m128i *)src + 2, len); - } else if (len + 48 >= 0) { - len += 48; - + } else if (len >= 16) { xmm_t0 = _mm_load_si128((__m128i *)src); fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); @@ -339,13 +338,13 @@ Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, cons xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + len -= 16; if (len == 0) goto done; dst += 16; memcpy(&xmm_crc_part, (__m128i *)src + 1, len); } else { - len += 64; if (len == 0) goto done; memcpy(&xmm_crc_part, src, len); @@ -358,11 +357,11 @@ partial: partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); done: /* CRC_SAVE */ - _mm_storeu_si128((__m128i *)crc0 + 0, xmm_crc0); - _mm_storeu_si128((__m128i *)crc0 + 1, xmm_crc1); - _mm_storeu_si128((__m128i *)crc0 + 2, xmm_crc2); - _mm_storeu_si128((__m128i *)crc0 + 3, xmm_crc3); - _mm_storeu_si128((__m128i *)crc0 + 4, xmm_crc_part); + _mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0); + _mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1); + _mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2); + _mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3); + _mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part); } static const unsigned ALIGNED_(16) crc_k[] = { @@ -382,18 +381,17 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) { +Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) { const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); - uint32_t crc; __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold; /* CRC_LOAD */ - __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0); - __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1); - __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2); - __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3); + __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0); + __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1); + __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2); + __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3); /* * k1 @@ -447,8 +445,9 @@ uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) { xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1); - crc = (uint32_t)_mm_extract_epi32(xmm_crc3, 2); - return ~crc; + crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2)); + + return crc->value; } #endif diff --git a/arch/x86/crc_folding.h b/arch/x86/crc_folding.h deleted file mode 100644 index 3af7f079a..000000000 --- a/arch/x86/crc_folding.h +++ /dev/null @@ -1,19 +0,0 @@ -/* crc_folding.h - * - * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ - * instruction. - * - * Copyright (C) 2013 Intel Corporation Jim Kukunas - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef CRC_FOLDING_H_ -#define CRC_FOLDING_H_ - -#include "../../zutil.h" - -Z_INTERNAL void crc_fold_init(unsigned int crc0[4 * 5]); -Z_INTERNAL uint32_t crc_fold_512to32(unsigned int crc0[4 * 5]); -Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *, const unsigned char *, long); - -#endif diff --git a/configure b/configure index 596ce6046..765ea4435 100755 --- a/configure +++ b/configure @@ -1353,8 +1353,8 @@ case "${ARCH}" in if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC" SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc_folding.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo" fi fi ;; diff --git a/crc32.c b/crc32.c index 3cb066ce5..c519d874a 100644 --- a/crc32.c +++ b/crc32.c @@ -168,34 +168,3 @@ Z_INTERNAL uint32_t crc32_big(uint32_t crc, const unsigned char *buf, uint64_t l return ZSWAP32(c); } #endif /* BYTE_ORDER == BIG_ENDIAN */ - -#ifdef X86_PCLMULQDQ_CRC -#include "arch/x86/x86.h" -#include "arch/x86/crc_folding.h" - -Z_INTERNAL void crc_finalize(deflate_state *const s) { - if (x86_cpu_has_pclmulqdq) - s->strm->adler = crc_fold_512to32(s->crc0); -} -#endif - -Z_INTERNAL void crc_reset(deflate_state *const s) { -#ifdef X86_PCLMULQDQ_CRC - x86_check_features(); - if (x86_cpu_has_pclmulqdq) { - crc_fold_init(s->crc0); - } -#endif - s->strm->adler = CRC32_INITIAL_VALUE; -} - -Z_INTERNAL void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size) { -#ifdef X86_PCLMULQDQ_CRC - if (x86_cpu_has_pclmulqdq) { - crc_fold_copy(strm->state->crc0, dst, strm->next_in, size); - return; - } -#endif - memcpy(dst, strm->next_in, size); - strm->adler = PREFIX(crc32)(strm->adler, dst, size); -} diff --git a/crc32_fold.c b/crc32_fold.c new file mode 100644 index 000000000..7771c369c --- /dev/null +++ b/crc32_fold.c @@ -0,0 +1,23 @@ +/* crc32_fold.c -- crc32 folding interface + * Copyright (C) 2021 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#include "zbuild.h" +#include "zutil.h" +#include "functable.h" + +#include "crc32_fold.h" + +Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc) { + crc->value = CRC32_INITIAL_VALUE; + return crc->value; +} + +Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + crc->value = functable.crc32(crc->value, src, len); + memcpy(dst, src, len); +} + +Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) { + return crc->value; +} diff --git a/crc32_fold.h b/crc32_fold.h new file mode 100644 index 000000000..ec6d13d15 --- /dev/null +++ b/crc32_fold.h @@ -0,0 +1,17 @@ +/* crc32_fold.h -- crc32 folding interface + * Copyright (C) 2021 Nathan Moinvaziri + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#ifndef CRC32_FOLD_H_ +#define CRC32_FOLD_H_ + +typedef struct crc32_fold_s { + uint32_t ALIGNED_(16) fold[4 * 5]; + uint32_t value; +} crc32_fold; + +Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc); +Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc); + +#endif diff --git a/deflate.c b/deflate.c index c1ee6d410..79fa85dbc 100644 --- a/deflate.c +++ b/deflate.c @@ -114,12 +114,6 @@ static void lm_set_level (deflate_state *s, int level); static void lm_init (deflate_state *s); Z_INTERNAL unsigned read_buf (PREFIX3(stream) *strm, unsigned char *buf, unsigned size); -extern void crc_reset(deflate_state *const s); -#ifdef X86_PCLMULQDQ_CRC -extern void crc_finalize(deflate_state *const s); -#endif -extern void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size); - extern uint32_t update_hash_roll (deflate_state *const s, uint32_t h, uint32_t val); extern void insert_string_roll (deflate_state *const s, uint32_t str, uint32_t count); extern Pos quick_insert_string_roll(deflate_state *const s, uint32_t str); @@ -454,7 +448,7 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) { #ifdef GZIP if (s->wrap == 2) - crc_reset(s); + strm->adler = functable.crc32_fold_reset(&s->crc_fold); else #endif strm->adler = ADLER32_INITIAL_VALUE; @@ -780,7 +774,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { #ifdef GZIP if (s->status == GZIP_STATE) { /* gzip header */ - crc_reset(s); + functable.crc32_fold_reset(&s->crc_fold); put_byte(s, 31); put_byte(s, 139); put_byte(s, 8); @@ -897,7 +891,7 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { } } put_short(s, (uint16_t)strm->adler); - crc_reset(s); + functable.crc32_fold_reset(&s->crc_fold); } s->status = BUSY_STATE; @@ -968,9 +962,8 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { /* Write the trailer */ #ifdef GZIP if (s->wrap == 2) { -# ifdef X86_PCLMULQDQ_CRC - crc_finalize(s); -# endif + strm->adler = functable.crc32_fold_final(&s->crc_fold); + put_uint32(s, strm->adler); put_uint32(s, (uint32_t)strm->total_in); } else @@ -1082,7 +1075,7 @@ Z_INTERNAL unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned memcpy(buf, strm->next_in, len); #ifdef GZIP } else if (strm->state->wrap == 2) { - copy_with_crc(strm, buf, len); + functable.crc32_fold_copy(&strm->state->crc_fold, buf, strm->next_in, len); #endif } else { memcpy(buf, strm->next_in, len); diff --git a/deflate.h b/deflate.h index 8c443d0c1..94ff239ce 100644 --- a/deflate.h +++ b/deflate.h @@ -12,6 +12,7 @@ #include "zutil.h" #include "zendian.h" +#include "crc32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer creation by deflate(). NO_GZIP would be used to avoid linking in @@ -210,10 +211,7 @@ struct internal_state { int nice_match; /* Stop searching when current match exceeds this */ -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) - /* Only used if X86_PCLMULQDQ_CRC is defined */ - unsigned crc0[4 * 5]; -#endif + crc32_fold ALIGNED_(16) crc_fold; /* used by trees.c: */ /* Didn't use ct_data typedef below to suppress compiler warning */ diff --git a/functable.c b/functable.c index 36eec1023..d8b561b5a 100644 --- a/functable.c +++ b/functable.c @@ -73,6 +73,17 @@ extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t le extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len); #endif +/* CRC32 folding */ +extern uint32_t crc32_fold_reset_c(crc32_fold *crc); +extern void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t crc32_fold_final_c(crc32_fold *crc); + +#ifdef X86_PCLMULQDQ_CRC +extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc); +extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc); +#endif + /* memory chunking */ extern uint32_t chunksize_c(void); extern uint8_t* chunkcopy_c(uint8_t *out, uint8_t const *from, unsigned len); @@ -304,6 +315,36 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ return functable.adler32(adler, buf, len); } +Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) { + functable.crc32_fold_reset = crc32_fold_reset_c; + cpu_check_features(); +#ifdef X86_PCLMULQDQ_CRC + if (x86_cpu_has_pclmulqdq) + functable.crc32_fold_reset = crc32_fold_reset_pclmulqdq; +#endif + return functable.crc32_fold_reset(crc); +} + +Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { + functable.crc32_fold_copy = crc32_fold_copy_c; + cpu_check_features(); +#ifdef X86_PCLMULQDQ_CRC + if (x86_cpu_has_pclmulqdq) + functable.crc32_fold_copy = crc32_fold_copy_pclmulqdq; +#endif + functable.crc32_fold_copy(crc, dst, src, len); +} + +Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) { + functable.crc32_fold_final = crc32_fold_final_c; + cpu_check_features(); +#ifdef X86_PCLMULQDQ_CRC + if (x86_cpu_has_pclmulqdq) + functable.crc32_fold_final = crc32_fold_final_pclmulqdq; +#endif + return functable.crc32_fold_final(crc); +} + Z_INTERNAL uint32_t chunksize_stub(void) { // Initialize default functable.chunksize = &chunksize_c; @@ -579,6 +620,9 @@ Z_INTERNAL Z_TLS struct functable_s functable = { quick_insert_string_stub, adler32_stub, crc32_stub, + crc32_fold_reset_stub, + crc32_fold_copy_stub, + crc32_fold_final_stub, slide_hash_stub, compare258_stub, longest_match_stub, diff --git a/functable.h b/functable.h index f4b17569a..36039fcb2 100644 --- a/functable.h +++ b/functable.h @@ -7,6 +7,7 @@ #define FUNCTABLE_H_ #include "deflate.h" +#include "crc32_fold.h" struct functable_s { uint32_t (* update_hash) (deflate_state *const s, uint32_t h, uint32_t val); @@ -14,6 +15,9 @@ struct functable_s { Pos (* quick_insert_string)(deflate_state *const s, uint32_t str); uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len); uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len); + uint32_t (* crc32_fold_reset) (crc32_fold *crc); + void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); + uint32_t (* crc32_fold_final) (crc32_fold *crc); void (* slide_hash) (deflate_state *s); uint32_t (* compare258) (const unsigned char *src0, const unsigned char *src1); uint32_t (* longest_match) (deflate_state *const s, Pos cur_match); diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index 8746e5cbf..418558b13 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -49,6 +49,7 @@ OBJS = \ compress.obj \ crc32.obj \ crc32_comb.obj \ + crc32_fold.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -168,6 +169,7 @@ compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h +crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 840304399..c44c4d8c8 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -52,6 +52,7 @@ OBJS = \ compress.obj \ crc32.obj \ crc32_comb.obj \ + crc32_fold.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -180,6 +181,7 @@ uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h +crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 3110f7fe9..eec878f24 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -58,7 +58,8 @@ OBJS = \ compress.obj \ crc32.obj \ crc32_comb.obj \ - crc_folding.obj \ + crc32_fold.obj \ + crc32_fold_pclmulqdq.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -174,6 +175,8 @@ chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR) chunkset_sse.obj: $(SRCDIR)/arch/x86/chunkset_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h crc32_comb.obj: $(SRCDIR)/crc32_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/crc32_comb_tbl.h +crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h +crc32_fold_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq.c $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h diff --git a/zutil.h b/zutil.h index 30bd6394e..0a44f775c 100644 --- a/zutil.h +++ b/zutil.h @@ -76,7 +76,7 @@ extern z_const char * const PREFIX(z_errmsg)[10]; /* indexed by 2-zlib_error */ #define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */ #define ADLER32_INITIAL_VALUE 1 /* initial adler-32 hash value */ -#define CRC32_INITIAL_VALUE 0 /* initial crc-32 hash value */ +#define CRC32_INITIAL_VALUE 0 /* initial crc-32 hash value */ /* target dependencies */