From: Hans Kristian Rosbach
Date: Tue, 11 Nov 2025 19:23:24 +0000 (+0100)
Subject: Reorganize Chorba activation.
X-Git-Tag: 2.3.0-rc2~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8003f57828f7310aaa035519bfa17c93b5621977;p=thirdparty%2Fzlib-ng.git

Reorganize Chorba activation.

Now WITHOUT_CHORBA only disables the crc32_chorba C fallback. The SSE2, SSE4.1
and PCLMUL variants can still use their Chorba-algorithm-based code, but the
SSE2 and SSE4.1 variants lose their fallback to the generic crc32_chorba C code,
reducing their performance on very large input buffers (this fallback is not
used during deflate/inflate, only when calling crc32 directly).

Remove the crc32_c function (and its file crc32_c.c); instead, use the normal
functable routing to select between crc32_braid and crc32_chorba.

Disable the SSE2 and SSE4.1 variants of Chorba crc32 on MSVC older than 2022,
due to a code-generation bug in MSVC 2019 that causes segfaults.

Compile either crc32_chorba_small_nondestructive or
crc32_chorba_small_nondestructive_32bit, not both.

Don't compile crc32_chorba_32768_nondestructive on 32-bit architectures.
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbe245a43..df8311338 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -217,6 +217,10 @@ elseif(MSVC)
     if(MSVC_VERSION VERSION_LESS 1800)
         message(SEND_ERROR "Unsupported Visual Studio compiler version (requires 2013 or later).")
     endif()
+    if(MSVC_VERSION VERSION_LESS 1930)
+        message(STATUS "Old Visual Studio compiler version, disabling SSE2/SSE4.1 Chorba variants (requires 2022 or later).")
+        add_definitions(-DWITHOUT_CHORBA_SSE)
+    endif()
     # TODO. ICC can be used through MSVC. I'm not sure if we'd ever see that combination
     # (who'd use cmake from an IDE...) but checking for ICC before checking for MSVC should
     # avoid mistakes.
@@ -1312,7 +1316,6 @@ set(ZLIB_ALL_FALLBACK_SRCS arch/generic/chunkset_c.c arch/generic/compare256_c.c arch/generic/crc32_braid_c.c - arch/generic/crc32_c.c arch/generic/crc32_fold_c.c arch/generic/slide_hash_c.c ) @@ -1326,7 +1329,6 @@ elseif(${ARCH} STREQUAL "x86_64" AND WITH_SSE2) arch/generic/adler32_c.c arch/generic/adler32_fold_c.c arch/generic/crc32_braid_c.c - arch/generic/crc32_c.c arch/generic/crc32_fold_c.c ) diff --git a/Makefile.in b/Makefile.in index fbf1e3f4c..1785e5f1f 100644 --- a/Makefile.in +++ b/Makefile.in @@ -80,7 +80,6 @@ OBJZ = \ arch/generic/chunkset_c.o \ arch/generic/compare256_c.o \ arch/generic/crc32_braid_c.o \ - arch/generic/crc32_c.o \ arch/generic/crc32_fold_c.o \ arch/generic/slide_hash_c.o \ adler32.o \ @@ -122,7 +121,6 @@ PIC_OBJZ = \ arch/generic/chunkset_c.lo \ arch/generic/compare256_c.lo \ arch/generic/crc32_braid_c.lo \ - arch/generic/crc32_c.lo \ arch/generic/crc32_fold_c.lo \ arch/generic/slide_hash_c.lo \ adler32.lo \ diff --git a/arch/generic/Makefile.in b/arch/generic/Makefile.in index 6040083f6..ba20e9e5f 100644 --- a/arch/generic/Makefile.in +++ b/arch/generic/Makefile.in @@ -18,7 +18,6 @@ all: \ chunkset_c.o chunkset_c.lo \ compare256_c.o compare256_c.lo \ crc32_braid_c.o crc32_braid_c.lo \ - crc32_c.o crc32_c.lo \ crc32_chorba_c.o crc32_chorba_c.lo \ crc32_fold_c.o crc32_fold_c.lo \ slide_hash_c.o slide_hash_c.lo @@ -54,12 +53,6 @@ crc32_braid_c.o: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_b crc32_braid_c.lo: $(SRCDIR)/crc32_braid_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_braid_c.c -crc32_c.o: $(SRCDIR)/crc32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_c.c - -crc32_c.lo: $(SRCDIR)/crc32_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_c.c - crc32_chorba_c.o: $(SRCDIR)/crc32_chorba_c.c $(SRCTOP)/zbuild.h $(SRCTOP)/crc32_braid_p.h $(SRCTOP)/crc32_braid_tbl.h $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_chorba_c.c diff --git a/arch/generic/crc32_c.c b/arch/generic/crc32_c.c deleted file mode 100644 index e7394a8c9..000000000 --- a/arch/generic/crc32_c.c +++ /dev/null @@ -1,42 +0,0 @@ -#include "zbuild.h" -#include "crc32.h" -#include "crc32_braid_p.h" -#include "generic_functions.h" - -Z_INTERNAL uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len) { - uint32_t c = (~crc) & 0xffffffff; - -#ifndef WITHOUT_CHORBA - uint64_t* aligned_buf; - size_t aligned_len; - unsigned long algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 0xF)) & 0xF; - if (algn_diff < len) { - if (algn_diff) { - c = crc32_braid_internal(c, buf, algn_diff); - } - aligned_buf = (uint64_t*) (buf + algn_diff); - aligned_len = len - algn_diff; - if(aligned_len > CHORBA_LARGE_THRESHOLD) - c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); -# if OPTIMAL_CMP == 64 - else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) - c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); - else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) - c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); -# else - else if (aligned_len > CHORBA_SMALL_THRESHOLD_32BIT) - c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, aligned_len); -# endif - else - c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); - } - else 
{ - c = crc32_braid_internal(c, buf, len); - } -#else - c = crc32_braid_internal(c, buf, len); -#endif /* WITHOUT_CHORBA */ - - /* Return the CRC, post-conditioned. */ - return c ^ 0xffffffff; -} diff --git a/arch/generic/crc32_chorba_c.c b/arch/generic/crc32_chorba_c.c index 76b050f29..4041abd46 100644 --- a/arch/generic/crc32_chorba_c.c +++ b/arch/generic/crc32_chorba_c.c @@ -495,6 +495,7 @@ Z_INTERNAL uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_wo return crc; } +# if OPTIMAL_CMP == 64 /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 */ Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len) { const uint64_t* input = buf; @@ -1230,6 +1231,8 @@ Z_INTERNAL uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint6 return crc; } +#else // OPTIMAL_CMP == 64 + Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const uint32_t* buf, size_t len) { const uint32_t* input = buf; uint32_t final[20] = {0}; @@ -1442,3 +1445,38 @@ Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const return crc; } +#endif // OPTIMAL_CMP == 64 + +Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) { + uint32_t c = (~crc) & 0xffffffff; + + uint64_t* aligned_buf; + size_t aligned_len; + unsigned long algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 0xF)) & 0xF; + if (algn_diff < len) { + if (algn_diff) { + c = crc32_braid_internal(c, buf, algn_diff); + } + aligned_buf = (uint64_t*) (buf + algn_diff); + aligned_len = len - algn_diff; + if(aligned_len > CHORBA_LARGE_THRESHOLD) + c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); +# if OPTIMAL_CMP == 64 + else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) + c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); + else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) + c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); +# else + else if (aligned_len > CHORBA_SMALL_THRESHOLD_32BIT) + c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, aligned_len); +# endif + else + c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); + } + else { + c = crc32_braid_internal(c, buf, len); + } + + /* Return the CRC, post-conditioned. 
*/ + return c ^ 0xffffffff; +} diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h index 21358f069..cb92bd373 100644 --- a/arch/generic/generic_functions.h +++ b/arch/generic/generic_functions.h @@ -22,11 +22,11 @@ uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned uint32_t compare256_c(const uint8_t *src0, const uint8_t *src1); -uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len); uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len); #ifndef WITHOUT_CHORBA + uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len); uint32_t crc32_chorba_118960_nondestructive (uint32_t crc, const z_word_t* input, size_t len); uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len); uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint64_t* buf, size_t len); @@ -50,7 +50,11 @@ void slide_hash_c(deflate_state *s); # define native_adler32 adler32_c # define native_adler32_fold_copy adler32_fold_copy_c # define native_chunkmemset_safe chunkmemset_safe_c -# define native_crc32 crc32_c +#ifndef WITHOUT_CHORBA +# define native_crc32 crc32_chorba +#else +# define native_crc32 crc32_braid +#endif # define native_crc32_fold crc32_fold_c # define native_crc32_fold_copy crc32_fold_copy_c # define native_crc32_fold_final crc32_fold_final_c diff --git a/arch/riscv/crc32_zbc.c b/arch/riscv/crc32_zbc.c index d5dc71cc9..e3f3c7164 100644 --- a/arch/riscv/crc32_zbc.c +++ b/arch/riscv/crc32_zbc.c @@ -6,13 +6,12 @@ #if defined(RISCV_CRC32_ZBC) #include "zbuild.h" +#include "arch_functions.h" #include #define CLMUL_MIN_LEN 16 // Minimum size of buffer for _crc32_clmul #define CLMUL_CHUNK_LEN 16 // Length of chunk for clmul -extern uint32_t crc32_c(uint32_t crc, const uint8_t *buf, size_t len); - #define CONSTANT_R3 0x1751997d0ULL #define CONSTANT_R4 0x0ccaa009eULL #define CONSTANT_R5 0x163cd6124ULL @@ -84,12 +83,12 @@ finish_fold: Z_INTERNAL uint32_t crc32_riscv64_zbc(uint32_t crc, const uint8_t *buf, size_t len) { if (len < CLMUL_MIN_LEN) { - return crc32_c(crc, buf, len); + return crc32_braid(crc, buf, len); } uint64_t unaligned_length = len % CLMUL_CHUNK_LEN; if (unaligned_length) { - crc = crc32_c(crc, buf, unaligned_length); + crc = crc32_braid(crc, buf, unaligned_length); buf += unaligned_length; len -= unaligned_length; } diff --git a/arch/s390/crc32-vx.c b/arch/s390/crc32-vx.c index d8fcf79cb..155eee494 100644 --- a/arch/s390/crc32-vx.c +++ b/arch/s390/crc32-vx.c @@ -202,12 +202,12 @@ uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t size_t prealign, aligned, remaining; if (len < VX_MIN_LEN + VX_ALIGN_MASK) - return crc32_c(crc, buf, len); + return crc32_braid(crc, buf, len); if ((uintptr_t)buf & VX_ALIGN_MASK) { prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK); len -= prealign; - crc = crc32_c(crc, buf, prealign); + crc = crc32_braid(crc, buf, prealign); buf += prealign; } aligned = len & ~VX_ALIGN_MASK; @@ -216,7 +216,7 @@ uint32_t Z_INTERNAL crc32_s390_vx(uint32_t crc, const unsigned char *buf, size_t crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff; if (remaining) - crc = crc32_c(crc, buf + aligned, remaining); + crc = crc32_braid(crc, buf + aligned, remaining); return crc; } diff --git a/arch/x86/chorba_sse2.c b/arch/x86/chorba_sse2.c index ac98e994c..3e25d7586 100644 --- a/arch/x86/chorba_sse2.c +++ b/arch/x86/chorba_sse2.c @@ -1,4 +1,4 @@ 
-#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) +#if defined(X86_SSE2) && !defined(WITHOUT_CHORBA_SSE) #include "zbuild.h" #include "crc32_braid_p.h" @@ -6,10 +6,7 @@ #include "crc32.h" #include #include "arch/x86/x86_intrins.h" -#include "arch/generic/generic_functions.h" -#include - -uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len); +#include "arch_functions.h" #define READ_NEXT(in, off, a, b) do { \ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ @@ -862,9 +859,12 @@ Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t l } aligned_buf = (uint64_t*) (buf + algn_diff); aligned_len = len - algn_diff; +#if !defined(WITHOUT_CHORBA) if(aligned_len > CHORBA_LARGE_THRESHOLD) { c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); - } else if (aligned_len > 72) { + } else +#endif + if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) { c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len); } else { c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); diff --git a/arch/x86/chorba_sse41.c b/arch/x86/chorba_sse41.c index 53d6e156c..aebede45e 100644 --- a/arch/x86/chorba_sse41.c +++ b/arch/x86/chorba_sse41.c @@ -1,4 +1,4 @@ -#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) #include "zbuild.h" #include "crc32_braid_p.h" @@ -7,11 +7,7 @@ #include #include #include "arch/x86/x86_intrins.h" -#include "arch/generic/generic_functions.h" -#include - -uint32_t crc32_braid_base(uint32_t c, const uint8_t *buf, size_t len); -uint32_t chorba_small_nondestructive_sse2(uint32_t c, const uint64_t *aligned_buf, size_t aligned_len); +#include "arch_functions.h" #define READ_NEXT(in, off, a, b) do { \ a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \ @@ -321,9 +317,12 @@ Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t } aligned_buf = (uint64_t*) (buf + algn_diff); aligned_len = len - algn_diff; +#if !defined(WITHOUT_CHORBA) if(aligned_len > CHORBA_LARGE_THRESHOLD) { c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); - } else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && + } else +#endif + if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) { c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len); } else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) { diff --git a/arch/x86/crc32_fold_pclmulqdq_tpl.h b/arch/x86/crc32_fold_pclmulqdq_tpl.h index 4e5b11bf9..f4c924903 100644 --- a/arch/x86/crc32_fold_pclmulqdq_tpl.h +++ b/arch/x86/crc32_fold_pclmulqdq_tpl.h @@ -105,225 +105,223 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint } #endif -#ifndef WITHOUT_CHORBA - /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 - * We interleave the PCLMUL-base folds with 8x scaled generator - * polynomial copies; we read 8x QWORDS and then XOR them into - * the stream at the following offsets: 6, 9, 10, 16, 20, 22, - * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper - * as "generator_64_bits_unrolled_8" */ - while (len >= 512 + 64 + 16*8) { - __m128i chorba8 = _mm_loadu_si128((__m128i *)src); - __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1); - __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2); - __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3); - __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4); - __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 
5); - __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6); - __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7); + /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398 + * We interleave the PCLMUL-base folds with 8x scaled generator + * polynomial copies; we read 8x QWORDS and then XOR them into + * the stream at the following offsets: 6, 9, 10, 16, 20, 22, + * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper + * as "generator_64_bits_unrolled_8" */ + while (len >= 512 + 64 + 16*8) { + __m128i chorba8 = _mm_loadu_si128((__m128i *)src); + __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1); + __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2); + __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3); + __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4); + __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5); + __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6); + __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, chorba8); - _mm_storeu_si128((__m128i *)dst + 1, chorba7); - _mm_storeu_si128((__m128i *)dst + 2, chorba6); - _mm_storeu_si128((__m128i *)dst + 3, chorba5); - _mm_storeu_si128((__m128i *)dst + 4, chorba4); - _mm_storeu_si128((__m128i *)dst + 5, chorba3); - _mm_storeu_si128((__m128i *)dst + 6, chorba2); - _mm_storeu_si128((__m128i *)dst + 7, chorba1); - dst += 16*8; + _mm_storeu_si128((__m128i *)dst, chorba8); + _mm_storeu_si128((__m128i *)dst + 1, chorba7); + _mm_storeu_si128((__m128i *)dst + 2, chorba6); + _mm_storeu_si128((__m128i *)dst + 3, chorba5); + _mm_storeu_si128((__m128i *)dst + 4, chorba4); + _mm_storeu_si128((__m128i *)dst + 5, chorba3); + _mm_storeu_si128((__m128i *)dst + 6, chorba2); + _mm_storeu_si128((__m128i *)dst + 7, chorba1); + dst += 16*8; #else - XOR_INITIAL128(chorba8); + XOR_INITIAL128(chorba8); #endif - chorba2 = _mm_xor_si128(chorba2, chorba8); - chorba1 = _mm_xor_si128(chorba1, chorba7); - src += 16*8; - len -= 16*8; + chorba2 = _mm_xor_si128(chorba2, chorba8); + chorba1 = _mm_xor_si128(chorba1, chorba7); + src += 16*8; + len -= 16*8; - xmm_t0 = _mm_loadu_si128((__m128i *)src); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 1); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 2); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 3); + xmm_t0 = _mm_loadu_si128((__m128i *)src); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 1); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 2); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 3); - fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(xmm_t0, chorba6); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 4); - xmm_t1 = 
_mm_loadu_si128((__m128i *)src + 5); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 6); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 7); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(xmm_t0, chorba6); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 4); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 5); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 6); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 7); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 8); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 9); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 10); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 11); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 8); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 9); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 10); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 11); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7); - xmm_t2 = _mm_xor_si128(xmm_t2, chorba6); - xmm_t3 = _mm_xor_si128(xmm_t3, chorba5); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - 
xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 12); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 13); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 14); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 15); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7); + xmm_t2 = _mm_xor_si128(xmm_t2, chorba6); + xmm_t3 = _mm_xor_si128(xmm_t3, chorba5); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 12); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 13); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 14); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 15); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 16); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 17); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 18); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 19); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 16); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 17); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 18); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 19); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 
64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 20); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 21); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 22); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 23); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 20); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 21); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 22); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 23); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 24); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 25); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 26); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 27); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = 
_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 24); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 25); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 26); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 27); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5); - xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - xmm_t0 = _mm_loadu_si128((__m128i *)src + 28); - xmm_t1 = _mm_loadu_si128((__m128i *)src + 29); - xmm_t2 = _mm_loadu_si128((__m128i *)src + 30); - xmm_t3 = _mm_loadu_si128((__m128i *)src + 31); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5); + xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + xmm_t0 = _mm_loadu_si128((__m128i *)src + 28); + xmm_t1 = _mm_loadu_si128((__m128i *)src + 29); + xmm_t2 = _mm_loadu_si128((__m128i *)src + 30); + xmm_t3 = _mm_loadu_si128((__m128i *)src + 31); + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); #ifdef COPY - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - 
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); - dst += 64; + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; #endif - xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4); - xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3); - xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2); - xmm_t3 = _mm_xor_si128(xmm_t3, chorba1); - xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); - xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); - xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); - xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); - - len -= 512; - src += 512; - } -#endif /* WITHOUT_CHORBA */ + xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4); + xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3); + xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2); + xmm_t3 = _mm_xor_si128(xmm_t3, chorba1); + xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0); + xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1); + xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2); + xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3); + + len -= 512; + src += 512; + } while (len >= 64) { len -= 64; diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h index 933733af2..c6c4c8f8b 100644 --- a/arch/x86/crc32_pclmulqdq_tpl.h +++ b/arch/x86/crc32_pclmulqdq_tpl.h @@ -22,9 +22,6 @@ #include #include #include // _mm_extract_epi32 -#ifdef X86_VPCLMULQDQ -# include -#endif #include "crc32.h" #include "crc32_braid_p.h" @@ -168,7 +165,6 @@ static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m1 *xmm_crc3 = _mm_castps_si128(ps_res3); } -#ifndef WITHOUT_CHORBA static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { const __m128i xmm_fold12 = _mm_set_epi64x(0x596C8D81, 0xF5E48C85); __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3; @@ -210,7 +206,6 @@ static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m *xmm_crc2 = _mm_castps_si128(ps_res2); *xmm_crc3 = _mm_castps_si128(ps_res3); } -#endif static const unsigned ALIGNED_(32) pshufb_shf_table[60] = { 0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */ diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index 918b7e0f6..5d9065e1b 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -24,9 +24,10 @@ uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsign void slide_hash_sse2(deflate_state *s); # endif void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start); -# if !defined(WITHOUT_CHORBA) +# if !defined(WITHOUT_CHORBA_SSE) uint32_t crc32_chorba_sse2(uint32_t crc32, const uint8_t *buf, size_t len); -# endif + uint32_t chorba_small_nondestructive_sse2(uint32_t c, const uint64_t *aligned_buf, size_t aligned_len); +# endif #endif #ifdef X86_SSSE3 @@ -35,10 +36,8 @@ uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsig void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start); #endif -#ifdef X86_SSE41 -# if !defined(WITHOUT_CHORBA) +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) uint32_t crc32_chorba_sse41(uint32_t crc32, const uint8_t *buf, size_t len); -# endif #endif #ifdef X86_SSE42 @@ -105,9 +104,9 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t 
*buf, size_t len); # define native_longest_match longest_match_sse2 # undef native_longest_match_slow # define native_longest_match_slow longest_match_slow_sse2 -# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE) -# undef native_crc32 -# define native_crc32 crc32_chorba_sse2 +# if !defined(WITHOUT_CHORBA_SSE) +# undef native_crc32 +# define native_crc32 crc32_chorba_sse2 # endif # endif # endif @@ -121,7 +120,7 @@ uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); # define native_inflate_fast inflate_fast_ssse3 # endif // X86 - SSE4.1 -# if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && defined(__SSE4_1__) && !defined(NO_CHORBA_SSE) +# if defined(X86_SSE41) && defined(__SSE4_1__) && !defined(WITHOUT_CHORBA_SSE) # undef native_crc32 # define native_crc32 crc32_chorba_sse41 # endif diff --git a/crc32.h b/crc32.h index 4c1eacaea..e26b59e52 100644 --- a/crc32.h +++ b/crc32.h @@ -15,8 +15,6 @@ #define CHORBA_SMALL_THRESHOLD_64BIT 72 #define CHORBA_SMALL_THRESHOLD_32BIT 80 -Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len); - typedef struct crc32_fold_s { uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; uint32_t value; diff --git a/functable.c b/functable.c index f8a122d8d..8924f7351 100644 --- a/functable.c +++ b/functable.c @@ -80,7 +80,7 @@ static int init_functable(void) { // x86_64 always has SSE2, so we can use SSE2 functions as fallbacks where available. ft.adler32 = &adler32_c; ft.adler32_fold_copy = &adler32_fold_copy_c; - ft.crc32 = &crc32_c; + ft.crc32 = &crc32_braid; ft.crc32_fold = &crc32_fold_c; ft.crc32_fold_copy = &crc32_fold_copy_c; ft.crc32_fold_final = &crc32_fold_final_c; @@ -95,7 +95,7 @@ static int init_functable(void) { ft.adler32 = &adler32_c; ft.adler32_fold_copy = &adler32_fold_copy_c; ft.chunkmemset_safe = &chunkmemset_safe_c; - ft.crc32 = &crc32_c; + ft.crc32 = &crc32_braid; ft.crc32_fold = &crc32_fold_c; ft.crc32_fold_copy = &crc32_fold_copy_c; ft.crc32_fold_final = &crc32_fold_final_c; @@ -110,6 +110,11 @@ static int init_functable(void) { // Select arch-optimized functions #ifdef WITH_OPTIM + // Chorba generic C fallback +#ifndef WITHOUT_CHORBA + ft.crc32 = &crc32_chorba; +#endif + // X86 - SSE2 #ifdef X86_SSE2 # if !defined(__x86_64__) && !defined(_M_X64) @@ -117,7 +122,7 @@ static int init_functable(void) { # endif { ft.chunkmemset_safe = &chunkmemset_safe_sse2; -# if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE) +# if !defined(WITHOUT_CHORBA_SSE) ft.crc32 = &crc32_chorba_sse2; # endif ft.inflate_fast = &inflate_fast_sse2; @@ -139,11 +144,9 @@ static int init_functable(void) { #endif // X86 - SSE4.1 -#ifdef X86_SSE41 +#if defined(X86_SSE41) && !defined(WITHOUT_CHORBA_SSE) if (cf.x86.has_sse41) { -#if !defined(WITHOUT_CHORBA) && !defined(NO_CHORBA_SSE) ft.crc32 = &crc32_chorba_sse41; -#endif } #endif diff --git a/test/benchmarks/benchmark_crc32.cc b/test/benchmarks/benchmark_crc32.cc index 1e95b2777..3b00f87d7 100644 --- a/test/benchmarks/benchmark_crc32.cc +++ b/test/benchmarks/benchmark_crc32.cc @@ -56,12 +56,6 @@ public: } \ BENCHMARK_REGISTER_F(crc32, name)->Arg(1)->Arg(8)->Arg(12)->Arg(16)->Arg(32)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(32<<10)->Arg(256<<10)->Arg(4096<<10); -#ifndef WITHOUT_CHORBA -BENCHMARK_CRC32(generic_chorba, crc32_c, 1); -#else -BENCHMARK_CRC32(generic, crc32_c, 1); -#endif - BENCHMARK_CRC32(braid, crc32_braid, 1); #ifdef DISABLE_RUNTIME_CPU_DETECTION @@ -69,14 +63,16 @@ BENCHMARK_CRC32(native, native_crc32, 1); #else #ifndef WITHOUT_CHORBA -# if defined(X86_SSE2) && 
!defined(NO_CHORBA_SSE) +BENCHMARK_CRC32(chorba_c, crc32_chorba, 1); +#endif +#ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 BENCHMARK_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2); -# if defined(X86_SSE41) && !defined(NO_CHORBA_SSE) - BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41); -# endif +# endif +# ifdef X86_SSE41 + BENCHMARK_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41); # endif #endif - #ifdef ARM_CRC32 BENCHMARK_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32); #endif diff --git a/test/test_crc32.cc b/test/test_crc32.cc index d44d079e9..ca0767d46 100644 --- a/test/test_crc32.cc +++ b/test/test_crc32.cc @@ -269,12 +269,6 @@ INSTANTIATE_TEST_SUITE_P(crc32, crc32_variant, testing::ValuesIn(tests)); hash(func); \ } -#ifndef WITHOUT_CHORBA -TEST_CRC32(generic_chorba, crc32_c, 1) -#else -TEST_CRC32(generic, crc32_c, 1) -#endif - TEST_CRC32(braid, crc32_braid, 1) #ifdef DISABLE_RUNTIME_CPU_DETECTION @@ -297,6 +291,9 @@ static const int align_offsets[] = { } #endif +#ifndef WITHOUT_CHORBA +TEST_CRC32(chorba_c, crc32_chorba, 1) +#endif #ifdef ARM_CRC32 INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets)); TEST_CRC32(armv8, crc32_armv8, test_cpu_features.arm.has_crc32) @@ -317,11 +314,13 @@ TEST_CRC32(pclmulqdq, crc32_pclmulqdq, test_cpu_features.x86.has_pclmulqdq) #ifdef X86_VPCLMULQDQ_CRC TEST_CRC32(vpclmulqdq, crc32_vpclmulqdq, (test_cpu_features.x86.has_pclmulqdq && test_cpu_features.x86.has_avx512_common && test_cpu_features.x86.has_vpclmulqdq)) #endif -#if !defined(WITHOUT_CHORBA) && defined(X86_SSE2) && !defined(NO_CHORBA_SSE) -TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2) -#endif -#if !defined(WITHOUT_CHORBA) && defined(X86_SSE41) && !defined(NO_CHORBA_SSE) -TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41) +#ifndef WITHOUT_CHORBA_SSE +# ifdef X86_SSE2 + TEST_CRC32(chorba_sse2, crc32_chorba_sse2, test_cpu_features.x86.has_sse2) +# endif +# ifdef X86_SSE41 + TEST_CRC32(chorba_sse41, crc32_chorba_sse41, test_cpu_features.x86.has_sse41) +# endif #endif #if defined(LOONGARCH_CRC) INSTANTIATE_TEST_SUITE_P(crc32_alignment, crc32_align, testing::ValuesIn(align_offsets));