From: Nathan Moinvaziri Date: Sat, 9 May 2020 04:19:43 +0000 (-0400) Subject: Split compare258 into compare256 for longest_match and compare258 for deflate_quick. X-Git-Tag: 1.9.9-b1~296 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d26fd7a4f0218e708f32f14a3a088d0e64e40455;p=thirdparty%2Fzlib-ng.git Split compare258 into compare256 for longest_match and compare258 for deflate_quick. --- diff --git a/arch/x86/compare258_avx.c b/arch/x86/compare258_avx.c index 010c6922b..0399abca8 100644 --- a/arch/x86/compare258_avx.c +++ b/arch/x86/compare258_avx.c @@ -16,10 +16,9 @@ #endif /* UNALIGNED_OK, AVX2 intrinsic comparison */ -static inline int32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { - const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 256; - +static inline int32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { + int32_t len = 0; + do { __m256i ymm_src0, ymm_src1, ymm_cmp; ymm_src0 = _mm256_loadu_si256((__m256i*)src0); @@ -28,10 +27,10 @@ static inline int32_t compare258_unaligned_avx2_static(const unsigned char *src0 int mask = _mm256_movemask_epi8(ymm_cmp); if ((unsigned int)mask != 0xFFFFFFFF) { int match_byte = __builtin_ctz(~mask); /* Invert bits so identical = 0 */ - return (int32_t)(src0 - src0start + match_byte); + return (int32_t)(len + match_byte); } - src0 += 32, src1 += 32; + src0 += 32, src1 += 32, len += 32; ymm_src0 = _mm256_loadu_si256((__m256i*)src0); ymm_src1 = _mm256_loadu_si256((__m256i*)src1); @@ -39,18 +38,26 @@ static inline int32_t compare258_unaligned_avx2_static(const unsigned char *src0 mask = _mm256_movemask_epi8(ymm_cmp); if ((unsigned int)mask != 0xFFFFFFFF) { int match_byte = __builtin_ctz(~mask); - return (int32_t)(src0 - src0start + match_byte); + return (int32_t)(len + match_byte); } - src0 += 32, src1 += 32; - } while (src0 < src0end); + src0 += 32, src1 += 32, len += 32; + } while (len < 256); - if (*(uint16_t *)src0 == *(uint16_t *)src1) - src0 += 2, src1 += 2; - else if (*src0 == *src1) - src0 += 1, src1 += 1; + return len; +} + +static inline int32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + return (*src0 == *src1); + + src0 += 2, src1 += 2; + if (*src0 != *src1) + return 2; + if (src0[1] != src1[1]) + return 3; - return (int32_t)(src0 - src0start); + return compare256_unaligned_avx2_static(src0, src1) + 2; } int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) { @@ -58,6 +65,7 @@ int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char } #define LONGEST_MATCH longest_match_unaligned_avx2 +#define COMPARE256 compare256_unaligned_avx2_static #define COMPARE258 compare258_unaligned_avx2_static #include "match_p.h" diff --git a/arch/x86/compare258_sse.c b/arch/x86/compare258_sse.c index b93b8a41b..3e5e4eef3 100644 --- a/arch/x86/compare258_sse.c +++ b/arch/x86/compare258_sse.c @@ -26,10 +26,8 @@ #endif /* UNALIGNED_OK, SSE4.2 intrinsic comparison */ -static inline int32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { -#ifdef _MSC_VER - const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 256; +static inline int32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { + int32_t len = 0; do { #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY @@ -40,76 +38,34 @@ static inline int32_t compare258_unaligned_sse4_static(const unsigned char *src0 xmm_src1 = _mm_loadu_si128((__m128i *)src1); ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - return (int32_t)(src0 - src0start + ret); + return (int32_t)(len + ret); } - src0 += 16, src1 += 16; + src0 += 16, src1 += 16, len += 16; xmm_src0 = _mm_loadu_si128((__m128i *)src0); xmm_src1 = _mm_loadu_si128((__m128i *)src1); ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - return (int32_t)(src0 - src0start + ret); + return (int32_t)(len + ret); } - src0 += 16, src1 += 16; - } while (src0 < src0end); - - if (*(uint16_t *)src0 == *(uint16_t *)src1) - src0 += 2, src1 += 2; - else if (*src0 == *src1) - src0 += 1, src1 += 1; - - return (int32_t)(src0 - src0start); -#else - uintptr_t ax, dx, cx; - __m128i xmm_src0; - - ax = 16; - dx = 16; - /* Set cx to something, otherwise gcc thinks it's used - uninitalised */ - cx = 0; - - __asm__ __volatile__ ( - "1:" - "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" - "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" - "jc 2f\n\t" - "add $16, %[ax]\n\t" - - "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" - "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" - "jc 2f\n\t" - "add $16, %[ax]\n\t" - - "cmp $256 + 16, %[ax]\n\t" - "jb 1b\n\t" - -# if !defined(__x86_64__) - "movzwl -16(%[src0], %[ax]), %[dx]\n\t" -# else - "movzwq -16(%[src0], %[ax]), %[dx]\n\t" -# endif - "xorw -16(%[src1], %[ax]), %%dx\n\t" - "jnz 3f\n\t" - - "add $2, %[ax]\n\t" - "jmp 4f\n\t" - "3:\n\t" - "rep; bsf %[dx], %[cx]\n\t" - "shr $3, %[cx]\n\t" - "2:" - "add %[cx], %[ax]\n\t" - "4:" - : [ax] "+a" (ax), - [cx] "+c" (cx), - [dx] "+d" (dx), - [xmm_src0] "=x" (xmm_src0) - : [src0] "r" (src0), - [src1] "r" (src1) - : "cc" - ); - return (int32_t)(ax - 16); -#endif + src0 += 16, src1 += 16, len += 16; + } while (len < 256); + + return len; +} + +static inline int32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + return (*src0 == *src1); + + src0 += 2, src1 += 2; + + if (*src0 != *src1) + return 2; + if (src0[1] != src1[1]) + return 3; + + return compare256_unaligned_sse4_static(src0, src1) + 2; } int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) { @@ -117,6 +73,7 @@ int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char } #define LONGEST_MATCH longest_match_unaligned_sse4 +#define COMPARE256 compare256_unaligned_sse4_static #define COMPARE258 compare258_unaligned_sse4_static #include "match_p.h" diff --git a/compare258.c b/compare258.c index d32e1deff..db212d138 100644 --- a/compare258.c +++ b/compare258.c @@ -9,9 +9,9 @@ #include "fallback_builtins.h" /* ALIGNED, byte comparison */ -static inline int32_t compare258_c_static(const unsigned char *src0, const unsigned char *src1) { +static inline int32_t compare256_c_static(const unsigned char *src0, const unsigned char *src1) { const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 258; + const unsigned char *src0end = src0 + 256; do { if (*src0 != *src1) @@ -26,31 +26,39 @@ static inline int32_t compare258_c_static(const unsigned char *src0, const unsig if (*src0 != *src1) break; src0 += 1, src1 += 1; - if (*src0 != *src1) - break; - src0 += 1, src1 += 1; - if (*src0 != *src1) - break; - src0 += 1, src1 += 1; } while (src0 < src0end); - + return (int32_t)(src0 - src0start); } +static inline int32_t compare258_c_static(const unsigned char *src0, const unsigned char *src1) { + if (*src0 != *src1) + return 0; + src0 += 1, src1 += 1; + if (*src0 != *src1) + return 1; + src0 += 1, src1 += 1; + if (*src0 != *src1) + return 2; + + return compare256_c_static(src0, src1) + 2; +} + int32_t compare258_c(const unsigned char *src0, const unsigned char *src1) { return compare258_c_static(src0, src1); } #define LONGEST_MATCH longest_match_c +#define COMPARE256 compare256_c_static #define COMPARE258 compare258_c_static #include "match_p.h" #ifdef UNALIGNED_OK /* UNALIGNED_OK, 16-bit integer comparison */ -static inline int32_t compare258_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) { +static inline int32_t compare256_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) { const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 258; + const unsigned char *src0end = src0 + 256; do { if (*(uint16_t *)src0 != *(uint16_t *)src1) @@ -62,12 +70,19 @@ static inline int32_t compare258_unaligned_16_static(const unsigned char *src0, if (*(uint16_t *)src0 != *(uint16_t *)src1) break; src0 += 2, src1 += 2; + if (*(uint16_t *)src0 != *(uint16_t *)src1) + break; + src0 += 2, src1 += 2; } while (src0 < src0end); - if (*src0 == *src1) - src0 += 1; + return (int32_t)(src0 - src0start) + (src0 < src0end && *src0 == *src1); +} - return (int32_t)(src0 - src0start); +static inline int32_t compare258_unaligned_16_static(const unsigned char *src0, const unsigned char *src1) { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + return (*src0 == *src1); + + return compare256_unaligned_16_static(src0+2, src1+2) + 2; } int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1) { @@ -75,15 +90,15 @@ int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char * } #define LONGEST_MATCH longest_match_unaligned_16 +#define COMPARE256 compare256_unaligned_16_static #define COMPARE258 compare258_unaligned_16_static #include "match_p.h" #ifdef HAVE_BUILTIN_CTZ /* UNALIGNED_OK, 32-bit integer comparison */ -static inline int32_t compare258_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) { - const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 256; +static inline int32_t compare256_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) { + int32_t len = 0; do { uint32_t sv = *(uint32_t *)src0; @@ -92,18 +107,26 @@ static inline int32_t compare258_unaligned_32_static(const unsigned char *src0, if (xor) { uint32_t match_byte = __builtin_ctz(xor) / 8; - return (int32_t)(src0 - src0start + match_byte); + return (int32_t)(len + match_byte); } - src0 += 4, src1 += 4; - } while (src0 < src0end); + src0 += 4, src1 += 4, len += 4; + } while (len < 256); - if (*(uint16_t *)src0 == *(uint16_t *)src1) - src0 += 2, src1 += 2; - else if (*src0 == *src1) - src0 += 1, src1 += 1; + return len; +} - return (int32_t)(src0 - src0start); +static inline int32_t compare258_unaligned_32_static(const unsigned char *src0, const unsigned char *src1) { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + return (*src0 == *src1); + + src0 += 2, src1 += 2; + if (*src0 != *src1) + return 2; + if (src0[1] != src1[1]) + return 3; + + return compare256_unaligned_32_static(src0, src1) + 2; } int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1) { @@ -111,6 +134,7 @@ int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char * } #define LONGEST_MATCH longest_match_unaligned_32 +#define COMPARE256 compare256_unaligned_32_static #define COMPARE258 compare258_unaligned_32_static #include "match_p.h" @@ -119,9 +143,8 @@ int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char * #ifdef HAVE_BUILTIN_CTZLL /* UNALIGNED_OK, 64-bit integer comparison */ -static inline int32_t compare258_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) { - const unsigned char *src0start = src0; - const unsigned char *src0end = src0 + 256; +static inline int32_t compare256_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) { + int32_t len = 0; do { uint64_t sv = *(uint64_t *)src0; @@ -130,18 +153,26 @@ static inline int32_t compare258_unaligned_64_static(const unsigned char *src0, if (xor) { uint64_t match_byte = __builtin_ctzll(xor) / 8; - return (int32_t)(src0 - src0start + match_byte); + return (int32_t)(len + match_byte); } - src0 += 8, src1 += 8; - } while (src0 < src0end); + src0 += 8, src1 += 8, len += 8; + } while (len < 256); - if (*(uint16_t *)src0 == *(uint16_t *)src1) - src0 += 2, src1 += 2; - else if (*src0 == *src1) - src0 += 1, src1 += 1; + return len; +} - return (int32_t)(src0 - src0start); +static inline int32_t compare258_unaligned_64_static(const unsigned char *src0, const unsigned char *src1) { + if (*(uint16_t *)src0 != *(uint16_t *)src1) + return (*src0 == *src1); + + src0 += 2, src1 += 2; + if (*src0 != *src1) + return 2; + if (src0[1] != src1[1]) + return 3; + + return compare256_unaligned_64_static(src0, src1) + 2; } int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1) { @@ -149,6 +180,7 @@ int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char * } #define LONGEST_MATCH longest_match_unaligned_64 +#define COMPARE256 compare256_unaligned_64_static #define COMPARE258 compare258_unaligned_64_static #include "match_p.h" diff --git a/match_p.h b/match_p.h index 09726be44..93aa0d475 100644 --- a/match_p.h +++ b/match_p.h @@ -104,7 +104,12 @@ int32_t LONGEST_MATCH(deflate_state *const s, IPos cur_match) { if (!cont) break; +#if MIN_MATCH >= 2 && defined(UNALIGNED_OK) + len = COMPARE256(scan+2, match+2) + 2; +#else len = COMPARE258(scan, match); +#endif + Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); if (len > best_len) { @@ -128,3 +133,7 @@ int32_t LONGEST_MATCH(deflate_state *const s, IPos cur_match) { return best_len; return s->lookahead; } + +#undef LONGEST_MATCH +#undef COMPARE256 +#undef COMPARE258