From: Nathan Moinvaziri Date: Thu, 7 May 2020 15:23:00 +0000 (-0700) Subject: Modified deflate_quick to use compare258_unaligned_sse4. X-Git-Tag: 1.9.9-b1~303 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=62e3baaeb5e80bab6d0b4e1ca2fc3946f7847568;p=thirdparty%2Fzlib-ng.git Modified deflate_quick to use compare258_unaligned_sse4. --- diff --git a/arch/x86/deflate_quick.c b/arch/x86/deflate_quick.c index d4056bed0..23995387d 100644 --- a/arch/x86/deflate_quick.c +++ b/arch/x86/deflate_quick.c @@ -31,94 +31,7 @@ extern const ct_data static_ltree[L_CODES+2]; extern const ct_data static_dtree[D_CODES]; -static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) { -#ifdef _MSC_VER - long cnt; - - cnt = 0; - do { -#define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY - - int ret; - __m128i xmm_src0, xmm_src1; - - xmm_src0 = _mm_loadu_si128((__m128i *)(src0 + cnt)); - xmm_src1 = _mm_loadu_si128((__m128i *)(src1 + cnt)); - ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); - if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - cnt += ret; - break; - } - cnt += 16; - - xmm_src0 = _mm_loadu_si128((__m128i *)(src0 + cnt)); - xmm_src1 = _mm_loadu_si128((__m128i *)(src1 + cnt)); - ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); - if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - cnt += ret; - break; - } - cnt += 16; - } while (cnt < 256); - - if (memcmp(src0 + cnt, src1 + cnt, sizeof(uint16_t)) == 0) { - cnt += 2; - } else if (*(src0 + cnt) == *(src1 + cnt)) { - cnt++; - } - return cnt; -#else - uintptr_t ax, dx, cx; - __m128i xmm_src0; - - ax = 16; - dx = 16; - /* set cx to something, otherwise gcc thinks it's used - uninitalised */ - cx = 0; - - __asm__ __volatile__ ( - "1:" - "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" - "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" - "jc 2f\n\t" - "add $16, %[ax]\n\t" - - "movdqu -16(%[src0], %[ax]), %[xmm_src0]\n\t" - "pcmpestri $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t" - "jc 2f\n\t" - "add $16, %[ax]\n\t" - - "cmp $256 + 16, %[ax]\n\t" - "jb 1b\n\t" - -# if !defined(__x86_64__) - "movzwl -16(%[src0], %[ax]), %[dx]\n\t" -# else - "movzwq -16(%[src0], %[ax]), %[dx]\n\t" -# endif - "xorw -16(%[src1], %[ax]), %%dx\n\t" - "jnz 3f\n\t" - - "add $2, %[ax]\n\t" - "jmp 4f\n\t" - "3:\n\t" - "rep; bsf %[dx], %[cx]\n\t" - "shr $3, %[cx]\n\t" - "2:" - "add %[cx], %[ax]\n\t" - "4:" - : [ax] "+a" (ax), - [cx] "+c" (cx), - [dx] "+d" (dx), - [xmm_src0] "=x" (xmm_src0) - : [src0] "r" (src0), - [src1] "r" (src1) - : "cc" - ); - return ax - 16; -#endif -} +extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { IPos hash_head; @@ -154,7 +67,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) { dist = s->strstart - hash_head; if (dist > 0 && (dist-1) < (s->w_size - 1)) { - match_len = compare258(s->window + s->strstart, s->window + hash_head); + match_len = compare258_unaligned_sse4(s->window + s->strstart, s->window + hash_head); if (match_len >= MIN_MATCH) { if (match_len > s->lookahead)