git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Modified deflate_quick to use compare258_unaligned_sse4.
author Nathan Moinvaziri <nathan@nathanm.com>
Thu, 7 May 2020 15:23:00 +0000 (08:23 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 24 May 2020 11:53:25 +0000 (13:53 +0200)
arch/x86/deflate_quick.c

index d4056bed0f00c26497ccc27134369cc698166ca0..23995387d536b10f5be3d43f0fdf214a0d1a5514 100644 (file)
 extern const ct_data static_ltree[L_CODES+2];
 extern const ct_data static_dtree[D_CODES];
 
-static inline long compare258(const unsigned char *const src0, const unsigned char *const src1) {
-#ifdef _MSC_VER
-    long cnt;
-
-    cnt = 0;
-    do {
-#define mode  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
-
-        int ret;
-        __m128i xmm_src0, xmm_src1;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)(src0 + cnt));
-        xmm_src1 = _mm_loadu_si128((__m128i *)(src1 + cnt));
-        ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
-            cnt += ret;
-            break;
-        }
-        cnt += 16;
-
-        xmm_src0 = _mm_loadu_si128((__m128i *)(src0 + cnt));
-        xmm_src1 = _mm_loadu_si128((__m128i *)(src1 + cnt));
-        ret = _mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
-        if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
-            cnt += ret;
-            break;
-        }
-        cnt += 16;
-    } while (cnt < 256);
-
-    if (memcmp(src0 + cnt, src1 + cnt, sizeof(uint16_t)) == 0) {
-        cnt += 2;
-    } else if (*(src0 + cnt) == *(src1 + cnt)) {
-        cnt++;
-    }
-    return cnt;
-#else
-    uintptr_t ax, dx, cx;
-    __m128i xmm_src0;
-
-    ax = 16;
-    dx = 16;
-    /* set cx to something, otherwise gcc thinks it's used
-       uninitalised */
-    cx = 0;
-
-    __asm__ __volatile__ (
-    "1:"
-        "movdqu     -16(%[src0], %[ax]), %[xmm_src0]\n\t"
-        "pcmpestri  $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t"
-        "jc         2f\n\t"
-        "add        $16, %[ax]\n\t"
-
-        "movdqu     -16(%[src0], %[ax]), %[xmm_src0]\n\t"
-        "pcmpestri  $0x18, -16(%[src1], %[ax]), %[xmm_src0]\n\t"
-        "jc         2f\n\t"
-        "add        $16, %[ax]\n\t"
-
-        "cmp        $256 + 16, %[ax]\n\t"
-        "jb         1b\n\t"
-
-#  if !defined(__x86_64__)
-        "movzwl     -16(%[src0], %[ax]), %[dx]\n\t"
-#  else
-        "movzwq     -16(%[src0], %[ax]), %[dx]\n\t"
-#  endif
-        "xorw       -16(%[src1], %[ax]), %%dx\n\t"
-        "jnz        3f\n\t"
-
-        "add        $2, %[ax]\n\t"
-        "jmp        4f\n\t"
-    "3:\n\t"
-        "rep; bsf   %[dx], %[cx]\n\t"
-        "shr        $3, %[cx]\n\t"
-    "2:"
-        "add        %[cx], %[ax]\n\t"
-    "4:"
-    : [ax] "+a" (ax),
-      [cx] "+c" (cx),
-      [dx] "+d" (dx),
-      [xmm_src0] "=x" (xmm_src0)
-    : [src0] "r" (src0),
-      [src1] "r" (src1)
-    : "cc"
-    );
-    return ax - 16;
-#endif
-}
+extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
 
 ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
     IPos hash_head;
@@ -154,7 +67,7 @@ ZLIB_INTERNAL block_state deflate_quick(deflate_state *s, int flush) {
             dist = s->strstart - hash_head;
 
             if (dist > 0 && (dist-1) < (s->w_size - 1)) {
-                match_len = compare258(s->window + s->strstart, s->window + hash_head);
+                match_len = compare258_unaligned_sse4(s->window + s->strstart, s->window + hash_head);
 
                 if (match_len >= MIN_MATCH) {
                     if (match_len > s->lookahead)