From: Hans Kristian Rosbach <hk-git@circlestorm.org>
Date: Thu, 6 Nov 2014 19:59:54 +0000 (+0100)
Subject: Merge longest_match implementation from CloudFlare, authored by
X-Git-Tag: 1.9.9-b1~891
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c91d28ec792104094717a07fb53785ffc245fa5d;p=thirdparty%2Fzlib-ng.git

Merge longest_match implementation from CloudFlare, authored by
Shuxin Yang in commit 31043308c3d3edfb487d2c4cbe7290bd5b63c65c at
https://github.com/cloudflare/zlib/commit/31043308c3d3edfb487d2c4cbe7290bd5b63c65c

This has been modified by me to fit into zlib-ng.
---

diff --git a/deflate.h b/deflate.h
index 7f7deb3ae..68dc71146 100644
--- a/deflate.h
+++ b/deflate.h
@@ -23,6 +23,10 @@
 #  define GZIP
 #endif
 
+#define NIL 0
+/* Tail of hash chains */
+
+
 /* ===========================================================================
  * Internal compression state.
  */
diff --git a/match.c b/match.c
index 0843933ac..5198c85b4 100644
--- a/match.c
+++ b/match.c
@@ -13,7 +13,11 @@
 #include "deflate.h"
 
 #if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
-#  define std2_longest_match
+#  if defined(HAVE_BUILTIN_CTZL)
+#    define std3_longest_match
+#  else
+#    define std2_longest_match
+#  endif
 #else
 #  define std1_longest_match
 #endif
@@ -189,7 +193,7 @@ ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match)
 	Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
 		"need lookahead");
 	do {
-        unsigned char *match;
+		unsigned char *match;
 		Assert(cur_match < s->strstart, "no future");
 		match = s->window + cur_match;
 		
@@ -261,3 +265,191 @@ ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match)
 	return s->lookahead;
 }
 #endif
+
+#ifdef std3_longest_match
+
+/* longest_match() with minor changes to improve performance (in terms of
+ * execution time).
+ *
+ * The pristine longest_match() function is sketched below (strip the
+ * then-clause of the "#ifdef UNALIGNED_OK"-directive)
+ *
+ * ------------------------------------------------------------
+ * uInt longest_match(...) {
+ *    ...
+ *    do {
+ *        match = s->window + cur_match;                //s0
+ *        if (*(ushf*)(match+best_len-1) != scan_end || //s1
+ *            *(ushf*)match != scan_start) continue;    //s2
+ *        ...
+ *
+ *        do {
+ *        } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *                 *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ *                 scan < strend); //s3
+ *
+ *        ...
+ *    } while(cond); //s4
+ *
+ * -------------------------------------------------------------
+ *
+ * The changes include:
+ *
+ *  1) The hottest statements of the function are s0, s1 and s4. Pull them
+ *     together to form a new loop. The benefit is two-fold:
+ *
+ *    o. Help the compiler produce a good code layout: the conditional branch
+ *       corresponding to s1 and its biased target s4 end up very close (likely
+ *       within the same cache line), hence improving instruction-fetching
+ *       efficiency.
+ *
+ *    o. Help the compiler promote "s->window" into a register. "s->window"
+ *       is loop-invariant; it is supposed to be promoted into a register and
+ *       keep its value throughout the entire loop. However, there are many such
+ *       loop invariants, and the x86 family has a small register file, so
+ *       "s->window" is likely to be chosen as a register-allocation victim and
+ *       reloaded from memory in every single iteration. By forming a new loop,
+ *       "s->window" becomes an invariant of that newly created tight loop, and
+ *       it is a lot easier for the compiler to promote it to a register and
+ *       keep its value throughout the entire small loop.
+ *
+ * 2) Transform s3 such that it examines sizeof(long) bytes of the match at a
+ *    time. This is done by:
+ *        ------------------------------------------------
+ *        v1 = load from "scan" by sizeof(long) bytes
+ *        v2 = load from "match" by sizeof(long) bytes
+ *        v3 = v1 xor v2
+ *        match-bit = little-endian-machine(yes-for-x86) ?
+ *                     count-trailing-zero(v3) :
+ *                     count-leading-zero(v3);
+ *
+ *        match-byte = match-bit/8
+ *
+ *        "scan" and "match" advance if necessary
+ *       -------------------------------------------------
+ */
+
+ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match)
+{
+    unsigned chain_length = s->max_chain_length;/* max hash chain length */
+    register Byte *scan = s->window + s->strstart; /* current string */
+    register Byte *match;                       /* matched string */
+    register int len;                           /* length of current match */
+    int best_len = s->prev_length;              /* best match length so far */
+    int nice_match = s->nice_match;             /* stop if match long enough */
+    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+        s->strstart - (IPos)MAX_DIST(s) : NIL;
+    /* Stop when cur_match becomes <= limit. To simplify the code,
+     * we prevent matches with the string of window index 0.
+     */
+    Pos *prev = s->prev;                        /* hash chain links */
+    uInt wmask = s->w_mask;                     /* mask applied to chain indices */
+
+    register Byte *strend = s->window + s->strstart + MAX_MATCH;
+    register unsigned short scan_start = *(unsigned short*)scan;
+    register unsigned short scan_end   = *(unsigned short*)(scan+best_len-1);
+
+    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+     * It is easy to get rid of this optimization if necessary.
+     */
+    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+    /* Do not waste too much time if we already have a good match: */
+    if (s->prev_length >= s->good_match) {
+        chain_length >>= 2;
+    }
+    /* Do not look for matches beyond the end of the input. This is necessary
+     * to make deflate deterministic.
+     */
+    if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+    Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+    do {
+        Assert(cur_match < s->strstart, "no future");
+
+        /* Skip to next match if the match length cannot increase
+         * or if the match length is less than 2.  The tight inner loop below
+         * walks the hash chain until a candidate's bytes at best_len-1 equal
+         * scan_end; cont is cleared when the chain ends or chain_length
+         * runs out first.  Lookahead is not checked here, so uninitialized
+         * memory may be read and branched on; the returned length is capped
+         * at s->lookahead, so the output of deflate is not affected.
+         */
+        Byte *win = s->window;
+        int cont = 1; /* stays 1 while a candidate is still available */
+        do {
+            match = win + cur_match;
+            if (likely(*(unsigned short*)(match+best_len-1) != scan_end)) {
+                if ((cur_match = prev[cur_match & wmask]) > limit
+                    && --chain_length != 0) {
+                    continue; /* try the next candidate in the chain */
+                } else
+                    cont = 0; /* chain exhausted: stop the whole search */
+            }
+            break; /* candidate passed the scan_end test, or search is over */
+        } while (1);
+
+        if (!cont)
+            break;
+
+        if (*(unsigned short*)match != scan_start)
+            continue;
+
+        /* The first two bytes already match (checked against scan_start), so
+         * skip them.  scan[2] and match[2] are expected to be equal as well,
+         * given that the hash keys are equal and that HASH_BITS >= 8 (see the
+         * Assert).  Compare sizeof(unsigned long) bytes per iteration; on a
+         * mismatch, __builtin_ctzl(xor) / 8 is the count of matching bytes
+         * before the first difference within the word.
+         * NOTE(review): counting trailing zeros assumes little-endian byte
+         * order; a big-endian target would need count-leading-zeros -- confirm.
+         */
+        scan += 2, match+=2;
+        Assert(*scan == *match, "match[2]?");
+        do {
+            unsigned long sv = *(unsigned long*)(void*)scan;
+            unsigned long mv = *(unsigned long*)(void*)match;
+            unsigned long xor = sv ^ mv;
+            if (xor) {
+                int match_byte = __builtin_ctzl(xor) / 8;
+                scan += match_byte;
+                match += match_byte;
+                break;
+            } else {
+                scan += sizeof(unsigned long);
+                match += sizeof(unsigned long);
+            }
+        } while (scan < strend);
+
+        if (scan > strend)
+            scan = strend; /* clamp so len never exceeds MAX_MATCH */
+
+        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+        len = MAX_MATCH - (int)(strend - scan); /* bytes matched so far */
+        scan = strend - MAX_MATCH; /* rewind scan to the current string */
+
+        if (len > best_len) {
+            s->match_start = cur_match;
+            best_len = len;
+            if (len >= nice_match) break;
+            scan_end = *(unsigned short*)(scan+best_len-1);
+        } else {
+            /*
+             * The probability of finding a better match later, once we
+             * get here, is pretty low, so for performance it's best to
+             * outright stop here for the lower compression levels
+             */
+            if (s->level < 6)
+                break;
+        }
+    } while ((cur_match = prev[cur_match & wmask]) > limit
+             && --chain_length != 0);
+
+    /* Never report a match longer than the available lookahead. */
+    if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+    return s->lookahead;
+}
+#endif