#include "deflate.h"
#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
-# define std2_longest_match
+# if defined(HAVE_BUILTIN_CTZL)
+# define std3_longest_match
+# else
+# define std2_longest_match
+# endif
#else
# define std1_longest_match
#endif
Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
"need lookahead");
do {
- unsigned char *match;
+ unsigned char *match;
Assert(cur_match < s->strstart, "no future");
match = s->window + cur_match;
return s->lookahead;
}
#endif
+
+#ifdef std3_longest_match
+
+/* longest_match() with minor change to improve performance (in terms of
+ * execution time).
+ *
+ * The pristine longest_match() function is sketched bellow (strip the
+ * then-clause of the "#ifdef UNALIGNED_OK"-directive)
+ *
+ * ------------------------------------------------------------
+ * uInt longest_match(...) {
+ * ...
+ * do {
+ * match = s->window + cur_match; //s0
+ * if (*(ushf*)(match+best_len-1) != scan_end || //s1
+ * *(ushf*)match != scan_start) continue; //s2
+ * ...
+ *
+ * do {
+ * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
+ * scan < strend); //s3
+ *
+ * ...
+ * } while(cond); //s4
+ *
+ * -------------------------------------------------------------
+ *
+ * The change include:
+ *
+ * 1) The hottest statements of the function is: s0, s1 and s4. Pull them
+ * together to form a new loop. The benefit is two-fold:
+ *
+ * o. Ease the compiler to yield good code layout: the conditional-branch
+ * corresponding to s1 and its biased target s4 become very close (likely,
+ * fit in the same cache-line), hence improving instruction-fetching
+ * efficiency.
+ *
+ * o. Ease the compiler to promote "s->window" into register. "s->window"
+ * is loop-invariant; it is supposed to be promoted into register and keep
+ * the value throughout the entire loop. However, there are many such
+ * loop-invariant, and x86-family has small register file; "s->window" is
+ * likely to be chosen as register-allocation victim such that its value
+ * is reloaded from memory in every single iteration. By forming a new loop,
+ * "s->window" is loop-invariant of that newly created tight loop. It is
+ * lot easier for compiler to promote this quantity to register and keep
+ * its value throughout the entire small loop.
+ *
+ * 2) Transfrom s3 such that it examines sizeof(long)-byte-match at a time.
+ * This is done by:
+ * ------------------------------------------------
+ * v1 = load from "scan" by sizeof(long) bytes
+ * v2 = load from "match" by sizeof(lnog) bytes
+ * v3 = v1 xor v2
+ * match-bit = little-endian-machine(yes-for-x86) ?
+ * count-trailing-zero(v3) :
+ * count-leading-zero(v3);
+ *
+ * match-byte = match-bit/8
+ *
+ * "scan" and "match" advance if necessary
+ * -------------------------------------------------
+ */
+
+ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match)
+{
+ unsigned chain_length = s->max_chain_length;/* max hash chain length */
+ register Byte *scan = s->window + s->strstart; /* current string */
+ register Byte *match; /* matched string */
+ register int len; /* length of current match */
+ int best_len = s->prev_length; /* best match length so far */
+ int nice_match = s->nice_match; /* stop if match long enough */
+ IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
+ s->strstart - (IPos)MAX_DIST(s) : NIL;
+ /* Stop when cur_match becomes <= limit. To simplify the code,
+ * we prevent matches with the string of window index 0.
+ */
+ Pos *prev = s->prev;
+ uInt wmask = s->w_mask;
+
+ register Byte *strend = s->window + s->strstart + MAX_MATCH;
+ register unsigned short scan_start = *(unsigned short*)scan;
+ register unsigned short scan_end = *(unsigned short*)(scan+best_len-1);
+
+ /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
+ * It is easy to get rid of this optimization if necessary.
+ */
+ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
+
+ /* Do not waste too much time if we already have a good match: */
+ if (s->prev_length >= s->good_match) {
+ chain_length >>= 2;
+ }
+ /* Do not look for matches beyond the end of the input. This is necessary
+ * to make deflate deterministic.
+ */
+ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+ Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
+
+ do {
+ Assert(cur_match < s->strstart, "no future");
+
+ /* Skip to next match if the match length cannot increase
+ * or if the match length is less than 2. Note that the checks below
+ * for insufficient lookahead only occur occasionally for performance
+ * reasons. Therefore uninitialized memory will be accessed, and
+ * conditional jumps will be made that depend on those values.
+ * However the length of the match is limited to the lookahead, so
+ * the output of deflate is not affected by the uninitialized values.
+ */
+ Byte *win = s->window;
+ int cont = 1;
+ do {
+ match = win + cur_match;
+ if (likely(*(unsigned short*)(match+best_len-1) != scan_end)) {
+ if ((cur_match = prev[cur_match & wmask]) > limit
+ && --chain_length != 0) {
+ continue;
+ } else
+ cont = 0;
+ }
+ break;
+ } while (1);
+
+ if (!cont)
+ break;
+
+ if (*(unsigned short*)match != scan_start)
+ continue;
+
+ /* It is not necessary to compare scan[2] and match[2] since they are
+ * always equal when the other bytes match, given that the hash keys
+ * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
+ * strstart+3, +5, ... up to strstart+257. We check for insufficient
+ * lookahead only every 4th comparison; the 128th check will be made
+ * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
+ * necessary to put more guard bytes at the end of the window, or
+ * to check more often for insufficient lookahead.
+ */
+ scan += 2, match+=2;
+ Assert(*scan == *match, "match[2]?");
+ do {
+ unsigned long sv = *(unsigned long*)(void*)scan;
+ unsigned long mv = *(unsigned long*)(void*)match;
+ unsigned long xor = sv ^ mv;
+ if (xor) {
+ int match_byte = __builtin_ctzl(xor) / 8;
+ scan += match_byte;
+ match += match_byte;
+ break;
+ } else {
+ scan += sizeof(unsigned long);
+ match += sizeof(unsigned long);
+ }
+ } while (scan < strend);
+
+ if (scan > strend)
+ scan = strend;
+
+ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
+
+ len = MAX_MATCH - (int)(strend - scan);
+ scan = strend - MAX_MATCH;
+
+ if (len > best_len) {
+ s->match_start = cur_match;
+ best_len = len;
+ if (len >= nice_match) break;
+ scan_end = *(unsigned short*)(scan+best_len-1);
+ } else {
+ /*
+ * The probability of finding a match later if we here
+ * is pretty low, so for performance it's best to
+ * outright stop here for the lower compression levels
+ */
+ if (s->level < 6)
+ break;
+ }
+ } while ((cur_match = prev[cur_match & wmask]) > limit
+ && --chain_length != 0);
+
+ if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
+ return s->lookahead;
+}
+#endif