From: Hans Kristian Rosbach Date: Thu, 6 Nov 2014 19:59:54 +0000 (+0100) Subject: Merge longest_match implementation from CloudFlare, authored by X-Git-Tag: 1.9.9-b1~891 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c91d28ec792104094717a07fb53785ffc245fa5d;p=thirdparty%2Fzlib-ng.git Merge longest_match implementation from CloudFlare, authored by Shuxin Yang in commit 31043308c3d3edfb487d2c4cbe7290bd5b63c65c at https://github.com/cloudflare/zlib/commit/31043308c3d3edfb487d2c4cbe7290bd5b63c65c This has been modified by me to fit into zlib-ng. --- diff --git a/deflate.h b/deflate.h index 7f7deb3ae..68dc71146 100644 --- a/deflate.h +++ b/deflate.h @@ -23,6 +23,10 @@ # define GZIP #endif +#define NIL 0 +/* Tail of hash chains */ + + /* =========================================================================== * Internal compression state. */ diff --git a/match.c b/match.c index 0843933ac..5198c85b4 100644 --- a/match.c +++ b/match.c @@ -13,7 +13,11 @@ #include "deflate.h" #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) -# define std2_longest_match +# if defined(HAVE_BUILTIN_CTZL) +# define std3_longest_match +# else +# define std2_longest_match +# endif #else # define std1_longest_match #endif @@ -189,7 +193,7 @@ ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match) Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); do { - unsigned char *match; + unsigned char *match; Assert(cur_match < s->strstart, "no future"); match = s->window + cur_match; @@ -261,3 +265,191 @@ ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match) return s->lookahead; } #endif + +#ifdef std3_longest_match + +/* longest_match() with minor change to improve performance (in terms of + * execution time). + * + * The pristine longest_match() function is sketched bellow (strip the + * then-clause of the "#ifdef UNALIGNED_OK"-directive) + * + * ------------------------------------------------------------ + * uInt longest_match(...) { + * ... + * do { + * match = s->window + cur_match; //s0 + * if (*(ushf*)(match+best_len-1) != scan_end || //s1 + * *(ushf*)match != scan_start) continue; //s2 + * ... + * + * do { + * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && + * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && + * scan < strend); //s3 + * + * ... + * } while(cond); //s4 + * + * ------------------------------------------------------------- + * + * The change include: + * + * 1) The hottest statements of the function is: s0, s1 and s4. Pull them + * together to form a new loop. The benefit is two-fold: + * + * o. Ease the compiler to yield good code layout: the conditional-branch + * corresponding to s1 and its biased target s4 become very close (likely, + * fit in the same cache-line), hence improving instruction-fetching + * efficiency. + * + * o. Ease the compiler to promote "s->window" into register. "s->window" + * is loop-invariant; it is supposed to be promoted into register and keep + * the value throughout the entire loop. However, there are many such + * loop-invariant, and x86-family has small register file; "s->window" is + * likely to be chosen as register-allocation victim such that its value + * is reloaded from memory in every single iteration. By forming a new loop, + * "s->window" is loop-invariant of that newly created tight loop. It is + * lot easier for compiler to promote this quantity to register and keep + * its value throughout the entire small loop. + * + * 2) Transfrom s3 such that it examines sizeof(long)-byte-match at a time. + * This is done by: + * ------------------------------------------------ + * v1 = load from "scan" by sizeof(long) bytes + * v2 = load from "match" by sizeof(lnog) bytes + * v3 = v1 xor v2 + * match-bit = little-endian-machine(yes-for-x86) ? + * count-trailing-zero(v3) : + * count-leading-zero(v3); + * + * match-byte = match-bit/8 + * + * "scan" and "match" advance if necessary + * ------------------------------------------------- + */ + +ZLIB_INTERNAL unsigned longest_match(deflate_state *z_const s, IPos cur_match) +{ + unsigned chain_length = s->max_chain_length;/* max hash chain length */ + register Byte *scan = s->window + s->strstart; /* current string */ + register Byte *match; /* matched string */ + register int len; /* length of current match */ + int best_len = s->prev_length; /* best match length so far */ + int nice_match = s->nice_match; /* stop if match long enough */ + IPos limit = s->strstart > (IPos)MAX_DIST(s) ? + s->strstart - (IPos)MAX_DIST(s) : NIL; + /* Stop when cur_match becomes <= limit. To simplify the code, + * we prevent matches with the string of window index 0. + */ + Pos *prev = s->prev; + uInt wmask = s->w_mask; + + register Byte *strend = s->window + s->strstart + MAX_MATCH; + register unsigned short scan_start = *(unsigned short*)scan; + register unsigned short scan_end = *(unsigned short*)(scan+best_len-1); + + /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. + * It is easy to get rid of this optimization if necessary. + */ + Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); + + /* Do not waste too much time if we already have a good match: */ + if (s->prev_length >= s->good_match) { + chain_length >>= 2; + } + /* Do not look for matches beyond the end of the input. This is necessary + * to make deflate deterministic. + */ + if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); + + do { + Assert(cur_match < s->strstart, "no future"); + + /* Skip to next match if the match length cannot increase + * or if the match length is less than 2. Note that the checks below + * for insufficient lookahead only occur occasionally for performance + * reasons. Therefore uninitialized memory will be accessed, and + * conditional jumps will be made that depend on those values. + * However the length of the match is limited to the lookahead, so + * the output of deflate is not affected by the uninitialized values. + */ + Byte *win = s->window; + int cont = 1; + do { + match = win + cur_match; + if (likely(*(unsigned short*)(match+best_len-1) != scan_end)) { + if ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0) { + continue; + } else + cont = 0; + } + break; + } while (1); + + if (!cont) + break; + + if (*(unsigned short*)match != scan_start) + continue; + + /* It is not necessary to compare scan[2] and match[2] since they are + * always equal when the other bytes match, given that the hash keys + * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at + * strstart+3, +5, ... up to strstart+257. We check for insufficient + * lookahead only every 4th comparison; the 128th check will be made + * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is + * necessary to put more guard bytes at the end of the window, or + * to check more often for insufficient lookahead. + */ + scan += 2, match+=2; + Assert(*scan == *match, "match[2]?"); + do { + unsigned long sv = *(unsigned long*)(void*)scan; + unsigned long mv = *(unsigned long*)(void*)match; + unsigned long xor = sv ^ mv; + if (xor) { + int match_byte = __builtin_ctzl(xor) / 8; + scan += match_byte; + match += match_byte; + break; + } else { + scan += sizeof(unsigned long); + match += sizeof(unsigned long); + } + } while (scan < strend); + + if (scan > strend) + scan = strend; + + Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); + + len = MAX_MATCH - (int)(strend - scan); + scan = strend - MAX_MATCH; + + if (len > best_len) { + s->match_start = cur_match; + best_len = len; + if (len >= nice_match) break; + scan_end = *(unsigned short*)(scan+best_len-1); + } else { + /* + * The probability of finding a match later if we here + * is pretty low, so for performance it's best to + * outright stop here for the lower compression levels + */ + if (s->level < 6) + break; + } + } while ((cur_match = prev[cur_match & wmask]) > limit + && --chain_length != 0); + + if ((uInt)best_len <= s->lookahead) return (uInt)best_len; + return s->lookahead; +} +#endif