From: Nathan Moinvaziri Date: Wed, 18 Mar 2020 03:58:31 +0000 (-0700) Subject: Combine longest_match implementations and use compare258 functable stub. X-Git-Tag: 1.9.9-b1~299 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=965b452649a6bfb26ff49ef32b98d5d4658f90ba;p=thirdparty%2Fzlib-ng.git Combine longest_match implementations and use compare258 functable stub. --- diff --git a/deflate.c b/deflate.c index 81068ffbe..f696849a1 100644 --- a/deflate.c +++ b/deflate.c @@ -50,7 +50,6 @@ #include "zbuild.h" #include "deflate.h" #include "deflate_p.h" -#include "match_p.h" #include "functable.h" const char zng_deflate_copyright[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler "; diff --git a/match_p.h b/match_p.h index dd13e0087..98b32811e 100644 --- a/match_p.h +++ b/match_p.h @@ -1,163 +1,44 @@ -/* - * Set match_start to the longest match starting at the given string and - * return its length. Matches shorter or equal to prev_length are discarded, - * in which case the result is equal to prev_length and match_start is garbage. - * - * IN assertions: cur_match is the head of the hash chain for the current - * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 - * OUT assertion: the match length is not greater than s->lookahead - */ #include "zbuild.h" #include "deflate.h" +#include "functable.h" -#if (defined(UNALIGNED_OK) && MAX_MATCH == 258) +#ifndef BESTCMP_TYPE +#define BESTCMP_TYPE - /* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang and define __GNUC__. */ -# if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__) -# define std2_longest_match - /* Only use std3_longest_match for little_endian systems, also avoid using it with - non-gcc compilers since the __builtin_ctzl() function might not be optimized. */ -# elif(defined(__GNUC__) && (defined(HAVE_BUILTIN_CTZ) || defined(HAVE_BUILTIN_CTZLL)) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - || defined(__LITTLE_ENDIAN__))) -# define std3_longest_match -# elif(defined(_MSC_VER) && defined(_WIN32)) -# define std3_longest_match -# else -# define std2_longest_match -# endif - -#else -# define std1_longest_match -#endif - - -#if defined(_MSC_VER) && !defined(__clang__) -# if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) -# include "fallback_builtins.h" -# endif -#endif - -extern int32_t compare258_c(const unsigned char *src0, const unsigned char *src1); #ifdef UNALIGNED_OK -extern int32_t compare258_unaligned_16(const unsigned char *src0, const unsigned char *src1); -extern int32_t compare258_unaligned_32(const unsigned char *src0, const unsigned char *src1); -extern int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned char *src1); -#ifdef X86_SSE42_CMP_STR -extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1); -#endif +#if MIN_MATCH >= 4 +typedef uint32_t bestcmp_t; +#elif MIN_MATCH >= 2 +typedef uint16_t bestcmp_t; +#else +typedef uint8_t bestcmp_t; #endif +#else +typedef uint8_t bestcmp_t; +#endif -#ifdef std1_longest_match - -/* - * Standard longest_match - * - */ -static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { - const unsigned wmask = s->w_mask; - const Pos *prev = s->prev; - - unsigned chain_length; - IPos limit; - unsigned int len, best_len, nice_match; - unsigned char *scan, *match, *strend, scan_end, scan_end1; - - /* - * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple - * of 16. It is easy to get rid of this optimization if necessary. - */ - Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); - - /* - * Do not waste too much time if we already have a good match - */ - best_len = s->prev_length ? s->prev_length : 1; - chain_length = s->max_chain_length; - if (best_len >= s->good_match) - chain_length >>= 2; - - /* - * Do not looks for matches beyond the end of the input. This is - * necessary to make deflate deterministic - */ - nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match; - - /* - * Stop when cur_match becomes <= limit. To simplify the code, - * we prevent matches with the string of window index 0 - */ - limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0; - - scan = s->window + s->strstart; - strend = s->window + s->strstart + MAX_MATCH; - scan_end1 = scan[best_len-1]; - scan_end = scan[best_len]; - - Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); - do { - if (cur_match >= s->strstart) { - break; - } - match = s->window + cur_match; - - /* - * Skip to next match if the match length cannot increase - * or if the match length is less than 2. Note that the checks - * below for insufficient lookahead only occur occasionally - * for performance reasons. Therefore uninitialized memory - * will be accessed and conditional jumps will be made that - * depend on those values. However the length of the match - * is limited to the lookahead, so the output of deflate is not - * affected by the uninitialized values. - */ - if (match[best_len] != scan_end || - match[best_len-1] != scan_end1 || - *match != *scan || - match[1] != scan[1]) - continue; - - len = compare258_c(scan, match); - Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); - - if (len > best_len) { - s->match_start = cur_match; - best_len = len; - if (len >= nice_match) - break; - scan_end1 = scan[best_len-1]; - scan_end = scan[best_len]; - } else { - /* - * The probability of finding a match later if we here - * is pretty low, so for performance it's best to - * outright stop here for the lower compression levels - */ - if (s->level < TRIGGER_LEVEL) - break; - } - } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length); - - if ((unsigned int)best_len <= s->lookahead) - return best_len; - return s->lookahead; -} #endif -#ifdef std2_longest_match /* - * UNALIGNED_OK longest_match + * Set match_start to the longest match starting at the given string and + * return its length. Matches shorter or equal to prev_length are discarded, + * in which case the result is equal to prev_length and match_start is garbage. * + * IN assertions: cur_match is the head of the hash chain for the current + * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 + * OUT assertion: the match length is not greater than s->lookahead */ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { + unsigned int strstart = s->strstart; const unsigned wmask = s->w_mask; + unsigned char *window = s->window; + unsigned char *scan = window + strstart; const Pos *prev = s->prev; - - uint16_t scan_start, scan_end; unsigned chain_length; IPos limit; unsigned int len, best_len, nice_match; - unsigned char *scan, *strend; + bestcmp_t scan_end, scan_start; /* * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple @@ -168,7 +49,9 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { /* * Do not waste too much time if we already have a good match */ - best_len = s->prev_length ? s->prev_length : 1; + best_len = s->prev_length; + if (best_len == 0) + best_len = 1; chain_length = s->max_chain_length; if (best_len >= s->good_match) chain_length >>= 2; @@ -183,20 +66,17 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { * Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0 */ - limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0; + limit = strstart > MAX_DIST(s) ? strstart - MAX_DIST(s) : 0; - scan = s->window + s->strstart; - strend = s->window + s->strstart + MAX_MATCH - 1; - memcpy(&scan_start, scan, sizeof(scan_start)); - memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end)); + scan_start = *(bestcmp_t *)(scan); + scan_end = *(bestcmp_t *)(scan+best_len-1); - Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); + Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); do { unsigned char *match; - if (cur_match >= s->strstart) { + int cont; + if (cur_match >= strstart) break; - } - match = s->window + cur_match; /* * Skip to next match if the match length cannot increase @@ -208,169 +88,15 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { * is limited to the lookahead, so the output of deflate is not * affected by the uninitialized values. */ - uint16_t val; - memcpy(&val, match + best_len - 1, sizeof(val)); - if (LIKELY(val != scan_end)) - continue; - - memcpy(&val, match, sizeof(val)); - if (val != scan_start) - continue; - - len = compare258_unaligned_16(scan, match); - Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); - - if (len > best_len) { - s->match_start = cur_match; - best_len = len; - if (len >= nice_match) - break; - memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end)); - } else { - /* - * The probability of finding a match later if we here - * is pretty low, so for performance it's best to - * outright stop here for the lower compression levels - */ - if (s->level < TRIGGER_LEVEL) - break; - } - } while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit); - - if ((unsigned)best_len <= s->lookahead) - return best_len; - return s->lookahead; -} -#endif - -#ifdef std3_longest_match -/* longest_match() with minor change to improve performance (in terms of - * execution time). - * - * The pristine longest_match() function is sketched below (strip the - * then-clause of the "#ifdef UNALIGNED_OK"-directive) - * - * ------------------------------------------------------------ - * unsigned int longest_match(...) { - * ... - * do { - * match = s->window + cur_match; //s0 - * if (*(ushf*)(match+best_len-1) != scan_end || //s1 - * *(ushf*)match != scan_start) continue; //s2 - * ... - * - * do { - * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && - * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && - * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && - * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && - * scan < strend); //s3 - * - * ... - * } while(cond); //s4 - * - * ------------------------------------------------------------- - * - * The change include: - * - * 1) The hottest statements of the function is: s0, s1 and s4. Pull them - * together to form a new loop. The benefit is two-fold: - * - * o. Ease the compiler to yield good code layout: the conditional-branch - * corresponding to s1 and its biased target s4 become very close (likely, - * fit in the same cache-line), hence improving instruction-fetching - * efficiency. - * - * o. Ease the compiler to promote "s->window" into register. "s->window" - * is loop-invariant; it is supposed to be promoted into register and keep - * the value throughout the entire loop. However, there are many such - * loop-invariant, and x86-family has small register file; "s->window" is - * likely to be chosen as register-allocation victim such that its value - * is reloaded from memory in every single iteration. By forming a new loop, - * "s->window" is loop-invariant of that newly created tight loop. It is - * lot easier for compiler to promote this quantity to register and keep - * its value throughout the entire small loop. - * - * 2) Transfrom s3 such that it examines sizeof(long)-byte-match at a time. - * This is done by: - * ------------------------------------------------ - * v1 = load from "scan" by sizeof(long) bytes - * v2 = load from "match" by sizeof(lnog) bytes - * v3 = v1 xor v2 - * match-bit = little-endian-machine(yes-for-x86) ? - * count-trailing-zero(v3) : - * count-leading-zero(v3); - * - * match-byte = match-bit/8 - * - * "scan" and "match" advance if necessary - * ------------------------------------------------- - */ - -static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { - unsigned int strstart = s->strstart; - unsigned chain_length = s->max_chain_length;/* max hash chain length */ - unsigned char *window = s->window; - register unsigned char *scan = window + strstart; /* current string */ - register unsigned char *match; /* matched string */ - register unsigned int len; /* length of current match */ - unsigned int best_len = s->prev_length ? s->prev_length : 1; /* best match length so far */ - unsigned int nice_match = s->nice_match; /* stop if match long enough */ - IPos limit = strstart > (IPos)MAX_DIST(s) ? - strstart - (IPos)MAX_DIST(s) : NIL; - /* Stop when cur_match becomes <= limit. To simplify the code, - * we prevent matches with the string of window index 0. - */ - Pos *prev = s->prev; - unsigned int wmask = s->w_mask; - - register unsigned char *strend = window + strstart + MAX_MATCH; - - uint16_t scan_start, scan_end; - - memcpy(&scan_start, scan, sizeof(scan_start)); - memcpy(&scan_end, scan+best_len-1, sizeof(scan_end)); - - /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. - * It is easy to get rid of this optimization if necessary. - */ - Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); - - /* Do not waste too much time if we already have a good match: */ - if (s->prev_length >= s->good_match) { - chain_length >>= 2; - } - /* Do not look for matches beyond the end of the input. This is necessary - * to make deflate deterministic. - */ - if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead; - - Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); - - do { - if (cur_match >= strstart) { - break; - } - - /* Skip to next match if the match length cannot increase - * or if the match length is less than 2. Note that the checks below - * for insufficient lookahead only occur occasionally for performance - * reasons. Therefore uninitialized memory will be accessed, and - * conditional jumps will be made that depend on those values. - * However the length of the match is limited to the lookahead, so - * the output of deflate is not affected by the uninitialized values. - */ - int cont = 1; + cont = 1; do { match = window + cur_match; - if (LIKELY(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0 - || memcmp(match, &scan_start, sizeof(scan_start)) != 0)) { - if ((cur_match = prev[cur_match & wmask]) > limit - && --chain_length != 0) { + if (LIKELY(*(bestcmp_t *)(match+best_len-1) != scan_end || + *(bestcmp_t *)(match) != scan_start)) { + if ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0) { continue; - } else { - cont = 0; } + cont = 0; } break; } while (1); @@ -378,12 +104,7 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { if (!cont) break; -#ifdef HAVE_BUILTIN_CTZLL - len = compare258_unaligned_64(scan, match); -#elif defined(HAVE_BUILTIN_CTZ) - len = compare258_unaligned_32(scan, match); -#endif - + len = functable.compare258(scan, match); Assert(scan+len <= window+(unsigned)(s->window_size-1), "wild scan"); if (len > best_len) { @@ -391,7 +112,7 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { best_len = len; if (len >= nice_match) break; - memcpy(&scan_end, scan+best_len-1, sizeof(scan_end)); + scan_end = *(bestcmp_t *)(scan+best_len-1); } else { /* * The probability of finding a match later if we here @@ -403,8 +124,7 @@ static inline unsigned longest_match(deflate_state *const s, IPos cur_match) { } } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); - if ((unsigned int)best_len <= s->lookahead) - return (unsigned int)best_len; + if (best_len <= s->lookahead) + return best_len; return s->lookahead; } -#endif