From: Nathan Moinvaziri Date: Tue, 22 Jun 2021 03:38:51 +0000 (-0700) Subject: Separate fast-zlib matching algorithm into its own longest_match variant. X-Git-Tag: 2.1.0-beta1~546 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ef416b7e2747620189b666ec82179bff369c35a7;p=thirdparty%2Fzlib-ng.git Separate fast-zlib matching algorithm into its own longest_match variant. --- diff --git a/arch/x86/compare258_avx.c b/arch/x86/compare258_avx.c index d9108fdeb..3452127f5 100644 --- a/arch/x86/compare258_avx.c +++ b/arch/x86/compare258_avx.c @@ -58,9 +58,16 @@ Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const u return compare258_unaligned_avx2_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_avx2 -#define COMPARE256 compare256_unaligned_avx2_static -#define COMPARE258 compare258_unaligned_avx2_static +#define LONGEST_MATCH longest_match_unaligned_avx2 +#define COMPARE256 compare256_unaligned_avx2_static +#define COMPARE258 compare258_unaligned_avx2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_unaligned_avx2 +#define COMPARE256 compare256_unaligned_avx2_static +#define COMPARE258 compare258_unaligned_avx2_static #include "match_tpl.h" diff --git a/arch/x86/compare258_sse.c b/arch/x86/compare258_sse.c index 17534c051..1bea2e95b 100644 --- a/arch/x86/compare258_sse.c +++ b/arch/x86/compare258_sse.c @@ -65,9 +65,16 @@ Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const u return compare258_unaligned_sse4_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_sse4 -#define COMPARE256 compare256_unaligned_sse4_static -#define COMPARE258 compare258_unaligned_sse4_static +#define LONGEST_MATCH longest_match_unaligned_sse4 +#define COMPARE256 compare256_unaligned_sse4_static +#define COMPARE258 compare258_unaligned_sse4_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH 
longest_match_slow_unaligned_sse4 +#define COMPARE256 compare256_unaligned_sse4_static +#define COMPARE258 compare258_unaligned_sse4_static #include "match_tpl.h" diff --git a/compare258.c b/compare258.c index 6b452b89c..f4f1936c4 100644 --- a/compare258.c +++ b/compare258.c @@ -57,9 +57,16 @@ Z_INTERNAL uint32_t compare258_c(const unsigned char *src0, const unsigned char return compare258_c_static(src0, src1); } -#define LONGEST_MATCH longest_match_c -#define COMPARE256 compare256_c_static -#define COMPARE258 compare258_c_static +#define LONGEST_MATCH longest_match_c +#define COMPARE256 compare256_c_static +#define COMPARE258 compare258_c_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_c +#define COMPARE256 compare256_c_static +#define COMPARE258 compare258_c_static #include "match_tpl.h" @@ -97,9 +104,16 @@ Z_INTERNAL uint32_t compare258_unaligned_16(const unsigned char *src0, const uns return compare258_unaligned_16_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_16 -#define COMPARE256 compare256_unaligned_16_static -#define COMPARE258 compare258_unaligned_16_static +#define LONGEST_MATCH longest_match_unaligned_16 +#define COMPARE256 compare256_unaligned_16_static +#define COMPARE258 compare258_unaligned_16_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_unaligned_16 +#define COMPARE256 compare256_unaligned_16_static +#define COMPARE258 compare258_unaligned_16_static #include "match_tpl.h" @@ -135,9 +149,16 @@ Z_INTERNAL uint32_t compare258_unaligned_32(const unsigned char *src0, const uns return compare258_unaligned_32_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_32 -#define COMPARE256 compare256_unaligned_32_static -#define COMPARE258 
compare258_unaligned_32_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_unaligned_32 +#define COMPARE256 compare256_unaligned_32_static +#define COMPARE258 compare258_unaligned_32_static #include "match_tpl.h" @@ -175,9 +196,16 @@ Z_INTERNAL uint32_t compare258_unaligned_64(const unsigned char *src0, const uns return compare258_unaligned_64_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_64 -#define COMPARE256 compare256_unaligned_64_static -#define COMPARE258 compare258_unaligned_64_static +#define LONGEST_MATCH longest_match_unaligned_64 +#define COMPARE256 compare256_unaligned_64_static +#define COMPARE258 compare258_unaligned_64_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_unaligned_64 +#define COMPARE256 compare256_unaligned_64_static +#define COMPARE258 compare258_unaligned_64_static #include "match_tpl.h" diff --git a/functable.c b/functable.c index 8fc94c5c1..5ed930c10 100644 --- a/functable.c +++ b/functable.c @@ -142,6 +142,22 @@ extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_mat #endif #endif +/* longest_match_slow */ +extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED_OK +extern uint32_t longest_match_slow_unaligned_16(deflate_state *const s, Pos cur_match); +extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_match); +#ifdef UNALIGNED64_OK +extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); +#endif +#ifdef X86_SSE42_CMP_STR +extern uint32_t longest_match_slow_unaligned_sse4(deflate_state *const s, Pos cur_match); +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) +extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match); +#endif +#endif + Z_INTERNAL Z_TLS struct functable_s functable; Z_INTERNAL void cpu_check_features(void) @@ -474,6 
+490,31 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) { return functable.longest_match(s, cur_match); } +Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_match) { + + functable.longest_match_slow = &longest_match_slow_c; + +#ifdef UNALIGNED_OK +# if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) + functable.longest_match_slow = &longest_match_slow_unaligned_64; +# elif defined(HAVE_BUILTIN_CTZ) + functable.longest_match_slow = &longest_match_slow_unaligned_32; +# else + functable.longest_match_slow = &longest_match_slow_unaligned_16; +# endif +# ifdef X86_SSE42_CMP_STR + if (x86_cpu_has_sse42) + functable.longest_match_slow = &longest_match_slow_unaligned_sse4; +# endif +# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_avx2) + functable.longest_match_slow = &longest_match_slow_unaligned_avx2; +# endif +#endif + + return functable.longest_match_slow(s, cur_match); +} + /* functable init */ Z_INTERNAL Z_TLS struct functable_s functable = { update_hash_stub, @@ -484,6 +525,7 @@ Z_INTERNAL Z_TLS struct functable_s functable = { slide_hash_stub, compare258_stub, longest_match_stub, + longest_match_slow_stub, chunksize_stub, chunkcopy_stub, chunkcopy_safe_stub, diff --git a/functable.h b/functable.h index 49d2f5d56..f4b17569a 100644 --- a/functable.h +++ b/functable.h @@ -17,6 +17,7 @@ struct functable_s { void (* slide_hash) (deflate_state *s); uint32_t (* compare258) (const unsigned char *src0, const unsigned char *src1); uint32_t (* longest_match) (deflate_state *const s, Pos cur_match); + uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match); uint32_t (* chunksize) (void); uint8_t* (* chunkcopy) (uint8_t *out, uint8_t const *from, unsigned len); uint8_t* (* chunkcopy_safe) (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); diff --git a/match_tpl.h b/match_tpl.h index 3d8ac4e3c..6e6665455 100644 --- a/match_tpl.h +++ b/match_tpl.h @@ -45,12 
+45,16 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { Z_REGISTER unsigned char *mbase_start = window; Z_REGISTER unsigned char *mbase_end; const Pos *prev = s->prev; - Pos limit, limit_base; + Pos limit; +#ifdef LONGEST_MATCH_SLOW + Pos limit_base; + int32_t rolling_hash; +#else int32_t early_exit; +#endif uint32_t chain_length, nice_match, best_len, offset; uint32_t lookahead = s->lookahead; Pos match_offset = 0; - int32_t rolling_hash; bestcmp_t scan_end; #ifndef UNALIGNED_OK bestcmp_t scan_end0; @@ -92,8 +96,11 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { /* Do not waste too much time if we already have a good match */ chain_length = s->max_chain_length; +#ifdef LONGEST_MATCH_SLOW rolling_hash = chain_length > 1024; +#else early_exit = s->level < EARLY_EXIT_TRIGGER_LEVEL; +#endif if (best_len >= s->good_match) chain_length >>= 2; nice_match = (uint32_t)s->nice_match; @@ -101,8 +108,9 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { /* Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0 */ - limit = limit_base = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0; - + limit = strstart > MAX_DIST(s) ? (Pos)(strstart - MAX_DIST(s)) : 0; +#ifdef LONGEST_MATCH_SLOW + limit_base = limit; if (best_len >= STD_MIN_MATCH && rolling_hash) { /* We're continuing search (lazy evaluation). 
*/ uint32_t i, hash; @@ -133,7 +141,7 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { mbase_start -= match_offset; mbase_end -= match_offset; } - +#endif Assert((unsigned long)strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead"); for (;;) { if (cur_match >= strstart) @@ -207,6 +215,7 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #ifndef UNALIGNED_OK scan_end0 = *(bestcmp_t *)(scan+offset+1); #endif +#ifdef LONGEST_MATCH_SLOW /* Look for a better string offset */ if (len > STD_MIN_MATCH && match_start + len < strstart && rolling_hash) { Pos pos, next_pos; @@ -254,27 +263,33 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { mbase_start = window-match_offset; mbase_end = (mbase_start+offset); continue; - } else { - mbase_end = (mbase_start+offset); } - } else if (UNLIKELY(early_exit)) { +#endif + mbase_end = (mbase_start+offset); + } +#ifndef LONGEST_MATCH_SLOW + else if (UNLIKELY(early_exit)) { /* The probability of finding a match later if we here is pretty low, so for * performance it's best to outright stop here for the lower compression levels */ break; } +#endif GOTO_NEXT_CHAIN; } return best_len; +#ifdef LONGEST_MATCH_SLOW break_matching: if (best_len < s->lookahead) return best_len; return s->lookahead; +#endif } +#undef LONGEST_MATCH_SLOW #undef LONGEST_MATCH #undef COMPARE256 #undef COMPARE258