From: Hans Kristian Rosbach Date: Mon, 24 Apr 2017 08:53:39 +0000 (+0200) Subject: Add a struct func_table and function functableInit. X-Git-Tag: 1.9.9-b1~660^2~17 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=da5133848856cbef6a565736ba488b4b408922c8;p=thirdparty%2Fzlib-ng.git Add a struct func_table and function functableInit. The struct contains pointers to select functions to be used by the rest of zlib, and the init function selects what functions will be used depending on what optimizations have been compiled in and what instruction-sets are available at runtime. Tests done on a Haswell CPU running minigzip -6 compression of a 40M file show a 2.5% decrease in branches, and a 25-30% reduction in iTLB-loads. The reduction in iTLB-loads is likely mostly due to the inability to inline functions. This also causes a slight performance regression of around 1%, but this might still be worth it to make it much easier to implement new optimized functions for various architectures and instruction sets. The performance penalty will get smaller for functions that get more alternative implementations to choose from, since there is no need to add more branches to every call of the function. Today insert_string has 1 branch to choose insert_string_sse or insert_string_c, but if we also add for example insert_string_sse4 then that would need another branch, and it would probably at some point hinder effective inlining too. 
--- diff --git a/CMakeLists.txt b/CMakeLists.txt index 58851962a..84c3137d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -485,6 +485,7 @@ set(ZLIB_PUBLIC_HDRS set(ZLIB_PRIVATE_HDRS crc32.h deflate.h + functable.h gzguts.h inffast.h inffixed.h @@ -501,6 +502,7 @@ set(ZLIB_SRCS deflate_fast.c deflate_medium.c deflate_slow.c + functable.c inflate.c infback.c inftrees.c diff --git a/Makefile.in b/Makefile.in index 018328f36..4f7100c33 100644 --- a/Makefile.in +++ b/Makefile.in @@ -66,11 +66,11 @@ mandir = ${prefix}/share/man man3dir = ${mandir}/man3 pkgconfigdir = ${libdir}/pkgconfig -OBJZ = adler32.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o match.o infback.o inffast.o inflate.o inftrees.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS) +OBJZ = adler32.o compress.o crc32.o deflate.o deflate_fast.o deflate_medium.o deflate_slow.o functable.o match.o infback.o inffast.o inflate.o inftrees.o trees.o uncompr.o zutil.o $(ARCH_STATIC_OBJS) OBJG = gzclose.o gzlib.o gzread.o gzwrite.o OBJC = $(OBJZ) $(OBJG) -PIC_OBJZ = adler32.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo match.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS) +PIC_OBJZ = adler32.lo compress.lo crc32.lo deflate.lo deflate_fast.lo deflate_medium.lo deflate_slow.lo functable.lo match.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo uncompr.lo zutil.lo $(ARCH_SHARED_OBJS) PIC_OBJG = gzclose.lo gzlib.lo gzread.lo gzwrite.lo PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) @@ -311,13 +311,14 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. 
adler32.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +functable.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h gzclose.o gzlib.o gzread.o gzwrite.o: $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/gzguts.h compress.o example.o minigzip.o uncompr.o: $(SRCDIR)/zlib.h zconf.h crc32.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/crc32.h -deflate.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_fast.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_medium.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_slow.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_fast.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_medium.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_slow.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h infback.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/inffixed.h inffast.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h inflate.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/inffixed.h $(SRCDIR)/memcopy.h @@ -325,25 +326,26 @@ inftrees.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h trees.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h 
$(SRCDIR)/trees.h zutil.o: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/crc32_acle.o: zconf.h -arch/aarch64/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/aarch64/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/crc32_acle.o: zconf.h -arch/arm/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/arm/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_folding.o: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_pclmulqdq.o: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/deflate_quick.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -arch/x86/fill_window_sse.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/x86/fill_window_sse.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/x86.o: $(SRCDIR)/arch/x86/x86.h adler32.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +functable.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h gzclose.lo gzlib.lo gzread.lo gzwrite.lo: $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/gzguts.h compress.lo example.lo minigzip.lo uncompr.lo: $(SRCDIR)/zlib.h zconf.h crc32.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/crc32.h -deflate.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_fast.lo: 
$(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_medium.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -deflate_slow.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_fast.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_medium.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +deflate_slow.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/match.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h infback.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/inffixed.h inffast.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/memcopy.h inflate.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/inffixed.h $(SRCDIR)/memcopy.h @@ -351,13 +353,13 @@ inftrees.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h trees.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/trees.h zutil.lo: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/crc32_acle.lo: zconf.h -arch/aarch64/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/aarch64/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h 
zconf.h arch/arm/crc32_acle.lo: zconf.h -arch/arm/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/arm/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_folding.lo: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_pclmulqdq.lo: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/deflate_quick.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h -arch/x86/fill_window_sse.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h +arch/x86/fill_window_sse.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/x86.lo: $(SRCDIR)/arch/x86/x86.h diff --git a/arch/aarch64/fill_window_arm.c b/arch/aarch64/fill_window_arm.c index 784c55a0b..04dd76a57 100644 --- a/arch/aarch64/fill_window_arm.c +++ b/arch/aarch64/fill_window_arm.c @@ -12,6 +12,7 @@ #include "deflate.h" #include "deflate_p.h" +#include "functable.h" extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); @@ -122,7 +123,7 @@ void fill_window_arm(deflate_state *s) { } if (insert_cnt > 0) { - insert_string(s, str, insert_cnt); + functable.insert_string(s, str, insert_cnt); s->insert -= slen; } } diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c index 4ff23e04a..a088a3fe5 100644 --- a/arch/arm/fill_window_arm.c +++ b/arch/arm/fill_window_arm.c @@ -12,6 +12,7 @@ #include "deflate.h" #include "deflate_p.h" +#include "functable.h" extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); @@ -122,7 +123,7 @@ void fill_window_arm(deflate_state *s) { } if 
(insert_cnt > 0) { - insert_string(s, str, insert_cnt); + functable.insert_string(s, str, insert_cnt); s->insert -= slen; } } diff --git a/arch/x86/fill_window_sse.c b/arch/x86/fill_window_sse.c index c71059056..11e95f556 100644 --- a/arch/x86/fill_window_sse.c +++ b/arch/x86/fill_window_sse.c @@ -13,6 +13,7 @@ #include #include "deflate.h" #include "deflate_p.h" +#include "functable.h" extern int read_buf(z_stream *strm, unsigned char *buf, unsigned size); @@ -109,11 +110,11 @@ ZLIB_INTERNAL void fill_window_sse(deflate_state *s) { unsigned int str = s->strstart - s->insert; s->ins_h = s->window[str]; if (str >= 1) - insert_string(s, str + 2 - MIN_MATCH, 1); + functable.insert_string(s, str + 2 - MIN_MATCH, 1); #if MIN_MATCH != 3 #error Call insert_string() MIN_MATCH-3 more times while (s->insert) { - insert_string(s, str, 1); + functable.insert_string(s, str, 1); str++; s->insert--; if (s->lookahead + s->insert < MIN_MATCH) @@ -126,7 +127,7 @@ ZLIB_INTERNAL void fill_window_sse(deflate_state *s) { }else{ count = s->insert; } - insert_string(s,str,count); + functable.insert_string(s, str, count); s->insert -= count; #endif } diff --git a/deflate.c b/deflate.c index 60adf15b7..6ec9f1ead 100644 --- a/deflate.c +++ b/deflate.c @@ -52,6 +52,7 @@ #include "deflate.h" #include "deflate_p.h" #include "match.h" +#include "functable.h" const char deflate_copyright[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler "; /* @@ -240,6 +241,8 @@ int ZEXPORT deflateInit2_(z_stream *strm, int level, int method, int windowBits, x86_check_features(); #endif + functableInit(); + if (version == NULL || version[0] != my_version[0] || stream_size != sizeof(z_stream)) { return Z_VERSION_ERROR; } @@ -399,14 +402,14 @@ int ZEXPORT deflateSetDictionary(z_stream *strm, const unsigned char *dictionary next = strm->next_in; strm->avail_in = dictLength; strm->next_in = (const unsigned char *)dictionary; - fill_window(s); + functable.fill_window(s); while (s->lookahead >= 
MIN_MATCH) { str = s->strstart; n = s->lookahead - (MIN_MATCH-1); - insert_string(s, str, n); + functable.insert_string(s, str, n); s->strstart = str + n; s->lookahead = MIN_MATCH-1; - fill_window(s); + functable.fill_window(s); } s->strstart += s->lookahead; s->block_start = (long)s->strstart; @@ -1186,31 +1189,6 @@ void check_match(deflate_state *s, IPos start, IPos match, int length) { * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ -#ifdef X86_SSE2_FILL_WINDOW -extern void fill_window_sse(deflate_state *s); -#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) -extern void fill_window_arm(deflate_state *s); -#endif -void fill_window_c(deflate_state *s); - -void fill_window(deflate_state *s) { -#ifdef X86_SSE2_FILL_WINDOW -# ifndef X86_NOCHECK_SSE2 - if (x86_cpu_has_sse2) { -# endif - fill_window_sse(s); -# ifndef X86_NOCHECK_SSE2 - } else { - fill_window_c(s); - } -# endif - -#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) - fill_window_arm(s); -#else - fill_window_c(s); -#endif -} void fill_window_c(deflate_state *s) { unsigned n; @@ -1258,11 +1236,11 @@ void fill_window_c(deflate_state *s) { unsigned int str = s->strstart - s->insert; s->ins_h = s->window[str]; if (str >= 1) - insert_string(s, str + 2 - MIN_MATCH, 1); + functable.insert_string(s, str + 2 - MIN_MATCH, 1); #if MIN_MATCH != 3 #error Call insert_string() MIN_MATCH-3 more times while (s->insert) { - insert_string(s, str, 1); + functable.insert_string(s, str, 1); str++; s->insert--; if (s->lookahead + s->insert < MIN_MATCH) @@ -1275,7 +1253,7 @@ void fill_window_c(deflate_state *s) { }else{ count = s->insert; } - insert_string(s,str,count); + functable.insert_string(s,str,count); s->insert -= count; #endif } @@ -1522,7 +1500,7 @@ static block_state deflate_rle(deflate_state *s, int flush) { * for the longest run, plus one for the unrolled loop. 
*/ if (s->lookahead <= MAX_MATCH) { - fill_window(s); + functable.fill_window(s); if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) { return need_more; } @@ -1589,7 +1567,7 @@ static block_state deflate_huff(deflate_state *s, int flush) { for (;;) { /* Make sure that we have a literal to write. */ if (s->lookahead == 0) { - fill_window(s); + functable.fill_window(s); if (s->lookahead == 0) { if (flush == Z_NO_FLUSH) return need_more; diff --git a/deflate.h b/deflate.h index 37e559680..17f95c2aa 100644 --- a/deflate.h +++ b/deflate.h @@ -342,6 +342,9 @@ typedef enum { /* Number of bytes after end of data in window to initialize in order to avoid memory checker errors from longest match routines */ + +void fill_window_c(deflate_state *s); + /* in trees.c */ void ZLIB_INTERNAL _tr_init(deflate_state *s); int ZLIB_INTERNAL _tr_tally(deflate_state *s, unsigned dist, unsigned lc); diff --git a/deflate_fast.c b/deflate_fast.c index c16905375..5b86e2799 100644 --- a/deflate_fast.c +++ b/deflate_fast.c @@ -7,6 +7,7 @@ #include "deflate.h" #include "deflate_p.h" #include "match.h" +#include "functable.h" /* =========================================================================== * Compress as much as possible from the input stream, return the current @@ -26,7 +27,7 @@ block_state deflate_fast(deflate_state *s, int flush) { * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { - fill_window(s); + functable.fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } @@ -39,7 +40,7 @@ block_state deflate_fast(deflate_state *s, int flush) { */ hash_head = NIL; if (s->lookahead >= MIN_MATCH) { - hash_head = insert_string(s, s->strstart, 1); + hash_head = functable.insert_string(s, s->strstart, 1); } /* Find the longest match, discarding those <= prev_length. 
@@ -68,7 +69,7 @@ block_state deflate_fast(deflate_state *s, int flush) { s->strstart++; #ifdef NOT_TWEAK_COMPILER do { - insert_string(s, s->strstart, 1); + functable.insert_string(s, s->strstart, 1); s->strstart++; /* strstart never exceeds WSIZE-MAX_MATCH, so there are * always MIN_MATCH bytes ahead. @@ -76,7 +77,7 @@ block_state deflate_fast(deflate_state *s, int flush) { } while (--s->match_length != 0); #else { - insert_string(s, s->strstart, s->match_length); + functable.insert_string(s, s->strstart, s->match_length); s->strstart += s->match_length; s->match_length = 0; } @@ -86,9 +87,9 @@ block_state deflate_fast(deflate_state *s, int flush) { s->match_length = 0; s->ins_h = s->window[s->strstart]; #ifndef NOT_TWEAK_COMPILER - insert_string(s, s->strstart + 2 - MIN_MATCH, MIN_MATCH - 2); + functable.insert_string(s, s->strstart + 2 - MIN_MATCH, MIN_MATCH - 2); #else - insert_string(s, s->strstart + 2 - MIN_MATCH, 1); + functable.insert_string(s, s->strstart + 2 - MIN_MATCH, 1); #if MIN_MATCH != 3 #warning Call insert_string() MIN_MATCH-3 more times #endif diff --git a/deflate_medium.c b/deflate_medium.c index 2a12d4bfd..fd79866c7 100644 --- a/deflate_medium.c +++ b/deflate_medium.c @@ -10,6 +10,7 @@ #include "deflate.h" #include "deflate_p.h" #include "match.h" +#include "functable.h" struct match { unsigned int match_start; @@ -63,7 +64,7 @@ static void insert_match(deflate_state *s, struct match match) { if (match.match_length) { if (match.strstart >= match.orgstart) { - insert_string(s, match.strstart, 1); + functable.insert_string(s, match.strstart, 1); } } } @@ -73,9 +74,9 @@ static void insert_match(deflate_state *s, struct match match) { if (match.match_length > 0) { if (match.strstart >= match.orgstart) { if (match.strstart + match.match_length - 1 >= match.orgstart) { - insert_string(s, match.strstart, match.match_length); + functable.insert_string(s, match.strstart, match.match_length); } else { - insert_string(s, match.strstart, match.orgstart - 
match.strstart + 1); + functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } match.strstart += match.match_length; match.match_length = 0; @@ -94,7 +95,7 @@ static void insert_match(deflate_state *s, struct match match) { #ifdef NOT_TWEAK_COMPILER do { if (likely(match.strstart >= match.orgstart)) { - insert_string(s, match.strstart, 1); + functable.insert_string(s, match.strstart, 1); } match.strstart++; /* strstart never exceeds WSIZE-MAX_MATCH, so there are @@ -104,9 +105,9 @@ static void insert_match(deflate_state *s, struct match match) { #else if (likely(match.strstart >= match.orgstart)) { if (likely(match.strstart + match.match_length - 1 >= match.orgstart)) { - insert_string(s, match.strstart, match.match_length); + functable.insert_string(s, match.strstart, match.match_length); } else { - insert_string(s, match.strstart, match.orgstart - match.strstart + 1); + functable.insert_string(s, match.strstart, match.orgstart - match.strstart + 1); } } match.strstart += match.match_length; @@ -118,9 +119,9 @@ static void insert_match(deflate_state *s, struct match match) { s->ins_h = s->window[match.strstart]; if (match.strstart >= (MIN_MATCH - 2)) #ifndef NOT_TWEAK_COMPILER - insert_string(s, match.strstart + 2 - MIN_MATCH, MIN_MATCH - 2); + functable.insert_string(s, match.strstart + 2 - MIN_MATCH, MIN_MATCH - 2); #else - insert_string(s, match.strstart + 2 - MIN_MATCH, 1); + functable.insert_string(s, match.strstart + 2 - MIN_MATCH, 1); #if MIN_MATCH != 3 #warning Call insert_string() MIN_MATCH-3 more times #endif @@ -210,7 +211,7 @@ block_state deflate_medium(deflate_state *s, int flush) { * string following the next current_match. 
*/ if (s->lookahead < MIN_LOOKAHEAD) { - fill_window(s); + functable.fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } @@ -232,7 +233,7 @@ block_state deflate_medium(deflate_state *s, int flush) { } else { hash_head = 0; if (s->lookahead >= MIN_MATCH) { - hash_head = insert_string(s, s->strstart, 1); + hash_head = functable.insert_string(s, s->strstart, 1); } /* set up the initial match to be a 1 byte literal */ @@ -266,7 +267,7 @@ block_state deflate_medium(deflate_state *s, int flush) { /* now, look ahead one */ if (s->lookahead > MIN_LOOKAHEAD && (current_match.strstart + current_match.match_length) < (s->window_size - MIN_LOOKAHEAD)) { s->strstart = current_match.strstart + current_match.match_length; - hash_head = insert_string(s, s->strstart, 1); + hash_head = functable.insert_string(s, s->strstart, 1); /* set up the initial match to be a 1 byte literal */ next_match.match_start = 0; diff --git a/deflate_p.h b/deflate_p.h index 4860ff9a5..b0d06ad87 100644 --- a/deflate_p.h +++ b/deflate_p.h @@ -20,7 +20,6 @@ void check_match(deflate_state *s, IPos start, IPos match, int length); #else #define check_match(s, start, match, length) #endif -void fill_window(deflate_state *s); void flush_pending(z_stream *strm); /* =========================================================================== @@ -32,12 +31,6 @@ void flush_pending(z_stream *strm); * (except for the last MIN_MATCH-1 bytes of the input file). 
*/ -#ifdef X86_SSE4_2_CRC_HASH -extern Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count); -#elif defined(ARM_ACLE_CRC_HASH) -extern Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count); -#endif - static inline Pos insert_string_c(deflate_state *const s, const Pos str, unsigned int count) { Pos ret = 0; unsigned int idx; @@ -53,18 +46,6 @@ static inline Pos insert_string_c(deflate_state *const s, const Pos str, unsigne return ret; } -static inline Pos insert_string(deflate_state *const s, const Pos str, unsigned int count) { -#ifdef X86_SSE4_2_CRC_HASH - if (x86_cpu_has_sse42) - return insert_string_sse(s, str, count); -#endif -#if defined(ARM_ACLE_CRC_HASH) - return insert_string_acle(s, str, count); -#else - return insert_string_c(s, str, count); -#endif -} - /* =========================================================================== * Flush the current block, with given end-of-file flag. * IN assertion: strstart is set to the end of the current match. diff --git a/deflate_slow.c b/deflate_slow.c index c0be3eaa3..61c1888cd 100644 --- a/deflate_slow.c +++ b/deflate_slow.c @@ -7,6 +7,7 @@ #include "deflate.h" #include "deflate_p.h" #include "match.h" +#include "functable.h" /* =========================================================================== * Local data @@ -34,7 +35,7 @@ block_state deflate_slow(deflate_state *s, int flush) { * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { - fill_window(s); + functable.fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } @@ -47,7 +48,7 @@ block_state deflate_slow(deflate_state *s, int flush) { */ hash_head = NIL; if (s->lookahead >= MIN_MATCH) { - hash_head = insert_string(s, s->strstart, 1); + hash_head = functable.insert_string(s, s->strstart, 1); } /* Find the longest match, discarding those <= prev_length. 
@@ -97,7 +98,7 @@ block_state deflate_slow(deflate_state *s, int flush) { s->prev_length -= 2; do { if (++s->strstart <= max_insert) { - insert_string(s, s->strstart, 1); + functable.insert_string(s, s->strstart, 1); } } while (--s->prev_length != 0); s->match_available = 0; @@ -110,7 +111,7 @@ block_state deflate_slow(deflate_state *s, int flush) { if (unlikely(insert_cnt > max_insert - s->strstart)) insert_cnt = max_insert - s->strstart; - insert_string(s, s->strstart + 1, insert_cnt); + functable.insert_string(s, s->strstart + 1, insert_cnt); s->prev_length = 0; s->match_available = 0; s->match_length = MIN_MATCH-1; diff --git a/functable.c b/functable.c new file mode 100644 index 000000000..9887b99bc --- /dev/null +++ b/functable.c @@ -0,0 +1,51 @@ +/* functable.c -- Choose relevant optimized functions at runtime + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "functable.h" +#include "deflate.h" +#include "deflate_p.h" + +#if defined(X86_CPUID) +# include "arch/x86/x86.h" +#endif + +#ifdef X86_SSE4_2_CRC_HASH +extern Pos insert_string_sse(deflate_state *const s, const Pos str, unsigned int count); +#elif defined(ARM_ACLE_CRC_HASH) +extern Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count); +#endif + +#ifdef X86_SSE2_FILL_WINDOW +extern void fill_window_sse(deflate_state *s); +#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) +extern void fill_window_arm(deflate_state *s); +#endif + +/* ========================================================================= + * Initialize functable + */ +ZLIB_INTERNAL void functableInit() { + // Initialize defaults + functable.insert_string=&insert_string_c; + functable.fill_window=&fill_window_c; + + // insert_string + #ifdef X86_SSE4_2_CRC_HASH + if (x86_cpu_has_sse42) + functable.insert_string=&insert_string_sse; + #elif defined(ARM_ACLE_CRC_HASH) + 
functable.insert_string=&insert_string_acle; + #endif + + // fill_window + #ifdef X86_SSE2_FILL_WINDOW + # ifndef X86_NOCHECK_SSE2 + if (x86_cpu_has_sse2) + # endif + functable.fill_window=&fill_window_sse; + #elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) + functable.fill_window=&fill_window_arm; + #endif +} diff --git a/functable.h b/functable.h new file mode 100644 index 000000000..a867fa27e --- /dev/null +++ b/functable.h @@ -0,0 +1,18 @@ +/* functable.h -- Struct containing function pointers to optimized functions + * Copyright (C) 2017 Hans Kristian Rosbach + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef FUNCTABLE_H_ +#define FUNCTABLE_H_ + +#include "deflate.h" + +void functableInit(); + +struct functable_s { + void (* fill_window) (deflate_state *s); + Pos (* insert_string) (deflate_state *const s, const Pos str, unsigned int count); +} functable; + +#endif diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 8a34354db..d9b93b747 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -31,7 +31,8 @@ DEFFILE = zlib.def WITH_GZFILEOP = OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \ - infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj crc_pclmulqdq.obj + functable.obj infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj \ + x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj crc_pclmulqdq.obj !if "$(WITH_GZFILEOP)" != "" WFLAGS = $(WFLAGS) -DWITH_GZFILEOP OBJS = $(OBJS) gzclose.obj gzlib.obj gzread.obj gzwrite.obj