From: Nathan Moinvaziri Date: Mon, 29 Aug 2022 03:27:37 +0000 (-0700) Subject: Use arch-specific versions of inflate_fast. X-Git-Tag: 2.1.0-beta1~71 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=aa1109bb2ee79680b0ac966b1de8724ba5083093;p=thirdparty%2Fzlib-ng.git Use arch-specific versions of inflate_fast. This should reduce the cost of indirection that occurs when calling functable chunk copying functions inside inflate_fast. It should also allow the compiler to optimize the inflate fast path for the specific architecture. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f667e8f..48e09d52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -967,7 +967,7 @@ set(ZLIB_PRIVATE_HDRS deflate.h deflate_p.h functable.h - inffast.h + inffast_tpl.h inffixed_tbl.h inflate.h inflate_p.h @@ -1001,7 +1001,6 @@ set(ZLIB_SRCS deflate_stored.c functable.c infback.c - inffast.c inflate.c inftrees.c insert_string.c diff --git a/Makefile.in b/Makefile.in index 98b9042e..4bfb6313 100644 --- a/Makefile.in +++ b/Makefile.in @@ -93,7 +93,6 @@ OBJZ = \ deflate_stored.o \ functable.o \ infback.o \ - inffast.o \ inflate.o \ inftrees.o \ insert_string.o \ @@ -132,7 +131,6 @@ PIC_OBJZ = \ deflate_stored.lo \ functable.lo \ infback.lo \ - inffast.lo \ inflate.lo \ inftrees.lo \ insert_string.lo \ diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 2c64ce59..b119f212 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -94,4 +94,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t #include "chunkset_tpl.h" +#define INFLATE_FAST inflate_fast_neon + +#include "inffast_tpl.h" + #endif diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c index 83928308..abc5f5e2 100644 --- a/arch/power/chunkset_power8.c +++ b/arch/power/chunkset_power8.c @@ -48,4 +48,8 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { #include "chunkset_tpl.h" +#define INFLATE_FAST inflate_fast_power8 + +#include "inffast_tpl.h" + #endif diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index c4a4d9b0..e128e8f7 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -128,4 +128,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t #include "chunkset_tpl.h" +#define INFLATE_FAST inflate_fast_avx + +#include "inffast_tpl.h" + #endif diff --git a/arch/x86/chunkset_sse2.c b/arch/x86/chunkset_sse2.c index eddf5d98..c402c0ee 100644 --- a/arch/x86/chunkset_sse2.c +++ b/arch/x86/chunkset_sse2.c @@ -49,4 +49,8 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { #include "chunkset_tpl.h" +#define INFLATE_FAST inflate_fast_sse2 + +#include "inffast_tpl.h" + #endif diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_sse41.c index c148db09..9a955885 100644 --- a/arch/x86/chunkset_sse41.c +++ b/arch/x86/chunkset_sse41.c @@ -95,4 +95,8 @@ extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); #include "chunkset_tpl.h" +#define INFLATE_FAST inflate_fast_sse41 + +#include "inffast_tpl.h" + #endif diff --git a/chunkset.c b/chunkset.c index 169e4112..7b2bb7ba 100644 --- a/chunkset.c +++ b/chunkset.c @@ -36,3 +36,7 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { #define CHUNKMEMSET_SAFE chunkmemset_safe_c #include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_c + +#include "inffast_tpl.h" diff --git a/cpu_features.h b/cpu_features.h index 72e40a16..d211cb11 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -110,6 +110,24 @@ extern uint8_t* chunkmemset_power8(uint8_t *out, unsigned dist, unsigned len); extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif +/* inflate fast loop */ +extern void inflate_fast_c(void *strm, uint32_t start); +#ifdef X86_SSE2_CHUNKSET +extern void inflate_fast_sse2(void *strm, uint32_t start); +#endif +#ifdef X86_SSE41 +extern void inflate_fast_sse41(void *strm, uint32_t start); +#endif +#ifdef X86_AVX_CHUNKSET +extern void inflate_fast_avx(void *strm, uint32_t start); +#endif +#ifdef ARM_NEON_CHUNKSET +extern void inflate_fast_neon(void *strm, uint32_t start); +#endif +#ifdef POWER8_VSX_CHUNKSET +extern void inflate_fast_power8(void *strm, uint32_t start); +#endif + /* CRC32 */ typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len); diff --git a/functable.c b/functable.c index 35474878..1bdba153 100644 --- a/functable.c +++ b/functable.c @@ -482,6 +482,35 @@ static uint32_t compare256_stub(const uint8_t* src0, const uint8_t* src1) { return functable.compare256(src0, src1); } +Z_INTERNAL void inflate_fast_stub(void *strm, uint32_t start) { + functable.inflate_fast = &inflate_fast_c; + +#ifdef X86_SSE2_CHUNKSET +# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2) + if (x86_cpu_has_sse2) +# endif + functable.inflate_fast = &inflate_fast_sse2; +#endif +#if defined(X86_SSE41) && defined(X86_SSE2) + if (x86_cpu_has_sse41) + functable.inflate_fast = &inflate_fast_sse41; +#endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.inflate_fast = &inflate_fast_avx; +#endif +#ifdef ARM_NEON_CHUNKSET + if (arm_cpu_has_neon) + functable.inflate_fast = &inflate_fast_neon; +#endif +#ifdef POWER8_VSX_CHUNKSET + if (power_cpu_has_arch_2_07) + functable.inflate_fast = &inflate_fast_power8; +#endif + + functable.inflate_fast(strm, start); +} + /* functable init */ Z_INTERNAL Z_TLS struct functable_s functable = { adler32_stub, @@ -497,6 +526,7 @@ Z_INTERNAL Z_TLS struct functable_s functable = { chunkunroll_stub, chunkmemset_stub, chunkmemset_safe_stub, + inflate_fast_stub, insert_string_stub, longest_match_stub, longest_match_slow_stub, diff --git a/functable.h b/functable.h index 531f3a1c..4319b4c1 100644 --- a/functable.h +++ b/functable.h @@ -24,6 +24,7 @@ struct functable_s { uint8_t* (* chunkunroll) (uint8_t *out, unsigned *dist, unsigned *len); uint8_t* (* chunkmemset) (uint8_t *out, unsigned dist, unsigned len); uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left); + void (* inflate_fast) (void *strm, uint32_t start); void (* insert_string) (deflate_state *const s, uint32_t str, uint32_t count); uint32_t (* longest_match) (deflate_state *const s, Pos cur_match); uint32_t (* longest_match_slow) (deflate_state *const s, Pos cur_match); diff --git a/infback.c b/infback.c index a7af88f2..7d4d8892 100644 --- a/infback.c +++ b/infback.c @@ -14,7 +14,6 @@ #include "zutil.h" #include "inftrees.h" #include "inflate.h" -#include "inffast.h" #include "inflate_p.h" #include "functable.h" @@ -358,7 +357,7 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in RESTORE(); if (state->whave < state->wsize) state->whave = state->wsize - left; - zng_inflate_fast(strm, state->wsize); + functable.inflate_fast(strm, state->wsize); LOAD(); break; } diff --git a/inffast.h b/inffast.h deleted file mode 100644 index 179a65da..00000000 --- a/inffast.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INFFAST_H_ -#define INFFAST_H_ -/* inffast.h -- header to use inffast.c - * Copyright (C) 1995-2003, 2010 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* WARNING: this file should *not* be used by applications. It is - part of the implementation of the compression library and is - subject to change. Applications should only use zlib.h. - */ - -void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start); - -#define INFLATE_FAST_MIN_HAVE 8 -#define INFLATE_FAST_MIN_LEFT 258 - -#endif /* INFFAST_H_ */ diff --git a/inffast.c b/inffast_tpl.h similarity index 95% rename from inffast.c rename to inffast_tpl.h index bfb1c831..6977a560 100644 --- a/inffast.c +++ b/inffast_tpl.h @@ -4,24 +4,13 @@ */ #include "zbuild.h" +#include "zendian.h" #include "zutil.h" #include "inftrees.h" #include "inflate.h" -#include "inffast.h" #include "inflate_p.h" #include "functable.h" -/* Load 64 bits from IN and place the bytes at offset BITS in the result. */ -static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { - uint64_t chunk; - memcpy(&chunk, in, sizeof(chunk)); - -#if BYTE_ORDER == LITTLE_ENDIAN - return chunk << bits; -#else - return ZSWAP64(chunk) << bits; -#endif -} /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is @@ -61,7 +50,7 @@ static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { requires strm->avail_out >= 258 for each loop to avoid checking for output space. */ -void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start) { +void Z_INTERNAL INFLATE_FAST(PREFIX3(stream) *strm, uint32_t start) { /* start: inflate()'s starting value for strm->avail_out */ struct inflate_state *state; z_const unsigned char *in; /* local strm->next_in */ @@ -259,7 +248,7 @@ void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start) { if (op < len) { /* still need some from output */ len -= op; out = chunkcopy_safe(out, from, op, safe); - out = functable.chunkunroll(out, &dist, &len); + out = CHUNKUNROLL(out, &dist, &len); out = chunkcopy_safe(out, out - dist, len, safe); } else { out = chunkcopy_safe(out, from, len, safe); @@ -269,7 +258,7 @@ void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start) { if (dist >= len || dist >= state->chunksize) out = chunkcopy_safe(out, out - dist, len, safe); else - out = functable.chunkmemset_safe(out, dist, len, (unsigned)((safe - out) + 1)); + out = CHUNKMEMSET_SAFE(out, dist, len, (unsigned)((safe - out) + 1)); } else { /* Whole reference is in range of current output. No range checks are necessary because we start with room for at least 258 bytes of output, @@ -277,9 +266,9 @@ void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm, unsigned long start) { as they stay within 258 bytes of `out`. */ if (dist >= len || dist >= state->chunksize) - out = functable.chunkcopy(out, out - dist, len); + out = CHUNKCOPY(out, out - dist, len); else - out = functable.chunkmemset(out, dist, len); + out = CHUNKMEMSET(out, dist, len); } } else if ((op & 64) == 0) { /* 2nd level distance code */ here = dcode + here->val + BITS(op); diff --git a/inflate.c b/inflate.c index 79491209..eaf78ebf 100644 --- a/inflate.c +++ b/inflate.c @@ -8,7 +8,6 @@ #include "cpu_features.h" #include "inftrees.h" #include "inflate.h" -#include "inffast.h" #include "inflate_p.h" #include "inffixed_tbl.h" #include "functable.h" @@ -866,7 +865,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { /* use inflate_fast() if we have enough input and output */ if (have >= INFLATE_FAST_MIN_HAVE && left >= INFLATE_FAST_MIN_LEFT) { RESTORE(); - zng_inflate_fast(strm, out); + functable.inflate_fast(strm, out); LOAD(); if (state->mode == TYPE) state->back = -1; diff --git a/inflate_p.h b/inflate_p.h index 27b61027..2b57b317 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -132,6 +132,21 @@ strm->msg = (char *)errmsg; \ } while (0) +#define INFLATE_FAST_MIN_HAVE 8 +#define INFLATE_FAST_MIN_LEFT 258 + +/* Load 64 bits from IN and place the bytes at offset BITS in the result. */ +static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { + uint64_t chunk; + memcpy(&chunk, in, sizeof(chunk)); + +#if BYTE_ORDER == LITTLE_ENDIAN + return chunk << bits; +#else + return ZSWAP64(chunk) << bits; +#endif +} + /* Behave like chunkcopy, but avoid writing beyond of legal output. */ static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) { uint64_t safelen = (safe - out) + 1; diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index efcb0a64..8537bd5f 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -64,7 +64,6 @@ OBJS = \ infback.obj \ inflate.obj \ inftrees.obj \ - inffast.obj \ insert_string.obj \ insert_string_roll.obj \ slide_hash.obj \ @@ -200,9 +199,8 @@ deflate_medium.obj: $(SRCDIR)/deflate_medium.c $(SRCDIR)/zbuild.h $(SRCDIR)/defl deflate_rle.obj: $(SRCDIR)/deflate_rle.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_stored.obj: $(SRCDIR)/deflate_stored.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h -infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h -inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h -inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h +infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h +inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h slide_hash.obj: $(SRCDIR)/slide_hash.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h slide_hash_neon.obj: $(SRCDIR)/arch/arm/slide_hash_neon.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 34dff1a7..58a7fc86 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -68,7 +68,6 @@ OBJS = \ infback.obj \ inflate.obj \ inftrees.obj \ - inffast.obj \ insert_string.obj \ insert_string_roll.obj \ slide_hash.obj \ @@ -213,9 +212,8 @@ deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflat deflate_rle.obj: $(SRCDIR)/deflate_rle.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_stored.obj: $(SRCDIR)/deflate_stored.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h -infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h -inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h -inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h +infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h +inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h slide_hash.obj: $(SRCDIR)/slide_hash.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h trees.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/trees_tbl.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 445b673b..9c00737a 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -81,7 +81,6 @@ OBJS = \ infback.obj \ inflate.obj \ inftrees.obj \ - inffast.obj \ insert_string.obj \ insert_string_roll.obj \ insert_string_sse42.obj \ @@ -220,9 +219,8 @@ deflate_quick.obj: $(SRCDIR)/deflate_quick.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflat deflate_rle.obj: $(SRCDIR)/deflate_rle.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_slow.obj: $(SRCDIR)/deflate_slow.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_stored.obj: $(SRCDIR)/deflate_stored.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h -infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h -inffast.obj: $(SRCDIR)/inffast.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h -inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inffast.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h +infback.obj: $(SRCDIR)/infback.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h +inflate.obj: $(SRCDIR)/inflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h $(SRCDIR)/inflate.h $(SRCDIR)/inflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h $(SRCDIR)/functable.h inftrees.obj: $(SRCDIR)/inftrees.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/inftrees.h slide_hash.obj: $(SRCDIR)/slide_hash.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h slide_hash_avx2.obj: $(SRCDIR)/arch/x86/slide_hash_avx2.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h