From: Sebastian Pop Date: Fri, 1 Mar 2019 15:45:33 +0000 (-0600) Subject: move INFFAST_CHUNKSIZE code to memcopy.h X-Git-Tag: 1.9.9-b1~510 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93dbe3caeee3626fe653a92fa5e2440105d57170;p=thirdparty%2Fzlib-ng.git move INFFAST_CHUNKSIZE code to memcopy.h --- diff --git a/inffast.c b/inffast.c index af623eb63..045242afa 100644 --- a/inffast.c +++ b/inffast.c @@ -21,87 +21,6 @@ bits -= (unsigned)(n); \ } while (0) -#ifdef INFFAST_CHUNKSIZE -/* - Ask the compiler to perform a wide, unaligned load with an machine - instruction appropriate for the inffast_chunk_t type. - */ -static inline inffast_chunk_t loadchunk(unsigned char const* s) { - inffast_chunk_t c; - memcpy(&c, s, sizeof(c)); - return c; -} - -/* - Ask the compiler to perform a wide, unaligned store with an machine - instruction appropriate for the inffast_chunk_t type. - */ -static inline void storechunk(unsigned char* d, inffast_chunk_t c) { - memcpy(d, &c, sizeof(c)); -} - -/* - Behave like memcpy, but assume that it's OK to overwrite at least - INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this, - that the length is non-zero, and that `from` lags `out` by at least - INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that - the distance is less than the length of the copy). - - Aside from better memory bus utilisation, this means that short copies - (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop - without iteration, which will hopefully make the branch prediction more - reliable.
- */ -static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) { - --len; - storechunk(out, loadchunk(from)); - out += (len % INFFAST_CHUNKSIZE) + 1; - from += (len % INFFAST_CHUNKSIZE) + 1; - len /= INFFAST_CHUNKSIZE; - while (len-- > 0) { - storechunk(out, loadchunk(from)); - out += INFFAST_CHUNKSIZE; - from += INFFAST_CHUNKSIZE; - } - return out; -} - -/* - Behave like chunkcopy, but avoid writing beyond of legal output. - */ -static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len, - unsigned char *safe) { - if (out > safe) { - while (len-- > 0) { - *out++ = *from++; - } - return out; - } - return chunkcopy(out, from, len); -} - -/* - Perform short copies until distance can be rewritten as being at least - INFFAST_CHUNKSIZE. - - This assumes that it's OK to overwrite at least the first - 2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this. - This assumption holds because inflate_fast() starts every iteration with at - least 258 bytes of output space available (258 being the maximum length - output from a single token; see inflate_fast()'s assumptions below). 
- */ -static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) { - unsigned char const *from = out - *dist; - while (*dist < *len && *dist < INFFAST_CHUNKSIZE) { - storechunk(out, loadchunk(from)); - out += *dist; - *len -= *dist; - *dist += *dist; - } - return out; -} -#endif - /* Decode literal, length, and distance codes and write out the resulting literal and match bytes until either not enough input or output is diff --git a/inffast.h b/inffast.h index d2a9a72e8..9ad21abc8 100644 --- a/inffast.h +++ b/inffast.h @@ -12,13 +12,6 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start); - -#if (defined(__GNUC__) || defined(__clang__)) && defined(__ARM_NEON__) -# include -typedef uint8x16_t inffast_chunk_t; -# define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t) -#endif - #define INFLATE_FAST_MIN_HAVE 8 #define INFLATE_FAST_MIN_LEFT 258 diff --git a/memcopy.h b/memcopy.h index a5db415c4..e9889a992 100644 --- a/memcopy.h +++ b/memcopy.h @@ -16,6 +16,93 @@ static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { #endif } +#if (defined(__GNUC__) || defined(__clang__)) && defined(__ARM_NEON__) +# include +typedef uint8x16_t inffast_chunk_t; +# define INFFAST_CHUNKSIZE sizeof(inffast_chunk_t) +#endif + +#ifdef INFFAST_CHUNKSIZE +/* + Ask the compiler to perform a wide, unaligned load with an machine + instruction appropriate for the inffast_chunk_t type. + */ +static inline inffast_chunk_t loadchunk(unsigned char const* s) { + inffast_chunk_t c; + memcpy(&c, s, sizeof(c)); + return c; +} + +/* + Ask the compiler to perform a wide, unaligned store with an machine + instruction appropriate for the inffast_chunk_t type. 
+ */ +static inline void storechunk(unsigned char* d, inffast_chunk_t c) { + memcpy(d, &c, sizeof(c)); +} + +/* + Behave like memcpy, but assume that it's OK to overwrite at least + INFFAST_CHUNKSIZE bytes of output even if the length is shorter than this, + that the length is non-zero, and that `from` lags `out` by at least + INFFAST_CHUNKSIZE bytes (or that they don't overlap at all or simply that + the distance is less than the length of the copy). + + Aside from better memory bus utilisation, this means that short copies + (INFFAST_CHUNKSIZE bytes or fewer) will fall straight through the loop + without iteration, which will hopefully make the branch prediction more + reliable. + */ +static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const *from, unsigned len) { + --len; + storechunk(out, loadchunk(from)); + out += (len % INFFAST_CHUNKSIZE) + 1; + from += (len % INFFAST_CHUNKSIZE) + 1; + len /= INFFAST_CHUNKSIZE; + while (len-- > 0) { + storechunk(out, loadchunk(from)); + out += INFFAST_CHUNKSIZE; + from += INFFAST_CHUNKSIZE; + } + return out; +} + +/* + Behave like chunkcopy, but avoid writing beyond of legal output. + */ +static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len, + unsigned char *safe) { + if (out > safe) { + while (len-- > 0) { + *out++ = *from++; + } + return out; + } + return chunkcopy(out, from, len); +} + +/* + Perform short copies until distance can be rewritten as being at least + INFFAST_CHUNKSIZE. + + This assumes that it's OK to overwrite at least the first + 2*INFFAST_CHUNKSIZE bytes of output even if the copy is shorter than this. + This assumption holds because inflate_fast() starts every iteration with at + least 258 bytes of output space available (258 being the maximum length + output from a single token; see inflate_fast()'s assumptions below). 
+ */ +static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, unsigned *len) { + unsigned char const *from = out - *dist; + while (*dist < *len && *dist < INFFAST_CHUNKSIZE) { + storechunk(out, loadchunk(from)); + out += *dist; + *len -= *dist; + *dist += *dist; + } + return out; +} +#endif + static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) { *out++ = *from; return out;