From: Adam Stylinski Date: Fri, 1 Apr 2022 23:02:05 +0000 (-0400) Subject: Create adler32_fold_c* functions X-Git-Tag: 2.1.0-beta1~244 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b1389ac2d5b0850742a10e9ff8243537d678f2f6;p=thirdparty%2Fzlib-ng.git Create adler32_fold_c* functions These are very simple wrappers that do nothing clever but serve as a shim interface for implementing versions which do cleverly track the number of scalar sums performed so that we can minimize rebasing and also have an efficient copy elision. This serves as the baseline as each vectorization gets its own commit. That way the PR will be bisectable. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 24fe98b3e..247af86d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -951,6 +951,7 @@ set(ZLIB_PRIVATE_HDRS ) set(ZLIB_SRCS adler32.c + adler32_fold.c chunkset.c compare256.c compress.c diff --git a/Makefile.in b/Makefile.in index 96bb93828..10decf5d7 100644 --- a/Makefile.in +++ b/Makefile.in @@ -75,6 +75,7 @@ pkgconfigdir = ${libdir}/pkgconfig OBJZ = \ adler32.o \ + adler32_fold.o \ chunkset.o \ compare256.o \ compress.o \ @@ -112,6 +113,7 @@ OBJC = $(OBJZ) $(OBJG) PIC_OBJZ = \ adler32.lo \ + adler32_fold.lo \ chunkset.lo \ compare256.lo \ compress.lo \ diff --git a/adler32_fold.c b/adler32_fold.c new file mode 100644 index 000000000..3f745cd9c --- /dev/null +++ b/adler32_fold.c @@ -0,0 +1,30 @@ +/* adler32_fold.c -- adler32 folding interface + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" +#include "functable.h" +#include "adler32_fold.h" + +Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler) { + /* So, for the "C" version, we'll just stash the value into nsums. + * This is mostly a compatibility shim, these functions in the functable + * will have more optimal versions that make use of adler and sum2.
In order + * to make each implementation bisectable, each new implementation will be a + * new commit */ + adler->nsums = init_adler; +} + +Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { + adler->nsums = functable.adler32(adler->nsums, src, len); + memcpy(dst, src, len); +} + +Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len) { + adler->nsums = functable.adler32(adler->nsums, src, len); +} + +Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler) { + return adler->nsums; +} diff --git a/adler32_fold.h b/adler32_fold.h new file mode 100644 index 000000000..d93a51053 --- /dev/null +++ b/adler32_fold.h @@ -0,0 +1,22 @@ +/* adler32_fold.h -- adler32 folding interface + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_FOLD_H_ +#define ADLER32_FOLD_H_ + +#include <stdint.h> + +typedef struct adler32_fold_s { + uint8_t adler[64]; // First half of component sums + uint8_t sum2[64]; // Second half of component sums + uint32_t nsums; // The number of scalar sums performed +} adler32_fold; + +Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler); +Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); +Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len); +Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler); + +#endif diff --git a/functable.c b/functable.c index 960c51f1b..317349f30 100644 --- a/functable.c +++ b/functable.c @@ -202,6 +202,26 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ return functable.adler32(adler, buf, len); } +Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) { + functable.adler32_fold_reset = &adler32_fold_reset_c; + functable.adler32_fold_reset(adler, init_adler); +} + +Z_INTERNAL void
adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { + functable.adler32_fold_copy = &adler32_fold_copy_c; + functable.adler32_fold_copy(adler, dst, src, len); +} + +Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) { + functable.adler32_fold = &adler32_fold_c; + functable.adler32_fold(adler, src, len); +} + +Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) { + functable.adler32_fold_final = &adler32_fold_final_c; + return functable.adler32_fold_final(adler); +} + Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) { functable.crc32_fold_reset = &crc32_fold_reset_c; cpu_check_features(); @@ -437,6 +457,10 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) { /* functable init */ Z_INTERNAL Z_TLS struct functable_s functable = { adler32_stub, + adler32_fold_reset_stub, + adler32_fold_copy_stub, + adler32_fold_stub, + adler32_fold_final_stub, crc32_stub, crc32_fold_reset_stub, crc32_fold_copy_stub, diff --git a/functable.h b/functable.h index 61dde2105..8889e74af 100644 --- a/functable.h +++ b/functable.h @@ -8,9 +8,14 @@ #include "deflate.h" #include "crc32_fold.h" +#include "adler32_fold.h" struct functable_s { uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len); + void (* adler32_fold_reset) (adler32_fold *adler, uint32_t init_adler); + void (* adler32_fold_copy) (adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); + void (* adler32_fold) (adler32_fold *adler, const uint8_t *src, size_t len); + uint32_t (* adler32_fold_final) (adler32_fold *adler); uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len); uint32_t (* crc32_fold_reset) (crc32_fold *crc); void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/inflate.c b/inflate.c index 8a48e7933..9e816a40d 100644 --- a/inflate.c +++ b/inflate.c @@ -27,8 +27,9 @@ static inline void 
inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, } else #endif { - strm->adler = state->check = functable.adler32(state->check, src, copy); - memcpy(dst, src, copy); + /*strm->adler = state->check = functable.adler32(state->check, src, copy); + memcpy(dst, src, copy);*/ + functable.adler32_fold_copy(&state->adler_fold, dst, src, copy); } } @@ -40,7 +41,8 @@ static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_ } else #endif { - strm->adler = state->check = functable.adler32(state->check, src, len); + //strm->adler = state->check = functable.adler32(state->check, src, len); + functable.adler32_fold(&state->adler_fold, src, len); } } @@ -463,6 +465,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { state->dmax = 1U << len; state->flags = 0; /* indicate zlib header */ Tracev((stderr, "inflate: zlib header ok\n")); + functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE); strm->adler = state->check = ADLER32_INITIAL_VALUE; state->mode = hold & 0x200 ? 
DICTID : TYPE; INITBITS(); @@ -609,7 +612,9 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #endif case DICTID: NEEDBITS(32); + //strm->adler = state->check = ZSWAP32(hold); strm->adler = state->check = ZSWAP32(hold); + functable.adler32_fold_reset(&state->adler_fold, strm->adler); INITBITS(); state->mode = DICT; @@ -619,6 +624,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { return Z_NEED_DICT; } strm->adler = state->check = ADLER32_INITIAL_VALUE; + functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE); state->mode = TYPE; case TYPE: @@ -1011,6 +1017,8 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #ifdef GUNZIP if (state->flags) strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold); + else + strm->adler = state->check = functable.adler32_fold_final(&state->adler_fold); #endif } out = left; diff --git a/inflate.h b/inflate.h index ffa2d0bd8..8c65f1dbe 100644 --- a/inflate.h +++ b/inflate.h @@ -12,6 +12,7 @@ #define INFLATE_H_ #include "crc32_fold.h" +#include "adler32_fold.h" /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. 
@@ -103,6 +104,7 @@ struct inflate_state { uint32_t wnext; /* window write index */ unsigned char *window; /* allocated sliding window, if needed */ + struct adler32_fold_s ALIGNED_(64) adler_fold; struct crc32_fold_s ALIGNED_(16) crc_fold; /* bit accumulator */ diff --git a/win32/Makefile.a64 b/win32/Makefile.a64 index b0d7993d5..5d5824bda 100644 --- a/win32/Makefile.a64 +++ b/win32/Makefile.a64 @@ -43,6 +43,7 @@ SUFFIX = OBJS = \ adler32.obj \ + adler32_fold.obj \ arm_features.obj \ chunkset.obj \ compare256.obj \ @@ -179,6 +180,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf SRCDIR = $(TOP) # Keep the dependences in sync with top-level Makefile.in adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h +adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 14df718d8..765bbb83b 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -47,6 +47,7 @@ SUFFIX = OBJS = \ adler32.obj \ + adler32_fold.obj \ arm_features.obj \ chunkset.obj \ compare256.obj \ @@ -192,6 +193,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf SRCDIR = $(TOP) # Keep the dependences in sync with top-level Makefile.in adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h +adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h gzlib.obj: $(SRCDIR)/gzlib.c 
$(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h gzread.obj: $(SRCDIR)/gzread.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 9caa8211c..b121cde22 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -48,6 +48,7 @@ SUFFIX = OBJS = \ adler32.obj \ + adler32_fold.obj \ chunkset.obj \ chunkset_avx.obj \ chunkset_sse2.obj \ @@ -181,6 +182,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf SRCDIR = $(TOP) # Keep the dependences in sync with top-level Makefile.in adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h +adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zendian.h $(SRCDIR)/arch/x86/x86_features.h gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h gzread.obj: $(SRCDIR)/gzread.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h $(SRCDIR)/zutil_p.h