From 3ac4f5de069564698c263097996669e6153286f3 Mon Sep 17 00:00:00 2001 From: Sebastian Pop Date: Mon, 28 Jan 2019 16:05:50 -0600 Subject: [PATCH] only call NEON adler32 for more than 16 bytes improves performance of inflate by up to 6% on an A-73 Hikey running at 2.36 GHz when executing the chromium benchmark on the snappy data set. In a few cases inflate is slower by up to 0.8%. Overall performance of inflate is better by about 0.3%. --- adler32.c | 66 +++--------------------------------- adler32_p.h | 75 +++++++++++++++++++++++++++++++++++++++++ arch/arm/adler32_neon.c | 19 ++++++++--- win32/Makefile.arm | 2 +- win32/Makefile.msc | 2 +- 5 files changed, 97 insertions(+), 67 deletions(-) create mode 100644 adler32_p.h diff --git a/adler32.c b/adler32.c index 651d73c5..feff67bc 100644 --- a/adler32.c +++ b/adler32.c @@ -8,11 +8,11 @@ #include "zbuild.h" #include "zutil.h" #include "functable.h" +#include "adler32_p.h" uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2); -#define BASE 65521U /* largest prime smaller than 65536 */ #define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ @@ -22,46 +22,6 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len #define DO8(buf, i) DO4(buf, i); DO4(buf, i+4); #define DO16(buf) DO8(buf, 0); DO8(buf, 8); -/* use NO_DIVIDE if your processor does not do division in hardware -- - try it both ways to see which is faster */ -#ifdef NO_DIVIDE -/* note that this assumes BASE is 65521, where 65536 % 65521 == 15 - (thank you to John Reiser for pointing this out) */ -# define CHOP(a) \ - do { \ - uint32_t tmp = a >> 16; \ - a &= 0xffff; \ - a += (tmp << 4) - tmp; \ - } while (0) -# define MOD28(a) \ - do { \ - CHOP(a); \ - if (a >= BASE) a -= BASE; \ - } while (0) -# define MOD(a) \ - do { \ - CHOP(a); \ - MOD28(a); \ - } while (0) -# define MOD63(a) \ - do { /* this 
assumes a is not negative */ \ - z_off64_t tmp = a >> 32; \ - a &= 0xffffffffL; \ - a += (tmp << 8) - (tmp << 5) + tmp; \ - tmp = a >> 16; \ - a &= 0xffffL; \ - a += (tmp << 4) - tmp; \ - tmp = a >> 16; \ - a &= 0xffffL; \ - a += (tmp << 4) - tmp; \ - if (a >= BASE) a -= BASE; \ - } while (0) -#else -# define MOD(a) a %= BASE -# define MOD28(a) a %= BASE -# define MOD63(a) a %= BASE -#endif - /* ========================================================================= */ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) { uint32_t sum2; @@ -72,32 +32,16 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) { adler &= 0xffff; /* in case user likes doing a byte at a time, keep it fast */ - if (len == 1) { - adler += buf[0]; - if (adler >= BASE) - adler -= BASE; - sum2 += adler; - if (sum2 >= BASE) - sum2 -= BASE; - return adler | (sum2 << 16); - } + if (len == 1) + return adler32_len_1(adler, buf, sum2); /* initial Adler-32 value (deferred check for len == 1 speed) */ if (buf == NULL) return 1L; /* in case short lengths are provided, keep it somewhat fast */ - if (len < 16) { - while (len) { - --len; - adler += *buf++; - sum2 += adler; - } - if (adler >= BASE) - adler -= BASE; - MOD28(sum2); /* only added so many BASE's */ - return adler | (sum2 << 16); - } + if (len < 16) + return adler32_len_16(adler, buf, len, sum2); /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { diff --git a/adler32_p.h b/adler32_p.h new file mode 100644 index 00000000..131513a8 --- /dev/null +++ b/adler32_p.h @@ -0,0 +1,75 @@ +/* adler32_p.h -- Private inline functions and macros shared with + * different computation of the Adler-32 checksum + * of a data stream. 
+ * Copyright (C) 1995-2011, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_P_H +#define ADLER32_P_H + +#define BASE 65521U /* largest prime smaller than 65536 */ + +/* use NO_DIVIDE if your processor does not do division in hardware -- + try it both ways to see which is faster */ +#ifdef NO_DIVIDE +/* note that this assumes BASE is 65521, where 65536 % 65521 == 15 + (thank you to John Reiser for pointing this out) */ +# define CHOP(a) \ + do { \ + uint32_t tmp = a >> 16; \ + a &= 0xffff; \ + a += (tmp << 4) - tmp; \ + } while (0) +# define MOD28(a) \ + do { \ + CHOP(a); \ + if (a >= BASE) a -= BASE; \ + } while (0) +# define MOD(a) \ + do { \ + CHOP(a); \ + MOD28(a); \ + } while (0) +# define MOD63(a) \ + do { /* this assumes a is not negative */ \ + z_off64_t tmp = a >> 32; \ + a &= 0xffffffffL; \ + a += (tmp << 8) - (tmp << 5) + tmp; \ + tmp = a >> 16; \ + a &= 0xffffL; \ + a += (tmp << 4) - tmp; \ + tmp = a >> 16; \ + a &= 0xffffL; \ + a += (tmp << 4) - tmp; \ + if (a >= BASE) a -= BASE; \ + } while (0) +#else +# define MOD(a) a %= BASE +# define MOD28(a) a %= BASE +# define MOD63(a) a %= BASE +#endif + +static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) { + adler += buf[0]; + if (adler >= BASE) + adler -= BASE; + sum2 += adler; + if (sum2 >= BASE) + sum2 -= BASE; + return adler | (sum2 << 16); +} + +static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) { + while (len) { + --len; + adler += *buf++; + sum2 += adler; + } + if (adler >= BASE) + adler -= BASE; + MOD28(sum2); /* only added so many BASE's */ + return adler | (sum2 << 16); +} + +#endif /* ADLER32_P_H */ diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c index 8d845a48..f8573ec5 100644 --- a/arch/arm/adler32_neon.c +++ b/arch/arm/adler32_neon.c @@ -19,6 +19,7 @@ #include "adler32_neon.h" #if defined(__ARM_NEON__) || 
defined(__ARM_NEON) #include <arm_neon.h> +#include "adler32_p.h" static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) { static const uint8_t taps[32] = { @@ -80,15 +81,27 @@ static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t le } uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) { - if (!buf) + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == NULL) return 1L; + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) + return adler32_len_16(adler, buf, len, sum2); + /* The largest prime smaller than 65536. */ const uint32_t M_BASE = 65521; /* This is the threshold where doing accumulation may overflow. */ const int M_NMAX = 5552; - uint32_t sum2; uint32_t pair[2]; int n = M_NMAX; unsigned int done = 0; @@ -98,8 +111,6 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) { /* Split Adler-32 into component sums, it can be supplied by * the caller sites (e.g. in a PNG file). 
*/ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; pair[0] = adler; pair[1] = sum2; diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 3a487b2f..0b3f9406 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -117,7 +117,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf SRCDIR = $(TOP) # Keep the dependences in sync with top-level Makefile.in -adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h +adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 0c84a14a..9f4b884c 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -108,7 +108,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf SRCDIR = $(TOP) # Keep the dependences in sync with top-level Makefile.in -adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h +adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h -- 2.47.2