improves performance of inflate by up to 6% on an A-73 Hikey running at 2.36 GHz
when executing the chromium benchmark on the snappy data set. In a few cases
inflate is slower by up to 0.8%. Overall performance of inflate is better by
about 0.3%.
#include "zbuild.h"
#include "zutil.h"
#include "functable.h"
+#include "adler32_p.h"
uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2);
-#define BASE 65521U /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define DO8(buf, i) DO4(buf, i); DO4(buf, i+4);
#define DO16(buf) DO8(buf, 0); DO8(buf, 8);
-/* use NO_DIVIDE if your processor does not do division in hardware --
- try it both ways to see which is faster */
-#ifdef NO_DIVIDE
-/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
- (thank you to John Reiser for pointing this out) */
-# define CHOP(a) \
- do { \
- uint32_t tmp = a >> 16; \
- a &= 0xffff; \
- a += (tmp << 4) - tmp; \
- } while (0)
-# define MOD28(a) \
- do { \
- CHOP(a); \
- if (a >= BASE) a -= BASE; \
- } while (0)
-# define MOD(a) \
- do { \
- CHOP(a); \
- MOD28(a); \
- } while (0)
-# define MOD63(a) \
- do { /* this assumes a is not negative */ \
- z_off64_t tmp = a >> 32; \
- a &= 0xffffffffL; \
- a += (tmp << 8) - (tmp << 5) + tmp; \
- tmp = a >> 16; \
- a &= 0xffffL; \
- a += (tmp << 4) - tmp; \
- tmp = a >> 16; \
- a &= 0xffffL; \
- a += (tmp << 4) - tmp; \
- if (a >= BASE) a -= BASE; \
- } while (0)
-#else
-# define MOD(a) a %= BASE
-# define MOD28(a) a %= BASE
-# define MOD63(a) a %= BASE
-#endif
-
/* ========================================================================= */
uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
uint32_t sum2;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
+ if (len == 1)
+ return adler32_len_1(adler, buf, sum2);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (buf == NULL)
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- while (len) {
- --len;
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD28(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
+ if (len < 16)
+ return adler32_len_16(adler, buf, len, sum2);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
--- /dev/null
+/* adler32_p.h -- Private inline functions and macros shared with
+ * different computation of the Adler-32 checksum
+ * of a data stream.
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_P_H
+#define ADLER32_P_H
+
+#define BASE 65521U /* largest prime smaller than 65536 */
+
+/* use NO_DIVIDE if your processor does not do division in hardware --
+ try it both ways to see which is faster */
+#ifdef NO_DIVIDE
+/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
+ (thank you to John Reiser for pointing this out) */
+# define CHOP(a) \
+ do { \
+ uint32_t tmp = a >> 16; \
+ a &= 0xffff; \
+ a += (tmp << 4) - tmp; \
+ } while (0)
+# define MOD28(a) \
+ do { \
+ CHOP(a); \
+ if (a >= BASE) a -= BASE; \
+ } while (0)
+# define MOD(a) \
+ do { \
+ CHOP(a); \
+ MOD28(a); \
+ } while (0)
+# define MOD63(a) \
+ do { /* this assumes a is not negative */ \
+ z_off64_t tmp = a >> 32; \
+ a &= 0xffffffffL; \
+ a += (tmp << 8) - (tmp << 5) + tmp; \
+ tmp = a >> 16; \
+ a &= 0xffffL; \
+ a += (tmp << 4) - tmp; \
+ tmp = a >> 16; \
+ a &= 0xffffL; \
+ a += (tmp << 4) - tmp; \
+ if (a >= BASE) a -= BASE; \
+ } while (0)
+#else
+# define MOD(a) a %= BASE
+# define MOD28(a) a %= BASE
+# define MOD63(a) a %= BASE
+#endif
+
+static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) {
+ adler += buf[0];
+ if (adler >= BASE)
+ adler -= BASE;
+ sum2 += adler;
+ if (sum2 >= BASE)
+ sum2 -= BASE;
+ return adler | (sum2 << 16);
+}
+
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+ while (len) {
+ --len;
+ adler += *buf++;
+ sum2 += adler;
+ }
+ if (adler >= BASE)
+ adler -= BASE;
+ MOD28(sum2); /* only added so many BASE's */
+ return adler | (sum2 << 16);
+}
+
+#endif /* ADLER32_P_H */
#include "adler32_neon.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
+#include "adler32_p.h"
static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
static const uint8_t taps[32] = {
}
uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
- if (!buf)
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (len == 1)
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (buf == NULL)
return 1L;
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (len < 16)
+ return adler32_len_16(adler, buf, len, sum2);
+
/* The largest prime smaller than 65536. */
const uint32_t M_BASE = 65521;
/* This is the threshold where doing accumulation may overflow. */
const int M_NMAX = 5552;
- uint32_t sum2;
uint32_t pair[2];
int n = M_NMAX;
unsigned int done = 0;
/* Split Adler-32 into component sums, it can be supplied by
* the caller sites (e.g. in a PNG file).
*/
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
pair[0] = adler;
pair[1] = sum2;
SRCDIR = $(TOP)
# Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
SRCDIR = $(TOP)
# Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h