From 3ac4f5de069564698c263097996669e6153286f3 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <s.pop@samsung.com>
Date: Mon, 28 Jan 2019 16:05:50 -0600
Subject: [PATCH] only call NEON adler32 for more than 16 bytes

Improves performance of inflate by up to 6% on an A-73 Hikey running at 2.36 GHz
when executing the Chromium benchmark on the Snappy data set.  In a few cases
inflate is slower by up to 0.8%.  Overall, inflate performance improves by
about 0.3%.
---
 adler32.c               | 66 +++---------------------------------
 adler32_p.h             | 75 +++++++++++++++++++++++++++++++++++++++++
 arch/arm/adler32_neon.c | 19 ++++++++---
 win32/Makefile.arm      |  2 +-
 win32/Makefile.msc      |  2 +-
 5 files changed, 97 insertions(+), 67 deletions(-)
 create mode 100644 adler32_p.h

diff --git a/adler32.c b/adler32.c
index 651d73c5..feff67bc 100644
--- a/adler32.c
+++ b/adler32.c
@@ -8,11 +8,11 @@
 #include "zbuild.h"
 #include "zutil.h"
 #include "functable.h"
+#include "adler32_p.h"
 
 uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
 static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2);
 
-#define BASE 65521U     /* largest prime smaller than 65536 */
 #define NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
 
@@ -22,46 +22,6 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len
 #define DO8(buf, i)  DO4(buf, i); DO4(buf, i+4);
 #define DO16(buf)    DO8(buf, 0); DO8(buf, 8);
 
-/* use NO_DIVIDE if your processor does not do division in hardware --
-   try it both ways to see which is faster */
-#ifdef NO_DIVIDE
-/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
-   (thank you to John Reiser for pointing this out) */
-#  define CHOP(a) \
-    do { \
-        uint32_t tmp = a >> 16; \
-        a &= 0xffff; \
-        a += (tmp << 4) - tmp; \
-    } while (0)
-#  define MOD28(a) \
-    do { \
-        CHOP(a); \
-        if (a >= BASE) a -= BASE; \
-    } while (0)
-#  define MOD(a) \
-    do { \
-        CHOP(a); \
-        MOD28(a); \
-    } while (0)
-#  define MOD63(a) \
-    do { /* this assumes a is not negative */ \
-        z_off64_t tmp = a >> 32; \
-        a &= 0xffffffffL; \
-        a += (tmp << 8) - (tmp << 5) + tmp; \
-        tmp = a >> 16; \
-        a &= 0xffffL; \
-        a += (tmp << 4) - tmp; \
-        tmp = a >> 16; \
-        a &= 0xffffL; \
-        a += (tmp << 4) - tmp; \
-        if (a >= BASE) a -= BASE; \
-    } while (0)
-#else
-#  define MOD(a) a %= BASE
-#  define MOD28(a) a %= BASE
-#  define MOD63(a) a %= BASE
-#endif
-
 /* ========================================================================= */
 uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
@@ -72,32 +32,16 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
     adler &= 0xffff;
 
     /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
 
     /* initial Adler-32 value (deferred check for len == 1 speed) */
     if (buf == NULL)
         return 1L;
 
     /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len) {
-            --len;
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2);            /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
 
     /* do length NMAX blocks -- requires just one modulo operation */
     while (len >= NMAX) {
diff --git a/adler32_p.h b/adler32_p.h
new file mode 100644
index 00000000..131513a8
--- /dev/null
+++ b/adler32_p.h
@@ -0,0 +1,75 @@
+/* adler32_p.h -- Private inline functions and macros shared with
+ *                different computation of the Adler-32 checksum
+ *                of a data stream.
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_P_H
+#define ADLER32_P_H
+
+#define BASE 65521U     /* largest prime smaller than 65536 */
+
+/* Define NO_DIVIDE if your processor has no hardware divide: MOD, MOD28 and
+   MOD63 then use shifts/adds instead of %.  Try both to see which is faster. */
+#ifdef NO_DIVIDE
+/* note that this assumes BASE is 65521, where 65536 % 65521 == 15, hence the
+   (tmp << 4) - tmp == 15 * tmp terms (thank you to John Reiser for pointing this out) */
+#  define CHOP(a) \
+    do { \
+        uint32_t tmp = a >> 16; \
+        a &= 0xffff; \
+        a += (tmp << 4) - tmp; \
+    } while (0)
+#  define MOD28(a) \
+    do { \
+        CHOP(a); \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#  define MOD(a) \
+    do { \
+        CHOP(a); \
+        MOD28(a); \
+    } while (0)
+#  define MOD63(a) \
+    do { /* this assumes a is not negative */ \
+        z_off64_t tmp = a >> 32; \
+        a &= 0xffffffffL; \
+        a += (tmp << 8) - (tmp << 5) + tmp; \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#else /* NO_DIVIDE not defined: all three reductions are a plain remainder by BASE */
+#  define MOD(a) a %= BASE
+#  define MOD28(a) a %= BASE
+#  define MOD63(a) a %= BASE
+#endif /* NO_DIVIDE */
+
+static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) { /* one-byte update; adler/sum2 are the already-split low/high sums */
+    adler += buf[0];            /* buf is read unconditionally: the callers in this patch call this before their NULL check */
+    if (adler >= BASE)          /* conditional subtraction instead of a modulo */
+        adler -= BASE;
+    sum2 += adler;
+    if (sum2 >= BASE)           /* same single-subtraction reduction for the second sum */
+        sum2 -= BASE;
+    return adler | (sum2 << 16);    /* recombine: sum2 in the high 16 bits, adler in the low 16 */
+}
+
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) { /* byte-wise update for short inputs; callers in this patch pass len < 16 */
+    while (len) {               /* len == 0 skips the loop and buf is never read */
+        --len;
+        adler += *buf++;        /* running byte sum */
+        sum2 += adler;          /* running sum of the byte sums */
+    }
+    if (adler >= BASE)          /* nothing is reduced inside the loop; one subtraction is applied here */
+        adler -= BASE;
+    MOD28(sum2);            /* only added so many BASE's */
+    return adler | (sum2 << 16);    /* recombine: sum2 in the high 16 bits, adler in the low 16 */
+}
+
+#endif /* ADLER32_P_H */
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index 8d845a48..f8573ec5 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -19,6 +19,7 @@
 #include "adler32_neon.h"
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #include <arm_neon.h>
+#include "adler32_p.h"
 
 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
     static const uint8_t taps[32] = {
@@ -80,15 +81,27 @@ static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t le
 }
 
 uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
-    if (!buf)
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == NULL)
         return 1L;
 
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
+
     /* The largest prime smaller than 65536. */
     const uint32_t M_BASE = 65521;
     /* This is the threshold where doing accumulation may overflow. */
     const int M_NMAX = 5552;
 
-    uint32_t sum2;
     uint32_t pair[2];
     int n = M_NMAX;
     unsigned int done = 0;
@@ -98,8 +111,6 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
      */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
     pair[0] = adler;
     pair[1] = sum2;
 
diff --git a/win32/Makefile.arm b/win32/Makefile.arm
index 3a487b2f..0b3f9406 100644
--- a/win32/Makefile.arm
+++ b/win32/Makefile.arm
@@ -117,7 +117,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf
 
 SRCDIR = $(TOP)
 # Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
 functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
 gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
 gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 0c84a14a..9f4b884c 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -108,7 +108,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf
 
 SRCDIR = $(TOP)
 # Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
 functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
 gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
 gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
-- 
2.47.2