only call NEON adler32 for more than 16 bytes

author Sebastian Pop <s.pop@samsung.com>

Mon, 28 Jan 2019 22:05:50 +0000 (16:05 -0600)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Thu, 4 Apr 2019 08:13:26 +0000 (10:13 +0200)
author Sebastian Pop <s.pop@samsung.com>
Mon, 28 Jan 2019 22:05:50 +0000 (16:05 -0600)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Thu, 4 Apr 2019 08:13:26 +0000 (10:13 +0200)
diff --git a/adler32.c b/adler32.c

index 651d73c562cf7037a61e8d4e4234c2c4b04762af..feff67bccee0fe22e8117d8bf491140cb7fe0f4a 100644 (file)
--- a/adler32.c
+++ b/adler32.c
@@ -8,11 +8,11 @@
  #include "zbuild.h"
  #include "zutil.h"
  #include "functable.h"
+#include "adler32_p.h"
  
  uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
  static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2);
  
-#define BASE 65521U     /* largest prime smaller than 65536 */
  #define NMAX 5552
  /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
  
@@ -22,46 +22,6 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len
  #define DO8(buf, i)  DO4(buf, i); DO4(buf, i+4);
  #define DO16(buf)    DO8(buf, 0); DO8(buf, 8);
  
-/* use NO_DIVIDE if your processor does not do division in hardware --
-   try it both ways to see which is faster */
-#ifdef NO_DIVIDE
-/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
-   (thank you to John Reiser for pointing this out) */
-#  define CHOP(a) \
-    do { \
-        uint32_t tmp = a >> 16; \
-        a &= 0xffff; \
-        a += (tmp << 4) - tmp; \
-    } while (0)
-#  define MOD28(a) \
-    do { \
-        CHOP(a); \
-        if (a >= BASE) a -= BASE; \
-    } while (0)
-#  define MOD(a) \
-    do { \
-        CHOP(a); \
-        MOD28(a); \
-    } while (0)
-#  define MOD63(a) \
-    do { /* this assumes a is not negative */ \
-        z_off64_t tmp = a >> 32; \
-        a &= 0xffffffffL; \
-        a += (tmp << 8) - (tmp << 5) + tmp; \
-        tmp = a >> 16; \
-        a &= 0xffffL; \
-        a += (tmp << 4) - tmp; \
-        tmp = a >> 16; \
-        a &= 0xffffL; \
-        a += (tmp << 4) - tmp; \
-        if (a >= BASE) a -= BASE; \
-    } while (0)
-#else
-#  define MOD(a) a %= BASE
-#  define MOD28(a) a %= BASE
-#  define MOD63(a) a %= BASE
-#endif
-
  /* ========================================================================= */
  uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
      uint32_t sum2;
@@ -72,32 +32,16 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
      adler &= 0xffff;
  
      /* in case user likes doing a byte at a time, keep it fast */
-    if (len == 1) {
-        adler += buf[0];
-        if (adler >= BASE)
-            adler -= BASE;
-        sum2 += adler;
-        if (sum2 >= BASE)
-            sum2 -= BASE;
-        return adler | (sum2 << 16);
-    }
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
  
      /* initial Adler-32 value (deferred check for len == 1 speed) */
      if (buf == NULL)
          return 1L;
  
      /* in case short lengths are provided, keep it somewhat fast */
-    if (len < 16) {
-        while (len) {
-            --len;
-            adler += *buf++;
-            sum2 += adler;
-        }
-        if (adler >= BASE)
-            adler -= BASE;
-        MOD28(sum2);            /* only added so many BASE's */
-        return adler | (sum2 << 16);
-    }
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
  
      /* do length NMAX blocks -- requires just one modulo operation */
      while (len >= NMAX) {
diff --git a/adler32_p.h b/adler32_p.h

new file mode 100644 (file)

index 0000000..131513a
--- /dev/null
+++ b/adler32_p.h
@@ -0,0 +1,75 @@
+/* adler32_p.h -- Private inline functions and macros shared between the
+ *                different implementations of the Adler-32 checksum
+ *                of a data stream.
+ * Copyright (C) 1995-2011, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_P_H
+#define ADLER32_P_H
+
+#define BASE 65521U     /* largest prime smaller than 65536 */
+
+/* use NO_DIVIDE if your processor does not do division in hardware --
+   try it both ways to see which is faster */
+#ifdef NO_DIVIDE
+/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
+   (thank you to John Reiser for pointing this out) */
+#  define CHOP(a) \
+    do { /* fold once: a = (a % 65536) + 15 * (a / 65536), congruent to a mod BASE */ \
+        uint32_t tmp = a >> 16; \
+        a &= 0xffff; \
+        a += (tmp << 4) - tmp; /* (tmp << 4) - tmp == 15 * tmp */ \
+    } while (0)
+#  define MOD28(a) \
+    do { /* one fold followed by a single conditional subtraction of BASE */ \
+        CHOP(a); \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#  define MOD(a) \
+    do { /* two folds (CHOP here, CHOP inside MOD28), then the conditional subtraction */ \
+        CHOP(a); \
+        MOD28(a); \
+    } while (0)
+#  define MOD63(a) \
+    do { /* this assumes a is not negative */ \
+        z_off64_t tmp = a >> 32; \
+        a &= 0xffffffffL; \
+        a += (tmp << 8) - (tmp << 5) + tmp; /* 256 - 32 + 1 == 225 == 15 * 15, i.e. 2^32 % BASE */ \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; /* first 16-bit fold, same step as CHOP */ \
+        tmp = a >> 16; \
+        a &= 0xffffL; \
+        a += (tmp << 4) - tmp; /* second 16-bit fold */ \
+        if (a >= BASE) a -= BASE; \
+    } while (0)
+#else
+#  define MOD(a) a %= BASE /* without NO_DIVIDE all three variants are a plain remainder */
+#  define MOD28(a) a %= BASE
+#  define MOD63(a) a %= BASE
+#endif
+/* Adler-32 update for exactly one byte; adler and sum2 are the two component sums, already split by the caller. */
+static inline uint32_t adler32_len_1(uint32_t adler, const unsigned char *buf, uint32_t sum2) {
+    adler += buf[0]; /* buf is dereferenced unchecked: the visible callers test buf for NULL only after this call */
+    if (adler >= BASE)
+        adler -= BASE; /* a conditional subtraction replaces the modulo on this fast path */
+    sum2 += adler;
+    if (sum2 >= BASE)
+        sum2 -= BASE;
+    return adler | (sum2 << 16); /* recombine: sum2 in the upper 16 bits, adler in the lower 16 bits */
+}
+/* Adler-32 update for short inputs (the visible callers pass len < 16); adler and sum2 are the pre-split component sums. */
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+    while (len) {
+        --len;
+        adler += *buf++; /* running sum of the bytes */
+        sum2 += adler;   /* running sum of the running sums */
+    }
+    if (adler >= BASE)
+        adler -= BASE; /* callers mask adler to 16 bits and add at most 15 bytes, so one subtraction is enough */
+    MOD28(sum2);            /* only added so many BASE's */
+    return adler | (sum2 << 16); /* recombine: sum2 in the upper 16 bits, adler in the lower 16 bits */
+}
+
+#endif /* ADLER32_P_H */
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c

index 8d845a48dd83606418bd321dba8673b2d8217f8f..f8573ec5536e3bdbf50e8ff85bf8b7a369ddde07 100644 (file)
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -19,6 +19,7 @@
  #include "adler32_neon.h"
  #if defined(__ARM_NEON__) || defined(__ARM_NEON)
  #include <arm_neon.h>
+#include "adler32_p.h"
  
  static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
      static const uint8_t taps[32] = {
@@ -80,15 +81,27 @@ static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t le
  }
  
  uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
-    if (!buf)
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case user likes doing a byte at a time, keep it fast */
+    if (len == 1)
+        return adler32_len_1(adler, buf, sum2);
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == NULL)
          return 1L;
  
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16)
+        return adler32_len_16(adler, buf, len, sum2);
+
      /* The largest prime smaller than 65536. */
      const uint32_t M_BASE = 65521;
      /* This is the threshold where doing accumulation may overflow. */
      const int M_NMAX = 5552;
  
-    uint32_t sum2;
      uint32_t pair[2];
      int n = M_NMAX;
      unsigned int done = 0;
@@ -98,8 +111,6 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
      /* Split Adler-32 into component sums, it can be supplied by
       * the caller sites (e.g. in a PNG file).
       */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
      pair[0] = adler;
      pair[1] = sum2;
  
diff --git a/win32/Makefile.arm b/win32/Makefile.arm

index 3a487b2f61df92681dc5b000aeafef5b89af6627..0b3f94064cad9bdf857c3338014f78bdb9f8bd1e 100644 (file)
--- a/win32/Makefile.arm
+++ b/win32/Makefile.arm
@@ -117,7 +117,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf
  
  SRCDIR = $(TOP)
  # Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
  functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
  gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
  gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
diff --git a/win32/Makefile.msc b/win32/Makefile.msc

index 0c84a14adc0d30d0f28938d21f6e100fdd56c461..9f4b884c3cffdc78a2d8d29255fe3eeea3371722 100644 (file)
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -108,7 +108,7 @@ $(TOP)/zconf$(SUFFIX).h: zconf
  
  SRCDIR = $(TOP)
  # Keep the dependences in sync with top-level Makefile.in
-adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h
+adler32.obj: $(SRCDIR)/adler32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/functable.h $(SRCDIR)/adler32_p.h
  functable.obj: $(SRCDIR)/functable.c $(SRCDIR)/zbuild.h $(SRCDIR)/functable.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/gzendian.h $(SRCDIR)/arch/x86/x86.h
  gzclose.obj: $(SRCDIR)/gzclose.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
  gzlib.obj: $(SRCDIR)/gzlib.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
author	Sebastian Pop <s.pop@samsung.com>
	Mon, 28 Jan 2019 22:05:50 +0000 (16:05 -0600)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Thu, 4 Apr 2019 08:13:26 +0000 (10:13 +0200)
adler32.c		patch \| blob \| blame \| history
adler32_p.h	[new file with mode: 0644]	patch \| blob
arch/arm/adler32_neon.c		patch \| blob \| blame \| history
win32/Makefile.arm		patch \| blob \| blame \| history
win32/Makefile.msc		patch \| blob \| blame \| history