Implementing NEON-ized Adler32 checksum (#102)

author Adenilson Cavalcanti <cavalcantii@gmail.com>

Sat, 22 Apr 2017 09:41:47 +0000 (02:41 -0700)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Sat, 22 Apr 2017 09:41:47 +0000 (11:41 +0200)
author Adenilson Cavalcanti <cavalcantii@gmail.com>
Sat, 22 Apr 2017 09:41:47 +0000 (02:41 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sat, 22 Apr 2017 09:41:47 +0000 (11:41 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index ebe9779f21b77eec5851af7290e26f7db8ab010b..58851962a6056b001b54f38f69c8b0bd12f75e5e 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -378,6 +378,7 @@ if(WITH_OPTIM)
              if(MSVC)
                  add_definitions("-D__ARM_NEON__=1")
              endif(MSVC)
+            set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/adler32_neon.c)
              add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
          endif()
      elseif("${ARCH}" MATCHES "aarch64")
@@ -389,6 +390,7 @@ if(WITH_OPTIM)
          # We need to check WITH_NEON first
          if(WITH_NEON)
              set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}")
+            set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/adler32_neon.c)
              add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
          elseif(WITH_ACLE)
              set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ACLEFLAG}")
diff --git a/adler32.c b/adler32.c

index 0da5deed6238545199a3646d14d8e264dd7eb572..75c72333a04f13795741613f51fee9b0d89ae177 100644 (file)
--- a/adler32.c
+++ b/adler32.c
@@ -7,6 +7,10 @@
  
  #include "zutil.h"
  
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+
  static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2);
  
  #define BASE 65521U     /* largest prime smaller than 65536 */
@@ -61,6 +65,10 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len
  
  /* ========================================================================= */
  uint32_t ZEXPORT adler32_z(uint32_t adler, const unsigned char *buf, size_t len) {
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+    return adler32_neon(adler, buf, len);
+#endif
+
      uint32_t sum2;
      unsigned n;
  
diff --git a/arch/aarch64/Makefile.in b/arch/aarch64/Makefile.in

index d4229827766bbd88540b4bd8fadddb99b8eee68a..e1b152ccf0fc19115c92543d8cf783876cea4f18 100644 (file)
--- a/arch/aarch64/Makefile.in
+++ b/arch/aarch64/Makefile.in
@@ -11,7 +11,13 @@ SRCDIR=.
  SRCTOP=../..
  TOPDIR=$(SRCTOP)
  
-all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+all: adler32_neon.o adler32_neon.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+
+adler32_neon.o: $(SRCDIR)/adler32_neon.c
+       $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo: $(SRCDIR)/adler32_neon.c
+       $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
  
  crc32_acle.o: $(SRCDIR)/crc32_acle.c
         $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
@@ -48,10 +54,13 @@ depend:
  
  # DO NOT DELETE THIS LINE -- make depend depends on it.
  
+adler32_neon.o: $(SRCDIR)/adler32_neon.h
  crc32_acle.o: $(TOPDIR)/zconf.h
  fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  
+adler32_neon.lo: $(SRCDIR)/adler32_neon.h
  crc32_acle.lo: $(TOPDIR)/zconf.h
  fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
+
diff --git a/arch/aarch64/adler32_neon.c b/arch/aarch64/adler32_neon.c

new file mode 100644 (file)

index 0000000..05cb86a
--- /dev/null
+++ b/arch/aarch64/adler32_neon.c
@@ -0,0 +1,136 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *  claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#include "adler32_neon.h"
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+
+static void NEON_accum32(uint32_t *s, const unsigned char *buf,
+                         size_t len)
+{
+    static const uint8_t taps[32] = {
+        32, 31, 30, 29, 28, 27, 26, 25,
+        24, 23, 22, 21, 20, 19, 18, 17,
+        16, 15, 14, 13, 12, 11, 10, 9,
+        8, 7, 6, 5, 4, 3, 2, 1 };
+
+    uint32x2_t adacc2, s2acc2, as;
+    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+
+    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
+    adacc = vsetq_lane_u32(s[0], adacc, 0);
+    s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+    while (len >= 2) {
+        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
+        uint16x8_t adler, sum2;
+        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
+        adler = vpaddlq_u8(       d0);
+        adler = vpadalq_u8(adler, d1);
+        sum2 = vmull_u8(      vget_low_u8(t0), vget_low_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
+        adacc = vpadalq_u16(adacc, adler);
+        s2acc = vpadalq_u16(s2acc, sum2);
+        len -= 2;
+        buf += 32;
+    }
+
+    while (len > 0) {
+        uint8x16_t d0 = vld1q_u8(buf);
+        uint16x8_t adler, sum2;
+        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
+        adler = vpaddlq_u8(d0);
+        sum2 = vmull_u8(      vget_low_u8(t1), vget_low_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
+        adacc = vpadalq_u16(adacc, adler);
+        s2acc = vpadalq_u16(s2acc, sum2);
+        buf += 16;
+        len--;
+    }
+
+    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+    as = vpadd_u32(adacc2, s2acc2);
+    s[0] = vget_lane_u32(as, 0);
+    s[1] = vget_lane_u32(as, 1);
+}
+
+static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf,
+                             size_t len)
+{
+    /* Oldie K&R code integration. */
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+uint32_t adler32_neon(uint32_t adler, const unsigned char *buf,
+                      size_t len)
+{
+    if (!buf)
+        return 1L;
+
+    /* The largest prime smaller than 65536. */
+    const uint32_t M_BASE = 65521;
+    /* This is the threshold where doing accumulation may overflow. */
+    const int M_NMAX = 5552;
+
+    uint32_t sum2;
+    uint32_t pair[2];
+    int n = M_NMAX;
+    unsigned int done = 0;
+    /* Oldie K&R code integration. */
+    unsigned int i;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    for (i = 0; i < len; i += n) {
+        if ((i + n) > len)
+            n = len - i;
+
+        if (n < 16)
+            break;
+
+        NEON_accum32(pair, buf + i, n / 16);
+        pair[0] %= M_BASE;
+        pair[1] %= M_BASE;
+
+        done += (n / 16) * 16;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        NEON_handle_tail(pair, (buf + done), len - done);
+        pair[0] %= M_BASE;
+        pair[1] %= M_BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+#endif
diff --git a/arch/aarch64/adler32_neon.h b/arch/aarch64/adler32_neon.h

new file mode 100644 (file)

index 0000000..285d193
--- /dev/null
+++ b/arch/aarch64/adler32_neon.h
@@ -0,0 +1,31 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *  claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#ifndef __NEON_ADLER32__
+#define __NEON_ADLER32__
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+// Depending on the compiler flavor, size_t may be defined in
+// one or the other header. See:
+// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t
+#include <stdint.h>
+#include <stddef.h>
+uint32_t adler32_neon(uint32_t adler, const unsigned char *buf,
+                      size_t len);
+#endif
+#endif
diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in

index d4229827766bbd88540b4bd8fadddb99b8eee68a..e1b152ccf0fc19115c92543d8cf783876cea4f18 100644 (file)
--- a/arch/arm/Makefile.in
+++ b/arch/arm/Makefile.in
@@ -11,7 +11,13 @@ SRCDIR=.
  SRCTOP=../..
  TOPDIR=$(SRCTOP)
  
-all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+all: adler32_neon.o adler32_neon.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
+
+adler32_neon.o: $(SRCDIR)/adler32_neon.c
+       $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
+
+adler32_neon.lo: $(SRCDIR)/adler32_neon.c
+       $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
  
  crc32_acle.o: $(SRCDIR)/crc32_acle.c
         $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
@@ -48,10 +54,13 @@ depend:
  
  # DO NOT DELETE THIS LINE -- make depend depends on it.
  
+adler32_neon.o: $(SRCDIR)/adler32_neon.h
  crc32_acle.o: $(TOPDIR)/zconf.h
  fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  
+adler32_neon.lo: $(SRCDIR)/adler32_neon.h
  crc32_acle.lo: $(TOPDIR)/zconf.h
  fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
  insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
+
diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c

new file mode 100644 (file)

index 0000000..05cb86a
--- /dev/null
+++ b/arch/arm/adler32_neon.c
@@ -0,0 +1,136 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *  claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#include "adler32_neon.h"
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+
+static void NEON_accum32(uint32_t *s, const unsigned char *buf,
+                         size_t len)
+{
+    static const uint8_t taps[32] = {
+        32, 31, 30, 29, 28, 27, 26, 25,
+        24, 23, 22, 21, 20, 19, 18, 17,
+        16, 15, 14, 13, 12, 11, 10, 9,
+        8, 7, 6, 5, 4, 3, 2, 1 };
+
+    uint32x2_t adacc2, s2acc2, as;
+    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+
+    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
+    adacc = vsetq_lane_u32(s[0], adacc, 0);
+    s2acc = vsetq_lane_u32(s[1], s2acc, 0);
+
+    while (len >= 2) {
+        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
+        uint16x8_t adler, sum2;
+        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
+        adler = vpaddlq_u8(       d0);
+        adler = vpadalq_u8(adler, d1);
+        sum2 = vmull_u8(      vget_low_u8(t0), vget_low_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
+        adacc = vpadalq_u16(adacc, adler);
+        s2acc = vpadalq_u16(s2acc, sum2);
+        len -= 2;
+        buf += 32;
+    }
+
+    while (len > 0) {
+        uint8x16_t d0 = vld1q_u8(buf);
+        uint16x8_t adler, sum2;
+        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
+        adler = vpaddlq_u8(d0);
+        sum2 = vmull_u8(      vget_low_u8(t1), vget_low_u8(d0));
+        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
+        adacc = vpadalq_u16(adacc, adler);
+        s2acc = vpadalq_u16(s2acc, sum2);
+        buf += 16;
+        len--;
+    }
+
+    adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
+    s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
+    as = vpadd_u32(adacc2, s2acc2);
+    s[0] = vget_lane_u32(as, 0);
+    s[1] = vget_lane_u32(as, 1);
+}
+
+static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf,
+                             size_t len)
+{
+    /* Oldie K&R code integration. */
+    unsigned int i;
+    for (i = 0; i < len; ++i) {
+        pair[0] += buf[i];
+        pair[1] += pair[0];
+    }
+}
+
+uint32_t adler32_neon(uint32_t adler, const unsigned char *buf,
+                      size_t len)
+{
+    if (!buf)
+        return 1L;
+
+    /* The largest prime smaller than 65536. */
+    const uint32_t M_BASE = 65521;
+    /* This is the threshold where doing accumulation may overflow. */
+    const int M_NMAX = 5552;
+
+    uint32_t sum2;
+    uint32_t pair[2];
+    int n = M_NMAX;
+    unsigned int done = 0;
+    /* Oldie K&R code integration. */
+    unsigned int i;
+
+    /* Split Adler-32 into component sums, it can be supplied by
+     * the caller sites (e.g. in a PNG file).
+     */
+    sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+    pair[0] = adler;
+    pair[1] = sum2;
+
+    for (i = 0; i < len; i += n) {
+        if ((i + n) > len)
+            n = len - i;
+
+        if (n < 16)
+            break;
+
+        NEON_accum32(pair, buf + i, n / 16);
+        pair[0] %= M_BASE;
+        pair[1] %= M_BASE;
+
+        done += (n / 16) * 16;
+    }
+
+    /* Handle the tail elements. */
+    if (done < len) {
+        NEON_handle_tail(pair, (buf + done), len - done);
+        pair[0] %= M_BASE;
+        pair[1] %= M_BASE;
+    }
+
+    /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+    return (pair[1] << 16) | pair[0];
+}
+#endif
diff --git a/arch/arm/adler32_neon.h b/arch/arm/adler32_neon.h

new file mode 100644 (file)

index 0000000..285d193
--- /dev/null
+++ b/arch/arm/adler32_neon.h
@@ -0,0 +1,31 @@
+/* Copyright (C) 1995-2011, 2016 Mark Adler
+ * Copyright (C) 2017 ARM Holdings Inc.
+ * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ * 1. The origin of this software must not be misrepresented; you must not
+ *  claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+#ifndef __NEON_ADLER32__
+#define __NEON_ADLER32__
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
+// Depending on the compiler flavor, size_t may be defined in
+// one or the other header. See:
+// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t
+#include <stdint.h>
+#include <stddef.h>
+uint32_t adler32_neon(uint32_t adler, const unsigned char *buf,
+                      size_t len);
+#endif
+#endif
author	Adenilson Cavalcanti <cavalcantii@gmail.com>
	Sat, 22 Apr 2017 09:41:47 +0000 (02:41 -0700)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Sat, 22 Apr 2017 09:41:47 +0000 (11:41 +0200)
CMakeLists.txt		patch \| blob \| blame \| history
adler32.c		patch \| blob \| blame \| history
arch/aarch64/Makefile.in		patch \| blob \| blame \| history
arch/aarch64/adler32_neon.c	[new file with mode: 0644]	patch \| blob
arch/aarch64/adler32_neon.h	[new file with mode: 0644]	patch \| blob
arch/arm/Makefile.in		patch \| blob \| blame \| history
arch/arm/adler32_neon.c	[new file with mode: 0644]	patch \| blob
arch/arm/adler32_neon.h	[new file with mode: 0644]	patch \| blob