From: Adenilson Cavalcanti Date: Sat, 22 Apr 2017 09:41:47 +0000 (-0700) Subject: Implementing NEON-ized Adler32 checksum (#102) X-Git-Tag: 1.9.9-b1~660^2~18 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ec02ecf104e1d3f1836a908a359f20aa93494df5;p=thirdparty%2Fzlib-ng.git Implementing NEON-ized Adler32 checksum (#102) The checksum is calculated in the uncompressed PNG data and can be made much faster by using SIMD. Tests in ARMv8 yielded an improvement of about 3x (e.g. walltime was 350ms x 125ms for a 4096x4096 bytes executed 30 times). This yields an improvement in image decoding in Chromium around 18% (see https://bugs.chromium.org/p/chromium/issues/detail?id=688601). --- diff --git a/CMakeLists.txt b/CMakeLists.txt index ebe9779f2..58851962a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -378,6 +378,7 @@ if(WITH_OPTIM) if(MSVC) add_definitions("-D__ARM_NEON__=1") endif(MSVC) + set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/adler32_neon.c) add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"") endif() elseif("${ARCH}" MATCHES "aarch64") @@ -389,6 +390,7 @@ if(WITH_OPTIM) # We need to check WITH_NEON first if(WITH_NEON) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}") + set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/adler32_neon.c) add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"") elseif(WITH_ACLE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ACLEFLAG}") diff --git a/adler32.c b/adler32.c index 0da5deed6..75c72333a 100644 --- a/adler32.c +++ b/adler32.c @@ -7,6 +7,10 @@ #include "zutil.h" +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); +#endif + static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len2); #define BASE 65521U /* largest prime smaller than 65536 */ @@ -61,6 +65,10 @@ static uint32_t adler32_combine_(uint32_t adler1, uint32_t adler2, z_off64_t len /* ========================================================================= */ uint32_t ZEXPORT adler32_z(uint32_t adler, const unsigned char *buf, size_t len) { +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) + return adler32_neon(adler, buf, len); +#endif + uint32_t sum2; unsigned n; diff --git a/arch/aarch64/Makefile.in b/arch/aarch64/Makefile.in index d42298277..e1b152ccf 100644 --- a/arch/aarch64/Makefile.in +++ b/arch/aarch64/Makefile.in @@ -11,7 +11,13 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo +all: adler32_neon.o adler32_neon.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo + +adler32_neon.o: $(SRCDIR)/adler32_neon.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +adler32_neon.lo: $(SRCDIR)/adler32_neon.c + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c crc32_acle.o: $(SRCDIR)/crc32_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c @@ -48,10 +54,13 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. +adler32_neon.o: $(SRCDIR)/adler32_neon.h crc32_acle.o: $(TOPDIR)/zconf.h fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h +adler32_neon.lo: $(SRCDIR)/adler32_neon.h crc32_acle.lo: $(TOPDIR)/zconf.h fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h + diff --git a/arch/aarch64/adler32_neon.c b/arch/aarch64/adler32_neon.c new file mode 100644 index 000000000..05cb86aaa --- /dev/null +++ b/arch/aarch64/adler32_neon.c @@ -0,0 +1,136 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Author: Adenilson Cavalcanti + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ +#include "adler32_neon.h" +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +#include + +static void NEON_accum32(uint32_t *s, const unsigned char *buf, + size_t len) +{ + static const uint8_t taps[32] = { + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 }; + + uint32x2_t adacc2, s2acc2, as; + uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); + + uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + while (len >= 2) { + uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); + adler = vpaddlq_u8( d0); + adler = vpadalq_u8(adler, d1); + sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); + sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + len -= 2; + buf += 32; + } + + while (len > 0) { + uint8x16_t d0 = vld1q_u8(buf); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); + adler = vpaddlq_u8(d0); + sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + buf += 16; + len--; + } + + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, + size_t len) +{ + /* Oldie K&R code integration. */ + unsigned int i; + for (i = 0; i < len; ++i) { + pair[0] += buf[i]; + pair[1] += pair[0]; + } +} + +uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, + size_t len) +{ + if (!buf) + return 1L; + + /* The largest prime smaller than 65536. */ + const uint32_t M_BASE = 65521; + /* This is the threshold where doing accumulation may overflow. */ + const int M_NMAX = 5552; + + uint32_t sum2; + uint32_t pair[2]; + int n = M_NMAX; + unsigned int done = 0; + /* Oldie K&R code integration. */ + unsigned int i; + + /* Split Adler-32 into component sums, it can be supplied by + * the caller sites (e.g. in a PNG file). + */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + pair[0] = adler; + pair[1] = sum2; + + for (i = 0; i < len; i += n) { + if ((i + n) > len) + n = len - i; + + if (n < 16) + break; + + NEON_accum32(pair, buf + i, n / 16); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + + done += (n / 16) * 16; + } + + /* Handle the tail elements. */ + if (done < len) { + NEON_handle_tail(pair, (buf + done), len - done); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + } + + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ + return (pair[1] << 16) | pair[0]; +} +#endif diff --git a/arch/aarch64/adler32_neon.h b/arch/aarch64/adler32_neon.h new file mode 100644 index 000000000..285d19378 --- /dev/null +++ b/arch/aarch64/adler32_neon.h @@ -0,0 +1,31 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Author: Adenilson Cavalcanti + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ +#ifndef __NEON_ADLER32__ +#define __NEON_ADLER32__ + +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +// Depending on the compiler flavor, size_t may be defined in +// one or the other header. See: +// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t +#include +#include +uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, + size_t len); +#endif +#endif diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in index d42298277..e1b152ccf 100644 --- a/arch/arm/Makefile.in +++ b/arch/arm/Makefile.in @@ -11,7 +11,13 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo +all: adler32_neon.o adler32_neon.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo + +adler32_neon.o: $(SRCDIR)/adler32_neon.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c + +adler32_neon.lo: $(SRCDIR)/adler32_neon.c + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c crc32_acle.o: $(SRCDIR)/crc32_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c @@ -48,10 +54,13 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. +adler32_neon.o: $(SRCDIR)/adler32_neon.h crc32_acle.o: $(TOPDIR)/zconf.h fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h +adler32_neon.lo: $(SRCDIR)/adler32_neon.h crc32_acle.lo: $(TOPDIR)/zconf.h fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h + diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c new file mode 100644 index 000000000..05cb86aaa --- /dev/null +++ b/arch/arm/adler32_neon.c @@ -0,0 +1,136 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Author: Adenilson Cavalcanti + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ +#include "adler32_neon.h" +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +#include + +static void NEON_accum32(uint32_t *s, const unsigned char *buf, + size_t len) +{ + static const uint8_t taps[32] = { + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 }; + + uint32x2_t adacc2, s2acc2, as; + uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); + + uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + while (len >= 2) { + uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); + adler = vpaddlq_u8( d0); + adler = vpadalq_u8(adler, d1); + sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); + sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + len -= 2; + buf += 32; + } + + while (len > 0) { + uint8x16_t d0 = vld1q_u8(buf); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); + adler = vpaddlq_u8(d0); + sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + buf += 16; + len--; + } + + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, + size_t len) +{ + /* Oldie K&R code integration. */ + unsigned int i; + for (i = 0; i < len; ++i) { + pair[0] += buf[i]; + pair[1] += pair[0]; + } +} + +uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, + size_t len) +{ + if (!buf) + return 1L; + + /* The largest prime smaller than 65536. */ + const uint32_t M_BASE = 65521; + /* This is the threshold where doing accumulation may overflow. */ + const int M_NMAX = 5552; + + uint32_t sum2; + uint32_t pair[2]; + int n = M_NMAX; + unsigned int done = 0; + /* Oldie K&R code integration. */ + unsigned int i; + + /* Split Adler-32 into component sums, it can be supplied by + * the caller sites (e.g. in a PNG file). + */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + pair[0] = adler; + pair[1] = sum2; + + for (i = 0; i < len; i += n) { + if ((i + n) > len) + n = len - i; + + if (n < 16) + break; + + NEON_accum32(pair, buf + i, n / 16); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + + done += (n / 16) * 16; + } + + /* Handle the tail elements. */ + if (done < len) { + NEON_handle_tail(pair, (buf + done), len - done); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + } + + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ + return (pair[1] << 16) | pair[0]; +} +#endif diff --git a/arch/arm/adler32_neon.h b/arch/arm/adler32_neon.h new file mode 100644 index 000000000..285d19378 --- /dev/null +++ b/arch/arm/adler32_neon.h @@ -0,0 +1,31 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Author: Adenilson Cavalcanti + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ +#ifndef __NEON_ADLER32__ +#define __NEON_ADLER32__ + +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +// Depending on the compiler flavor, size_t may be defined in +// one or the other header. See: +// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t +#include +#include +uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, + size_t len); +#endif +#endif