From: Sebastian Pop Date: Tue, 5 Mar 2019 15:57:05 +0000 (-0600) Subject: factor out code in arch/{arm,aarch64} X-Git-Tag: 1.9.9-b1~502 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4a6a599e9312fbb15de83e8445d4bf097f506fd0;p=thirdparty%2Fzlib-ng.git factor out code in arch/{arm,aarch64} --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e613a32..90fc64f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -514,12 +514,9 @@ if("${ARCH}" MATCHES "x86_64" OR "${ARCH}" MATCHES "AMD64" OR "${ARCH}" MATCHES set(ARCHDIR "arch/x86") add_definitions(-DUNALIGNED_OK) add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"") -elseif("${ARCH}" MATCHES "arm") +elseif("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64") set(ARCHDIR "arch/arm") add_definitions(-DUNALIGNED_OK) -elseif("${ARCH}" MATCHES "aarch64") - set(ARCHDIR "arch/aarch64") - add_definitions(-DUNALIGNED_OK) else() message(STATUS "No optimized architecture: using ${ARCHDIR}") endif() diff --git a/Makefile.in b/Makefile.in index 0f8155b6..8f287476 100644 --- a/Makefile.in +++ b/Makefile.in @@ -384,10 +384,6 @@ inflate.o: $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zco inftrees.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h $(SRCDIR)/inftrees.h trees.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h $(SRCDIR)/trees.h zutil.o: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h -arch/aarch64/adler32_neon.o: $(SRCDIR)/arch/aarch64/adler32_neon.h -arch/aarch64/crc32_acle.o: zconf$(SUFFIX).h -arch/aarch64/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h -arch/aarch64/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h arch/arm/adler32_neon.o: $(SRCDIR)/arch/arm/adler32_neon.h arch/arm/crc32_acle.o: zconf$(SUFFIX).h arch/arm/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h @@ -412,10 +408,6 @@ inflate.lo: $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zc inftrees.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h $(SRCDIR)/inftrees.h trees.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h $(SRCDIR)/trees.h zutil.lo: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h -arch/aarch64/adler32_neon.lo: $(SRCDIR)/arch/aarch64/adler32_neon.h -arch/aarch64/crc32_acle.lo: zconf$(SUFFIX).h -arch/aarch64/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h -arch/aarch64/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h arch/arm/adler32_neon.lo: $(SRCDIR)/arch/arm/adler32_neon.h arch/arm/crc32_acle.lo: zconf$(SUFFIX).h arch/arm/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib$(SUFFIX).h zconf$(SUFFIX).h diff --git a/arch/aarch64/Makefile.in b/arch/aarch64/Makefile.in deleted file mode 100644 index 6fcf919a..00000000 --- a/arch/aarch64/Makefile.in +++ /dev/null @@ -1,73 +0,0 @@ -# Makefile for zlib -# Copyright (C) 1995-2013 Jean-loup Gailly, Mark Adler -# For conditions of distribution and use, see copyright notice in zlib.h - -CC= -CFLAGS= -SFLAGS= -INCLUDES= -SUFFIX= - -SRCDIR=. -SRCTOP=../.. -TOPDIR=$(SRCTOP) - -all: adler32_neon.o adler32_neon.lo armfeature.o armfeature.lo crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo - -adler32_neon.o: $(SRCDIR)/adler32_neon.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c - -adler32_neon.lo: $(SRCDIR)/adler32_neon.c - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c - -armfeature.o: $(SRCDIR)/armfeature.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c - -armfeature.lo: $(SRCDIR)/armfeature.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c - -crc32_acle.o: $(SRCDIR)/crc32_acle.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c - -crc32_acle.lo: $(SRCDIR)/crc32_acle.c - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c - -fill_window_arm.o: ${SRCDIR}/fill_window_arm.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c - -fill_window_arm.lo: ${SRCDIR}/fill_window_arm.c - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c - -insert_string_acle.o: $(SRCDIR)/insert_string_acle.c - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - -insert_string_acle.lo: $(SRCDIR)/insert_string_acle.c - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c - -mostlyclean: clean -clean: - rm -f *.o *.lo *~ - rm -rf objs - rm -f *.gcda *.gcno *.gcov - -distclean: - rm -f Makefile - -depend: - makedepend -Y -- $(CFLAGS) -- $(SRCDIR)/*.c - makedepend -Y -a -o.lo -- $(SFLAGS) -- $(SRCDIR)/*.c - @sed "s=^$(SRCDIR)/\([a-zA-Z0-9_]*\.\(lo\|o\):\)=\1=g" < Makefile > Makefile.tmp - @mv -f Makefile.tmp Makefile - -# DO NOT DELETE THIS LINE -- make depend depends on it. - -adler32_neon.o: $(SRCDIR)/adler32_neon.h -crc32_acle.o: $(TOPDIR)/zconf$(SUFFIX).h -fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/functable.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib$(SUFFIX).h $(TOPDIR)/zconf$(SUFFIX).h -insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib$(SUFFIX).h $(TOPDIR)/zconf$(SUFFIX).h - -adler32_neon.lo: $(SRCDIR)/adler32_neon.h -crc32_acle.lo: $(TOPDIR)/zconf$(SUFFIX).h -fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/functable.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib$(SUFFIX).h $(TOPDIR)/zconf$(SUFFIX).h -insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib$(SUFFIX).h $(TOPDIR)/zconf$(SUFFIX).h - diff --git a/arch/aarch64/adler32_neon.c b/arch/aarch64/adler32_neon.c deleted file mode 100644 index 8d845a48..00000000 --- a/arch/aarch64/adler32_neon.c +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (C) 1995-2011, 2016 Mark Adler - * Copyright (C) 2017 ARM Holdings Inc. - * Author: Adenilson Cavalcanti - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ -#include "adler32_neon.h" -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include - -static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) { - static const uint8_t taps[32] = { - 32, 31, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, - 8, 7, 6, 5, 4, 3, 2, 1 }; - - uint32x2_t adacc2, s2acc2, as; - uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); - - uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); - adacc = vsetq_lane_u32(s[0], adacc, 0); - s2acc = vsetq_lane_u32(s[1], s2acc, 0); - - while (len >= 2) { - uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); - uint16x8_t adler, sum2; - s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); - adler = vpaddlq_u8( d0); - adler = vpadalq_u8(adler, d1); - sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); - sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); - sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); - sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); - adacc = vpadalq_u16(adacc, adler); - s2acc = vpadalq_u16(s2acc, sum2); - len -= 2; - buf += 32; - } - - while (len > 0) { - uint8x16_t d0 = vld1q_u8(buf); - uint16x8_t adler, sum2; - s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); - adler = vpaddlq_u8(d0); - sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); - sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); - adacc = vpadalq_u16(adacc, adler); - s2acc = vpadalq_u16(s2acc, sum2); - buf += 16; - len--; - } - - adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); - s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); - as = vpadd_u32(adacc2, s2acc2); - s[0] = vget_lane_u32(as, 0); - s[1] = vget_lane_u32(as, 1); -} - -static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) { - /* Oldie K&R code integration. */ - unsigned int i; - for (i = 0; i < len; ++i) { - pair[0] += buf[i]; - pair[1] += pair[0]; - } -} - -uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) { - if (!buf) - return 1L; - - /* The largest prime smaller than 65536. */ - const uint32_t M_BASE = 65521; - /* This is the threshold where doing accumulation may overflow. */ - const int M_NMAX = 5552; - - uint32_t sum2; - uint32_t pair[2]; - int n = M_NMAX; - unsigned int done = 0; - /* Oldie K&R code integration. */ - unsigned int i; - - /* Split Adler-32 into component sums, it can be supplied by - * the caller sites (e.g. in a PNG file). - */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - pair[0] = adler; - pair[1] = sum2; - - for (i = 0; i < len; i += n) { - if ((i + n) > len) - n = len - i; - - if (n < 16) - break; - - NEON_accum32(pair, buf + i, n / 16); - pair[0] %= M_BASE; - pair[1] %= M_BASE; - - done += (n / 16) * 16; - } - - /* Handle the tail elements. */ - if (done < len) { - NEON_handle_tail(pair, (buf + done), len - done); - pair[0] %= M_BASE; - pair[1] %= M_BASE; - } - - /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ - return (pair[1] << 16) | pair[0]; -} -#endif diff --git a/arch/aarch64/adler32_neon.h b/arch/aarch64/adler32_neon.h deleted file mode 100644 index 1cb278c7..00000000 --- a/arch/aarch64/adler32_neon.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (C) 1995-2011, 2016 Mark Adler - * Copyright (C) 2017 ARM Holdings Inc. - * Author: Adenilson Cavalcanti - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ -#ifndef __ADLER32_NEON__ -#define __ADLER32_NEON__ - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -// Depending on the compiler flavor, size_t may be defined in one or the other header. See: -// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t -#include -#include -uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); -#endif -#endif diff --git a/arch/aarch64/arm.h b/arch/aarch64/arm.h deleted file mode 100644 index baee87f1..00000000 --- a/arch/aarch64/arm.h +++ /dev/null @@ -1,13 +0,0 @@ -/* arm.h -- check for ARM features. - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef ARM_H_ -#define ARM_H_ - -extern int arm_cpu_has_neon; -extern int arm_cpu_has_crc32; - -void ZLIB_INTERNAL arm_check_features(void); - -#endif /* ARM_H_ */ diff --git a/arch/aarch64/armfeature.c b/arch/aarch64/armfeature.c deleted file mode 100644 index 39f185d4..00000000 --- a/arch/aarch64/armfeature.c +++ /dev/null @@ -1,24 +0,0 @@ -#include "zutil.h" - -#if defined(__linux__) -# include -# include -#endif - -static int arm_has_crc32() { -#if defined(__linux__) && defined(HWCAP_CRC32) - return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0 ? 1 : 0; -#elif defined(ARM_NOCHECK_ACLE) - return 1; -#else - return 0; -#endif -} - -ZLIB_INTERNAL int arm_cpu_has_neon; -ZLIB_INTERNAL int arm_cpu_has_crc32; - -void ZLIB_INTERNAL arm_check_features(void) { - arm_cpu_has_neon = 1; /* always available */ - arm_cpu_has_crc32 = arm_has_crc32(); -} diff --git a/arch/aarch64/crc32_acle.c b/arch/aarch64/crc32_acle.c deleted file mode 100644 index 5eeb96fd..00000000 --- a/arch/aarch64/crc32_acle.c +++ /dev/null @@ -1,84 +0,0 @@ -/* crc32_acle.c -- compute the CRC-32 of a data stream - * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler - * Copyright (C) 2016 Yang Zhang - * For conditions of distribution and use, see copyright notice in zlib.h - * -*/ - -#ifdef __ARM_FEATURE_CRC32 -#include -#ifdef ZLIB_COMPAT -# include -#else -# include -#endif -#ifdef __linux__ -# include -#endif - -uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { - register uint32_t c; - register const uint16_t *buf2; - register const uint32_t *buf4; - register const uint64_t *buf8; - - c = ~crc; - if (len && ((ptrdiff_t)buf & 1)) { - c = __crc32b(c, *buf++); - len--; - } - - if ((len > 2) && ((ptrdiff_t)buf & 2)) { - buf2 = (const uint16_t *) buf; - c = __crc32h(c, *buf2++); - len -= 2; - buf4 = (const uint32_t *) buf2; - } else { - buf4 = (const uint32_t *) buf; - } - - if ((len > 4) && ((ptrdiff_t)buf & 4)) { - c = __crc32w(c, *buf4++); - len -= 4; - } - - buf8 = (const uint64_t *) buf4; - -#ifdef UNROLL_MORE - while (len >= 32) { - c = __crc32d(c, *buf8++); - c = __crc32d(c, *buf8++); - c = __crc32d(c, *buf8++); - c = __crc32d(c, *buf8++); - len -= 32; - } -#endif - - while (len >= 8) { - c = __crc32d(c, *buf8++); - len -= 8; - } - - if (len >= 4) { - buf4 = (const uint32_t *) buf8; - c = __crc32w(c, *buf4++); - len -= 4; - buf2 = (const uint16_t *) buf4; - } else { - buf2 = (const uint16_t *) buf8; - } - - if (len >= 2) { - c = __crc32h(c, *buf2++); - len -= 2; - } - - if (len) { - buf = (const unsigned char *) buf2; - c = __crc32b(c, *buf); - } - - c = ~c; - return c; -} -#endif diff --git a/arch/aarch64/fill_window_arm.c b/arch/aarch64/fill_window_arm.c deleted file mode 100644 index c9f042a0..00000000 --- a/arch/aarch64/fill_window_arm.c +++ /dev/null @@ -1,169 +0,0 @@ -/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions - * Copyright (C) 2017 Mika T. Lindqvist - * - * Authors: - * Mika T. Lindqvist - * Jun He - * - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* @(#) $Id$ */ - -#include "zbuild.h" -#include "deflate.h" -#include "deflate_p.h" -#include "functable.h" - -extern ZLIB_INTERNAL int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size); - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#include - -/* SIMD version of hash_chain rebase */ -static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) { - register uint16x8_t v, *p; - register size_t n; - - size_t size = entries*sizeof(table[0]); - Assert((size % sizeof(uint16x8_t) * 8 == 0), "hash table size err"); - - Assert(sizeof(Pos) == 2, "Wrong Pos size"); - v = vdupq_n_u16(window_size); - - p = (uint16x8_t *)table; - n = size / (sizeof(uint16x8_t) * 8); - do { - p[0] = vqsubq_u16(p[0], v); - p[1] = vqsubq_u16(p[1], v); - p[2] = vqsubq_u16(p[2], v); - p[3] = vqsubq_u16(p[3], v); - p[4] = vqsubq_u16(p[4], v); - p[5] = vqsubq_u16(p[5], v); - p[6] = vqsubq_u16(p[6], v); - p[7] = vqsubq_u16(p[7], v); - p += 8; - } while (--n); -} -#else -/* generic version for hash rebase */ -static inline void slide_hash_chain(Pos *table, unsigned int entries, uint16_t window_size) { - unsigned int i; - for (i = 0; i < entries; i++) { - table[i] = (table[i] >= window_size) ? (table[i] - window_size) : NIL; - } -} -#endif - -void fill_window_arm(deflate_state *s) { - register unsigned n; - unsigned long more; /* Amount of free space at the end of the window. */ - unsigned int wsize = s->w_size; - - Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); - - do { - more = s->window_size - s->lookahead - s->strstart; - - /* If the window is almost full and there is insufficient lookahead, - * move the upper half to the lower one to make room in the upper half. - */ - if (s->strstart >= wsize+MAX_DIST(s)) { - memcpy(s->window, s->window+wsize, wsize); - s->match_start -= wsize; - s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ - s->block_start -= wsize; - - /* Slide the hash table (could be avoided with 32 bit values - at the expense of memory usage). We slide even when level == 0 - to keep the hash table consistent if we switch back to level > 0 - later. (Using level 0 permanently is not an optimal usage of - zlib, so we don't care about this pathological case.) - */ - - slide_hash_chain(s->head, s->hash_size, wsize); - slide_hash_chain(s->prev, wsize, wsize); - more += wsize; - } - if (s->strm->avail_in == 0) - break; - - /* If there was no sliding: - * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && - * more == window_size - lookahead - strstart - * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) - * => more >= window_size - 2*WSIZE + 2 - * In the BIG_MEM or MMAP case (not yet supported), - * window_size == input_size + MIN_LOOKAHEAD && - * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. - * Otherwise, window_size == 2*WSIZE so more >= 2. - * If there was sliding, more >= WSIZE. So in all cases, more >= 2. - */ - Assert(more >= 2, "more < 2"); - - n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); - s->lookahead += n; - - /* Initialize the hash value now that we have some input: */ - if (s->lookahead + s->insert >= MIN_MATCH) { - unsigned int str = s->strstart - s->insert; - unsigned int insert_cnt = s->insert; - unsigned int slen; - - s->ins_h = s->window[str]; - - if (unlikely(s->lookahead < MIN_MATCH)) - insert_cnt += s->lookahead - MIN_MATCH; - slen = insert_cnt; - if (str >= (MIN_MATCH - 2)) - { - str += 2 - MIN_MATCH; - insert_cnt += MIN_MATCH - 2; - } - if (insert_cnt > 0) - { - functable.insert_string(s, str, insert_cnt); - s->insert -= slen; - } - } - /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, - * but this is not important since only literal bytes will be emitted. - */ - } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); - - /* If the WIN_INIT bytes after the end of the current data have never been - * written, then zero those bytes in order to avoid memory check reports of - * the use of uninitialized (or uninitialised as Julian writes) bytes by - * the longest match routines. Update the high water mark for the next - * time through here. WIN_INIT is set to MAX_MATCH since the longest match - * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. - */ - if (s->high_water < s->window_size) { - unsigned long curr = s->strstart + (unsigned long)s->lookahead; - unsigned long init; - - if (s->high_water < curr) { - /* Previous high water mark below current data -- zero WIN_INIT - * bytes or up to end of window, whichever is less. - */ - init = s->window_size - curr; - if (init > WIN_INIT) - init = WIN_INIT; - memset(s->window + curr, 0, init); - s->high_water = curr + init; - } else if (s->high_water < curr + WIN_INIT) { - /* High water mark at or above current data, but below current data - * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up - * to end of window, whichever is less. - */ - init = curr + WIN_INIT; - if (init > s->window_size) - init = s->window_size; - init -= s->high_water; - memset(s->window + s->high_water, 0, init); - s->high_water += init; - } - } - - Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "not enough room for search"); -} diff --git a/arch/aarch64/insert_string_acle.c b/arch/aarch64/insert_string_acle.c deleted file mode 100644 index 7f9e02b2..00000000 --- a/arch/aarch64/insert_string_acle.c +++ /dev/null @@ -1,55 +0,0 @@ -/* insert_string_acle.c -- insert_string variant using ACLE's CRC instructions - * - * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - */ - -#if defined(__ARM_FEATURE_CRC32) && defined(ARM_ACLE_CRC_HASH) -#include -#include "zbuild.h" -#include "deflate.h" - -/* =========================================================================== - * Insert string str in the dictionary and set match_head to the previous head - * of the hash chain (the most recent string with same hash key). Return - * the previous length of the hash chain. - * IN assertion: all calls to to INSERT_STRING are made with consecutive - * input characters and the first MIN_MATCH bytes of str are valid - * (except for the last MIN_MATCH-1 bytes of the input file). - */ -Pos insert_string_acle(deflate_state *const s, const Pos str, unsigned int count) { - Pos p, lp, ret; - - if (unlikely(count == 0)) { - return s->prev[str & s->w_mask]; - } - - ret = 0; - lp = str + count - 1; /* last position */ - - for (p = str; p <= lp; p++) { - unsigned *ip, val, h, hm; - - ip = (unsigned *)&s->window[p]; - val = *ip; - - if (s->level >= TRIGGER_LEVEL) - val &= 0xFFFFFF; - - h = __crc32w(0, val); - hm = h & s->hash_mask; - - Pos head = s->head[hm]; - if (head != p) { - s->prev[p & s->w_mask] = head; - s->head[hm] = p; - if (p == lp) - ret = head; - } else if (p == lp) { - ret = p; - } - } - return ret; -} -#endif diff --git a/arch/arm/armfeature.c b/arch/arm/armfeature.c index a06fd528..44a16094 100644 --- a/arch/arm/armfeature.c +++ b/arch/arm/armfeature.c @@ -17,7 +17,7 @@ static int arm_has_crc32() { #endif } -static int arm_has_neon() +static inline int arm_has_neon() { #if defined(__linux__) && defined(HWCAP_NEON) return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0 ? 1 : 0; @@ -38,6 +38,10 @@ ZLIB_INTERNAL int arm_cpu_has_neon; ZLIB_INTERNAL int arm_cpu_has_crc32; void ZLIB_INTERNAL arm_check_features(void) { +#if defined(__aarch64__) + arm_cpu_has_neon = 1; /* always available */ +#else arm_cpu_has_neon = arm_has_neon(); +#endif arm_cpu_has_crc32 = arm_has_crc32(); } diff --git a/arch/arm/crc32_acle.c b/arch/arm/crc32_acle.c index 06e6739f..80777bf2 100644 --- a/arch/arm/crc32_acle.c +++ b/arch/arm/crc32_acle.c @@ -6,15 +6,15 @@ */ #ifdef __ARM_FEATURE_CRC32 -#include -#ifdef ZLIB_COMPAT +# include +# ifdef ZLIB_COMPAT # include -#else +# else # include -#endif -#ifdef __linux__ +# endif +# ifdef __linux__ # include -#endif +# endif uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { register uint32_t c; @@ -36,7 +36,47 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { buf4 = (const uint32_t *) buf; } -#ifdef UNROLL_MORE +# if defined(__aarch64__) + if ((len > 4) && ((ptrdiff_t)buf & 4)) { + c = __crc32w(c, *buf4++); + len -= 4; + } + + const uint64_t *buf8 = (const uint64_t *) buf4; + +# ifdef UNROLL_MORE + while (len >= 32) { + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + len -= 32; + } +# endif + + while (len >= 8) { + c = __crc32d(c, *buf8++); + len -= 8; + } + + if (len >= 4) { + buf4 = (const uint32_t *) buf8; + c = __crc32w(c, *buf4++); + len -= 4; + buf2 = (const uint16_t *) buf4; + } else { + buf2 = (const uint16_t *) buf8; + } + + if (len >= 2) { + c = __crc32h(c, *buf2++); + len -= 2; + } + + buf = (const unsigned char *) buf2; +# else /* __aarch64__ */ + +# ifdef UNROLL_MORE while (len >= 32) { c = __crc32w(c, *buf4++); c = __crc32w(c, *buf4++); @@ -48,7 +88,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { c = __crc32w(c, *buf4++); len -= 32; } -#endif +# endif while (len >= 4) { c = __crc32w(c, *buf4++); @@ -63,6 +103,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { } else { buf = (const unsigned char *) buf4; } +# endif /* __aarch64__ */ if (len) { c = __crc32b(c, *buf); @@ -71,4 +112,4 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { c = ~c; return c; } -#endif +#endif /* __ARM_FEATURE_CRC32 */ diff --git a/configure b/configure index 4f932869..e09b2e62 100755 --- a/configure +++ b/configure @@ -1156,7 +1156,7 @@ case "${ARCH}" in # 64-bit ARM specific optimizations aarch64) [ ! -z $CROSS_PREFIX ] && QEMU_ARCH=aarch64 - ARCHDIR=arch/aarch64 + ARCHDIR=arch/arm ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} armfeature.o fill_window_arm.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} armfeature.lo fill_window_arm.lo" diff --git a/zutil.h b/zutil.h index 0dfc328d..8d8ae92e 100644 --- a/zutil.h +++ b/zutil.h @@ -240,9 +240,7 @@ void ZLIB_INTERNAL zcfree(void *opaque, void *ptr); #if defined(X86_CPUID) # include "arch/x86/x86.h" -#elif defined(__aarch64__) -# include "arch/aarch64/arm.h" -#elif defined(__arm__) || defined(_M_ARM) +#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) # include "arch/arm/arm.h" #endif