From: Mika Lindqvist Date: Sun, 10 Apr 2016 10:34:53 +0000 (+0300) Subject: Optimize fill_window_c. X-Git-Tag: 1.9.9-b1~660^2~22^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ce6f8dec88b924b154ebe0ff2701411fb029d16a;p=thirdparty%2Fzlib-ng.git Optimize fill_window_c. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 4dac05f62..98732ed2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -345,6 +345,9 @@ else() add_definitions(-DX86 -DUNALIGNED_OK -DUNROLL_LESS) add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"") endif() +if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64") + set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/fill_window_arm.c) +endif() if(WITH_OPTIM) if("${ARCH}" MATCHES "arm") if(WITH_ACLE) diff --git a/Makefile.in b/Makefile.in index 12d824e20..f8efa87e7 100644 --- a/Makefile.in +++ b/Makefile.in @@ -324,8 +324,10 @@ inftrees.o: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h trees.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/trees.h zutil.o: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/crc32_acle.o: zconf.h +arch/aarch64/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/crc32_acle.o: zconf.h +arch/arm/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_folding.o: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_pclmulqdq.o: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h @@ -347,8 +349,10 @@ inftrees.lo: $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/inftrees.h trees.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/trees.h zutil.lo: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/crc32_acle.lo: zconf.h +arch/aarch64/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/aarch64/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/crc32_acle.lo: zconf.h +arch/arm/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/arm/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_folding.lo: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h arch/x86/crc_pclmulqdq.lo: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h diff --git a/arch/aarch64/Makefile.in b/arch/aarch64/Makefile.in index a2c96bd06..d42298277 100644 --- a/arch/aarch64/Makefile.in +++ b/arch/aarch64/Makefile.in @@ -11,7 +11,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: crc32_acle.o crc32_acle.lo insert_string_acle.o insert_string_acle.lo +all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo crc32_acle.o: $(SRCDIR)/crc32_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c @@ -19,6 +19,12 @@ crc32_acle.o: $(SRCDIR)/crc32_acle.c crc32_acle.lo: $(SRCDIR)/crc32_acle.c $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c +fill_window_arm.o: ${SRCDIR}/fill_window_arm.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c + +fill_window_arm.lo: ${SRCDIR}/fill_window_arm.c + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c + insert_string_acle.o: $(SRCDIR)/insert_string_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c @@ -43,7 +49,9 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. crc32_acle.o: $(TOPDIR)/zconf.h +fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h crc32_acle.lo: $(TOPDIR)/zconf.h +fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h diff --git a/arch/aarch64/fill_window_arm.c b/arch/aarch64/fill_window_arm.c new file mode 100644 index 000000000..9608a25f9 --- /dev/null +++ b/arch/aarch64/fill_window_arm.c @@ -0,0 +1,146 @@ +/* fill_window_arm.c -- Optimized hash table shifting for ARM + * Copyright (C) 2017 Mika T. Lindqvist + * + * Authors: + * Mika T. Lindqvist + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#include "deflate.h" +#include "deflate_p.h" + +extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); + +void fill_window_arm(deflate_state *s) { + register unsigned n; + unsigned long more; /* Amount of free space at the end of the window. */ + unsigned int wsize = s->w_size; + + Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); + + do { + more = s->window_size - s->lookahead - s->strstart; + + /* If the window is almost full and there is insufficient lookahead, + * move the upper half to the lower one to make room in the upper half. + */ + if (s->strstart >= wsize+MAX_DIST(s)) { + unsigned int i; + + memcpy(s->window, s->window+wsize, wsize); + s->match_start -= wsize; + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ + s->block_start -= wsize; + + /* Slide the hash table (could be avoided with 32 bit values + at the expense of memory usage). We slide even when level == 0 + to keep the hash table consistent if we switch back to level > 0 + later. (Using level 0 permanently is not an optimal usage of + zlib, so we don't care about this pathological case.) + */ + { + n = s->hash_size; + for (i = 0; i < n; i++) { + if (s->head[i] >= wsize) + s->head[i] -= wsize; + else + s->head[i] = NIL; + } + } + + { + for (i = 0; i < wsize; i++) { + if (s->prev[i] >= wsize) + s->prev[i] -= wsize; + else + s->prev[i] = NIL; + } + } + more += wsize; + } + if (s->strm->avail_in == 0) + break; + + /* If there was no sliding: + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && + * more == window_size - lookahead - strstart + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) + * => more >= window_size - 2*WSIZE + 2 + * In the BIG_MEM or MMAP case (not yet supported), + * window_size == input_size + MIN_LOOKAHEAD && + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. + * Otherwise, window_size == 2*WSIZE so more >= 2. + * If there was sliding, more >= WSIZE. So in all cases, more >= 2. + */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead + s->insert >= MIN_MATCH) { + unsigned int str = s->strstart - s->insert; + unsigned int insert_cnt = s->insert; + unsigned int slen; + + s->ins_h = s->window[str]; + + if (unlikely(s->lookahead < MIN_MATCH)) + insert_cnt += s->lookahead - MIN_MATCH; + slen = insert_cnt; + if (str >= (MIN_MATCH - 2)) + { + str += 2 - MIN_MATCH; + insert_cnt += MIN_MATCH - 2; + } + if (insert_cnt > 0) + { + insert_string(s, str, insert_cnt); + s->insert -= slen; + } + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. + */ + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); + + /* If the WIN_INIT bytes after the end of the current data have never been + * written, then zero those bytes in order to avoid memory check reports of + * the use of uninitialized (or uninitialised as Julian writes) bytes by + * the longest match routines. Update the high water mark for the next + * time through here. WIN_INIT is set to MAX_MATCH since the longest match + * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. + */ + if (s->high_water < s->window_size) { + unsigned long curr = s->strstart + (unsigned long)s->lookahead; + unsigned long init; + + if (s->high_water < curr) { + /* Previous high water mark below current data -- zero WIN_INIT + * bytes or up to end of window, whichever is less. + */ + init = s->window_size - curr; + if (init > WIN_INIT) + init = WIN_INIT; + memset(s->window + curr, 0, init); + s->high_water = curr + init; + } else if (s->high_water < curr + WIN_INIT) { + /* High water mark at or above current data, but below current data + * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up + * to end of window, whichever is less. + */ + init = curr + WIN_INIT; + if (init > s->window_size) + init = s->window_size; + init -= s->high_water; + memset(s->window + s->high_water, 0, init); + s->high_water += init; + } + } + + Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, + "not enough room for search"); +} diff --git a/arch/arm/Makefile.in b/arch/arm/Makefile.in index a2c96bd06..d42298277 100644 --- a/arch/arm/Makefile.in +++ b/arch/arm/Makefile.in @@ -11,7 +11,7 @@ SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) -all: crc32_acle.o crc32_acle.lo insert_string_acle.o insert_string_acle.lo +all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo crc32_acle.o: $(SRCDIR)/crc32_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c @@ -19,6 +19,12 @@ crc32_acle.o: $(SRCDIR)/crc32_acle.c crc32_acle.lo: $(SRCDIR)/crc32_acle.c $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c +fill_window_arm.o: ${SRCDIR}/fill_window_arm.c + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c + +fill_window_arm.lo: ${SRCDIR}/fill_window_arm.c + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c + insert_string_acle.o: $(SRCDIR)/insert_string_acle.c $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c @@ -43,7 +49,9 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. crc32_acle.o: $(TOPDIR)/zconf.h +fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h crc32_acle.lo: $(TOPDIR)/zconf.h +fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c new file mode 100644 index 000000000..9608a25f9 --- /dev/null +++ b/arch/arm/fill_window_arm.c @@ -0,0 +1,146 @@ +/* fill_window_arm.c -- Optimized hash table shifting for ARM + * Copyright (C) 2017 Mika T. Lindqvist + * + * Authors: + * Mika T. Lindqvist + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#include "deflate.h" +#include "deflate_p.h" + +extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); + +void fill_window_arm(deflate_state *s) { + register unsigned n; + unsigned long more; /* Amount of free space at the end of the window. */ + unsigned int wsize = s->w_size; + + Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead"); + + do { + more = s->window_size - s->lookahead - s->strstart; + + /* If the window is almost full and there is insufficient lookahead, + * move the upper half to the lower one to make room in the upper half. + */ + if (s->strstart >= wsize+MAX_DIST(s)) { + unsigned int i; + + memcpy(s->window, s->window+wsize, wsize); + s->match_start -= wsize; + s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ + s->block_start -= wsize; + + /* Slide the hash table (could be avoided with 32 bit values + at the expense of memory usage). We slide even when level == 0 + to keep the hash table consistent if we switch back to level > 0 + later. (Using level 0 permanently is not an optimal usage of + zlib, so we don't care about this pathological case.) + */ + { + n = s->hash_size; + for (i = 0; i < n; i++) { + if (s->head[i] >= wsize) + s->head[i] -= wsize; + else + s->head[i] = NIL; + } + } + + { + for (i = 0; i < wsize; i++) { + if (s->prev[i] >= wsize) + s->prev[i] -= wsize; + else + s->prev[i] = NIL; + } + } + more += wsize; + } + if (s->strm->avail_in == 0) + break; + + /* If there was no sliding: + * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && + * more == window_size - lookahead - strstart + * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) + * => more >= window_size - 2*WSIZE + 2 + * In the BIG_MEM or MMAP case (not yet supported), + * window_size == input_size + MIN_LOOKAHEAD && + * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. + * Otherwise, window_size == 2*WSIZE so more >= 2. + * If there was sliding, more >= WSIZE. So in all cases, more >= 2. + */ + Assert(more >= 2, "more < 2"); + + n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); + s->lookahead += n; + + /* Initialize the hash value now that we have some input: */ + if (s->lookahead + s->insert >= MIN_MATCH) { + unsigned int str = s->strstart - s->insert; + unsigned int insert_cnt = s->insert; + unsigned int slen; + + s->ins_h = s->window[str]; + + if (unlikely(s->lookahead < MIN_MATCH)) + insert_cnt += s->lookahead - MIN_MATCH; + slen = insert_cnt; + if (str >= (MIN_MATCH - 2)) + { + str += 2 - MIN_MATCH; + insert_cnt += MIN_MATCH - 2; + } + if (insert_cnt > 0) + { + insert_string(s, str, insert_cnt); + s->insert -= slen; + } + } + /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, + * but this is not important since only literal bytes will be emitted. + */ + } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); + + /* If the WIN_INIT bytes after the end of the current data have never been + * written, then zero those bytes in order to avoid memory check reports of + * the use of uninitialized (or uninitialised as Julian writes) bytes by + * the longest match routines. Update the high water mark for the next + * time through here. WIN_INIT is set to MAX_MATCH since the longest match + * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead. + */ + if (s->high_water < s->window_size) { + unsigned long curr = s->strstart + (unsigned long)s->lookahead; + unsigned long init; + + if (s->high_water < curr) { + /* Previous high water mark below current data -- zero WIN_INIT + * bytes or up to end of window, whichever is less. + */ + init = s->window_size - curr; + if (init > WIN_INIT) + init = WIN_INIT; + memset(s->window + curr, 0, init); + s->high_water = curr + init; + } else if (s->high_water < curr + WIN_INIT) { + /* High water mark at or above current data, but below current data + * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up + * to end of window, whichever is less. + */ + init = curr + WIN_INIT; + if (init > s->window_size) + init = s->window_size; + init -= s->high_water; + memset(s->window + s->high_water, 0, init); + s->high_water += init; + } + } + + Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, + "not enough room for search"); +} diff --git a/configure b/configure index 256c4210d..810721bea 100755 --- a/configure +++ b/configure @@ -831,6 +831,8 @@ case "${ARCH}" in # ARM specific optimizations arm | armv3l | armv4b | armv4l | armv4tl | armv5tel | armv5tejl | armv6l | armv6hl | armv7l | armv7hl | armv7hnl | armv8-a | armv8-a+crc | armv8.1-a) ARCHDIR=arch/arm + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo" case "${ARCH}" in armv6l | armv6hl) @@ -853,6 +855,9 @@ case "${ARCH}" in # 64-bit ARM specific optimizations aarch64) ARCHDIR=arch/aarch64 + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo" + if test $buildacle -eq 1; then CFLAGS="-march=armv8-a+crc ${CFLAGS} -DARM_ACLE_CRC_HASH" SFLAGS="-march=armv8-a+crc ${SFLAGS} -DARM_ACLE_CRC_HASH" diff --git a/deflate.c b/deflate.c index 78273e54b..60adf15b7 100644 --- a/deflate.c +++ b/deflate.c @@ -1188,6 +1188,8 @@ void check_match(deflate_state *s, IPos start, IPos match, int length) { */ #ifdef X86_SSE2_FILL_WINDOW extern void fill_window_sse(deflate_state *s); +#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) +extern void fill_window_arm(deflate_state *s); #endif void fill_window_c(deflate_state *s); @@ -1203,6 +1205,8 @@ void fill_window(deflate_state *s) { } # endif +#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) + fill_window_arm(s); #else fill_window_c(s); #endif diff --git a/win32/Makefile.arm b/win32/Makefile.arm index fee2d0548..fb330ce46 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -32,7 +32,7 @@ WITH_GZFILEOP = WITH_ACLE = OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_slow.obj \ - infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj + infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj fill_window_arm.obj !if "$(WITH_GZFILEOP)" != "" WFLAGS = $(WFLAGS) -DWITH_GZFILEOP OBJS = $(OBJS) gzclose.obj gzlib.obj gzread.obj gzwrite.obj