add_definitions(-DX86 -DUNALIGNED_OK -DUNROLL_LESS)
add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"")
endif()
+# 32-bit and 64-bit ARM builds both compile the ARM fill_window source.
+if("${ARCH}" MATCHES "arm|aarch64")
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/fill_window_arm.c)
+endif()
if(WITH_OPTIM)
if("${ARCH}" MATCHES "arm")
if(WITH_ACLE)
trees.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/trees.h
zutil.o: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h
arch/aarch64/crc32_acle.o: zconf.h
+arch/aarch64/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/aarch64/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/arm/crc32_acle.o: zconf.h
+arch/arm/fill_window_arm.o: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/arm/insert_string_acle.o: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/x86/crc_folding.o: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/x86/crc_pclmulqdq.o: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
trees.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h $(SRCDIR)/trees.h
zutil.lo: $(SRCDIR)/zutil.h $(SRCDIR)/gzguts.h $(SRCDIR)/zlib.h zconf.h
arch/aarch64/crc32_acle.lo: zconf.h
+arch/aarch64/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/aarch64/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/arm/crc32_acle.lo: zconf.h
+arch/arm/fill_window_arm.lo: $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/arm/insert_string_acle.lo: $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/x86/crc_folding.lo: $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
arch/x86/crc_pclmulqdq.lo: $(SRCDIR)/arch/x86/x86.h $(SRCDIR)/arch/x86/crc_folding.h $(SRCDIR)/deflate.h $(SRCDIR)/zutil.h $(SRCDIR)/zlib.h zconf.h
SRCTOP=../..
TOPDIR=$(SRCTOP)
-all: crc32_acle.o crc32_acle.lo insert_string_acle.o insert_string_acle.lo
+all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
crc32_acle.o: $(SRCDIR)/crc32_acle.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
crc32_acle.lo: $(SRCDIR)/crc32_acle.c
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+# Static (.o, CFLAGS) and shared (.lo, SFLAGS) objects built from the same source.
+fill_window_arm.o: $(SRCDIR)/fill_window_arm.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+
+fill_window_arm.lo: $(SRCDIR)/fill_window_arm.c
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+
insert_string_acle.o: $(SRCDIR)/insert_string_acle.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
# DO NOT DELETE THIS LINE -- make depend depends on it.
crc32_acle.o: $(TOPDIR)/zconf.h
+fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
crc32_acle.lo: $(TOPDIR)/zconf.h
+fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
--- /dev/null
+/* fill_window_arm.c -- Optimized hash table shifting for ARM
+ * Copyright (C) 2017 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#include "deflate.h"
+#include "deflate_p.h"
+
+extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); /* declared here, defined elsewhere */
+/* Top up the lookahead of deflate state s from s->strm.
+ *
+ * The loop below runs at least once and repeats while s->lookahead is under
+ * MIN_LOOKAHEAD and s->strm->avail_in is non-zero. Each pass:
+ *  - slides the window down by s->w_size bytes once s->strstart reaches
+ *    w_size + MAX_DIST(s), rebasing match_start, strstart, block_start and
+ *    the head[] and prev[] tables by the same amount;
+ *  - stops if there is no input available;
+ *  - appends input after the current data with read_buf();
+ *  - passes positions counted in s->insert to insert_string().
+ * After the loop, up to WIN_INIT bytes past the current data are zeroed and
+ * s->high_water is updated.
+ *
+ * The two table-sliding loops are plain indexed loops; presumably that is the
+ * "optimized hash table shifting" named in the file header (e.g. so that the
+ * compiler can vectorize them) -- TODO confirm.
+ */
+void fill_window_arm(deflate_state *s) {
+ register unsigned n; /* scratch: hash_size loop bound, then the value returned by read_buf() */
+ unsigned long more; /* Amount of free space at the end of the window. */
+ unsigned int wsize = s->w_size; /* distance the window slides by */
+
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+ do {
+ more = s->window_size - s->lookahead - s->strstart;
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+ unsigned int i;
+ /* Copy wsize bytes from offset wsize down to the start of the window,
+ * then rebase the positions that index into it by the same amount.
+ */
+ memcpy(s->window, s->window+wsize, wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ {
+ n = s->hash_size;
+ for (i = 0; i < n; i++) {
+ if (s->head[i] >= wsize)
+ s->head[i] -= wsize;
+ else
+ s->head[i] = NIL; /* entry was below wsize, so it cannot be rebased */
+ }
+ }
+ /* Same adjustment for the first wsize entries of prev[]. */
+ {
+ for (i = 0; i < wsize; i++) {
+ if (s->prev[i] >= wsize)
+ s->prev[i] -= wsize;
+ else
+ s->prev[i] = NIL;
+ }
+ }
+ more += wsize; /* strstart dropped by wsize, so the free space grew by wsize */
+ }
+ if (s->strm->avail_in == 0)
+ break; /* nothing to read; go straight to the high-water handling */
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+ /* Append input right after the current data. The value returned by
+ * read_buf() is treated as the number of bytes added to the lookahead.
+ */
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead + s->insert >= MIN_MATCH) {
+ unsigned int str = s->strstart - s->insert; /* strstart minus the pending insert count */
+ unsigned int insert_cnt = s->insert;
+ unsigned int slen; /* amount taken off s->insert after inserting */
+
+ s->ins_h = s->window[str];
+ /* With less than MIN_MATCH bytes of lookahead, reduce the count by
+ * (MIN_MATCH - lookahead) so those positions are not passed to
+ * insert_string() below. The enclosing condition guarantees the
+ * result does not go below zero.
+ */
+ if (unlikely(s->lookahead < MIN_MATCH))
+ insert_cnt += s->lookahead - MIN_MATCH;
+ slen = insert_cnt;
+ if (str >= (MIN_MATCH - 2))
+ {
+ str += 2 - MIN_MATCH; /* step back MIN_MATCH - 2 positions ... */
+ insert_cnt += MIN_MATCH - 2; /* ... and cover them too; slen is not affected */
+ }
+ if (insert_cnt > 0)
+ {
+ insert_string(s, str, insert_cnt);
+ s->insert -= slen;
+ }
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ unsigned long curr = s->strstart + (unsigned long)s->lookahead; /* offset just past the current data */
+ unsigned long init; /* number of bytes to zero */
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ memset(s->window + curr, 0, init);
+ s->high_water = curr + init;
+ } else if (s->high_water < curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = curr + WIN_INIT;
+ if (init > s->window_size)
+ init = s->window_size;
+ init -= s->high_water;
+ memset(s->window + s->high_water, 0, init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
+}
SRCTOP=../..
TOPDIR=$(SRCTOP)
-all: crc32_acle.o crc32_acle.lo insert_string_acle.o insert_string_acle.lo
+all: crc32_acle.o crc32_acle.lo fill_window_arm.o fill_window_arm.lo insert_string_acle.o insert_string_acle.lo
crc32_acle.o: $(SRCDIR)/crc32_acle.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
crc32_acle.lo: $(SRCDIR)/crc32_acle.c
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
+# Static (.o, CFLAGS) and shared (.lo, SFLAGS) objects built from the same source.
+fill_window_arm.o: $(SRCDIR)/fill_window_arm.c
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+
+fill_window_arm.lo: $(SRCDIR)/fill_window_arm.c
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/fill_window_arm.c
+
insert_string_acle.o: $(SRCDIR)/insert_string_acle.c
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
# DO NOT DELETE THIS LINE -- make depend depends on it.
crc32_acle.o: $(TOPDIR)/zconf.h
+fill_window_arm.o: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
insert_string_acle.o: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
crc32_acle.lo: $(TOPDIR)/zconf.h
+fill_window_arm.lo: $(SRCTOP)/deflate.h $(SRCTOP)/deflate_p.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
insert_string_acle.lo: $(SRCTOP)/deflate.h $(SRCTOP)/zutil.h $(SRCTOP)/zlib.h $(TOPDIR)/zconf.h
--- /dev/null
+/* fill_window_arm.c -- Optimized hash table shifting for ARM
+ * Copyright (C) 2017 Mika T. Lindqvist
+ *
+ * Authors:
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#include "deflate.h"
+#include "deflate_p.h"
+
+extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); /* declared here, defined elsewhere */
+/* Top up the lookahead of deflate state s from s->strm.
+ *
+ * The loop below runs at least once and repeats while s->lookahead is under
+ * MIN_LOOKAHEAD and s->strm->avail_in is non-zero. Each pass:
+ *  - slides the window down by s->w_size bytes once s->strstart reaches
+ *    w_size + MAX_DIST(s), rebasing match_start, strstart, block_start and
+ *    the head[] and prev[] tables by the same amount;
+ *  - stops if there is no input available;
+ *  - appends input after the current data with read_buf();
+ *  - passes positions counted in s->insert to insert_string().
+ * After the loop, up to WIN_INIT bytes past the current data are zeroed and
+ * s->high_water is updated.
+ *
+ * The two table-sliding loops are plain indexed loops; presumably that is the
+ * "optimized hash table shifting" named in the file header (e.g. so that the
+ * compiler can vectorize them) -- TODO confirm.
+ */
+void fill_window_arm(deflate_state *s) {
+ register unsigned n; /* scratch: hash_size loop bound, then the value returned by read_buf() */
+ unsigned long more; /* Amount of free space at the end of the window. */
+ unsigned int wsize = s->w_size; /* distance the window slides by */
+
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
+ do {
+ more = s->window_size - s->lookahead - s->strstart;
+
+ /* If the window is almost full and there is insufficient lookahead,
+ * move the upper half to the lower one to make room in the upper half.
+ */
+ if (s->strstart >= wsize+MAX_DIST(s)) {
+ unsigned int i;
+ /* Copy wsize bytes from offset wsize down to the start of the window,
+ * then rebase the positions that index into it by the same amount.
+ */
+ memcpy(s->window, s->window+wsize, wsize);
+ s->match_start -= wsize;
+ s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
+ s->block_start -= wsize;
+
+ /* Slide the hash table (could be avoided with 32 bit values
+ at the expense of memory usage). We slide even when level == 0
+ to keep the hash table consistent if we switch back to level > 0
+ later. (Using level 0 permanently is not an optimal usage of
+ zlib, so we don't care about this pathological case.)
+ */
+ {
+ n = s->hash_size;
+ for (i = 0; i < n; i++) {
+ if (s->head[i] >= wsize)
+ s->head[i] -= wsize;
+ else
+ s->head[i] = NIL; /* entry was below wsize, so it cannot be rebased */
+ }
+ }
+ /* Same adjustment for the first wsize entries of prev[]. */
+ {
+ for (i = 0; i < wsize; i++) {
+ if (s->prev[i] >= wsize)
+ s->prev[i] -= wsize;
+ else
+ s->prev[i] = NIL;
+ }
+ }
+ more += wsize; /* strstart dropped by wsize, so the free space grew by wsize */
+ }
+ if (s->strm->avail_in == 0)
+ break; /* nothing to read; go straight to the high-water handling */
+
+ /* If there was no sliding:
+ * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
+ * more == window_size - lookahead - strstart
+ * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
+ * => more >= window_size - 2*WSIZE + 2
+ * In the BIG_MEM or MMAP case (not yet supported),
+ * window_size == input_size + MIN_LOOKAHEAD &&
+ * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
+ * Otherwise, window_size == 2*WSIZE so more >= 2.
+ * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
+ */
+ Assert(more >= 2, "more < 2");
+ /* Append input right after the current data. The value returned by
+ * read_buf() is treated as the number of bytes added to the lookahead.
+ */
+ n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more);
+ s->lookahead += n;
+
+ /* Initialize the hash value now that we have some input: */
+ if (s->lookahead + s->insert >= MIN_MATCH) {
+ unsigned int str = s->strstart - s->insert; /* strstart minus the pending insert count */
+ unsigned int insert_cnt = s->insert;
+ unsigned int slen; /* amount taken off s->insert after inserting */
+
+ s->ins_h = s->window[str];
+ /* With less than MIN_MATCH bytes of lookahead, reduce the count by
+ * (MIN_MATCH - lookahead) so those positions are not passed to
+ * insert_string() below. The enclosing condition guarantees the
+ * result does not go below zero.
+ */
+ if (unlikely(s->lookahead < MIN_MATCH))
+ insert_cnt += s->lookahead - MIN_MATCH;
+ slen = insert_cnt;
+ if (str >= (MIN_MATCH - 2))
+ {
+ str += 2 - MIN_MATCH; /* step back MIN_MATCH - 2 positions ... */
+ insert_cnt += MIN_MATCH - 2; /* ... and cover them too; slen is not affected */
+ }
+ if (insert_cnt > 0)
+ {
+ insert_string(s, str, insert_cnt);
+ s->insert -= slen;
+ }
+ }
+ /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
+ * but this is not important since only literal bytes will be emitted.
+ */
+ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ unsigned long curr = s->strstart + (unsigned long)s->lookahead; /* offset just past the current data */
+ unsigned long init; /* number of bytes to zero */
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ memset(s->window + curr, 0, init);
+ s->high_water = curr + init;
+ } else if (s->high_water < curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = curr + WIN_INIT;
+ if (init > s->window_size)
+ init = s->window_size;
+ init -= s->high_water;
+ memset(s->window + s->high_water, 0, init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
+}
# ARM specific optimizations
arm | armv3l | armv4b | armv4l | armv4tl | armv5tel | armv5tejl | armv6l | armv6hl | armv7l | armv7hl | armv7hnl | armv8-a | armv8-a+crc | armv8.1-a)
ARCHDIR=arch/arm
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
case "${ARCH}" in
armv6l | armv6hl)
# 64-bit ARM specific optimizations
aarch64)
ARCHDIR=arch/aarch64
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
+
if test $buildacle -eq 1; then
CFLAGS="-march=armv8-a+crc ${CFLAGS} -DARM_ACLE_CRC_HASH"
SFLAGS="-march=armv8-a+crc ${SFLAGS} -DARM_ACLE_CRC_HASH"
*/
#ifdef X86_SSE2_FILL_WINDOW
extern void fill_window_sse(deflate_state *s);
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
+extern void fill_window_arm(deflate_state *s);
#endif
void fill_window_c(deflate_state *s);
}
# endif
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
+ fill_window_arm(s);
#else
fill_window_c(s);
#endif
WITH_ACLE =
OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_slow.obj \
- infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj
+ infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj fill_window_arm.obj
!if "$(WITH_GZFILEOP)" != ""
WFLAGS = $(WFLAGS) -DWITH_GZFILEOP
OBJS = $(OBJS) gzclose.obj gzlib.obj gzread.obj gzwrite.obj