From: Hans Kristian Rosbach Date: Wed, 16 Jan 2019 10:49:29 +0000 (+0100) Subject: Replace the UNROLL_LESS define with UNROLL_MORE, making UNROLL_LESS the default. X-Git-Tag: 1.9.9-b1~551 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=18d26229390a7e92627ee8350b3a2687e878fd03;p=thirdparty%2Fzlib-ng.git Replace the UNROLL_LESS define with UNROLL_MORE, making UNROLL_LESS the default. Performance benchmarks have so far not shown that any platform benefits from UNROLL_MORE, although this might be beneficial on older compilers/cpus or for compiling without optimizations. The extra UNROLL_MORE code should be considered for removal since it is never enabled by us and will likely only serve to confuse and contribute to bitrot. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 2872fe36..4f9176ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -483,17 +483,17 @@ set(ZLIB_ARCH_SRCS) set(ARCHDIR "arch/generic") if("${ARCH}" MATCHES "x86_64" OR "${ARCH}" MATCHES "AMD64") set(ARCHDIR "arch/x86") - add_definitions(-DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS) + add_definitions(-DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK) add_feature_info(SSE2 1 "Use the SSE2 instruction set, using \"${SSE2FLAG}\"") elseif("${ARCH}" MATCHES "arm") set(ARCHDIR "arch/arm") - add_definitions(-DUNALIGNED_OK -DUNROLL_LESS) + add_definitions(-DUNALIGNED_OK) elseif("${ARCH}" MATCHES "aarch64") set(ARCHDIR "arch/aarch64") - add_definitions(-DUNALIGNED_OK -DUNROLL_LESS) + add_definitions(-DUNALIGNED_OK) elseif("${ARCH}" MATCHES "i[3-6]86") set(ARCHDIR "arch/x86") - add_definitions(-DX86 -DUNALIGNED_OK -DUNROLL_LESS) + add_definitions(-DX86 -DUNALIGNED_OK) add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"") else() message(STATUS "No optimized architecture: using ${ARCHDIR}") diff --git a/adler32.c b/adler32.c index dbbb5fe5..651d73c5 100644 --- a/adler32.c +++ b/adler32.c @@ -102,13 +102,13 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) { /* do length NMAX blocks -- requires just one modulo operation */ while (len >= NMAX) { len -= NMAX; -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE n = NMAX / 16; /* NMAX is divisible by 16 */ #else n = NMAX / 8; /* NMAX is divisible by 8 */ #endif do { -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE DO16(buf); /* 16 sums unrolled */ buf += 16; #else @@ -122,7 +122,7 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) { /* do remaining bytes (less than NMAX, still just one modulo) */ if (len) { /* avoid modulos if none remaining */ -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE while (len >= 16) { len -= 16; DO16(buf); diff --git a/arch/aarch64/crc32_acle.c b/arch/aarch64/crc32_acle.c index cee7825a..5eeb96fd 100644 --- a/arch/aarch64/crc32_acle.c +++ b/arch/aarch64/crc32_acle.c @@ -44,7 +44,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { buf8 = (const uint64_t *) buf4; -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE while (len >= 32) { c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); diff --git a/arch/arm/crc32_acle.c b/arch/arm/crc32_acle.c index d46ca2aa..06e6739f 100644 --- a/arch/arm/crc32_acle.c +++ b/arch/arm/crc32_acle.c @@ -36,7 +36,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) { buf4 = (const uint32_t *) buf; } -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE while (len >= 32) { c = __crc32w(c, *buf4++); c = __crc32w(c, *buf4++); diff --git a/configure b/configure index 393f5c6a..355987ed 100755 --- a/configure +++ b/configure @@ -950,8 +950,8 @@ case "${ARCH}" in i386 | i486 | i586 | i686) ARCHDIR=arch/x86 - CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK" + SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK" # Enable arch-specific optimizations? if test $without_optimizations -eq 0; then @@ -1002,8 +1002,8 @@ case "${ARCH}" in x86_64) ARCHDIR=arch/x86 - CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK" + SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK" # Enable arch-specific optimizations? if test $without_optimizations -eq 0; then @@ -1062,10 +1062,8 @@ case "${ARCH}" in fi ;; armv6l | armv6hl) - # Tests done on Raspberry pi (armv6hl) indicate that UNALIGNED_OK and UNROLL_LESS both - # provide performance improvements, totaling about 1.5% for the two. - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DUNALIGNED_OK" + SFLAGS="${SFLAGS} -DUNALIGNED_OK" if test $buildacle -eq 1; then echo ACLE support not available @@ -1076,8 +1074,8 @@ case "${ARCH}" in fi ;; arm | armv7*) - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DUNALIGNED_OK" + SFLAGS="${SFLAGS} -DUNALIGNED_OK" if test $buildacle -eq 1; then echo ACLE support not available @@ -1092,8 +1090,8 @@ case "${ARCH}" in fi ;; armv8-a | armv8-a+simd) - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="${CFLAGS} -DUNALIGNED_OK" + SFLAGS="${SFLAGS} -DUNALIGNED_OK" if test $buildacle -eq 1; then echo ACLE support not available @@ -1116,8 +1114,8 @@ case "${ARCH}" in fi ;; armv8-a+crc | armv8-a+crc+simd | armv8.[1234]-a | armv8.[1234]-a+simd) - CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK" + SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo" @@ -1171,8 +1169,8 @@ case "${ARCH}" in ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo" fi - CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK" + SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK" ;; powerpc) [ ! -z $CROSS_PREFIX ] && QEMU_ARCH=ppc diff --git a/crc32.c b/crc32.c index 95080b3c..33e82d34 100644 --- a/crc32.c +++ b/crc32.c @@ -255,16 +255,16 @@ ZLIB_INTERNAL uint32_t crc32_generic(uint32_t crc, const unsigned char *buf, uin { crc = crc ^ 0xffffffff; -#ifdef UNROLL_LESS - while (len >= 4) { - DO4; - len -= 4; - } -#else +#ifdef UNROLL_MORE while (len >= 8) { DO8; len -= 8; } +#else + while (len >= 4) { + DO4; + len -= 4; + } #endif if (len) do { @@ -310,7 +310,7 @@ ZLIB_INTERNAL uint32_t crc32_little(uint32_t crc, const unsigned char *buf, uint buf4 = (const uint32_t *)(const void *)buf; -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE while (len >= 32) { DOLIT32; len -= 32; @@ -352,7 +352,7 @@ ZLIB_INTERNAL uint32_t crc32_big(uint32_t crc, const unsigned char *buf, uint64_ buf4 = (const uint32_t *)(const void *)buf; -#ifndef UNROLL_LESS +#ifdef UNROLL_MORE while (len >= 32) { DOBIG32; len -= 32; diff --git a/win32/Makefile.arm b/win32/Makefile.arm index 1aec4d48..727a6f24 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -23,7 +23,7 @@ AR = lib RC = rc CP = copy /y CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) -WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DUNALIGNED_OK -DUNROLL_LESS -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 +WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DUNALIGNED_OK -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo RCFLAGS = /dARM /r diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 236d718d..a99099e5 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -23,7 +23,7 @@ AR = lib RC = rc CP = copy /y CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC) -WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2_FILL_WINDOW -DX86_CPUID -DX86_SSE4_2_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS -DX86_QUICK_STRATEGY +WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2_FILL_WINDOW -DX86_CPUID -DX86_SSE4_2_CRC_HASH -DUNALIGNED_OK -DX86_QUICK_STRATEGY LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest ARFLAGS = -nologo RCFLAGS = /dWIN32 /r