git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Replace the UNROLL_LESS define with UNROLL_MORE, making the less-unrolled (former UNROLL_LESS) code path the default.
author: Hans Kristian Rosbach <hk-git@circlestorm.org>
Wed, 16 Jan 2019 10:49:29 +0000 (11:49 +0100)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Mon, 21 Jan 2019 09:25:50 +0000 (10:25 +0100)
Performance benchmarks have so far not shown that any platform benefits from UNROLL_MORE,
although it might be beneficial on older compilers/CPUs or when compiling without optimizations.

The extra UNROLL_MORE code should be considered for removal since it is never enabled by us
and will likely only serve to confuse and contribute to bitrot.

CMakeLists.txt
adler32.c
arch/aarch64/crc32_acle.c
arch/arm/crc32_acle.c
configure
crc32.c
win32/Makefile.arm
win32/Makefile.msc

index 2872fe3681736f5ff910d1deb1141110c6ecb85e..4f9176ceb00c7dc569787bee9bfefc833abc1123 100644 (file)
@@ -483,17 +483,17 @@ set(ZLIB_ARCH_SRCS)
 set(ARCHDIR "arch/generic")
 if("${ARCH}" MATCHES "x86_64" OR "${ARCH}" MATCHES "AMD64")
     set(ARCHDIR "arch/x86")
-    add_definitions(-DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS)
+    add_definitions(-DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK)
     add_feature_info(SSE2 1 "Use the SSE2 instruction set, using \"${SSE2FLAG}\"")
 elseif("${ARCH}" MATCHES "arm")
     set(ARCHDIR "arch/arm")
-    add_definitions(-DUNALIGNED_OK -DUNROLL_LESS)
+    add_definitions(-DUNALIGNED_OK)
 elseif("${ARCH}" MATCHES "aarch64")
     set(ARCHDIR "arch/aarch64")
-    add_definitions(-DUNALIGNED_OK -DUNROLL_LESS)
+    add_definitions(-DUNALIGNED_OK)
 elseif("${ARCH}" MATCHES "i[3-6]86")
     set(ARCHDIR "arch/x86")
-    add_definitions(-DX86 -DUNALIGNED_OK -DUNROLL_LESS)
+    add_definitions(-DX86 -DUNALIGNED_OK)
     add_feature_info(SSE2 1 "Support the SSE2 instruction set, using \"${SSE2FLAG}\"")
 else()
     message(STATUS "No optimized architecture: using ${ARCHDIR}")
index dbbb5fe5772e8fe5dfdd24db68e841c259034189..651d73c562cf7037a61e8d4e4234c2c4b04762af 100644 (file)
--- a/adler32.c
+++ b/adler32.c
@@ -102,13 +102,13 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
     /* do length NMAX blocks -- requires just one modulo operation */
     while (len >= NMAX) {
         len -= NMAX;
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
         n = NMAX / 16;          /* NMAX is divisible by 16 */
 #else
         n = NMAX / 8;           /* NMAX is divisible by 8 */
 #endif
         do {
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
             DO16(buf);          /* 16 sums unrolled */
             buf += 16;
 #else
@@ -122,7 +122,7 @@ uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
 
     /* do remaining bytes (less than NMAX, still just one modulo) */
     if (len) {                  /* avoid modulos if none remaining */
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
         while (len >= 16) {
             len -= 16;
             DO16(buf);
index cee7825acbba0a8bfb8edce5e055a90534dfb5fa..5eeb96fda56689239f751ff271d7aeebe1f19bb7 100644 (file)
@@ -44,7 +44,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
 
     buf8 = (const uint64_t *) buf4;
 
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
     while (len >= 32) {
         c = __crc32d(c, *buf8++);
         c = __crc32d(c, *buf8++);
index d46ca2aaaac3157ef7acf32c05f946cea251b8d6..06e6739f83764d2a8296cacc3c2d506b938daf44 100644 (file)
@@ -36,7 +36,7 @@ uint32_t crc32_acle(uint32_t crc, const unsigned char *buf, uint64_t len) {
         buf4 = (const uint32_t *) buf;
     }
 
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
     while (len >= 32) {
         c = __crc32w(c, *buf4++);
         c = __crc32w(c, *buf4++);
index 393f5c6a0554f54d5c75d571425f9483dbc9bf33..355987ed0a47a7e181e74b3c74ef29a5587ce74e 100755 (executable)
--- a/configure
+++ b/configure
@@ -950,8 +950,8 @@ case "${ARCH}" in
     i386 | i486 | i586 | i686)
         ARCHDIR=arch/x86
 
-        CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS"
-        SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS"
+        CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK"
+        SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK"
 
         # Enable arch-specific optimizations?
         if test $without_optimizations -eq 0; then
@@ -1002,8 +1002,8 @@ case "${ARCH}" in
     x86_64)
         ARCHDIR=arch/x86
 
-        CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS"
-        SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS"
+        CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK"
+        SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK"
 
         # Enable arch-specific optimizations?
         if test $without_optimizations -eq 0; then
@@ -1062,10 +1062,8 @@ case "${ARCH}" in
                 fi
             ;;
             armv6l | armv6hl)
-                # Tests done on Raspberry pi (armv6hl) indicate that UNALIGNED_OK and UNROLL_LESS both
-                # provide performance improvements, totaling about 1.5% for the two.
-                CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-                SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+                CFLAGS="${CFLAGS} -DUNALIGNED_OK"
+                SFLAGS="${SFLAGS} -DUNALIGNED_OK"
 
                 if test $buildacle -eq 1; then
                     echo ACLE support not available
@@ -1076,8 +1074,8 @@ case "${ARCH}" in
                 fi
             ;;
             arm | armv7*)
-                CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-                SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+                CFLAGS="${CFLAGS} -DUNALIGNED_OK"
+                SFLAGS="${SFLAGS} -DUNALIGNED_OK"
 
                 if test $buildacle -eq 1; then
                     echo ACLE support not available
@@ -1092,8 +1090,8 @@ case "${ARCH}" in
                 fi
             ;;
             armv8-a | armv8-a+simd)
-                CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-                SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+                CFLAGS="${CFLAGS} -DUNALIGNED_OK"
+                SFLAGS="${SFLAGS} -DUNALIGNED_OK"
 
                 if test $buildacle -eq 1; then
                     echo ACLE support not available
@@ -1116,8 +1114,8 @@ case "${ARCH}" in
                 fi
             ;;
             armv8-a+crc | armv8-a+crc+simd | armv8.[1234]-a | armv8.[1234]-a+simd)
-                CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
-                SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
+                CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK"
+                SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK"
 
                 ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o"
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo"
@@ -1171,8 +1169,8 @@ case "${ARCH}" in
             ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o"
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo"
         fi
-        CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-        SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+        CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK"
+        SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK"
     ;;
     powerpc)
         [ ! -z $CROSS_PREFIX ] && QEMU_ARCH=ppc
diff --git a/crc32.c b/crc32.c
index 95080b3c5d2a0213afc4f806e3b9e063c0efb714..33e82d34c0f697f3d5ebf8f4dffd86d64747aa23 100644 (file)
--- a/crc32.c
+++ b/crc32.c
@@ -255,16 +255,16 @@ ZLIB_INTERNAL uint32_t crc32_generic(uint32_t crc, const unsigned char *buf, uin
 {
     crc = crc ^ 0xffffffff;
 
-#ifdef UNROLL_LESS
-    while (len >= 4) {
-        DO4;
-        len -= 4;
-    }
-#else
+#ifdef UNROLL_MORE
     while (len >= 8) {
         DO8;
         len -= 8;
     }
+#else
+    while (len >= 4) {
+        DO4;
+        len -= 4;
+    }
 #endif
 
     if (len) do {
@@ -310,7 +310,7 @@ ZLIB_INTERNAL uint32_t crc32_little(uint32_t crc, const unsigned char *buf, uint
 
     buf4 = (const uint32_t *)(const void *)buf;
 
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
     while (len >= 32) {
         DOLIT32;
         len -= 32;
@@ -352,7 +352,7 @@ ZLIB_INTERNAL uint32_t crc32_big(uint32_t crc, const unsigned char *buf, uint64_
 
     buf4 = (const uint32_t *)(const void *)buf;
 
-#ifndef UNROLL_LESS
+#ifdef UNROLL_MORE
     while (len >= 32) {
         DOBIG32;
         len -= 32;
index 1aec4d482d9a84cab8a43771630f75169b699004..727a6f24ed30b5949f49a8854999ce0ddfb7f0d3 100644 (file)
@@ -23,7 +23,7 @@ AR = lib
 RC = rc
 CP = copy /y
 CFLAGS  = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
-WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DUNALIGNED_OK -DUNROLL_LESS -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1
+WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DUNALIGNED_OK -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dARM /r
index 236d718d83a0e919cd8bd4664a04dbc5007e8998..a99099e5e03e17156009eb8903220ab6e726af1a 100644 (file)
@@ -23,7 +23,7 @@ AR = lib
 RC = rc
 CP = copy /y
 CFLAGS  = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
-WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2_FILL_WINDOW -DX86_CPUID -DX86_SSE4_2_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS -DX86_QUICK_STRATEGY
+WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2_FILL_WINDOW -DX86_CPUID -DX86_SSE4_2_CRC_HASH -DUNALIGNED_OK -DX86_QUICK_STRATEGY
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dWIN32 /r