git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Separate feature checks for x86 and x86_64
author: Mika Lindqvist <postmaster@raasu.org>
Fri, 23 Mar 2018 12:48:53 +0000 (14:48 +0200)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 20 May 2018 12:56:41 +0000 (14:56 +0200)
* Don't check for SSE2 on anything other than i686
* Don't check for PCLMULQDQ on anything other than i686 or x86_64
* Check for SSE4.2 CRC intrinsics

CMakeLists.txt
arch/x86/insert_string_sse.c
arch/x86/x86.c
configure

index d19b26af5e48ab6abf913877ac0142382b3ef8fd..60868dab3e6a69a11057ce8591ec98ce1be14be3 100644 (file)
@@ -318,6 +318,17 @@ else()
         }"
         HAVE_SSE42_INTRIN
     )
+    check_c_source_compile_or_run(
+        "int main(void)
+        {
+            unsigned crc = 0;
+            char c = 'c';
+            crc = __builtin_ia32_crc32qi(crc, c);
+            (void)crc;
+            return 0;
+        }"
+        HAVE_SSE42CRC_INTRIN
+    )
     if(WITH_NATIVE_INSTRUCTIONS)
         set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
     else()
@@ -431,6 +442,9 @@ if(WITH_OPTIM)
             set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/insert_string_sse.c)
             add_feature_info(SSE4_CRC 1 "Support CRC hash generation using the SSE4.2 instruction set, using \"${SSE4FLAG}\"")
             add_intrinsics_option("${SSE4FLAG}")
+            if(HAVE_SSE42CRC_INTRIN)
+                add_definitions(-DX86_SSE4_2_CRC_INTRIN)
+            endif()
             if(WITH_NEW_STRATEGIES)
                 add_definitions(-DX86_QUICK_STRATEGY)
                 set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/deflate_quick.c)
index cb756ac5c371a9eeb5c1523ca1f7bd1ea32efa5d..bf09aabb8bd10260de13bfaee4b018b9cfb80f22 100644 (file)
@@ -32,6 +32,8 @@ ZLIB_INTERNAL Pos insert_string_sse(deflate_state *const s, const Pos str, unsig
 
 #ifdef _MSC_VER
         h = _mm_crc32_u32(h, val);
+#elif defined(X86_SSE4_2_CRC_INTRIN)
+        h = __builtin_ia32_crc32si(h, val);
 #else
         __asm__ __volatile__ (
             "crc32 %1,%0\n\t"
index 45bd63de2d67cead952e97cbe8ba72c64b7a3214..c04e0a79ad750114e5ea1cbc77ee922b8fe14824 100644 (file)
@@ -46,15 +46,23 @@ static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigne
 
 void ZLIB_INTERNAL x86_check_features(void) {
        unsigned eax, ebx, ecx, edx;
+       unsigned maxbasic;
+
+       cpuid(0, &maxbasic, &ebx, &ecx, &edx);
+
        cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
 
        x86_cpu_has_sse2 = edx & 0x4000000;
        x86_cpu_has_sse42 = ecx & 0x100000;
        x86_cpu_has_pclmulqdq = ecx & 0x2;
 
-       cpuid(7, &eax, &ebx, &ecx, &edx);
+       if (maxbasic >= 7) {
+         cpuid(7, &eax, &ebx, &ecx, &edx);
 
-       // check BMI1 bit
-       // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
-       x86_cpu_has_tzcnt = ebx & 0x8;
+         // check BMI1 bit
+         // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
+         x86_cpu_has_tzcnt = ebx & 0x8;
+       } else {
+         x86_cpu_has_tzcnt = 0;
+       }
 }
index 9de84b975adf626c1f9c8100bda8a12ef5007e9f..71e2ff60159a2b1896ac978cd0659798404fbb0b 100755 (executable)
--- a/configure
+++ b/configure
@@ -102,6 +102,7 @@ floatabi=
 native=0
 sse2flag="-msse2"
 sse4flag="-msse4"
+sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
 without_optimizations=0
 without_new_strategies=0
@@ -751,6 +752,7 @@ else
 fi
 
 # Check for SSE2 intrinsics
+if test "${ARCH}" = "i686"; then
 cat > $test.c << EOF
 #include <immintrin.h>
 int main(void) {
@@ -767,7 +769,31 @@ else
     HAVE_SSE2_INTRIN=0
 fi
 
+fi
+
+# Check for SSE4.2 CRC intrinsics
+if test "${ARCH}" = "i686" || test "${ARCH}" = "x86_64"; then
+cat > $test.c << EOF
+int main(void) {
+    unsigned crc = 0;
+    char c = 'c';
+    crc = __builtin_ia32_crc32qi(crc, c);
+    (void)crc;
+    return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} ${sse42flag} $test.c; then
+    echo "Checking for SSE4.2 CRC intrinsics ... Yes." | tee -a configure.log
+    HAVE_SSE42CRC_INTRIN=1
+else
+    echo "Checking for SSE4.2 CRC intrinsics ... No." | tee -a configure.log
+    HAVE_SSE42CRC_INTRIN=0
+fi
+
+fi
+
 # Check for PCLMULQDQ intrinsics
+if test "${ARCH}" = "i686" || test "${ARCH}" = "x86_64"; then
 cat > $test.c << EOF
 #include <immintrin.h>
 #include <wmmintrin.h>
@@ -793,32 +819,23 @@ if test $without_new_strategies -eq 0; then
     SFLAGS="${SFLAGS} -DMEDIUM_STRATEGY"
 fi
 
+fi
+
 ARCHDIR='arch/generic'
 ARCH_STATIC_OBJS=''
 ARCH_SHARED_OBJS=''
 
 # Set ARCH specific FLAGS
 case "${ARCH}" in
-    # x86 and x86_64 specific optimizations
-    i386 | i486 | i586 | i686 | x86_64)
-           ARCHDIR=arch/x86
-
-        case "${ARCH}" in
-            x86_64)
-                CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2"
-                SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2"
-            ;;
-            i386 | i486 | i586 | i686)
-                CFLAGS="${CFLAGS} -DX86"
-                SFLAGS="${SFLAGS} -DX86"
-            ;;
-        esac
+    # x86 specific optimizations
+    i386 | i486 | i586 | i686)
+        ARCHDIR=arch/x86
 
-        CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-        SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+        CFLAGS="${CFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS"
+        SFLAGS="${SFLAGS} -DX86 -DUNALIGNED_OK -DUNROLL_LESS"
 
-       # Enable arch-specific optimizations?
-           if test $without_optimizations -eq 0; then
+        # Enable arch-specific optimizations?
+        if test $without_optimizations -eq 0; then
             CFLAGS="${CFLAGS} -DX86_CPUID"
             SFLAGS="${SFLAGS} -DX86_CPUID"
 
@@ -830,6 +847,21 @@ case "${ARCH}" in
                 SFLAGS="${SFLAGS} -DX86_SSE2_FILL_WINDOW"
                 ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_sse.o"
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_sse.lo"
+
+                # Enable deflate_quick at level 1?
+                # requires SSE2: code uses fill_window_sse
+                if test $without_new_strategies -eq 0; then
+                    CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
+                    SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY"
+
+                    ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} deflate_quick.o"
+                    ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} deflate_quick.lo"
+                fi
+            fi
+
+            if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_INTRIN"
+                SFLAGS="${SFLAGS} -DX86_SSE4_2_CRC_INTRIN"
             fi
 
             CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_HASH"
@@ -844,9 +876,38 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo crc_pclmulqdq.lo"
             fi
 
-           # Enable deflate_quick at level 1?
-           # requires SSE2: code uses fill_window_sse
-            if test ${HAVE_SSE2_INTRIN} -eq 1 && test $without_new_strategies -eq 0; then
+        fi
+    ;;
+
+    # x86_64 specific optimizations
+    x86_64)
+        ARCHDIR=arch/x86
+
+        CFLAGS="${CFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS"
+        SFLAGS="${SFLAGS} -DX86_64 -DX86_NOCHECK_SSE2 -DUNALIGNED_OK -DUNROLL_LESS"
+
+        # Enable arch-specific optimizations?
+        if test $without_optimizations -eq 0; then
+            CFLAGS="${CFLAGS} -DX86_CPUID -DX86_SSE2_FILL_WINDOW -DX86_SSE4_2_CRC_HASH"
+            SFLAGS="${SFLAGS} -DX86_CPUID -DX86_SSE2_FILL_WINDOW -DX86_SSE4_2_CRC_HASH"
+
+            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o fill_window_sse.o insert_string_sse.o"
+            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo fill_window_sse.lo insert_string_sse.lo"
+
+            if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_SSE4_2_CRC_INTRIN"
+                SFLAGS="${SFLAGS} -DX86_SSE4_2_CRC_INTRIN"
+            fi
+
+            if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC"
+                SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc_folding.o crc_pclmulqdq.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo crc_pclmulqdq.lo"
+            fi
+
+            # Enable deflate_quick at level 1?
+            if test $without_new_strategies -eq 0; then
                 CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
                 SFLAGS="${SFLAGS} -DX86_QUICK_STRATEGY"