git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
VPCLMULQDQ implementation for Intel's CRC32 folding.
author: Nathan Moinvaziri <nathan@nathanm.com>
Wed, 15 Dec 2021 22:21:58 +0000 (14:21 -0800)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 9 Jan 2022 20:27:42 +0000 (21:27 +0100)
Based on PR https://github.com/jtkukunas/zlib/pull/28.

Co-authored-by: Wangyang Guo <wangyang.guo@intel.com>
CMakeLists.txt
README.md
arch/x86/INDEX.md
arch/x86/Makefile.in
arch/x86/crc32_fold_pclmulqdq.c
arch/x86/crc32_fold_vpclmulqdq.c [new file with mode: 0644]
arch/x86/x86.c
arch/x86/x86.h
cmake/detect-intrinsics.cmake
configure

index 956f735634692c1321489988a536718a618d124c..ecb9ac6b279f64ccb7762a97644ad9df0bc4f6cc 100644 (file)
@@ -113,6 +113,7 @@ elseif(BASEARCH_X86_FOUND)
     option(WITH_SSE41 "Build with SSE41" ON)
     option(WITH_SSE42 "Build with SSE42" ON)
     option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
+    option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON)
 endif()
 
 option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF)
@@ -827,11 +828,28 @@ if(WITH_OPTIM)
                 add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
                 list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
                 set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+
+                if(WITH_VPCLMULQDQ AND WITH_AVX512)
+                    check_vpclmulqdq_intrinsics()
+                    if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
+                        add_definitions(-DX86_VPCLMULQDQ_CRC)
+                        set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c)
+                        add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"")
+                        list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
+                        set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+                    else()
+                        set(WITH_VPCLMULQDQ OFF)
+                    endif()
+                else()
+                    set(WITH_VPCLMULQDQ OFF)
+                endif()
             else()
                 set(WITH_PCLMULQDQ OFF)
+                set(WITH_VPCLMULQDQ OFF)
             endif()
         else()
             set(WITH_PCLMULQDQ OFF)
+            set(WITH_VPCLMULQDQ OFF)
         endif()
     endif()
 endif()
@@ -1458,6 +1476,7 @@ elseif(BASEARCH_X86_FOUND)
     add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
     add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
     add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
+    add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
 endif()
 
 add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install")
index 44e63db3025ac9f91b65a1d65316c03da7b07548..79d6648bdb8fe7861b1e8d954eadbaaec08c85ca 100644 (file)
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Features
 * Deflate medium and quick algorithms based on Intels zlib fork
 * Support for CPU intrinsics when available
   * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
-  * CRC32-B implementation using PCLMULQDQ & ACLE
+  * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, & ACLE
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
   * Compare256/258 implementations using SSE4.2 & AVX2
@@ -203,6 +203,7 @@ Advanced Build Options
 | WITH_SSE41                      |                       | Build with SSE41 intrinsics                                         | ON                     |
 | WITH_SSE42                      |                       | Build with SSE42 intrinsics                                         | ON                     |
 | WITH_PCLMULQDQ                  |                       | Build with PCLMULQDQ intrinsics                                     | ON                     |
+| WITH_VPCLMULQDQ                 | --without-vpclmulqdq  | Build with VPCLMULQDQ intrinsics                                    | ON                     |
 | WITH_ACLE                       | --without-acle        | Build with ACLE intrinsics                                          | ON                     |
 | WITH_NEON                       | --without-neon        | Build with NEON intrinsics                                          | ON                     |
 | WITH_ALTIVEC                    | --without-altivec     | Build with AltiVec (VMX) intrinsics                                 | ON                     |
index e20245a5e1df41396dcce233bb0514cc9a938d43..af987a25c430be4016760849543685071c1ba042 100644 (file)
@@ -5,4 +5,5 @@ Contents
 |:-|:-|
 |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
 |crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|crc32_fold_vpclmulqdq.c|VPCLMULQDQ optimized CRC folding implementation|
 |slide_hash_sse2.c|SSE2 optimized slide_hash|
index f54a695c22cc88f50c29425db73ffd3e53587b47..0a1dc07662c71390257ce110d9402be0124d274e 100644 (file)
@@ -16,6 +16,7 @@ SSSE3FLAG=-mssse3
 SSE41FLAG=-msse4.1
 SSE42FLAG=-msse4.2
 PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
 NOLTOFLAG=
 
 SRCDIR=.
@@ -35,6 +36,7 @@ all: \
        compare258_sse.o compare258_sse.lo \
        insert_string_sse.o insert_string_sse.lo \
        crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
+       crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
        slide_hash_avx.o slide_hash_avx.lo \
        slide_hash_sse.o slide_hash_sse.lo
 
@@ -80,6 +82,12 @@ crc32_fold_pclmulqdq.o:
 crc32_fold_pclmulqdq.lo:
        $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
 
+crc32_fold_vpclmulqdq.o:
+       $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
+crc32_fold_vpclmulqdq.lo:
+       $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
 slide_hash_avx.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c
 
index d07ffb4085bd4b193fdb0517c220fcf7cf430757..1434357a875bdc70968926e1bfaeb5c3f7c3f743 100644 (file)
 
 #include "../../crc32_fold.h"
 
+#ifdef X86_VPCLMULQDQ_CRC
+extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
 static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
     const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
                                              0x00000001, 0xc6e41596);
@@ -275,6 +280,16 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
         xmm_crc_part = _mm_setzero_si128();
     }
 
+#ifdef X86_VPCLMULQDQ_CRC
+    if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+
+        len -= n;
+        src += n;
+        dst += n;
+    }
+#endif
+
     while (len >= 64) {
         crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
 
diff --git a/arch/x86/crc32_fold_vpclmulqdq.c b/arch/x86/crc32_fold_vpclmulqdq.c
new file mode 100644 (file)
index 0000000..9ed54b3
--- /dev/null
@@ -0,0 +1,108 @@
+/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_CRC
+#include "../../zutil.h"
+
+#include <immintrin.h>
+
+size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+    size_t len_tmp = len;
+    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+    __m512i z0, z1, z2, z3;
+    z_const __m512i zmm_fold4 = _mm512_set4_epi32(
+        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+    z_const __m512i zmm_fold16 = _mm512_set4_epi32(
+        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+    // Load the first 256 input bytes; zmm_crc1..3 hold raw source data here.
+    zmm_crc0 = _mm512_setzero_si512();
+    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+    /* The caller's intermediate CRC arrives in four xmm registers; gather
+     * them into zmm_crc0, then fold that state into the first data block.
+     */
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
+    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+    len -= 256;
+    src += 256;
+    dst += 256;
+
+    // Main loop: fold 16 xmm words (256 bytes) per iteration while copying src to dst.
+    while (len >= 256) {
+        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+        len -= 256;
+        src += 256;
+        dst += 256;
+    }
+    // Reduce zmm_crc[0,1,2,3] down to zmm_crc0 via three fold-4 steps.
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+    // Scatter zmm_crc0 back into the caller's four xmm CRC registers.
+    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+    return (len_tmp - len);  // number of input bytes folded and copied
+}
+#endif
index 065a717032d8dd3a1f9579314956a598b3ceb6a6..32baf8a749494793ab8d8a2fd7e2f8f6c1bc27d0 100644 (file)
@@ -27,6 +27,7 @@ Z_INTERNAL int x86_cpu_has_ssse3;
 Z_INTERNAL int x86_cpu_has_sse41;
 Z_INTERNAL int x86_cpu_has_sse42;
 Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_vpclmulqdq;
 Z_INTERNAL int x86_cpu_has_tzcnt;
 Z_INTERNAL int x86_cpu_well_suited_avx512;
 
@@ -98,9 +99,11 @@ void Z_INTERNAL x86_check_features(void) {
         x86_cpu_has_avx2 = ebx & 0x20;
         x86_cpu_has_avx512 = ebx & 0x00010000;
         x86_cpu_has_avx512vnni = ecx & 0x800;
+        x86_cpu_has_vpclmulqdq = ecx & 0x400;
     } else {
         x86_cpu_has_tzcnt = 0;
         x86_cpu_has_avx2 = 0;
+        x86_cpu_has_vpclmulqdq = 0;
     }
 
 
index 80da6f32aac3a85a6702a4ee338713be539b8b64..00f8d9efc7878cccecf6612e4f9447e6fd96497f 100644 (file)
@@ -14,6 +14,7 @@ extern int x86_cpu_has_ssse3;
 extern int x86_cpu_has_sse41;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
+extern int x86_cpu_has_vpclmulqdq;
 extern int x86_cpu_has_tzcnt;
 extern int x86_cpu_well_suited_avx512;
 
index 2facf505139d11cd61102020d99cc252ce5df9a4..47d93d0c15c4b3ec071a006db40b3e528d28041f 100644 (file)
@@ -168,6 +168,32 @@ macro(check_pclmulqdq_intrinsics)
     endif()
 endmacro()
 
+macro(check_vpclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(VPCLMULFLAG "-mvpclmulqdq")
+        endif()
+    endif()
+    # Check whether the compiler supports the VPCLMULQDQ intrinsic used by the CRC folding code
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386")) # probe skipped on 32-bit Apple builds
+        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG}")
+        check_c_source_compile_or_run(
+            "#include <immintrin.h>
+            int main(void) {
+                __m512i a = _mm512_setzero_si512();
+                __m512i b = _mm512_setzero_si512();
+                __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+                (void)c;
+                return 0;
+            }"
+            HAVE_VPCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_VPCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
 macro(check_ppc_intrinsics)
     # Check if compiler supports AltiVec
     set(CMAKE_REQUIRED_FLAGS "-maltivec")
index 2c17507a861e648d2089e482b5869f5414120f1e..143d6e4e67138cf10eb9d7e8d13830b88aac3622 100755 (executable)
--- a/configure
+++ b/configure
@@ -90,6 +90,7 @@ compat=0
 cover=0
 build32=0
 build64=0
+buildvpclmulqdq=1
 buildacle=1
 buildaltivec=1
 buildpower8=1
@@ -112,6 +113,7 @@ ssse3flag="-mssse3"
 sse41flag="-msse4.1"
 sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
+vpclmulflag="-mvpclmulqdq"
 acleflag=
 neonflag=
 noltoflag="-fno-lto"
@@ -194,6 +196,7 @@ case "$1" in
     --cover) cover=1; shift ;;
     -3* | --32) build32=1; shift ;;
     -6* | --64) build64=1; shift ;;
+    --without-vpclmulqdq) buildvpclmulqdq=0; shift ;;
     --without-acle) buildacle=0; shift ;;
     --without-neon) buildneon=0; shift ;;
     --without-altivec) buildaltivec=0 ; shift ;;
@@ -262,6 +265,7 @@ if test $native -eq 1; then
   sse4flag=""
   sse42flag=""
   pclmulflag=""
+  vpclmulflag=""
   noltoflag=""
 fi
 
@@ -1161,6 +1165,28 @@ EOF
     fi
 }
 
+check_vpclmulqdq_intrinsics() {
+    # Check whether the compiler accepts ${vpclmulflag} and the _mm512_clmulepi64_epi128 intrinsic
+    cat > $test.c << EOF
+#include <immintrin.h>
+#include <wmmintrin.h>
+int main(void) {
+    __m512i a = _mm512_setzero_si512();
+    __m512i b = _mm512_setzero_si512();
+    __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+    (void)c;
+    return 0;
+}
+EOF
+    if try ${CC} ${CFLAGS} ${vpclmulflag} $test.c; then
+        echo "Checking for VPCLMULQDQ intrinsics ... Yes." | tee -a configure.log
+        HAVE_VPCLMULQDQ_INTRIN=1
+    else
+        echo "Checking for VPCLMULQDQ intrinsics ... No." | tee -a configure.log
+        HAVE_VPCLMULQDQ_INTRIN=0
+    fi
+}
+
 check_ppc_intrinsics() {
         cat > $test.c << EOF
 #include <altivec.h>
@@ -1503,6 +1529,17 @@ case "${ARCH}" in
                 SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
                 ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
+
+                if test $buildvpclmulqdq -eq 1; then
+                    check_vpclmulqdq_intrinsics
+
+                    if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
+                        CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
+                        SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo"
+                    fi
+                fi
             fi
         fi
     ;;
@@ -1912,6 +1949,7 @@ echo ssse3flag = $ssse3flag >> configure.log
 echo sse41flag = $sse41flag >> configure.log
 echo sse42flag = $sse42flag >> configure.log
 echo pclmulflag = $pclmulflag >> configure.log
+echo vpclmulflag = $vpclmulflag >> configure.log
 echo acleflag = $acleflag >> configure.log
 echo neonflag = $neonflag >> configure.log
 echo ARCHDIR = ${ARCHDIR} >> configure.log
@@ -2049,6 +2087,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^SSE41FLAG *=/s#=.*#=$sse41flag#
 /^SSE42FLAG *=/s#=.*#=$sse42flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
+/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
 /^ACLEFLAG *=/s#=.*#=$acleflag#
 /^NEONFLAG *=/s#=.*#=$neonflag#
 /^NOLTOFLAG *=/s#=.*#=$noltoflag#