From: Ilya Leoshkevich Date: Tue, 6 Apr 2021 11:51:16 +0000 (+0200) Subject: IBM Z: Add vectorized CRC32 implementation X-Git-Tag: 2.1.0-beta1~534 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0573840dd07fdd85f249af267fcfc3a6dafaed35;p=thirdparty%2Fzlib-ng.git IBM Z: Add vectorized CRC32 implementation While DFLTCC takes care of accelerating compression on level 1, other levels can be sped up too by computing CRC32 using various vector instructions. Take the Linux kernel assembly code that does that - its original author (Hendrik Brueckner) works for IBM at the time of writing and has allowed reusing the code under the zlib license. Rewrite it in C for better maintainability, but keep the original structure, variable names and comments. Update the documentation. Add CI configurations. --- diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index cfbf002a2..a172451bb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -209,6 +209,16 @@ jobs: ldflags: -static codecov: ubuntu_gcc_s390x + - name: Ubuntu GCC S390X No vectorized CRC32 + os: ubuntu-latest + compiler: s390x-linux-gnu-gcc + cmake-args: -DCMAKE_TOOLCHAIN_FILE=cmake/toolchain-s390x.cmake -DWITH_CRC32_VX=OFF -DWITH_SANITIZER=Address + asan-options: detect_leaks=0 + packages: qemu qemu-user gcc-s390x-linux-gnu libc-dev-s390x-cross + qemu-run: qemu-s390x + ldflags: -static + codecov: ubuntu_gcc_s390x + - name: Ubuntu GCC S390X DFLTCC os: ubuntu-latest compiler: s390x-linux-gnu-gcc diff --git a/.github/workflows/configure.yml b/.github/workflows/configure.yml index c3bc86789..9d9dc3598 100644 --- a/.github/workflows/configure.yml +++ b/.github/workflows/configure.yml @@ -148,6 +148,16 @@ jobs: cflags: -static ldflags: -static + - name: Ubuntu GCC S390X No vectorized CRC32 + os: ubuntu-latest + compiler: s390x-linux-gnu-gcc + configure-args: --warn --static --without-crc32-vx + chost: s390x-linux-gnu + packages: qemu qemu-user gcc-s390x-linux-gnu libc-dev-s390x-cross + qemu-run: qemu-s390x + cflags: -static + ldflags: -static + - name: Ubuntu GCC S390X DFLTCC os: ubuntu-latest compiler: s390x-linux-gnu-gcc diff --git a/CMakeLists.txt b/CMakeLists.txt index a194d13b5..1530e4f91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,7 @@ elseif(BASEARCH_PPC_FOUND) elseif(BASEARCH_S360_FOUND) option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF) option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF) + option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON) elseif(BASEARCH_X86_FOUND) option(WITH_AVX2 "Build with AVX2" ON) option(WITH_SSE2 "Build with SSE2" ON) @@ -114,6 +115,7 @@ mark_as_advanced(FORCE WITH_ACLE WITH_NEON WITH_DFLTCC_DEFLATE WITH_DFLTCC_INFLATE + WITH_CRC32_VX WITH_AVX2 WITH_SSE2 WITH_SSSE3 WITH_SSE4 WITH_PCLMULQDQ @@ -618,6 +620,10 @@ if(WITH_OPTIM) endif() endif() elseif(BASEARCH_S360_FOUND) + if(WITH_CRC32_VX) + add_definitions(-DS390_FEATURES) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/s390.c) + endif() if(WITH_DFLTCC_DEFLATE OR WITH_DFLTCC_INFLATE) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_common.c) endif() @@ -629,6 +635,17 @@ if(WITH_OPTIM) add_definitions(-DS390_DFLTCC_INFLATE) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/dfltcc_inflate.c) endif() + if(WITH_CRC32_VX) + check_vgfma_intrinsics() + if(HAVE_VGFMA_INTRIN) + add_definitions(-DS390_CRC32_VX) + set(CRC32_VX_SRCS ${ARCHDIR}/crc32-vx.c) + list(APPEND ZLIB_ARCH_SRCS ${CRC32_VX_SRCS}) + set_property(SOURCE 
${CRC32_VX_SRCS} PROPERTY COMPILE_FLAGS "${VGFMAFLAG} ${NOLTOFLAG}") + else() + set(WITH_CRC32_VX OFF) + endif() + endif() elseif(BASEARCH_X86_FOUND) add_definitions(-DX86_FEATURES) list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/x86.h) @@ -1295,6 +1312,7 @@ elseif(BASEARCH_PPC_FOUND) elseif(BASEARCH_S360_FOUND) add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z") add_feature_info(WITH_DFLTCC_INFLATE WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z") + add_feature_info(WITH_CRC32_VX WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z") elseif(BASEARCH_X86_FOUND) add_feature_info(WITH_AVX2 WITH_AVX2 "Build with AVX2") add_feature_info(WITH_SSE2 WITH_SSE2 "Build with SSE2") diff --git a/README.md b/README.md index 6efbda5dc..161a1b633 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Features * Slide hash implementations using SSE2, AVX2, Neon & VSX * Compare256/258 implementations using SSE4.2 & AVX2 * Inflate chunk copying using SSE2, AVX2, Neon & VSX + * CRC32 implementation using IBM Z vector instructions * Support for hardware-accelerated deflate using IBM Z DFLTCC * Unaligned memory read/writes and large bit buffer improvements * Includes improvements from Cloudflare and Intel forks @@ -202,6 +203,7 @@ Advanced Build Options | WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON | | WITH_NEON | --without-neon | Build with NEON intrinsics | ON | | WITH_POWER8 | | Build with POWER8 optimisations | ON | +| WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON | | WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF | | WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF | | WITH_UNALIGNED | | Allow optimizations that use unaligned reads if safe on current arch| ON | diff --git a/arch/s390/Makefile.in b/arch/s390/Makefile.in index 2652fe62d..9780f24cf 100644 --- a/arch/s390/Makefile.in +++ b/arch/s390/Makefile.in @@ -7,11 +7,19 @@ CFLAGS= SFLAGS= INCLUDES= SUFFIX= +VGFMAFLAG= +NOLTOFLAG= SRCDIR=. SRCTOP=../.. TOPDIR=$(SRCTOP) +s390.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390.c + +s390.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/s390.c + dfltcc_common.o: $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_common.c @@ -30,6 +38,12 @@ dfltcc_inflate.o: dfltcc_inflate.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/dfltcc_inflate.c +crc32-vx.o: + $(CC) $(CFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + +crc32-vx.lo: + $(CC) $(SFLAGS) $(VGFMAFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32-vx.c + mostlyclean: clean clean: rm -f *.o *.lo *~ diff --git a/arch/s390/crc32-vx.c b/arch/s390/crc32-vx.c new file mode 100644 index 000000000..bb23289cd --- /dev/null +++ b/arch/s390/crc32-vx.c @@ -0,0 +1,222 @@ +/* + * Hardware-accelerated CRC-32 variants for Linux on z Systems + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32 checksums. + * + * This CRC-32 implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * This code was originally written by Hendrik Brueckner + * for use in the Linux kernel and has been + * relicensed under the zlib license. 
+ */ + +#include "../../zutil.h" +#include "../../crc32_p.h" + +#include + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +static uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) { + /* + * The CRC-32 constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32 variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. + * + * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: + * + * P(x) = 0x04C11DB7 + * P'(x) = 0xEDB88320 + */ + const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ + const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */ + const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */ + const uv2di r5 = {0, 0x163CD6124}; /* R5 */ + const uv2di ru_poly = {0, 0x1F7011641}; /* u' */ + const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */ + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the + * vector register and is later XORed with the LSB portion + * of the loaded input data. + */ + uv2di v0 = {0, 0}; + v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3); + + /* Load a 64-byte data chunk and XOR with CRC */ + uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be); + uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be); + uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be); + uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be); + + v1 ^= v0; + buf += 64; + len -= 64; + + while (len >= 64) { + /* Load the next 64-byte data chunk */ + uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be); + uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be); + uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be); + uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate result + * is then folded (accumulated) with the next data chunk in PART1 and + * stored in V1. Repeat this step for the register contents + * in V2, V3, and V4 respectively. + */ + v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1); + v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2); + v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3); + v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4); + + buf += 64; + len -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 + * and R4 and accumulating the next 128-bit chunk until a single 128-bit + * value remains. 
+ */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3); + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4); + + while (len >= 16) { + /* Load next data chunk */ + v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be); + + /* Fold next data chunk */ + v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2); + + buf += 16; + len -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must + * be loaded in bits 1-4 in byte element 7 of a vector register. + * Shift by 8 bytes: 0x40 + * Shift by 4 bytes: 0x20 + */ + uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + v9 = vec_insert((unsigned char)0x40, v9, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes + * to move R4 into the rightmost doubleword and set the leftmost + * doubleword to 0x1. + */ + v0 = vec_srb(r4r3, (uv2di)v9); + v0[0] = 1; + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword + * of V1 is multiplied with R4. The leftmost doubleword of V1 is + * multiplied by 0x1 and is then XORed with rightmost product. + * Implicitly, the intermediate leftmost product becomes padded + */ + v1 = (uv2di)vec_gfmsum_128(v0, v1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word + * in V1 with R5 and XOR the result with the remaining bits in V1. + * + * To achieve this by a single VGFMAG, right shift V1 by a word + * and store the result in V2 which is then accumulated. Use the + * vector unpack instruction to load the rightmost half of the + * doubleword into the rightmost doubleword element of V1; the other + * half is loaded in the leftmost doubleword. + * The vector register with CONST_R5 contains the R5 constant in the + * rightmost doubleword and the leftmost doubleword is zero to ignore + * the leftmost product of V1. + */ + v9 = vec_insert((unsigned char)0x20, v9, 7); + v2 = vec_srb(v1, (uv2di)v9); + v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */ + v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2); + + /* + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + v2 = vec_unpackl((uv4si)v1); + v2 = (uv2di)vec_gfmsum_128(ru_poly, v2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in + * V2 and XOR the intermediate result, T2(x), with the value in V1. + * The final result is stored in word element 2 of V2. 
+ */ + v2 = vec_unpackl((uv4si)v2); + v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1); + + return ((uv4si)v2)[2]; +} + +#define VX_MIN_LEN 64 +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +uint32_t Z_INTERNAL s390_crc32_vx(uint32_t crc, const unsigned char *buf, uint64_t len) { + uint64_t prealign, aligned, remaining; + + if (len < VX_MIN_LEN + VX_ALIGN_MASK) + return crc32_big(crc, buf, len); + + if ((uintptr_t)buf & VX_ALIGN_MASK) { + prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK); + len -= prealign; + crc = crc32_big(crc, buf, prealign); + buf += prealign; + } + aligned = len & ~VX_ALIGN_MASK; + remaining = len & VX_ALIGN_MASK; + + crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff; + + if (remaining) + crc = crc32_big(crc, buf + aligned, remaining); + + return crc; +} diff --git a/arch/s390/s390.c b/arch/s390/s390.c new file mode 100644 index 000000000..6dd5252ce --- /dev/null +++ b/arch/s390/s390.c @@ -0,0 +1,10 @@ +#include "../../zutil.h" +#include "s390.h" + +#include + +Z_INTERNAL int s390_cpu_has_vx; + +void Z_INTERNAL s390_check_features(void) { + s390_cpu_has_vx = getauxval(AT_HWCAP) & HWCAP_S390_VX; +} diff --git a/arch/s390/s390.h b/arch/s390/s390.h new file mode 100644 index 000000000..dec035b3d --- /dev/null +++ b/arch/s390/s390.h @@ -0,0 +1,10 @@ +#ifndef S390_H_ +#define S390_H_ + +#include "../../zutil.h" + +extern int s390_cpu_has_vx; + +void Z_INTERNAL s390_check_features(void); + +#endif diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake index a360b7e66..90c5f3bee 100644 --- a/cmake/detect-intrinsics.cmake +++ b/cmake/detect-intrinsics.cmake @@ -219,3 +219,28 @@ macro(check_sse4_intrinsics) ) set(CMAKE_REQUIRED_FLAGS) endmacro() + +macro(check_vgfma_intrinsics) + if(NOT NATIVEFLAG) + set(VGFMAFLAG "-march=z13") + if(CMAKE_C_COMPILER_ID MATCHES "GNU") + set(VGFMAFLAG "${VGFMAFLAG} -mzarch") + endif() + if(CMAKE_C_COMPILER_ID MATCHES "Clang") + set(VGFMAFLAG "${VGFMAFLAG} -fzvector") + endif() + endif() + # Check whether compiler supports "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic + set(CMAKE_REQUIRED_FLAGS "${VGFMAFLAG}") + check_c_source_compiles( + "#include + int main(void) { + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + c = vec_gfmsum_accum_128(a, b, c); + return c[0]; + }" + HAVE_VGFMA_INTRIN FAIL_REGEX "not supported") + set(CMAKE_REQUIRED_FLAGS) +endmacro() diff --git a/configure b/configure index e4738dca1..0dc1dd6a7 100755 --- a/configure +++ b/configure @@ -94,6 +94,7 @@ buildacle=1 buildneon=1 builddfltccdeflate=0 builddfltccinflate=0 +buildcrc32vx=1 with_sanitizer="" with_fuzzers=0 floatabi= @@ -108,6 +109,7 @@ pclmulflag="-mpclmul" acleflag= neonflag= noltoflag="-fno-lto" +vgfmaflag="-march=z13" without_optimizations=0 without_new_strategies=0 reducedmem=0 @@ -155,6 +157,7 @@ case "$1" in echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log echo ' [--with-dfltcc-inflate] Use DEFLATE CONVERSION CALL instruction for decompression on IBM Z' | tee -a configure.log + echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log echo ' [--with-reduced-mem] Reduced memory usage for special cases (reduces performance)' | 
tee -a configure.log echo ' [--force-sse2] Assume SSE2 instructions are always available (disabled by default on x86, enabled on x86_64)' | tee -a configure.log echo ' [--with-sanitizer] Build with sanitizer (memory, address, undefined)' | tee -a configure.log @@ -182,6 +185,7 @@ case "$1" in --without-neon) buildneon=0; shift ;; --with-dfltcc-deflate) builddfltccdeflate=1; shift ;; --with-dfltcc-inflate) builddfltccinflate=1; shift ;; + --without-crc32-vx) buildcrc32vx=0; shift ;; --with-reduced-mem) reducedmem=1; shift ;; --force-sse2) forcesse2=1; shift ;; -n | --native) native=1; shift ;; @@ -1152,6 +1156,42 @@ EOF fi } +check_vgfma_intrinsics() { + # Check whether "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic is available + echo -n "Checking for -mzarch... " | tee -a configure.log + if try $CC -x c -c /dev/null -o /dev/null -mzarch; then + echo Yes. | tee -a configure.log + vgfmaflag="${vgfmaflag} -mzarch" + else + echo No. | tee -a configure.log + fi + echo -n "Checking for -fzvector... " | tee -a configure.log + if try $CC -x c -c /dev/null -o /dev/null -fzvector; then + echo Yes. | tee -a configure.log + vgfmaflag="${vgfmaflag} -fzvector" + else + echo No. | tee -a configure.log + fi + cat > $test.c << EOF +#include +int main(void) { + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + c = vec_gfmsum_accum_128(a, b, c); + return c[0]; +} +EOF + echo -n "Checking for VGFMA support... " | tee -a configure.log + if try $CC -c $CFLAGS $vgfmaflag $test.c; then + HAVE_VGFMA_INTRIN=1 + echo "Yes." | tee -a configure.log + else + HAVE_VGFMA_INTRIN=0 + echo "No." | tee -a configure.log + fi +} + case "${ARCH}" in i386 | i486 | i586 | i686 | x86_64) # Enable deflate_medium at level 1 @@ -1538,6 +1578,13 @@ EOF ARCHDIR=arch/s390 if test $without_optimizations -eq 0; then + if test $buildcrc32vx -eq 1; then + CFLAGS="${CFLAGS} -DS390_FEATURES" + SFLAGS="${SFLAGS} -DS390_FEATURES" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} s390.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} s390.lo" + fi + if test $builddfltccdeflate -eq 1 -o $builddfltccinflate -eq 1; then ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} dfltcc_common.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} dfltcc_common.lo" @@ -1558,6 +1605,17 @@ EOF ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} dfltcc_inflate.lo" ARCH="${ARCH}+dfltcc-inflate" fi + + if test $buildcrc32vx -eq 1; then + check_vgfma_intrinsics + if test $HAVE_VGFMA_INTRIN -eq 1; then + CFLAGS="${CFLAGS} -DS390_CRC32_VX" + SFLAGS="${SFLAGS} -DS390_CRC32_VX" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32-vx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32-vx.lo" + ARCH="${ARCH}+crc32-vx" + fi + fi fi ;; *) @@ -1751,6 +1809,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^ACLEFLAG *=/s#=.*#=$acleflag# /^NEONFLAG *=/s#=.*#=$neonflag# /^NOLTOFLAG *=/s#=.*#=$noltoflag# +/^VGFMAFLAG *=/s#=.*#=$vgfmaflag# " > $ARCHDIR/Makefile # Append header files dependences. 
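Whichever of the build paths above ends up being selected, the result has to be the same bit-reflected CRC-32 over P'(x) = 0xEDB88320 that zlib has always produced; crc32_le_vgfm_16 only changes how that checksum is computed, not its value. The following bit-at-a-time reference is not part of the patch (the function name is invented for illustration), but it is a convenient way to spot-check the vectorized path against known values:

    #include <inttypes.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Bit-by-bit reference for the reflected CRC-32, P'(x) = 0xEDB88320.
       Hypothetical helper, not part of this patch. */
    static uint32_t crc32_reflected_ref(uint32_t crc, const unsigned char *buf, size_t len) {
        crc = ~crc;                          /* pre-invert, as zlib's crc32() does */
        while (len--) {
            crc ^= *buf++;                   /* LSB-first: fold in the next input byte */
            for (int k = 0; k < 8; k++)
                crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
        }
        return ~crc;                         /* post-invert */
    }

    int main(void) {
        const unsigned char msg[] = "123456789";
        printf("crc32(\"123456789\") = 0x%08" PRIX32 "\n",
               crc32_reflected_ref(0, msg, strlen((const char *)msg)));
        return 0;
    }

Running this prints 0xCBF43926, the standard CRC-32 check value for "123456789"; s390_crc32_vx must return the same result for the same input.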
diff --git a/crc32_p.h b/crc32_p.h index 47b4b3751..fbaf8db88 100644 --- a/crc32_p.h +++ b/crc32_p.h @@ -1,6 +1,9 @@ #ifndef CRC32_P_H_ #define CRC32_P_H_ +#include "zutil.h" +#include "zendian.h" + #define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */ @@ -16,4 +19,10 @@ static inline uint32_t gf2_matrix_times(const uint32_t *mat, uint32_t vec) { } +#if BYTE_ORDER == LITTLE_ENDIAN +extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t); +#elif BYTE_ORDER == BIG_ENDIAN +extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); +#endif + #endif /* CRC32_P_H_ */ diff --git a/functable.c b/functable.c index 12feedfb3..af3aaa2d8 100644 --- a/functable.c +++ b/functable.c @@ -5,6 +5,7 @@ #include "zbuild.h" #include "zendian.h" +#include "crc32_p.h" #include "deflate.h" #include "deflate_p.h" @@ -111,11 +112,8 @@ Z_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t); #ifdef ARM_ACLE_CRC_HASH extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t); #endif - -#if BYTE_ORDER == LITTLE_ENDIAN -extern uint32_t crc32_little(uint32_t, const unsigned char *, uint64_t); -#elif BYTE_ORDER == BIG_ENDIAN -extern uint32_t crc32_big(uint32_t, const unsigned char *, uint64_t); +#ifdef S390_CRC32_VX +extern uint32_t s390_crc32_vx(uint32_t, const unsigned char *, uint64_t); #endif /* compare258 */ @@ -179,6 +177,8 @@ Z_INTERNAL void cpu_check_features(void) arm_check_features(); #elif defined(POWER_FEATURES) power_check_features(); +#elif defined(S390_FEATURES) + s390_check_features(); #endif features_checked = 1; } @@ -463,6 +463,10 @@ Z_INTERNAL uint32_t crc32_stub(uint32_t crc, const unsigned char *buf, uint64_t # endif #elif BYTE_ORDER == BIG_ENDIAN functable.crc32 = crc32_big; +# if defined(S390_CRC32_VX) + if (s390_cpu_has_vx) + functable.crc32 = s390_crc32_vx; +# endif #else # error No endian defined #endif diff --git a/zutil.h b/zutil.h index f8d8d07b9..4e8950955 100644 --- a/zutil.h +++ b/zutil.h @@ -252,6 +252,8 @@ void Z_INTERNAL zng_cfree(void *opaque, void *ptr); # include "arch/arm/arm.h" #elif defined(POWER_FEATURES) # include "arch/power/power.h" +#elif defined(S390_FEATURES) +# include "arch/s390/s390.h" #endif #endif /* ZUTIL_H_ */
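At run time the vectorized routine is only used when the kernel reports the vector facility: s390_check_features() latches getauxval(AT_HWCAP) & HWCAP_S390_VX into s390_cpu_has_vx, and crc32_stub() only replaces crc32_big with s390_crc32_vx when that flag is set. A standalone probe of the same hwcap bit could look like the sketch below (illustrative only, not part of the patch; the fallback #define is an assumption based on the Linux s390 ABI's bit 11 assignment for the vector facility):

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_S390_VX
    #define HWCAP_S390_VX (1UL << 11)   /* assumption: vector-facility hwcap bit, per the Linux s390 ABI */
    #endif

    int main(void) {
        unsigned long hwcap = getauxval(AT_HWCAP);

        /* Same condition s390_check_features() uses to set s390_cpu_has_vx. */
        if (hwcap & HWCAP_S390_VX)
            puts("vector facility reported: s390_crc32_vx is eligible");
        else
            puts("no vector facility: the generic crc32_big path is used");
        return 0;
    }

Keeping this check in the functable stub rather than deciding at build time means a single binary can run on pre-z13 machines, falling back to crc32_big there, while still picking up the VX path on z13 and later; only the crc32-vx.c translation unit is compiled with -march=z13, so the rest of the library remains usable on older hardware.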