--- /dev/null
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums.
+ *
+ * This CRC-32 implementation is bitreflected and processes the
+ * least-significant bit first (Little-Endian).
+ *
+ * This code was originally written by Hendrik Brueckner
+ * <brueckner@linux.vnet.ibm.com> for use in the Linux kernel and has been
+ * relicensed under the zlib license.
+ */
+
+#include "../../zutil.h"
+#include "../../crc32_p.h"
+
+#include <vecintrin.h>
+
+typedef unsigned char uv16qi __attribute__((vector_size(16)));
+typedef unsigned int uv4si __attribute__((vector_size(16)));
+typedef unsigned long long uv2di __attribute__((vector_size(16)));
+
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) {
+ /*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
+ * R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
+ * R3 = [(x^(128+32) mod P'(x) << 32)]' << 1
+ * R4 = [(x^(128-32) mod P'(x) << 32)]' << 1
+ * R5 = [(x^64 mod P'(x) << 32)]' << 1
+ * R6 = [(x^32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barrett reduction constant, u', is defined as
+ * the bit reversal of floor(x^64 / P(x)),
+ *
+ * where P(x) is the polynomial in the normal domain and P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+ const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */
+ const uv2di r2r1 = {0x1C6E41596, 0x154442BD4}; /* R2, R1 */
+ const uv2di r4r3 = {0x0CCAA009E, 0x1751997D0}; /* R4, R3 */
+ const uv2di r5 = {0, 0x163CD6124}; /* R5 */
+ const uv2di ru_poly = {0, 0x1F7011641}; /* u' */
+ const uv2di crc_poly = {0, 0x1DB710641}; /* P'(x) << 1 */
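+
+ /*
+ * Note on layout: r2r1 and r4r3 pack two constants per vector register
+ * (doubleword element 0 holds R2/R4, element 1 holds R1/R3), matching
+ * the operand layout of the VGFMAG folds below. r5, ru_poly, and
+ * crc_poly keep their constant in the rightmost doubleword only, so
+ * the leftmost GF(2) product is zero in those steps.
+ */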
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ uv2di v0 = {0, 0};
+ v0 = (uv2di)vec_insert(crc, (uv4si)v0, 3);
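+ /* V0 now holds the word elements { 0, 0, 0, crc }. */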
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ uv2di v1 = vec_perm(((uv2di *)buf)[0], ((uv2di *)buf)[0], perm_le2be);
+ uv2di v2 = vec_perm(((uv2di *)buf)[1], ((uv2di *)buf)[1], perm_le2be);
+ uv2di v3 = vec_perm(((uv2di *)buf)[2], ((uv2di *)buf)[2], perm_le2be);
+ uv2di v4 = vec_perm(((uv2di *)buf)[3], ((uv2di *)buf)[3], perm_le2be);
+
+ v1 ^= v0;
+ buf += 64;
+ len -= 64;
+
+ while (len >= 64) {
+ /* Load the next 64-byte data chunk */
+ uv16qi part1 = vec_perm(((uv16qi *)buf)[0], ((uv16qi *)buf)[0], perm_le2be);
+ uv16qi part2 = vec_perm(((uv16qi *)buf)[1], ((uv16qi *)buf)[1], perm_le2be);
+ uv16qi part3 = vec_perm(((uv16qi *)buf)[2], ((uv16qi *)buf)[2], perm_le2be);
+ uv16qi part4 = vec_perm(((uv16qi *)buf)[3], ((uv16qi *)buf)[3], perm_le2be);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate result
+ * is then folded (accumulated) with the next data chunk in PART1 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r2r1, v1, part1);
+ v2 = (uv2di)vec_gfmsum_accum_128(r2r1, v2, part2);
+ v3 = (uv2di)vec_gfmsum_accum_128(r2r1, v3, part3);
+ v4 = (uv2di)vec_gfmsum_accum_128(r2r1, v4, part4);
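+ /*
+ * Each VGFMAG above computes, roughly,
+ * vN = (R2 * vN[0]) ^ (R1 * vN[1]) ^ partN (GF(2) multiplications),
+ * i.e. 128 previously accumulated bits are folded over the next
+ * 128-bit chunk in a single instruction.
+ */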
+
+ buf += 64;
+ len -= 64;
+ }
+
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1: multiply V1 with R3
+ * and R4 and accumulate the next 128-bit chunk, repeating until a single
+ * 128-bit value remains.
+ */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v3);
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v4);
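+ /* As above, but with R4/R3: v1 = (R4 * v1[0]) ^ (R3 * v1[1]) ^ vN in GF(2). */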
+
+ while (len >= 16) {
+ /* Load next data chunk */
+ v2 = vec_perm(*(uv2di *)buf, *(uv2di *)buf, perm_le2be);
+
+ /* Fold next data chunk */
+ v1 = (uv2di)vec_gfmsum_accum_128(r4r3, v1, (uv16qi)v2);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ v9 = vec_insert((unsigned char)0x40, v9, 7);
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift R4R3 right by
+ * 8 bytes to move R4 into the rightmost doubleword of V0 and set the
+ * leftmost doubleword to 0x1.
+ */
+ v0 = vec_srb(r4r3, (uv2di)v9);
+ v0[0] = 1;
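+ /* V0 now holds the doubleword elements { 0x1, R4 }. */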
+
+ /*
+ * Compute the GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with the rightmost product.
+ * Implicitly, the intermediate leftmost product becomes padded with
+ * zeroes.
+ */
+ v1 = (uv2di)vec_gfmsum_128(v0, v1);
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XORing the result with the remaining bits in V1.
+ *
+ * To achieve this with a single VGFMAG, right shift V1 by a word
+ * and store the result in V2, which is then accumulated. Use the
+ * vector unpack instruction to split the rightmost doubleword of V1:
+ * its low word is loaded into the rightmost doubleword element of V1
+ * and its high word into the leftmost doubleword element.
+ * The vector register r5 contains the R5 constant in the rightmost
+ * doubleword; the leftmost doubleword is zero so that the leftmost
+ * product of V1 is ignored.
+ */
+ v9 = vec_insert((unsigned char)0x20, v9, 7);
+ v2 = vec_srb(v1, (uv2di)v9);
+ v1 = vec_unpackl((uv4si)v1); /* Split rightmost doubleword */
+ v1 = (uv2di)vec_gfmsum_accum_128(r5, v1, (uv16qi)v2);
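+ /* Conceptually: new V1 = (R5 * rightmost word of old V1) ^ (old V1 >> 32), in GF(2). */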
+
+ /*
+ * Apply a Barrett reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barrett reduction are the degree-63 polynomial
+ * in V1 (R(x)), the degree-32 generator polynomial, and the reduction
+ * constant u. The Barrett reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barrett reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of the vector register containing
+ * ru_poly is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ v2 = vec_unpackl((uv4si)v1);
+ v2 = (uv2di)vec_gfmsum_128(ru_poly, v2);
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ v2 = vec_unpackl((uv4si)v2);
+ v2 = (uv2di)vec_gfmsum_accum_128(crc_poly, v2, (uv16qi)v1);
+
+ return ((uv4si)v2)[2];
+}
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+uint32_t Z_INTERNAL s390_crc32_vx(uint32_t crc, const unsigned char *buf, uint64_t len) {
+ uint64_t prealign, aligned, remaining;
+
+ if (len < VX_MIN_LEN + VX_ALIGN_MASK)
+ return crc32_big(crc, buf, len);
+
+ if ((uintptr_t)buf & VX_ALIGN_MASK) {
+ prealign = VX_ALIGNMENT - ((uintptr_t)buf & VX_ALIGN_MASK);
+ len -= prealign;
+ crc = crc32_big(crc, buf, prealign);
+ buf += prealign;
+ }
+ aligned = len & ~VX_ALIGN_MASK;
+ remaining = len & VX_ALIGN_MASK;
+
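+ /*
+ * crc32_le_vgfm_16() does not apply the standard CRC-32 pre- and
+ * post-inversion itself, so it is done explicitly around the call.
+ */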
+ crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
+
+ if (remaining)
+ crc = crc32_big(crc, buf + aligned, remaining);
+
+ return crc;
+}
buildneon=1
builddfltccdeflate=0
builddfltccinflate=0
+buildcrc32vx=1
with_sanitizer=""
with_fuzzers=0
floatabi=
acleflag=
neonflag=
noltoflag="-fno-lto"
+vgfmaflag="-march=z13"
without_optimizations=0
without_new_strategies=0
reducedmem=0
echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log
echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log
echo ' [--with-dfltcc-inflate] Use DEFLATE CONVERSION CALL instruction for decompression on IBM Z' | tee -a configure.log
+ echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log
echo ' [--with-reduced-mem] Reduced memory usage for special cases (reduces performance)' | tee -a configure.log
echo ' [--force-sse2] Assume SSE2 instructions are always available (disabled by default on x86, enabled on x86_64)' | tee -a configure.log
echo ' [--with-sanitizer] Build with sanitizer (memory, address, undefined)' | tee -a configure.log
--without-neon) buildneon=0; shift ;;
--with-dfltcc-deflate) builddfltccdeflate=1; shift ;;
--with-dfltcc-inflate) builddfltccinflate=1; shift ;;
+ --without-crc32-vx) buildcrc32vx=0; shift ;;
--with-reduced-mem) reducedmem=1; shift ;;
--force-sse2) forcesse2=1; shift ;;
-n | --native) native=1; shift ;;
fi
}
+check_vgfma_intrinsics() {
+ # Check whether "VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE" intrinsic is available
+ echo -n "Checking for -mzarch... " | tee -a configure.log
+ if try $CC -x c -c /dev/null -o /dev/null -mzarch; then
+ echo Yes. | tee -a configure.log
+ vgfmaflag="${vgfmaflag} -mzarch"
+ else
+ echo No. | tee -a configure.log
+ fi
+ echo -n "Checking for -fzvector... " | tee -a configure.log
+ if try $CC -x c -c /dev/null -o /dev/null -fzvector; then
+ echo Yes. | tee -a configure.log
+ vgfmaflag="${vgfmaflag} -fzvector"
+ else
+ echo No. | tee -a configure.log
+ fi
+ cat > $test.c << EOF
+#include <vecintrin.h>
+int main(void) {
+ unsigned long long a __attribute__((vector_size(16))) = { 0 };
+ unsigned long long b __attribute__((vector_size(16))) = { 0 };
+ unsigned char c __attribute__((vector_size(16))) = { 0 };
+ c = vec_gfmsum_accum_128(a, b, c);
+ return c[0];
+}
+EOF
+ echo -n "Checking for VGFMA support... " | tee -a configure.log
+ if try $CC -c $CFLAGS $vgfmaflag $test.c; then
+ HAVE_VGFMA_INTRIN=1
+ echo "Yes." | tee -a configure.log
+ else
+ HAVE_VGFMA_INTRIN=0
+ echo "No." | tee -a configure.log
+ fi
+}
+
case "${ARCH}" in
i386 | i486 | i586 | i686 | x86_64)
# Enable deflate_medium at level 1
ARCHDIR=arch/s390
if test $without_optimizations -eq 0; then
+ if test $buildcrc32vx -eq 1; then
+ CFLAGS="${CFLAGS} -DS390_FEATURES"
+ SFLAGS="${SFLAGS} -DS390_FEATURES"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} s390.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} s390.lo"
+ fi
+
if test $builddfltccdeflate -eq 1 -o $builddfltccinflate -eq 1; then
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} dfltcc_common.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} dfltcc_common.lo"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} dfltcc_inflate.lo"
ARCH="${ARCH}+dfltcc-inflate"
fi
+
+ if test $buildcrc32vx -eq 1; then
+ check_vgfma_intrinsics
+ if test $HAVE_VGFMA_INTRIN -eq 1; then
+ CFLAGS="${CFLAGS} -DS390_CRC32_VX"
+ SFLAGS="${SFLAGS} -DS390_CRC32_VX"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32-vx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32-vx.lo"
+ ARCH="${ARCH}+crc32-vx"
+ fi
+ fi
fi
;;
*)
/^ACLEFLAG *=/s#=.*#=$acleflag#
/^NEONFLAG *=/s#=.*#=$neonflag#
/^NOLTOFLAG *=/s#=.*#=$noltoflag#
+/^VGFMAFLAG *=/s#=.*#=$vgfmaflag#
" > $ARCHDIR/Makefile
# Append header files dependences.