option(WITH_ACLE "Build with ACLE" ON)
option(WITH_NEON "Build with NEON intrinsics" ON)
elseif(BASEARCH_PPC_FOUND)
+ option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
option(WITH_POWER8 "Build with optimisations for POWER8" ON)
elseif(BASEARCH_S360_FOUND)
option(WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z" OFF)
WITH_AVX2 WITH_SSE2
WITH_SSSE3 WITH_SSE4
WITH_PCLMULQDQ
+ WITH_ALTIVEC
WITH_POWER8
WITH_INFLATE_STRICT
WITH_INFLATE_ALLOW_INVALID_DIST
endif()
endif()
elseif(BASEARCH_PPC_FOUND)
+ # Common arch detection code
+ if(WITH_ALTIVEC)
+ check_ppc_intrinsics()
+ endif()
if(WITH_POWER8)
check_power8_intrinsics()
+ endif()
+ if(HAVE_VMX OR HAVE_POWER8_INTRIN)
+ list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
+ endif()
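+    # power.c implements the runtime CPU feature detection shared by the
+    # VMX and POWER8 code paths, so it is built whenever either is supported.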
+ # VMX specific options and files
+ if(WITH_ALTIVEC)
+ if(HAVE_VMX)
+ add_definitions(-DPPC_FEATURES)
+ if(HAVE_ALTIVEC)
+ add_definitions(-DPPC_VMX_ADLER32)
+ add_definitions(-DPPC_VMX_SLIDEHASH)
+ set(PPC_SRCS ${ARCHDIR}/adler32_vmx.c ${ARCHDIR}/slide_hash_vmx.c)
+ list(APPEND ZLIB_ARCH_SRCS ${PPC_SRCS})
+ add_feature_info(ALTIVEC 1 "Support the AltiVec instruction set, using \"-maltivec\"")
+ set_property(SOURCE ${PPC_SRCS} PROPERTY COMPILE_FLAGS "${PPCFLAGS}")
+ else()
+ set(WITH_ALTIVEC OFF)
+ endif()
+ endif()
+ endif()
+ # Power8 specific options and files
+ if(WITH_POWER8)
if(HAVE_POWER8_INTRIN)
add_definitions(-DPOWER8)
add_definitions(-DPOWER_FEATURES)
add_definitions(-DPOWER8_VSX_ADLER32)
add_definitions(-DPOWER8_VSX_CHUNKSET)
add_definitions(-DPOWER8_VSX_SLIDEHASH)
- list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/power.h)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/power.c)
set(POWER8_SRCS ${ARCHDIR}/adler32_power8.c ${ARCHDIR}/chunkset_power8.c ${ARCHDIR}/slide_hash_power8.c)
list(APPEND ZLIB_ARCH_SRCS ${POWER8_SRCS})
set_property(SOURCE ${POWER8_SRCS} PROPERTY COMPILE_FLAGS "${POWER8FLAG} ${NOLTOFLAG}")
add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE")
add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics")
elseif(BASEARCH_PPC_FOUND)
+ add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
elseif(BASEARCH_S360_FOUND)
add_feature_info(WITH_DFLTCC_DEFLATE WITH_DFLTCC_DEFLATE "Build with DFLTCC intrinsics for compression on IBM Z")
# Makefile for POWER-specific files
# Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+# Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
# For conditions of distribution and use, see copyright notice in zlib.h
CC=
SUFFIX=
P8FLAGS=-mcpu=power8
+PPCFLAGS=-maltivec
NOLTOFLAG=
SRCDIR=.
power.lo \
adler32_power8.o \
adler32_power8.lo \
+ adler32_vmx.o \
+ adler32_vmx.lo \
chunkset_power8.o \
chunkset_power8.lo \
slide_hash_power8.o \
- slide_hash_power8.lo
+ slide_hash_power8.lo \
+ slide_hash_vmx.o \
+ slide_hash_vmx.lo
power.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/power.c
adler32_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_power8.c
+adler32_vmx.o:
+ $(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
+adler32_vmx.lo:
+ $(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_vmx.c
+
chunkset_power8.o:
$(CC) $(CFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_power8.c
slide_hash_power8.lo:
$(CC) $(SFLAGS) $(P8FLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_power8.c
+slide_hash_vmx.o:
+	$(CC) $(CFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
+slide_hash_vmx.lo:
+	$(CC) $(SFLAGS) $(PPCFLAGS) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_vmx.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
--- /dev/null
+/* adler32_vmx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef PPC_VMX_ADLER32
+#include <altivec.h>
+#include "zutil.h"
+#include "adler32_p.h"
+
+#define vmx_zero() (vec_splat_u32(0))
+
+/* Pairwise widening horizontal adds: multiplying by a vector of ones with
+ * vec_mulo/vec_mule picks the odd/even lanes already widened, so adding the
+ * two results sums each adjacent pair into the next wider element type. */
+static inline vector unsigned short vec_hadduh(vector unsigned char a) {
+    vector unsigned char vmx_one = vec_splat_u8(1);
+    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
+}
+
+static inline vector unsigned int vec_hadduw(vector unsigned short a) {
+    vector unsigned short vmx_one = vec_splat_u16(1);
+    return vec_add(vec_mulo(a, vmx_one), vec_mule(a, vmx_one));
+}
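+/* For example, for bytes {1,2,...,16}, vec_hadduh yields the pair sums
+ * {3,7,11,15,19,23,27,31} and vec_hadduw folds those to {10,26,42,58};
+ * whatever the lane order, the lanes total 136, the sum of the input bytes. */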
+
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
+    size_t i;
+ for (i = 0; i < len; ++i) {
+ pair[0] += buf[i];
+ pair[1] += pair[0];
+ }
+}
+
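+/* Accumulate the split sums over len 16-byte blocks from a 16-byte-aligned
+ * buf: s[0] gathers the byte sum (adler), s[1] the weighted sum (sum2). */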
+static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
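+    /* Weights for the sum2 term: within each 16-byte block the first byte
+     * is counted 16 times, the last byte once. */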
+ static const uint8_t tc0[16] ALIGNED_(16) = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+ vector unsigned char t0 = vec_ld(0, tc0);
+ vector unsigned int adacc, s2acc;
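+    /* Seed lane 0 of each accumulator with the incoming scalar sums. */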
+ adacc = vec_insert(s[0], vmx_zero(), 0);
+ s2acc = vec_insert(s[1], vmx_zero(), 0);
+
+ while (len > 0) {
+ vector unsigned char d0 = vec_ld(0, buf);
+ vector unsigned short sum2;
+ sum2 = vec_add(vec_mulo(t0, d0), vec_mule(t0, d0));
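+        /* Every byte seen in earlier blocks counts 16 more times in sum2. */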
+ s2acc = vec_add(s2acc, vec_sl(adacc, vec_splat_u32(4)));
+ s2acc = vec_add(s2acc, vec_hadduw(sum2));
+ adacc = vec_add(adacc, vec_hadduw(vec_hadduh(d0)));
+ buf += 16;
+ len--;
+ }
+
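+    /* Each lane now holds an independent partial sum; folding the four
+     * lanes with scalar adds yields the final scalar sums. */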
+ s[0] = vec_extract(adacc, 0) + vec_extract(adacc, 1) + vec_extract(adacc, 2) + vec_extract(adacc, 3); /* Horizontal add */
+ s[1] = vec_extract(s2acc, 0) + vec_extract(s2acc, 1) + vec_extract(s2acc, 2) + vec_extract(s2acc, 3); /* Horizontal add */
+}
+
+uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+ uint32_t pair[2];
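+    /* NMAX is the largest number of bytes that can be summed before the
+     * 32-bit component sums must be reduced modulo BASE to avoid overflow. */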
+ int n = NMAX;
+    size_t done = 0, i;
+
+    /* Split the Adler-32 into its component sums; a non-default initial
+     * value may be supplied by the caller (e.g. when checksumming the
+     * pieces of a PNG file incrementally).
+     */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+ pair[0] = adler;
+ pair[1] = sum2;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (UNLIKELY(len == 1))
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (UNLIKELY(buf == NULL))
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (UNLIKELY(len < 16))
+ return adler32_len_16(adler, buf, len, sum2);
+
+    /* Align buf to a 16-byte boundary, handling any head bytes scalar-wise
+     * so that the vector loop below can use aligned loads. */
+    unsigned int al = 0;
+    if ((uintptr_t)buf & 0xf) {
+        al = 16 - ((uintptr_t)buf & 0xf);
+        if (al > len) {
+            al = len;
+        }
+        vmx_handle_head_or_tail(pair, buf, al);
+        pair[0] %= BASE;
+        pair[1] %= BASE;
+
+        done += al;
+    }
+ for (i = al; i < len; i += n) {
+ if ((i + n) > len)
+ n = (int)(len - i);
+
+ if (n < 16)
+ break;
+
+ vmx_accum32(pair, buf + i, n / 16);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+
+ done += (n / 16) * 16;
+ }
+
+ /* Handle the tail elements. */
+ if (done < len) {
+ vmx_handle_head_or_tail(pair, (buf + done), len - done);
+ pair[0] %= BASE;
+ pair[1] %= BASE;
+ }
+
+ /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
+ return (pair[1] << 16) | pair[0];
+}
+#endif
/* POWER feature check
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include <sys/auxv.h>
#include "../../zutil.h"
+#include "power.h"
-Z_INTERNAL int power_cpu_has_arch_2_07;
+Z_INTERNAL int power_cpu_has_altivec = 0;
+Z_INTERNAL int power_cpu_has_arch_2_07 = 0;
void Z_INTERNAL power_check_features(void) {
+#ifdef PPC_FEATURES
+ unsigned long hwcap;
+ hwcap = getauxval(AT_HWCAP);
+
+ if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ power_cpu_has_altivec = 1;
+#endif
+
+#ifdef POWER_FEATURES
unsigned long hwcap2;
hwcap2 = getauxval(AT_HWCAP2);
-#ifdef POWER8
if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- power_cpu_has_arch_2_07 = 1;
+ power_cpu_has_arch_2_07 = 1;
#endif
}
/* power.h -- check for POWER CPU features
* Copyright (C) 2020 Matheus Castanho <msc@linux.ibm.com>, IBM
+ * Copyright (C) 2021 Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef POWER_H_
#define POWER_H_
+extern int power_cpu_has_altivec;
extern int power_cpu_has_arch_2_07;
void Z_INTERNAL power_check_features(void);
--- /dev/null
+/* Optimized slide_hash for PowerPC processors with VMX instructions
+ * Copyright (C) 2017-2021 Mika T. Lindqvist <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#ifdef PPC_VMX_SLIDEHASH
+
+#include <altivec.h>
+#include "zbuild.h"
+#include "deflate.h"
+
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ const vector unsigned short vmx_wsize = vec_splats(wsize);
+ Pos *p = table;
+
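+    /* Rebase eight 16-bit entries per iteration; entries is assumed to be a
+     * multiple of 8 and table 16-byte aligned, which holds for the
+     * power-of-two head and prev tables. vec_subs saturates, so positions
+     * older than the slid window clamp to zero instead of wrapping. */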
+ do {
+ vector unsigned short value, result;
+
+ value = vec_ld(0, p);
+ result = vec_subs(value, vmx_wsize);
+ vec_st(result, 0, p);
+
+ p += 8;
+ entries -= 8;
+ } while (entries > 0);
+}
+
+void Z_INTERNAL slide_hash_vmx(deflate_state *s) {
+ uint16_t wsize = s->w_size;
+
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+
+#endif /* PPC_VMX_SLIDEHASH */
endif()
endmacro()
+macro(check_ppc_intrinsics)
+ # Check if compiler supports AltiVec
+ set(CMAKE_REQUIRED_FLAGS "-maltivec")
+ check_c_source_compiles(
+ "#include <altivec.h>
+ int main(void)
+ {
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+ }"
+ HAVE_ALTIVEC
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+
+ if(HAVE_ALTIVEC)
+ set(PPCFLAGS "-maltivec")
+ endif()
+
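+    # Check for -mno-vsx support; disabling VSX keeps the compiler from
+    # emitting VSX instructions in the AltiVec paths, so they also run on
+    # older pre-VSX CPUs.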
+ set(CMAKE_REQUIRED_FLAGS "-maltivec -mno-vsx")
+ check_c_source_compiles(
+ "#include <altivec.h>
+ int main(void)
+ {
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+ }"
+ HAVE_NOVSX
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+
+ if(HAVE_NOVSX)
+ set(PPCFLAGS "${PPCFLAGS} -mno-vsx")
+ endif()
+
+    # Check if we have what we need for AltiVec optimisations
+    set(CMAKE_REQUIRED_FLAGS "${PPCFLAGS}")
+    check_c_source_compiles(
+        "#include <sys/auxv.h>
+        int main(void)
+        {
+            return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC);
+        }"
+        HAVE_VMX
+    )
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_power8_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
build32=0
build64=0
buildacle=1
+buildaltivec=1
buildneon=1
builddfltccdeflate=0
builddfltccinflate=0
neonflag=
noltoflag="-fno-lto"
vgfmaflag="-march=z13"
+vmxflag="-maltivec"
without_optimizations=0
without_new_strategies=0
reducedmem=0
echo ' [--without-new-strategies] Compiles without using new additional deflate strategies' | tee -a configure.log
echo ' [--without-acle] Compiles without ARM C Language Extensions' | tee -a configure.log
echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log
+ echo ' [--without-altivec] Compiles without PPC AltiVec support' | tee -a configure.log
echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log
echo ' [--with-dfltcc-inflate] Use DEFLATE CONVERSION CALL instruction for decompression on IBM Z' | tee -a configure.log
echo ' [--without-crc32-vx] Build without vectorized CRC32 on IBM Z' | tee -a configure.log
-6* | --64) build64=1; shift ;;
--without-acle) buildacle=0; shift ;;
--without-neon) buildneon=0; shift ;;
+    --without-altivec) buildaltivec=0; shift ;;
--with-dfltcc-deflate) builddfltccdeflate=1; shift ;;
--with-dfltcc-inflate) builddfltccinflate=1; shift ;;
--without-crc32-vx) buildcrc32vx=0; shift ;;
fi
}
+check_ppc_intrinsics() {
+ cat > $test.c << EOF
+#include <altivec.h>
+int main(void)
+{
+ vector int a = vec_splats(0);
+ vector int b = vec_splats(0);
+ a = vec_add(a, b);
+ return 0;
+}
+EOF
+ if test $buildaltivec -eq 1 && try ${CC} ${CFLAGS} -maltivec $test.c; then
+ echo "Checking for AltiVec intrinsics ... Yes." | tee -a configure.log
+ HAVE_ALTIVEC_INTRIN=1
+ else
+ echo "Checking for AltiVec intrinsics ... No." | tee -a configure.log
+ HAVE_ALTIVEC_INTRIN=0
+ fi
+ if test $buildaltivec -eq 1 && try ${CC} ${CFLAGS} -maltivec -mno-vsx $test.c; then
+ echo "Checking if -mno-vsx is supported ... Yes." | tee -a configure.log
+ vmxflag="$vmxflag -mno-vsx"
+ else
+ echo "Checking if -mno-vsx is supported ... No." | tee -a configure.log
+ fi
+ cat > $test.c << EOF
+#include <sys/auxv.h>
+int main() { return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC); }
+EOF
+    if test $buildaltivec -eq 1 && try ${CC} -c ${CFLAGS} -maltivec $test.c; then
+ HAVE_VMX=1
+ echo "Check whether VMX instructions are available ... Yes." | tee -a configure.log
+ else
+ HAVE_VMX=0
+ echo "Check whether VMX instructions are available ... No." | tee -a configure.log
+ fi
+}
+
check_power8_intrinsics() {
# Check whether features needed by POWER optimisations are available
cat > $test.c << EOF
if test $without_optimizations -eq 0; then
+ check_ppc_intrinsics
check_power8_intrinsics
+ if test $HAVE_VMX -eq 1; then
+ CFLAGS="${CFLAGS} -DPPC_FEATURES"
+ SFLAGS="${SFLAGS} -DPPC_FEATURES"
+ fi
+ if test $HAVE_VMX -eq 1 -o $HAVE_POWER8_INTRIN -eq 1; then
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo"
+ fi
+ if test $HAVE_VMX -eq 1 -a $HAVE_ALTIVEC_INTRIN -eq 1; then
+ CFLAGS="${CFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DPPC_VMX_ADLER32 -DPPC_VMX_SLIDEHASH"
+
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_vmx.o slide_hash_vmx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_vmx.lo slide_hash_vmx.lo"
+ fi
if test $HAVE_POWER8_INTRIN -eq 1; then
CFLAGS="${CFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
SFLAGS="${SFLAGS} -DPOWER8 -DPOWER_FEATURES -DPOWER8_VSX_ADLER32 -DPOWER8_VSX_CHUNKSET -DPOWER8_VSX_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} power.o adler32_power8.o chunkset_power8.o slide_hash_power8.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} power.lo adler32_power8.lo chunkset_power8.lo slide_hash_power8.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_power8.o chunkset_power8.o slide_hash_power8.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_power8.lo chunkset_power8.lo slide_hash_power8.lo"
fi
fi
;;
/^NEONFLAG *=/s#=.*#=$neonflag#
/^NOLTOFLAG *=/s#=.*#=$noltoflag#
/^VGFMAFLAG *=/s#=.*#=$vgfmaflag#
+/^PPCFLAGS *=/s#=.*#=$vmxflag#
" > $ARCHDIR/Makefile
# Append header file dependencies.
void slide_hash_sse2(deflate_state *s);
#elif defined(ARM_NEON_SLIDEHASH)
void slide_hash_neon(deflate_state *s);
-#elif defined(POWER8_VSX_SLIDEHASH)
+#endif
+#if defined(PPC_VMX_SLIDEHASH)
+void slide_hash_vmx(deflate_state *s);
+#endif
+#if defined(POWER8_VSX_SLIDEHASH)
void slide_hash_power8(deflate_state *s);
#endif
#ifdef X86_AVX2
#ifdef ARM_NEON_ADLER32
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef PPC_VMX_ADLER32
+extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
x86_check_features();
#elif defined(ARM_FEATURES)
arm_check_features();
-#elif defined(POWER_FEATURES)
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
power_check_features();
#elif defined(S390_FEATURES)
s390_check_features();
if (x86_cpu_has_avx2)
functable.slide_hash = &slide_hash_avx2;
#endif
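+    /* Note: the POWER8 branch below is checked after this one, so on CPUs
+     * supporting both, the POWER8 implementation takes precedence. */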
+#ifdef PPC_VMX_SLIDEHASH
+ if (power_cpu_has_altivec)
+ functable.slide_hash = &slide_hash_vmx;
+#endif
#ifdef POWER8_VSX_SLIDEHASH
if (power_cpu_has_arch_2_07)
functable.slide_hash = &slide_hash_power8;
if (x86_cpu_has_avx2)
functable.adler32 = &adler32_avx2;
#endif
+#ifdef PPC_VMX_ADLER32
+ if (power_cpu_has_altivec)
+ functable.adler32 = &adler32_vmx;
+#endif
#ifdef POWER8_VSX_ADLER32
if (power_cpu_has_arch_2_07)
functable.adler32 = &adler32_power8;
# include "arch/x86/x86.h"
#elif defined(ARM_FEATURES)
# include "arch/arm/arm.h"
-#elif defined(POWER_FEATURES)
+#elif defined(PPC_FEATURES) || defined(POWER_FEATURES)
# include "arch/power/power.h"
#elif defined(S390_FEATURES)
# include "arch/s390/s390.h"