git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Optimize compression by using neon function.
author: caoyzh <caoyazhen_ok@163.com>
Mon, 16 Mar 2020 03:07:31 +0000 (11:07 +0800)
committer: Nick Terrell <nickrterrell@gmail.com>
Thu, 7 May 2020 20:10:46 +0000 (13:10 -0700)
lib/common/zstd_internal.h
lib/compress/zstd_compress.c

index bd08de821202576f57f01e4c719fd706ea020a24..745a1d0ce884d3b9a3534be65e37764722e40465 100644 (file)
@@ -21,6 +21,9 @@
 ***************************************/
 #include "compiler.h"
 #include "mem.h"
+#ifdef __aarch64__
+#include "arm_neon.h"
+#endif
 #include "debug.h"                 /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
 #include "error_private.h"
 #define ZSTD_STATIC_LINKING_ONLY
@@ -224,10 +227,22 @@ static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
 /*-*******************************************
 *  Shared functions to include for inlining
 *********************************************/
-static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+static void ZSTD_copy8(void* dst, const void* src) {
+#ifdef __aarch64__
+    vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+    memcpy(dst, src, 8);
+#endif
+}
 
 #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
-static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
+static void ZSTD_copy16(void* dst, const void* src) {
+#ifdef __aarch64__
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#else
+    memcpy(dst, src, 16);
+#endif
+}
 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
 
 #define WILDCOPY_OVERLENGTH 32
@@ -269,8 +284,10 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
         * one COPY16() in the first call. Then, do two calls per loop since
         * at that point it is more likely to have a high trip count.
          */
+#ifndef __aarch64__
         COPY16(op, ip);
         if (op >= oend) return;
+#endif
         do {
             COPY16(op, ip);
             COPY16(op, ip);
index ff8a00e6b3b2aded7efb2f2ccf1a276a7306a330..b2830b01e86527c0c9303c5c0127ac2939a4eb9a 100644 (file)
@@ -1865,6 +1865,20 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
 
     for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
         int column;
+#ifdef __aarch64__
+        for (column=0; column<ZSTD_ROWSIZE; column+=4) {
+            uint32x4_t const zero = {0, 0, 0, 0};
+            uint32x4_t const reducer = vdupq_n_u32(reducerValue);
+            uint32x4_t data = vld1q_u32(table + cellNb);
+            if (preserveMark) {
+                uint32x4_t const mark = {ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK};
+                data = vbslq_u32(vceqq_u32(data, mark), vaddq_u32(data, reducer), data);
+            }
+            data = vbslq_u32(vcltq_u32(data, reducer), zero, vsubq_u32(data, reducer));
+            vst1q_u32(table + cellNb, data);
+            cellNb+=4;
+        }
+#else
         for (column=0; column<ZSTD_ROWSIZE; column++) {
             if (preserveMark) {
                 U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
@@ -1873,7 +1887,9 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
             if (table[cellNb] < reducerValue) table[cellNb] = 0;
             else table[cellNb] -= reducerValue;
             cellNb++;
-    }   }
+        }
+#endif
+    }
 }
 
 static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)