***************************************/
#include "compiler.h"
#include "mem.h"
+#ifdef __aarch64__
+#include "arm_neon.h"
+#endif
#include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */
#include "error_private.h"
#define ZSTD_STATIC_LINKING_ONLY
/*-*******************************************
* Shared functions to include for inlining
*********************************************/
-static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+static void ZSTD_copy8(void* dst, const void* src) {  /* copies exactly 8 bytes from src to dst */
+#ifdef __aarch64__
+ vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));  /* AArch64 build: one 8-byte NEON load (vld1_u8) fed directly into an 8-byte NEON store (vst1_u8) */
+#else
+ memcpy(dst, src, 8);  /* every other target: fixed-size 8-byte memcpy */
+#endif
+}
#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
-static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
+static void ZSTD_copy16(void* dst, const void* src) {  /* copies exactly 16 bytes from src to dst */
+#ifdef __aarch64__
+ vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));  /* AArch64 build: one 16-byte NEON load (vld1q_u8) fed directly into a 16-byte NEON store (vst1q_u8) */
+#else
+ memcpy(dst, src, 16);  /* every other target: fixed-size 16-byte memcpy */
+#endif
+}
#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
#define WILDCOPY_OVERLENGTH 32
* one COPY16() in the first call. Then, do two calls per loop since
* at that point it is more likely to have a high trip count.
*/
+#ifndef __aarch64__
COPY16(op, ip);
if (op >= oend) return;
+#endif
do {
COPY16(op, ip);
COPY16(op, ip);
for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
int column;
+#ifdef __aarch64__
+ for (column=0; column<ZSTD_ROWSIZE; column+=4) {
+ uint32x4_t const zero = {0, 0, 0, 0};
+ uint32x4_t const reducer = vdupq_n_u32(reducerValue);
+ uint32x4_t data = vld1q_u32(table + cellNb);
+ if (preserveMark) {
+ uint32x4_t const mark = {ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK, ZSTD_DUBT_UNSORTED_MARK};
+ data = vbslq_u32(vceqq_u32(data, mark), vaddq_u32(data, reducer), data);
+ }
+ data = vbslq_u32(vcltq_u32(data, reducer), zero, vsubq_u32(data, reducer));
+ vst1q_u32(table + cellNb, data);
+ cellNb+=4;
+ }
+#else
for (column=0; column<ZSTD_ROWSIZE; column++) {
if (preserveMark) {
U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
if (table[cellNb] < reducerValue) table[cellNb] = 0;
else table[cellNb] -= reducerValue;
cellNb++;
- } }
+ }
+#endif
+ }
}
static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)