From: Adam Stylinski <kungfujesus06@gmail.com>
Date: Thu, 2 Jun 2022 22:46:56 +0000 (-0400)
Subject: Improve the swizzle of the memory magazine fed in a chunk copy for neon
X-Git-Tag: 2.1.0-beta1~206
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1bcd15eaf32dbfcd834f1b21f5894edd6b466095;p=thirdparty%2Fzlib-ng.git

Improve the swizzle of the memory magazine fed in a chunk copy for neon

Like the x86 variant, we can leverage the same tables to load a vector
register worth of values. This shows a vast improvement in places where
very large run length encodes can be found in the lz runs.
---

diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
index ca8420d34..29065f77c 100644
--- a/arch/arm/chunkset_neon.c
+++ b/arch/arm/chunkset_neon.c
@@ -9,6 +9,7 @@
 #  include <arm_neon.h>
 #endif
 #include "../../zbuild.h"
+#include "../generic/chunk_permute_table.h"
 
 typedef uint8x16_t chunk_t;
 
@@ -17,6 +18,23 @@ typedef uint8x16_t chunk_t;
 #define HAVE_CHUNKMEMSET_2
 #define HAVE_CHUNKMEMSET_4
 #define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+static const lut_rem_pair perm_idx_lut[13] = {
+    {0, 1},      /* 3 */
+    {0, 0},      /* don't care */
+    {1 * 32, 1}, /* 5 */
+    {2 * 32, 4}, /* 6 */
+    {3 * 32, 2}, /* 7 */
+    {0 * 32, 0}, /* don't care */
+    {4 * 32, 7}, /* 9 */
+    {5 * 32, 6}, /* 10 */
+    {6 * 32, 5}, /* 11 */
+    {7 * 32, 4}, /* 12 */
+    {8 * 32, 3}, /* 13 */
+    {9 * 32, 2}, /* 14 */
+    {10 * 32, 1},/* 15 */
+};
 
 static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
     uint16_t tmp;
@@ -50,6 +68,34 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
     vst1q_u8(out, *chunk);
 }
 
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+    *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+    /* See note in chunkset_sse41.c for why this is ok */
+    __msan_unpoison(buf + dist, 16 - dist);
+#endif
+    
+    /* This version of table is only available on aarch64 */
+#if defined(_M_ARM64) || defined(__aarch64__)
+    uint8x16_t ret_vec = vld1q_u8(buf);
+
+    uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);
+    return vqtbl1q_u8(ret_vec, perm_vec);
+#else
+    uint8x8_t ret0, ret1, a, b, perm_vec0, perm_vec1;
+    perm_vec0 = vld1_u8(permute_table + lut_rem.idx);
+    perm_vec1 = vld1_u8(permute_table + lut_rem.idx + 8);
+    a = vld1_u8(buf);
+    b = vld1_u8(buf + 8);
+    ret0 = vtbl1_u8(a, perm_vec0);
+    uint8x8x2_t ab = {{a, b}};
+    ret1 = vtbl2_u8(ab, perm_vec1);
+    return vcombine_u8(ret0, ret1);
+#endif
+}
+
 #include "chunkset_tpl.h"
 
 #endif
diff --git a/arch/x86/chunk_permute_table.h b/arch/generic/chunk_permute_table.h
similarity index 100%
rename from arch/x86/chunk_permute_table.h
rename to arch/generic/chunk_permute_table.h
diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c
index bf49c712d..024b37c30 100644
--- a/arch/x86/chunkset_avx.c
+++ b/arch/x86/chunkset_avx.c
@@ -5,7 +5,7 @@
 
 #ifdef X86_AVX_CHUNKSET
 #include <immintrin.h>
-#include "chunk_permute_table.h"
+#include "../generic/chunk_permute_table.h"
 
 typedef __m256i chunk_t;
 
diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_sse41.c
index 9789df76a..42b44d051 100644
--- a/arch/x86/chunkset_sse41.c
+++ b/arch/x86/chunkset_sse41.c
@@ -9,7 +9,7 @@
  * to identical machine code */
 #if defined(X86_SSE41) && defined(X86_SSE2)
 #include <immintrin.h>
-#include "chunk_permute_table.h"
+#include "../generic/chunk_permute_table.h"
 
 typedef __m128i chunk_t;