From: Nathan Moinvaziri <nathan@nathanm.com>
Date: Fri, 2 Jan 2026 23:03:33 +0000 (-0800)
Subject: Generate shuffle masks in registers for partial_fold.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7f735dbc18513e2c8eac939443703681f2855fa8;p=thirdparty%2Fzlib-ng.git

Generate shuffle masks in registers for partial_fold.

Faster than loading table into memory.
---

diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index d18839bbb..c20e9a1ce 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -122,32 +122,15 @@ static inline void fold_16(__m512i *zmm_crc0, __m512i *zmm_crc1, __m512i *zmm_cr
 }
 #endif
 
-static const unsigned ALIGNED_(32) pshufb_shf_table[60] = {
-    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
-    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
-    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
-    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
-    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
-    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
-    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7 */
-    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8 */
-    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9 */
-    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10*/
-    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11*/
-    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12*/
-    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
-    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
-    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
-};
-
 static inline void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
     __m128i *xmm_crc3, __m128i *xmm_crc_part, const __m128i xmm_fold4) {
     const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
+    const __m128i xmm_seq = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
     __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
     __m128i xmm_a0_0, xmm_a0_1;
 
-    xmm_shl = _mm_load_si128((__m128i *)(pshufb_shf_table + (4 * (len - 1))));
+    xmm_shl = _mm_add_epi8(xmm_seq, _mm_set1_epi8((char)(len - 16)));
     xmm_shr = _mm_xor_si128(xmm_shl, xmm_mask3);
 
     xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);