From: Adam Stylinski
Date: Fri, 12 Dec 2025 21:23:27 +0000 (-0500)
Subject: Force purely aligned loads in inflate_table code length counting
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f921e202f4de7ee380a7cdb6c0b60e69404a566d;p=thirdparty%2Fzlib-ng.git

Force purely aligned loads in inflate_table code length counting

At the expense of a somewhat larger static table (it eats about 4 more
cache lines), let's make these loads purely aligned. On potato CPUs such
as the Core 2, unaligned loads in a loop are not ideal. Additionally,
some SBC-class ARM chips (usually the little cores in big.LITTLE parts)
pay a penalty for unaligned loads.

This also paves the way for a trivial AltiVec implementation, for which
unaligned loads don't exist and have to be synthesized with permutation
vectors.
---

diff --git a/inftrees.c b/inftrees.c
index 793a59c91..63c9b75f1 100644
--- a/inftrees.c
+++ b/inftrees.c
@@ -25,22 +25,35 @@ const char PREFIX(inflate_copyright)[] = " inflate 1.3.1 Copyright 1995-2024 Mar
 /* Count number of codes for each code length. */
 static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
     int sym;
-    static const ALIGNED_(32) uint8_t one[32] = {
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    static const ALIGNED_(16) uint8_t one[256] = {
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
     };
 
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
     uint8x16_t s1 = vdupq_n_u8(0);
     uint8x16_t s2 = vdupq_n_u8(0);
-    const uint8_t *p = &one[16];
 
     if (codes & 1) {
-        s1 = vld1q_u8(&p[-lens[0]]);
+        s1 = vld1q_u8(&one[16 * lens[0]]);
     }
     for (sym = codes & 1; sym < codes; sym += 2) {
-        s1 = vaddq_u8(s1, vld1q_u8(&p[-lens[sym]]));
-        s2 = vaddq_u8(s2, vld1q_u8(&p[-lens[sym+1]]));
+        s1 = vaddq_u8(s1, vld1q_u8(&one[16 * lens[sym]]));
+        s2 = vaddq_u8(s2, vld1q_u8(&one[16 * lens[sym+1]]));
     }
 
     vst1q_u16(&count[0], vaddl_u8(vget_low_u8(s1), vget_low_u8(s2)));
@@ -50,13 +63,12 @@ static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
     __m128i s1 = _mm_setzero_si128();
     __m128i s2 = _mm_setzero_si128();
-    const uint8_t *p = (uint8_t*)&one[16];
 
     if (codes & 1) {
-        s1 = _mm_loadu_si128((const __m128i*)&p[-lens[0]]);
+        s1 = _mm_load_si128((const __m128i*)&one[16 * lens[0]]);
     }
     for (sym = codes & 1; sym < codes; sym += 2) {
-        s1 = _mm_add_epi8(s1, _mm_loadu_si128((const __m128i*)&p[-lens[sym]])); // vaddq_u8
-        s2 = _mm_add_epi8(s2, _mm_loadu_si128((const __m128i*)&p[-lens[sym+1]]));
+        s1 = _mm_add_epi8(s1, _mm_load_si128((const __m128i*)&one[16 * lens[sym]])); // vaddq_u8
+        s2 = _mm_add_epi8(s2, _mm_load_si128((const __m128i*)&one[16 * lens[sym+1]]));
     }
 
 # if defined(__AVX2__)
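
For readers skimming the diff: the old 32-byte table held a single 1 at
one[16], so &one[16 - len] yielded a vector with a 1 in lane len at the
cost of an unaligned load per symbol; the new 256-byte table stores that
same one-hot row per code length at offset 16 * len, which always starts
on a 16-byte boundary. A rough scalar sketch of what the vector loops
compute follows; the helper name count_lengths_ref and its
designated-initializer table are illustrative only, not code from this
patch.

#include <stdint.h>
#include <string.h>

/*
 * Illustrative scalar reference (not zlib-ng code): row r of the table is a
 * one-hot 16-byte vector with a 1 at byte position r, so summing the rows
 * selected by lens[] lane by lane yields count[len] = number of codes of
 * length len, the histogram inflate_table() consumes.
 */
static void count_lengths_ref(const uint16_t *lens, int codes, uint16_t *count) {
    static const uint8_t one[256] = {
        [0*16 + 0]   = 1, [1*16 + 1]   = 1, [2*16 + 2]   = 1, [3*16 + 3]   = 1,
        [4*16 + 4]   = 1, [5*16 + 5]   = 1, [6*16 + 6]   = 1, [7*16 + 7]   = 1,
        [8*16 + 8]   = 1, [9*16 + 9]   = 1, [10*16 + 10] = 1, [11*16 + 11] = 1,
        [12*16 + 12] = 1, [13*16 + 13] = 1, [14*16 + 14] = 1, [15*16 + 15] = 1,
    };
    uint16_t acc[16] = {0};

    for (int sym = 0; sym < codes; sym++) {
        /* lens[sym] is 0..15, so 16 * lens[sym] selects a full 16-byte row;
         * in the patched table that row is 16-byte aligned, which is what
         * lets the SIMD paths use aligned loads. */
        const uint8_t *row = &one[16 * lens[sym]];
        for (int lane = 0; lane < 16; lane++)
            acc[lane] += row[lane];   /* one vector add in the SIMD paths */
    }
    memcpy(count, acc, sizeof(acc));  /* count[] has MAXBITS+1 == 16 entries */
}

The SIMD paths in the diff split this sum across two byte accumulators
and only widen to 16 bits at the end; with at most 288 codes per call,
each 8-bit lane sees at most 144 increments, so the narrow accumulators
cannot overflow before the widening store.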