Force purely aligned loads in inflate_table code length counting

author Adam Stylinski <kungfujesus06@gmail.com>

Fri, 12 Dec 2025 21:23:27 +0000 (16:23 -0500)

committer Hans Kristian Rosbach <hk-github@circlestorm.org>

Sun, 28 Dec 2025 18:31:05 +0000 (19:31 +0100)
author Adam Stylinski <kungfujesus06@gmail.com>
Fri, 12 Dec 2025 21:23:27 +0000 (16:23 -0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 28 Dec 2025 18:31:05 +0000 (19:31 +0100)
diff --git a/inftrees.c b/inftrees.c

index 793a59c916e6d248f397cf95cada5736f59c6393..63c9b75f10a821d909d0c3301abfa151b06d3f3e 100644 (file)
--- a/inftrees.c
+++ b/inftrees.c
@@ -25,22 +25,35 @@ const char PREFIX(inflate_copyright)[] = " inflate 1.3.1 Copyright 1995-2024 Mar
  /* Count number of codes for each code length. */
  static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
      int sym;
-    static const ALIGNED_(32) uint8_t one[32] = {
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    static const ALIGNED_(16) uint8_t one[256] = {
          1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
      };
  
  #if defined(__ARM_NEON) || defined(__ARM_NEON__)
      uint8x16_t s1 = vdupq_n_u8(0);
      uint8x16_t s2 = vdupq_n_u8(0);
  
-    const uint8_t *p = &one[16];
      if (codes & 1) {
-        s1 = vld1q_u8(&p[-lens[0]]);
+        s1 = vld1q_u8(&one[16 * lens[0]]);
      }
      for (sym = codes & 1; sym < codes; sym += 2) {
-      s1 = vaddq_u8(s1, vld1q_u8(&p[-lens[sym]]));
-      s2 = vaddq_u8(s2, vld1q_u8(&p[-lens[sym+1]]));
+      s1 = vaddq_u8(s1, vld1q_u8(&one[16 * lens[sym]]));
+      s2 = vaddq_u8(s2, vld1q_u8(&one[16 * lens[sym+1]]));
      }
  
      vst1q_u16(&count[0], vaddl_u8(vget_low_u8(s1), vget_low_u8(s2)));
@@ -50,13 +63,12 @@ static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
      __m128i s1 = _mm_setzero_si128();
      __m128i s2 = _mm_setzero_si128();
  
-    const uint8_t *p = (uint8_t*)&one[16];
      if (codes & 1) {
-        s1 = _mm_loadu_si128((const __m128i*)&p[-lens[0]]);
+        s1 = _mm_load_si128((const __m128i*)&one[16 * lens[0]]);
      }
      for (sym = codes & 1; sym < codes; sym += 2) {
-        s1 = _mm_add_epi8(s1, _mm_loadu_si128((const __m128i*)&p[-lens[sym]]));   // vaddq_u8
-        s2 = _mm_add_epi8(s2, _mm_loadu_si128((const __m128i*)&p[-lens[sym+1]]));
+        s1 = _mm_add_epi8(s1, _mm_load_si128((const __m128i*)&one[16 * lens[sym]]));  // vaddq_u8
+        s2 = _mm_add_epi8(s2, _mm_load_si128((const __m128i*)&one[16 * lens[sym+1]]));
      }
  
  #  if defined(__AVX2__)
author	Adam Stylinski <kungfujesus06@gmail.com>
	Fri, 12 Dec 2025 21:23:27 +0000 (16:23 -0500)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Sun, 28 Dec 2025 18:31:05 +0000 (19:31 +0100)