From: Adam Stylinski
Date: Fri, 12 Dec 2025 21:23:27 +0000 (-0500)
Subject: Force purely aligned loads in inflate_table code length counting
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f921e202f4de7ee380a7cdb6c0b60e69404a566d;p=thirdparty%2Fzlib-ng.git

Force purely aligned loads in inflate_table code length counting

At the expense of a somewhat larger static table (it eats about 4 more
cache lines), let's make these loads purely aligned. On potato CPUs such
as the Core 2, unaligned loads in a loop are not ideal. Additionally,
some SBC-class ARM chips (usually the little cores in big.LITTLE parts)
pay a penalty for unaligned loads.

This also paves the way for a trivial AltiVec implementation, for which
unaligned loads don't exist and have to be synthesized with permutation
vectors.
---

diff --git a/inftrees.c b/inftrees.c
index 793a59c91..63c9b75f1 100644
--- a/inftrees.c
+++ b/inftrees.c
@@ -25,22 +25,35 @@ const char PREFIX(inflate_copyright)[] = " inflate 1.3.1 Copyright 1995-2024 Mar
 /* Count number of codes for each code length. */
 static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
     int sym;
-    static const ALIGNED_(32) uint8_t one[32] = {
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    static const ALIGNED_(16) uint8_t one[256] = {
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
     };
 
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
     uint8x16_t s1 = vdupq_n_u8(0);
     uint8x16_t s2 = vdupq_n_u8(0);
-    const uint8_t *p = &one[16];
 
     if (codes & 1) {
-        s1 = vld1q_u8(&p[-lens[0]]);
+        s1 = vld1q_u8(&one[16 * lens[0]]);
     }
     for (sym = codes & 1; sym < codes; sym += 2) {
-        s1 = vaddq_u8(s1, vld1q_u8(&p[-lens[sym]]));
-        s2 = vaddq_u8(s2, vld1q_u8(&p[-lens[sym+1]]));
+        s1 = vaddq_u8(s1, vld1q_u8(&one[16 * lens[sym]]));
+        s2 = vaddq_u8(s2, vld1q_u8(&one[16 * lens[sym+1]]));
     }
 
     vst1q_u16(&count[0], vaddl_u8(vget_low_u8(s1), vget_low_u8(s2)));
@@ -50,13 +63,12 @@ static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
     __m128i s1 = _mm_setzero_si128();
     __m128i s2 = _mm_setzero_si128();
-    const uint8_t *p = (uint8_t*)&one[16];
 
     if (codes & 1) {
-        s1 = _mm_loadu_si128((const __m128i*)&p[-lens[0]]);
+        s1 = _mm_load_si128((const __m128i*)&one[16 * lens[0]]);
     }
     for (sym = codes & 1; sym < codes; sym += 2) {
-        s1 = _mm_add_epi8(s1, _mm_loadu_si128((const __m128i*)&p[-lens[sym]])); // vaddq_u8
-        s2 = _mm_add_epi8(s2, _mm_loadu_si128((const __m128i*)&p[-lens[sym+1]]));
+        s1 = _mm_add_epi8(s1, _mm_load_si128((const __m128i*)&one[16 * lens[sym]])); // vaddq_u8
+        s2 = _mm_add_epi8(s2, _mm_load_si128((const __m128i*)&one[16 * lens[sym+1]]));
     }
 
 # if defined(__AVX2__)
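
For readers skimming the diff: the old 32-byte table held a single 1 at
one[16], so &one[16 - len] yielded a vector with a 1 in lane len at the
cost of an unaligned load per symbol; the new 256-byte table stores that
same one-hot row per code length at offset 16 * len, which always starts
on a 16-byte boundary. A rough scalar sketch of what the vector loops
compute follows; the helper name count_lengths_ref and its
designated-initializer table are illustrative only, not code from this
patch.

#include <stdint.h>
#include <string.h>

/*
 * Illustrative scalar reference (not zlib-ng code): row r of the table is a
 * one-hot 16-byte vector with a 1 at byte position r, so summing the rows
 * selected by lens[] lane by lane yields count[len] = number of codes of
 * length len, the histogram inflate_table() consumes.
 */
static void count_lengths_ref(const uint16_t *lens, int codes, uint16_t *count) {
    static const uint8_t one[256] = {
        [0*16 + 0]   = 1, [1*16 + 1]   = 1, [2*16 + 2]   = 1, [3*16 + 3]   = 1,
        [4*16 + 4]   = 1, [5*16 + 5]   = 1, [6*16 + 6]   = 1, [7*16 + 7]   = 1,
        [8*16 + 8]   = 1, [9*16 + 9]   = 1, [10*16 + 10] = 1, [11*16 + 11] = 1,
        [12*16 + 12] = 1, [13*16 + 13] = 1, [14*16 + 14] = 1, [15*16 + 15] = 1,
    };
    uint16_t acc[16] = {0};

    for (int sym = 0; sym < codes; sym++) {
        /* lens[sym] is 0..15, so 16 * lens[sym] selects a full 16-byte row;
         * in the patched table that row is 16-byte aligned, which is what
         * lets the SIMD paths use aligned loads. */
        const uint8_t *row = &one[16 * lens[sym]];
        for (int lane = 0; lane < 16; lane++)
            acc[lane] += row[lane];   /* one vector add in the SIMD paths */
    }
    memcpy(count, acc, sizeof(acc));  /* count[] has MAXBITS+1 == 16 entries */
}

The SIMD paths in the diff split this sum across two byte accumulators
and only widen to 16 bits at the end; with at most 288 codes per call,
each 8-bit lane sees at most 144 increments, so the narrow accumulators
cannot overflow before the widening store.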