git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Optimize symbol buffer access based on platform unaligned access
author Nathan Moinvaziri <nathan@nathanm.com>
Sun, 1 Feb 2026 00:57:24 +0000 (16:57 -0800)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Wed, 18 Feb 2026 12:57:07 +0000 (13:57 +0100)
deflate.c
deflate.h
deflate_p.h
trees.c

diff --git a/deflate.c b/deflate.c
index e0f89fab9d3b86e40463b2c9eb5736451ea5b197..81e1ac5a5fc50de9f3f1324c452d53d9020b1531 100644 (file)
--- a/deflate.c
+++ b/deflate.c
@@ -167,7 +167,7 @@ Z_INTERNAL deflate_allocs* alloc_deflate(PREFIX3(stream) *strm, int windowBits,
     int window_size = DEFLATE_ADJUST_WINDOW_SIZE((1 << windowBits) * 2);
     int prev_size = (1 << windowBits) * (int)sizeof(Pos);
     int head_size = HASH_SIZE * sizeof(Pos);
-    int pending_size = lit_bufsize * LIT_BUFS;
+    int pending_size = (lit_bufsize * LIT_BUFS) + 1;
     int state_size = sizeof(deflate_state);
     int alloc_size = sizeof(deflate_allocs);
 
diff --git a/deflate.h b/deflate.h
index 85435636d4f4d846dc6bb1d96dc8cc03d4708e72..3f9f8f468652a7fbff015c7ffe45737354270a3f 100644 (file)
--- a/deflate.h
+++ b/deflate.h
 #  define GZIP
 #endif
 
-/* define LIT_MEM to slightly increase the speed of deflate (order 1% to 2%) at
-   the cost of a larger memory footprint */
-#ifndef NO_LIT_MEM
+/* LIT_MEM uses separate distance/length buffers instead of the overlaid sym_buf.
+   This uses ~20% more memory but is 1-2% faster on platforms without fast unaligned
+   access. By default, LIT_MEM is only enabled when OPTIMAL_CMP < 32. Define LIT_MEM
+   to force separate buffers, or NO_LIT_MEM to force sym_buf usage. */
+#if !defined(LIT_MEM) && !defined(NO_LIT_MEM) && (OPTIMAL_CMP < 32)
 #  define LIT_MEM
 #endif
 
diff --git a/deflate_p.h b/deflate_p.h
index ae340f8f371f65a26ba52ce9468bc2c8e04637da..f60970bab37666b82f00fb182e39169a1b93ba1d 100644 (file)
--- a/deflate_p.h
+++ b/deflate_p.h
@@ -11,6 +11,7 @@
 
 #include "functable.h"
 #include "fallback_builtins.h"
+#include "zmemory.h"
 
 /* Forward declare common non-inlined functions declared in deflate.c */
 
@@ -68,9 +69,13 @@ static inline int zng_tr_tally_lit(deflate_state *s, unsigned char c) {
     s->l_buf[sym_next] = c;
     s->sym_next = sym_next + 1;
 #else
+#  if OPTIMAL_CMP >= 32
+    zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE((uint32_t)c << 16));
+#  else
     s->sym_buf[sym_next] = 0;
     s->sym_buf[sym_next+1] = 0;
     s->sym_buf[sym_next+2] = c;
+#  endif
     s->sym_next = sym_next + 3;
 #endif
     s->dyn_ltree[c].Freq++;
@@ -90,9 +95,13 @@ static inline int zng_tr_tally_dist(deflate_state* s, uint32_t dist, uint32_t le
     s->l_buf[sym_next] = (uint8_t)len;
     s->sym_next = sym_next + 1;
 #else
+#  if OPTIMAL_CMP >= 32
+    zng_memwrite_4(&s->sym_buf[sym_next], Z_U32_TO_LE(dist | ((uint32_t)len << 16)));
+#  else
     s->sym_buf[sym_next] = (uint8_t)(dist);
     s->sym_buf[sym_next+1] = (uint8_t)(dist >> 8);
     s->sym_buf[sym_next+2] = (uint8_t)len;
+#  endif
     s->sym_next = sym_next + 3;
 #endif
     s->matches++;
diff --git a/trees.c b/trees.c
index 28ea5d2f1f28aeeb6e3b83cbfa203798464bbd23..0f35b68d935e19341ab0ff79580a8da7a25d2e03 100644 (file)
--- a/trees.c
+++ b/trees.c
@@ -730,9 +730,15 @@ static void compress_block(deflate_state *s, const ct_data *ltree, const ct_data
             dist = d_buf[sx];
             lc = l_buf[sx++];
 #else
-            dist = sym_buf[sx++] & 0xff;
-            dist += (unsigned)(sym_buf[sx++] & 0xff) << 8;
-            lc = sym_buf[sx++];
+#  if OPTIMAL_CMP >= 32
+            uint32_t val = Z_U32_FROM_LE(zng_memread_4(&sym_buf[sx]));
+            dist = val & 0xffff;
+            lc = (val >> 16) & 0xff;
+#  else
+            dist = sym_buf[sx] + ((unsigned)sym_buf[sx + 1] << 8);
+            lc = sym_buf[sx + 2];
+#  endif
+            sx += 3;
 #endif
             if (dist == 0) {
                 zng_emit_lit(s, ltree, lc);