Optimize adler32_swar alignment and remove platform conditionals

author Nathan Moinvaziri <nathan@nathanm.com>
Thu, 19 Mar 2026 19:28:22 +0000 (12:28 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 1 May 2026 12:34:00 +0000 (14:34 +0200)
diff --git a/adler32_p.h b/adler32_p.h

index 5b33d2bc8388e42192d19ca3927c3aa4b633449b..efd8878e10ea5c88d1f36b0856c4944593035867 100644 (file)
--- a/adler32_p.h
+++ b/adler32_p.h
@@ -52,9 +52,8 @@ Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t
      }
  }
  
-#if OPTIMAL_CMP >= 64
-/* SWAR scalar adler32 for 64-bit platforms with fast unaligned access. Splits bytes
- * into even/odd lanes packed as 4x16-bit in uint64_t, with prefix sums for s2.
+/* SIMD Within A Register (SWAR) scalar adler32. Splits bytes into
+ * even/odd lanes packed as 4x16-bit in uint64_t, with prefix sums for s2.
   * Reduction uses multiply-and-shift with positional weight constants.
   *
   * Technique pioneered by Michael Niedermayer <michaelni@gmx.at>.
@@ -69,10 +68,11 @@ Z_FORCEINLINE static void adler32_swar(uint32_t *adler, uint8_t *dst, const uint
  
      *sum2 += *adler * (uint32_t)len;
  
+    const uint64_t *src64 = (const uint64_t *)buf;
+
      while (len >= 16) {
-        uint64_t v0, v1;
-        memcpy(&v0, buf, sizeof(v0));
-        memcpy(&v1, buf + 8, sizeof(v1));
+        uint64_t v0 = src64[0];
+        uint64_t v1 = src64[1];
          if (COPY) {
              memcpy(dst, &v0, sizeof(v0));
              memcpy(dst + 8, &v1, sizeof(v1));
@@ -89,14 +89,13 @@ Z_FORCEINLINE static void adler32_swar(uint32_t *adler, uint8_t *dst, const uint
          sum_even +=  v1       & ADLER32_SWAR_EVEN_MASK;
          sum_odd  += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK;
  
-        buf += 16;
+        src64 += 2;
          len -= 16;
      }
  
      /* Handle remaining 8 bytes if present */
      if (len >= 8) {
-        uint64_t v;
-        memcpy(&v, buf, sizeof(v));
+        uint64_t v = *src64;
          if (COPY)
              memcpy(dst, &v, sizeof(v));
  
@@ -131,15 +130,12 @@ Z_FORCEINLINE static void adler32_swar(uint32_t *adler, uint8_t *dst, const uint
  #endif
  }
  
-#endif
-
  Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len,
                                                  uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) {
      if (len) {
-#if OPTIMAL_CMP >= 64
          Z_UNUSED(MAX_LEN);
-        /* Process using packed 64-bit arithmetic */
-        while (len >= 8) {
+        /* Process using packed 64-bit arithmetic when source is aligned */
+        while (len >= 8 && ((uintptr_t)buf & 7) == 0) {
              size_t chunk = MIN(ALIGN_DOWN(len, 8), ADLER32_SWAR_MAX_BYTES);
              adler32_swar(&adler, dst, buf, chunk, &sum2, COPY);
              buf += chunk;
@@ -147,20 +143,6 @@ Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, co
                  dst += chunk;
              len -= chunk;
          }
-#else
-        /* DO16 loop for large remainders only (scalar, risc-v). */
-        if (MAX_LEN >= 32) {
-            while (len >= 16) {
-                if (COPY) {
-                    memcpy(dst, buf, 16);
-                    dst += 16;
-                }
-                len -= 16;
-                ADLER_DO16(adler, sum2, buf);
-                buf += 16;
-            }
-        }
-#endif
          /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */
          while (len >= 4) {
              if (COPY) {
diff --git a/arch/generic/adler32_c.c b/arch/generic/adler32_c.c

index 5161406bce41ca7d3a4d680931f68caca58966a2..0f2224b06f2588719928eeaf8da90b9f9e2fee90 100644 (file)
--- a/arch/generic/adler32_c.c
+++ b/arch/generic/adler32_c.c
@@ -27,24 +27,26 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
      if (UNLIKELY(len < 16))
          return adler32_copy_tail(adler, NULL, buf, len, sum2, 1, 15, 0);
  
+    /* Align source to 8 bytes so SWAR loads are naturally aligned */
+    size_t align_diff = ALIGN_DIFF(buf, 8);
+    if (align_diff) {
+        adler32_copy_align(&adler, NULL, buf, align_diff, &sum2, 7, 0);
+        buf += align_diff;
+        len -= align_diff;
+    }
+
      /* do length NMAX blocks -- requires just one modulo operation */
      while (len >= NMAX) {
          len -= NMAX;
-#if OPTIMAL_CMP >= 64
          n = NMAX;
+
          do {
              size_t chunk = MIN(ALIGN_DOWN(n, 8), ADLER32_SWAR_MAX_BYTES);
              adler32_swar(&adler, NULL, buf, chunk, &sum2, 0);
              buf += chunk;
              n -= chunk;
          } while (n >= 8);
-#else
-        n = NMAX / 8;
-        do {
-            ADLER_DO8(adler, sum2, buf, 0);
-            buf += 8;
-        } while (--n);
-#endif
+
          adler %= BASE;
          sum2 %= BASE;
      }
author	Nathan Moinvaziri <nathan@nathanm.com>
	Thu, 19 Mar 2026 19:28:22 +0000 (12:28 -0700)
committer	Hans Kristian Rosbach <hk-github@circlestorm.org>
	Fri, 1 May 2026 12:34:00 +0000 (14:34 +0200)
adler32_p.h		patch \| blob \| blame \| history
arch/generic/adler32_c.c		patch \| blob \| blame \| history