git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Fix initial crc value loading in crc32_(v)pclmulqdq
author Nathan Moinvaziri <nathan@nathanm.com>
Sun, 14 Dec 2025 08:57:37 +0000 (00:57 -0800)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 28 Dec 2025 13:07:03 +0000 (14:07 +0100)
In the main function, the alignment-diff processing got in the way of XORing
in the initial CRC, because it does not guarantee that at least 16 bytes have
been loaded.

In fold_16, the src data was modified by the initial CRC XOR before being stored to dst.
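
A minimal illustrative sketch (not taken from the commit; the helper name
load_block_copy is invented) of the ordering this change establishes: the
untouched source bytes are stored to dst before the initial CRC is XORed into
the working register, and the XOR_INITIAL128-style macro zeroes its crc
argument so the mix-in happens at most once.

#include <emmintrin.h>
#include <stdint.h>

/* Fold the pending initial CRC into a working register exactly once:
 * after the first use, crc is set to 0 so later invocations are no-ops. */
#define XOR_INITIAL128(where, crc) \
    do { \
        if ((crc) != 0) { \
            (where) = _mm_xor_si128((where), _mm_cvtsi32_si128((int32_t)(crc))); \
            (crc) = 0; \
        } \
    } while (0)

/* Hypothetical helper: load one 16-byte block, copy it to dst unmodified
 * when COPY is set, and only then mix in the initial CRC. */
static inline __m128i load_block_copy(uint8_t *dst, const uint8_t *src,
                                      uint32_t *crc, const int COPY) {
    __m128i xmm_t0 = _mm_loadu_si128((const __m128i *)src);

    if (COPY) {
        /* Store the original bytes before any CRC mixing touches them. */
        _mm_storeu_si128((__m128i *)dst, xmm_t0);
    }

    /* The initial CRC only alters the CRC computation, never the copied data. */
    XOR_INITIAL128(xmm_t0, *crc);
    return xmm_t0;
}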

arch/x86/crc32_pclmulqdq.c
arch/x86/crc32_pclmulqdq_tpl.h
arch/x86/crc32_vpclmulqdq.c

diff --git a/arch/x86/crc32_pclmulqdq.c b/arch/x86/crc32_pclmulqdq.c
index 9cff7a8c50ff844164fb086654909035659002ae..c8be1b43bad1e73be3d8bcf546fe474fafe30268 100644
 #include "crc32_pclmulqdq_tpl.h"
 
 Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
-    /* For lens smaller than ~12, crc32_small method is faster.
-     * But there are also minimum requirements for the pclmul functions due to alignment */
-    if (len < 16)
-        return crc32_small(crc, buf, len);
-
     return crc32_copy_impl(crc, NULL, buf, len, 0);
 }
 
 Z_INTERNAL uint32_t crc32_copy_pclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
-    /* For lens smaller than ~12, crc32_small method is faster.
-     * But there are also minimum requirements for the pclmul functions due to alignment */
-    if (len < 16)
-        return crc32_small_copy(crc, dst, src, len);
-
     return crc32_copy_impl(crc, dst, src, len, 1);
 }
 #endif
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index b5aa8c9d14d57863ffb656f1c1ec7ba8688140b6..ffd70a2484a9e0579a11df222e284e68b5af918b 100644
@@ -45,8 +45,8 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-#define ONCE(op)                if (first) { first = 0; op; }
-#define XOR_INITIAL128(where)   ONCE(where = _mm_xor_si128(where, xmm_initial))
+#define XOR_INITIAL128(where, crc)  if (crc != 0) { where = _mm_xor_si128(where, _mm_cvtsi32_si128(crc)); crc = 0; }
+#define XOR_INITIAL512(where, crc)  if (crc != 0) { where = _mm512_xor_si512(where, _mm512_zextsi128_si512(_mm_cvtsi32_si128(crc))); crc = 0; }
 
 static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
     const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
@@ -219,12 +219,8 @@ static void fold_12(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m
 }
 
 #ifdef X86_VPCLMULQDQ
-
-#define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
-
 static size_t fold_16(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst,
-    const uint8_t *src, size_t len, __m128i init_crc, int32_t first, const int COPY) {
-    __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
+    const uint8_t *src, size_t len, uint32_t *crc, const int COPY) {
     __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
     __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
     __m512i z0, z1, z2, z3;
@@ -237,16 +233,21 @@ static size_t fold_16(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, _
     // zmm register init
     zmm_crc0 = _mm512_setzero_si512();
     zmm_t0 = _mm512_loadu_si512((__m512i *)src);
-    if (!COPY) {
-        XOR_INITIAL512(zmm_t0);
-    }
     zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
     zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
     zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
 
-    /* already have intermediate CRC in xmm registers
-        * fold4 with 4 xmm_crc to get zmm_crc0
-    */
+    if (COPY) {
+        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+        _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+        _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+        _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+        dst += 256;
+    }
+
+    XOR_INITIAL512(zmm_t0, *crc);
+
+    // already have intermediate CRC in xmm registers fold4 with 4 xmm_crc to get zmm_crc0
     zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
     zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
     zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
@@ -255,13 +256,6 @@ static size_t fold_16(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, _
     zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
     zmm_crc0 = _mm512_ternarylogic_epi32(zmm_crc0, z0, zmm_t0, 0x96);
 
-    if (COPY) {
-        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
-        _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
-        _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
-        _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
-        dst += 256;
-    }
     len -= 256;
     src += 256;
 
@@ -383,23 +377,14 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
     *xmm_crc3 = _mm_castps_si128(ps_res);
 }
 
-static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len) {
-    uint32_t c = (~crc) & 0xffffffff;
-
-    while (len) {
-        len--;
-        CRC_DO1;
-    }
-
-    return c ^ 0xffffffff;
-}
-
-static inline uint32_t crc32_small_copy(uint32_t crc, uint8_t *dst, const uint8_t *buf, size_t len) {
+static inline uint32_t crc32_copy_small(uint32_t crc, uint8_t *dst, const uint8_t *buf, size_t len, const int COPY) {
     uint32_t c = (~crc) & 0xffffffff;
 
     while (len) {
         len--;
-        *dst++ = *buf;
+        if (COPY) {
+            *dst++ = *buf;
+        }
         CRC_DO1;
     }
 
@@ -447,6 +432,7 @@ static inline uint32_t fold_final(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i
     *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, crc_fold, 0x10);
     *xmm_crc3 = _mm_xor_si128(*xmm_crc3, *xmm_crc0);
     *xmm_crc3 = _mm_and_si128(*xmm_crc3, xmm_mask2);
+
     /*
      * k7
      */
@@ -469,88 +455,43 @@ static inline uint32_t fold_final(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i
 }
 
 static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
-    unsigned long algn_diff;
-    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
-    __m128i xmm_crc_part = _mm_setzero_si128();
-    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
-    __m128i xmm_crc1 = _mm_setzero_si128();
-    __m128i xmm_crc2 = _mm_setzero_si128();
-    __m128i xmm_crc3 = _mm_setzero_si128();
-    char ALIGNED_(16) partial_buf[16] = { 0 };
-    __m128i xmm_initial = _mm_cvtsi32_si128(crc);
-    int32_t first = crc != 0;
-
-    if (!COPY) {
-        /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
-         * for the aligning load that occurs.  If there's an initial CRC, to carry it forward through
-         * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
-         * up to 15 bytes + one full vector load. */
-        Assert(len >= 16 || first == 0, "Insufficient data for initial CRC");
+    size_t copy_len = len;
+    if (len >= 16) {
+        /* Calculate 16-byte alignment offset */
+        unsigned algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
+
+        /* If total length is less than (alignment bytes + 16), use the faster small method.
+         * Handles both initially small buffers and cases where alignment would leave < 16 bytes */
+        copy_len = len < algn_diff + 16 ? len : algn_diff;
     }
 
-    if (len < 16) {
-        if (len == 0)
-            return crc;
-
-        memcpy(partial_buf, src, len);
-        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
+    if (copy_len > 0) {
+        crc = crc32_copy_small(crc, dst, src, copy_len, COPY);
+        src += copy_len;
+        len -= copy_len;
         if (COPY) {
-            memcpy(dst, partial_buf, len);
+            dst += copy_len;
         }
-        goto partial;
     }
 
-    algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
-    if (algn_diff) {
-        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
-        if (COPY) {
-            _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
-            dst += algn_diff;
-        } else {
-            XOR_INITIAL128(xmm_crc_part);
-
-            if (algn_diff < 4 && crc != 0) {
-                xmm_t0 = xmm_crc_part;
-                if (len >= 32) {
-                    xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
-                    fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-                    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
-                } else {
-                    memcpy(partial_buf, src + 16, len - 16);
-                    xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
-                    fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-                    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
-                    src += 16;
-                    len -= 16;
-                    if (COPY) {
-                        dst -= algn_diff;
-                    }
-                    goto partial;
-                }
-
-                src += 16;
-                len -= 16;
-            }
-        }
-
-        partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+    if (len == 0)
+        return crc;
 
-        src += algn_diff;
-        len -= algn_diff;
-    }
+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    __m128i xmm_crc_part = _mm_setzero_si128();
+    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+    __m128i xmm_crc1 = _mm_setzero_si128();
+    __m128i xmm_crc2 = _mm_setzero_si128();
+    __m128i xmm_crc3 = _mm_setzero_si128();
 
 #ifdef X86_VPCLMULQDQ
     if (len >= 256) {
-        size_t n;
+        size_t n = fold_16(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len, &crc, COPY);
+        len -= n;
+        src += n;
         if (COPY) {
-            n = fold_16(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len, xmm_initial, first, 1);
             dst += n;
-        } else {
-            n = fold_16(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len, xmm_initial, first, 0);
-            first = 0;
         }
-        len -= n;
-        src += n;
     }
 #endif
 
@@ -582,9 +523,9 @@ static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t
             _mm_storeu_si128((__m128i *)dst + 6, chorba2);
             _mm_storeu_si128((__m128i *)dst + 7, chorba1);
             dst += 16*8;
-        } else {
-            XOR_INITIAL128(chorba8);
         }
+        XOR_INITIAL128(chorba8, crc);
+
         chorba2 = _mm_xor_si128(chorba2, chorba8);
         chorba1 = _mm_xor_si128(chorba1, chorba7);
         src += 16*8;
@@ -793,9 +734,8 @@ static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t
             _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
             _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
             dst += 64;
-        } else {
-            XOR_INITIAL128(xmm_t0);
         }
+        XOR_INITIAL128(xmm_t0, crc);
 
         xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
@@ -818,9 +758,8 @@ static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t
             _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
             _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
             dst += 48;
-        } else {
-            XOR_INITIAL128(xmm_t0);
         }
+        XOR_INITIAL128(xmm_t0, crc);
         fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
@@ -836,9 +775,8 @@ static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t
             _mm_storeu_si128((__m128i *)dst, xmm_t0);
             _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
             dst += 32;
-        } else {
-            XOR_INITIAL128(xmm_t0);
         }
+        XOR_INITIAL128(xmm_t0, crc);
         fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
@@ -850,18 +788,17 @@ static inline uint32_t crc32_copy_impl(uint32_t crc, uint8_t *dst, const uint8_t
         if (COPY) {
             _mm_storeu_si128((__m128i *)dst, xmm_t0);
             dst += 16;
-        } else {
-            XOR_INITIAL128(xmm_t0);
         }
+        XOR_INITIAL128(xmm_t0, crc);
         fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
     }
 
-partial:
     if (len) {
         memcpy(&xmm_crc_part, src, len);
         if (COPY) {
+            uint8_t ALIGNED_(16) partial_buf[16] = { 0 };
             _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part);
             memcpy(dst, partial_buf, len);
         }
diff --git a/arch/x86/crc32_vpclmulqdq.c b/arch/x86/crc32_vpclmulqdq.c
index 7eabbdb4d5ba54a62501f30bce70488a6c552b23..793d8ab99af83457bbc72a9cec3ce05790fcac6f 100644
@@ -9,20 +9,10 @@
 #include "crc32_pclmulqdq_tpl.h"
 
 Z_INTERNAL uint32_t crc32_vpclmulqdq(uint32_t crc, const uint8_t *buf, size_t len) {
-    /* For lens smaller than ~12, crc32_small method is faster.
-     * But there are also minimum requirements for the pclmul functions due to alignment */
-    if (len < 16)
-        return crc32_small(crc, buf, len);
-
     return crc32_copy_impl(crc, NULL, buf, len, 0);
 }
 
 Z_INTERNAL uint32_t crc32_copy_vpclmulqdq(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len) {
-    /* For lens smaller than ~12, crc32_small method is faster.
-     * But there are also minimum requirements for the pclmul functions due to alignment */
-    if (len < 16)
-        return crc32_small_copy(crc, dst, src, len);
-
     return crc32_copy_impl(crc, dst, src, len, 1);
 }
 #endif