lib/crc: x86/crc32c: Enable VPCLMULQDQ optimization where beneficial

author Eric Biggers <ebiggers@kernel.org>

Sat, 19 Jul 2025 22:49:38 +0000 (15:49 -0700)

committer Eric Biggers <ebiggers@kernel.org>

Mon, 21 Jul 2025 03:52:34 +0000 (20:52 -0700)
author Eric Biggers <ebiggers@kernel.org>
Sat, 19 Jul 2025 22:49:38 +0000 (15:49 -0700)
committer Eric Biggers <ebiggers@kernel.org>
Mon, 21 Jul 2025 03:52:34 +0000 (20:52 -0700)
diff --git a/lib/crc/x86/crc-pclmul-consts.h b/lib/crc/x86/crc-pclmul-consts.h

index fcc63c0643330f5acb4f99a892cffa131075b06d..6ae94158fca287b8f590c4fa1becce7407b4317d 100644 (file)
--- a/lib/crc/x86/crc-pclmul-consts.h
+++ b/lib/crc/x86/crc-pclmul-consts.h
@@ -2,7 +2,7 @@
  /*
   * CRC constants generated by:
   *
- *     ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ *     ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
   *
   * Do not edit manually.
   */
@@ -98,6 +98,51 @@ static const struct {
         },
  };
  
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ *        x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+       u64 fold_across_2048_bits_consts[2];
+       u64 fold_across_1024_bits_consts[2];
+       u64 fold_across_512_bits_consts[2];
+       u64 fold_across_256_bits_consts[2];
+       u64 fold_across_128_bits_consts[2];
+       u8 shuf_table[48];
+       u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+       .fold_across_2048_bits_consts = {
+               0x00000000dcb17aa4,     /* HI64_TERMS: (x^2079 mod G) * x^32 */
+               0x00000000b9e02b86,     /* LO64_TERMS: (x^2015 mod G) * x^32 */
+       },
+       .fold_across_1024_bits_consts = {
+               0x000000006992cea2,     /* HI64_TERMS: (x^1055 mod G) * x^32 */
+               0x000000000d3b6092,     /* LO64_TERMS: (x^991 mod G) * x^32 */
+       },
+       .fold_across_512_bits_consts = {
+               0x00000000740eef02,     /* HI64_TERMS: (x^543 mod G) * x^32 */
+               0x000000009e4addf8,     /* LO64_TERMS: (x^479 mod G) * x^32 */
+       },
+       .fold_across_256_bits_consts = {
+               0x000000003da6d0cb,     /* HI64_TERMS: (x^287 mod G) * x^32 */
+               0x00000000ba4fc28e,     /* LO64_TERMS: (x^223 mod G) * x^32 */
+       },
+       .fold_across_128_bits_consts = {
+               0x00000000f20c0dfe,     /* HI64_TERMS: (x^159 mod G) * x^32 */
+               0x00000000493c7d27,     /* LO64_TERMS: (x^95 mod G) * x^32 */
+       },
+       .shuf_table = {
+               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+       },
+       .barrett_reduction_consts = {
+               0x4869ec38dea713f1,     /* HI64_TERMS: floor(x^95 / G) */
+               0x0000000105ec76f0,     /* LO64_TERMS: (G - x^32) * x^31 */
+       },
+};
+
  /*
   * CRC folding constants generated for most-significant-bit-first CRC-64 using
   * G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
diff --git a/lib/crc/x86/crc32.h b/lib/crc/x86/crc32.h

index ba4dacf23340c82c6eba2014a3457f866347ec30..cea2c96d08d09ec0374239df188fd888265bb4aa 100644 (file)
--- a/lib/crc/x86/crc32.h
+++ b/lib/crc/x86/crc32.h
@@ -11,6 +11,7 @@
  
  static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
  static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
  
  DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
  
@@ -44,12 +45,46 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
  
         if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
             static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+               /*
+                * Long length, the vector registers are usable, and the CPU is
+                * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+                * It is worthwhile to divide the data into multiple streams,
+                * CRC them independently, and combine them using PCLMULQDQ.
+                * crc32c_x86_3way() does this using 3 streams, which is the
+                * most that x86_64 CPUs have traditionally been capable of.
+                *
+                * However, due to improved VPCLMULQDQ performance on newer
+                * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+                * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+                * "good" implementation of AVX-512.
+                *
+                * Future work: the optimal strategy on Zen 3--5 is actually to
+                * use both crc32q and VPCLMULQDQ in parallel.  Unfortunately,
+                * different numbers of streams and vector lengths are optimal
+                * on each CPU microarchitecture, making it challenging to take
+                * advantage of this.  (Zen 5 even supports 7 parallel crc32q, a
+                * major upgrade.)  For now, just choose between
+                * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512().  The latter
+                * is needed anyway for crc32_le(), so we just reuse it here.
+                */
                 kernel_fpu_begin();
-               crc = crc32c_x86_3way(crc, p, len);
+               if (static_branch_likely(&have_vpclmul_avx512))
+                       crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+                                      crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+               else
+                       crc = crc32c_x86_3way(crc, p, len);
                 kernel_fpu_end();
                 return crc;
         }
  
+       /*
+        * Short length, vector registers unusable, no PCLMULQDQ, or the CPU is
+        * 32-bit; but the CPU supports CRC32 instructions.  Just issue a single
+        * stream of CRC32 instructions inline.  While this doesn't use the
+        * CPU's CRC32 throughput very well, it avoids the need to combine
+        * streams.  Stream combination would be inefficient here.
+        */
+
         for (num_longs = len / sizeof(unsigned long);
              num_longs != 0; num_longs--, p += sizeof(unsigned long))
                 asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
@@ -81,6 +116,7 @@ static inline void crc32_mod_init_arch(void)
                         if (have_avx512()) {
                                 static_call_update(crc32_lsb_pclmul,
                                                    crc32_lsb_vpclmul_avx512);
+                               static_branch_enable(&have_vpclmul_avx512);
                         } else {
                                 static_call_update(crc32_lsb_pclmul,
                                                    crc32_lsb_vpclmul_avx2);
author	Eric Biggers <ebiggers@kernel.org>
	Sat, 19 Jul 2025 22:49:38 +0000 (15:49 -0700)
committer	Eric Biggers <ebiggers@kernel.org>
	Mon, 21 Jul 2025 03:52:34 +0000 (20:52 -0700)
lib/crc/x86/crc-pclmul-consts.h		patch \| blob \| blame \| history
lib/crc/x86/crc32.h		patch \| blob \| blame \| history