/*
* CRC constants generated by:
*
- * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
+ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320,crc32_lsb_0x82f63b78,crc64_msb_0x42f0e1eba9ea3693,crc64_lsb_0x9a6c9329ac4bc9b5
*
* Do not edit manually.
*/
},
};
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 +
+ * x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ */
+static const struct {
+ u64 fold_across_2048_bits_consts[2];
+ u64 fold_across_1024_bits_consts[2];
+ u64 fold_across_512_bits_consts[2];
+ u64 fold_across_256_bits_consts[2];
+ u64 fold_across_128_bits_consts[2];
+ u8 shuf_table[48];
+ u64 barrett_reduction_consts[2];
+} crc32_lsb_0x82f63b78_consts ____cacheline_aligned __maybe_unused = {
+ .fold_across_2048_bits_consts = {
+ 0x00000000dcb17aa4, /* HI64_TERMS: (x^2079 mod G) * x^32 */
+ 0x00000000b9e02b86, /* LO64_TERMS: (x^2015 mod G) * x^32 */
+ },
+ .fold_across_1024_bits_consts = {
+ 0x000000006992cea2, /* HI64_TERMS: (x^1055 mod G) * x^32 */
+ 0x000000000d3b6092, /* LO64_TERMS: (x^991 mod G) * x^32 */
+ },
+ .fold_across_512_bits_consts = {
+ 0x00000000740eef02, /* HI64_TERMS: (x^543 mod G) * x^32 */
+ 0x000000009e4addf8, /* LO64_TERMS: (x^479 mod G) * x^32 */
+ },
+ .fold_across_256_bits_consts = {
+ 0x000000003da6d0cb, /* HI64_TERMS: (x^287 mod G) * x^32 */
+ 0x00000000ba4fc28e, /* LO64_TERMS: (x^223 mod G) * x^32 */
+ },
+ .fold_across_128_bits_consts = {
+ 0x00000000f20c0dfe, /* HI64_TERMS: (x^159 mod G) * x^32 */
+ 0x00000000493c7d27, /* LO64_TERMS: (x^95 mod G) * x^32 */
+ },
+ .shuf_table = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ },
+ .barrett_reduction_consts = {
+ 0x4869ec38dea713f1, /* HI64_TERMS: floor(x^95 / G) */
+ 0x0000000105ec76f0, /* LO64_TERMS: (G - x^32) * x^31 */
+ },
+};
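For orientation, the 0x82f63b78 in the identifier above is the bit-reversed (lsb-first) form of the CRC-32C generator listed in the comment, whose msb-first form is 0x1EDC6F41. The following standalone userspace sketch is not part of the generated file and its names are invented for illustration: it rebuilds the reversed polynomial from that exponent list and runs the plain bitwise lsb-first CRC that the folding constants above are meant to accelerate.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Exponents of G(x) from the comment above, excluding the implicit x^32 term. */
static const int g_exponents[] = {
	28, 27, 26, 25, 23, 22, 20, 19, 18, 14, 13, 11, 10, 9, 8, 6, 0,
};

/* Bitwise, lsb-first CRC-32C update over a byte buffer (no hardware help). */
static uint32_t crc32c_bitwise(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78 : 0);
	}
	return crc;
}

int main(void)
{
	uint32_t poly_msb = 0, poly_lsb = 0;

	/* Pack the exponent list into the msb-first form, then bit-reverse it. */
	for (size_t i = 0; i < sizeof(g_exponents) / sizeof(g_exponents[0]); i++)
		poly_msb |= 1U << g_exponents[i];
	for (int i = 0; i < 32; i++)
		poly_lsb |= ((poly_msb >> i) & 1) << (31 - i);

	printf("msb-first G = 0x%08x, lsb-first G = 0x%08x\n", poly_msb, poly_lsb);

	/*
	 * Conventional CRC-32C: init ~0, final inversion.  The standard check
	 * string "123456789" is expected to give the catalogued CRC-32C check
	 * value 0xe3069283.
	 */
	const char *check = "123456789";
	uint32_t crc = ~crc32c_bitwise(~0U, (const uint8_t *)check, strlen(check));
	printf("crc32c(\"123456789\") = 0x%08x\n", crc);
	return 0;
}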
+
/*
* CRC folding constants generated for most-significant-bit-first CRC-64 using
* G(x) = x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
+ /*
+ * Long length, the vector registers are usable, and the CPU is
+ * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
+ * It is worthwhile to divide the data into multiple streams,
+ * CRC them independently, and combine them using PCLMULQDQ.
+ * crc32c_x86_3way() does this using 3 streams, which is the
+ * most that x86_64 CPUs have traditionally been capable of.
+ *
+ * However, due to improved VPCLMULQDQ performance on newer
+ * CPUs, use crc32_lsb_vpclmul_avx512() instead of
+ * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
+ * "good" implementation of AVX-512.
+ *
+ * Future work: the optimal strategy on Zen 3--5 is actually to
+ * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
+ * different numbers of streams and vector lengths are optimal
+ * on each CPU microarchitecture, making it challenging to take
+ * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
+ * major upgrade.) For now, just choose between
+ * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
+ * is needed anyway for crc32_le(), so we just reuse it here.
+ */
kernel_fpu_begin();
- crc = crc32c_x86_3way(crc, p, len);
+ if (static_branch_likely(&have_vpclmul_avx512))
+ crc = crc32_lsb_vpclmul_avx512(crc, p, len,
+ crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
+ else
+ crc = crc32c_x86_3way(crc, p, len);
kernel_fpu_end();
return crc;
}
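To illustrate the stream combination that the comment in this branch relies on, here is a plain-C sketch (invented names, not the kernel's implementation): the raw, un-inverted CRC is linear over GF(2), so the CRC of A||B equals the CRC of A extended by len(B) zero bytes, XORed with the CRC of B computed from a zero initial value. crc32c_x86_3way() and the VPCLMULQDQ code obtain that zero-extension in constant time by multiplying by x^(8*len) mod G(x) with carry-less multiplies; the sketch simply feeds literal zero bytes to show the identity.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Raw lsb-first CRC-32C update: no initial or final inversion. */
static uint32_t crc32c_raw(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78 : 0);
	}
	return crc;
}

/*
 * Combine the raw CRC of chunk A with the raw CRC of the following chunk B
 * (computed with a zero initial value) into the raw CRC of A||B.  Feeding
 * len_b zero bytes takes O(len_b) time; the SIMD code replaces this with a
 * single multiplication by x^(8*len_b) mod G(x).
 */
static uint32_t crc32c_raw_combine(uint32_t crc_a, uint32_t crc_b, size_t len_b)
{
	const uint8_t zero = 0;

	while (len_b--)
		crc_a = crc32c_raw(crc_a, &zero, 1);
	return crc_a ^ crc_b;
}

int main(void)
{
	const char *msg = "The quick brown fox jumps over the lazy dog";
	size_t len = strlen(msg), split = 17;
	uint32_t whole = crc32c_raw(0, (const uint8_t *)msg, len);
	uint32_t part_a = crc32c_raw(0, (const uint8_t *)msg, split);
	uint32_t part_b = crc32c_raw(0, (const uint8_t *)msg + split, len - split);

	printf("whole=%08x combined=%08x\n",
	       whole, crc32c_raw_combine(part_a, part_b, len - split));
	return 0;
}

The two printed values should match for any split point, which is the property the multi-stream code exploits.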
+ /*
+ * Short length, XMM registers unusable, or the CPU is 32-bit; but the
+ * CPU supports CRC32 instructions. Just issue a single stream of CRC32
+ * instructions inline. While this doesn't use the CPU's CRC32
+ * throughput very well, it avoids the need to combine streams, which
+ * would not pay off for the short lengths and no-SIMD cases handled here.
+ */
+
for (num_longs = len / sizeof(unsigned long);
num_longs != 0; num_longs--, p += sizeof(unsigned long))
asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
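For comparison outside the kernel, the loop above issues one crc32q (crc32l on 32-bit) per unsigned long through the CRC32_INST inline asm. A rough userspace equivalent using the SSE4.2 intrinsics might look like the sketch below; crc32c_sse42_1way is an invented name, and this is not the kernel's code. Compile with -msse4.2 on x86_64.

#include <nmmintrin.h>	/* SSE4.2 _mm_crc32_* intrinsics */
#include <stdint.h>
#include <string.h>

/* Single-stream CRC-32C using the crc32 instruction, 8 bytes at a time. */
static uint32_t crc32c_sse42_1way(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;
	uint64_t c = crc;

	while (len >= sizeof(uint64_t)) {
		uint64_t v;

		memcpy(&v, p, sizeof(v));	/* avoid unaligned-access UB */
		c = _mm_crc32_u64(c, v);
		p += sizeof(v);
		len -= sizeof(v);
	}
	while (len--)
		c = _mm_crc32_u8((uint32_t)c, *p++);
	return (uint32_t)c;
}

Like the kernel loop, this keeps a single crc32 dependency chain in flight, which is why the comment above notes that it does not use the CPU's full CRC32 throughput.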
if (have_avx512()) {
static_call_update(crc32_lsb_pclmul,
crc32_lsb_vpclmul_avx512);
+ static_branch_enable(&have_vpclmul_avx512);
} else {
static_call_update(crc32_lsb_pclmul,
crc32_lsb_vpclmul_avx2);