From: Eric Biggers
Date: Tue, 4 Mar 2025 21:32:16 +0000 (-0800)
Subject: x86/crc32: optimize tail handling for crc32c short inputs
X-Git-Tag: v6.15-rc1~184^2~9
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5aebe00b2f7215d996926517cc9710a1d2d8b7f9;p=thirdparty%2Fkernel%2Flinux.git

x86/crc32: optimize tail handling for crc32c short inputs

For handling the 0 <= len < sizeof(unsigned long) bytes left at the end,
do a 4-2-1 step-down instead of a byte-at-a-time loop. This allows
taking advantage of wider CRC instructions. Note that crc32c-3way.S
already uses this same optimization too.

crc_kunit shows an improvement of about 25% for len=127.

Suggested-by: "H. Peter Anvin"
Acked-by: Uros Bizjak
Link: https://lore.kernel.org/r/20250304213216.108925-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---

diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c
index 4b4721176799a..e3f93b17ac3f1 100644
--- a/arch/x86/lib/crc32-glue.c
+++ b/arch/x86/lib/crc32-glue.c
@@ -57,7 +57,15 @@ u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
 		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));
 
-	for (len %= sizeof(unsigned long); len; len--, p++)
+	if (sizeof(unsigned long) > 4 && (len & 4)) {
+		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
+		p += 4;
+	}
+	if (len & 2) {
+		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
+		p += 2;
+	}
+	if (len & 1)
 		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
 
 	return crc;