]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
lib/crc: arm: Enable arm64's NEON intrinsics implementation of crc64
author Ard Biesheuvel <ardb@kernel.org>
Wed, 22 Apr 2026 17:17:01 +0000 (19:17 +0200)
committer Eric Biggers <ebiggers@kernel.org>
Thu, 28 May 2026 20:14:23 +0000 (13:14 -0700)
Tweak the NEON intrinsics crc64 code written for arm64 so it can be
built for 32-bit ARM as well. The only workaround needed is to provide
alternatives for vmull_p64() and vmull_high_p64() on Clang, which only
defines those when building for the AArch64 or arm64ec ISA. Use the same
helpers for GCC too, to avoid doubling the size of the test/validation
matrix.

KUnit benchmark results (Cortex-A53 @ 1 GHz)

Before:

   # crc64_nvme_benchmark: len=1: 35 MB/s
   # crc64_nvme_benchmark: len=16: 78 MB/s
   # crc64_nvme_benchmark: len=64: 87 MB/s
   # crc64_nvme_benchmark: len=127: 88 MB/s
   # crc64_nvme_benchmark: len=128: 88 MB/s
   # crc64_nvme_benchmark: len=200: 89 MB/s
   # crc64_nvme_benchmark: len=256: 89 MB/s
   # crc64_nvme_benchmark: len=511: 89 MB/s
   # crc64_nvme_benchmark: len=512: 89 MB/s
   # crc64_nvme_benchmark: len=1024: 90 MB/s
   # crc64_nvme_benchmark: len=3173: 90 MB/s
   # crc64_nvme_benchmark: len=4096: 90 MB/s
   # crc64_nvme_benchmark: len=16384: 90 MB/s

After:

   # crc64_nvme_benchmark: len=1: 32 MB/s
   # crc64_nvme_benchmark: len=16: 76 MB/s
   # crc64_nvme_benchmark: len=64: 71 MB/s
   # crc64_nvme_benchmark: len=127: 88 MB/s
   # crc64_nvme_benchmark: len=128: 618 MB/s
   # crc64_nvme_benchmark: len=200: 542 MB/s
   # crc64_nvme_benchmark: len=256: 920 MB/s
   # crc64_nvme_benchmark: len=511: 836 MB/s
   # crc64_nvme_benchmark: len=512: 1261 MB/s
   # crc64_nvme_benchmark: len=1024: 1531 MB/s
   # crc64_nvme_benchmark: len=3173: 1731 MB/s
   # crc64_nvme_benchmark: len=4096: 1851 MB/s
   # crc64_nvme_benchmark: len=16384: 1858 MB/s

Don't bother with big-endian, as it doesn't work correctly on Clang, and
is barely used these days.

Note that ARM disables preemption and softirq processing when using
kernel mode SIMD, so take care not to hog the CPU for too long.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://patch.msgid.link/20260422171655.3437334-15-ardb+git@google.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
lib/crc/Kconfig
lib/crc/Makefile
lib/crc/arm/crc64-neon.h [new file with mode: 0644]
lib/crc/arm/crc64.h [new file with mode: 0644]

index f47bb4c706fb7880b2586d4a63fdceea5d833306..927fc6a6b2b9d32fc78bf6f21d113c78c53894ab 100644 (file)
@@ -82,6 +82,7 @@ config CRC64
 config CRC64_ARCH
        bool
        depends on CRC64 && CRC_OPTIMIZATIONS
+       default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
        default y if ARM64
        default y if RISCV && RISCV_ISA_ZBC && 64BIT
        default y if X86_64
index 193257ae466fc15f54d564ff9fe753f4eb311eec..386e9c1752632015fb22357c00828774e338f555 100644 (file)
@@ -39,8 +39,11 @@ crc64-y := crc64-main.o
 ifeq ($(CONFIG_CRC64_ARCH),y)
 CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
 
+crc64-cflags-$(CONFIG_ARM) += -march=armv8-a -mfpu=crypto-neon-fp-armv8
+crc64-cflags-$(CONFIG_ARM64) += -march=armv8-a+crypto
 CFLAGS_REMOVE_crc64-neon.o += $(CC_FLAGS_NO_FPU)
-CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) -march=armv8-a+crypto
+CFLAGS_crc64-neon.o += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH) $(crc64-cflags-y)
+crc64-$(CONFIG_ARM) += crc64-neon.o
 crc64-$(CONFIG_ARM64) += crc64-neon.o
 
 crc64-$(CONFIG_RISCV) += riscv/crc64_lsb.o riscv/crc64_msb.o
diff --git a/lib/crc/arm/crc64-neon.h b/lib/crc/arm/crc64-neon.h
new file mode 100644 (file)
index 0000000..645f553
--- /dev/null
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Polynomial multiply (VMULL.P64) of lane 0 of @a by lane 0 of @b.
+ * The full-width product is returned as a uint64x2_t.
+ */
+static inline uint64x2_t pmull64(uint64x2_t a, uint64x2_t b)
+{
+       const uint64_t lhs = vgetq_lane_u64(a, 0);
+       const uint64_t rhs = vgetq_lane_u64(b, 0);
+       uint64x2_t prod;
+
+       /* %q0 names the Q register of the result, %P1/%P2 the D inputs. */
+       asm("vmull.p64  %q0, %P1, %P2"
+           : "=w"(prod)
+           : "w"(lhs), "w"(rhs));
+
+       return prod;
+}
+
+/*
+ * Polynomial multiply (VMULL.P64) of lane 1 of @a by lane 1 of @b.
+ * The full-width product is returned as a uint64x2_t.
+ */
+static inline uint64x2_t pmull64_high(uint64x2_t a, uint64x2_t b)
+{
+       const uint64_t lhs = vgetq_lane_u64(a, 1);
+       const uint64_t rhs = vgetq_lane_u64(b, 1);
+       uint64x2_t prod;
+
+       /* %q0 names the Q register of the result, %P1/%P2 the D inputs. */
+       asm("vmull.p64  %q0, %P1, %P2"
+           : "=w"(prod)
+           : "w"(lhs), "w"(rhs));
+
+       return prod;
+}
+
+/*
+ * Polynomial multiply (VMULL.P64) of lane 1 of @a by lane 0 of @b.
+ * The full-width product is returned as a uint64x2_t.
+ */
+static inline uint64x2_t pmull64_hi_lo(uint64x2_t a, uint64x2_t b)
+{
+       const uint64_t lhs = vgetq_lane_u64(a, 1);
+       const uint64_t rhs = vgetq_lane_u64(b, 0);
+       uint64x2_t prod;
+
+       /* %q0 names the Q register of the result, %P1/%P2 the D inputs. */
+       asm("vmull.p64  %q0, %P1, %P2"
+           : "=w"(prod)
+           : "w"(lhs), "w"(rhs));
+
+       return prod;
+}
diff --git a/lib/crc/arm/crc64.h b/lib/crc/arm/crc64.h
new file mode 100644 (file)
index 0000000..de27428
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * CRC64 using ARM PMULL instructions
+ */
+
+#include <asm/simd.h>
+
+/*
+ * Static key tested by crc64_nvme_arch() before taking the NEON path;
+ * crc64_mod_init_arch() enables it when elf_hwcap2 has HWCAP2_PMULL set.
+ */
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+/* Declared here only; the definition is not part of this header. */
+u64 crc64_nvme_neon(u64 crc, const u8 *p, size_t len);
+
+/* crc64_be_arch is simply an alias for crc64_be_generic. */
+#define crc64_be_arch crc64_be_generic
+
+/*
+ * Update @crc over @len bytes at @p.
+ *
+ * Inputs of at least 128 bytes are passed to crc64_nvme_neon() when the
+ * have_pmull key is enabled and may_use_simd() allows it.  The NEON routine
+ * is given multiples of 16 bytes, capped at SZ_4K per scoped_ksimd()
+ * section: ARM disables preemption and softirq processing while kernel mode
+ * SIMD is in use, so each section is kept short.  Any remaining bytes are
+ * finished by crc64_nvme_generic().
+ */
+static inline u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len)
+{
+       if (len < 128 || !static_branch_likely(&have_pmull) ||
+           unlikely(!may_use_simd()))
+               return crc64_nvme_generic(crc, p, len);
+
+       /* len >= 128 is guaranteed on entry, so at least one pass runs. */
+       while (len >= 128) {
+               size_t nbytes = len & ~15;
+
+               if (nbytes > SZ_4K)
+                       nbytes = SZ_4K;
+
+               scoped_ksimd() {
+                       crc = crc64_nvme_neon(crc, p, nbytes);
+               }
+
+               len -= nbytes;
+               p += nbytes;
+       }
+
+       return crc64_nvme_generic(crc, p, len);
+}
+
+#define crc64_mod_init_arch crc64_mod_init_arch
+/* Turn on the have_pmull static key when elf_hwcap2 reports HWCAP2_PMULL. */
+static void crc64_mod_init_arch(void)
+{
+       if (!(elf_hwcap2 & HWCAP2_PMULL))
+               return;
+
+       static_branch_enable(&have_pmull);
+}