]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
xor/arm: Replace vectorized implementation with arm64's intrinsics
authorArd Biesheuvel <ardb@kernel.org>
Wed, 22 Apr 2026 17:16:58 +0000 (19:16 +0200)
committerEric Biggers <ebiggers@kernel.org>
Thu, 28 May 2026 20:14:19 +0000 (13:14 -0700)
Drop the XOR implementation generated by the vectorizer: this has always
been a bit of a hack, and now that arm64 has an intrinsics version that
works on ARM too, let's use that instead.

So copy the part of the arm64 code that can be shared (so not the EOR3
version). The arm64 code will be updated in a subsequent patch to share
this implementation.

Performance (QEMU mach-virt VM running on Synquacer [Cortex-A53 @ 1 GHz]

Before:

[    3.519687] xor: measuring software checksum speed
[    3.521725]    neon            :  1660 MB/sec
[    3.524733]    32regs          :  1105 MB/sec
[    3.527751]    8regs           :  1098 MB/sec
[    3.529911]    arm4regs        :  1540 MB/sec

After:

[    3.517654] xor: measuring software checksum speed
[    3.519454]    neon            :  1896 MB/sec
[    3.522499]    32regs          :  1090 MB/sec
[    3.525560]    8regs           :  1083 MB/sec
[    3.527700]    arm4regs        :  1556 MB/sec

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20260422171655.3437334-12-ardb+git@google.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
lib/raid/xor/Makefile
lib/raid/xor/arm/xor-neon.c [deleted file]
lib/raid/xor/arm/xor-neon.h [new file with mode: 0644]
lib/raid/xor/arm/xor_arch.h
lib/raid/xor/xor-8regs.c
lib/raid/xor/xor-neon.c [new file with mode: 0644]

index 4d633dfd5b90cfbad4b196074f67e988f1e88d06..d78400f2427ab436cf82c04d47381cd8e6f2b353 100644 (file)
@@ -17,7 +17,7 @@ endif
 xor-$(CONFIG_ALPHA)            += alpha/xor.o
 xor-$(CONFIG_ARM)              += arm/xor.o
 ifeq ($(CONFIG_ARM),y)
-xor-$(CONFIG_KERNEL_MODE_NEON) += arm/xor-neon.o arm/xor-neon-glue.o
+xor-$(CONFIG_KERNEL_MODE_NEON) += xor-neon.o arm/xor-neon-glue.o
 endif
 xor-$(CONFIG_ARM64)            += arm64/xor-neon.o arm64/xor-neon-glue.o
 xor-$(CONFIG_CPU_HAS_LSX)      += loongarch/xor_simd.o
@@ -31,8 +31,8 @@ xor-$(CONFIG_X86_32)          += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
 xor-$(CONFIG_X86_64)           += x86/xor-avx.o x86/xor-sse.o
 obj-y                          += tests/
 
-CFLAGS_arm/xor-neon.o          += $(CC_FLAGS_FPU)
-CFLAGS_REMOVE_arm/xor-neon.o   += $(CC_FLAGS_NO_FPU)
+CFLAGS_xor-neon.o              += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH)
+CFLAGS_REMOVE_xor-neon.o       += $(CC_FLAGS_NO_FPU)
 
 CFLAGS_arm64/xor-neon.o                += $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c
deleted file mode 100644 (file)
index 23147e3..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include "xor_impl.h"
-#include "xor_arch.h"
-
-#ifndef __ARM_NEON__
-#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
-#endif
-
-/*
- * Pull in the reference implementations while instructing GCC (through
- * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
- * NEON instructions. Clang does this by default at O2 so no pragma is
- * needed.
- */
-#ifdef CONFIG_CC_IS_GCC
-#pragma GCC optimize "tree-vectorize"
-#endif
-
-#define NO_TEMPLATE
-#include "../xor-8regs.c"
-
-__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
diff --git a/lib/raid/xor/arm/xor-neon.h b/lib/raid/xor/arm/xor-neon.h
new file mode 100644 (file)
index 0000000..406e035
--- /dev/null
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+extern struct xor_block_template xor_block_arm4regs;
+extern struct xor_block_template xor_block_neon;
+
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
index 775ff835df656ef1f879d8628a70bf999a98aff4..f1ddb64fe62abe8e37153db410f1b19118457d8f 100644 (file)
@@ -3,12 +3,7 @@
  *  Copyright (C) 2001 Russell King
  */
 #include <asm/neon.h>
-
-extern struct xor_block_template xor_block_arm4regs;
-extern struct xor_block_template xor_block_neon;
-
-void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
-               unsigned int bytes);
+#include "xor-neon.h"
 
 static __always_inline void __init arch_xor_init(void)
 {
index 1edaed8acffe60b8c6343696b915def31d6d32dd..46b3c8bdc27f33b32e0efd2f20f14df9b3f4d470 100644 (file)
@@ -93,11 +93,9 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-#ifndef NO_TEMPLATE
 DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
 
 struct xor_block_template xor_block_8regs = {
        .name           = "8regs",
        .xor_gen        = xor_gen_8regs,
 };
-#endif /* NO_TEMPLATE */
diff --git a/lib/raid/xor/xor-neon.c b/lib/raid/xor/xor-neon.c
new file mode 100644 (file)
index 0000000..a3e2b4a
--- /dev/null
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ */
+
+#include "xor_impl.h"
+#include "xor-neon.h"
+
+#include <asm/neon-intrinsics.h>
+
+static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3,
+               const unsigned long * __restrict p4)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* p1 ^= p4 */
+               v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3,
+               const unsigned long * __restrict p4,
+               const unsigned long * __restrict p5)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+       uint64_t *dp5 = (uint64_t *)p5;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* p1 ^= p4 */
+               v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
+
+               /* p1 ^= p5 */
+               v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+               dp5 += 8;
+       } while (--lines > 0);
+}
+
+__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
+               __xor_neon_5);