xor/arm: Replace vectorized implementation with arm64's intrinsics

author Ard Biesheuvel <ardb@kernel.org>

Wed, 22 Apr 2026 17:16:58 +0000 (19:16 +0200)

committer Eric Biggers <ebiggers@kernel.org>

Thu, 28 May 2026 20:14:19 +0000 (13:14 -0700)
author Ard Biesheuvel <ardb@kernel.org>
Wed, 22 Apr 2026 17:16:58 +0000 (19:16 +0200)
committer Eric Biggers <ebiggers@kernel.org>
Thu, 28 May 2026 20:14:19 +0000 (13:14 -0700)
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile

index 4d633dfd5b90cfbad4b196074f67e988f1e88d06..d78400f2427ab436cf82c04d47381cd8e6f2b353 100644 (file)
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -17,7 +17,7 @@ endif
  xor-$(CONFIG_ALPHA)            += alpha/xor.o
  xor-$(CONFIG_ARM)              += arm/xor.o
  ifeq ($(CONFIG_ARM),y)
-xor-$(CONFIG_KERNEL_MODE_NEON) += arm/xor-neon.o arm/xor-neon-glue.o
+xor-$(CONFIG_KERNEL_MODE_NEON) += xor-neon.o arm/xor-neon-glue.o
  endif
  xor-$(CONFIG_ARM64)            += arm64/xor-neon.o arm64/xor-neon-glue.o
  xor-$(CONFIG_CPU_HAS_LSX)      += loongarch/xor_simd.o
@@ -31,8 +31,8 @@ xor-$(CONFIG_X86_32)          += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
  xor-$(CONFIG_X86_64)           += x86/xor-avx.o x86/xor-sse.o
  obj-y                          += tests/
  
-CFLAGS_arm/xor-neon.o          += $(CC_FLAGS_FPU)
-CFLAGS_REMOVE_arm/xor-neon.o   += $(CC_FLAGS_NO_FPU)
+CFLAGS_xor-neon.o              += $(CC_FLAGS_FPU) -I$(src)/$(SRCARCH)
+CFLAGS_REMOVE_xor-neon.o       += $(CC_FLAGS_NO_FPU)
  
  CFLAGS_arm64/xor-neon.o                += $(CC_FLAGS_FPU)
  CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c

deleted file mode 100644 (file)

index 23147e3..0000000
--- a/lib/raid/xor/arm/xor-neon.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include "xor_impl.h"
-#include "xor_arch.h"
-
-#ifndef __ARM_NEON__
-#error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
-#endif
-
-/*
- * Pull in the reference implementations while instructing GCC (through
- * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
- * NEON instructions. Clang does this by default at O2 so no pragma is
- * needed.
- */
-#ifdef CONFIG_CC_IS_GCC
-#pragma GCC optimize "tree-vectorize"
-#endif
-
-#define NO_TEMPLATE
-#include "../xor-8regs.c"
-
-__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
diff --git a/lib/raid/xor/arm/xor-neon.h b/lib/raid/xor/arm/xor-neon.h

new file mode 100644 (file)

index 0000000..406e035
--- /dev/null
+++ b/lib/raid/xor/arm/xor-neon.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+extern struct xor_block_template xor_block_arm4regs;
+extern struct xor_block_template xor_block_neon;
+
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h

index 775ff835df656ef1f879d8628a70bf999a98aff4..f1ddb64fe62abe8e37153db410f1b19118457d8f 100644 (file)
--- a/lib/raid/xor/arm/xor_arch.h
+++ b/lib/raid/xor/arm/xor_arch.h
@@ -3,12 +3,7 @@
   *  Copyright (C) 2001 Russell King
   */
  #include <asm/neon.h>
-
-extern struct xor_block_template xor_block_arm4regs;
-extern struct xor_block_template xor_block_neon;
-
-void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
-               unsigned int bytes);
+#include "xor-neon.h"
  
  static __always_inline void __init arch_xor_init(void)
  {
diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c

index 1edaed8acffe60b8c6343696b915def31d6d32dd..46b3c8bdc27f33b32e0efd2f20f14df9b3f4d470 100644 (file)
--- a/lib/raid/xor/xor-8regs.c
+++ b/lib/raid/xor/xor-8regs.c
@@ -93,11 +93,9 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
         } while (--lines > 0);
  }
  
-#ifndef NO_TEMPLATE
  DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
  
  struct xor_block_template xor_block_8regs = {
         .name           = "8regs",
         .xor_gen        = xor_gen_8regs,
  };
-#endif /* NO_TEMPLATE */
diff --git a/lib/raid/xor/xor-neon.c b/lib/raid/xor/xor-neon.c

new file mode 100644 (file)

index 0000000..a3e2b4a
--- /dev/null
+++ b/lib/raid/xor/xor-neon.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ */
+
+#include "xor_impl.h"
+#include "xor-neon.h"
+
+#include <asm/neon-intrinsics.h>
+
+static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3,
+               const unsigned long * __restrict p4)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* p1 ^= p4 */
+               v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+       } while (--lines > 0);
+}
+
+static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+               const unsigned long * __restrict p2,
+               const unsigned long * __restrict p3,
+               const unsigned long * __restrict p4,
+               const unsigned long * __restrict p5)
+{
+       uint64_t *dp1 = (uint64_t *)p1;
+       uint64_t *dp2 = (uint64_t *)p2;
+       uint64_t *dp3 = (uint64_t *)p3;
+       uint64_t *dp4 = (uint64_t *)p4;
+       uint64_t *dp5 = (uint64_t *)p5;
+
+       register uint64x2_t v0, v1, v2, v3;
+       long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+       do {
+               /* p1 ^= p2 */
+               v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
+               v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
+               v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
+               v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
+
+               /* p1 ^= p3 */
+               v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
+
+               /* p1 ^= p4 */
+               v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
+
+               /* p1 ^= p5 */
+               v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
+               v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
+               v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
+               v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));
+
+               /* store */
+               vst1q_u64(dp1 +  0, v0);
+               vst1q_u64(dp1 +  2, v1);
+               vst1q_u64(dp1 +  4, v2);
+               vst1q_u64(dp1 +  6, v3);
+
+               dp1 += 8;
+               dp2 += 8;
+               dp3 += 8;
+               dp4 += 8;
+               dp5 += 8;
+       } while (--lines > 0);
+}
+
+__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
+               __xor_neon_5);
author	Ard Biesheuvel <ardb@kernel.org>
	Wed, 22 Apr 2026 17:16:58 +0000 (19:16 +0200)
committer	Eric Biggers <ebiggers@kernel.org>
	Thu, 28 May 2026 20:14:19 +0000 (13:14 -0700)
lib/raid/xor/Makefile		patch \| blob \| blame \| history
lib/raid/xor/arm/xor-neon.c	[deleted file]	patch \| blob \| blame \| history
lib/raid/xor/arm/xor-neon.h	[new file with mode: 0644]	patch \| blob
lib/raid/xor/arm/xor_arch.h		patch \| blob \| blame \| history
lib/raid/xor/xor-8regs.c		patch \| blob \| blame \| history
lib/raid/xor/xor-neon.c	[new file with mode: 0644]	patch \| blob