From: Christoph Hellwig
Date: Fri, 27 Mar 2026 06:16:41 +0000 (+0100)
Subject: xor: remove macro abuse for XOR implementation registrations
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=35ebc4de105989034f1250e40eb6dbf5e136b04e;p=thirdparty%2Fkernel%2Flinux.git

xor: remove macro abuse for XOR implementation registrations

Drop the pretty confusing historic XOR_TRY_TEMPLATES and
XOR_SELECT_TEMPLATE macros, and instead let the architectures provide an
arch_xor_init that calls either xor_register to register candidates or
xor_force to force a specific implementation.

Link: https://lkml.kernel.org/r/20260327061704.3707577-10-hch@lst.de
Signed-off-by: Christoph Hellwig
Reviewed-by: Eric Biggers
Tested-by: Eric Biggers
Cc: Albert Ou
Cc: Alexander Gordeev
Cc: Alexandre Ghiti
Cc: Andreas Larsson
Cc: Anton Ivanov
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: "Borislav Petkov (AMD)"
Cc: Catalin Marinas
Cc: Chris Mason
Cc: Christian Borntraeger
Cc: Dan Williams
Cc: David S. Miller
Cc: David Sterba
Cc: Heiko Carstens
Cc: Herbert Xu
Cc: "H. Peter Anvin"
Cc: Huacai Chen
Cc: Ingo Molnar
Cc: Jason A. Donenfeld
Cc: Johannes Berg
Cc: Li Nan
Cc: Madhavan Srinivasan
Cc: Magnus Lindholm
Cc: Matt Turner
Cc: Michael Ellerman
Cc: Nicholas Piggin
Cc: Palmer Dabbelt
Cc: Richard Henderson
Cc: Richard Weinberger
Cc: Russell King
Cc: Song Liu
Cc: Sven Schnelle
Cc: Ted Ts'o
Cc: Vasily Gorbik
Cc: WANG Xuerui
Cc: Will Deacon
Signed-off-by: Andrew Morton
---
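Not part of the patch itself: the following standalone userspace model illustrates the semantics the patch introduces. xor_register() pushes a candidate onto a list that is later benchmarked, while xor_force() short-circuits selection entirely. The struct is trimmed to what the model needs, and bench() is a stub standing in for the kernel's real timing loop in calibrate_xor_blocks().

#include <stdio.h>

/* Trimmed-down model; the kernel's struct also has do_2..do_5 function
 * pointers for the actual XOR work. */
struct xor_block_template {
	struct xor_block_template *next;
	const char *name;
	int speed;			/* stand-in for measured throughput */
};

static struct xor_block_template *template_list;   /* xor_register() candidates */
static struct xor_block_template *active_template; /* set by xor_force() */

static void xor_register(struct xor_block_template *tmpl)
{
	tmpl->next = template_list;	/* push onto the candidate list */
	template_list = tmpl;
}

static void xor_force(struct xor_block_template *tmpl)
{
	active_template = tmpl;		/* benchmarking is skipped entirely */
}

static int bench(struct xor_block_template *tmpl)
{
	return tmpl->speed;		/* stub for the real timing loop */
}

int main(void)
{
	struct xor_block_template a = { .name = "8regs",  .speed = 10 };
	struct xor_block_template b = { .name = "32regs", .speed = 15 };

	/* What a typical arch_xor_init() does when nothing is forced: */
	xor_register(&a);
	xor_register(&b);

	/* An arch like s390 would instead call xor_force() here. */

	if (!active_template) {		/* nothing forced: benchmark and pick */
		struct xor_block_template *t, *best = NULL;

		for (t = template_list; t; t = t->next)
			if (!best || bench(t) > bench(best))
				best = t;
		active_template = best;
	}
	printf("using %s\n", active_template->name);
	return 0;
}

Forcing wins over registering simply because the core checks active_template before it ever walks template_list, which is exactly how the s390 and sparc64 hunks below pin their implementations.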
diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h
index e0de0c233ab92..4c8085711df18 100644
--- a/arch/alpha/include/asm/xor.h
+++ b/arch/alpha/include/asm/xor.h
@@ -851,16 +851,19 @@ static struct xor_block_template xor_block_alpha_prefetch = {
 
 /* For grins, also test the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-	do {						\
-		xor_speed(&xor_block_8regs);		\
-		xor_speed(&xor_block_32regs);		\
-		xor_speed(&xor_block_alpha);		\
-		xor_speed(&xor_block_alpha_prefetch);	\
-	} while (0)
-
-/* Force the use of alpha_prefetch if EV6, as it is significantly faster in the cold cache case. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	(implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
+/*
+ * Force the use of alpha_prefetch if EV6, as it is significantly faster in the
+ * cold cache case.
+ */
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (implver() == IMPLVER_EV6) {
+		xor_force(&xor_block_alpha_prefetch);
+	} else {
+		xor_register(&xor_block_8regs);
+		xor_register(&xor_block_32regs);
+		xor_register(&xor_block_alpha);
+		xor_register(&xor_block_alpha_prefetch);
+	}
+}
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index bca2a6514746e..b2dcd49186e2b 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -138,15 +138,6 @@ static struct xor_block_template xor_block_arm4regs = {
 	.do_5	= xor_arm4regs_5,
 };
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_arm4regs);	\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_32regs);	\
-		NEON_TEMPLATES;			\
-	} while (0)
-
 #ifdef CONFIG_KERNEL_MODE_NEON
 
 extern struct xor_block_template const xor_block_neon_inner;
@@ -201,8 +192,16 @@ static struct xor_block_template xor_block_neon = {
 	.do_5	= xor_neon_5
 };
 
-#define NEON_TEMPLATES	\
-	do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
-#else
-#define NEON_TEMPLATES
+#endif /* CONFIG_KERNEL_MODE_NEON */
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_arm4regs);
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+#ifdef CONFIG_KERNEL_MODE_NEON
+	if (cpu_has_neon())
+		xor_register(&xor_block_neon);
 #endif
+}
diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
index bb7428d4ebc65..3cee1eb86371b 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/arch/arm64/include/asm/xor.h
@@ -60,14 +60,14 @@ static struct xor_block_template xor_block_arm64 = {
 	.do_4	= xor_neon_4,
 	.do_5	= xor_neon_5
 };
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_32regs);	\
-		if (cpu_has_neon()) {		\
-			xor_speed(&xor_block_arm64);\
-		}				\
-	} while (0)
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+	if (cpu_has_neon())
+		xor_register(&xor_block_arm64);
+}
 
 #endif /* ! CONFIG_KERNEL_MODE_NEON */
diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h
index 12467fffee468..d17c0e3b047f1 100644
--- a/arch/loongarch/include/asm/xor.h
+++ b/arch/loongarch/include/asm/xor.h
@@ -16,14 +16,6 @@ static struct xor_block_template xor_block_lsx = {
 	.do_4 = xor_lsx_4,
 	.do_5 = xor_lsx_5,
 };
-
-#define XOR_SPEED_LSX()				\
-	do {					\
-		if (cpu_has_lsx)		\
-			xor_speed(&xor_block_lsx);	\
-	} while (0)
-#else /* CONFIG_CPU_HAS_LSX */
-#define XOR_SPEED_LSX()
 #endif /* CONFIG_CPU_HAS_LSX */
 
 #ifdef CONFIG_CPU_HAS_LASX
@@ -34,14 +26,6 @@ static struct xor_block_template xor_block_lasx = {
 	.do_4 = xor_lasx_4,
 	.do_5 = xor_lasx_5,
 };
-
-#define XOR_SPEED_LASX()			\
-	do {					\
-		if (cpu_has_lasx)		\
-			xor_speed(&xor_block_lasx);	\
-	} while (0)
-#else /* CONFIG_CPU_HAS_LASX */
-#define XOR_SPEED_LASX()
 #endif /* CONFIG_CPU_HAS_LASX */
 
 /*
@@ -54,15 +38,21 @@ static struct xor_block_template xor_block_lasx = {
  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-do {						\
-	xor_speed(&xor_block_8regs);		\
-	xor_speed(&xor_block_8regs_p);		\
-	xor_speed(&xor_block_32regs);		\
-	xor_speed(&xor_block_32regs_p);		\
-	XOR_SPEED_LSX();			\
-	XOR_SPEED_LASX();			\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_CPU_HAS_LSX
+	if (cpu_has_lsx)
+		xor_register(&xor_block_lsx);
+#endif
+#ifdef CONFIG_CPU_HAS_LASX
+	if (cpu_has_lasx)
+		xor_register(&xor_block_lasx);
+#endif
+}
 
 #endif /* _ASM_LOONGARCH_XOR_H */
diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h
index 37d05c11d09cd..30224c5279c4b 100644
--- a/arch/powerpc/include/asm/xor.h
+++ b/arch/powerpc/include/asm/xor.h
@@ -21,27 +21,22 @@ static struct xor_block_template xor_block_altivec = {
 	.do_4 = xor_altivec_4,
 	.do_5 = xor_altivec_5,
 };
-
-#define XOR_SPEED_ALTIVEC()				\
-	do {						\
-		if (cpu_has_feature(CPU_FTR_ALTIVEC))	\
-			xor_speed(&xor_block_altivec);	\
-	} while (0)
-#else
-#define XOR_SPEED_ALTIVEC()
-#endif
+#endif /* CONFIG_ALTIVEC */
 
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-do {						\
-	xor_speed(&xor_block_8regs);		\
-	xor_speed(&xor_block_8regs_p);		\
-	xor_speed(&xor_block_32regs);		\
-	xor_speed(&xor_block_32regs_p);		\
-	XOR_SPEED_ALTIVEC();			\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		xor_register(&xor_block_altivec);
+#endif
+}
 
 #endif /* _ASM_POWERPC_XOR_H */
diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h
index 96011861e46b4..ed5f27903efc4 100644
--- a/arch/riscv/include/asm/xor.h
+++ b/arch/riscv/include/asm/xor.h
@@ -55,14 +55,15 @@ static struct xor_block_template xor_block_rvv = {
 	.do_4 = xor_vector_4,
 	.do_5 = xor_vector_5
 };
+#endif /* CONFIG_RISCV_ISA_V */
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_32regs);	\
-		if (has_vector()) {		\
-			xor_speed(&xor_block_rvv);\
-		}				\
-	} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+#ifdef CONFIG_RISCV_ISA_V
+	if (has_vector())
+		xor_register(&xor_block_rvv);
 #endif
+}
diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h
index 857d6759b67f0..4e2233f64da98 100644
--- a/arch/s390/include/asm/xor.h
+++ b/arch/s390/include/asm/xor.h
@@ -10,12 +10,10 @@
 
 extern struct xor_block_template xor_block_xc;
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES		\
-do {					\
-	xor_speed(&xor_block_xc);	\
-} while (0)
-
-#define XOR_SELECT_TEMPLATE(FASTEST)	(&xor_block_xc)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_force(&xor_block_xc);
+}
 
 #endif /* _ASM_S390_XOR_H */
diff --git a/arch/sparc/include/asm/xor_32.h b/arch/sparc/include/asm/xor_32.h
index 0351813cf3af5..8fbf0c07ec289 100644
--- a/arch/sparc/include/asm/xor_32.h
+++ b/arch/sparc/include/asm/xor_32.h
@@ -259,10 +259,10 @@ static struct xor_block_template xor_block_SPARC = {
 
 /* For grins, also test the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES		\
-	do {				\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_32regs);	\
-		xor_speed(&xor_block_SPARC);	\
-	} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_SPARC);
+}
diff --git a/arch/sparc/include/asm/xor_64.h b/arch/sparc/include/asm/xor_64.h
index caaddea8ad79d..e0482ecc0a68b 100644
--- a/arch/sparc/include/asm/xor_64.h
+++ b/arch/sparc/include/asm/xor_64.h
@@ -60,20 +60,17 @@ static struct xor_block_template xor_block_niagara = {
 	.do_5	= xor_niagara_5,
 };
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_VIS);	\
-		xor_speed(&xor_block_niagara);	\
-	} while (0)
-
-/* For VIS for everything except Niagara. */
-#define XOR_SELECT_TEMPLATE(FASTEST)			\
-	((tlb_type == hypervisor &&			\
-	  (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||	\
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||	\
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||	\
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||	\
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) ?	\
-	 &xor_block_niagara :				\
-	 &xor_block_VIS)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	/* Force VIS for everything except Niagara. */
+	if (tlb_type == hypervisor &&
+	    (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA5))
+		xor_force(&xor_block_niagara);
+	else
+		xor_force(&xor_block_VIS);
+}
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 7b0307acc4103..33f5620d8d691 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -496,7 +496,4 @@ static struct xor_block_template xor_block_sse_pf64 = {
 # include <asm/xor_64.h>
 #endif
 
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(FASTEST)
-
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 7a6b9474591e7..ee32d08c27bc5 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -552,22 +552,24 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.
 */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	AVX_XOR_SPEED;					\
-	if (boot_cpu_has(X86_FEATURE_XMM)) {		\
-		xor_speed(&xor_block_pIII_sse);		\
-		xor_speed(&xor_block_sse_pf64);		\
-	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
-		xor_speed(&xor_block_pII_mmx);		\
-		xor_speed(&xor_block_p5_mmx);		\
-	} else {					\
-		xor_speed(&xor_block_8regs);		\
-		xor_speed(&xor_block_8regs_p);		\
-		xor_speed(&xor_block_32regs);		\
-		xor_speed(&xor_block_32regs_p);		\
-	}						\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xor_force(&xor_block_avx);
+	} else if (boot_cpu_has(X86_FEATURE_XMM)) {
+		xor_register(&xor_block_pIII_sse);
+		xor_register(&xor_block_sse_pf64);
+	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
+		xor_register(&xor_block_pII_mmx);
+		xor_register(&xor_block_p5_mmx);
+	} else {
+		xor_register(&xor_block_8regs);
+		xor_register(&xor_block_8regs_p);
+		xor_register(&xor_block_32regs);
+		xor_register(&xor_block_32regs_p);
+	}
+}
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 0307e4ec50440..2d2ceb2418665 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -17,12 +17,16 @@ static struct xor_block_template xor_block_sse = {
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.
 */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-do {						\
-	AVX_XOR_SPEED;				\
-	xor_speed(&xor_block_sse_pf64);		\
-	xor_speed(&xor_block_sse);		\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xor_force(&xor_block_avx);
+	} else {
+		xor_register(&xor_block_sse_pf64);
+		xor_register(&xor_block_sse);
+	}
+}
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7f81dd5897f41..c600888436bb9 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -166,13 +166,4 @@ static struct xor_block_template xor_block_avx = {
 	.do_5 = xor_avx_5,
 };
 
-#define AVX_XOR_SPEED \
-do { \
-	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
-		xor_speed(&xor_block_avx); \
-} while (0)
-
-#define AVX_SELECT(FASTEST) \
-	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
-
 #endif
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 44509d48fca21..79c0096aa9d95 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -728,11 +728,3 @@ static struct xor_block_template xor_block_32regs_p __maybe_unused = {
 	.do_4 = xor_32regs_p_4,
 	.do_5 = xor_32regs_p_5,
 };
-
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_8regs_p);	\
-		xor_speed(&xor_block_32regs);	\
-		xor_speed(&xor_block_32regs_p);	\
-	} while (0)
diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h
index a1890cd668124..6ed4c445ab24c 100644
--- a/include/linux/raid/xor_impl.h
+++ b/include/linux/raid/xor_impl.h
@@ -2,6 +2,8 @@
 #ifndef _XOR_IMPL_H
 #define _XOR_IMPL_H
 
+#include <linux/init.h>
+
 struct xor_block_template {
 	struct xor_block_template *next;
 	const char *name;
@@ -22,4 +24,7 @@ struct xor_block_template {
 			const unsigned long * __restrict);
 };
 
+void __init xor_register(struct xor_block_template *tmpl);
+void __init xor_force(struct xor_block_template *tmpl);
+
 #endif /* _XOR_IMPL_H */
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index db1824011a12a..93608b5fece9e 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -14,10 +14,6 @@
 #include
 #include <asm/xor.h>
 
-#ifndef XOR_SELECT_TEMPLATE
-#define XOR_SELECT_TEMPLATE(x) (x)
-#endif
-
 /* The xor routines to use.  */
 static struct xor_block_template *active_template;
 
@@ -55,12 +51,33 @@ EXPORT_SYMBOL(xor_blocks);
 static struct xor_block_template *__initdata template_list;
 static bool __initdata xor_forced = false;
 
-static void __init do_xor_register(struct xor_block_template *tmpl)
+/**
+ * xor_register - register a XOR template
+ * @tmpl: template to register
+ *
+ * Register a XOR implementation with the core.  Registered implementations
+ * will be measured by a trivial benchmark, and the fastest one is chosen
+ * unless an implementation is forced using xor_force().
+ */
+void __init xor_register(struct xor_block_template *tmpl)
 {
 	tmpl->next = template_list;
 	template_list = tmpl;
 }
 
+/**
+ * xor_force - force use of a XOR template
+ * @tmpl: template to force
+ *
+ * Register a XOR implementation with the core and force using it.  Forcing
+ * an implementation makes the core ignore any template registered using
+ * xor_register(), as well as any implementation previously forced using
+ * xor_force().
+ */
+void __init xor_force(struct xor_block_template *tmpl)
+{
+	active_template = tmpl;
+}
+
 #define BENCH_SIZE	4096
 #define REPS		800U
 
@@ -126,11 +143,19 @@ static int __init calibrate_xor_blocks(void)
 
 static int __init xor_init(void)
 {
+#ifdef arch_xor_init
+	arch_xor_init();
+#else
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#endif
+
 	/*
 	 * If this arch/cpu has a short-circuited selection, don't loop through
 	 * all the possible functions, just use the best one.
 	 */
-	active_template = XOR_SELECT_TEMPLATE(NULL);
 	if (active_template) {
 		pr_info("xor: automatically using best checksumming function   %-10s\n",
 			active_template->name);
@@ -138,10 +163,6 @@ static int __init xor_init(void)
 		return 0;
 	}
 
-#define xor_speed	do_xor_register
-	XOR_TRY_TEMPLATES;
-#undef xor_speed
-
 #ifdef MODULE
 	return calibrate_xor_blocks();
 #else
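As a closing illustration of the new contract: the sketch below shows roughly what a from-scratch conversion to these hooks would look like. It is hypothetical code, not part of the patch; arch_xor_init, xor_register(), xor_force() and the template fields match the patch, while the "myarch" template, its do_2 helper, and the myarch_has_fancy_simd() feature test are invented.

/* Hypothetical per-arch hook; everything named "myarch" is made up. */
static void myarch_xor_2(unsigned long bytes,
			 unsigned long * __restrict p1,
			 const unsigned long * __restrict p2)
{
	long lines = bytes / sizeof(long);

	while (lines--)
		*p1++ ^= *p2++;	/* trivial word-at-a-time XOR */
}

static struct xor_block_template xor_block_myarch = {
	.name = "myarch",
	.do_2 = myarch_xor_2,
	/* a real template also fills in .do_3, .do_4 and .do_5 */
};

#define arch_xor_init arch_xor_init
static __always_inline void __init arch_xor_init(void)
{
	if (myarch_has_fancy_simd())		/* invented feature test */
		xor_force(&xor_block_myarch);	/* skip the benchmark */
	else
		xor_register(&xor_block_myarch); /* compete on speed */
}

The self-referential #define is the usual kernel idiom that lets the core's #ifdef arch_xor_init in xor_init() detect whether an architecture supplies the hook at all; without it, the core falls back to registering the four generic templates.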