/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_alpha); \
- xor_speed(&xor_block_alpha_prefetch); \
- } while (0)
-
-/* Force the use of alpha_prefetch if EV6, as it is significantly
- faster in the cold cache case. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- (implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
+/*
+ * Force the use of alpha_prefetch on EV6, as it is significantly faster in
+ * the cold-cache case.
+ */
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ if (implver() == IMPLVER_EV6) {
+ xor_force(&xor_block_alpha_prefetch);
+ } else {
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_alpha);
+ xor_register(&xor_block_alpha_prefetch);
+ }
+}
.do_5 = xor_arm4regs_5,
};
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_arm4regs); \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_32regs); \
- NEON_TEMPLATES; \
- } while (0)
-
#ifdef CONFIG_KERNEL_MODE_NEON
extern struct xor_block_template const xor_block_neon_inner;
.do_5 = xor_neon_5
};
-#define NEON_TEMPLATES \
- do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
-#else
-#define NEON_TEMPLATES
+#endif /* CONFIG_KERNEL_MODE_NEON */
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_arm4regs);
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_32regs);
+#ifdef CONFIG_KERNEL_MODE_NEON
+ if (cpu_has_neon())
+ xor_register(&xor_block_neon);
#endif
+}
.do_4 = xor_neon_4,
.do_5 = xor_neon_5
};
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_32regs); \
- if (cpu_has_neon()) { \
- xor_speed(&xor_block_arm64);\
- } \
- } while (0)
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_32regs);
+ if (cpu_has_neon())
+ xor_register(&xor_block_arm64);
+}
#endif /* ! CONFIG_KERNEL_MODE_NEON */
.do_4 = xor_lsx_4,
.do_5 = xor_lsx_5,
};
-
-#define XOR_SPEED_LSX() \
- do { \
- if (cpu_has_lsx) \
- xor_speed(&xor_block_lsx); \
- } while (0)
-#else /* CONFIG_CPU_HAS_LSX */
-#define XOR_SPEED_LSX()
#endif /* CONFIG_CPU_HAS_LSX */
#ifdef CONFIG_CPU_HAS_LASX
.do_4 = xor_lasx_4,
.do_5 = xor_lasx_5,
};
-
-#define XOR_SPEED_LASX() \
- do { \
- if (cpu_has_lasx) \
- xor_speed(&xor_block_lasx); \
- } while (0)
-#else /* CONFIG_CPU_HAS_LASX */
-#define XOR_SPEED_LASX()
#endif /* CONFIG_CPU_HAS_LASX */
/*
*/
#include <asm-generic/xor.h>
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- XOR_SPEED_LSX(); \
- XOR_SPEED_LASX(); \
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_8regs_p);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_CPU_HAS_LSX
+ if (cpu_has_lsx)
+ xor_register(&xor_block_lsx);
+#endif
+#ifdef CONFIG_CPU_HAS_LASX
+ if (cpu_has_lasx)
+ xor_register(&xor_block_lasx);
+#endif
+}
#endif /* _ASM_LOONGARCH_XOR_H */
.do_4 = xor_altivec_4,
.do_5 = xor_altivec_5,
};
-
-#define XOR_SPEED_ALTIVEC() \
- do { \
- if (cpu_has_feature(CPU_FTR_ALTIVEC)) \
- xor_speed(&xor_block_altivec); \
- } while (0)
-#else
-#define XOR_SPEED_ALTIVEC()
-#endif
+#endif /* CONFIG_ALTIVEC */
/* Also try the generic routines. */
#include <asm-generic/xor.h>
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- XOR_SPEED_ALTIVEC(); \
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_8regs_p);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_ALTIVEC
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ xor_register(&xor_block_altivec);
+#endif
+}
#endif /* _ASM_POWERPC_XOR_H */
.do_4 = xor_vector_4,
.do_5 = xor_vector_5
};
+#endif /* CONFIG_RISCV_ISA_V */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_32regs); \
- if (has_vector()) { \
- xor_speed(&xor_block_rvv);\
- } \
- } while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_32regs);
+#ifdef CONFIG_RISCV_ISA_V
+ if (has_vector())
+ xor_register(&xor_block_rvv);
#endif
+}
extern struct xor_block_template xor_block_xc;
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- xor_speed(&xor_block_xc); \
-} while (0)
-
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_xc)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_force(&xor_block_xc);
+}
#endif /* _ASM_S390_XOR_H */
/* For grins, also test the generic routines. */
#include <asm-generic/xor.h>
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_SPARC); \
- } while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_SPARC);
+}
.do_5 = xor_niagara_5,
};
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_VIS); \
- xor_speed(&xor_block_niagara); \
- } while (0)
-
-/* For VIS for everything except Niagara. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- ((tlb_type == hypervisor && \
- (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || \
- sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || \
- sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || \
- sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || \
- sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) ? \
- &xor_block_niagara : \
- &xor_block_VIS)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+ /* Force VIS for everything except Niagara. */
+ if (tlb_type == hypervisor &&
+ (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+ sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+ sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
+ sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
+ sun4v_chip_type == SUN4V_CHIP_NIAGARA5))
+ xor_force(&xor_block_niagara);
+ else
+ xor_force(&xor_block_VIS);
+}
# include <asm/xor_64.h>
#endif
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(FASTEST)
-
#endif /* _ASM_X86_XOR_H */
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- AVX_XOR_SPEED; \
- if (boot_cpu_has(X86_FEATURE_XMM)) { \
- xor_speed(&xor_block_pIII_sse); \
- xor_speed(&xor_block_sse_pf64); \
- } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
- xor_speed(&xor_block_pII_mmx); \
- xor_speed(&xor_block_p5_mmx); \
- } else { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- } \
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
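+	/*
+	 * AVX is only usable when the OS has enabled XSAVE-managed YMM state
+	 * (X86_FEATURE_OSXSAVE), so check both feature bits before forcing it.
+	 */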
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ xor_force(&xor_block_avx);
+ } else if (boot_cpu_has(X86_FEATURE_XMM)) {
+ xor_register(&xor_block_pIII_sse);
+ xor_register(&xor_block_sse_pf64);
+ } else if (boot_cpu_has(X86_FEATURE_MMX)) {
+ xor_register(&xor_block_pII_mmx);
+ xor_register(&xor_block_p5_mmx);
+ } else {
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_8regs_p);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_32regs_p);
+ }
+}
#endif /* _ASM_X86_XOR_32_H */
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES \
-do { \
- AVX_XOR_SPEED; \
- xor_speed(&xor_block_sse_pf64); \
- xor_speed(&xor_block_sse); \
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
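+	/* Prefer AVX when both the CPU and the OS (OSXSAVE) support it. */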
+ if (boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ xor_force(&xor_block_avx);
+ } else {
+ xor_register(&xor_block_sse_pf64);
+ xor_register(&xor_block_sse);
+ }
+}
#endif /* _ASM_X86_XOR_64_H */
.do_5 = xor_avx_5,
};
-#define AVX_XOR_SPEED \
-do { \
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
- xor_speed(&xor_block_avx); \
-} while (0)
-
-#define AVX_SELECT(FASTEST) \
- (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
-
#endif
.do_4 = xor_32regs_p_4,
.do_5 = xor_32regs_p_5,
};
-
-#define XOR_TRY_TEMPLATES \
- do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
- } while (0)
#ifndef _XOR_IMPL_H
#define _XOR_IMPL_H
+#include <linux/init.h>
+
struct xor_block_template {
struct xor_block_template *next;
const char *name;
const unsigned long * __restrict);
};
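+
+/* Called during init, typically from an architecture's arch_xor_init(). */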
+void __init xor_register(struct xor_block_template *tmpl);
+void __init xor_force(struct xor_block_template *tmpl);
+
#endif /* _XOR_IMPL_H */
#include <linux/preempt.h>
#include <asm/xor.h>
-#ifndef XOR_SELECT_TEMPLATE
-#define XOR_SELECT_TEMPLATE(x) (x)
-#endif
-
/* The xor routines to use. */
static struct xor_block_template *active_template;
static struct xor_block_template *__initdata template_list;
static bool __initdata xor_forced = false;
-static void __init do_xor_register(struct xor_block_template *tmpl)
+/**
+ * xor_register - register an XOR template
+ * @tmpl: template to register
+ *
+ * Register an XOR implementation with the core. Registered implementations
+ * are measured with a trivial benchmark and the fastest one is chosen,
+ * unless an implementation has been forced with xor_force().
+ */
+void __init xor_register(struct xor_block_template *tmpl)
{
tmpl->next = template_list;
template_list = tmpl;
}
+/**
+ * xor_force - force the use of an XOR template
+ * @tmpl: template to force
+ *
+ * Make the core use @tmpl unconditionally, skipping the benchmark. A forced
+ * implementation overrides any template registered with xor_register() and
+ * any implementation forced by an earlier call to xor_force().
+ */
+void __init xor_force(struct xor_block_template *tmpl)
+{
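+	/* Setting active_template early makes xor_init() skip the benchmark. */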
+ active_template = tmpl;
+}
+
#define BENCH_SIZE 4096
#define REPS 800U
static int __init xor_init(void)
{
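+	/*
+	 * Let the architecture register or force its templates; fall back to
+	 * the generic register-based implementations otherwise.
+	 */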
+#ifdef arch_xor_init
+ arch_xor_init();
+#else
+ xor_register(&xor_block_8regs);
+ xor_register(&xor_block_8regs_p);
+ xor_register(&xor_block_32regs);
+ xor_register(&xor_block_32regs_p);
+#endif
+
/*
* If this arch/cpu has a short-circuited selection, don't loop through
* all the possible functions, just use the best one.
*/
- active_template = XOR_SELECT_TEMPLATE(NULL);
if (active_template) {
pr_info("xor: automatically using best checksumming function %-10s\n",
active_template->name);
return 0;
}
-#define xor_speed do_xor_register
- XOR_TRY_TEMPLATES;
-#undef xor_speed
-
#ifdef MODULE
return calibrate_xor_blocks();
#else