git.ipfire.org Git - thirdparty/linux.git/commitdiff
xor: pass the entire operation to the low-level ops
authorChristoph Hellwig <hch@lst.de>
Fri, 27 Mar 2026 06:16:58 +0000 (07:16 +0100)
committerAndrew Morton <akpm@linux-foundation.org>
Fri, 3 Apr 2026 06:36:21 +0000 (23:36 -0700)
Currently the high-level xor code chunks up all operations into small
units for only up to 1 + 4 vectors, and passes it to four different
methods.  This means the FPU/vector context is entered and left a lot for
wide stripes, and a lot of expensive indirect calls are
performed.  Switch to passing the entire gen_xor request to the low-level
ops, and provide a macro to dispatch it to the existing helper.

This reduces the number of indirect calls and FPU/vector context switches
by a factor approaching nr_stripes / 4, and also reduces source and binary
code size.

Link: https://lkml.kernel.org/r/20260327061704.3707577-27-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: David Sterba <dsterba@suse.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason A. Donenfeld <jason@zx2c4.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Li Nan <linan122@huawei.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Song Liu <song@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
26 files changed:
include/linux/raid/xor.h
lib/raid/xor/alpha/xor.c
lib/raid/xor/arm/xor-neon-glue.c
lib/raid/xor/arm/xor-neon.c
lib/raid/xor/arm/xor.c
lib/raid/xor/arm/xor_arch.h
lib/raid/xor/arm64/xor-neon-glue.c
lib/raid/xor/arm64/xor-neon.c
lib/raid/xor/arm64/xor-neon.h
lib/raid/xor/loongarch/xor_simd_glue.c
lib/raid/xor/powerpc/xor_vmx.c
lib/raid/xor/powerpc/xor_vmx.h
lib/raid/xor/powerpc/xor_vmx_glue.c
lib/raid/xor/riscv/xor-glue.c
lib/raid/xor/s390/xor.c
lib/raid/xor/sparc/xor-sparc32.c
lib/raid/xor/sparc/xor-sparc64-glue.c
lib/raid/xor/x86/xor-avx.c
lib/raid/xor/x86/xor-mmx.c
lib/raid/xor/x86/xor-sse.c
lib/raid/xor/xor-32regs-prefetch.c
lib/raid/xor/xor-32regs.c
lib/raid/xor/xor-8regs-prefetch.c
lib/raid/xor/xor-8regs.c
lib/raid/xor/xor-core.c
lib/raid/xor/xor_impl.h

index 6d9a39fd85ddabdfb6c757af0fd158537fc11892..870558c9d36ee35034b43ba1ed74a7ba14782c4a 100644 (file)
@@ -2,11 +2,6 @@
 #ifndef _XOR_H
 #define _XOR_H
 
-#define MAX_XOR_BLOCKS 4
-
-extern void xor_blocks(unsigned int count, unsigned int bytes,
-       void *dest, void **srcs);
-
 void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes);
 
 #endif /* _XOR_H */
index 90694cc47395b9ba5ad34e551bea0e52f70d4c7f..a8f72f2dd3a5e2ddf8666a9bd7a1cac3e20f1307 100644 (file)
@@ -832,18 +832,17 @@ xor_alpha_prefetch_5:                                             \n\
        .end xor_alpha_prefetch_5                               \n\
 ");
 
+DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5);
+
 struct xor_block_template xor_block_alpha = {
-       .name   = "alpha",
-       .do_2   = xor_alpha_2,
-       .do_3   = xor_alpha_3,
-       .do_4   = xor_alpha_4,
-       .do_5   = xor_alpha_5,
+       .name           = "alpha",
+       .xor_gen        = xor_gen_alpha,
 };
 
+DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3,
+               xor_alpha_prefetch_4, xor_alpha_prefetch_5);
+
 struct xor_block_template xor_block_alpha_prefetch = {
-       .name   = "alpha prefetch",
-       .do_2   = xor_alpha_prefetch_2,
-       .do_3   = xor_alpha_prefetch_3,
-       .do_4   = xor_alpha_prefetch_4,
-       .do_5   = xor_alpha_prefetch_5,
+       .name           = "alpha prefetch",
+       .xor_gen        = xor_gen_alpha_prefetch,
 };
index 7afd6294464bc559979db7a67aadc9765f6aa34b..cea39e0199048efef004bf7136d982ce0d79187d 100644 (file)
@@ -5,54 +5,15 @@
 #include "xor_impl.h"
 #include "xor_arch.h"
 
-extern struct xor_block_template const xor_block_neon_inner;
-
-static void
-xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-          const unsigned long * __restrict p2)
-{
-       kernel_neon_begin();
-       xor_block_neon_inner.do_2(bytes, p1, p2);
-       kernel_neon_end();
-}
-
-static void
-xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-          const unsigned long * __restrict p2,
-          const unsigned long * __restrict p3)
-{
-       kernel_neon_begin();
-       xor_block_neon_inner.do_3(bytes, p1, p2, p3);
-       kernel_neon_end();
-}
-
-static void
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-          const unsigned long * __restrict p2,
-          const unsigned long * __restrict p3,
-          const unsigned long * __restrict p4)
-{
-       kernel_neon_begin();
-       xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
-       kernel_neon_end();
-}
-
-static void
-xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-          const unsigned long * __restrict p2,
-          const unsigned long * __restrict p3,
-          const unsigned long * __restrict p4,
-          const unsigned long * __restrict p5)
+static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes)
 {
        kernel_neon_begin();
-       xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
+       xor_gen_neon_inner(dest, srcs, src_cnt, bytes);
        kernel_neon_end();
 }
 
 struct xor_block_template xor_block_neon = {
-       .name   = "neon",
-       .do_2   = xor_neon_2,
-       .do_3   = xor_neon_3,
-       .do_4   = xor_neon_4,
-       .do_5   = xor_neon_5
+       .name           = "neon",
+       .xor_gen        = xor_gen_neon,
 };
index 806a42c5952c505289951c267510a0bd00c93e03..23147e3a79044fd1c8b670dba9cda2157053f9be 100644 (file)
@@ -4,6 +4,7 @@
  */
 
 #include "xor_impl.h"
+#include "xor_arch.h"
 
 #ifndef __ARM_NEON__
 #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
 #define NO_TEMPLATE
 #include "../xor-8regs.c"
 
-struct xor_block_template const xor_block_neon_inner = {
-       .name   = "__inner_neon__",
-       .do_2   = xor_8regs_2,
-       .do_3   = xor_8regs_3,
-       .do_4   = xor_8regs_4,
-       .do_5   = xor_8regs_5,
-};
+__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
index 5bd5f048bbe9ea1d39d2eb3175b31eb19d9b75e3..45139b6c55eaa87279fd75f47725d679e5902ada 100644 (file)
@@ -127,10 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines);
 }
 
+DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4,
+               xor_arm4regs_5);
+
 struct xor_block_template xor_block_arm4regs = {
-       .name   = "arm4regs",
-       .do_2   = xor_arm4regs_2,
-       .do_3   = xor_arm4regs_3,
-       .do_4   = xor_arm4regs_4,
-       .do_5   = xor_arm4regs_5,
+       .name           = "arm4regs",
+       .xor_gen        = xor_gen_arm4regs,
 };
index 5a7eedb48fbb94ff1fb036a4416559c99a955371..775ff835df656ef1f879d8628a70bf999a98aff4 100644 (file)
@@ -7,6 +7,9 @@
 extern struct xor_block_template xor_block_arm4regs;
 extern struct xor_block_template xor_block_neon;
 
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
+
 static __always_inline void __init arch_xor_init(void)
 {
        xor_register(&xor_block_arm4regs);
index 3db0a318cf5bb509197eec925119a50b5743b54e..f0284f86feb4c83c6f5cf12d0decff2ea32cfc4a 100644 (file)
 #include "xor-neon.h"
 
 #define XOR_TEMPLATE(_name)                                            \
-static void                                                            \
-xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1,    \
-          const unsigned long * __restrict p2)                         \
+static void xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \
+               unsigned int bytes)                                     \
 {                                                                      \
        scoped_ksimd()                                                  \
-               __xor_##_name##_2(bytes, p1, p2);                       \
-}                                                                      \
-                                                                       \
-static void                                                            \
-xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1,    \
-          const unsigned long * __restrict p2,                         \
-          const unsigned long * __restrict p3)                         \
-{                                                                      \
-       scoped_ksimd()                                                  \
-               __xor_##_name##_3(bytes, p1, p2, p3);                   \
-}                                                                      \
-                                                                       \
-static void                                                            \
-xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1,    \
-          const unsigned long * __restrict p2,                         \
-          const unsigned long * __restrict p3,                         \
-          const unsigned long * __restrict p4)                         \
-{                                                                      \
-       scoped_ksimd()                                                  \
-               __xor_##_name##_4(bytes, p1, p2, p3, p4);               \
-}                                                                      \
-                                                                       \
-static void                                                            \
-xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1,    \
-          const unsigned long * __restrict p2,                         \
-          const unsigned long * __restrict p3,                         \
-          const unsigned long * __restrict p4,                         \
-          const unsigned long * __restrict p5)                         \
-{                                                                      \
-       scoped_ksimd()                                                  \
-               __xor_##_name##_5(bytes, p1, p2, p3, p4, p5);           \
+               xor_gen_##_name##_inner(dest, srcs, src_cnt, bytes);    \
 }                                                                      \
                                                                        \
 struct xor_block_template xor_block_##_name = {                                \
-       .name   = __stringify(_name),                                   \
-       .do_2   = xor_##_name##_2,                                      \
-       .do_3   = xor_##_name##_3,                                      \
-       .do_4   = xor_##_name##_4,                                      \
-       .do_5   = xor_##_name##_5                                       \
+       .name           = __stringify(_name),                           \
+       .xor_gen        = xor_gen_##_name,                              \
 };
 
 XOR_TEMPLATE(neon);
index 61f00c4fee495244f38ad4ff5e809d3b1304de56..97ef3cb924968d112254cff0d9df3a027f625649 100644 (file)
@@ -10,7 +10,7 @@
 #include "xor_arch.h"
 #include "xor-neon.h"
 
-void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2)
 {
        uint64_t *dp1 = (uint64_t *)p1;
@@ -37,7 +37,7 @@ void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3)
 {
@@ -73,7 +73,7 @@ void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3,
                const unsigned long * __restrict p4)
@@ -118,7 +118,7 @@ void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3,
                const unsigned long * __restrict p4,
@@ -172,6 +172,9 @@ void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
+__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
+               __xor_neon_5);
+
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
        uint64x2_t res;
@@ -182,7 +185,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
        return res;
 }
 
-void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3)
 {
@@ -216,7 +219,7 @@ void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3,
                const unsigned long * __restrict p4)
@@ -259,7 +262,7 @@ void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
-void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
                const unsigned long * __restrict p2,
                const unsigned long * __restrict p3,
                const unsigned long * __restrict p4,
@@ -304,3 +307,6 @@ void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
                dp5 += 8;
        } while (--lines > 0);
 }
+
+__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
+               __xor_eor3_5);
index cec0ac846feabd4fd31b038e974e6d8c282df676..514699ba8f5f8e5de59444d6f5680848a3b8ea16 100644 (file)
@@ -1,30 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 
-void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2);
-void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3);
-void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4);
-void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4,
-               const unsigned long * __restrict p5);
-
-#define __xor_eor3_2   __xor_neon_2
-void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3);
-void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4);
-void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4,
-               const unsigned long * __restrict p5);
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
+void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
index b387aa0213b4710d47253f333eb8fff0fcd89d76..7f324d924f8791c1acd735355185e5d7d46ef243 100644 (file)
 #include "xor_arch.h"
 #include "xor_simd.h"
 
-#define MAKE_XOR_GLUE_2(flavor)                                                        \
-static void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,\
-                     const unsigned long * __restrict p2)                      \
+#define MAKE_XOR_GLUES(flavor)                                                 \
+DO_XOR_BLOCKS(flavor##_inner, __xor_##flavor##_2, __xor_##flavor##_3,          \
+               __xor_##flavor##_4, __xor_##flavor##_5);                        \
+                                                                               \
+static void xor_gen_##flavor(void *dest, void **srcs, unsigned int src_cnt,    \
+               unsigned int bytes)                                             \
 {                                                                              \
        kernel_fpu_begin();                                                     \
-       __xor_##flavor##_2(bytes, p1, p2);                                      \
+       xor_gen_##flavor##_inner(dest, srcs, src_cnt, bytes);                   \
        kernel_fpu_end();                                                       \
 }                                                                              \
-
-#define MAKE_XOR_GLUE_3(flavor)                                                        \
-static void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,\
-                     const unsigned long * __restrict p2,                      \
-                     const unsigned long * __restrict p3)                      \
-{                                                                              \
-       kernel_fpu_begin();                                                     \
-       __xor_##flavor##_3(bytes, p1, p2, p3);                                  \
-       kernel_fpu_end();                                                       \
-}                                                                              \
-
-#define MAKE_XOR_GLUE_4(flavor)                                                        \
-static void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,\
-                     const unsigned long * __restrict p2,                      \
-                     const unsigned long * __restrict p3,                      \
-                     const unsigned long * __restrict p4)                      \
-{                                                                              \
-       kernel_fpu_begin();                                                     \
-       __xor_##flavor##_4(bytes, p1, p2, p3, p4);                              \
-       kernel_fpu_end();                                                       \
-}                                                                              \
-
-#define MAKE_XOR_GLUE_5(flavor)                                                        \
-static void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,\
-                     const unsigned long * __restrict p2,                      \
-                     const unsigned long * __restrict p3,                      \
-                     const unsigned long * __restrict p4,                      \
-                     const unsigned long * __restrict p5)                      \
-{                                                                              \
-       kernel_fpu_begin();                                                     \
-       __xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);                          \
-       kernel_fpu_end();                                                       \
-}                                                                              \
-
-#define MAKE_XOR_GLUES(flavor)                         \
-       MAKE_XOR_GLUE_2(flavor);                        \
-       MAKE_XOR_GLUE_3(flavor);                        \
-       MAKE_XOR_GLUE_4(flavor);                        \
-       MAKE_XOR_GLUE_5(flavor);                        \
-                                                       \
-struct xor_block_template xor_block_##flavor = {       \
-       .name = __stringify(flavor),                    \
-       .do_2 = xor_##flavor##_2,                       \
-       .do_3 = xor_##flavor##_3,                       \
-       .do_4 = xor_##flavor##_4,                       \
-       .do_5 = xor_##flavor##_5,                       \
+                                                                               \
+struct xor_block_template xor_block_##flavor = {                               \
+       .name           = __stringify(flavor),                                  \
+       .xor_gen        = xor_gen_##flavor                                      \
 }
 
-
 #ifdef CONFIG_CPU_HAS_LSX
 MAKE_XOR_GLUES(lsx);
 #endif /* CONFIG_CPU_HAS_LSX */
index aab49d056d1883a42c8fffd7a3d5c3549c8d73f9..09bed98c1bc7239d3f2d86912c104fed446f95cb 100644 (file)
@@ -10,6 +10,7 @@
  * Sparse (as at v0.5.0) gets very, very confused by this file.
  * Make it a bit simpler for it.
  */
+#include "xor_impl.h"
 #if !defined(__CHECKER__)
 #include <altivec.h>
 #else
@@ -49,9 +50,9 @@ typedef vector signed char unative_t;
                V1##_3 = vec_xor(V1##_3, V2##_3);       \
        } while (0)
 
-void __xor_altivec_2(unsigned long bytes,
-                    unsigned long * __restrict v1_in,
-                    const unsigned long * __restrict v2_in)
+static void __xor_altivec_2(unsigned long bytes,
+               unsigned long * __restrict v1_in,
+               const unsigned long * __restrict v2_in)
 {
        DEFINE(v1);
        DEFINE(v2);
@@ -68,10 +69,10 @@ void __xor_altivec_2(unsigned long bytes,
        } while (--lines > 0);
 }
 
-void __xor_altivec_3(unsigned long bytes,
-                    unsigned long * __restrict v1_in,
-                    const unsigned long * __restrict v2_in,
-                    const unsigned long * __restrict v3_in)
+static void __xor_altivec_3(unsigned long bytes,
+               unsigned long * __restrict v1_in,
+               const unsigned long * __restrict v2_in,
+               const unsigned long * __restrict v3_in)
 {
        DEFINE(v1);
        DEFINE(v2);
@@ -92,11 +93,11 @@ void __xor_altivec_3(unsigned long bytes,
        } while (--lines > 0);
 }
 
-void __xor_altivec_4(unsigned long bytes,
-                    unsigned long * __restrict v1_in,
-                    const unsigned long * __restrict v2_in,
-                    const unsigned long * __restrict v3_in,
-                    const unsigned long * __restrict v4_in)
+static void __xor_altivec_4(unsigned long bytes,
+               unsigned long * __restrict v1_in,
+               const unsigned long * __restrict v2_in,
+               const unsigned long * __restrict v3_in,
+               const unsigned long * __restrict v4_in)
 {
        DEFINE(v1);
        DEFINE(v2);
@@ -121,12 +122,12 @@ void __xor_altivec_4(unsigned long bytes,
        } while (--lines > 0);
 }
 
-void __xor_altivec_5(unsigned long bytes,
-                    unsigned long * __restrict v1_in,
-                    const unsigned long * __restrict v2_in,
-                    const unsigned long * __restrict v3_in,
-                    const unsigned long * __restrict v4_in,
-                    const unsigned long * __restrict v5_in)
+static void __xor_altivec_5(unsigned long bytes,
+               unsigned long * __restrict v1_in,
+               const unsigned long * __restrict v2_in,
+               const unsigned long * __restrict v3_in,
+               const unsigned long * __restrict v4_in,
+               const unsigned long * __restrict v5_in)
 {
        DEFINE(v1);
        DEFINE(v2);
@@ -154,3 +155,6 @@ void __xor_altivec_5(unsigned long bytes,
                v5 += 4;
        } while (--lines > 0);
 }
+
+__DO_XOR_BLOCKS(altivec_inner, __xor_altivec_2, __xor_altivec_3,
+               __xor_altivec_4, __xor_altivec_5);
index 573c41d90dac5297f0eac9859f5b718931f2f1b3..1d26c1133a86856f086df00fab4974d016eec41a 100644 (file)
@@ -6,17 +6,5 @@
  * outside of the enable/disable altivec block.
  */
 
-void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-                    const unsigned long * __restrict p2);
-void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-                    const unsigned long * __restrict p2,
-                    const unsigned long * __restrict p3);
-void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-                    const unsigned long * __restrict p2,
-                    const unsigned long * __restrict p3,
-                    const unsigned long * __restrict p4);
-void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-                    const unsigned long * __restrict p2,
-                    const unsigned long * __restrict p3,
-                    const unsigned long * __restrict p4,
-                    const unsigned long * __restrict p5);
+void xor_gen_altivec_inner(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes);
index 56e99ddfb64f61dd421840ef6229e970858b9a29..dbfbb5cadc36afaa62d9a17da24129c2cf645854 100644 (file)
 #include "xor_arch.h"
 #include "xor_vmx.h"
 
-static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2)
+static void xor_gen_altivec(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes)
 {
        preempt_disable();
        enable_kernel_altivec();
-       __xor_altivec_2(bytes, p1, p2);
-       disable_kernel_altivec();
-       preempt_enable();
-}
-
-static void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3)
-{
-       preempt_disable();
-       enable_kernel_altivec();
-       __xor_altivec_3(bytes, p1, p2, p3);
-       disable_kernel_altivec();
-       preempt_enable();
-}
-
-static void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4)
-{
-       preempt_disable();
-       enable_kernel_altivec();
-       __xor_altivec_4(bytes, p1, p2, p3, p4);
-       disable_kernel_altivec();
-       preempt_enable();
-}
-
-static void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-               const unsigned long * __restrict p2,
-               const unsigned long * __restrict p3,
-               const unsigned long * __restrict p4,
-               const unsigned long * __restrict p5)
-{
-       preempt_disable();
-       enable_kernel_altivec();
-       __xor_altivec_5(bytes, p1, p2, p3, p4, p5);
+       xor_gen_altivec_inner(dest, srcs, src_cnt, bytes);
        disable_kernel_altivec();
        preempt_enable();
 }
 
 struct xor_block_template xor_block_altivec = {
-       .name = "altivec",
-       .do_2 = xor_altivec_2,
-       .do_3 = xor_altivec_3,
-       .do_4 = xor_altivec_4,
-       .do_5 = xor_altivec_5,
+       .name           = "altivec",
+       .xor_gen        = xor_gen_altivec,
 };
index 060e5f22ebcce1e70b6d7c289a64aa75745f3b27..2e4c1b05d998fac0a9a695a6984e7840d0d80db4 100644 (file)
@@ -9,48 +9,17 @@
 #include "xor_impl.h"
 #include "xor_arch.h"
 
-static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
-                        const unsigned long *__restrict p2)
-{
-       kernel_vector_begin();
-       xor_regs_2_(bytes, p1, p2);
-       kernel_vector_end();
-}
-
-static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1,
-                        const unsigned long *__restrict p2,
-                        const unsigned long *__restrict p3)
-{
-       kernel_vector_begin();
-       xor_regs_3_(bytes, p1, p2, p3);
-       kernel_vector_end();
-}
-
-static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1,
-                        const unsigned long *__restrict p2,
-                        const unsigned long *__restrict p3,
-                        const unsigned long *__restrict p4)
-{
-       kernel_vector_begin();
-       xor_regs_4_(bytes, p1, p2, p3, p4);
-       kernel_vector_end();
-}
+DO_XOR_BLOCKS(vector_inner, xor_regs_2_, xor_regs_3_, xor_regs_4_, xor_regs_5_);
 
-static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1,
-                        const unsigned long *__restrict p2,
-                        const unsigned long *__restrict p3,
-                        const unsigned long *__restrict p4,
-                        const unsigned long *__restrict p5)
+static void xor_gen_vector(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes)
 {
        kernel_vector_begin();
-       xor_regs_5_(bytes, p1, p2, p3, p4, p5);
+       xor_gen_vector_inner(dest, srcs, src_cnt, bytes);
        kernel_vector_end();
 }
 
 struct xor_block_template xor_block_rvv = {
-       .name = "rvv",
-       .do_2 = xor_vector_2,
-       .do_3 = xor_vector_3,
-       .do_4 = xor_vector_4,
-       .do_5 = xor_vector_5
+       .name           = "rvv",
+       .xor_gen        = xor_gen_vector,
 };
index c28cb56fec92bb3ea0fa024d25b2cd5fc9f22dd3..0c478678a1291ff0d2fd82703e899bafa151d652 100644 (file)
@@ -125,10 +125,9 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
                : : "0", "cc", "memory");
 }
 
+DO_XOR_BLOCKS(xc, xor_xc_2, xor_xc_3, xor_xc_4, xor_xc_5);
+
 struct xor_block_template xor_block_xc = {
-       .name = "xc",
-       .do_2 = xor_xc_2,
-       .do_3 = xor_xc_3,
-       .do_4 = xor_xc_4,
-       .do_5 = xor_xc_5,
+       .name           = "xc",
+       .xor_gen        = xor_gen_xc,
 };
index 307c4a84f535d4607cac89e74ca55e33f7a9b943..fb37631e90e6970d52fa9432b225800f3bf7b9f4 100644 (file)
@@ -244,10 +244,9 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
+DO_XOR_BLOCKS(sparc32, sparc_2, sparc_3, sparc_4, sparc_5);
+
 struct xor_block_template xor_block_SPARC = {
-       .name   = "SPARC",
-       .do_2   = sparc_2,
-       .do_3   = sparc_3,
-       .do_4   = sparc_4,
-       .do_5   = sparc_5,
+       .name           = "SPARC",
+       .xor_gen        = xor_gen_sparc32,
 };
index 5f90c2460b54cff5ce16f86d130db3f89c2d8a4b..a8a686e0d25830838dd69ae4d98d3f0a408cd442 100644 (file)
@@ -28,12 +28,11 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
 
 /* XXX Ugh, write cheetah versions... -DaveM */
 
+DO_XOR_BLOCKS(vis, xor_vis_2, xor_vis_3, xor_vis_4, xor_vis_5);
+
 struct xor_block_template xor_block_VIS = {
-        .name  = "VIS",
-        .do_2  = xor_vis_2,
-        .do_3  = xor_vis_3,
-        .do_4  = xor_vis_4,
-        .do_5  = xor_vis_5,
+        .name          = "VIS",
+       .xor_gen        = xor_gen_vis,
 };
 
 void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -51,10 +50,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
                   const unsigned long * __restrict p4,
                   const unsigned long * __restrict p5);
 
+DO_XOR_BLOCKS(niagara, xor_niagara_2, xor_niagara_3, xor_niagara_4,
+               xor_niagara_5);
+
 struct xor_block_template xor_block_niagara = {
-        .name  = "Niagara",
-        .do_2  = xor_niagara_2,
-        .do_3  = xor_niagara_3,
-        .do_4  = xor_niagara_4,
-        .do_5  = xor_niagara_5,
+        .name          = "Niagara",
+       .xor_gen        = xor_gen_niagara,
 };
index d411efa1ff4355c06273d800d40ff3c64480a152..f7777d7aa269bdc175aeee41814957137ec149d0 100644 (file)
@@ -29,8 +29,6 @@ static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
 {
        unsigned long lines = bytes >> 9;
 
-       kernel_fpu_begin();
-
        while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -47,8 +45,6 @@ do { \
                p0 = (unsigned long *)((uintptr_t)p0 + 512);
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
        }
-
-       kernel_fpu_end();
 }
 
 static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
@@ -57,8 +53,6 @@ static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
 {
        unsigned long lines = bytes >> 9;
 
-       kernel_fpu_begin();
-
        while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -78,8 +72,6 @@ do { \
                p1 = (unsigned long *)((uintptr_t)p1 + 512);
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
        }
-
-       kernel_fpu_end();
 }
 
 static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
@@ -89,8 +81,6 @@ static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
 {
        unsigned long lines = bytes >> 9;
 
-       kernel_fpu_begin();
-
        while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -113,8 +103,6 @@ do { \
                p2 = (unsigned long *)((uintptr_t)p2 + 512);
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
        }
-
-       kernel_fpu_end();
 }
 
 static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
@@ -125,8 +113,6 @@ static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
 {
        unsigned long lines = bytes >> 9;
 
-       kernel_fpu_begin();
-
        while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -152,14 +138,19 @@ do { \
                p3 = (unsigned long *)((uintptr_t)p3 + 512);
                p4 = (unsigned long *)((uintptr_t)p4 + 512);
        }
+}
+
+DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
 
+static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
+                       unsigned int bytes)
+{
+       kernel_fpu_begin();
+       xor_gen_avx_inner(dest, srcs, src_cnt, bytes);
        kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_avx = {
-       .name = "avx",
-       .do_2 = xor_avx_2,
-       .do_3 = xor_avx_3,
-       .do_4 = xor_avx_4,
-       .do_5 = xor_avx_5,
+       .name           = "avx",
+       .xor_gen        = xor_gen_avx,
 };
index e48c58f92874089f19f34745d34bfc1adb93aba4..63a8b0444fcef1c0d24a18b3a7d7439550e06d9a 100644 (file)
@@ -21,8 +21,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 7;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                               \
@@ -55,8 +53,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2)
        :
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -66,8 +62,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 7;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                               \
@@ -105,8 +99,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -117,8 +109,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 7;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                               \
@@ -161,8 +151,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");
-
-       kernel_fpu_end();
 }
 
 
@@ -175,8 +163,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 7;
 
-       kernel_fpu_begin();
-
        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
@@ -237,8 +223,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));
-
-       kernel_fpu_end();
 }
 
 #undef LD
@@ -255,8 +239,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 6;
 
-       kernel_fpu_begin();
-
        asm volatile(
        " .align 32                  ;\n"
        " 1:                         ;\n"
@@ -293,8 +275,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2)
        :
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -304,8 +284,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 6;
 
-       kernel_fpu_begin();
-
        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
@@ -351,8 +329,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory" );
-
-       kernel_fpu_end();
 }
 
 static void
@@ -363,8 +339,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 6;
 
-       kernel_fpu_begin();
-
        asm volatile(
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
@@ -419,8 +393,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -432,8 +404,6 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 6;
 
-       kernel_fpu_begin();
-
        /* Make sure GCC forgets anything it knows about p4 or p5,
           such that it won't pass to the asm volatile below a
           register that is shared with any other variable.  That's
@@ -510,22 +480,36 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
           Clobber them just to be sure nobody does something stupid
           like assuming they have some legal value.  */
        asm("" : "=r" (p4), "=r" (p5));
+}
+
+DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4,
+               xor_pII_mmx_5);
 
+static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes)
+{
+       kernel_fpu_begin();
+       xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes);
        kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_pII_mmx = {
-       .name = "pII_mmx",
-       .do_2 = xor_pII_mmx_2,
-       .do_3 = xor_pII_mmx_3,
-       .do_4 = xor_pII_mmx_4,
-       .do_5 = xor_pII_mmx_5,
+       .name           = "pII_mmx",
+       .xor_gen        = xor_gen_pII_mmx,
 };
 
+DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4,
+               xor_p5_mmx_5);
+
+static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt,
+               unsigned int bytes)
+{
+       kernel_fpu_begin();
+       xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes);
+       kernel_fpu_end();
+}
+
 struct xor_block_template xor_block_p5_mmx = {
-       .name = "p5_mmx",
-       .do_2 = xor_p5_mmx_2,
-       .do_3 = xor_p5_mmx_3,
-       .do_4 = xor_p5_mmx_4,
-       .do_5 = xor_p5_mmx_5,
+       .name           = "p5_mmx",
+       .xor_gen        = xor_gen_p5_mmx,
 };
index 5993ed688c15539fcb241a96fe2d7af83d0cbf59..c6626ecae6ba5d323206cacc2a6c70c756667e24 100644 (file)
@@ -51,8 +51,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                                       \
@@ -93,8 +91,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -103,8 +99,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                       \
@@ -128,8 +122,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -139,8 +131,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -188,8 +178,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -199,8 +187,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                       \
@@ -226,8 +212,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -238,8 +222,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -294,8 +276,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -306,8 +286,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                       \
@@ -335,8 +313,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -348,8 +324,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -411,8 +385,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
-
-       kernel_fpu_end();
 }
 
 static void
@@ -424,8 +396,6 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
        unsigned long lines = bytes >> 8;
 
-       kernel_fpu_begin();
-
        asm volatile(
 #undef BLOCK
 #define BLOCK(i)                       \
@@ -455,22 +425,35 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");
+}
+
+DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5);
 
+static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt,
+                       unsigned int bytes)
+{
+       kernel_fpu_begin();
+       xor_gen_sse_inner(dest, srcs, src_cnt, bytes);
        kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_sse = {
-       .name = "sse",
-       .do_2 = xor_sse_2,
-       .do_3 = xor_sse_3,
-       .do_4 = xor_sse_4,
-       .do_5 = xor_sse_5,
+       .name           = "sse",
+       .xor_gen        = xor_gen_sse,
 };
 
+DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64,
+               xor_sse_5_pf64);
+
+static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt,
+                       unsigned int bytes)
+{
+       kernel_fpu_begin();
+       xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes);
+       kernel_fpu_end();
+}
+
 struct xor_block_template xor_block_sse_pf64 = {
-       .name = "prefetch64-sse",
-       .do_2 = xor_sse_2_pf64,
-       .do_3 = xor_sse_3_pf64,
-       .do_4 = xor_sse_4_pf64,
-       .do_5 = xor_sse_5_pf64,
+       .name           = "prefetch64-sse",
+       .xor_gen        = xor_gen_sse_pf64,
 };
index 2856a8e50cb840847d8b54865035c919f899c479..ade2a7d8cbe2aeed19194761d29ad5ab50d2b8dc 100644 (file)
@@ -258,10 +258,10 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
                goto once_more;
 }
 
+DO_XOR_BLOCKS(32regs_p, xor_32regs_p_2, xor_32regs_p_3, xor_32regs_p_4,
+               xor_32regs_p_5);
+
 struct xor_block_template xor_block_32regs_p = {
-       .name = "32regs_prefetch",
-       .do_2 = xor_32regs_p_2,
-       .do_3 = xor_32regs_p_3,
-       .do_4 = xor_32regs_p_4,
-       .do_5 = xor_32regs_p_5,
+       .name           = "32regs_prefetch",
+       .xor_gen        = xor_gen_32regs_p,
 };
index cc44d64032fa77cad16de120f01aeda9a130d390..acb4a10d1e95bd7fb8de60106d0b5c08b889d8cd 100644 (file)
@@ -209,10 +209,9 @@ xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
        } while (--lines > 0);
 }
 
+DO_XOR_BLOCKS(32regs, xor_32regs_2, xor_32regs_3, xor_32regs_4, xor_32regs_5);
+
 struct xor_block_template xor_block_32regs = {
-       .name = "32regs",
-       .do_2 = xor_32regs_2,
-       .do_3 = xor_32regs_3,
-       .do_4 = xor_32regs_4,
-       .do_5 = xor_32regs_5,
+       .name           = "32regs",
+       .xor_gen        = xor_gen_32regs,
 };
index 1d53aec50d27b6b59e32d8ddd6585c09f7d22d56..451527a951b1a2440584a7bfe13cdc2df6cc0d39 100644 (file)
@@ -136,10 +136,11 @@ xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
                goto once_more;
 }
 
+
+DO_XOR_BLOCKS(8regs_p, xor_8regs_p_2, xor_8regs_p_3, xor_8regs_p_4,
+               xor_8regs_p_5);
+
 struct xor_block_template xor_block_8regs_p = {
-       .name = "8regs_prefetch",
-       .do_2 = xor_8regs_p_2,
-       .do_3 = xor_8regs_p_3,
-       .do_4 = xor_8regs_p_4,
-       .do_5 = xor_8regs_p_5,
+       .name           = "8regs_prefetch",
+       .xor_gen        = xor_gen_8regs_p,
 };
index 72a44e898c5513739a980945fe72d9bf5de03e3a..1edaed8acffe60b8c6343696b915def31d6d32dd 100644 (file)
@@ -94,11 +94,10 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
 }
 
 #ifndef NO_TEMPLATE
+DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
+
 struct xor_block_template xor_block_8regs = {
-       .name = "8regs",
-       .do_2 = xor_8regs_2,
-       .do_3 = xor_8regs_3,
-       .do_4 = xor_8regs_4,
-       .do_5 = xor_8regs_5,
+       .name           = "8regs",
+       .xor_gen        = xor_gen_8regs,
 };
 #endif /* NO_TEMPLATE */
index 2e46b6b83b0af291b85bd392a8e040eb31deed36..9e043d8c3a7ab979e4391ebd87811ba505fe327f 100644 (file)
 #include <linux/preempt.h>
 #include "xor_impl.h"
 
-/* The xor routines to use.  */
+/* The xor routine to use.  */
 static struct xor_block_template *active_template;
 
-void
-xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
-{
-       unsigned long *p1, *p2, *p3, *p4;
-
-       WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count());
-
-       p1 = (unsigned long *) srcs[0];
-       if (src_count == 1) {
-               active_template->do_2(bytes, dest, p1);
-               return;
-       }
-
-       p2 = (unsigned long *) srcs[1];
-       if (src_count == 2) {
-               active_template->do_3(bytes, dest, p1, p2);
-               return;
-       }
-
-       p3 = (unsigned long *) srcs[2];
-       if (src_count == 3) {
-               active_template->do_4(bytes, dest, p1, p2, p3);
-               return;
-       }
-
-       p4 = (unsigned long *) srcs[3];
-       active_template->do_5(bytes, dest, p1, p2, p3, p4);
-}
-EXPORT_SYMBOL(xor_blocks);
-
 /**
  * xor_gen - generate RAID-style XOR information
  * @dest:      destination vector
@@ -63,20 +33,11 @@ EXPORT_SYMBOL(xor_blocks);
  */
 void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes)
 {
-       unsigned int src_off = 0;
-
-       WARN_ON_ONCE(in_interrupt());
+       WARN_ON_ONCE(!in_task() || irqs_disabled() || softirq_count());
        WARN_ON_ONCE(bytes == 0);
        WARN_ON_ONCE(bytes & 511);
 
-       while (src_cnt > 0) {
-               unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS);
-
-               xor_blocks(this_cnt, bytes, dest, srcs + src_off);
-
-               src_cnt -= this_cnt;
-               src_off += this_cnt;
-       }
+       active_template->xor_gen(dest, srcs, src_cnt, bytes);
 }
 EXPORT_SYMBOL(xor_gen);
 
@@ -120,6 +81,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
        int speed;
        unsigned long reps;
        ktime_t min, start, t0;
+       void *srcs[1] = { b2 };
 
        preempt_disable();
 
@@ -130,7 +92,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
                cpu_relax();
        do {
                mb(); /* prevent loop optimization */
-               tmpl->do_2(BENCH_SIZE, b1, b2);
+               tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE);
                mb();
        } while (reps++ < REPS || (t0 = ktime_get()) == start);
        min = ktime_sub(t0, start);
index 44b6c99e2093e18b838f79dbf0f75af17f6faa5c..09ae2916f71ecb68321e0f9befdae6a0d08be274 100644 (file)
@@ -3,27 +3,47 @@
 #define _XOR_IMPL_H
 
 #include <linux/init.h>
+#include <linux/minmax.h>
 
 struct xor_block_template {
        struct xor_block_template *next;
        const char *name;
        int speed;
-       void (*do_2)(unsigned long, unsigned long * __restrict,
-                    const unsigned long * __restrict);
-       void (*do_3)(unsigned long, unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict);
-       void (*do_4)(unsigned long, unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict);
-       void (*do_5)(unsigned long, unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict,
-                    const unsigned long * __restrict);
+       void (*xor_gen)(void *dest, void **srcs, unsigned int src_cnt,
+                       unsigned int bytes);
 };
 
+#define __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \
+void                                                           \
+xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt,         \
+               unsigned int bytes)                                     \
+{                                                                      \
+       unsigned int src_off = 0;                                       \
+                                                                       \
+       while (src_cnt > 0) {                                           \
+               unsigned int this_cnt = min(src_cnt, 4);                \
+                                                                       \
+               if (this_cnt == 1)                                      \
+                       _handle1(bytes, dest, srcs[src_off]);           \
+               else if (this_cnt == 2)                                 \
+                       _handle2(bytes, dest, srcs[src_off],            \
+                               srcs[src_off + 1]);                     \
+               else if (this_cnt == 3)                                 \
+                       _handle3(bytes, dest, srcs[src_off],            \
+                               srcs[src_off + 1], srcs[src_off + 2]);  \
+               else                                                    \
+                       _handle4(bytes, dest, srcs[src_off],            \
+                               srcs[src_off + 1], srcs[src_off + 2],   \
+                               srcs[src_off + 3]);                     \
+                                                                       \
+               src_cnt -= this_cnt;                                    \
+               src_off += this_cnt;                                    \
+       }                                                               \
+}
+
+#define DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)   \
+       static __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)
+
 /* generic implementations */
 extern struct xor_block_template xor_block_8regs;
 extern struct xor_block_template xor_block_32regs;