git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
arm64: lib: Use MOPS for memcpy() routines
author Kristina Martsenko <kristina.martsenko@arm.com>
Mon, 30 Sep 2024 16:10:50 +0000 (17:10 +0100)
committer Catalin Marinas <catalin.marinas@arm.com>
Thu, 17 Oct 2024 15:42:51 +0000 (16:42 +0100)
Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS
instructions when implemented on the CPU.

The CPY*/SET* instructions copy or set a block of memory of arbitrary
size and alignment. They can be interrupted by the CPU and the copying
resumed later. Their performance is expected to be close to the best
generic copy/set sequence of loads/stores for a given CPU. Using them in
the kernel's copy/set routines therefore avoids the need to periodically
rewrite the routines to optimize for new microarchitectures. It could
also lead to a performance improvement for some CPUs and systems.

With this change the kernel will always use the instructions if they are
implemented on the CPU (and have not been disabled by the arm64.nomops
command line parameter). When not implemented the usual routines will be
used (patched via alternatives). Note, we need to patch B/NOP instead of
the whole sequence to avoid executing a partially patched sequence in
case the compiler generates a mem*() call inside the alternatives
patching code.

Note that MOPS instructions have relaxed behavior on Device memory, but
it is expected that these routines are not generally used on MMIO.

Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as
CPY* allows overlaps between the source and destination buffers, and
despite contradicting the C standard, compilers require that memcpy()
work on exactly overlapping source and destination:
  https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language
  https://reviews.llvm.org/D86993

Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Link: https://lore.kernel.org/r/20240930161051.3777828-5-kristina.martsenko@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
arch/arm64/Kconfig
arch/arm64/lib/memcpy.S
arch/arm64/lib/memset.S

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e29b44d2d7bd693c6209383a67b8d0d14345406..d0fe90ea704dd9dfb098920ff9c8a0fe0f2aaabe 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2155,6 +2155,9 @@ config ARM64_EPAN
          if the cpu does not implement the feature.
 endmenu # "ARMv8.7 architectural features"
 
+config AS_HAS_MOPS
+       def_bool $(as-instr,.arch_extension mops)
+
 menu "ARMv8.9 architectural features"
 
 config ARM64_POE
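diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 4ab48d49c451564a4edb24b5a4ff2d158b85be5c..9b99106fb95f1cdd9276f00084e46767a3b7013c 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S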
@@ -57,7 +57,7 @@
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+       .arch_extension mops
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+       b       __pi_memcpy_generic
+alternative_else_nop_endif
+
+       mov     dst, dstin
+       cpyp    [dst]!, [src]!, count!
+       cpym    [dst]!, [src]!, count!
+       cpye    [dst]!, [src]!, count!
+       ret
 SYM_FUNC_END(__pi_memcpy)
+#else
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif
 
 SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
 EXPORT_SYMBOL(__memcpy)
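diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a5aebe82ad73b963d0afd331d68b13e87b5a7f40..97157da65ec6b6fbac41c0627e759fcdf1d7f589 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S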
@@ -26,6 +26,7 @@
  */
 
 dstin          .req    x0
+val_x          .req    x1
 val            .req    w1
 count          .req    x2
 tmp1           .req    x3
@@ -42,7 +43,7 @@ dst           .req    x8
 tmp3w          .req    w9
 tmp3           .req    x9
 
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
        mov     dst, dstin      /* Preserve return value.  */
        and     A_lw, val, #255
        orr     A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+       .arch_extension mops
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+       b       __pi_memset_generic
+alternative_else_nop_endif
+
+       mov     dst, dstin
+       setp    [dst]!, count!, val_x
+       setm    [dst]!, count!, val_x
+       sete    [dst]!, count!, val_x
+       ret
 SYM_FUNC_END(__pi_memset)
+#else
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif
 
 SYM_FUNC_ALIAS(__memset, __pi_memset)
 EXPORT_SYMBOL(__memset)