git.ipfire.org Git - thirdparty/ipxe.git/commitdiff
[loong64] Replace optimised string operations
author: Michael Brown <mcb30@ipxe.org>
Thu, 21 May 2026 14:19:10 +0000 (15:19 +0100)
committer: Michael Brown <mcb30@ipxe.org>
Thu, 21 May 2026 14:21:20 +0000 (15:21 +0100)
The current implementation of the optimised string operations appears
to have been ported from the (old) arm64 implementation, and does not
cleanly match the LoongArch64 instruction set.

Replace with code derived from the riscv64 implementation, modified to
use indexed load and store instructions.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
src/arch/loong64/core/loong64_string.c
src/arch/loong64/include/bits/string.h

index 941b7e2b121676f235b0b416ef509fd81479d638..79ef316ec1a1a30db3208d9dc92171d9f37fa348 100644 (file)
@@ -1,6 +1,5 @@
 /*
- * Copyright (C) 2016 Michael Brown <mbrown@fensystems.co.uk>.
- * Copyright (c) 2023, Xiaotian Wu <wuxiaotian@loongson.cn>
+ * Copyright (C) 2026 Michael Brown <mbrown@fensystems.co.uk>.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -29,6 +28,7 @@
  */
 
 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+FILE_SECBOOT ( PERMITTED );
 
 #include <string.h>
 
@@ -41,68 +41,65 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
  * @ret dest           Destination address
  */
 void loong64_memcpy ( void *dest, const void *src, size_t len ) {
-       void *discard_dest;
-       void *discard_end;
-       const void *discard_src;
-       size_t discard_offset;
+       size_t len_pre;
+       size_t len_mid;
+       size_t len_post;
+       size_t offset;
        unsigned long discard_data;
-       unsigned long discard_low;
-       unsigned long discard_high;
 
-       /* If length is too short, then just copy individual bytes.
+       /* Calculate pre-aligned, aligned, and post-aligned lengths.
+        * (Align on the destination address, on the assumption that
+        * misaligned stores are likely to be more expensive than
+        * misaligned loads.)
         */
-       if ( len < 16 ) {
-               __asm__ __volatile__ ( "beqz %0, 2f\n\t"
-                                      "\n1:\n\t"
-                                      "addi.d %0, %0, -1\n\t"
-                                      "ldx.b %1, %3, %0\n\t"
-                                      "stx.b %1, %2, %0\n\t"
-                                      "bnez %0, 1b\n\t"
-                                      "\n2:\n\t"
-                                      : "=&r" ( discard_offset ),
-                                        "=&r" ( discard_data )
-                                      : "r" ( dest ), "r" ( src ), "0" ( len )
-                                      : "memory", "t0" );
-               return;
-       }
+       len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
+                   ( sizeof ( unsigned long ) - 1 ) );
+       if ( len_pre > len )
+               len_pre = len;
+       len -= len_pre;
+       len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
+       len -= len_mid;
+       len_post = len;
+
+       /* Copy pre-aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
+                              "\n1:\n\t"
+                              "ldx.b %1, %3, %0\n\t"
+                              "stx.b %1, %2, %0\n\t"
+                              "addi.d %0, %0, 1\n\t"
+                              "\n2:\n\t"
+                              "bne %0, %4, 1b\n\t"
+                              : "=&r" ( offset ), "=&r" ( discard_data )
+                              : "r" ( dest ), "r" ( src ), "r" ( len_pre ),
+                                "0" ( 0 )
+                              : "memory" );
 
-       /* Copy 16 bytes at a time: one initial
-        * potentially unaligned access, multiple destination-aligned
-        * accesses, one final potentially unaligned access.
-        */
-       __asm__ __volatile__ ( "ld.d %3, %1, 0\n\t"
-                              "ld.d %4, %1, 8\n\t"
-                              "addi.d %1, %1, 16\n\t"
-                              "st.d %3, %0, 0\n\t"
-                              "st.d %4, %0, 8\n\t"
-                              "addi.d %0, %0, 16\n\t"
-                              "andi %3, %0, 15\n\t"
-                              "sub.d %0, %0, %3\n\t"
-                              "sub.d %1, %1, %3\n\t"
-                              "addi.d $t0, $zero, 0xf\n\t"
-                              "andn %2, %5, $t0\n\t"
-                              "b 2f\n\t"
+       /* Copy aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
                               "\n1:\n\t"
-                              "ld.d %3, %1, 0\n\t"
-                              "ld.d %4, %1, 8\n\t"
-                              "addi.d %1, %1, 16\n\t"
-                              "st.d %3, %0, 0\n\t"
-                              "st.d %4, %0, 8\n\t"
-                              "addi.d %0, %0, 16\n\t"
+                              "ldx.d %1, %3, %0\n\t"
+                              "stx.d %1, %2, %0\n\t"
+                              "addi.d %0, %0, %5\n\t"
                               "\n2:\n\t"
-                              "bne %0, %2, 1b\n\t"
-                              "ld.d %3, %6, -16\n\t"
-                              "ld.d %4, %6, -8\n\t"
-                              "st.d %3, %5, -16\n\t"
-                              "st.d %4, %5, -8\n\t"
-                              : "=&r" ( discard_dest ),
-                                "=&r" ( discard_src ),
-                                "=&r" ( discard_end ),
-                                "=&r" ( discard_low ),
-                                "=&r" ( discard_high )
-                              : "r" ( dest + len ), "r" ( src + len ),
-                                "0" ( dest ), "1" ( src )
-                              : "memory", "t0" );
+                              "bne %0, %4, 1b\n\t"
+                              : "+r" ( offset ), "=&r" ( discard_data )
+                              : "r" ( dest ), "r" ( src ),
+                                "r" ( offset + len_mid ),
+                                "i" ( sizeof ( unsigned long ) )
+                              : "memory" );
+
+       /* Copy post-aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
+                              "\n1:\n\t"
+                              "ldx.b %1, %3, %0\n\t"
+                              "stx.b %1, %2, %0\n\t"
+                              "addi.d %0, %0, 1\n\t"
+                              "\n2:\n\t"
+                              "bne %0, %4, 1b\n\t"
+                              : "+r" ( offset ), "=&r" ( discard_data )
+                              : "r" ( dest ), "r" ( src ),
+                                "r" ( offset + len_post )
+                              : "memory" );
 }
 
 /**
@@ -112,50 +109,54 @@ void loong64_memcpy ( void *dest, const void *src, size_t len ) {
  * @v len              Length
  */
 void loong64_bzero ( void *dest, size_t len ) {
-       size_t discard_offset;
-       void *discard_dest;
-       void *discard_end;
-
-       /* If length is too short, then just zero individual bytes.
-        */
-       if ( len < 16 ) {
-               __asm__ __volatile__ ( "beqz %0, 2f\n\t"
-                                      "\n1:\n\t"
-                                      "addi.d %0, %0, -1\n\t"
-                                      "stx.b $zero, %1, %0\n\t"
-                                      "bnez %0, 1b\n\t"
-                                      "\n2:\n\t"
-                                      : "=&r" ( discard_offset )
-                                      : "r" ( dest ), "0" ( len )
-                                      : "memory" );
-               return;
-       }
+       size_t len_pre;
+       size_t len_mid;
+       size_t len_post;
+       size_t offset;
+
+       /* Calculate pre-aligned, aligned, and post-aligned lengths */
+       len_pre = ( ( sizeof ( unsigned long ) - ( ( intptr_t ) dest ) ) &
+                   ( sizeof ( unsigned long ) - 1 ) );
+       if ( len_pre > len )
+               len_pre = len;
+       len -= len_pre;
+       len_mid = ( len & ~( sizeof ( unsigned long ) - 1 ) );
+       len -= len_mid;
+       len_post = len;
+
+       /* Zero pre-aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
+                              "\n1:\n\t"
+                              "stx.b $zero, %1, %0\n\t"
+                              "addi.d %0, %0, 1\n\t"
+                              "\n2:\n\t"
+                              "bne %0, %2, 1b\n\t"
+                              : "=&r" ( offset )
+                              : "r" ( dest ), "r" ( len_pre ), "0" ( 0 )
+                              : "memory" );
 
-       /* To zero 16 bytes at a time: one initial
-        * potentially unaligned access, multiple aligned accesses,
-        * one final potentially unaligned access.
-        */
+       /* Zero aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
+                              "\n1:\n\t"
+                              "stx.d $zero, %1, %0\n\t"
+                              "addi.d %0, %0, %3\n\t"
+                              "\n2:\n\t"
+                              "bne %0, %2, 1b\n\t"
+                              : "+r" ( offset )
+                              : "r" ( dest ), "r" ( offset + len_mid ),
+                                "i" ( sizeof ( unsigned long ) )
+                              : "memory" );
 
-       __asm__ __volatile__ ( "st.d $zero, %0, 0\n\t"
-                              "st.d $zero, %0, 8\n\t"
-                              "addi.d %0, %0, 16\n\t"
-                              "addi.w $t0, $zero, 15\n\t"
-                              "andn %0, %0, $t0\n\t"
-                              "addi.w $t0, $zero, 15\n\t"
-                              "andn %1, %2, $t0\n\t"
-                              "b 2f\n\t"
+       /* Zero post-aligned section */
+       __asm__ __volatile__ ( "b 2f\n\t"
                               "\n1:\n\t"
-                              "st.d $zero, %0, 0\n\t"
-                              "st.d $zero, %0, 8\n\t"
-                              "addi.d %0, %0, 16\n\t"
+                              "stx.b $zero, %1, %0\n\t"
+                              "addi.d %0, %0, 1\n\t"
                               "\n2:\n\t"
-                              "bne %0, %1, 1b\n\t"
-                              "st.d $zero, %2, -16\n\t"
-                              "st.d $zero, %2, -8\n\t"
-                              : "=&r" ( discard_dest ),
-                                "=&r" ( discard_end )
-                              : "r" ( dest + len ), "0" ( dest )
-                              : "memory", "t0" );
+                              "bne %0, %2, 1b\n\t"
+                              : "+r" ( offset )
+                              : "r" ( dest ), "r" ( offset + len_post )
+                              : "memory" );
 }
 
 /**
@@ -166,10 +167,14 @@ void loong64_bzero ( void *dest, size_t len ) {
  * @v character                Fill character
  *
  * The unusual parameter order is to allow for more efficient
- * tail-calling to loong64_memset() when zeroing a region.
+ * tail-calling to loong64_bzero() when zeroing a region.
  */
 void loong64_memset ( void *dest, size_t len, int character ) {
-       size_t discard_offset;
+       size_t offset;
+
+       /* Do nothing if length is zero */
+       if ( ! len )
+               return;
 
        /* Use optimised zeroing code if applicable */
        if ( character == 0 ) {
@@ -181,86 +186,49 @@ void loong64_memset ( void *dest, size_t len, int character ) {
         * value is relatively rare and unlikely to be
         * performance-critical.
         */
-       __asm__ __volatile__ ( "beqz %0, 2f\n\t"
-                              "\n1:\n\t"
-                              "addi.d %0, %0, -1\n\t"
+       __asm__ __volatile__ ( "\n1:\n\t"
                               "stx.b %2, %1, %0\n\t"
-                              "bnez %0, 1b\n\t"
-                              "\n2:\n\t"
-                              : "=&r" ( discard_offset )
-                              : "r" ( dest ), "r" ( character ), "0" ( len )
-                              : "memory" );
-}
-
-/**
- * Copy (possibly overlapping) memory region forwards
- *
- * @v dest             Destination region
- * @v src              Source region
- * @v len              Length
- */
-void loong64_memmove_forwards ( void *dest, const void *src, size_t len ) {
-       void *discard_dest;
-       const void *discard_src;
-       unsigned long discard_data;
-
-       /* Assume memmove() is not performance-critical, and perform a
-        * bytewise copy for simplicity.
-        */
-       __asm__ __volatile__ ( "b 2f\n\t"
-                              "\n1:\n\t"
-                              "ld.b %2, %1, 0\n\t"
-                              "addi.d %1, %1, 1\n\t"
-                              "st.b %2, %0, 0\n\t"
                               "addi.d %0, %0, 1\n\t"
                               "\n2:\n\t"
                               "bne %0, %3, 1b\n\t"
-                              : "=&r" ( discard_dest ),
-                                "=&r" ( discard_src ),
-                                "=&r" ( discard_data )
-                              : "r" ( dest + len ), "0" ( dest ), "1" ( src )
+                              : "=&r" ( offset )
+                              : "r" ( dest ), "r" ( character ), "r" ( len ),
+                                "0" ( 0 )
                               : "memory" );
 }
 
 /**
- * Copy (possibly overlapping) memory region backwards
+ * Copy (possibly overlapping) memory region
  *
  * @v dest             Destination region
  * @v src              Source region
  * @v len              Length
  */
-void loong64_memmove_backwards ( void *dest, const void *src, size_t len ) {
-       size_t discard_offset;
+void loong64_memmove ( void *dest, const void *src, size_t len ) {
+       size_t offset;
        unsigned long discard_data;
 
+       /* Do nothing if length is zero */
+       if ( ! len )
+               return;
+
+       /* Use memcpy() if copy direction is forwards */
+       if ( dest <= src ) {
+               memcpy ( dest, src, len );
+               return;
+       }
+
        /* Assume memmove() is not performance-critical, and perform a
-        * bytewise copy for simplicity.
+        * bytewise copy backwards for simplicity.
         */
-       __asm__ __volatile__ ( "beqz %0, 2f\n\t"
-                              "\n1:\n\t"
+       __asm__ __volatile__ ( "\n1:\n\t"
                               "addi.d %0, %0, -1\n\t"
                               "ldx.b %1, %3, %0\n\t"
                               "stx.b %1, %2, %0\n\t"
-                              "bnez %0, 1b\n\t"
                               "\n2:\n\t"
-                              : "=&r" ( discard_offset ),
-                                "=&r" ( discard_data )
-                              : "r" ( dest ), "r" ( src ), "0" ( len )
+                              "bnez %0, 1b\n\t"
+                              : "=&r" ( offset ), "=&r" ( discard_data )
+                              : "r" ( dest ), "r" ( src ),
+                                "0" ( len )
                               : "memory" );
 }
-
-/**
- * Copy (possibly overlapping) memory region
- *
- * @v dest             Destination region
- * @v src              Source region
- * @v len              Length
- */
-void loong64_memmove ( void *dest, const void *src, size_t len ) {
-
-       if ( dest <= src ) {
-               loong64_memmove_forwards ( dest, src, len );
-       } else {
-               loong64_memmove_backwards ( dest, src, len );
-       }
-}
index 8f8917c46431c6fb01416462d0e79135b591296f..c6bb0811d33b0c9a772812d9411da39972ba7751 100644 (file)
@@ -13,8 +13,6 @@ FILE_SECBOOT ( PERMITTED );
 extern void loong64_bzero ( void *dest, size_t len );
 extern void loong64_memset ( void *dest, size_t len, int character );
 extern void loong64_memcpy ( void *dest, const void *src, size_t len );
-extern void loong64_memmove_forwards ( void *dest, const void *src, size_t len );
-extern void loong64_memmove_backwards ( void *dest, const void *src, size_t len );
 extern void loong64_memmove ( void *dest, const void *src, size_t len );
 
 /**
@@ -27,6 +25,14 @@ extern void loong64_memmove ( void *dest, const void *src, size_t len );
  */
 static inline __attribute__ (( always_inline )) void *
 memset ( void *dest, int character, size_t len ) {
+
+       /* Zeroing: use the optimised variable-length zeroing code */
+       if ( __builtin_constant_p ( character ) && ( character == 0 ) ) {
+               loong64_bzero ( dest, len );
+               return dest;
+       }
+
+       /* Not necessarily zeroing: use basic variable-length code */
        loong64_memset ( dest, len, character );
        return dest;
 }
@@ -41,6 +47,7 @@ memset ( void *dest, int character, size_t len ) {
  */
 static inline __attribute__ (( always_inline )) void *
 memcpy ( void *dest, const void *src, size_t len ) {
+
        loong64_memcpy ( dest, src, len );
        return dest;
 }
@@ -55,6 +62,17 @@ memcpy ( void *dest, const void *src, size_t len ) {
  */
 static inline __attribute__ (( always_inline )) void *
 memmove ( void *dest, const void *src, size_t len ) {
+       ssize_t offset = ( dest - src );
+
+       /* If direction of copy is known to be forwards at build time,
+        * then use variable-length memcpy().
+        */
+       if ( __builtin_constant_p ( offset ) && ( offset <= 0 ) ) {
+               loong64_memcpy ( dest, src, len );
+               return dest;
+       }
+
+       /* Otherwise, use ambidirectional copy */
        loong64_memmove ( dest, src, len );
        return dest;
 }