aarch64: Use memcpy_simd as the default memcpy

author Wilco Dijkstra <wdijkstr@arm.com>
    Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
    Tue, 9 Apr 2024 18:48:23 +0000 (19:48 +0100)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S

index cc8142dd40652be2d99139cbe3a18461d48a8ef2..36353cab21a769b55934147e51d0e788f0759aa8 100644 (file)
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2019 Free Software Foundation, Inc.
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2012-2022 Free Software Foundation, Inc.
  
     This file is part of the GNU C Library.
  
@@ -20,7 +21,7 @@
  
  /* Assumptions:
   *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
   *
   */
  
@@ -36,21 +37,18 @@
  #define B_l    x8
  #define B_lw   w8
  #define B_h    x9
-#define C_l    x10
  #define C_lw   w10
-#define C_h    x11
-#define D_l    x12
-#define D_h    x13
-#define E_l    x14
-#define E_h    x15
-#define F_l    x16
-#define F_h    x17
-#define G_l    count
-#define G_h    dst
-#define H_l    src
-#define H_h    srcend
  #define tmp1   x14
  
+#define A_q    q0
+#define B_q    q1
+#define C_q    q2
+#define D_q    q3
+#define E_q    q4
+#define F_q    q5
+#define G_q    q6
+#define H_q    q7
+
  #ifndef MEMMOVE
  # define MEMMOVE memmove
  #endif
@@ -69,8 +67,7 @@
     Large copies use a software pipelined loop processing 64 bytes per
-   iteration.  The destination pointer is 16-byte aligned to minimize
+   iteration.  The source pointer is 16-byte aligned to minimize
     unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.
-*/
+   from the end.  */
  
  ENTRY_ALIGN (MEMCPY, 6)
         DELOUSE (0)
@@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
         /* Small copies: 0..32 bytes.  */
         cmp     count, 16
         b.lo    L(copy16)
-       ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
         ret
  
         /* Copy 8-15 bytes.  */
@@ -102,7 +99,6 @@ L(copy16):
         str     A_h, [dstend, -8]
         ret
  
-       .p2align 3
         /* Copy 4-7 bytes.  */
  L(copy8):
         tbz     count, 2, L(copy4)
@@ -128,81 +124,62 @@ L(copy0):
         .p2align 4
         /* Medium copies: 33..128 bytes.  */
  L(copy32_128):
-       ldp     A_l, A_h, [src]
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       ldp     D_l, D_h, [srcend, -16]
+       ldp     A_q, B_q, [src]
+       ldp     C_q, D_q, [srcend, -32]
         cmp     count, 64
         b.hi    L(copy128)
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-       stp     D_l, D_h, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       stp     C_q, D_q, [dstend, -32]
         ret
  
         .p2align 4
         /* Copy 65..128 bytes.  */
  L(copy128):
-       ldp     E_l, E_h, [src, 32]
-       ldp     F_l, F_h, [src, 48]
+       ldp     E_q, F_q, [src, 32]
         cmp     count, 96
         b.ls    L(copy96)
-       ldp     G_l, G_h, [srcend, -64]
-       ldp     H_l, H_h, [srcend, -48]
-       stp     G_l, G_h, [dstend, -64]
-       stp     H_l, H_h, [dstend, -48]
+       ldp     G_q, H_q, [srcend, -64]
+       stp     G_q, H_q, [dstend, -64]
  L(copy96):
-       stp     A_l, A_h, [dstin]
-       stp     B_l, B_h, [dstin, 16]
-       stp     E_l, E_h, [dstin, 32]
-       stp     F_l, F_h, [dstin, 48]
-       stp     C_l, C_h, [dstend, -32]
-       stp     D_l, D_h, [dstend, -16]
+       stp     A_q, B_q, [dstin]
+       stp     E_q, F_q, [dstin, 32]
+       stp     C_q, D_q, [dstend, -32]
         ret
  
-       .p2align 4
+       /* Align loop64 below to 16 bytes.  */
+       nop
+
         /* Copy more than 128 bytes.  */
  L(copy_long):
-       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
-       ldp     D_l, D_h, [src]
-       and     tmp1, dstin, 15
-       bic     dst, dstin, 15
-       sub     src, src, tmp1
+       /* Copy 16 bytes and then align src to 16-byte alignment.  */
+       ldr     D_q, [src]
+       and     tmp1, src, 15
+       bic     src, src, 15
+       sub     dst, dstin, tmp1
         add     count, count, tmp1      /* Count is now 16 too large.  */
-       ldp     A_l, A_h, [src, 16]
-       stp     D_l, D_h, [dstin]
-       ldp     B_l, B_h, [src, 32]
-       ldp     C_l, C_h, [src, 48]
-       ldp     D_l, D_h, [src, 64]!
+       ldp     A_q, B_q, [src, 16]
+       str     D_q, [dstin]
+       ldp     C_q, D_q, [src, 48]
         subs    count, count, 128 + 16  /* Test and readjust count.  */
         b.ls    L(copy64_from_end)
-
  L(loop64):
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [src, 16]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [src, 32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [src, 48]
-       stp     D_l, D_h, [dst, 64]!
-       ldp     D_l, D_h, [src, 64]!
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [src, 80]
+       stp     C_q, D_q, [dst, 48]
+       ldp     C_q, D_q, [src, 112]
+       add     src, src, 64
+       add     dst, dst, 64
         subs    count, count, 64
         b.hi    L(loop64)
  
         /* Write the last iteration and copy 64 bytes from the end.  */
  L(copy64_from_end):
-       ldp     E_l, E_h, [srcend, -64]
-       stp     A_l, A_h, [dst, 16]
-       ldp     A_l, A_h, [srcend, -48]
-       stp     B_l, B_h, [dst, 32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dst, 48]
-       ldp     C_l, C_h, [srcend, -16]
-       stp     D_l, D_h, [dst, 64]
-       stp     E_l, E_h, [dstend, -64]
-       stp     A_l, A_h, [dstend, -48]
-       stp     B_l, B_h, [dstend, -32]
-       stp     C_l, C_h, [dstend, -16]
+       ldp     E_q, F_q, [srcend, -64]
+       stp     A_q, B_q, [dst, 16]
+       ldp     A_q, B_q, [srcend, -32]
+       stp     C_q, D_q, [dst, 48]
+       stp     E_q, F_q, [dstend, -64]
+       stp     A_q, B_q, [dstend, -32]
         ret
  
  END (MEMCPY)
@@ -220,64 +197,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
         cmp     count, 32
         b.hi    L(copy32_128)
  
-       /* Small copies: 0..32 bytes.  */
+       /* Small moves: 0..32 bytes.  */
         cmp     count, 16
         b.lo    L(copy16)
-       ldp     A_l, A_h, [src]
-       ldp     D_l, D_h, [srcend, -16]
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
+       ldr     A_q, [src]
+       ldr     B_q, [srcend, -16]
+       str     A_q, [dstin]
+       str     B_q, [dstend, -16]
         ret
  
-       .p2align 4
  L(move_long):
         /* Only use backward copy if there is an overlap.  */
         sub     tmp1, dstin, src
-       cbz     tmp1, L(copy0)
+       cbz     tmp1, L(move0)
         cmp     tmp1, count
         b.hs    L(copy_long)
  
         /* Large backwards copy for overlapping copies.
-          Copy 16 bytes and then align dst to 16-byte alignment.  */
-       ldp     D_l, D_h, [srcend, -16]
-       and     tmp1, dstend, 15
-       sub     srcend, srcend, tmp1
+          Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldr     D_q, [srcend, -16]
+       and     tmp1, srcend, 15
+       bic     srcend, srcend, 15
         sub     count, count, tmp1
-       ldp     A_l, A_h, [srcend, -16]
-       stp     D_l, D_h, [dstend, -16]
-       ldp     B_l, B_h, [srcend, -32]
-       ldp     C_l, C_h, [srcend, -48]
-       ldp     D_l, D_h, [srcend, -64]!
+       ldp     A_q, B_q, [srcend, -32]
+       str     D_q, [dstend, -16]
+       ldp     C_q, D_q, [srcend, -64]
         sub     dstend, dstend, tmp1
         subs    count, count, 128
         b.ls    L(copy64_from_start)
  
  L(loop64_backwards):
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [srcend, -48]
-       stp     D_l, D_h, [dstend, -64]!
-       ldp     D_l, D_h, [srcend, -64]!
+       str     B_q, [dstend, -16]
+       str     A_q, [dstend, -32]
+       ldp     A_q, B_q, [srcend, -96]
+       str     D_q, [dstend, -48]
+       str     C_q, [dstend, -64]!
+       ldp     C_q, D_q, [srcend, -128]
+       sub     srcend, srcend, 64
         subs    count, count, 64
         b.hi    L(loop64_backwards)
  
         /* Write the last iteration and copy 64 bytes from the start.  */
  L(copy64_from_start):
-       ldp     G_l, G_h, [src, 48]
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [src, 32]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [src, 16]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [src]
-       stp     D_l, D_h, [dstend, -64]
-       stp     G_l, G_h, [dstin, 48]
-       stp     A_l, A_h, [dstin, 32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin]
+       ldp     E_q, F_q, [src, 32]
+       stp     A_q, B_q, [dstend, -32]
+       ldp     A_q, B_q, [src]
+       stp     C_q, D_q, [dstend, -64]
+       stp     E_q, F_q, [dstin, 32]
+       stp     A_q, B_q, [dstin]
+L(move0):
         ret
  
  END (MEMMOVE)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile

index b9277158f2aaace21d17d8ce8f2e8d445159de8d..b1a5f59fcd6d729e12c07c1446f067496234cddd 100644 (file)
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,5 @@
  ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
                    memcpy_falkor memmove_falkor memset_generic memset_falkor \
                    strlen_generic strlen_asimd
  endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c

index f6cf3e495d933de0a7b102afb0175940d407ae54..a00c329bca80961fb21f3c5f7737b3ee2e72fbaa 100644 (file)
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,12 +42,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
-             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
    IFUNC_IMPL (i, name, memmove,
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
-             IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
    IFUNC_IMPL (i, name, memset,
               /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c

index 1528d89e5404e09cad1bfcd6d7ba502f24ea5b23..a27d571434073ae1ceaefa252d7d84546b3f2bf5 100644 (file)
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,7 +29,6 @@
  extern __typeof (__redirect_memcpy) __libc_memcpy;
  
  extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
-extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -41,10 +40,7 @@ libc_ifunc (__libc_memcpy,
                 ? __memcpy_falkor
                 : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                   ? __memcpy_thunderx2
-                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-                    || IS_NEOVERSE_V1 (midr)
-                    ? __memcpy_simd
-                    : __memcpy_generic)))));
+                 : __memcpy_generic))));
  
  # undef memcpy
  strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S

deleted file mode 100644 (file)

index 48bb6d7..0000000
--- a/sysdeps/aarch64/multiarch/memcpy_advsimd.S
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Generic optimized memcpy using SIMD.
-   Copyright (C) 2020 Free Software Foundation, Inc.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin  x0
-#define src    x1
-#define count  x2
-#define dst    x3
-#define srcend x4
-#define dstend x5
-#define A_l    x6
-#define A_lw   w6
-#define A_h    x7
-#define B_l    x8
-#define B_lw   w8
-#define B_h    x9
-#define C_lw   w10
-#define tmp1   x14
-
-#define A_q    q0
-#define B_q    q1
-#define C_q    q2
-#define D_q    q3
-#define E_q    q4
-#define F_q    q5
-#define G_q    q6
-#define H_q    q7
-
-
-/* This implementation supports both memcpy and memmove and shares most code.
-   It uses unaligned accesses and branchless sequences to keep the code small,
-   simple and improve performance.
-
-   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
-   copies of up to 128 bytes, and large copies.  The overhead of the overlap
-   check in memmove is negligible since it is only required for large copies.
-
-   Large copies use a software pipelined loop processing 64 bytes per
-   iteration.  The destination pointer is 16-byte aligned to minimize
-   unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.  */
-
-ENTRY (__memcpy_simd)
-       DELOUSE (0)
-       DELOUSE (1)
-       DELOUSE (2)
-
-       add     srcend, src, count
-       add     dstend, dstin, count
-       cmp     count, 128
-       b.hi    L(copy_long)
-       cmp     count, 32
-       b.hi    L(copy32_128)
-
-       /* Small copies: 0..32 bytes.  */
-       cmp     count, 16
-       b.lo    L(copy16)
-       ldr     A_q, [src]
-       ldr     B_q, [srcend, -16]
-       str     A_q, [dstin]
-       str     B_q, [dstend, -16]
-       ret
-
-       /* Copy 8-15 bytes.  */
-L(copy16):
-       tbz     count, 3, L(copy8)
-       ldr     A_l, [src]
-       ldr     A_h, [srcend, -8]
-       str     A_l, [dstin]
-       str     A_h, [dstend, -8]
-       ret
-
-       /* Copy 4-7 bytes.  */
-L(copy8):
-       tbz     count, 2, L(copy4)
-       ldr     A_lw, [src]
-       ldr     B_lw, [srcend, -4]
-       str     A_lw, [dstin]
-       str     B_lw, [dstend, -4]
-       ret
-
-       /* Copy 0..3 bytes using a branchless sequence.  */
-L(copy4):
-       cbz     count, L(copy0)
-       lsr     tmp1, count, 1
-       ldrb    A_lw, [src]
-       ldrb    C_lw, [srcend, -1]
-       ldrb    B_lw, [src, tmp1]
-       strb    A_lw, [dstin]
-       strb    B_lw, [dstin, tmp1]
-       strb    C_lw, [dstend, -1]
-L(copy0):
-       ret
-
-       .p2align 4
-       /* Medium copies: 33..128 bytes.  */
-L(copy32_128):
-       ldp     A_q, B_q, [src]
-       ldp     C_q, D_q, [srcend, -32]
-       cmp     count, 64
-       b.hi    L(copy128)
-       stp     A_q, B_q, [dstin]
-       stp     C_q, D_q, [dstend, -32]
-       ret
-
-       .p2align 4
-       /* Copy 65..128 bytes.  */
-L(copy128):
-       ldp     E_q, F_q, [src, 32]
-       cmp     count, 96
-       b.ls    L(copy96)
-       ldp     G_q, H_q, [srcend, -64]
-       stp     G_q, H_q, [dstend, -64]
-L(copy96):
-       stp     A_q, B_q, [dstin]
-       stp     E_q, F_q, [dstin, 32]
-       stp     C_q, D_q, [dstend, -32]
-       ret
-
-       /* Align loop64 below to 16 bytes.  */
-       nop
-
-       /* Copy more than 128 bytes.  */
-L(copy_long):
-       /* Copy 16 bytes and then align src to 16-byte alignment.  */
-       ldr     D_q, [src]
-       and     tmp1, src, 15
-       bic     src, src, 15
-       sub     dst, dstin, tmp1
-       add     count, count, tmp1      /* Count is now 16 too large.  */
-       ldp     A_q, B_q, [src, 16]
-       str     D_q, [dstin]
-       ldp     C_q, D_q, [src, 48]
-       subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    L(copy64_from_end)
-L(loop64):
-       stp     A_q, B_q, [dst, 16]
-       ldp     A_q, B_q, [src, 80]
-       stp     C_q, D_q, [dst, 48]
-       ldp     C_q, D_q, [src, 112]
-       add     src, src, 64
-       add     dst, dst, 64
-       subs    count, count, 64
-       b.hi    L(loop64)
-
-       /* Write the last iteration and copy 64 bytes from the end.  */
-L(copy64_from_end):
-       ldp     E_q, F_q, [srcend, -64]
-       stp     A_q, B_q, [dst, 16]
-       ldp     A_q, B_q, [srcend, -32]
-       stp     C_q, D_q, [dst, 48]
-       stp     E_q, F_q, [dstend, -64]
-       stp     A_q, B_q, [dstend, -32]
-       ret
-
-END (__memcpy_simd)
-libc_hidden_builtin_def (__memcpy_simd)
-
-
-ENTRY (__memmove_simd)
-       DELOUSE (0)
-       DELOUSE (1)
-       DELOUSE (2)
-
-       add     srcend, src, count
-       add     dstend, dstin, count
-       cmp     count, 128
-       b.hi    L(move_long)
-       cmp     count, 32
-       b.hi    L(copy32_128)
-
-       /* Small moves: 0..32 bytes.  */
-       cmp     count, 16
-       b.lo    L(copy16)
-       ldr     A_q, [src]
-       ldr     B_q, [srcend, -16]
-       str     A_q, [dstin]
-       str     B_q, [dstend, -16]
-       ret
-
-L(move_long):
-       /* Only use backward copy if there is an overlap.  */
-       sub     tmp1, dstin, src
-       cbz     tmp1, L(move0)
-       cmp     tmp1, count
-       b.hs    L(copy_long)
-
-       /* Large backwards copy for overlapping copies.
-          Copy 16 bytes and then align srcend to 16-byte alignment.  */
-L(copy_long_backwards):
-       ldr     D_q, [srcend, -16]
-       and     tmp1, srcend, 15
-       bic     srcend, srcend, 15
-       sub     count, count, tmp1
-       ldp     A_q, B_q, [srcend, -32]
-       str     D_q, [dstend, -16]
-       ldp     C_q, D_q, [srcend, -64]
-       sub     dstend, dstend, tmp1
-       subs    count, count, 128
-       b.ls    L(copy64_from_start)
-
-L(loop64_backwards):
-       str     B_q, [dstend, -16]
-       str     A_q, [dstend, -32]
-       ldp     A_q, B_q, [srcend, -96]
-       str     D_q, [dstend, -48]
-       str     C_q, [dstend, -64]!
-       ldp     C_q, D_q, [srcend, -128]
-       sub     srcend, srcend, 64
-       subs    count, count, 64
-       b.hi    L(loop64_backwards)
-
-       /* Write the last iteration and copy 64 bytes from the start.  */
-L(copy64_from_start):
-       ldp     E_q, F_q, [src, 32]
-       stp     A_q, B_q, [dstend, -32]
-       ldp     A_q, B_q, [src]
-       stp     C_q, D_q, [dstend, -64]
-       stp     E_q, F_q, [dstin, 32]
-       stp     A_q, B_q, [dstin]
-L(move0):
-       ret
-
-END (__memmove_simd)
-libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c

index 621af2b2968de1d0978dd52e1ddd1ce2a931ea54..4eb19da43dea96671e146bb864ed808ec9b81b21 100644 (file)
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,7 +29,6 @@
  extern __typeof (__redirect_memmove) __libc_memmove;
  
  extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
  
@@ -38,10 +37,7 @@ libc_ifunc (__libc_memmove,
              ? __memmove_thunderx
              : (IS_FALKOR (midr) || IS_PHECDA (midr)
                 ? __memmove_falkor
-                 : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
-                    || IS_NEOVERSE_V1 (midr)
-                    ? __memmove_simd
-                    : __memmove_generic))));
+                 : __memmove_generic)));
  
  # undef memmove
  strong_alias (__libc_memmove, memmove);
author	Wilco Dijkstra <wdijkstr@arm.com>
	Wed, 26 Oct 2022 13:16:50 +0000 (14:16 +0100)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Tue, 9 Apr 2024 18:48:23 +0000 (19:48 +0100)
Files changed:
  sysdeps/aarch64/memcpy.S
  sysdeps/aarch64/multiarch/Makefile
  sysdeps/aarch64/multiarch/ifunc-impl-list.c
  sysdeps/aarch64/multiarch/memcpy.c
  sysdeps/aarch64/multiarch/memcpy_advsimd.S  [deleted file]
  sysdeps/aarch64/multiarch/memmove.c