1 /* Generic optimized memcpy using SIMD.
2 Copyright (C) 2020 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <https://www.gnu.org/licenses/>. */
24 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
53 /* This implementation supports both memcpy and memmove and shares most code.
54 It uses unaligned accesses and branchless sequences to keep the code small,
55 simple and improve performance.
57 Copies are split into 3 main cases: small copies of up to 32 bytes, medium
58 copies of up to 128 bytes, and large copies. The overhead of the overlap
59 check in memmove is negligible since it is only required for large copies.
61 Large copies use a software pipelined loop processing 64 bytes per
62 iteration. The destination pointer is 16-byte aligned to minimize
63 unaligned accesses. The loop tail is handled by always copying 64 bytes
from the end.
/* __memcpy_simd -- generic AArch64 Advanced SIMD memcpy.
   Inputs follow the AAPCS64 memcpy contract: dstin = destination,
   src = source, count = byte count; q-registers (A_q..H_q) move data
   in 16/32-byte unaligned chunks.
   NOTE(review): this chunk is incomplete -- the ENTRY directive, the
   size-dispatch compares/branches, and several labels (L(copy8),
   L(copy32), L(loop64), the rets) are not visible here.  Comments
   below describe only what the visible lines establish.  */
/* srcend/dstend point one past the last byte.  Small and medium paths
   copy from both ends, so no separate tail handling is needed.  */
71 add srcend, src, count
72 add dstend, dstin, count
78 /* Small copies: 0..32 bytes. */
/* Load/store the final 16 bytes via the end pointers; the head load/
   store (presumably A_q from [src]/[dstin]) is not visible in this
   chunk -- TODO confirm against full source.  */
82 ldr B_q, [srcend, -16]
84 str B_q, [dstend, -16]
87 /* Copy 8-15 bytes. */
/* Bit 3 clear => count < 8: skip to the 4-7 byte path.  */
89 tbz count, 3, L(copy8)
/* Bit 2 clear => count < 4: fall through to the 0..3 byte path.  */
98 tbz count, 2, L(copy4)
/* Copy 4..7 bytes using word loads from both ends (overlap is fine).  */
100 ldr B_lw, [srcend, -4]
102 str B_lw, [dstend, -4]
105 /* Copy 0..3 bytes using a branchless sequence. */
/* Last byte from srcend-1, another byte indexed by tmp1 (set on a line
   not visible here -- presumably count>>1); for count==0 the stores
   must be arranged to be harmless -- TODO confirm tmp1 setup.  */
110 ldrb C_lw, [srcend, -1]
111 ldrb B_lw, [src, tmp1]
113 strb B_lw, [dstin, tmp1]
114 strb C_lw, [dstend, -1]
119 /* Medium copies: 33..128 bytes. */
/* 33..64 bytes: two 32-byte chunks, one from each end (they may
   overlap in the middle).  The leading ldp of A_q/B_q from [src] is
   not visible in this chunk.  */
122 ldp C_q, D_q, [srcend, -32]
125 stp A_q, B_q, [dstin]
126 stp C_q, D_q, [dstend, -32]
130 /* Copy 65..128 bytes. */
/* Four 32-byte chunks: head, head+32, tail-64, tail-32.  All loads
   are issued before the overlapping stores, so src==dst overlap in
   the middle is safe.  */
132 ldp E_q, F_q, [src, 32]
135 ldp G_q, H_q, [srcend, -64]
136 stp G_q, H_q, [dstend, -64]
138 stp A_q, B_q, [dstin]
139 stp E_q, F_q, [dstin, 32]
140 stp C_q, D_q, [dstend, -32]
143 /* Align loop64 below to 16 bytes. */
146 /* Copy more than 128 bytes. */
148 /* Copy 16 bytes and then align src to 16-byte alignment. */
/* tmp1 holds the misalignment adjustment (its computation and the
   bic of src are not visible here); count is biased by it.  */
153 add count, count, tmp1 /* Count is now 16 too large. */
/* Prime the software pipeline with 64 bytes from the (now aligned)
   source before entering the store/load-interleaved loop.  */
154 ldp A_q, B_q, [src, 16]
156 ldp C_q, D_q, [src, 48]
157 subs count, count, 128 + 16 /* Test and readjust count. */
/* <= 0 remaining after the prologue: just write the tail.  */
158 b.ls L(copy64_from_end)
/* loop64 body (label not visible): store the 64 bytes loaded in the
   previous iteration while loading the next 64, 64 bytes/iteration.  */
160 stp A_q, B_q, [dst, 16]
161 ldp A_q, B_q, [src, 80]
162 stp C_q, D_q, [dst, 48]
163 ldp C_q, D_q, [src, 112]
/* Pointer increments (add src/dst, 64) are not visible in this chunk;
   subs sets flags for the loop back-branch.  */
166 subs count, count, 64
169 /* Write the last iteration and copy 64 bytes from the end. */
/* Drain the pipeline (A..D already loaded) and copy the final 64
   bytes relative to srcend/dstend; overlap with the last loop stores
   is harmless because the data is identical.  */
171 ldp E_q, F_q, [srcend, -64]
172 stp A_q, B_q, [dst, 16]
173 ldp A_q, B_q, [srcend, -32]
174 stp C_q, D_q, [dst, 48]
175 stp E_q, F_q, [dstend, -64]
176 stp A_q, B_q, [dstend, -32]
180 libc_hidden_builtin_def (__memcpy_simd)
/* __memmove_simd -- memmove sharing the memcpy paths above.
   Small moves and non-overlapping large moves reuse the forward
   copy code; only overlapping large moves take the backward path.
   NOTE(review): several lines (size dispatch, the overlap compare,
   L(loop64_backwards) label, rets) are missing from this chunk;
   comments describe only the visible instructions.  */
183 ENTRY (__memmove_simd)
/* End pointers, as in memcpy: one past the last byte.  */
188 add srcend, src, count
189 add dstend, dstin, count
195 /* Small moves: 0..32 bytes. */
/* Loads precede stores, so any overlap is safe without a direction
   check.  The head A_q load/store is not visible in this chunk.  */
199 ldr B_q, [srcend, -16]
201 str B_q, [dstend, -16]
205 /* Only use backward copy if there is an overlap. */
/* The actual overlap test (sub/cmp of dstin-src vs count) is on
   lines not visible here -- TODO confirm against full source.  */
211 /* Large backwards copy for overlapping copies.
212 Copy 16 bytes and then align srcend to 16-byte alignment. */
213 L(copy_long_backwards):
/* Save the (possibly unaligned) last 16 bytes first, then round
   srcend down to 16 bytes; tmp1 (set on a missing line, presumably
   srcend & 15) rebiases count and dstend to match.  */
214 ldr D_q, [srcend, -16]
216 bic srcend, srcend, 15
217 sub count, count, tmp1
/* Prime the pipeline with 64 bytes below the aligned srcend.  */
218 ldp A_q, B_q, [srcend, -32]
219 str D_q, [dstend, -16]
220 ldp C_q, D_q, [srcend, -64]
221 sub dstend, dstend, tmp1
222 subs count, count, 128
/* <= 128 bytes total: no steady-state iterations, write the tail.  */
223 b.ls L(copy64_from_start)
/* loop64_backwards body (label not visible): store the previous
   iteration's 64 bytes while loading the next 64, moving downward.  */
226 stp A_q, B_q, [dstend, -32]
227 ldp A_q, B_q, [srcend, -96]
228 stp C_q, D_q, [dstend, -64]
229 ldp C_q, D_q, [srcend, -128]
230 sub srcend, srcend, 64
231 sub dstend, dstend, 64
232 subs count, count, 64
233 b.hi L(loop64_backwards)
235 /* Write the last iteration and copy 64 bytes from the start. */
236 L(copy64_from_start):
/* Drain the pipeline and copy the first 64 bytes relative to
   src/dstin; overlap with the last loop stores writes identical
   data, so ordering here is safe.  */
237 ldp E_q, F_q, [src, 32]
238 stp A_q, B_q, [dstend, -32]
/* The ldp of A_q/B_q from [src] for the final stp below is on a line
   not visible in this chunk.  */
240 stp C_q, D_q, [dstend, -64]
241 stp E_q, F_q, [dstin, 32]
242 stp A_q, B_q, [dstin]
247 libc_hidden_builtin_def (__memmove_simd)