/* Generic optimized memcpy using SIMD.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7


/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
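
/* Size classes and their entry points below:
     0..32 bytes   - fall-through code plus L(copy16), L(copy8) and L(copy4);
     33..128 bytes - L(copy32_128), with L(copy96) and L(copy128);
     129+ bytes    - L(copy_long) for forward copies and
                     L(copy_long_backwards) for overlapping memmove calls.  */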

ENTRY (__memcpy_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
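	/* 16..32 bytes: the 16-byte accesses from the start and from the end
	   overlap whenever count < 32, so the two stores cover the whole
	   buffer without branching on the exact size.  */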
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
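	/* tmp1 = count / 2, so the three byte accesses hit offsets 0, tmp1 and
	   count - 1.  For count == 1 all three refer to the same byte, for
	   count == 2 they hit bytes 0, 1, 1, and for count == 3 bytes 0, 1, 2,
	   so every byte is copied without branching on the exact size.  */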
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
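	/* A_q/B_q hold the first 32 bytes and C_q/D_q the last 32 bytes.  For
	   65..96 bytes E_q/F_q (bytes 32..63) complete the copy; for 97..128
	   bytes G_q/H_q are also loaded from the end to cover the middle.
	   All loads are issued before the overlapping stores, so this path is
	   also safe for overlapping memmove calls of these sizes.  */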
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
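	/* Software pipeline: each iteration stores the 64 bytes loaded on the
	   previous iteration (A_q-D_q) while loading the next 64 bytes,
	   keeping the loads about 64 bytes ahead of the stores.  */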
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
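	/* Copying the final 64 bytes from srcend/dstend avoids a separate tail
	   loop; these stores may overlap bytes already written above, in which
	   case the same data is simply stored again.  */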
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (__memcpy_simd)
libc_hidden_builtin_def (__memcpy_simd)


ENTRY (__memmove_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backward copy if there is an overlap.  */
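	/* tmp1 = dstin - src is below count (unsigned) only when the
	   destination starts inside the source region (src < dstin < srcend),
	   which is the one case that requires a backward copy.  When dstin is
	   below src the subtraction wraps to a large value and the forward
	   path is taken.  */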
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
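	/* D_q holds the last 16 bytes and covers the unaligned tail; srcend is
	   then rounded down, and count and dstend are reduced by the
	   misalignment so the rest of the copy works from an aligned srcend.  */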
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

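	/* Backward counterpart of L(loop64): store the 64 bytes loaded on the
	   previous iteration while loading the next 64 bytes at lower
	   addresses; the dstend update is folded into the writeback of the
	   last store.  */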
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
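	/* Mirror of L(copy64_from_end): store the 64 bytes left in A_q-D_q,
	   then copy the first 64 bytes of the buffer from its start; each half
	   is loaded before any store that could overwrite it.  */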
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret

END (__memmove_simd)
libc_hidden_builtin_def (__memmove_simd)