/* sysdeps/aarch64/memcpy.S — "[AArch64] Improve integer memcpy".  */
/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
27#define dstin x0
28#define src x1
29#define count x2
b998e16e
WD
30#define dst x3
31#define srcend x4
32#define dstend x5
33#define A_l x6
34#define A_lw w6
35#define A_h x7
b998e16e 36#define B_l x8
a024b39a 37#define B_lw w8
b998e16e
WD
38#define B_h x9
39#define C_l x10
fe09348c 40#define C_lw w10
b998e16e
WD
41#define C_h x11
42#define D_l x12
43#define D_h x13
722c9357
KK
44#define E_l x14
45#define E_h x15
46#define F_l x16
47#define F_h x17
b998e16e
WD
48#define G_l count
49#define G_h dst
722c9357
KK
50#define H_l src
51#define H_h srcend
b998e16e 52#define tmp1 x14
857c8d22 53
6a2c6952
SE
54#ifndef MEMMOVE
55# define MEMMOVE memmove
56#endif
57#ifndef MEMCPY
58# define MEMCPY memcpy
59#endif
60
fe09348c
WD
61/* This implementation supports both memcpy and memmove and shares most code.
62 It uses unaligned accesses and branchless sequences to keep the code small,
63 simple and improve performance.
b998e16e 64
fe09348c
WD
65 Copies are split into 3 main cases: small copies of up to 32 bytes, medium
66 copies of up to 128 bytes, and large copies. The overhead of the overlap
67 check in memmove is negligible since it is only required for large copies.
389d1f1b 68
fe09348c
WD
69 Large copies use a software pipelined loop processing 64 bytes per
70 iteration. The destination pointer is 16-byte aligned to minimize
71 unaligned accesses. The loop tail is handled by always copying 64 bytes
72 from the end.
73*/
b998e16e 74
fe09348c 75ENTRY_ALIGN (MEMCPY, 6)
389d1f1b
SE
76 DELOUSE (0)
77 DELOUSE (1)
78 DELOUSE (2)
79
b998e16e
WD
80 add srcend, src, count
81 add dstend, dstin, count
722c9357 82 cmp count, 128
b998e16e 83 b.hi L(copy_long)
fe09348c
WD
84 cmp count, 32
85 b.hi L(copy32_128)
b998e16e 86
fe09348c
WD
87 /* Small copies: 0..32 bytes. */
88 cmp count, 16
89 b.lo L(copy16)
a024b39a 90 ldp A_l, A_h, [src]
722c9357 91 ldp D_l, D_h, [srcend, -16]
722c9357 92 stp A_l, A_h, [dstin]
a024b39a
WD
93 stp D_l, D_h, [dstend, -16]
94 ret
95
fe09348c
WD
96 /* Copy 8-15 bytes. */
97L(copy16):
98 tbz count, 3, L(copy8)
b998e16e
WD
99 ldr A_l, [src]
100 ldr A_h, [srcend, -8]
101 str A_l, [dstin]
102 str A_h, [dstend, -8]
103 ret
fe09348c
WD
104
105 .p2align 3
106 /* Copy 4-7 bytes. */
107L(copy8):
108 tbz count, 2, L(copy4)
b998e16e 109 ldr A_lw, [src]
fe09348c 110 ldr B_lw, [srcend, -4]
b998e16e 111 str A_lw, [dstin]
fe09348c 112 str B_lw, [dstend, -4]
b998e16e 113 ret
a024b39a 114
fe09348c
WD
115 /* Copy 0..3 bytes using a branchless sequence. */
116L(copy4):
117 cbz count, L(copy0)
a024b39a 118 lsr tmp1, count, 1
b998e16e 119 ldrb A_lw, [src]
fe09348c 120 ldrb C_lw, [srcend, -1]
a024b39a
WD
121 ldrb B_lw, [src, tmp1]
122 strb A_lw, [dstin]
123 strb B_lw, [dstin, tmp1]
fe09348c
WD
124 strb C_lw, [dstend, -1]
125L(copy0):
126 ret
127
128 .p2align 4
129 /* Medium copies: 33..128 bytes. */
130L(copy32_128):
131 ldp A_l, A_h, [src]
132 ldp B_l, B_h, [src, 16]
133 ldp C_l, C_h, [srcend, -32]
134 ldp D_l, D_h, [srcend, -16]
135 cmp count, 64
136 b.hi L(copy128)
137 stp A_l, A_h, [dstin]
138 stp B_l, B_h, [dstin, 16]
139 stp C_l, C_h, [dstend, -32]
140 stp D_l, D_h, [dstend, -16]
141 ret
b998e16e 142
b998e16e 143 .p2align 4
fe09348c 144 /* Copy 65..128 bytes. */
722c9357
KK
145L(copy128):
146 ldp E_l, E_h, [src, 32]
147 ldp F_l, F_h, [src, 48]
fe09348c
WD
148 cmp count, 96
149 b.ls L(copy96)
722c9357
KK
150 ldp G_l, G_h, [srcend, -64]
151 ldp H_l, H_h, [srcend, -48]
fe09348c
WD
152 stp G_l, G_h, [dstend, -64]
153 stp H_l, H_h, [dstend, -48]
154L(copy96):
b998e16e
WD
155 stp A_l, A_h, [dstin]
156 stp B_l, B_h, [dstin, 16]
722c9357
KK
157 stp E_l, E_h, [dstin, 32]
158 stp F_l, F_h, [dstin, 48]
722c9357
KK
159 stp C_l, C_h, [dstend, -32]
160 stp D_l, D_h, [dstend, -16]
b998e16e
WD
161 ret
162
b998e16e 163 .p2align 4
fe09348c 164 /* Copy more than 128 bytes. */
b998e16e 165L(copy_long):
fe09348c
WD
166 /* Copy 16 bytes and then align dst to 16-byte alignment. */
167 ldp D_l, D_h, [src]
b998e16e
WD
168 and tmp1, dstin, 15
169 bic dst, dstin, 15
b998e16e
WD
170 sub src, src, tmp1
171 add count, count, tmp1 /* Count is now 16 too large. */
172 ldp A_l, A_h, [src, 16]
173 stp D_l, D_h, [dstin]
174 ldp B_l, B_h, [src, 32]
175 ldp C_l, C_h, [src, 48]
176 ldp D_l, D_h, [src, 64]!
177 subs count, count, 128 + 16 /* Test and readjust count. */
fe09348c
WD
178 b.ls L(copy64_from_end)
179
6a2c6952 180L(loop64):
b998e16e
WD
181 stp A_l, A_h, [dst, 16]
182 ldp A_l, A_h, [src, 16]
183 stp B_l, B_h, [dst, 32]
184 ldp B_l, B_h, [src, 32]
185 stp C_l, C_h, [dst, 48]
186 ldp C_l, C_h, [src, 48]
187 stp D_l, D_h, [dst, 64]!
188 ldp D_l, D_h, [src, 64]!
189 subs count, count, 64
6a2c6952 190 b.hi L(loop64)
b998e16e 191
fe09348c
WD
192 /* Write the last iteration and copy 64 bytes from the end. */
193L(copy64_from_end):
b998e16e
WD
194 ldp E_l, E_h, [srcend, -64]
195 stp A_l, A_h, [dst, 16]
196 ldp A_l, A_h, [srcend, -48]
197 stp B_l, B_h, [dst, 32]
198 ldp B_l, B_h, [srcend, -32]
199 stp C_l, C_h, [dst, 48]
200 ldp C_l, C_h, [srcend, -16]
201 stp D_l, D_h, [dst, 64]
202 stp E_l, E_h, [dstend, -64]
203 stp A_l, A_h, [dstend, -48]
204 stp B_l, B_h, [dstend, -32]
205 stp C_l, C_h, [dstend, -16]
206 ret
207
fe09348c
WD
208END (MEMCPY)
209libc_hidden_builtin_def (MEMCPY)
210
211ENTRY_ALIGN (MEMMOVE, 4)
212 DELOUSE (0)
213 DELOUSE (1)
214 DELOUSE (2)
b998e16e
WD
215
216 add srcend, src, count
217 add dstend, dstin, count
fe09348c
WD
218 cmp count, 128
219 b.hi L(move_long)
220 cmp count, 32
221 b.hi L(copy32_128)
b998e16e 222
fe09348c
WD
223 /* Small copies: 0..32 bytes. */
224 cmp count, 16
225 b.lo L(copy16)
226 ldp A_l, A_h, [src]
227 ldp D_l, D_h, [srcend, -16]
228 stp A_l, A_h, [dstin]
229 stp D_l, D_h, [dstend, -16]
230 ret
b998e16e 231
fe09348c
WD
232 .p2align 4
233L(move_long):
234 /* Only use backward copy if there is an overlap. */
235 sub tmp1, dstin, src
236 cbz tmp1, L(copy0)
237 cmp tmp1, count
238 b.hs L(copy_long)
239
240 /* Large backwards copy for overlapping copies.
241 Copy 16 bytes and then align dst to 16-byte alignment. */
b998e16e 242 ldp D_l, D_h, [srcend, -16]
fe09348c 243 and tmp1, dstend, 15
b998e16e
WD
244 sub srcend, srcend, tmp1
245 sub count, count, tmp1
246 ldp A_l, A_h, [srcend, -16]
247 stp D_l, D_h, [dstend, -16]
248 ldp B_l, B_h, [srcend, -32]
249 ldp C_l, C_h, [srcend, -48]
250 ldp D_l, D_h, [srcend, -64]!
251 sub dstend, dstend, tmp1
252 subs count, count, 128
fe09348c 253 b.ls L(copy64_from_start)
b998e16e 254
fe09348c 255L(loop64_backwards):
b998e16e
WD
256 stp A_l, A_h, [dstend, -16]
257 ldp A_l, A_h, [srcend, -16]
258 stp B_l, B_h, [dstend, -32]
259 ldp B_l, B_h, [srcend, -32]
260 stp C_l, C_h, [dstend, -48]
261 ldp C_l, C_h, [srcend, -48]
262 stp D_l, D_h, [dstend, -64]!
263 ldp D_l, D_h, [srcend, -64]!
264 subs count, count, 64
fe09348c 265 b.hi L(loop64_backwards)
b998e16e 266
fe09348c
WD
267 /* Write the last iteration and copy 64 bytes from the start. */
268L(copy64_from_start):
b998e16e
WD
269 ldp G_l, G_h, [src, 48]
270 stp A_l, A_h, [dstend, -16]
271 ldp A_l, A_h, [src, 32]
272 stp B_l, B_h, [dstend, -32]
273 ldp B_l, B_h, [src, 16]
274 stp C_l, C_h, [dstend, -48]
275 ldp C_l, C_h, [src]
276 stp D_l, D_h, [dstend, -64]
277 stp G_l, G_h, [dstin, 48]
278 stp A_l, A_h, [dstin, 32]
279 stp B_l, B_h, [dstin, 16]
280 stp C_l, C_h, [dstin]
fe09348c 281 ret
b998e16e 282
fe09348c
WD
283END (MEMMOVE)
284libc_hidden_builtin_def (MEMMOVE)