/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  Small, medium and
   large backwards memmoves are therefore handled by falling through into
   memcpy.  Overlapping large forward memmoves use a loop that copies
   backwards instead.
*/

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_ALIGN (MEMMOVE, 6)

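	/* DELOUSE zero-extends the pointer and size argument registers
	   when the ILP32 ABI is in use; it expands to nothing for LP64.  */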
	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

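	/* tmp1 = dst - src.  If count > 96 and the destination overlaps
	   the forward end of the source (dst - src < count, unsigned),
	   a forward copy would clobber unread source bytes, so use the
	   backward-copying L(move_long) path instead.  */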
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	DELOUSE (0)
	DELOUSE (1)
	DELOUSE (2)

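	/* Prefetch the start of the source buffer into L1.  */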
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
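	/* tmp1 = count - 1.  Bit 6 is set for copies of 65..96 bytes,
	   handled by L(copy96).  Bit 5 is set for 33..64 bytes, in which
	   case the extra 16-byte pairs at [src, 16] and [srcend, -32]
	   are copied as well.  */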
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
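	/* 8..16 bytes: copy 8 bytes from the start and 8 bytes from the
	   end; the two accesses overlap when count < 16.  */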
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
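	/* 4..7 bytes: copy 4 bytes from the start and 4 bytes from the
	   end; the accesses overlap when count < 8.  */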
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
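	/* Main loop: store the 64 bytes loaded in the previous iteration,
	   then load the next 64 bytes, advancing src and dst by 64 via the
	   pre-indexed accesses on D.  */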
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
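	/* Nothing to move if src == dst (tmp1 == 0).  */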
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

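	/* The nop keeps the backward copy loop below 16-byte aligned.  */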
	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)