/* Copyright (C) 2012-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
b998e16e WD |
19 | #include <sysdep.h> |
20 | ||
/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

27 | #define dstin x0 | |
28 | #define src x1 | |
29 | #define count x2 | |
b998e16e WD |
30 | #define dst x3 |
31 | #define srcend x4 | |
32 | #define dstend x5 | |
33 | #define A_l x6 | |
34 | #define A_lw w6 | |
35 | #define A_h x7 | |
b998e16e | 36 | #define B_l x8 |
a024b39a | 37 | #define B_lw w8 |
b998e16e WD |
38 | #define B_h x9 |
39 | #define C_l x10 | |
fe09348c | 40 | #define C_lw w10 |
b998e16e WD |
41 | #define C_h x11 |
42 | #define D_l x12 | |
43 | #define D_h x13 | |
722c9357 KK |
44 | #define E_l x14 |
45 | #define E_h x15 | |
46 | #define F_l x16 | |
47 | #define F_h x17 | |
b998e16e WD |
48 | #define G_l count |
49 | #define G_h dst | |
722c9357 KK |
50 | #define H_l src |
51 | #define H_h srcend | |
b998e16e | 52 | #define tmp1 x14 |
857c8d22 | 53 | |
6a2c6952 SE |
54 | #ifndef MEMMOVE |
55 | # define MEMMOVE memmove | |
56 | #endif | |
57 | #ifndef MEMCPY | |
58 | # define MEMCPY memcpy | |
59 | #endif | |
60 | ||
/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small,
   simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

fe09348c | 75 | ENTRY_ALIGN (MEMCPY, 6) |
389d1f1b SE |
76 | DELOUSE (0) |
77 | DELOUSE (1) | |
78 | DELOUSE (2) | |
79 | ||
b998e16e WD |
80 | add srcend, src, count |
81 | add dstend, dstin, count | |
722c9357 | 82 | cmp count, 128 |
b998e16e | 83 | b.hi L(copy_long) |
fe09348c WD |
84 | cmp count, 32 |
85 | b.hi L(copy32_128) | |
b998e16e | 86 | |
fe09348c WD |
87 | /* Small copies: 0..32 bytes. */ |
88 | cmp count, 16 | |
89 | b.lo L(copy16) | |
a024b39a | 90 | ldp A_l, A_h, [src] |
722c9357 | 91 | ldp D_l, D_h, [srcend, -16] |
722c9357 | 92 | stp A_l, A_h, [dstin] |
a024b39a WD |
93 | stp D_l, D_h, [dstend, -16] |
94 | ret | |
95 | ||
fe09348c WD |
96 | /* Copy 8-15 bytes. */ |
97 | L(copy16): | |
98 | tbz count, 3, L(copy8) | |
b998e16e WD |
99 | ldr A_l, [src] |
100 | ldr A_h, [srcend, -8] | |
101 | str A_l, [dstin] | |
102 | str A_h, [dstend, -8] | |
103 | ret | |
fe09348c WD |
104 | |
105 | .p2align 3 | |
106 | /* Copy 4-7 bytes. */ | |
107 | L(copy8): | |
108 | tbz count, 2, L(copy4) | |
b998e16e | 109 | ldr A_lw, [src] |
fe09348c | 110 | ldr B_lw, [srcend, -4] |
b998e16e | 111 | str A_lw, [dstin] |
fe09348c | 112 | str B_lw, [dstend, -4] |
b998e16e | 113 | ret |
a024b39a | 114 | |
fe09348c WD |
115 | /* Copy 0..3 bytes using a branchless sequence. */ |
116 | L(copy4): | |
117 | cbz count, L(copy0) | |
a024b39a | 118 | lsr tmp1, count, 1 |
b998e16e | 119 | ldrb A_lw, [src] |
fe09348c | 120 | ldrb C_lw, [srcend, -1] |
a024b39a WD |
121 | ldrb B_lw, [src, tmp1] |
122 | strb A_lw, [dstin] | |
123 | strb B_lw, [dstin, tmp1] | |
fe09348c WD |
124 | strb C_lw, [dstend, -1] |
125 | L(copy0): | |
126 | ret | |
127 | ||
128 | .p2align 4 | |
129 | /* Medium copies: 33..128 bytes. */ | |
130 | L(copy32_128): | |
131 | ldp A_l, A_h, [src] | |
132 | ldp B_l, B_h, [src, 16] | |
133 | ldp C_l, C_h, [srcend, -32] | |
134 | ldp D_l, D_h, [srcend, -16] | |
135 | cmp count, 64 | |
136 | b.hi L(copy128) | |
137 | stp A_l, A_h, [dstin] | |
138 | stp B_l, B_h, [dstin, 16] | |
139 | stp C_l, C_h, [dstend, -32] | |
140 | stp D_l, D_h, [dstend, -16] | |
141 | ret | |
b998e16e | 142 | |
b998e16e | 143 | .p2align 4 |
fe09348c | 144 | /* Copy 65..128 bytes. */ |
722c9357 KK |
145 | L(copy128): |
146 | ldp E_l, E_h, [src, 32] | |
147 | ldp F_l, F_h, [src, 48] | |
fe09348c WD |
148 | cmp count, 96 |
149 | b.ls L(copy96) | |
722c9357 KK |
150 | ldp G_l, G_h, [srcend, -64] |
151 | ldp H_l, H_h, [srcend, -48] | |
fe09348c WD |
152 | stp G_l, G_h, [dstend, -64] |
153 | stp H_l, H_h, [dstend, -48] | |
154 | L(copy96): | |
b998e16e WD |
155 | stp A_l, A_h, [dstin] |
156 | stp B_l, B_h, [dstin, 16] | |
722c9357 KK |
157 | stp E_l, E_h, [dstin, 32] |
158 | stp F_l, F_h, [dstin, 48] | |
722c9357 KK |
159 | stp C_l, C_h, [dstend, -32] |
160 | stp D_l, D_h, [dstend, -16] | |
b998e16e WD |
161 | ret |
162 | ||
b998e16e | 163 | .p2align 4 |
fe09348c | 164 | /* Copy more than 128 bytes. */ |
b998e16e | 165 | L(copy_long): |
fe09348c WD |
166 | /* Copy 16 bytes and then align dst to 16-byte alignment. */ |
167 | ldp D_l, D_h, [src] | |
b998e16e WD |
168 | and tmp1, dstin, 15 |
169 | bic dst, dstin, 15 | |
b998e16e WD |
170 | sub src, src, tmp1 |
171 | add count, count, tmp1 /* Count is now 16 too large. */ | |
172 | ldp A_l, A_h, [src, 16] | |
173 | stp D_l, D_h, [dstin] | |
174 | ldp B_l, B_h, [src, 32] | |
175 | ldp C_l, C_h, [src, 48] | |
176 | ldp D_l, D_h, [src, 64]! | |
177 | subs count, count, 128 + 16 /* Test and readjust count. */ | |
fe09348c WD |
178 | b.ls L(copy64_from_end) |
179 | ||
6a2c6952 | 180 | L(loop64): |
b998e16e WD |
181 | stp A_l, A_h, [dst, 16] |
182 | ldp A_l, A_h, [src, 16] | |
183 | stp B_l, B_h, [dst, 32] | |
184 | ldp B_l, B_h, [src, 32] | |
185 | stp C_l, C_h, [dst, 48] | |
186 | ldp C_l, C_h, [src, 48] | |
187 | stp D_l, D_h, [dst, 64]! | |
188 | ldp D_l, D_h, [src, 64]! | |
189 | subs count, count, 64 | |
6a2c6952 | 190 | b.hi L(loop64) |
b998e16e | 191 | |
fe09348c WD |
192 | /* Write the last iteration and copy 64 bytes from the end. */ |
193 | L(copy64_from_end): | |
b998e16e WD |
194 | ldp E_l, E_h, [srcend, -64] |
195 | stp A_l, A_h, [dst, 16] | |
196 | ldp A_l, A_h, [srcend, -48] | |
197 | stp B_l, B_h, [dst, 32] | |
198 | ldp B_l, B_h, [srcend, -32] | |
199 | stp C_l, C_h, [dst, 48] | |
200 | ldp C_l, C_h, [srcend, -16] | |
201 | stp D_l, D_h, [dst, 64] | |
202 | stp E_l, E_h, [dstend, -64] | |
203 | stp A_l, A_h, [dstend, -48] | |
204 | stp B_l, B_h, [dstend, -32] | |
205 | stp C_l, C_h, [dstend, -16] | |
206 | ret | |
207 | ||
fe09348c WD |
208 | END (MEMCPY) |
209 | libc_hidden_builtin_def (MEMCPY) | |
210 | ||
211 | ENTRY_ALIGN (MEMMOVE, 4) | |
212 | DELOUSE (0) | |
213 | DELOUSE (1) | |
214 | DELOUSE (2) | |
b998e16e WD |
215 | |
216 | add srcend, src, count | |
217 | add dstend, dstin, count | |
fe09348c WD |
218 | cmp count, 128 |
219 | b.hi L(move_long) | |
220 | cmp count, 32 | |
221 | b.hi L(copy32_128) | |
b998e16e | 222 | |
fe09348c WD |
223 | /* Small copies: 0..32 bytes. */ |
224 | cmp count, 16 | |
225 | b.lo L(copy16) | |
226 | ldp A_l, A_h, [src] | |
227 | ldp D_l, D_h, [srcend, -16] | |
228 | stp A_l, A_h, [dstin] | |
229 | stp D_l, D_h, [dstend, -16] | |
230 | ret | |
b998e16e | 231 | |
fe09348c WD |
232 | .p2align 4 |
233 | L(move_long): | |
234 | /* Only use backward copy if there is an overlap. */ | |
235 | sub tmp1, dstin, src | |
236 | cbz tmp1, L(copy0) | |
237 | cmp tmp1, count | |
238 | b.hs L(copy_long) | |
239 | ||
240 | /* Large backwards copy for overlapping copies. | |
241 | Copy 16 bytes and then align dst to 16-byte alignment. */ | |
b998e16e | 242 | ldp D_l, D_h, [srcend, -16] |
fe09348c | 243 | and tmp1, dstend, 15 |
b998e16e WD |
244 | sub srcend, srcend, tmp1 |
245 | sub count, count, tmp1 | |
246 | ldp A_l, A_h, [srcend, -16] | |
247 | stp D_l, D_h, [dstend, -16] | |
248 | ldp B_l, B_h, [srcend, -32] | |
249 | ldp C_l, C_h, [srcend, -48] | |
250 | ldp D_l, D_h, [srcend, -64]! | |
251 | sub dstend, dstend, tmp1 | |
252 | subs count, count, 128 | |
fe09348c | 253 | b.ls L(copy64_from_start) |
b998e16e | 254 | |
fe09348c | 255 | L(loop64_backwards): |
b998e16e WD |
256 | stp A_l, A_h, [dstend, -16] |
257 | ldp A_l, A_h, [srcend, -16] | |
258 | stp B_l, B_h, [dstend, -32] | |
259 | ldp B_l, B_h, [srcend, -32] | |
260 | stp C_l, C_h, [dstend, -48] | |
261 | ldp C_l, C_h, [srcend, -48] | |
262 | stp D_l, D_h, [dstend, -64]! | |
263 | ldp D_l, D_h, [srcend, -64]! | |
264 | subs count, count, 64 | |
fe09348c | 265 | b.hi L(loop64_backwards) |
b998e16e | 266 | |
fe09348c WD |
267 | /* Write the last iteration and copy 64 bytes from the start. */ |
268 | L(copy64_from_start): | |
b998e16e WD |
269 | ldp G_l, G_h, [src, 48] |
270 | stp A_l, A_h, [dstend, -16] | |
271 | ldp A_l, A_h, [src, 32] | |
272 | stp B_l, B_h, [dstend, -32] | |
273 | ldp B_l, B_h, [src, 16] | |
274 | stp C_l, C_h, [dstend, -48] | |
275 | ldp C_l, C_h, [src] | |
276 | stp D_l, D_h, [dstend, -64] | |
277 | stp G_l, G_h, [dstin, 48] | |
278 | stp A_l, A_h, [dstin, 32] | |
279 | stp B_l, B_h, [dstin, 16] | |
280 | stp C_l, C_h, [dstin] | |
fe09348c | 281 | ret |
b998e16e | 282 | |
fe09348c WD |
283 | END (MEMMOVE) |
284 | libc_hidden_builtin_def (MEMMOVE) |