/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef MEMCPY_NEON

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Expands to the GAS "aligned register" operand syntax REG:ALIGN, used
   by the NEON store macro below to assert 64-byte alignment of DST.  */
#define ALIGN(addr, align) addr:align

/* Size in bytes of one ARM-state instruction; used in the computed-jump
   arithmetic of the dispatch macros below.  */
#define INSN_SIZE	4

/* Call parameters (AAPCS: first three arguments in r0-r2).  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  DST uses ip (the intra-procedure scratch register) so that
   r0 can be preserved as the return value.  TMP2 is r8, a callee-saved
   register, which is why the non-short paths save it on the stack
   (see FRAME_SIZE above).  */
#define tmp1	r3
#define dst	ip
#define tmp2	r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled
# endif
	/* Non-BX variant: jump into the step sequence by adding directly
	   to PC.  The rsb converts "bytes to copy" into "bytes of steps
	   to skip"; PC_OFS (from arm-features.h) compensates for the
	   offset between the add instruction and the value read from PC,
	   and INSN_SIZE for the step size vs. instruction size ratio.  */
	.macro dispatch_7_dword
	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
	add	pc, pc, tmp1
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm

	/* Word units are 4 bytes but each step is two 4-byte instructions,
	   hence the halved constants and the "lsl #1" scale on the add.  */
	.macro dispatch_15_word
	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add	pc, pc, tmp1, lsl #1
	dispatch_step 15
	dispatch_step 14
	dispatch_step 13
	dispatch_step 12
	dispatch_step 11
	dispatch_step 10
	dispatch_step 9
	dispatch_step 8
	dispatch_step 7
	dispatch_step 6
	dispatch_step 5
	dispatch_step 4
	dispatch_step 3
	dispatch_step 2
	dispatch_step 1
	.purgem dispatch_step
	.endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled
# endif
	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
			      + ((1f - PC_OFS - 0f) \
				 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
	bx	tmp1
	.p2align ARM_BX_ALIGN_LOG2
1:
	.endm

	/* BX variant: each step is padded to a 2^ARM_BX_ALIGN_LOG2-byte
	   slot so the computed target is always a valid step boundary.  */
	.macro dispatch_7_dword
	dispatch_helper 7, 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

	.macro dispatch_15_word
	dispatch_helper 15, 2
	dispatch_step 15
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 14
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 13
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 12
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 11
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 10
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 9
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 8
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 7
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 6
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 5
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 4
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 3
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 2
	.p2align ARM_BX_ALIGN_LOG2
	dispatch_step 1
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step
	.endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  Four 64-bit lanes (A-D) give the
   SMS-style loops below enough registers to software-pipeline loads
   against stores.  B-D are callee-saved and are spilled to the frame
   before use.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define	D_l	r10
#define	D_h	r11
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	/* Copy one 64-byte line from [src, #base] to [dst, #base] using
	   d0-d2 plus \vreg as a software pipeline: each slot's store
	   drains a value loaded on the previous iteration, and \vreg is
	   refilled from prefetch_lines*64 bytes ahead so the next lines
	   are already in flight (register-based prefetch, no PLD).  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* Same as cpy_line_vfp but without the look-ahead reload of
	   \vreg, for draining the pipeline near the end of the copy.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

/* void *memcpy (void *dstin, const void *src, size_t count)
   In:  r0 = dstin, r1 = src, r2 = count (see #defines above).
   Out: r0 = dstin (r0 is never written; dst works in ip).
   Strategy: <64 bytes drop straight into the tail dispatcher; larger
   copies align DST (and, when mutually aligned, SRC) to 64 bits and
   select a medium (<512 bytes) or long bulk loop.  */
	.p2align 6
ENTRY(memcpy)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	vld1.8	{d0}, [\reg]!
	.endm
	.macro neon_store_d0 reg
	vst1.8	{d0}, [\reg]!
	.endm

	/* Copy the 8-byte units (up to 7 of them) via the dispatcher,
	   then mop up a possible trailing word.  */
	and	tmp1, count, #0x38
	.macro dispatch_step i
	neon_load_d0 src
	neon_store_d0 dst
	.endm
	dispatch_7_dword

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr	tmp1, [src, #-(\i * 4)]
	str	tmp1, [dst, #-(\i * 4)]
	.endm
	dispatch_15_word
#endif

	/* Final 0-3 bytes: bit 1 of count -> C, bit 0 -> N after the
	   shift, so the halfword copy is "cs" and the byte copy "ne".  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!	/* tmp2 (r8) is callee-saved.  */
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)
	cfi_remember_state
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* Low 3 bits of dst -> flags.  */
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
	/* 64 bytes per iteration, alternating d0/d1 so each store uses
	   the value loaded two slots earlier.  */
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	vldr	d0, [src, #-(\i * 8)]
	vstr	d0, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2,[sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	.macro dispatch_step i
	ldrd	A_l, A_h, [src, #-(\i * 8)]
	strd	A_l, A_h, [dst, #-(\i * 8)]
	.endm
	dispatch_7_dword
#endif

	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr

	cfi_restore_state
	cfi_remember_state

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	/* Drain the software pipeline: one tail line per prefetched
	   register, the last one expanded inline without a d7 refill.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]	/* Spill callee-saved B-D lanes.  */
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bx	lr
#endif

	cfi_restore_state
	cfi_remember_state

.Lcpy_notaligned:
	pld	[src, #0]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE	/* <64 left: pop frame and tail out.  */
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8	{\reglist}, [\basereg]!
	.endm
	.macro neon_store_multi reglist, basereg
	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
	.endm

	/* 64 bytes per iteration in two 32-byte halves; DST is 64-bit
	   aligned here and the store macro asserts :64 alignment.  */
	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs	count, count, #64
	bpl	1b
2:
	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* src is word- but not dword-aligned here, so use paired LDRs
	   instead of LDRD; the -4/-8 biases keep offsets in range.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align 6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	cfi_restore (B_l)
	cfi_restore (B_h)
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	cfi_restore (C_l)
	cfi_restore (C_h)
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	cfi_restore (D_l)
	cfi_restore (D_h)
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	cfi_restore (tmp2)
	bne	.Ltail63unaligned
	bx	lr

END(memcpy)
libc_hidden_builtin_def (memcpy)