Commit | Line | Data |
---|---|---|
ae65139d | 1 | /* NEON/VFP/ARM version of memcpy optimized for Cortex-A15. |
f7a9f785 | 2 | Copyright (C) 2013-2016 Free Software Foundation, Inc. |
ae65139d WN | 3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. | |
18 | ||
19 | This memcpy routine is optimised for Cortex-A15 cores and takes advantage | |
20 | of VFP or NEON when built with the appropriate flags. | |
21 | ||
22 | Assumptions: | |
23 | ||
24 | ARMv6 (ARMv7-a if using Neon) | |
25 | ARM state | |
26 | Unaligned accesses | |
ae65139d WN |
27 | |
28 | */ | |
29 | ||
30 | /* Thumb cannot encode negative immediate offsets in memory operations. */ | |
31 | #ifndef NO_THUMB | |
32 | #define NO_THUMB | |
33 | #endif | |
34 | #include <sysdep.h> | |
733edfb8 | 35 | #include <arm-features.h> |
ae65139d WN |
36 | |
37 | .syntax unified | |
38 | /* This implementation requires ARM state. */ | |
39 | .arm | |
40 | ||
41 | #ifdef MEMCPY_NEON | |
42 | ||
43 | .fpu neon | |
44 | .arch armv7-a | |
45 | # define FRAME_SIZE 4 | |
46 | # define USE_VFP | |
47 | # define USE_NEON | |
48 | ||
49 | #elif defined (MEMCPY_VFP) | |
50 | ||
51 | .arch armv6 | |
52 | .fpu vfpv2 | |
53 | # define FRAME_SIZE 32 | |
54 | # define USE_VFP | |
55 | ||
56 | #else | |
57 | .arch armv6 | |
58 | # define FRAME_SIZE 32 | |
59 | ||
60 | #endif | |
61 | ||
62 | #define ALIGN(addr, align) addr:align | |
63 | ||
64 | #define INSN_SIZE 4 | |
65 | ||
66 | /* Call parameters. */ | |
67 | #define dstin r0 | |
68 | #define src r1 | |
69 | #define count r2 | |
70 | ||
71 | /* Locals. */ | |
72 | #define tmp1 r3 | |
73 | #define dst ip | |
733edfb8 RM |
74 | #define tmp2 r8 |
75 | ||
76 | /* These two macros both work by repeated invocation of the macro | |
77 | dispatch_step (not defined here). That macro performs one "step", | |
78 | doing one load instruction and one store instruction to copy one | |
79 | "unit". On entry, TMP1 contains the number of bytes to be copied, | |
80 | a multiple of the unit size. The macro clobbers TMP1 in the | |
81 | process of doing a computed jump to the tail containing the | |
82 | appropriate number of steps. | |
83 | ||
84 | In dispatch_7_dword, dispatch_step is invoked seven times, with an | |
85 | argument that is 7 for the first and 1 for the last. Units are | |
86 | double-words (8 bytes). TMP1 is at most 56. | |
87 | ||
88 | In dispatch_15_word, dispatch_step is invoked fifteen times, | |
89 | with an argument that is 15 for the first and 1 for the last. | |
90 | Units are words (4 bytes). TMP1 is at most 60. */ | |
91 | ||
92 | #ifndef ARM_ALWAYS_BX | |
93 | # if ARM_BX_ALIGN_LOG2 != 2 | |
94 | # error case not handled | |
95 | # endif | |
96 | .macro dispatch_7_dword | |
97 | rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE) | |
98 | add pc, pc, tmp1 | |
99 | dispatch_step 7 | |
100 | dispatch_step 6 | |
101 | dispatch_step 5 | |
102 | dispatch_step 4 | |
103 | dispatch_step 3 | |
104 | dispatch_step 2 | |
105 | dispatch_step 1 | |
106 | .purgem dispatch_step | |
107 | .endm | |
108 | ||
109 | .macro dispatch_15_word | |
110 | rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2) | |
111 | add pc, pc, tmp1, lsl #1 | |
112 | dispatch_step 15 | |
113 | dispatch_step 14 | |
114 | dispatch_step 13 | |
115 | dispatch_step 12 | |
116 | dispatch_step 11 | |
117 | dispatch_step 10 | |
118 | dispatch_step 9 | |
119 | dispatch_step 8 | |
120 | dispatch_step 7 | |
121 | dispatch_step 6 | |
122 | dispatch_step 5 | |
123 | dispatch_step 4 | |
124 | dispatch_step 3 | |
125 | dispatch_step 2 | |
126 | dispatch_step 1 | |
127 | .purgem dispatch_step | |
128 | .endm | |
129 | #else | |
068dcfd6 | 130 | # if ARM_BX_ALIGN_LOG2 < 3 |
733edfb8 RM | 131 | # error case not handled |
132 | # endif | |
133 | .macro dispatch_helper steps, log2_bytes_per_step | |
733edfb8 RM |
134 | /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is |
135 | (STEPS << LOG2_BYTES_PER_STEP). | |
068dcfd6 RM |
136 | So this is (steps_to_skip << LOG2_BYTES_PER_STEP). |
137 | Then it needs further adjustment to compensate for the | |
138 | distance between the PC value taken below (0f + PC_OFS) | |
139 | and the first step's instructions (1f). */ | |
140 | rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \ | |
141 | + ((1f - PC_OFS - 0f) \ | |
142 | >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step))) | |
733edfb8 RM |
143 | /* Shifting down LOG2_BYTES_PER_STEP gives us the number of |
144 | steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us | |
145 | the (byte) distance to add to the PC. */ | |
068dcfd6 | 146 | 0: add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step) |
733edfb8 | 147 | bx tmp1 |
068dcfd6 RM |
148 | .p2align ARM_BX_ALIGN_LOG2 |
149 | 1: | |
733edfb8 RM |
150 | .endm |
151 | ||
152 | .macro dispatch_7_dword | |
153 | dispatch_helper 7, 3 | |
154 | .p2align ARM_BX_ALIGN_LOG2 | |
155 | dispatch_step 7 | |
156 | .p2align ARM_BX_ALIGN_LOG2 | |
157 | dispatch_step 6 | |
158 | .p2align ARM_BX_ALIGN_LOG2 | |
159 | dispatch_step 5 | |
160 | .p2align ARM_BX_ALIGN_LOG2 | |
161 | dispatch_step 4 | |
162 | .p2align ARM_BX_ALIGN_LOG2 | |
163 | dispatch_step 3 | |
164 | .p2align ARM_BX_ALIGN_LOG2 | |
165 | dispatch_step 2 | |
166 | .p2align ARM_BX_ALIGN_LOG2 | |
167 | dispatch_step 1 | |
168 | .p2align ARM_BX_ALIGN_LOG2 | |
169 | .purgem dispatch_step | |
170 | .endm | |
171 | ||
172 | .macro dispatch_15_word | |
173 | dispatch_helper 15, 2 | |
174 | dispatch_step 15 | |
175 | .p2align ARM_BX_ALIGN_LOG2 | |
176 | dispatch_step 14 | |
177 | .p2align ARM_BX_ALIGN_LOG2 | |
178 | dispatch_step 13 | |
179 | .p2align ARM_BX_ALIGN_LOG2 | |
180 | dispatch_step 12 | |
181 | .p2align ARM_BX_ALIGN_LOG2 | |
182 | dispatch_step 11 | |
183 | .p2align ARM_BX_ALIGN_LOG2 | |
184 | dispatch_step 10 | |
185 | .p2align ARM_BX_ALIGN_LOG2 | |
186 | dispatch_step 9 | |
187 | .p2align ARM_BX_ALIGN_LOG2 | |
188 | dispatch_step 8 | |
189 | .p2align ARM_BX_ALIGN_LOG2 | |
190 | dispatch_step 7 | |
191 | .p2align ARM_BX_ALIGN_LOG2 | |
192 | dispatch_step 6 | |
193 | .p2align ARM_BX_ALIGN_LOG2 | |
194 | dispatch_step 5 | |
195 | .p2align ARM_BX_ALIGN_LOG2 | |
196 | dispatch_step 4 | |
197 | .p2align ARM_BX_ALIGN_LOG2 | |
198 | dispatch_step 3 | |
199 | .p2align ARM_BX_ALIGN_LOG2 | |
200 | dispatch_step 2 | |
201 | .p2align ARM_BX_ALIGN_LOG2 | |
202 | dispatch_step 1 | |
203 | .p2align ARM_BX_ALIGN_LOG2 | |
204 | .purgem dispatch_step | |
205 | .endm | |
206 | ||
207 | #endif | |
ae65139d WN |
208 | |
209 | #ifndef USE_NEON | |
210 | /* For bulk copies using GP registers. */ | |
211 | #define A_l r2 /* Call-clobbered. */ | |
212 | #define A_h r3 /* Call-clobbered. */ | |
213 | #define B_l r4 | |
214 | #define B_h r5 | |
215 | #define C_l r6 | |
216 | #define C_h r7 | |
733edfb8 RM |
217 | /* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */ |
218 | #define D_l r10 | |
219 | #define D_h r11 | |
ae65139d WN |
220 | #endif |
221 | ||
222 | /* Number of lines ahead to pre-fetch data. If you change this the code | |
223 | below will need adjustment to compensate. */ | |
224 | ||
225 | #define prefetch_lines 5 | |
226 | ||
227 | #ifdef USE_VFP | |
228 | .macro cpy_line_vfp vreg, base | |
733edfb8 RM |
229 | sfi_breg dst, \ |
230 | vstr \vreg, [\B, #\base] | |
231 | sfi_breg src, \ | |
232 | vldr \vreg, [\B, #\base] | |
233 | sfi_breg dst, \ | |
234 | vstr d0, [\B, #\base + 8] | |
235 | sfi_breg src, \ | |
236 | vldr d0, [\B, #\base + 8] | |
237 | sfi_breg dst, \ | |
238 | vstr d1, [\B, #\base + 16] | |
239 | sfi_breg src, \ | |
240 | vldr d1, [\B, #\base + 16] | |
241 | sfi_breg dst, \ | |
242 | vstr d2, [\B, #\base + 24] | |
243 | sfi_breg src, \ | |
244 | vldr d2, [\B, #\base + 24] | |
245 | sfi_breg dst, \ | |
246 | vstr \vreg, [\B, #\base + 32] | |
247 | sfi_breg src, \ | |
248 | vldr \vreg, [\B, #\base + prefetch_lines * 64 - 32] | |
249 | sfi_breg dst, \ | |
250 | vstr d0, [\B, #\base + 40] | |
251 | sfi_breg src, \ | |
252 | vldr d0, [\B, #\base + 40] | |
253 | sfi_breg dst, \ | |
254 | vstr d1, [\B, #\base + 48] | |
255 | sfi_breg src, \ | |
256 | vldr d1, [\B, #\base + 48] | |
257 | sfi_breg dst, \ | |
258 | vstr d2, [\B, #\base + 56] | |
259 | sfi_breg src, \ | |
260 | vldr d2, [\B, #\base + 56] | |
ae65139d WN |
261 | .endm |
262 | ||
263 | .macro cpy_tail_vfp vreg, base | |
733edfb8 RM |
264 | sfi_breg dst, \ |
265 | vstr \vreg, [\B, #\base] | |
266 | sfi_breg src, \ | |
267 | vldr \vreg, [\B, #\base] | |
268 | sfi_breg dst, \ | |
269 | vstr d0, [\B, #\base + 8] | |
270 | sfi_breg src, \ | |
271 | vldr d0, [\B, #\base + 8] | |
272 | sfi_breg dst, \ | |
273 | vstr d1, [\B, #\base + 16] | |
274 | sfi_breg src, \ | |
275 | vldr d1, [\B, #\base + 16] | |
276 | sfi_breg dst, \ | |
277 | vstr d2, [\B, #\base + 24] | |
278 | sfi_breg src, \ | |
279 | vldr d2, [\B, #\base + 24] | |
280 | sfi_breg dst, \ | |
281 | vstr \vreg, [\B, #\base + 32] | |
282 | sfi_breg dst, \ | |
283 | vstr d0, [\B, #\base + 40] | |
284 | sfi_breg src, \ | |
285 | vldr d0, [\B, #\base + 40] | |
286 | sfi_breg dst, \ | |
287 | vstr d1, [\B, #\base + 48] | |
288 | sfi_breg src, \ | |
289 | vldr d1, [\B, #\base + 48] | |
290 | sfi_breg dst, \ | |
291 | vstr d2, [\B, #\base + 56] | |
292 | sfi_breg src, \ | |
293 | vldr d2, [\B, #\base + 56] | |
ae65139d WN |
294 | .endm |
295 | #endif | |
296 | ||
297 | .p2align 6 | |
298 | ENTRY(memcpy) | |
299 | ||
300 | mov dst, dstin /* Preserve dstin, we need to return it. */ | |
301 | cmp count, #64 | |
302 | bge .Lcpy_not_short | |
303 | /* Deal with small copies quickly by dropping straight into the | |
304 | exit block. */ | |
305 | ||
306 | .Ltail63unaligned: | |
307 | #ifdef USE_NEON | |
733edfb8 RM | 308 | /* These need an extra layer of macro just to work around a |
309 | bug in the assembler's parser when an operand starts with | |
310 | a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647 | |
311 | tracks that bug; it was not fixed as of binutils-2.23.2. */ | |
312 | .macro neon_load_d0 reg | |
313 | vld1.8 {d0}, [\reg]! | |
314 | .endm | |
315 | .macro neon_store_d0 reg | |
316 | vst1.8 {d0}, [\reg]! | |
317 | .endm | |
318 | ||
319 | /* These are used by the NaCl sfi_breg macro. */ | |
320 | .macro _sfi_breg_dmask_neon_load_d0 reg | |
321 | _sfi_dmask \reg | |
322 | .endm | |
323 | .macro _sfi_breg_dmask_neon_store_d0 reg | |
324 | _sfi_dmask \reg | |
325 | .endm | |
326 | ||
ae65139d | 327 | and tmp1, count, #0x38 |
733edfb8 RM | 328 | .macro dispatch_step i |
329 | sfi_breg src, neon_load_d0 \B | |
330 | sfi_breg dst, neon_store_d0 \B | |
331 | .endm | |
332 | dispatch_7_dword | |
ae65139d WN |
333 | |
334 | tst count, #4 | |
733edfb8 RM |
335 | sfi_breg src, \ |
336 | ldrne tmp1, [\B], #4 | |
337 | sfi_breg dst, \ | |
338 | strne tmp1, [\B], #4 | |
ae65139d WN |
339 | #else |
340 | /* Copy up to 15 full words of data. May not be aligned. */ | |
341 | /* Cannot use VFP for unaligned data. */ | |
342 | and tmp1, count, #0x3c | |
343 | add dst, dst, tmp1 | |
344 | add src, src, tmp1 | |
ae65139d | 345 | /* Jump directly into the sequence below at the correct offset. */ |
733edfb8 RM |
346 | .macro dispatch_step i |
347 | sfi_breg src, \ | |
348 | ldr tmp1, [\B, #-(\i * 4)] | |
349 | sfi_breg dst, \ | |
350 | str tmp1, [\B, #-(\i * 4)] | |
351 | .endm | |
352 | dispatch_15_word | |
ae65139d WN |
353 | #endif |
354 | ||
355 | lsls count, count, #31 | |
733edfb8 RM |
356 | sfi_breg src, \ |
357 | ldrhcs tmp1, [\B], #2 | |
358 | sfi_breg src, \ | |
359 | ldrbne src, [\B] /* Src is dead, use as a scratch. */ | |
360 | sfi_breg dst, \ | |
361 | strhcs tmp1, [\B], #2 | |
362 | sfi_breg dst, \ | |
363 | strbne src, [\B] | |
ae65139d WN |
364 | bx lr |
365 | ||
366 | .Lcpy_not_short: | |
367 | /* At least 64 bytes to copy, but don't know the alignment yet. */ | |
368 | str tmp2, [sp, #-FRAME_SIZE]! | |
369 | cfi_adjust_cfa_offset (FRAME_SIZE) | |
370 | cfi_rel_offset (tmp2, 0) | |
371 | cfi_remember_state | |
cd90698b WN |
372 | and tmp2, src, #7 |
373 | and tmp1, dst, #7 | |
ae65139d WN |
374 | cmp tmp1, tmp2 |
375 | bne .Lcpy_notaligned | |
376 | ||
377 | #ifdef USE_VFP | |
378 | /* Magic dust alert! Force VFP on Cortex-A9. Experiments show | |
379 | that the FP pipeline is much better at streaming loads and | |
380 | stores. This is outside the critical loop. */ | |
381 | vmov.f32 s0, s0 | |
382 | #endif | |
383 | ||
cd90698b | 384 | /* SRC and DST have the same mutual 64-bit alignment, but we may |
ae65139d | 385 | still need to pre-copy some bytes to get to natural alignment. |
cd90698b | 386 | We bring SRC and DST into full 64-bit alignment. */ |
ae65139d WN |
387 | lsls tmp2, dst, #29 |
388 | beq 1f | |
389 | rsbs tmp2, tmp2, #0 | |
390 | sub count, count, tmp2, lsr #29 | |
733edfb8 RM |
391 | sfi_breg src, \ |
392 | ldrmi tmp1, [\B], #4 | |
393 | sfi_breg dst, \ | |
394 | strmi tmp1, [\B], #4 | |
ae65139d | 395 | lsls tmp2, tmp2, #2 |
733edfb8 RM |
396 | sfi_breg src, \ |
397 | ldrhcs tmp1, [\B], #2 | |
398 | sfi_breg src, \ | |
399 | ldrbne tmp2, [\B], #1 | |
400 | sfi_breg dst, \ | |
401 | strhcs tmp1, [\B], #2 | |
402 | sfi_breg dst, \ | |
403 | strbne tmp2, [\B], #1 | |
ae65139d WN |
404 | |
405 | 1: | |
406 | subs tmp2, count, #64 /* Use tmp2 for count. */ | |
407 | blt .Ltail63aligned | |
408 | ||
409 | cmp tmp2, #512 | |
410 | bge .Lcpy_body_long | |
411 | ||
412 | .Lcpy_body_medium: /* Count in tmp2. */ | |
413 | #ifdef USE_VFP | |
414 | 1: | |
733edfb8 RM |
415 | sfi_breg src, \ |
416 | vldr d0, [\B, #0] | |
ae65139d | 417 | subs tmp2, tmp2, #64 |
733edfb8 RM |
418 | sfi_breg src, \ |
419 | vldr d1, [\B, #8] | |
420 | sfi_breg dst, \ | |
421 | vstr d0, [\B, #0] | |
422 | sfi_breg src, \ | |
423 | vldr d0, [\B, #16] | |
424 | sfi_breg dst, \ | |
425 | vstr d1, [\B, #8] | |
426 | sfi_breg src, \ | |
427 | vldr d1, [\B, #24] | |
428 | sfi_breg dst, \ | |
429 | vstr d0, [\B, #16] | |
430 | sfi_breg src, \ | |
431 | vldr d0, [\B, #32] | |
432 | sfi_breg dst, \ | |
433 | vstr d1, [\B, #24] | |
434 | sfi_breg src, \ | |
435 | vldr d1, [\B, #40] | |
436 | sfi_breg dst, \ | |
437 | vstr d0, [\B, #32] | |
438 | sfi_breg src, \ | |
439 | vldr d0, [\B, #48] | |
440 | sfi_breg dst, \ | |
441 | vstr d1, [\B, #40] | |
442 | sfi_breg src, \ | |
443 | vldr d1, [\B, #56] | |
444 | sfi_breg dst, \ | |
445 | vstr d0, [\B, #48] | |
ae65139d | 446 | add src, src, #64 |
733edfb8 RM |
447 | sfi_breg dst, \ |
448 | vstr d1, [\B, #56] | |
ae65139d WN |
449 | add dst, dst, #64 |
450 | bge 1b | |
451 | tst tmp2, #0x3f | |
452 | beq .Ldone | |
453 | ||
454 | .Ltail63aligned: /* Count in tmp2. */ | |
455 | and tmp1, tmp2, #0x38 | |
456 | add dst, dst, tmp1 | |
457 | add src, src, tmp1 | |
733edfb8 RM |
458 | .macro dispatch_step i |
459 | sfi_breg src, \ | |
460 | vldr d0, [\B, #-(\i * 8)] | |
461 | sfi_breg dst, \ | |
462 | vstr d0, [\B, #-(\i * 8)] | |
463 | .endm | |
464 | dispatch_7_dword | |
ae65139d WN |
465 | #else |
466 | sub src, src, #8 | |
467 | sub dst, dst, #8 | |
468 | 1: | |
733edfb8 RM |
469 | sfi_breg src, \ |
470 | ldrd A_l, A_h, [\B, #8] | |
471 | sfi_breg dst, \ | |
472 | strd A_l, A_h, [\B, #8] | |
473 | sfi_breg src, \ | |
474 | ldrd A_l, A_h, [\B, #16] | |
475 | sfi_breg dst, \ | |
476 | strd A_l, A_h, [\B, #16] | |
477 | sfi_breg src, \ | |
478 | ldrd A_l, A_h, [\B, #24] | |
479 | sfi_breg dst, \ | |
480 | strd A_l, A_h, [\B, #24] | |
481 | sfi_breg src, \ | |
482 | ldrd A_l, A_h, [\B, #32] | |
483 | sfi_breg dst, \ | |
484 | strd A_l, A_h, [\B, #32] | |
485 | sfi_breg src, \ | |
486 | ldrd A_l, A_h, [\B, #40] | |
487 | sfi_breg dst, \ | |
488 | strd A_l, A_h, [\B, #40] | |
489 | sfi_breg src, \ | |
490 | ldrd A_l, A_h, [\B, #48] | |
491 | sfi_breg dst, \ | |
492 | strd A_l, A_h, [\B, #48] | |
493 | sfi_breg src, \ | |
494 | ldrd A_l, A_h, [\B, #56] | |
495 | sfi_breg dst, \ | |
496 | strd A_l, A_h, [\B, #56] | |
497 | sfi_breg src, \ | |
498 | ldrd A_l, A_h, [\B, #64]! | |
499 | sfi_breg dst, \ | |
500 | strd A_l, A_h, [\B, #64]! | |
ae65139d WN |
501 | subs tmp2, tmp2, #64 |
502 | bge 1b | |
503 | tst tmp2, #0x3f | |
504 | bne 1f | |
505 | ldr tmp2,[sp], #FRAME_SIZE | |
506 | cfi_adjust_cfa_offset (-FRAME_SIZE) | |
507 | cfi_restore (tmp2) | |
508 | bx lr | |
509 | ||
510 | cfi_restore_state | |
511 | cfi_remember_state | |
512 | 1: | |
513 | add src, src, #8 | |
514 | add dst, dst, #8 | |
515 | ||
516 | .Ltail63aligned: /* Count in tmp2. */ | |
517 | /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but | |
cd90698b | 518 | we know that the src and dest are 64-bit aligned so we can use |
ae65139d WN |
519 | LDRD/STRD to improve efficiency. */ |
520 | /* TMP2 is now negative, but we don't care about that. The bottom | |
521 | six bits still tell us how many bytes are left to copy. */ | |
522 | ||
523 | and tmp1, tmp2, #0x38 | |
524 | add dst, dst, tmp1 | |
525 | add src, src, tmp1 | |
733edfb8 RM |
526 | .macro dispatch_step i |
527 | sfi_breg src, \ | |
528 | ldrd A_l, A_h, [\B, #-(\i * 8)] | |
529 | sfi_breg dst, \ | |
530 | strd A_l, A_h, [\B, #-(\i * 8)] | |
531 | .endm | |
532 | dispatch_7_dword | |
ae65139d | 533 | #endif |
733edfb8 | 534 | |
ae65139d | 535 | tst tmp2, #4 |
733edfb8 RM |
536 | sfi_breg src, \ |
537 | ldrne tmp1, [\B], #4 | |
538 | sfi_breg dst, \ | |
539 | strne tmp1, [\B], #4 | |
ae65139d | 540 | lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ |
733edfb8 RM |
541 | sfi_breg src, \ |
542 | ldrhcs tmp1, [\B], #2 | |
543 | sfi_breg src, \ | |
544 | ldrbne tmp2, [\B] | |
545 | sfi_breg dst, \ | |
546 | strhcs tmp1, [\B], #2 | |
547 | sfi_breg dst, \ | |
548 | strbne tmp2, [\B] | |
ae65139d WN |
549 | |
550 | .Ldone: | |
551 | ldr tmp2, [sp], #FRAME_SIZE | |
552 | cfi_adjust_cfa_offset (-FRAME_SIZE) | |
553 | cfi_restore (tmp2) | |
554 | bx lr | |
555 | ||
556 | cfi_restore_state | |
557 | cfi_remember_state | |
558 | ||
559 | .Lcpy_body_long: /* Count in tmp2. */ | |
560 | ||
561 | /* Long copy. We know that there's at least (prefetch_lines * 64) | |
562 | bytes to go. */ | |
563 | #ifdef USE_VFP | |
564 | /* Don't use PLD. Instead, read some data in advance of the current | |
565 | copy position into a register. This should act like a PLD | |
566 | operation but we won't have to repeat the transfer. */ | |
567 | ||
733edfb8 RM |
568 | sfi_breg src, \ |
569 | vldr d3, [\B, #0] | |
570 | sfi_breg src, \ | |
571 | vldr d4, [\B, #64] | |
572 | sfi_breg src, \ | |
573 | vldr d5, [\B, #128] | |
574 | sfi_breg src, \ | |
575 | vldr d6, [\B, #192] | |
576 | sfi_breg src, \ | |
577 | vldr d7, [\B, #256] | |
578 | ||
579 | sfi_breg src, \ | |
580 | vldr d0, [\B, #8] | |
581 | sfi_breg src, \ | |
582 | vldr d1, [\B, #16] | |
583 | sfi_breg src, \ | |
584 | vldr d2, [\B, #24] | |
ae65139d WN |
585 | add src, src, #32 |
586 | ||
587 | subs tmp2, tmp2, #prefetch_lines * 64 * 2 | |
588 | blt 2f | |
589 | 1: | |
590 | cpy_line_vfp d3, 0 | |
591 | cpy_line_vfp d4, 64 | |
592 | cpy_line_vfp d5, 128 | |
593 | add dst, dst, #3 * 64 | |
594 | add src, src, #3 * 64 | |
595 | cpy_line_vfp d6, 0 | |
596 | cpy_line_vfp d7, 64 | |
597 | add dst, dst, #2 * 64 | |
598 | add src, src, #2 * 64 | |
599 | subs tmp2, tmp2, #prefetch_lines * 64 | |
600 | bge 1b | |
601 | ||
602 | 2: | |
603 | cpy_tail_vfp d3, 0 | |
604 | cpy_tail_vfp d4, 64 | |
605 | cpy_tail_vfp d5, 128 | |
606 | add src, src, #3 * 64 | |
607 | add dst, dst, #3 * 64 | |
608 | cpy_tail_vfp d6, 0 | |
733edfb8 RM |
609 | sfi_breg dst, \ |
610 | vstr d7, [\B, #64] | |
611 | sfi_breg src, \ | |
612 | vldr d7, [\B, #64] | |
613 | sfi_breg dst, \ | |
614 | vstr d0, [\B, #64 + 8] | |
615 | sfi_breg src, \ | |
616 | vldr d0, [\B, #64 + 8] | |
617 | sfi_breg dst, \ | |
618 | vstr d1, [\B, #64 + 16] | |
619 | sfi_breg src, \ | |
620 | vldr d1, [\B, #64 + 16] | |
621 | sfi_breg dst, \ | |
622 | vstr d2, [\B, #64 + 24] | |
623 | sfi_breg src, \ | |
624 | vldr d2, [\B, #64 + 24] | |
625 | sfi_breg dst, \ | |
626 | vstr d7, [\B, #64 + 32] | |
ae65139d | 627 | add src, src, #96 |
733edfb8 RM |
628 | sfi_breg dst, \ |
629 | vstr d0, [\B, #64 + 40] | |
630 | sfi_breg dst, \ | |
631 | vstr d1, [\B, #64 + 48] | |
632 | sfi_breg dst, \ | |
633 | vstr d2, [\B, #64 + 56] | |
ae65139d WN |
634 | add dst, dst, #128 |
635 | add tmp2, tmp2, #prefetch_lines * 64 | |
636 | b .Lcpy_body_medium | |
637 | #else | |
638 | /* Long copy. Use an SMS style loop to maximize the I/O | |
639 | bandwidth of the core. We don't have enough spare registers | |
640 | to synthesise prefetching, so use PLD operations. */ | |
641 | /* Pre-bias src and dst. */ | |
642 | sub src, src, #8 | |
643 | sub dst, dst, #8 | |
733edfb8 RM |
644 | sfi_pld src, #8 |
645 | sfi_pld src, #72 | |
ae65139d | 646 | subs tmp2, tmp2, #64 |
733edfb8 RM |
647 | sfi_pld src, #136 |
648 | sfi_breg src, \ | |
649 | ldrd A_l, A_h, [\B, #8] | |
ae65139d WN |
650 | strd B_l, B_h, [sp, #8] |
651 | cfi_rel_offset (B_l, 8) | |
652 | cfi_rel_offset (B_h, 12) | |
733edfb8 RM |
653 | sfi_breg src, \ |
654 | ldrd B_l, B_h, [\B, #16] | |
ae65139d WN |
655 | strd C_l, C_h, [sp, #16] |
656 | cfi_rel_offset (C_l, 16) | |
657 | cfi_rel_offset (C_h, 20) | |
733edfb8 RM |
658 | sfi_breg src, \ |
659 | ldrd C_l, C_h, [\B, #24] | |
ae65139d WN |
660 | strd D_l, D_h, [sp, #24] |
661 | cfi_rel_offset (D_l, 24) | |
662 | cfi_rel_offset (D_h, 28) | |
733edfb8 RM |
663 | sfi_pld src, #200 |
664 | sfi_breg src, \ | |
665 | ldrd D_l, D_h, [\B, #32]! | |
ae65139d WN |
666 | b 1f |
667 | .p2align 6 | |
668 | 2: | |
733edfb8 RM |
669 | sfi_pld src, #232 |
670 | sfi_breg dst, \ | |
671 | strd A_l, A_h, [\B, #40] | |
672 | sfi_breg src, \ | |
673 | ldrd A_l, A_h, [\B, #40] | |
674 | sfi_breg dst, \ | |
675 | strd B_l, B_h, [\B, #48] | |
676 | sfi_breg src, \ | |
677 | ldrd B_l, B_h, [\B, #48] | |
678 | sfi_breg dst, \ | |
679 | strd C_l, C_h, [\B, #56] | |
680 | sfi_breg src, \ | |
681 | ldrd C_l, C_h, [\B, #56] | |
682 | sfi_breg dst, \ | |
683 | strd D_l, D_h, [\B, #64]! | |
684 | sfi_breg src, \ | |
685 | ldrd D_l, D_h, [\B, #64]! | |
ae65139d WN |
686 | subs tmp2, tmp2, #64 |
687 | 1: | |
733edfb8 RM |
688 | sfi_breg dst, \ |
689 | strd A_l, A_h, [\B, #8] | |
690 | sfi_breg src, \ | |
691 | ldrd A_l, A_h, [\B, #8] | |
692 | sfi_breg dst, \ | |
693 | strd B_l, B_h, [\B, #16] | |
694 | sfi_breg src, \ | |
695 | ldrd B_l, B_h, [\B, #16] | |
696 | sfi_breg dst, \ | |
697 | strd C_l, C_h, [\B, #24] | |
698 | sfi_breg src, \ | |
699 | ldrd C_l, C_h, [\B, #24] | |
700 | sfi_breg dst, \ | |
701 | strd D_l, D_h, [\B, #32] | |
702 | sfi_breg src, \ | |
703 | ldrd D_l, D_h, [\B, #32] | |
ae65139d WN |
704 | bcs 2b |
705 | /* Save the remaining bytes and restore the callee-saved regs. */ | |
733edfb8 RM |
706 | sfi_breg dst, \ |
707 | strd A_l, A_h, [\B, #40] | |
ae65139d | 708 | add src, src, #40 |
733edfb8 RM |
709 | sfi_breg dst, \ |
710 | strd B_l, B_h, [\B, #48] | |
ae65139d WN |
711 | ldrd B_l, B_h, [sp, #8] |
712 | cfi_restore (B_l) | |
713 | cfi_restore (B_h) | |
733edfb8 RM |
714 | sfi_breg dst, \ |
715 | strd C_l, C_h, [\B, #56] | |
ae65139d WN |
716 | ldrd C_l, C_h, [sp, #16] |
717 | cfi_restore (C_l) | |
718 | cfi_restore (C_h) | |
733edfb8 RM |
719 | sfi_breg dst, \ |
720 | strd D_l, D_h, [\B, #64] | |
ae65139d WN |
721 | ldrd D_l, D_h, [sp, #24] |
722 | cfi_restore (D_l) | |
723 | cfi_restore (D_h) | |
724 | add dst, dst, #72 | |
725 | tst tmp2, #0x3f | |
726 | bne .Ltail63aligned | |
727 | ldr tmp2, [sp], #FRAME_SIZE | |
728 | cfi_adjust_cfa_offset (-FRAME_SIZE) | |
729 | cfi_restore (tmp2) | |
730 | bx lr | |
731 | #endif | |
732 | ||
733 | cfi_restore_state | |
734 | cfi_remember_state | |
735 | ||
736 | .Lcpy_notaligned: | |
733edfb8 RM |
737 | sfi_pld src |
738 | sfi_pld src, #64 | |
ae65139d WN |
739 | /* There's at least 64 bytes to copy, but there is no mutual |
740 | alignment. */ | |
741 | /* Bring DST to 64-bit alignment. */ | |
742 | lsls tmp2, dst, #29 | |
733edfb8 | 743 | sfi_pld src, #(2 * 64) |
ae65139d WN |
744 | beq 1f |
745 | rsbs tmp2, tmp2, #0 | |
746 | sub count, count, tmp2, lsr #29 | |
733edfb8 RM |
747 | sfi_breg src, \ |
748 | ldrmi tmp1, [\B], #4 | |
749 | sfi_breg dst, \ | |
750 | strmi tmp1, [\B], #4 | |
ae65139d | 751 | lsls tmp2, tmp2, #2 |
733edfb8 RM |
752 | sfi_breg src, \ |
753 | ldrbne tmp1, [\B], #1 | |
754 | sfi_breg src, \ | |
755 | ldrhcs tmp2, [\B], #2 | |
756 | sfi_breg dst, \ | |
757 | strbne tmp1, [\B], #1 | |
758 | sfi_breg dst, \ | |
759 | strhcs tmp2, [\B], #2 | |
ae65139d | 760 | 1: |
733edfb8 | 761 | sfi_pld src, #(3 * 64) |
ae65139d WN |
762 | subs count, count, #64 |
763 | ldrmi tmp2, [sp], #FRAME_SIZE | |
764 | bmi .Ltail63unaligned | |
733edfb8 | 765 | sfi_pld src, #(4 * 64) |
ae65139d WN |
766 | |
767 | #ifdef USE_NEON | |
733edfb8 RM |
768 | /* These need an extra layer of macro just to work around a |
769 | bug in the assembler's parser when an operand starts with | |
770 | a {...}. */ | |
771 | .macro neon_load_multi reglist, basereg | |
772 | vld1.8 {\reglist}, [\basereg]! | |
773 | .endm | |
774 | .macro neon_store_multi reglist, basereg | |
775 | vst1.8 {\reglist}, [ALIGN (\basereg, 64)]! | |
776 | .endm | |
777 | ||
778 | /* These are used by the NaCl sfi_breg macro. */ | |
779 | .macro _sfi_breg_dmask_neon_load_multi reg | |
780 | _sfi_dmask \reg | |
781 | .endm | |
782 | .macro _sfi_breg_dmask_neon_store_multi reg | |
783 | _sfi_dmask \reg | |
784 | .endm | |
785 | ||
786 | sfi_breg src, neon_load_multi d0-d3, \B | |
787 | sfi_breg src, neon_load_multi d4-d7, \B | |
ae65139d WN |
788 | subs count, count, #64 |
789 | bmi 2f | |
790 | 1: | |
733edfb8 RM |
791 | sfi_pld src, #(4 * 64) |
792 | sfi_breg dst, neon_store_multi d0-d3, \B | |
793 | sfi_breg src, neon_load_multi d0-d3, \B | |
794 | sfi_breg dst, neon_store_multi d4-d7, \B | |
795 | sfi_breg src, neon_load_multi d4-d7, \B | |
ae65139d WN |
796 | subs count, count, #64 |
797 | bpl 1b | |
798 | 2: | |
733edfb8 RM |
799 | sfi_breg dst, neon_store_multi d0-d3, \B |
800 | sfi_breg dst, neon_store_multi d4-d7, \B | |
ae65139d WN |
801 | ands count, count, #0x3f |
802 | #else | |
803 | /* Use an SMS style loop to maximize the I/O bandwidth. */ | |
804 | sub src, src, #4 | |
805 | sub dst, dst, #8 | |
806 | subs tmp2, count, #64 /* Use tmp2 for count. */ | |
733edfb8 RM |
807 | sfi_breg src, \ |
808 | ldr A_l, [\B, #4] | |
809 | sfi_breg src, \ | |
810 | ldr A_h, [\B, #8] | |
ae65139d WN |
811 | strd B_l, B_h, [sp, #8] |
812 | cfi_rel_offset (B_l, 8) | |
813 | cfi_rel_offset (B_h, 12) | |
733edfb8 RM |
814 | sfi_breg src, \ |
815 | ldr B_l, [\B, #12] | |
816 | sfi_breg src, \ | |
817 | ldr B_h, [\B, #16] | |
ae65139d WN |
818 | strd C_l, C_h, [sp, #16] |
819 | cfi_rel_offset (C_l, 16) | |
820 | cfi_rel_offset (C_h, 20) | |
733edfb8 RM |
821 | sfi_breg src, \ |
822 | ldr C_l, [\B, #20] | |
823 | sfi_breg src, \ | |
824 | ldr C_h, [\B, #24] | |
ae65139d WN |
825 | strd D_l, D_h, [sp, #24] |
826 | cfi_rel_offset (D_l, 24) | |
827 | cfi_rel_offset (D_h, 28) | |
733edfb8 RM |
828 | sfi_breg src, \ |
829 | ldr D_l, [\B, #28] | |
830 | sfi_breg src, \ | |
831 | ldr D_h, [\B, #32]! | |
ae65139d WN |
832 | b 1f |
833 | .p2align 6 | |
834 | 2: | |
733edfb8 RM |
835 | sfi_pld src, #(5 * 64) - (32 - 4) |
836 | sfi_breg dst, \ | |
837 | strd A_l, A_h, [\B, #40] | |
838 | sfi_breg src, \ | |
839 | ldr A_l, [\B, #36] | |
840 | sfi_breg src, \ | |
841 | ldr A_h, [\B, #40] | |
842 | sfi_breg dst, \ | |
843 | strd B_l, B_h, [\B, #48] | |
844 | sfi_breg src, \ | |
845 | ldr B_l, [\B, #44] | |
846 | sfi_breg src, \ | |
847 | ldr B_h, [\B, #48] | |
848 | sfi_breg dst, \ | |
849 | strd C_l, C_h, [\B, #56] | |
850 | sfi_breg src, \ | |
851 | ldr C_l, [\B, #52] | |
852 | sfi_breg src, \ | |
853 | ldr C_h, [\B, #56] | |
854 | sfi_breg dst, \ | |
855 | strd D_l, D_h, [\B, #64]! | |
856 | sfi_breg src, \ | |
857 | ldr D_l, [\B, #60] | |
858 | sfi_breg src, \ | |
859 | ldr D_h, [\B, #64]! | |
ae65139d WN |
860 | subs tmp2, tmp2, #64 |
861 | 1: | |
733edfb8 RM |
862 | sfi_breg dst, \ |
863 | strd A_l, A_h, [\B, #8] | |
864 | sfi_breg src, \ | |
865 | ldr A_l, [\B, #4] | |
866 | sfi_breg src, \ | |
867 | ldr A_h, [\B, #8] | |
868 | sfi_breg dst, \ | |
869 | strd B_l, B_h, [\B, #16] | |
870 | sfi_breg src, \ | |
871 | ldr B_l, [\B, #12] | |
872 | sfi_breg src, \ | |
873 | ldr B_h, [\B, #16] | |
874 | sfi_breg dst, \ | |
875 | strd C_l, C_h, [\B, #24] | |
876 | sfi_breg src, \ | |
877 | ldr C_l, [\B, #20] | |
878 | sfi_breg src, \ | |
879 | ldr C_h, [\B, #24] | |
880 | sfi_breg dst, \ | |
881 | strd D_l, D_h, [\B, #32] | |
882 | sfi_breg src, \ | |
883 | ldr D_l, [\B, #28] | |
884 | sfi_breg src, \ | |
885 | ldr D_h, [\B, #32] | |
ae65139d WN |
886 | bcs 2b |
887 | ||
888 | /* Save the remaining bytes and restore the callee-saved regs. */ | |
733edfb8 RM |
889 | sfi_breg dst, \ |
890 | strd A_l, A_h, [\B, #40] | |
ae65139d | 891 | add src, src, #36 |
733edfb8 RM |
892 | sfi_breg dst, \ |
893 | strd B_l, B_h, [\B, #48] | |
ae65139d WN |
894 | ldrd B_l, B_h, [sp, #8] |
895 | cfi_restore (B_l) | |
896 | cfi_restore (B_h) | |
733edfb8 RM |
897 | sfi_breg dst, \ |
898 | strd C_l, C_h, [\B, #56] | |
ae65139d WN |
899 | ldrd C_l, C_h, [sp, #16] |
900 | cfi_restore (C_l) | |
901 | cfi_restore (C_h) | |
733edfb8 RM |
902 | sfi_breg dst, \ |
903 | strd D_l, D_h, [\B, #64] | |
ae65139d WN |
904 | ldrd D_l, D_h, [sp, #24] |
905 | cfi_restore (D_l) | |
906 | cfi_restore (D_h) | |
907 | add dst, dst, #72 | |
908 | ands count, tmp2, #0x3f | |
909 | #endif | |
910 | ldr tmp2, [sp], #FRAME_SIZE | |
911 | cfi_adjust_cfa_offset (-FRAME_SIZE) | |
912 | cfi_restore (tmp2) | |
913 | bne .Ltail63unaligned | |
914 | bx lr | |
915 | ||
916 | END(memcpy) | |
917 | libc_hidden_builtin_def (memcpy) |