/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations. */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef MEMCPY_NEON

        .fpu neon
        .arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

        .arch armv6
        .fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch armv6
# define FRAME_SIZE 32

#endif

#define ALIGN(addr, align) addr:align

#define INSN_SIZE 4

/* Call parameters. */
#define dstin r0
#define src r1
#define count r2

/* Locals. */
#define tmp1 r3
#define dst ip
#define tmp2 r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here). That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit". On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size. The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last. Units are
   double-words (8 bytes). TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes). TMP1 is at most 60. */
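/* Worked example for dispatch_7_dword (a sketch, assuming ARM state, where
   the PC reads as the current instruction address plus 8, i.e. PC_OFS == 8):
   each step below is two 4-byte instructions, so one step occupies 8 bytes
   of code and copies 8 bytes of data. If TMP1 == 16 on entry, the RSB
   leaves TMP1 = (56 - 8 + 4) - 16 = 36, and the ADD then transfers control
   36 + 8 = 44 bytes past itself, landing on dispatch_step 2, so exactly the
   last two steps (16 bytes) execute. dispatch_15_word works the same way,
   except a step copies only 4 bytes while still being 8 bytes of code,
   hence the halved constants and the extra LSL #1. */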

#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
# error case not handled
# endif
        .macro dispatch_7_dword
        rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
        add pc, pc, tmp1
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        rsb tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
        add pc, pc, tmp1, lsl #1
        dispatch_step 15
        dispatch_step 14
        dispatch_step 13
        dispatch_step 12
        dispatch_step 11
        dispatch_step 10
        dispatch_step 9
        dispatch_step 8
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
# error case not handled
# endif
        .macro dispatch_helper steps, log2_bytes_per_step
        /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
           (STEPS << LOG2_BYTES_PER_STEP).
           So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
           Then it needs further adjustment to compensate for the
           distance between the PC value taken below (0f + PC_OFS)
           and the first step's instructions (1f). */
        rsb tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
                          + ((1f - PC_OFS - 0f) \
                             >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
        /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
           steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
           the (byte) distance to add to the PC. */
0:      add tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
        bx tmp1
        .p2align ARM_BX_ALIGN_LOG2
1:
        .endm

        .macro dispatch_7_dword
        dispatch_helper 7, 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        dispatch_helper 15, 2
        dispatch_step 15
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 14
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 13
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 12
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 11
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 10
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 9
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 8
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l r2 /* Call-clobbered. */
#define A_h r3 /* Call-clobbered. */
#define B_l r4
#define B_h r5
#define C_l r6
#define C_h r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved. */
#define D_l r10
#define D_h r11
#endif

/* Number of lines ahead to pre-fetch data. If you change this the code
   below will need adjustment to compensate. */

#define prefetch_lines 5
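/* With prefetch_lines == 5 and 64-byte lines, each iteration of the main
   VFP loop below copies 5 * 64 = 320 bytes, and the leading doubleword of
   each line is fetched roughly 320 bytes ahead of the store position. */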

#ifdef USE_VFP
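/* cpy_line_vfp copies one 64-byte line. Every store drains a register that
   was filled on the previous call (d0-d2 and \vreg) and is immediately
   followed by the load that refills that register; the one far load, at
   \base + prefetch_lines * 64 - 32, pulls in the line prefetch_lines ahead
   so it is ready when its turn comes. */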
        .macro cpy_line_vfp vreg, base
        sfi_breg dst, \
        vstr \vreg, [\B, #\base]
        sfi_breg src, \
        vldr \vreg, [\B, #\base]
        sfi_breg dst, \
        vstr d0, [\B, #\base + 8]
        sfi_breg src, \
        vldr d0, [\B, #\base + 8]
        sfi_breg dst, \
        vstr d1, [\B, #\base + 16]
        sfi_breg src, \
        vldr d1, [\B, #\base + 16]
        sfi_breg dst, \
        vstr d2, [\B, #\base + 24]
        sfi_breg src, \
        vldr d2, [\B, #\base + 24]
        sfi_breg dst, \
        vstr \vreg, [\B, #\base + 32]
        sfi_breg src, \
        vldr \vreg, [\B, #\base + prefetch_lines * 64 - 32]
        sfi_breg dst, \
        vstr d0, [\B, #\base + 40]
        sfi_breg src, \
        vldr d0, [\B, #\base + 40]
        sfi_breg dst, \
        vstr d1, [\B, #\base + 48]
        sfi_breg src, \
        vldr d1, [\B, #\base + 48]
        sfi_breg dst, \
        vstr d2, [\B, #\base + 56]
        sfi_breg src, \
        vldr d2, [\B, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        sfi_breg dst, \
        vstr \vreg, [\B, #\base]
        sfi_breg src, \
        vldr \vreg, [\B, #\base]
        sfi_breg dst, \
        vstr d0, [\B, #\base + 8]
        sfi_breg src, \
        vldr d0, [\B, #\base + 8]
        sfi_breg dst, \
        vstr d1, [\B, #\base + 16]
        sfi_breg src, \
        vldr d1, [\B, #\base + 16]
        sfi_breg dst, \
        vstr d2, [\B, #\base + 24]
        sfi_breg src, \
        vldr d2, [\B, #\base + 24]
        sfi_breg dst, \
        vstr \vreg, [\B, #\base + 32]
        sfi_breg dst, \
        vstr d0, [\B, #\base + 40]
        sfi_breg src, \
        vldr d0, [\B, #\base + 40]
        sfi_breg dst, \
        vstr d1, [\B, #\base + 48]
        sfi_breg src, \
        vldr d1, [\B, #\base + 48]
        sfi_breg dst, \
        vstr d2, [\B, #\base + 56]
        sfi_breg src, \
        vldr d2, [\B, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov dst, dstin /* Preserve dstin, we need to return it. */
        cmp count, #64
        bge .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

.Ltail63unaligned:
#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}. http://sourceware.org/bugzilla/show_bug.cgi?id=15647
           tracks that bug; it was not fixed as of binutils-2.23.2. */
        .macro neon_load_d0 reg
        vld1.8 {d0}, [\reg]!
        .endm
        .macro neon_store_d0 reg
        vst1.8 {d0}, [\reg]!
        .endm

        /* These are used by the NaCl sfi_breg macro. */
        .macro _sfi_breg_dmask_neon_load_d0 reg
        _sfi_dmask \reg
        .endm
        .macro _sfi_breg_dmask_neon_store_d0 reg
        _sfi_dmask \reg
        .endm

        and tmp1, count, #0x38
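        /* Only the low six bits of COUNT matter here (on the unaligned
           path COUNT may have gone negative); TMP1 = COUNT & 0x38 is the
           whole-doubleword part, a multiple of 8 and at most 56. */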
        .macro dispatch_step i
        sfi_breg src, neon_load_d0 \B
        sfi_breg dst, neon_store_d0 \B
        .endm
        dispatch_7_dword

        tst count, #4
        sfi_breg src, \
        ldrne tmp1, [\B], #4
        sfi_breg dst, \
        strne tmp1, [\B], #4
#else
        /* Copy up to 15 full words of data. May not be aligned. */
        /* Cannot use VFP for unaligned data. */
        and tmp1, count, #0x3c
        add dst, dst, tmp1
        add src, src, tmp1
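        /* Advancing DST and SRC first lets each step below address its
           word with a negative offset from the end of the run, while the
           computed jump selects how many trailing steps actually run. */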
        /* Jump directly into the sequence below at the correct offset. */
        .macro dispatch_step i
        sfi_breg src, \
        ldr tmp1, [\B, #-(\i * 4)]
        sfi_breg dst, \
        str tmp1, [\B, #-(\i * 4)]
        .endm
        dispatch_15_word
#endif

        lsls count, count, #31
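        /* The shift puts bit 1 of COUNT into the carry flag and bit 0 into
           the N/Z flags: CS means a trailing halfword remains, NE means a
           trailing byte remains. */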
        sfi_breg src, \
        ldrhcs tmp1, [\B], #2
        sfi_breg src, \
        ldrbne src, [\B] /* Src is dead, use as a scratch. */
        sfi_breg dst, \
        strhcs tmp1, [\B], #2
        sfi_breg dst, \
        strbne src, [\B]
        bx lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str tmp2, [sp, #-FRAME_SIZE]!
        cfi_adjust_cfa_offset (FRAME_SIZE)
        cfi_rel_offset (tmp2, 0)
        cfi_remember_state
        and tmp2, src, #7
        and tmp1, dst, #7
        cmp tmp1, tmp2
        bne .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
           that the FP pipeline is much better at streaming loads and
           stores. This is outside the critical loop. */
        vmov.f32 s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment. */
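        /* The LSLS moves the low three bits of DST to the top of TMP2, so
           EQ means DST is already 8-byte aligned. After the RSBS,
           TMP2 >> 29 is the number of bytes needed to reach alignment, and
           the MI, CS and NE conditions below peel off a word, a halfword
           and a byte of it as required. */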
        lsls tmp2, dst, #29
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        sfi_breg src, \
        ldrmi tmp1, [\B], #4
        sfi_breg dst, \
        strmi tmp1, [\B], #4
        lsls tmp2, tmp2, #2
        sfi_breg src, \
        ldrhcs tmp1, [\B], #2
        sfi_breg src, \
        ldrbne tmp2, [\B], #1
        sfi_breg dst, \
        strhcs tmp1, [\B], #2
        sfi_breg dst, \
        strbne tmp2, [\B], #1

1:
        subs tmp2, count, #64 /* Use tmp2 for count. */
        blt .Ltail63aligned

        cmp tmp2, #512
        bge .Lcpy_body_long

.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
1:
        sfi_breg src, \
        vldr d0, [\B, #0]
        subs tmp2, tmp2, #64
        sfi_breg src, \
        vldr d1, [\B, #8]
        sfi_breg dst, \
        vstr d0, [\B, #0]
        sfi_breg src, \
        vldr d0, [\B, #16]
        sfi_breg dst, \
        vstr d1, [\B, #8]
        sfi_breg src, \
        vldr d1, [\B, #24]
        sfi_breg dst, \
        vstr d0, [\B, #16]
        sfi_breg src, \
        vldr d0, [\B, #32]
        sfi_breg dst, \
        vstr d1, [\B, #24]
        sfi_breg src, \
        vldr d1, [\B, #40]
        sfi_breg dst, \
        vstr d0, [\B, #32]
        sfi_breg src, \
        vldr d0, [\B, #48]
        sfi_breg dst, \
        vstr d1, [\B, #40]
        sfi_breg src, \
        vldr d1, [\B, #56]
        sfi_breg dst, \
        vstr d0, [\B, #48]
        add src, src, #64
        sfi_breg dst, \
        vstr d1, [\B, #56]
        add dst, dst, #64
        bge 1b
        tst tmp2, #0x3f
        beq .Ldone

.Ltail63aligned: /* Count in tmp2. */
        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        .macro dispatch_step i
        sfi_breg src, \
        vldr d0, [\B, #-(\i * 8)]
        sfi_breg dst, \
        vstr d0, [\B, #-(\i * 8)]
        .endm
        dispatch_7_dword
#else
        sub src, src, #8
        sub dst, dst, #8
1:
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #8]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #8]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #16]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #16]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #24]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #24]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #32]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #32]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #40]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #40]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #48]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #48]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #56]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #56]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #64]!
        sfi_breg dst, \
        strd A_l, A_h, [\B, #64]!
        subs tmp2, tmp2, #64
        bge 1b
        tst tmp2, #0x3f
        bne 1f
        ldr tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx lr

        cfi_restore_state
        cfi_remember_state
1:
        add src, src, #8
        add dst, dst, #8

.Ltail63aligned: /* Count in tmp2. */
        /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that. The bottom
           six bits still tell us how many bytes are left to copy. */
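        /* For example, with 20 bytes left TMP2 is 20 - 64 = -44, and
           -44 & 0x3f is still 20. */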

        and tmp1, tmp2, #0x38
        add dst, dst, tmp1
        add src, src, tmp1
        .macro dispatch_step i
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #-(\i * 8)]
        sfi_breg dst, \
        strd A_l, A_h, [\B, #-(\i * 8)]
        .endm
        dispatch_7_dword
#endif

        tst tmp2, #4
        sfi_breg src, \
        ldrne tmp1, [\B], #4
        sfi_breg dst, \
        strne tmp1, [\B], #4
        lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
        sfi_breg src, \
        ldrhcs tmp1, [\B], #2
        sfi_breg src, \
        ldrbne tmp2, [\B]
        sfi_breg dst, \
        strhcs tmp1, [\B], #2
        sfi_breg dst, \
        strbne tmp2, [\B]

.Ldone:
        ldr tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx lr

        cfi_restore_state
        cfi_remember_state

.Lcpy_body_long: /* Count in tmp2. */

        /* Long copy. We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD. Instead, read some data in advance of the current
           copy position into a register. This should act like a PLD
           operation but we won't have to repeat the transfer. */
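        /* d3-d7 are loaded from offsets 0, 64, 128, 192 and 256 below, so
           each holds the leading doubleword of one of the next
           prefetch_lines lines; d0-d2 carry the rest of the current line. */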

        sfi_breg src, \
        vldr d3, [\B, #0]
        sfi_breg src, \
        vldr d4, [\B, #64]
        sfi_breg src, \
        vldr d5, [\B, #128]
        sfi_breg src, \
        vldr d6, [\B, #192]
        sfi_breg src, \
        vldr d7, [\B, #256]

        sfi_breg src, \
        vldr d0, [\B, #8]
        sfi_breg src, \
        vldr d1, [\B, #16]
        sfi_breg src, \
        vldr d2, [\B, #24]
        add src, src, #32

        subs tmp2, tmp2, #prefetch_lines * 64 * 2
        blt 2f
1:
        cpy_line_vfp d3, 0
        cpy_line_vfp d4, 64
        cpy_line_vfp d5, 128
        add dst, dst, #3 * 64
        add src, src, #3 * 64
        cpy_line_vfp d6, 0
        cpy_line_vfp d7, 64
        add dst, dst, #2 * 64
        add src, src, #2 * 64
        subs tmp2, tmp2, #prefetch_lines * 64
        bge 1b

2:
        cpy_tail_vfp d3, 0
        cpy_tail_vfp d4, 64
        cpy_tail_vfp d5, 128
        add src, src, #3 * 64
        add dst, dst, #3 * 64
        cpy_tail_vfp d6, 0
        sfi_breg dst, \
        vstr d7, [\B, #64]
        sfi_breg src, \
        vldr d7, [\B, #64]
        sfi_breg dst, \
        vstr d0, [\B, #64 + 8]
        sfi_breg src, \
        vldr d0, [\B, #64 + 8]
        sfi_breg dst, \
        vstr d1, [\B, #64 + 16]
        sfi_breg src, \
        vldr d1, [\B, #64 + 16]
        sfi_breg dst, \
        vstr d2, [\B, #64 + 24]
        sfi_breg src, \
        vldr d2, [\B, #64 + 24]
        sfi_breg dst, \
        vstr d7, [\B, #64 + 32]
        add src, src, #96
        sfi_breg dst, \
        vstr d0, [\B, #64 + 40]
        sfi_breg dst, \
        vstr d1, [\B, #64 + 48]
        sfi_breg dst, \
        vstr d2, [\B, #64 + 56]
        add dst, dst, #128
        add tmp2, tmp2, #prefetch_lines * 64
        b .Lcpy_body_medium
#else
        /* Long copy. Use an SMS style loop to maximize the I/O
           bandwidth of the core. We don't have enough spare registers
           to synthesise prefetching, so use PLD operations. */
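        /* Software-pipelined loop: each STRD below drains a register pair
           that was filled 32 bytes earlier and is immediately followed by
           the LDRD that refills it, so the loads always run ahead of the
           stores. */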
        /* Pre-bias src and dst. */
        sub src, src, #8
        sub dst, dst, #8
        sfi_pld src, #8
        sfi_pld src, #72
        subs tmp2, tmp2, #64
        sfi_pld src, #136
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #8]
        strd B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        sfi_breg src, \
        ldrd B_l, B_h, [\B, #16]
        strd C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        sfi_breg src, \
        ldrd C_l, C_h, [\B, #24]
        strd D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        sfi_pld src, #200
        sfi_breg src, \
        ldrd D_l, D_h, [\B, #32]!
        b 1f
        .p2align 6
2:
        sfi_pld src, #232
        sfi_breg dst, \
        strd A_l, A_h, [\B, #40]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #40]
        sfi_breg dst, \
        strd B_l, B_h, [\B, #48]
        sfi_breg src, \
        ldrd B_l, B_h, [\B, #48]
        sfi_breg dst, \
        strd C_l, C_h, [\B, #56]
        sfi_breg src, \
        ldrd C_l, C_h, [\B, #56]
        sfi_breg dst, \
        strd D_l, D_h, [\B, #64]!
        sfi_breg src, \
        ldrd D_l, D_h, [\B, #64]!
        subs tmp2, tmp2, #64
1:
        sfi_breg dst, \
        strd A_l, A_h, [\B, #8]
        sfi_breg src, \
        ldrd A_l, A_h, [\B, #8]
        sfi_breg dst, \
        strd B_l, B_h, [\B, #16]
        sfi_breg src, \
        ldrd B_l, B_h, [\B, #16]
        sfi_breg dst, \
        strd C_l, C_h, [\B, #24]
        sfi_breg src, \
        ldrd C_l, C_h, [\B, #24]
        sfi_breg dst, \
        strd D_l, D_h, [\B, #32]
        sfi_breg src, \
        ldrd D_l, D_h, [\B, #32]
        bcs 2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        sfi_breg dst, \
        strd A_l, A_h, [\B, #40]
        add src, src, #40
        sfi_breg dst, \
        strd B_l, B_h, [\B, #48]
        ldrd B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        sfi_breg dst, \
        strd C_l, C_h, [\B, #56]
        ldrd C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        sfi_breg dst, \
        strd D_l, D_h, [\B, #64]
        ldrd D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add dst, dst, #72
        tst tmp2, #0x3f
        bne .Ltail63aligned
        ldr tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx lr
#endif

        cfi_restore_state
        cfi_remember_state

.Lcpy_notaligned:
        sfi_pld src
        sfi_pld src, #64
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls tmp2, dst, #29
        sfi_pld src, #(2 * 64)
        beq 1f
        rsbs tmp2, tmp2, #0
        sub count, count, tmp2, lsr #29
        sfi_breg src, \
        ldrmi tmp1, [\B], #4
        sfi_breg dst, \
        strmi tmp1, [\B], #4
        lsls tmp2, tmp2, #2
        sfi_breg src, \
        ldrbne tmp1, [\B], #1
        sfi_breg src, \
        ldrhcs tmp2, [\B], #2
        sfi_breg dst, \
        strbne tmp1, [\B], #1
        sfi_breg dst, \
        strhcs tmp2, [\B], #2
1:
        sfi_pld src, #(3 * 64)
        subs count, count, #64
        ldrmi tmp2, [sp], #FRAME_SIZE
        bmi .Ltail63unaligned
        sfi_pld src, #(4 * 64)

#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}. */
        .macro neon_load_multi reglist, basereg
        vld1.8 {\reglist}, [\basereg]!
        .endm
        .macro neon_store_multi reglist, basereg
        vst1.8 {\reglist}, [ALIGN (\basereg, 64)]!
        .endm
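        /* The ALIGN macro expands to "\basereg:64", an alignment hint that
           tells VST1.8 the destination is at least 64-bit aligned (DST was
           brought to 64-bit alignment above). */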

        /* These are used by the NaCl sfi_breg macro. */
        .macro _sfi_breg_dmask_neon_load_multi reg
        _sfi_dmask \reg
        .endm
        .macro _sfi_breg_dmask_neon_store_multi reg
        _sfi_dmask \reg
        .endm

        sfi_breg src, neon_load_multi d0-d3, \B
        sfi_breg src, neon_load_multi d4-d7, \B
        subs count, count, #64
        bmi 2f
1:
        sfi_pld src, #(4 * 64)
        sfi_breg dst, neon_store_multi d0-d3, \B
        sfi_breg src, neon_load_multi d0-d3, \B
        sfi_breg dst, neon_store_multi d4-d7, \B
        sfi_breg src, neon_load_multi d4-d7, \B
        subs count, count, #64
        bpl 1b
2:
        sfi_breg dst, neon_store_multi d0-d3, \B
        sfi_breg dst, neon_store_multi d4-d7, \B
        ands count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth. */
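        /* DST is now 64-bit aligned, so the stores can use STRD; SRC may be
           arbitrarily aligned, so the loads use single LDRs (unaligned
           access is assumed, see the header). SRC is biased by 4 and DST
           by 8 so that the loop offsets line up. */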
        sub src, src, #4
        sub dst, dst, #8
        subs tmp2, count, #64 /* Use tmp2 for count. */
        sfi_breg src, \
        ldr A_l, [\B, #4]
        sfi_breg src, \
        ldr A_h, [\B, #8]
        strd B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        sfi_breg src, \
        ldr B_l, [\B, #12]
        sfi_breg src, \
        ldr B_h, [\B, #16]
        strd C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        sfi_breg src, \
        ldr C_l, [\B, #20]
        sfi_breg src, \
        ldr C_h, [\B, #24]
        strd D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        sfi_breg src, \
        ldr D_l, [\B, #28]
        sfi_breg src, \
        ldr D_h, [\B, #32]!
        b 1f
        .p2align 6
2:
        sfi_pld src, #(5 * 64) - (32 - 4)
        sfi_breg dst, \
        strd A_l, A_h, [\B, #40]
        sfi_breg src, \
        ldr A_l, [\B, #36]
        sfi_breg src, \
        ldr A_h, [\B, #40]
        sfi_breg dst, \
        strd B_l, B_h, [\B, #48]
        sfi_breg src, \
        ldr B_l, [\B, #44]
        sfi_breg src, \
        ldr B_h, [\B, #48]
        sfi_breg dst, \
        strd C_l, C_h, [\B, #56]
        sfi_breg src, \
        ldr C_l, [\B, #52]
        sfi_breg src, \
        ldr C_h, [\B, #56]
        sfi_breg dst, \
        strd D_l, D_h, [\B, #64]!
        sfi_breg src, \
        ldr D_l, [\B, #60]
        sfi_breg src, \
        ldr D_h, [\B, #64]!
        subs tmp2, tmp2, #64
1:
        sfi_breg dst, \
        strd A_l, A_h, [\B, #8]
        sfi_breg src, \
        ldr A_l, [\B, #4]
        sfi_breg src, \
        ldr A_h, [\B, #8]
        sfi_breg dst, \
        strd B_l, B_h, [\B, #16]
        sfi_breg src, \
        ldr B_l, [\B, #12]
        sfi_breg src, \
        ldr B_h, [\B, #16]
        sfi_breg dst, \
        strd C_l, C_h, [\B, #24]
        sfi_breg src, \
        ldr C_l, [\B, #20]
        sfi_breg src, \
        ldr C_h, [\B, #24]
        sfi_breg dst, \
        strd D_l, D_h, [\B, #32]
        sfi_breg src, \
        ldr D_l, [\B, #28]
        sfi_breg src, \
        ldr D_h, [\B, #32]
        bcs 2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        sfi_breg dst, \
        strd A_l, A_h, [\B, #40]
        add src, src, #36
        sfi_breg dst, \
        strd B_l, B_h, [\B, #48]
        ldrd B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        sfi_breg dst, \
        strd C_l, C_h, [\B, #56]
        ldrd C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        sfi_breg dst, \
        strd D_l, D_h, [\B, #64]
        ldrd D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add dst, dst, #72
        ands count, tmp2, #0x3f
#endif
        ldr tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bne .Ltail63unaligned
        bx lr

END(memcpy)
libc_hidden_builtin_def (memcpy)