/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)  */
/* Thumb cannot encode negative immediate offsets in memory operations.  */

#include <arm-features.h>

/* This implementation requires ARM state.  */

#elif defined (MEMCPY_VFP)
# define FRAME_SIZE	32
# define FRAME_SIZE	32

#define ALIGN(addr, align) addr:align
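/* Usage note (added for illustration): ALIGN (reg, 64) expands to
   "reg:64", GAS's address-alignment-hint syntax for NEON structure
   loads and stores, so an operand written as [ALIGN (dst, 64)]!
   assembles as [dst:64]!, promising the core a 64-bit-aligned
   address.  */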
/* Call parameters.  */

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
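/* Worked example (added; assumes ARM state, where PC_OFS is 8 and
   INSN_SIZE is 4, which this file requires): dispatch_7_dword sets
   tmp1 = 56 - PC_OFS + INSN_SIZE - bytes = 52 - bytes, so
   "add pc, pc, tmp1" lands (56 - bytes) bytes beyond the first step.
   Each step is an 8-byte load/store instruction pair, so (7 - bytes/8)
   steps are skipped and exactly bytes/8 of them execute; e.g. for
   bytes = 24 the jump skips four steps and runs steps 3, 2 and 1,
   copying the dwords at offsets -24, -16 and -8.  dispatch_15_word is
   the same idea with 4-byte units but still 8 bytes of code per step,
   hence the halved constants and the "lsl #1" on its add.  */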
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled

	.macro dispatch_7_dword
	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)

	.purgem dispatch_step

	.macro dispatch_15_word
	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add	pc, pc, tmp1, lsl #1

	.purgem dispatch_step

# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled

	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
		      + ((1f - PC_OFS - 0f) \
			 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
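	/* For illustration (assuming ARM_BX_ALIGN_LOG2 is 3): with
	   steps = 7 and log2_bytes_per_step = 3, each step is padded to
	   8 bytes of code by the .p2align below, so a 24-byte request
	   gives steps_to_skip = (56 - 24) >> 3 = 4 and TMP1 ends up
	   addressing (4 << 3) bytes past label 1, i.e. the step that
	   begins the final three dword copies.  */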
0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)

	.p2align ARM_BX_ALIGN_LOG2

	.macro dispatch_7_dword
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step

	.macro dispatch_15_word
	dispatch_helper 15, 2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step

/* For bulk copies using GP registers.  */
#define A_l	r2	/* Call-clobbered.  */
#define A_h	r3	/* Call-clobbered.  */

/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
#define prefetch_lines	5
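/* Note: with prefetch_lines = 5 and 64-byte lines, the VFP copy loop
   below reads data roughly prefetch_lines * 64 = 320 bytes ahead of
   the line it is currently copying (see the \vreg reload in
   cpy_line_vfp).  */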
	.macro cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]

	.macro cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
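	/* Note: cpy_tail_vfp is cpy_line_vfp minus the reload of \vreg
	   from prefetch_lines * 64 - 32 bytes ahead, since on the last
	   lines there is nothing further worth prefetching.  */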
	mov	dst, dstin	/* Preserve dstin, we need to return it.  */

	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	.macro neon_store_d0 reg

	and	tmp1, count, #0x38
	.macro dispatch_step i
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4

	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c

	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr	tmp1, [src, #-(\i * 4)]
	str	tmp1, [dst, #-(\i * 4)]

	lsls	count, count, #31
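	/* Note: the shift leaves bit 1 of the original count in the
	   carry flag and bit 0 in the N and Z flags, so the "cs"
	   halfword copy and the "ne" byte copy below mop up the last
	   one, two or three bytes.  */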
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2

	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)

	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1
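	/* Note: the conditional word/halfword/byte moves above transfer
	   the 0-7 leading bytes needed to reach 64-bit alignment, and
	   the "sub count, count, tmp2, lsr #29" already deducted that
	   amount from the byte count.  */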
	subs	tmp2, count, #64	/* Use tmp2 for count.  */

.Lcpy_body_medium:			/* Count in tmp2.  */

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	.macro dispatch_step i
	vldr	d0, [src, #-(\i * 8)]
	vstr	d0, [dst, #-(\i * 8)]

	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
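	/* Note: these eight LDRD/STRD pairs move one 64-byte block
	   through A_l/A_h; only the final pair uses pre-indexed
	   writeback ([..., #64]!), which advances SRC and DST to the
	   next block in a single step.  */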
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */
	and	tmp1, tmp2, #0x38
	.macro dispatch_step i
	ldrd	A_l, A_h, [src, #-(\i * 8)]
	strd	A_l, A_h, [dst, #-(\i * 8)]

	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	strhcs	tmp1, [dst], #2

	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

.Lcpy_body_long:			/* Count in tmp2.  */
	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
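	/* Note: the "read ahead" register is the \vreg argument of
	   cpy_line_vfp, which every copied line reloads from
	   prefetch_lines * 64 - 32 bytes beyond the line being
	   copied.  */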
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2

	add	dst, dst, #3 * 64
	add	src, src, #3 * 64

	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64

	add	src, src, #3 * 64
	add	dst, dst, #3 * 64

	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]

	add	tmp2, tmp2, #prefetch_lines * 64

	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
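	/* Note: "SMS style" here means a software-pipelined
	   (modulo-scheduled) loop: the LDRDs in each half-iteration
	   fetch the data that the STRDs of the following half-iteration
	   write back, so the A/B/C/D register pairs keep 32 bytes in
	   flight and every load runs well ahead of its matching
	   store.  */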
	/* Pre-bias src and dst.  */

	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldrd	D_l, D_h, [src, #32]!

	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!

	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]

	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2

	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	.Ltail63unaligned

	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8	{\reglist}, [\basereg]!
	.macro neon_store_multi reglist, basereg
	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
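	/* Note: only the store side carries the ALIGN (":64") hint,
	   because DST was brought to 64-bit alignment above while SRC
	   may still be arbitrarily aligned.  */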
	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs	count, count, #64

	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs	count, count, #64

	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
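	/* Note: each pass of the loop above keeps 64 bytes in flight in
	   d0-d7, storing the block loaded on the previous pass while
	   loading the next one; the final two stores drain the last 64
	   bytes once the count is exhausted.  */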
	ands	count, count, #0x3f

	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	subs	tmp2, count, #64	/* Use tmp2 for count.  */

	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)

	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	strd	C_l, C_h, [dst, #56]
	strd	D_l, D_h, [dst, #64]!
	strd	A_l, A_h, [dst, #8]
	strd	B_l, B_h, [dst, #16]
	strd	C_l, C_h, [dst, #24]
	strd	D_l, D_h, [dst, #32]

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]

	ands	count, tmp2, #0x3f
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	bne	.Ltail63unaligned

libc_hidden_builtin_def (memcpy)