/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)  */
/* Thumb cannot encode negative immediate offsets in memory operations.  */

#include <arm-features.h>

/* This implementation requires ARM state.  */

#elif defined (MEMCPY_VFP)
# define FRAME_SIZE	32
# define FRAME_SIZE	32

#define ALIGN(addr, align) addr:align
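/* Usage note (added for illustration): ALIGN (reg, 64) expands to
   "reg:64", GAS's address-alignment-hint syntax for NEON structure
   loads and stores, so an operand written as [ALIGN (dst, 64)]!
   assembles as [dst:64]!, promising the core a 64-bit-aligned
   address.  */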
/* Call parameters.  */

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
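/* Worked example (added; assumes ARM state, where PC_OFS is 8 and
   INSN_SIZE is 4, which this file requires): dispatch_7_dword sets
   tmp1 = 56 - PC_OFS + INSN_SIZE - bytes = 52 - bytes, so
   "add pc, pc, tmp1" lands (56 - bytes) bytes beyond the first step.
   Each step is an 8-byte load/store instruction pair, so (7 - bytes/8)
   steps are skipped and exactly bytes/8 of them execute; e.g. for
   bytes = 24 the jump skips four steps and runs steps 3, 2 and 1,
   copying the dwords at offsets -24, -16 and -8.  dispatch_15_word is
   the same idea with 4-byte units but still 8 bytes of code per step,
   hence the halved constants and the "lsl #1" on its add.  */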
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled

	.macro dispatch_7_dword
	rsb	tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)

	.purgem dispatch_step

	.macro dispatch_15_word
	rsb	tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
	add	pc, pc, tmp1, lsl #1

	.purgem dispatch_step

# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled

	.macro dispatch_helper steps, log2_bytes_per_step
	/* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
	   (STEPS << LOG2_BYTES_PER_STEP).
	   So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
	   Then it needs further adjustment to compensate for the
	   distance between the PC value taken below (0f + PC_OFS)
	   and the first step's instructions (1f).  */
	rsb	tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
		      + ((1f - PC_OFS - 0f) \
			 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
	/* Shifting down LOG2_BYTES_PER_STEP gives us the number of
	   steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
	   the (byte) distance to add to the PC.  */
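	/* For illustration (assuming ARM_BX_ALIGN_LOG2 is 3): with
	   steps = 7 and log2_bytes_per_step = 3, each step is padded to
	   8 bytes of code by the .p2align below, so a 24-byte request
	   gives steps_to_skip = (56 - 24) >> 3 = 4 and TMP1 ends up
	   addressing (4 << 3) bytes past label 1, i.e. the step that
	   begins the final three dword copies.  */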
0:	add	tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)

	.p2align ARM_BX_ALIGN_LOG2

	.macro dispatch_7_dword
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step

	.macro dispatch_15_word
	dispatch_helper 15, 2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.p2align ARM_BX_ALIGN_LOG2
	.purgem dispatch_step

/* For bulk copies using GP registers.  */
#define A_l	r2	/* Call-clobbered.  */
#define A_h	r3	/* Call-clobbered.  */

/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
#define prefetch_lines	5
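/* Note: with prefetch_lines = 5 and 64-byte lines, the VFP copy loop
   below reads data roughly prefetch_lines * 64 = 320 bytes ahead of
   the line it is currently copying (see the \vreg reload in
   cpy_line_vfp).  */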
	.macro cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]

	.macro cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
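	/* Note: cpy_tail_vfp is cpy_line_vfp minus the reload of \vreg
	   from prefetch_lines * 64 - 32 bytes ahead, since on the last
	   lines there is nothing further worth prefetching.  */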
	mov	dst, dstin	/* Preserve dstin, we need to return it.  */

	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
	   tracks that bug; it was not fixed as of binutils-2.23.2.  */
	.macro neon_load_d0 reg
	.macro neon_store_d0 reg

	and	tmp1, count, #0x38
	.macro dispatch_step i
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4

	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c

	/* Jump directly into the sequence below at the correct offset.  */
	.macro dispatch_step i
	ldr	tmp1, [src, #-(\i * 4)]
	str	tmp1, [dst, #-(\i * 4)]

	lsls	count, count, #31
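	/* Note: the shift leaves bit 1 of the original count in the
	   carry flag and bit 0 in the N and Z flags, so the "cs"
	   halfword copy and the "ne" byte copy below mop up the last
	   one, two or three bytes.  */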
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2

	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	cfi_adjust_cfa_offset (FRAME_SIZE)
	cfi_rel_offset (tmp2, 0)

	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1
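	/* Note: the conditional word/halfword/byte moves above transfer
	   the 0-7 leading bytes needed to reach 64-bit alignment, and
	   the "sub count, count, tmp2, lsr #29" already deducted that
	   amount from the byte count.  */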
	subs	tmp2, count, #64	/* Use tmp2 for count.  */

.Lcpy_body_medium:			/* Count in tmp2.  */

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	.macro dispatch_step i
	vldr	d0, [src, #-(\i * 8)]
	vstr	d0, [dst, #-(\i * 8)]

	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
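	/* Note: these eight LDRD/STRD pairs move one 64-byte block
	   through A_l/A_h; only the final pair uses pre-indexed
	   writeback ([..., #64]!), which advances SRC and DST to the
	   next block in a single step.  */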
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */
	and	tmp1, tmp2, #0x38
	.macro dispatch_step i
	ldrd	A_l, A_h, [src, #-(\i * 8)]
	strd	A_l, A_h, [dst, #-(\i * 8)]

	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	strhcs	tmp1, [dst], #2

	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

.Lcpy_body_long:			/* Count in tmp2.  */
	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
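	/* Note: the "read ahead" register is the \vreg argument of
	   cpy_line_vfp, which every copied line reloads from
	   prefetch_lines * 64 - 32 bytes beyond the line being
	   copied.  */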
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2

	add	dst, dst, #3 * 64
	add	src, src, #3 * 64

	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64

	add	src, src, #3 * 64
	add	dst, dst, #3 * 64

	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]

	add	tmp2, tmp2, #prefetch_lines * 64

	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
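	/* Note: "SMS style" here means a software-pipelined
	   (modulo-scheduled) loop: the LDRDs in each half-iteration
	   fetch the data that the STRDs of the following half-iteration
	   write back, so the A/B/C/D register pairs keep 32 bytes in
	   flight and every load runs well ahead of its matching
	   store.  */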
	/* Pre-bias src and dst.  */

	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)
	ldrd	D_l, D_h, [src, #32]!

	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!

	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]

	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)

	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2

	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	.Ltail63unaligned

	/* These need an extra layer of macro just to work around a
	   bug in the assembler's parser when an operand starts with
	   a {...}.  */
	.macro neon_load_multi reglist, basereg
	vld1.8	{\reglist}, [\basereg]!
	.macro neon_store_multi reglist, basereg
	vst1.8	{\reglist}, [ALIGN (\basereg, 64)]!
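	/* Note: only the store side carries the ALIGN (":64") hint,
	   because DST was brought to 64-bit alignment above while SRC
	   may still be arbitrarily aligned.  */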
	neon_load_multi d0-d3, src
	neon_load_multi d4-d7, src
	subs	count, count, #64

	neon_store_multi d0-d3, dst
	neon_load_multi d0-d3, src
	neon_store_multi d4-d7, dst
	neon_load_multi d4-d7, src
	subs	count, count, #64

	neon_store_multi d0-d3, dst
	neon_store_multi d4-d7, dst
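	/* Note: each pass of the loop above keeps 64 bytes in flight in
	   d0-d7, storing the block loaded on the previous pass while
	   loading the next one; the final two stores drain the last 64
	   bytes once the count is exhausted.  */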
	ands	count, count, #0x3f

	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	subs	tmp2, count, #64	/* Use tmp2 for count.  */

	strd	B_l, B_h, [sp, #8]
	cfi_rel_offset (B_l, 8)
	cfi_rel_offset (B_h, 12)
	strd	C_l, C_h, [sp, #16]
	cfi_rel_offset (C_l, 16)
	cfi_rel_offset (C_h, 20)
	strd	D_l, D_h, [sp, #24]
	cfi_rel_offset (D_l, 24)
	cfi_rel_offset (D_h, 28)

	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	strd	C_l, C_h, [dst, #56]
	strd	D_l, D_h, [dst, #64]!
	strd	A_l, A_h, [dst, #8]
	strd	B_l, B_h, [dst, #16]
	strd	C_l, C_h, [dst, #24]
	strd	D_l, D_h, [dst, #32]

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]

	ands	count, tmp2, #0x3f
	ldr	tmp2, [sp], #FRAME_SIZE
	cfi_adjust_cfa_offset (-FRAME_SIZE)
	bne	.Ltail63unaligned

libc_hidden_builtin_def (memcpy)