sysdeps/powerpc/powerpc64/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include <bp-sym.h>
  21 #include <bp-asm.h>
  22
  23 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  24    Returns 'dst'.
  25
  26    Memcpy handles short copies (< 32-bytes) using binary move blocks
  27    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  28    with the appropriate combination of byte and halfword load/stores.
  29    There is minimal effort to optimize the alignment of short moves.
  30    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  31    of handling unaligned load/stores that do not cross 32-byte boundaries.
  32
  33    Longer moves (>= 32-bytes) justify the effort to get at least the
  34    destination doubleword (8-byte) aligned.  Further optimization is
  35    possible when both source and destination are doubleword aligned.
  36    Each case has an optimized unrolled loop.
  37
  38    For POWER6 unaligned loads will take a 20+ cycle hiccup for any
  39    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
  40    is more forgiving and does not take a hiccup until page or
  41    segment boundaries.  So we require doubleword alignment for
  42    the source but may take a risk and only require word alignment
  43    for the destination.  */
  44
  45         .machine        "power6"
  46 EALIGN (BP_SYM (memcpy), 7, 0)
  47         CALL_MCOUNT 3
  48
  49     cmpldi cr1,5,31     /* cr1 = len compared with 31.  */
  50     neg   0,3
  51     std   3,-16(1)      /* Save dst; every exit reloads it as the return value.  */
  52     std   31,-8(1)      /* Save r31; the unaligned path (.L6) overwrites it.  */
  53     andi. 11,3,7        /* check alignment of dst.  */
  54     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  55     clrldi 10,4,61      /* check alignment of src.  */
  56     cmpldi cr6,5,8      /* cr6 = len compared with 8, for the short move code.  */
  57     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  58     mtcrf 0x01,0        /* cr7 = low 4 bits of the dst alignment byte count.  */
  59     cmpld cr6,10,11     /* cr6.eq if src and dst share the same alignment.  */
  60     srdi  9,5,3         /* Number of full double words remaining.  */
  61     beq   .L0           /* dst is already doubleword aligned (cr0 from andi.).  */
  62
  63     subf  5,0,5         /* len -= bytes moved to align dst.  */
  64   /* Move 0-7 bytes as needed to get the destination doubleword aligned.
  65      Duplicate some code to maximize fall-through and minimize agen delays.  */
  66 1:  bf    31,2f         /* 1 bit clear: no single byte to move.  */
  67     lbz   6,0(4)
  68     stb   6,0(3)
  69     bf    30,5f         /* 2 bit clear: no halfword to move.  */
  70     lhz   6,1(4)
  71     sth   6,1(3)
  72     bf    29,0f         /* 4 bit clear: no word to move.  */
  73     lwz   6,3(4)
  74     stw   6,3(3)
  75     b     0f
  76 5:
  77     bf    29,0f         /* 4 bit clear: no word to move.  */
  78     lwz   6,1(4)
  79     stw   6,1(3)
  80     b     0f
  81
  82 2:  bf    30,4f         /* 2 bit clear: no halfword to move.  */
  83     lhz   6,0(4)
  84     sth   6,0(3)
  85     bf    29,0f         /* 4 bit clear: no word to move.  */
  86     lwz   6,2(4)
  87     stw   6,2(3)
  88     b     0f
  89
  90 4:  bf    29,0f         /* 4 bit clear: no word to move.  */
  91     lwz   6,0(4)
  92     stw   6,0(3)
  93 0:
  94 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
  95     add   4,4,0
  96     add   3,3,0
  97
  98     clrldi 10,4,61      /* check alignment of src again.  */
  99     srdi  9,5,3 /* Number of full double words remaining.  */
 100
 101   /* Copy doublewords from source to destination, assuming the
 102      destination is aligned on a doubleword boundary.
 103
 104      At this point we know there are at least 25 bytes left (32-7) to copy.
 105      The next step is to determine if the source is also doubleword aligned.
 106      If not, branch to the unaligned move code at .L6, which uses
 107      a load, shift, store strategy.
 108
 109      Otherwise source and destination are doubleword aligned, and we can
 110      use the optimized doubleword copy loop.  */
 111     .align  4
 112 .L0:
 113     clrldi  11,5,61     /* r11 = len & 7, the tail byte count.  */
 114     andi.   0,5,0x78    /* cr0.eq if len has no 8-120 byte (1-15 DW) part.  */
 115     srdi    12,5,7      /* Number of 128-byte blocks to move.  */
 116     cmpldi  cr1,11,0    /* If the tail is 0 bytes  */
 117     bne-    cr6,.L6     /* If source is not DW aligned.  */
 118
 119   /* Move doublewords where destination and source are DW aligned.
 120      Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
 121      If the copy is not an exact multiple of 128 bytes, 1-15
 122      doublewords are copied as needed to set up the main loop.  After
 123      the main loop exits there may be a tail of 1-7 bytes. These bytes
 124      are copied a word/halfword/byte at a time as needed to preserve
 125      alignment.
 126
 127      For POWER6 the L1 is store-through and the L2 is store-in.  The
 128      L2 is clocked at half CPU clock so we can store 16 bytes every
 129      other cycle.  POWER6 also has a load/store bypass so we can do
 130      load, load, store, store every 2 cycles.
 131
 132      The following code is sensitive to cache line alignment.  Do not
 133      make any change without first making sure they don't result in
 134      splitting ld/std pairs across a cache line.  */
 135
 136     mtcrf 0x02,5        /* cr6 = len bits for 128/64/32/16 bytes.  */
 137     mtcrf 0x01,5        /* cr7 = len bits for 8/4/2/1 bytes.  */
 138     cmpldi  cr5,12,1    /* cr5 = 128-byte block count compared with 1.  */
 139     beq   L(das_loop)   /* No 1-15 DW part (cr0 from andi.): enter main loop.  */
 140
 141     bf    25,4f         /* 64 bit of len clear: skip the 64-byte move.  */
 142     .align  3
 143     ld    6,0(4)
 144     ld    7,8(4)
 145     mr    11,4          /* r11 = src base for the offset loads below.  */
 146     mr    10,3          /* r10 = dst base for the offset stores below.  */
 147     std   6,0(3)
 148     std   7,8(3)
 149     ld    6,16(4)
 150     ld    7,24(4)
 151     std   6,16(3)
 152     std   7,24(3)
 153     ld    6,0+32(4)
 154     ld    7,8+32(4)
 155     addi  4,4,64
 156     addi  3,3,64
 157     std   6,0+32(10)
 158     std   7,8+32(10)
 159     ld    6,16+32(11)
 160     ld    7,24+32(11)
 161     std   6,16+32(10)
 162     std   7,24+32(10)
 163 4:
 164     mr    10,3
 165     bf    26,2f         /* 32 bit of len clear: skip the 32-byte move.  */
 166     ld    6,0(4)
 167     ld    7,8(4)
 168     mr    11,4
 169     nop
 170     std   6,0(3)
 171     std   7,8(3)
 172     ld    6,16(4)
 173     ld    7,24(4)
 174     addi  4,4,32
 175     std   6,16(3)
 176     std   7,24(3)
 177     addi  3,3,32
 178 6:
 179     nop
 180     bf    27,5f         /* 16 bit of len clear: skip the 16-byte move.  */
 181     ld    6,0+32(11)    /* r10/r11 still point 32 bytes before r3/r4.  */
 182     ld    7,8+32(11)
 183     addi  4,4,16
 184     addi  3,3,16
 185     std   6,0+32(10)
 186     std   7,8+32(10)
 187     bf    28,L(das_loop_s)  /* 8 bit of len clear: no single DW to move.  */
 188     ld    0,16+32(11)
 189     addi  4,4,8
 190     addi  3,3,8
 191     std   0,16+32(10)
 192     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the tail is left.  */
 193     b     L(das_loop)
 194     .align  3
 195 5:
 196     nop
 197     bf    28,L(das_loop_s)  /* 8 bit of len clear: no single DW to move.  */
 198     ld    6,32(11)
 199     addi  4,4,8
 200     addi  3,3,8
 201     std   6,32(10)
 202     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the tail is left.  */
 203     b     L(das_loop)
 204     .align  3
 205 2:
 206     mr    11,4
 207     bf    27,1f         /* 16 bit of len clear: skip the 16-byte move.  */
 208     ld    6,0(4)
 209     ld    7,8(4)
 210     addi  4,4,16
 211     addi  3,3,16
 212     std   6,0(10)
 213     std   7,8(10)
 214     bf    28,L(das_loop_s)  /* 8 bit of len clear: no single DW to move.  */
 215     ld    0,16(11)
 216     addi  4,11,24       /* src = base + 16 + 8.  */
 217     addi  3,10,24       /* dst = base + 16 + 8.  */
 218     std   0,16(10)
 219     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the tail is left.  */
 220     b     L(das_loop)
 221     .align  3
 222 1:
 223     nop
 224     bf    28,L(das_loop_s)  /* 8 bit of len clear: no single DW to move.  */
 225     ld    6,0(4)
 226     addi  4,4,8
 227     addi  3,3,8
 228     std   6,0(10)
 229 L(das_loop_s):
 230     nop
 231     blt   cr5,L(das_tail)   /* No 128-byte blocks: only the tail is left.  */
 232     .align  4
 233 L(das_loop):            /* Copy the first 128-byte block (src and dst DW aligned).  */
 234     ld    6,0(4)
 235     ld    7,8(4)
 236     mr    10,3          /* r10 = dst base for the stores after r3 is advanced.  */
 237     mr    11,4          /* r11 = src base for the loads after r4 is advanced.  */
 238     std   6,0(3)
 239     std   7,8(3)
 240     addi  12,12,-1      /* One 128-byte block accounted for.  */
 241     nop
 242     ld    8,16(4)
 243     ld    0,24(4)
 244     std   8,16(3)
 245     std   0,24(3)
 246
 247     ld    6,0+32(4)
 248     ld    7,8+32(4)
 249     std   6,0+32(3)
 250     std   7,8+32(3)
 251     ld    8,16+32(4)
 252     ld    0,24+32(4)
 253     std   8,16+32(3)
 254     std   0,24+32(3)
 255
 256     ld    6,0+64(11)
 257     ld    7,8+64(11)
 258     std   6,0+64(10)
 259     std   7,8+64(10)
 260     ld    8,16+64(11)
 261     ld    0,24+64(11)
 262     std   8,16+64(10)
 263     std   0,24+64(10)
 264
 265     ld    6,0+96(11)
 266     ld    7,8+96(11)
 267     addi  4,4,128
 268     addi  3,3,128
 269     std   6,0+96(10)
 270     std   7,8+96(10)
 271     ld    8,16+96(11)
 272     ld    0,24+96(11)
 273     std   8,16+96(10)
 274     std   0,24+96(10)
 275     ble   cr5,L(das_loop_e)  /* Block count was <= 1 (cr5 set at .L0): done.  */
 276
 277     mtctr   12          /* ctr = remaining 128-byte blocks.  */
 278     .align  4
 279 L(das_loop2):           /* Copy one 128-byte block per iteration.  */
 280     ld    6,0(4)
 281     ld    7,8(4)
 282     mr    10,3
 283     mr    11,4
 284     std   6,0(3)
 285     std   7,8(3)
 286     ld    8,16(4)
 287     ld    0,24(4)
 288     std   8,16(3)
 289     std   0,24(3)
 290
 291     ld    6,0+32(4)
 292     ld    7,8+32(4)
 293     std   6,0+32(3)
 294     std   7,8+32(3)
 295     ld    8,16+32(4)
 296     ld    0,24+32(4)
 297     std   8,16+32(3)
 298     std   0,24+32(3)
 299
 300     ld    6,0+64(11)
 301     ld    7,8+64(11)
 302     std   6,0+64(10)
 303     std   7,8+64(10)
 304     ld    8,16+64(11)
 305     ld    0,24+64(11)
 306     std   8,16+64(10)
 307     std   0,24+64(10)
 308
 309     ld    6,0+96(11)
 310     ld    7,8+96(11)
 311     addi  4,4,128
 312     addi  3,3,128
 313     std   6,0+96(10)
 314     std   7,8+96(10)
 315     ld    8,16+96(11)
 316     ld    0,24+96(11)
 317     std   8,16+96(10)
 318     std   0,24+96(10)
 319     bdnz  L(das_loop2)
 320 L(das_loop_e):
 321 /* Check for a 1-7 byte tail, return if none.  */
 322     bne   cr1,L(das_tail2)
 323 /* Return original dst pointer.  */
 324     ld 3,-16(1)
 325     blr
 326     .align  4
 327 L(das_tail):
 328     beq   cr1,0f        /* cr1.eq (set at .L0): the tail is 0 bytes.  */
 329
 330 L(das_tail2):
 331 /*  At this point we have a tail of 0-7 bytes and we know that the
 332     destination is double word aligned.  */
 333 4:  bf    29,2f         /* 4 bit of len clear: no word to move.  */
 334     lwz   6,0(4)
 335     stw   6,0(3)
 336     bf    30,5f         /* 2 bit of len clear: no halfword to move.  */
 337     lhz   6,4(4)
 338     sth   6,4(3)
 339     bf    31,0f         /* 1 bit of len clear: no byte to move.  */
 340     lbz   6,6(4)
 341     stb   6,6(3)
 342     b     0f
 343 5:  bf    31,0f         /* 1 bit of len clear: no byte to move.  */
 344     lbz   6,4(4)
 345     stb   6,4(3)
 346     b     0f
 347
 348 2:  bf    30,1f         /* 2 bit of len clear: no halfword to move.  */
 349     lhz   6,0(4)
 350     sth   6,0(3)
 351     bf    31,0f         /* 1 bit of len clear: no byte to move.  */
 352     lbz   6,2(4)
 353     stb   6,2(3)
 354     b     0f
 355
 356 1:  bf    31,0f         /* 1 bit of len clear: no byte to move.  */
 357     lbz   6,0(4)
 358     stb   6,0(3)
 359 0:
 360   /* Return original dst pointer.  */
 361     ld 3,-16(1)
 362     blr
 363
 364 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 365    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 366    tests.
 367
 368    In the short (0-8 byte) case no attempt is made to force alignment
 369    of either source or destination.  The hardware will handle the
 370    unaligned load/stores with small delays for crossing 32-, 128-byte,
 371    and 4096-byte boundaries. Since these short moves are unlikely to be
 372    unaligned or cross these boundaries, the overhead to force
 373    alignment is not justified.
 374
 375    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 376    boundaries.  Since only loads are sensitive to the 32-/128-byte
 377    boundaries it is more important to align the source than the
 378    destination.  If the source is not already word aligned, we first
 379    move 1-3 bytes as needed.  Since we are only word aligned we don't
 380    use double word load/stores to ensure that all loads are aligned.
 381    While the destination and stores may still be unaligned, this
 382    is only an issue for page (4096 byte boundary) crossing, which
 383    should be rare for these short moves.  The hardware handles this
 384    case automatically with a small (~20 cycle) delay.  */
 385     .align  4
 386 .L2:
 387     mtcrf 0x01,5        /* cr7 = low 4 bits of len.  */
 388     neg   8,4
 389     clrrdi      11,4,2  /* r11 = src rounded down to a word boundary.  */
 390     andi. 0,8,3         /* r0 = bytes needed to word align src (0-3).  */
 391     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 392 /* At least 9 bytes left.  Get the source word aligned.  */
 393     cmpldi      cr1,5,16
 394     mr    10,5          /* r10 = bytes left.  */
 395     mr    12,4          /* r12 = src read pointer.  */
 396     cmpldi      cr6,0,2 /* cr6 = alignment byte count compared with 2.  */
 397     beq   L(dus_tail)   /* If the source is already word aligned skip this.  */
 398 /* Copy 1-3 bytes to get source address word aligned.  */
 399     lwz   6,0(11)       /* Load the aligned word that holds the first src bytes.  */
 400     subf  10,0,5        /* r10 = len minus the alignment bytes.  */
 401     add   12,4,0        /* r12 = word aligned src.  */
 402     blt   cr6,5f        /* 1 byte to move.  */
 403     srdi  7,6,16
 404     bgt   cr6,3f        /* 3 bytes to move.  */
 405     sth   6,0(3)        /* 2 bytes to move.  */
 406     b     7f
 407     .align  4
 408 3:
 409     stb   7,0(3)
 410     sth   6,1(3)
 411     b     7f
 412     .align  4
 413 5:
 414     stb   6,0(3)
 415 7:
 416     cmpldi      cr1,10,16   /* Redo the 16-byte test on the reduced count.  */
 417     add   3,3,0
 418     mtcrf 0x01,10       /* cr7 = low 4 bits of the reduced count.  */
 419     .align  4
 420 L(dus_tail):
 421 /* At least 6 bytes left and the source is word aligned.  This allows
 422    some speculative loads up front.  */
 423 /* We need to special case the fall-through because the biggest delays
 424    are due to address computation not being ready in time for the
 425    AGEN.  */
 426     lwz   6,0(12)
 427     lwz   7,4(12)
 428     blt   cr1,L(dus_tail8)  /* Fewer than 16 bytes left.  */
 429     cmpldi      cr0,10,24
 430 L(dus_tail16): /* Move 16 bytes.  */
 431     stw   6,0(3)
 432     stw   7,4(3)
 433     lwz   6,8(12)
 434     lwz   7,12(12)
 435     stw   6,8(3)
 436     stw   7,12(3)
 437 /* Move 8 bytes more.  */
 438     bf    28,L(dus_tail16p8)
 439     cmpldi      cr1,10,28
 440     lwz   6,16(12)
 441     lwz   7,20(12)
 442     stw   6,16(3)
 443     stw   7,20(3)
 444 /* Move 4 bytes more.  */
 445     bf    29,L(dus_tail16p4)
 446     lwz   6,24(12)
 447     stw   6,24(3)
 448     addi  12,12,28
 449     addi  3,3,28
 450     bgt   cr1,L(dus_tail2)
 451  /* exactly 28 bytes.  Return original dst pointer and exit.  */
 452     ld    3,-16(1)
 453     blr
 454     .align  4
 455 L(dus_tail16p8):  /* less than 8 bytes left.  */
 456     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
 457     cmpldi      cr1,10,20
 458     bf    29,L(dus_tail16p2)
 459 /* Move 4 bytes more.  */
 460     lwz   6,16(12)
 461     stw   6,16(3)
 462     addi  12,12,20
 463     addi  3,3,20
 464     bgt   cr1,L(dus_tail2)
 465  /* exactly 20 bytes.  Return original dst pointer and exit.  */
 466     ld    3,-16(1)
 467     blr
 468     .align  4
 469 L(dus_tail16p4):  /* less than 4 bytes left.  */
 470     addi  12,12,24
 471     addi  3,3,24
 472     bgt   cr0,L(dus_tail2)
 473  /* exactly 24 bytes.  Return original dst pointer and exit.  */
 474     ld    3,-16(1)
 475     blr
 476     .align  4
 477 L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 478     addi  12,12,16
 479     addi  3,3,16
 480     b     L(dus_tail2)
 481
 482     .align  4
 483 L(dus_tail8):  /* Move 8 bytes.  */
 484 /*  r6, r7 already loaded speculatively.  */
 485     cmpldi      cr1,10,8
 486     cmpldi      cr0,10,12
 487     bf    28,L(dus_tail4)
 488     .align  2
 489     stw   6,0(3)
 490     stw   7,4(3)
 491 /* Move 4 bytes more.  */
 492     bf    29,L(dus_tail8p4)
 493     lwz   6,8(12)
 494     stw   6,8(3)
 495     addi  12,12,12
 496     addi  3,3,12
 497     bgt   cr0,L(dus_tail2)
 498  /* exactly 12 bytes.  Return original dst pointer and exit.  */
 499     ld    3,-16(1)
 500     blr
 501     .align  4
 502 L(dus_tail8p4):  /* less than 4 bytes left.  */
 503     addi  12,12,8
 504     addi  3,3,8
 505     bgt   cr1,L(dus_tail2)
 506  /* exactly 8 bytes.  Return original dst pointer and exit.  */
 507     ld    3,-16(1)
 508     blr
 509
 510     .align  4
 511 L(dus_tail4):  /* Move 4 bytes.  */
 512 /*  r6 already loaded speculatively.  If we are here we know there is
 513     more than 4 bytes left.  So there is no need to test.  */
 514     addi  12,12,4
 515     stw   6,0(3)
 516     addi  3,3,4
 517 L(dus_tail2):  /* Move 2-3 bytes.  */
 518     bf    30,L(dus_tail1)
 519     lhz   6,0(12)
 520     sth   6,0(3)
 521     bf    31,L(dus_tailX)
 522     lbz   7,2(12)
 523     stb   7,2(3)
 524     ld 3,-16(1)
 525     blr
 526 L(dus_tail1):  /* Move 1 byte.  */
 527     bf    31,L(dus_tailX)
 528     lbz   6,0(12)
 529     stb   6,0(3)
 530 L(dus_tailX):
 531   /* Return original dst pointer.  */
 532     ld    3,-16(1)
 533     blr
 534
 535 /* Special case to copy 0-8 bytes.  */
 536     .align  4
 537 .LE8:
 538     mr    12,4          /* r12 = src, as L(dus_tail2) expects.  */
 539     bne   cr6,L(dus_4)  /* cr6 (len compared with 8): fewer than 8 bytes.  */
 540 /* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
 541    cycle delay.  This case should be rare and any attempt to avoid this
 542    would take most of 20 cycles any way.  */
 543     ld   6,0(4)
 544     std   6,0(3)
 545   /* Return original dst pointer.  */
 546     ld    3,-16(1)
 547     blr
 548     .align  4
 549 L(dus_4):
 550     bf    29,L(dus_tail2)   /* 4 bit of len clear: 0-3 bytes to move.  */
 551     lwz   6,0(4)
 552     stw   6,0(3)
 553     bf    30,L(dus_5)   /* 2 bit of len clear: no halfword to move.  */
 554     lhz   7,4(4)
 555     sth   7,4(3)
 556     bf    31,L(dus_0)   /* 1 bit of len clear: no byte to move.  */
 557     lbz   8,6(4)
 558     stb   8,6(3)
 559     ld 3,-16(1)
 560     blr
 561     .align  4
 562 L(dus_5):
 563     bf    31,L(dus_0)   /* 1 bit of len clear: no byte to move.  */
 564     lbz   6,4(4)
 565     stb   6,4(3)
 566 L(dus_0):
 567   /* Return original dst pointer.  */
 568     ld    3,-16(1)
 569     blr
 570
 571     .align  4
 572 .L6:
 573     cfi_offset(31,-8)
 574     mr    12,4          /* r12 = src, kept for the tail copy in L(du_done).  */
 575     mr    31,5          /* r31 = len, kept for the tail copy in L(du_done).  */
 576   /* Copy doublewords where the destination is aligned but the source is
 577      not.  Use aligned doubleword loads from the source, shifted to realign
 578      the data, to allow aligned destination stores.  */
 579     addi    11,9,-1  /* loop DW count is one less than total */
 580     subf    5,10,12  /* Move source addr to previous full double word.  */
 581     cmpldi  cr5, 10, 2   /* r10 = src offset within its doubleword.  */
 582     cmpldi  cr0, 10, 4
 583     mr      4,3      /* r4 = dst store pointer; r3 is left unchanged.  */
 584     srdi    8,11,2   /* calculate the 32 byte loop count */
 585     ld      6,0(5)   /* pre load 1st full doubleword.  */
 586     mtcrf   0x01,11  /* cr7 = low 4 bits of (DW count - 1).  */
 587     cmpldi  cr6,9,4  /* cr6 = DW count compared with 4.  */
 588     mtctr   8
 589     ld      7,8(5)   /* pre load 2nd full doubleword.  */
 590     bge     cr0, L(du4_do)   /* src offset is 4-7.  */
 591     blt     cr5, L(du1_do)   /* src offset is 1.  */
 592     beq     cr5, L(du2_do)   /* src offset is 2.  */
 593     b       L(du3_do)        /* src offset is 3.  */
 594
 595     .align 4
 596 L(du1_do):   /* src is 1 byte past a DW boundary: realign with 8-bit shifts.  */
 597     bf      30,L(du1_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 598
 599     /* there are at least two DWs to copy */
 600     sldi     0,6, 8
 601     srdi     8,7, 64-8
 602     or      0,0,8            /* r0 = next realigned dst DW.  */
 603     ld      6,16(5)
 604     std     0,0(4)
 605     sldi     0,7, 8
 606     srdi     8,6, 64-8
 607     or      0,0,8
 608     ld      7,24(5)
 609     std     0,8(4)
 610     addi    4,4,16
 611     addi    5,5,32
 612     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
 613     bf      31,L(du1_loop)   /* 1 bit of (DW count - 1) is clear.  */
 614     /* there is a third DW to copy */
 615     sldi     0,6, 8
 616     srdi     8,7, 64-8
 617     or      0,0,8
 618     std     0,0(4)
 619     mr      6,7
 620     ld      7,0(5)
 621     addi    5,5,8
 622     addi    4,4,8
 623     beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
 624     b       L(du1_loop)
 625     .align 4
 626 L(du1_1dw):
 627     sldi     0,6, 8
 628     srdi     8,7, 64-8
 629     addi    5,5,16
 630     or      0,0,8
 631     bf      31,L(du1_loop)   /* 1 bit of (DW count - 1) is clear.  */
 632     mr      6,7
 633     ld      7,0(5)
 634     addi    5,5,8
 635     std     0,0(4)
 636     addi    4,4,8
 637     .align 4
 638 /* copy 32 bytes at a time */
 639 L(du1_loop):
 640     sldi   0,6, 8
 641     srdi   8,7, 64-8
 642     or    0,0,8
 643     ld    6,0(5)
 644     std   0,0(4)
 645     sldi   0,7, 8
 646     srdi   8,6, 64-8
 647     or    0,0,8
 648     ld    7,8(5)
 649     std   0,8(4)
 650     sldi   0,6, 8
 651     srdi   8,7, 64-8
 652     or    0,0,8
 653     ld    6,16(5)
 654     std   0,16(4)
 655     sldi   0,7, 8
 656     srdi   8,6, 64-8
 657     or    0,0,8
 658     ld    7,24(5)
 659     std   0,24(4)
 660     addi  5,5,32
 661     addi  4,4,32
 662     bdnz+ L(du1_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
 663     .align 4
 664 L(du1_fini):
 665     /* calculate and store the final DW */
 666     sldi   0,6, 8
 667     srdi   8,7, 64-8
 668     or    0,0,8
 669     std   0,0(4)
 670     b     L(du_done)
 671
 672     .align 4
 673 L(du2_do):   /* src is 2 bytes past a DW boundary: realign with 16-bit shifts.  */
 674     bf      30,L(du2_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 675
 676     /* there are at least two DWs to copy */
 677     sldi     0,6, 16
 678     srdi     8,7, 64-16
 679     or      0,0,8            /* r0 = next realigned dst DW.  */
 680     ld      6,16(5)
 681     std     0,0(4)
 682     sldi     0,7, 16
 683     srdi     8,6, 64-16
 684     or      0,0,8
 685     ld      7,24(5)
 686     std     0,8(4)
 687     addi    4,4,16
 688     addi    5,5,32
 689     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
 690     bf      31,L(du2_loop)   /* 1 bit of (DW count - 1) is clear.  */
 691     /* there is a third DW to copy */
 692     sldi     0,6, 16
 693     srdi     8,7, 64-16
 694     or      0,0,8
 695     std     0,0(4)
 696     mr      6,7
 697     ld      7,0(5)
 698     addi    5,5,8
 699     addi    4,4,8
 700     beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
 701     b       L(du2_loop)
 702     .align 4
 703 L(du2_1dw):
 704     sldi     0,6, 16
 705     srdi     8,7, 64-16
 706     addi    5,5,16
 707     or      0,0,8
 708     bf      31,L(du2_loop)   /* 1 bit of (DW count - 1) is clear.  */
 709     mr      6,7
 710     ld      7,0(5)
 711     addi    5,5,8
 712     std     0,0(4)
 713     addi    4,4,8
 714     .align 4
 715 /* copy 32 bytes at a time */
 716 L(du2_loop):
 717     sldi   0,6, 16
 718     srdi   8,7, 64-16
 719     or    0,0,8
 720     ld    6,0(5)
 721     std   0,0(4)
 722     sldi   0,7, 16
 723     srdi   8,6, 64-16
 724     or    0,0,8
 725     ld    7,8(5)
 726     std   0,8(4)
 727     sldi   0,6, 16
 728     srdi   8,7, 64-16
 729     or    0,0,8
 730     ld    6,16(5)
 731     std   0,16(4)
 732     sldi   0,7, 16
 733     srdi   8,6, 64-16
 734     or    0,0,8
 735     ld    7,24(5)
 736     std   0,24(4)
 737     addi  5,5,32
 738     addi  4,4,32
 739     bdnz+ L(du2_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
 740     .align 4
 741 L(du2_fini):
 742     /* calculate and store the final DW */
 743     sldi   0,6, 16
 744     srdi   8,7, 64-16
 745     or    0,0,8
 746     std   0,0(4)
 747     b     L(du_done)
 748
 749     .align 4
 750 L(du3_do):   /* src is 3 bytes past a DW boundary: realign with 24-bit shifts.  */
 751     bf      30,L(du3_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 752
 753     /* there are at least two DWs to copy */
 754     sldi     0,6, 24
 755     srdi     8,7, 64-24
 756     or      0,0,8            /* r0 = next realigned dst DW.  */
 757     ld      6,16(5)
 758     std     0,0(4)
 759     sldi     0,7, 24
 760     srdi     8,6, 64-24
 761     or      0,0,8
 762     ld      7,24(5)
 763     std     0,8(4)
 764     addi    4,4,16
 765     addi    5,5,32
 766     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
 767     bf      31,L(du3_loop)   /* 1 bit of (DW count - 1) is clear.  */
 768     /* there is a third DW to copy */
 769     sldi     0,6, 24
 770     srdi     8,7, 64-24
 771     or      0,0,8
 772     std     0,0(4)
 773     mr      6,7
 774     ld      7,0(5)
 775     addi    5,5,8
 776     addi    4,4,8
 777     beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
 778     b       L(du3_loop)
 779     .align 4
 780 L(du3_1dw):
 781     sldi     0,6, 24
 782     srdi     8,7, 64-24
 783     addi    5,5,16
 784     or      0,0,8
 785     bf      31,L(du3_loop)   /* 1 bit of (DW count - 1) is clear.  */
 786     mr      6,7
 787     ld      7,0(5)
 788     addi    5,5,8
 789     std     0,0(4)
 790     addi    4,4,8
 791     .align 4
 792 /* copy 32 bytes at a time */
 793 L(du3_loop):
 794     sldi   0,6, 24
 795     srdi   8,7, 64-24
 796     or    0,0,8
 797     ld    6,0(5)
 798     std   0,0(4)
 799     sldi   0,7, 24
 800     srdi   8,6, 64-24
 801     or    0,0,8
 802     ld    7,8(5)
 803     std   0,8(4)
 804     sldi   0,6, 24
 805     srdi   8,7, 64-24
 806     or    0,0,8
 807     ld    6,16(5)
 808     std   0,16(4)
 809     sldi   0,7, 24
 810     srdi   8,6, 64-24
 811     or    0,0,8
 812     ld    7,24(5)
 813     std   0,24(4)
 814     addi  5,5,32
 815     addi  4,4,32
 816     bdnz+ L(du3_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
 817     .align 4
 818 L(du3_fini):
 819     /* calculate and store the final DW */
 820     sldi   0,6, 24
 821     srdi   8,7, 64-24
 822     or    0,0,8
 823     std   0,0(4)
 824     b     L(du_done)
 825
 826     .align 4
 827 L(du4_do):   /* src offset is 4-7: pick the matching shift variant.  */
 828     cmpldi  cr5, 10, 6
 829     beq     cr0, L(du4_dox)  /* src offset is 4 (cr0 set at .L6).  */
 830     blt     cr5, L(du5_do)   /* src offset is 5.  */
 831     beq     cr5, L(du6_do)   /* src offset is 6.  */
 832     b       L(du7_do)        /* src offset is 7.  */
 833 L(du4_dox):  /* src is 4 bytes past a DW boundary: realign with 32-bit shifts.  */
 834     bf      30,L(du4_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 835
 836     /* there are at least two DWs to copy */
 837     sldi     0,6, 32
 838     srdi     8,7, 64-32
 839     or      0,0,8            /* r0 = next realigned dst DW.  */
 840     ld      6,16(5)
 841     std     0,0(4)
 842     sldi     0,7, 32
 843     srdi     8,6, 64-32
 844     or      0,0,8
 845     ld      7,24(5)
 846     std     0,8(4)
 847     addi    4,4,16
 848     addi    5,5,32
 849     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
 850     bf      31,L(du4_loop)   /* 1 bit of (DW count - 1) is clear.  */
 851     /* there is a third DW to copy */
 852     sldi     0,6, 32
 853     srdi     8,7, 64-32
 854     or      0,0,8
 855     std     0,0(4)
 856     mr      6,7
 857     ld      7,0(5)
 858     addi    5,5,8
 859     addi    4,4,8
 860     beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
 861     b       L(du4_loop)
 862     .align 4
 863 L(du4_1dw):
 864     sldi     0,6, 32
 865     srdi     8,7, 64-32
 866     addi    5,5,16
 867     or      0,0,8
 868     bf      31,L(du4_loop)   /* 1 bit of (DW count - 1) is clear.  */
 869     mr      6,7
 870     ld      7,0(5)
 871     addi    5,5,8
 872     std     0,0(4)
 873     addi    4,4,8
 874     .align 4
 875 /* copy 32 bytes at a time */
 876 L(du4_loop):
 877     sldi   0,6, 32
 878     srdi   8,7, 64-32
 879     or    0,0,8
 880     ld    6,0(5)
 881     std   0,0(4)
 882     sldi   0,7, 32
 883     srdi   8,6, 64-32
 884     or    0,0,8
 885     ld    7,8(5)
 886     std   0,8(4)
 887     sldi   0,6, 32
 888     srdi   8,7, 64-32
 889     or    0,0,8
 890     ld    6,16(5)
 891     std   0,16(4)
 892     sldi   0,7, 32
 893     srdi   8,6, 64-32
 894     or    0,0,8
 895     ld    7,24(5)
 896     std   0,24(4)
 897     addi  5,5,32
 898     addi  4,4,32
 899     bdnz+ L(du4_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
 900     .align 4
 901 L(du4_fini):
 902     /* calculate and store the final DW */
 903     sldi   0,6, 32
 904     srdi   8,7, 64-32
 905     or    0,0,8
 906     std   0,0(4)
 907     b     L(du_done)
 908
 909     .align 4
 910 L(du5_do):   /* src is 5 bytes past a DW boundary: realign with 40-bit shifts.  */
 911     bf      30,L(du5_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 912
 913     /* there are at least two DWs to copy */
 914     sldi     0,6, 40
 915     srdi     8,7, 64-40
 916     or      0,0,8            /* r0 = next realigned dst DW.  */
 917     ld      6,16(5)
 918     std     0,0(4)
 919     sldi     0,7, 40
 920     srdi     8,6, 64-40
 921     or      0,0,8
 922     ld      7,24(5)
 923     std     0,8(4)
 924     addi    4,4,16
 925     addi    5,5,32
 926     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
 927     bf      31,L(du5_loop)   /* 1 bit of (DW count - 1) is clear.  */
 928     /* there is a third DW to copy */
 929     sldi     0,6, 40
 930     srdi     8,7, 64-40
 931     or      0,0,8
 932     std     0,0(4)
 933     mr      6,7
 934     ld      7,0(5)
 935     addi    5,5,8
 936     addi    4,4,8
 937     beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
 938     b       L(du5_loop)
 939     .align 4
 940 L(du5_1dw):
 941     sldi     0,6, 40
 942     srdi     8,7, 64-40
 943     addi    5,5,16
 944     or      0,0,8
 945     bf      31,L(du5_loop)   /* 1 bit of (DW count - 1) is clear.  */
 946     mr      6,7
 947     ld      7,0(5)
 948     addi    5,5,8
 949     std     0,0(4)
 950     addi    4,4,8
 951     .align 4
 952 /* copy 32 bytes at a time */
 953 L(du5_loop):
 954     sldi   0,6, 40
 955     srdi   8,7, 64-40
 956     or    0,0,8
 957     ld    6,0(5)
 958     std   0,0(4)
 959     sldi   0,7, 40
 960     srdi   8,6, 64-40
 961     or    0,0,8
 962     ld    7,8(5)
 963     std   0,8(4)
 964     sldi   0,6, 40
 965     srdi   8,7, 64-40
 966     or    0,0,8
 967     ld    6,16(5)
 968     std   0,16(4)
 969     sldi   0,7, 40
 970     srdi   8,6, 64-40
 971     or    0,0,8
 972     ld    7,24(5)
 973     std   0,24(4)
 974     addi  5,5,32
 975     addi  4,4,32
 976     bdnz+ L(du5_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
 977     .align 4
 978 L(du5_fini):
 979     /* calculate and store the final DW */
 980     sldi   0,6, 40
 981     srdi   8,7, 64-40
 982     or    0,0,8
 983     std   0,0(4)
 984     b     L(du_done)
 985
 986     .align 4
 987 L(du6_do):   /* src is 6 bytes past a DW boundary: realign with 48-bit shifts.  */
 988     bf      30,L(du6_1dw)    /* 2 bit of (DW count - 1) is clear.  */
 989
 990     /* there are at least two DWs to copy */
 991     sldi     0,6, 48
 992     srdi     8,7, 64-48
 993     or      0,0,8            /* r0 = next realigned dst DW.  */
 994     ld      6,16(5)
 995     std     0,0(4)
 996     sldi     0,7, 48
 997     srdi     8,6, 64-48
 998     or      0,0,8
 999     ld      7,24(5)
1000     std     0,8(4)
1001     addi    4,4,16
1002     addi    5,5,32
1003     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
1004     bf      31,L(du6_loop)   /* 1 bit of (DW count - 1) is clear.  */
1005     /* there is a third DW to copy */
1006     sldi     0,6, 48
1007     srdi     8,7, 64-48
1008     or      0,0,8
1009     std     0,0(4)
1010     mr      6,7
1011     ld      7,0(5)
1012     addi    5,5,8
1013     addi    4,4,8
1014     beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
1015     b       L(du6_loop)
1016     .align 4
1017 L(du6_1dw):
1018     sldi     0,6, 48
1019     srdi     8,7, 64-48
1020     addi    5,5,16
1021     or      0,0,8
1022     bf      31,L(du6_loop)   /* 1 bit of (DW count - 1) is clear.  */
1023     mr      6,7
1024     ld      7,0(5)
1025     addi    5,5,8
1026     std     0,0(4)
1027     addi    4,4,8
1028     .align 4
1029 /* copy 32 bytes at a time */
1030 L(du6_loop):
1031     sldi   0,6, 48
1032     srdi   8,7, 64-48
1033     or    0,0,8
1034     ld    6,0(5)
1035     std   0,0(4)
1036     sldi   0,7, 48
1037     srdi   8,6, 64-48
1038     or    0,0,8
1039     ld    7,8(5)
1040     std   0,8(4)
1041     sldi   0,6, 48
1042     srdi   8,7, 64-48
1043     or    0,0,8
1044     ld    6,16(5)
1045     std   0,16(4)
1046     sldi   0,7, 48
1047     srdi   8,6, 64-48
1048     or    0,0,8
1049     ld    7,24(5)
1050     std   0,24(4)
1051     addi  5,5,32
1052     addi  4,4,32
1053     bdnz+ L(du6_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
1054     .align 4
1055 L(du6_fini):
1056     /* calculate and store the final DW */
1057     sldi   0,6, 48
1058     srdi   8,7, 64-48
1059     or    0,0,8
1060     std   0,0(4)
1061     b     L(du_done)
1062
1063     .align 4
1064 L(du7_do):   /* src is 7 bytes past a DW boundary: realign with 56-bit shifts.  */
1065     bf      30,L(du7_1dw)    /* 2 bit of (DW count - 1) is clear.  */
1066
1067     /* there are at least two DWs to copy */
1068     sldi     0,6, 56
1069     srdi     8,7, 64-56
1070     or      0,0,8            /* r0 = next realigned dst DW.  */
1071     ld      6,16(5)
1072     std     0,0(4)
1073     sldi     0,7, 56
1074     srdi     8,6, 64-56
1075     or      0,0,8
1076     ld      7,24(5)
1077     std     0,8(4)
1078     addi    4,4,16
1079     addi    5,5,32
1080     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
1081     bf      31,L(du7_loop)   /* 1 bit of (DW count - 1) is clear.  */
1082     /* there is a third DW to copy */
1083     sldi     0,6, 56
1084     srdi     8,7, 64-56
1085     or      0,0,8
1086     std     0,0(4)
1087     mr      6,7
1088     ld      7,0(5)
1089     addi    5,5,8
1090     addi    4,4,8
1091     beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
1092     b       L(du7_loop)
1093     .align 4
1094 L(du7_1dw):
1095     sldi     0,6, 56
1096     srdi     8,7, 64-56
1097     addi    5,5,16
1098     or      0,0,8
1099     bf      31,L(du7_loop)   /* 1 bit of (DW count - 1) is clear.  */
1100     mr      6,7
1101     ld      7,0(5)
1102     addi    5,5,8
1103     std     0,0(4)
1104     addi    4,4,8
1105     .align 4
1106 /* copy 32 bytes at a time */
1107 L(du7_loop):
1108     sldi   0,6, 56
1109     srdi   8,7, 64-56
1110     or    0,0,8
1111     ld    6,0(5)
1112     std   0,0(4)
1113     sldi   0,7, 56
1114     srdi   8,6, 64-56
1115     or    0,0,8
1116     ld    7,8(5)
1117     std   0,8(4)
1118     sldi   0,6, 56
1119     srdi   8,7, 64-56
1120     or    0,0,8
1121     ld    6,16(5)
1122     std   0,16(4)
1123     sldi   0,7, 56
1124     srdi   8,6, 64-56
1125     or    0,0,8
1126     ld    7,24(5)
1127     std   0,24(4)
1128     addi  5,5,32
1129     addi  4,4,32
1130     bdnz+ L(du7_loop)        /* ctr = (DW count - 1) / 4, set at .L6.  */
1131     .align 4
1132 L(du7_fini):
1133     /* calculate and store the final DW */
1134     sldi   0,6, 56
1135     srdi   8,7, 64-56
1136     or    0,0,8
1137     std   0,0(4)
1138     b     L(du_done)
1139
1140     .align 4
1141 L(du_done):             /* Common exit of the unaligned-source variants.  */
1142     rldicr 0,31,0,60    /* r0 = len with the low 3 bits cleared.  */
1143     mtcrf 0x01,31       /* cr7 = low 4 bits of len, for the tail tests below.  */
1144     beq   cr1,0f        /* If the tail is 0 bytes we are done!  */
1145
1146     add   3,3,0         /* r3 = dst past the bytes stored as doublewords.  */
1147     add   12,12,0       /* r12 = src past the bytes read as doublewords.  */
1148 /*  At this point we have a tail of 0-7 bytes and we know that the
1149     destination is double word aligned.  */
1150 4:  bf    29,2f         /* 4 bit of len clear: no word to move.  */
1151     lwz   6,0(12)
1152     addi  12,12,4
1153     stw   6,0(3)
1154     addi  3,3,4
1155 2:  bf    30,1f         /* 2 bit of len clear: no halfword to move.  */
1156     lhz   6,0(12)
1157     addi  12,12,2
1158     sth   6,0(3)
1159     addi  3,3,2
1160 1:  bf    31,0f         /* 1 bit of len clear: no byte to move.  */
1161     lbz   6,0(12)
1162     stb   6,0(3)
1163 0:
1164   /* Return original dst pointer.  */
1165     ld 31,-8(1)         /* Restore r31 from the slot written at entry.  */
1166     ld 3,-16(1)
1167     blr
1168 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
1169 libc_hidden_builtin_def (memcpy)