sysdeps/powerpc/powerpc32/power6/memcpy.S
1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
22 Returns 'dst'.
23
24 Memcpy handles short copies (< 32 bytes) using binary move blocks
25 (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
28
29 Longer moves (>= 32-bytes) justify the effort to get at least the
30 destination word (4-byte) aligned. Further optimization is
31 possible when both source and destination are word aligned.
32 Each case has an optimized unrolled loop. */
33
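/* For orientation only: the flow implemented below corresponds roughly to
   the following C sketch.  The identifiers are illustrative and are not
   part of this file; the real code keeps everything in registers.

       void *memcpy (void *dst, const void *src, size_t len)
       {
         if (len < 32)
           copy_short (dst, src, len);              // branchy lwz/stw, no loops
         else
           {
             size_t pre = (-(uintptr_t) dst) & 3;   // 0-3 bytes to word-align dst
             copy_bytes (dst, src, pre);
             if (((uintptr_t) src + pre) & 3)
               copy_src_unaligned ((char *) dst + pre,
                                   (const char *) src + pre, len - pre);
             else
               copy_word_aligned ((char *) dst + pre,
                                  (const char *) src + pre, len - pre);
           }
         return dst;
       }  */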
34 .machine power6
35 EALIGN (memcpy, 5, 0)
36 CALL_MCOUNT
37
38 stwu 1,-32(1)
39 cfi_adjust_cfa_offset(32)
40 cmplwi cr1,5,31 /* check for short move. */
41 neg 0,3
42 cmplwi cr1,5,31
43 clrlwi 10,4,30 /* check alignment of src. */
44 andi. 11,3,3 /* check alignment of dst. */
45 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
46 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */
47 cmplw cr6,10,11
48 stw 31,24(1)
49 cfi_offset(31,(24-32))
50 stw 30,20(1)
51 cfi_offset(30,(20-32))
52 mr 30,3
53 beq .L0
54 mtcrf 0x01,0
55 subf 31,0,5 /* Length after alignment. */
56 add 12,4,0 /* Compute src addr after alignment. */
57 /* Move 0-3 bytes as needed to get the destination word aligned. */
58 1: bf 31,2f
59 lbz 6,0(4)
60 bf 30,3f
61 lhz 7,1(4)
62 stb 6,0(3)
63 sth 7,1(3)
64 addi 3,3,3
65 b 0f
66 3:
67 stb 6,0(3)
68 addi 3,3,1
69 b 0f
70 2: bf 30,0f
71 lhz 6,0(4)
72 sth 6,0(3)
73 addi 3,3,2
74 0:
75 clrlwi 10,12,30 /* check alignment of src again. */
76 srwi 9,31,2 /* Number of full words remaining. */
77 bne- cr6,L(wdu) /* If source is not word aligned. */
78 clrlwi 11,31,30 /* calculate the number of tail bytes */
79 b L(word_aligned)
80 /* Copy words from source to destination, assuming the destination is
81 aligned on a word boundary.
82
83 At this point we know there are at least 29 bytes left (32-3) to copy.
84 The next step is to determine if the source is also word aligned.
85 If not, branch to the unaligned move code at L(wdu), which uses
86 a load, shift, store strategy.
87
88 Otherwise source and destination are word aligned, and we can use
89 the optimized word copy loop. */
90 .align 4
91 .L0:
92 mr 31,5
93 mr 12,4
94 bne- cr6,L(wdu) /* If source is not word aligned. */
95 srwi 9,5,2 /* Number of full words remaining. */
96 clrlwi 11,5,30 /* calculate the number of tail bytes */
97
98 /* Move words where destination and source are word aligned.
99 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
100 If the copy is not an exact multiple of 16 bytes, 1-3
101 words are copied as needed to set up the main loop. After
102 the main loop exits there may be a tail of 1-3 bytes. These bytes are
103 copied a halfword/byte at a time as needed to preserve alignment. */
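/* A minimal C sketch of this word-aligned path, assuming both pointers are
   4-byte aligned on entry (illustrative only; the code below drives the
   same steps from CR bits and the CTR):

       uint32_t *d = dst4;  const uint32_t *s = src4;
       size_t words = len >> 2, tail = len & 3;
       if (words & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (words & 1) *d++ = *s++;
       for (size_t n = len >> 4; n > 0; n--)        // 4 words == 16 bytes per pass
         { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d += 4; s += 4; }
       unsigned char *db = (unsigned char *) d;
       const unsigned char *sb = (const unsigned char *) s;
       if (tail & 2) { *(uint16_t *) db = *(const uint16_t *) sb; db += 2; sb += 2; }
       if (tail & 1) *db = *sb;  */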
104 L(word_aligned):
105 mtcrf 0x01,9
106 srwi 8,31,4 /* calculate the 16 byte loop count */
107 cmplwi cr1,9,4
108 cmplwi cr6,11,0
109 mr 11,12
110
111 bf 30,1f
112 lwz 6,0(12)
113 lwz 7,4(12)
114 addi 11,12,8
115 mtctr 8
116 stw 6,0(3)
117 stw 7,4(3)
118 addi 10,3,8
119 bf 31,4f
120 lwz 0,8(12)
121 stw 0,8(3)
122 blt cr1,3f
123 addi 11,12,12
124 addi 10,3,12
125 b 4f
126 .align 4
127 1:
128 mr 10,3
129 mtctr 8
130 bf 31,4f
131 lwz 6,0(12)
132 addi 11,12,4
133 stw 6,0(3)
134 addi 10,3,4
135
136 .align 4
137 4:
138 lwz 6,0(11)
139 lwz 7,4(11)
140 lwz 8,8(11)
141 lwz 0,12(11)
142 stw 6,0(10)
143 stw 7,4(10)
144 stw 8,8(10)
145 stw 0,12(10)
146 addi 11,11,16
147 addi 10,10,16
148 bdnz 4b
149 3:
150 clrrwi 0,31,2
151 mtcrf 0x01,31
152 beq cr6,0f
153 .L9:
154 add 3,3,0
155 add 12,12,0
156
157 /* At this point we have a tail of 0-3 bytes and we know that the
158 destination is word aligned. */
159 2: bf 30,1f
160 lhz 6,0(12)
161 addi 12,12,2
162 sth 6,0(3)
163 addi 3,3,2
164 1: bf 31,0f
165 lbz 6,0(12)
166 stb 6,0(3)
167 0:
168 /* Return original dst pointer. */
169 mr 3,30
170 lwz 30,20(1)
171 lwz 31,24(1)
172 addi 1,1,32
173 blr
174
175 /* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
176 bytes. Each case is handled without loops, using binary (1,2,4,8)
177 tests.
178
179 In the short (0-8 byte) case no attempt is made to force alignment
180 of either source or destination. The hardware will handle the
181 unaligned load/stores with small delays for crossing 32-, 128-, and
182 and 4096-byte boundaries. Since these short moves are unlikely to be
183 unaligned or cross these boundaries, the overhead to force
184 alignment is not justified.
185
186 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
187 boundaries. Since only loads are sensitive to the 32-/128-byte
188 boundaries, it is more important to align the source than the
189 destination. If the source is not already word aligned, we first
190 move 1-3 bytes as needed. Since we are only word aligned we don't
191 use doubleword load/stores, to ensure that all loads are aligned.
192 While the destination and stores may still be unaligned, this
193 is only an issue for page (4096-byte boundary) crossings, which
194 should be rare for these short moves. The hardware handles this
195 case automatically with a small (~20 cycle) delay. */
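/* The 0-8 byte case decomposes the length into its binary components and
   performs at most one move of each size, roughly as in this C sketch
   (illustrative only; like the code below it relies on the hardware
   tolerating the unaligned word/halfword accesses, so it is not portable C):

       unsigned char *d = dst;  const unsigned char *s = src;
       if (len == 8)                                // exactly 8: two word moves
         {
           ((uint32_t *) d)[0] = ((const uint32_t *) s)[0];
           ((uint32_t *) d)[1] = ((const uint32_t *) s)[1];
         }
       else
         {
           if (len & 4) { *(uint32_t *) d = *(const uint32_t *) s; d += 4; s += 4; }
           if (len & 2) { *(uint16_t *) d = *(const uint16_t *) s; d += 2; s += 2; }
           if (len & 1) *d = *s;
         }

   The 9-31 byte case below first moves 1-3 bytes to word-align the source,
   then selects word-sized chunks in the same binary fashion.  */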
196 .align 4
197
198 cfi_same_value (31)
199 cfi_same_value (30)
200 L(word_unaligned_short):
201 mtcrf 0x01,5
202 cmplwi cr6,5,8
203 neg 8,4
204 clrrwi 9,4,2
205 andi. 0,8,3
206 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
207 /* At least 9 bytes left. Get the source word aligned. */
208 cmplwi cr1,5,16
209 mr 12,4
210 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
211 mr 11,3
212 mr 10,5
213 cmplwi cr6,0,2
214 beq L(wus_tail) /* If the source is already word aligned skip this. */
215 /* Copy 1-3 bytes to get source address word aligned. */
216 lwz 6,0(9)
217 subf 10,0,5
218 add 12,4,0
219 blt cr6,5f
220 srwi 7,6,16
221 bgt cr6,3f
222 #ifdef __LITTLE_ENDIAN__
223 sth 7,0(3)
224 #else
225 sth 6,0(3)
226 #endif
227 b 7f
228 .align 4
229 3:
230 #ifdef __LITTLE_ENDIAN__
231 rotlwi 6,6,24
232 stb 6,0(3)
233 sth 7,1(3)
234 #else
235 stb 7,0(3)
236 sth 6,1(3)
237 #endif
238 b 7f
239 .align 4
240 5:
241 #ifdef __LITTLE_ENDIAN__
242 rotlwi 6,6,8
243 #endif
244 stb 6,0(3)
245 7:
246 cmplwi cr1,10,16
247 add 11,3,0
248 mtcrf 0x01,10
249 .align 4
250 L(wus_tail):
251 /* At least 6 bytes left and the source is word aligned. This allows
252 some speculative loads up front. */
253 /* We need to special case the fall-through because the biggest delays
254 are due to address computation not being ready in time for the
255 AGEN. */
256 lwz 6,0(12)
257 lwz 7,4(12)
258 blt cr1,L(wus_tail8)
259 cmplwi cr0,10,24
260 L(wus_tail16): /* Move 16 bytes. */
261 stw 6,0(11)
262 stw 7,4(11)
263 lwz 6,8(12)
264 lwz 7,12(12)
265 stw 6,8(11)
266 stw 7,12(11)
267 /* Move 8 bytes more. */
268 bf 28,L(wus_tail16p8)
269 cmplwi cr1,10,28
270 lwz 6,16(12)
271 lwz 7,20(12)
272 stw 6,16(11)
273 stw 7,20(11)
274 /* Move 4 bytes more. */
275 bf 29,L(wus_tail16p4)
276 lwz 6,24(12)
277 stw 6,24(11)
278 addi 12,12,28
279 addi 11,11,28
280 bgt cr1,L(wus_tail2)
281 /* exactly 28 bytes. Return original dst pointer and exit. */
282 addi 1,1,32
283 blr
284 .align 4
285 L(wus_tail16p8): /* less than 8 bytes left. */
286 beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */
287 cmplwi cr1,10,20
288 bf 29,L(wus_tail16p2)
289 /* Move 4 bytes more. */
290 lwz 6,16(12)
291 stw 6,16(11)
292 addi 12,12,20
293 addi 11,11,20
294 bgt cr1,L(wus_tail2)
295 /* exactly 20 bytes. Return original dst pointer and exit. */
296 addi 1,1,32
297 blr
298 .align 4
299 L(wus_tail16p4): /* less than 4 bytes left. */
300 addi 12,12,24
301 addi 11,11,24
302 bgt cr0,L(wus_tail2)
303 /* exactly 24 bytes. Return original dst pointer and exit. */
304 addi 1,1,32
305 blr
306 .align 4
307 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
308 addi 12,12,16
309 addi 11,11,16
310 b L(wus_tail2)
311
312 .align 4
313 L(wus_tail8): /* Move 8 bytes. */
314 /* r6, r7 already loaded speculatively. */
315 cmplwi cr1,10,8
316 cmplwi cr0,10,12
317 bf 28,L(wus_tail4)
318 stw 6,0(11)
319 stw 7,4(11)
320 /* Move 4 bytes more. */
321 bf 29,L(wus_tail8p4)
322 lwz 6,8(12)
323 stw 6,8(11)
324 addi 12,12,12
325 addi 11,11,12
326 bgt cr0,L(wus_tail2)
327 /* exactly 12 bytes. Return original dst pointer and exit. */
328 addi 1,1,32
329 blr
330 .align 4
331 L(wus_tail8p4): /* less than 4 bytes left. */
332 addi 12,12,8
333 addi 11,11,8
334 bgt cr1,L(wus_tail2)
335 /* exactly 8 bytes. Return original dst pointer and exit. */
336 addi 1,1,32
337 blr
338
339 .align 4
340 L(wus_tail4): /* Move 4 bytes. */
341 /* r6 already loaded speculatively.  If we are here we know there are
342 more than 4 bytes left, so there is no need to test. */
343 addi 12,12,4
344 stw 6,0(11)
345 addi 11,11,4
346 L(wus_tail2): /* Move 2-3 bytes. */
347 bf 30,L(wus_tail1)
348 lhz 6,0(12)
349 sth 6,0(11)
350 bf 31,L(wus_tailX)
351 lbz 7,2(12)
352 stb 7,2(11)
353 addi 1,1,32
354 blr
355 L(wus_tail1): /* Move 1 byte. */
356 bf 31,L(wus_tailX)
357 lbz 6,0(12)
358 stb 6,0(11)
359 L(wus_tailX):
360 /* Return original dst pointer. */
361 addi 1,1,32
362 blr
363
364 /* Special case to copy 0-8 bytes. */
365 .align 4
366 L(wus_8):
367 lwz 6,0(4)
368 lwz 7,4(4)
369 stw 6,0(3)
370 stw 7,4(3)
371 /* Return original dst pointer. */
372 addi 1,1,32
373 blr
374 .align 4
375 L(wus_4):
376 bf 29,L(wus_2)
377 lwz 6,0(4)
378 stw 6,0(3)
379 bf 30,L(wus_5)
380 lhz 7,4(4)
381 sth 7,4(3)
382 bf 31,L(wus_0)
383 lbz 8,6(4)
384 stb 8,6(3)
385 addi 1,1,32
386 blr
387 .align 4
388 L(wus_5):
389 bf 31,L(wus_0)
390 lbz 6,4(4)
391 stb 6,4(3)
392 /* Return original dst pointer. */
393 addi 1,1,32
394 blr
395 .align 4
396 L(wus_2): /* Move 2-3 bytes. */
397 bf 30,L(wus_1)
398 lhz 6,0(4)
399 sth 6,0(3)
400 bf 31,L(wus_0)
401 lbz 7,2(4)
402 stb 7,2(3)
403 addi 1,1,32
404 blr
405 .align 4
406 L(wus_1): /* Move 1 byte. */
407 bf 31,L(wus_0)
408 lbz 6,0(4)
409 stb 6,0(3)
410 .align 3
411 L(wus_0):
412 /* Return original dst pointer. */
413 addi 1,1,32
414 blr
415
416 .align 4
417 cfi_offset(31,(24-32))
418 cfi_offset(30,(20-32))
419 L(wdu):
420
421 /* Copy words where the destination is aligned but the source is
422 not.  For power4, power5 and power6 machines there is a penalty for
423 unaligned loads (src) that cross 32-byte, cacheline, or page
424 boundaries. So we want to use simple (unaligned) loads where
425 possible but avoid them where we know the load would span a 32-byte
426 boundary.
427
428 At this point we know we have at least 29 (32-3) bytes to copy, the
429 src is unaligned, and we may cross at least one 32-byte
430 boundary. Also we have the following register values:
431 r3 == adjusted dst, word aligned
432 r4 == unadjusted src
433 r5 == unadjusted len
434 r9 == adjusted Word length
435 r10 == src alignment (1-3)
436 r12 == adjusted src, not aligned
437 r31 == adjusted len
438
439 First we need to copy words up to but not crossing the next 32-byte
440 boundary. Then perform aligned loads just before and just after
441 the boundary and use shifts and or to generate the next aligned
442 word for dst. If more than 32 bytes remain we copy (unaligned src)
443 the next 7 words and repeat the loop until less than 32-bytes
444 remain.
445
446 Then if more than 4 bytes remain we again use aligned loads,
447 shifts and or to generate the next dst word. We then process the
448 remaining words using unaligned loads as needed. Finally we check
449 if there are any (1-3) bytes remaining and use
450 halfword and/or byte load/stores to complete the copy.
451 */
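/* The aligned-load plus shift/or step used at the 32-byte boundary can be
   pictured as in this C sketch (big-endian shown; the little-endian build
   swaps the shift directions).  'a' and 'sa' are illustrative names, not
   registers used below:

       unsigned int a  = (uintptr_t) src & 3;       // misalignment, 1-3 here
       unsigned int sa = 8 * a;                     // slwi 10,10,3
       const uint32_t *s4 = (const uint32_t *) ((uintptr_t) src - a);
       uint32_t lo = s4[0];                         // aligned load before boundary
       uint32_t hi = s4[1];                         // aligned load after boundary
       *dst4 = (lo << sa) | (hi >> (32 - sa));      // slw/srw/or, one aligned store
   */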
452 mr 4,12 /* restore unaligned adjusted src ptr */
453 clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */
454 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
455 cmplwi cr5,0,16
456 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */
457
458 mtcrf 0x01,8
459 cmplwi cr1,10,16
460 subfic 9,10,32 /* number of bits to shift 2nd word right */
461 /* This test is reversed because the timing to compare the bytes to the
462 next 32-byte boundary could not be met, so we compare the bytes from the
463 previous 32-byte boundary and invert the test.  E.g. src & 31 == 20 means
only 12 bytes remain before the next boundary, so the shorter
L(wdu_h32_8) path is taken.  */
464 bge cr5,L(wdu_h32_8)
465 .align 4
466 lwz 6,0(4)
467 lwz 7,4(4)
468 addi 12,4,16 /* generate alternate pointers to avoid agen */
469 addi 11,3,16 /* timing issues downstream. */
470 stw 6,0(3)
471 stw 7,4(3)
472 subi 31,31,16
473 lwz 6,8(4)
474 lwz 7,12(4)
475 addi 4,4,16
476 stw 6,8(3)
477 stw 7,12(3)
478 addi 3,3,16
479 bf 28,L(wdu_h32_4)
480 lwz 6,0(12)
481 lwz 7,4(12)
482 subi 31,31,8
483 addi 4,4,8
484 stw 6,0(11)
485 stw 7,4(11)
486 addi 3,3,8
487 bf 29,L(wdu_h32_0)
488 lwz 6,8(12)
489 addi 4,4,4
490 subi 31,31,4
491 stw 6,8(11)
492 addi 3,3,4
493 b L(wdu_h32_0)
494 .align 4
495 L(wdu_h32_8):
496 bf 28,L(wdu_h32_4)
497 lwz 6,0(4)
498 lwz 7,4(4)
499 subi 31,31,8
500 bf 29,L(wdu_h32_8x)
501 stw 6,0(3)
502 stw 7,4(3)
503 lwz 6,8(4)
504 addi 4,4,12
505 subi 31,31,4
506 stw 6,8(3)
507 addi 3,3,12
508 b L(wdu_h32_0)
509 .align 4
510 L(wdu_h32_8x):
511 addi 4,4,8
512 stw 6,0(3)
513 stw 7,4(3)
514 addi 3,3,8
515 b L(wdu_h32_0)
516 .align 4
517 L(wdu_h32_4):
518 bf 29,L(wdu_h32_0)
519 lwz 6,0(4)
520 subi 31,31,4
521 addi 4,4,4
522 stw 6,0(3)
523 addi 3,3,4
524 .align 4
525 L(wdu_h32_0):
526 /* Set up for the word move that crosses the 32-byte boundary, and
527 possibly for the 32-byte move loop. */
528 clrrwi 12,4,2
529 cmplwi cr5,31,32
530 bge cr1,L(wdu2_32)
531 #if 0
532 b L(wdu1_32)
533 /*
534 cmplwi cr1,10,8
535 beq cr1,L(wdu1_32)
536 cmplwi cr1,10,16
537 beq cr1,L(wdu2_32)
538 cmplwi cr1,10,24
539 beq cr1,L(wdu3_32)
540 */
541 L(wdu_32):
542 lwz 6,0(12)
543 cmplwi cr6,31,4
544 srwi 8,31,5 /* calculate the 32 byte loop count */
545 slw 0,6,10
546 clrlwi 31,31,27 /* The remaining bytes, < 32. */
547 blt cr5,L(wdu_32tail)
548 mtctr 8
549 cmplwi cr6,31,4
550 .align 4
551 L(wdu_loop32):
552 /* copy 32 bytes at a time */
553 lwz 8,4(12)
554 addi 12,12,32
555 lwz 7,4(4)
556 srw 8,8,9
557 or 0,0,8
558 stw 0,0(3)
559 stw 7,4(3)
560 lwz 6,8(4)
561 lwz 7,12(4)
562 stw 6,8(3)
563 stw 7,12(3)
564 lwz 6,16(4)
565 lwz 7,20(4)
566 stw 6,16(3)
567 stw 7,20(3)
568 lwz 6,24(4)
569 lwz 7,28(4)
570 lwz 8,0(12)
571 addi 4,4,32
572 stw 6,24(3)
573 stw 7,28(3)
574 addi 3,3,32
575 slw 0,8,10
576 bdnz+ L(wdu_loop32)
577
578 L(wdu_32tail):
579 mtcrf 0x01,31
580 cmplwi cr5,31,16
581 blt cr6,L(wdu_4tail)
582 /* calculate and store the final word */
583 lwz 8,4(12)
584 srw 8,8,9
585 or 6,0,8
586 b L(wdu_32tailx)
587 #endif
588 .align 4
589 L(wdu1_32):
590 lwz 6,-1(4)
591 cmplwi cr6,31,4
592 srwi 8,31,5 /* calculate the 32 byte loop count */
593 #ifdef __LITTLE_ENDIAN__
594 srwi 6,6,8
595 #else
596 slwi 6,6,8
597 #endif
598 clrlwi 31,31,27 /* The remaining bytes, < 32. */
599 blt cr5,L(wdu1_32tail)
600 mtctr 8
601 cmplwi cr6,31,4
602
603 lwz 8,3(4)
604 lwz 7,4(4)
605 #ifdef __LITTLE_ENDIAN__
606 rldimi 6,8,24,32
607 #else
608 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
609 rlwimi 6,8,8,(32-8),31
610 #endif
611 b L(wdu1_loop32x)
612 .align 4
613 L(wdu1_loop32):
614 /* copy 32 bytes at a time */
615 lwz 8,3(4)
616 lwz 7,4(4)
617 stw 10,-8(3)
618 stw 11,-4(3)
619 #ifdef __LITTLE_ENDIAN__
620 rldimi 6,8,24,32
621 #else
622 /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
623 rlwimi 6,8,8,(32-8),31
624 #endif
625 L(wdu1_loop32x):
626 lwz 10,8(4)
627 lwz 11,12(4)
628 stw 6,0(3)
629 stw 7,4(3)
630 lwz 6,16(4)
631 lwz 7,20(4)
632 stw 10,8(3)
633 stw 11,12(3)
634 lwz 10,24(4)
635 lwz 11,28(4)
636 lwz 8,32-1(4)
637 addi 4,4,32
638 stw 6,16(3)
639 stw 7,20(3)
640 addi 3,3,32
641 #ifdef __LITTLE_ENDIAN__
642 srwi 6,8,8
643 #else
644 slwi 6,8,8
645 #endif
646 bdnz+ L(wdu1_loop32)
647 stw 10,-8(3)
648 stw 11,-4(3)
649
650 L(wdu1_32tail):
651 mtcrf 0x01,31
652 cmplwi cr5,31,16
653 blt cr6,L(wdu_4tail)
654 /* calculate and store the final word */
655 lwz 8,3(4)
656 #ifdef __LITTLE_ENDIAN__
657 rldimi 6,8,24,32
658 #else
659 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
660 rlwimi 6,8,8,(32-8),31
661 #endif
662 b L(wdu_32tailx)
663
664 L(wdu2_32):
665 bgt cr1,L(wdu3_32)
666 lwz 6,-2(4)
667 cmplwi cr6,31,4
668 srwi 8,31,5 /* calculate the 32 byte loop count */
669 #ifdef __LITTLE_ENDIAN__
670 srwi 6,6,16
671 #else
672 slwi 6,6,16
673 #endif
674 clrlwi 31,31,27 /* The remaining bytes, < 32. */
675 blt cr5,L(wdu2_32tail)
676 mtctr 8
677 cmplwi cr6,31,4
678
679 lwz 8,2(4)
680 lwz 7,4(4)
681 #ifdef __LITTLE_ENDIAN__
682 rldimi 6,8,16,32
683 #else
684 rlwimi 6,8,16,(32-16),31
685 #endif
686 b L(wdu2_loop32x)
687 .align 4
688 L(wdu2_loop32):
689 /* copy 32 bytes at a time */
690 lwz 8,2(4)
691 lwz 7,4(4)
692 stw 10,-8(3)
693 stw 11,-4(3)
694 #ifdef __LITTLE_ENDIAN__
695 rldimi 6,8,16,32
696 #else
697 rlwimi 6,8,16,(32-16),31
698 #endif
699 L(wdu2_loop32x):
700 lwz 10,8(4)
701 lwz 11,12(4)
702 stw 6,0(3)
703 stw 7,4(3)
704 lwz 6,16(4)
705 lwz 7,20(4)
706 stw 10,8(3)
707 stw 11,12(3)
708 lwz 10,24(4)
709 lwz 11,28(4)
710 /* lwz 8,0(12) */
711 lwz 8,32-2(4)
712 addi 4,4,32
713 stw 6,16(3)
714 stw 7,20(3)
715 addi 3,3,32
716 #ifdef __LITTLE_ENDIAN__
717 srwi 6,8,16
718 #else
719 slwi 6,8,16
720 #endif
721 bdnz+ L(wdu2_loop32)
722 stw 10,-8(3)
723 stw 11,-4(3)
724
725 L(wdu2_32tail):
726 mtcrf 0x01,31
727 cmplwi cr5,31,16
728 blt cr6,L(wdu_4tail)
729 /* calculate and store the final word */
730 lwz 8,2(4)
731 #ifdef __LITTLE_ENDIAN__
732 rldimi 6,8,16,32
733 #else
734 rlwimi 6,8,16,(32-16),31
735 #endif
736 b L(wdu_32tailx)
737
738 L(wdu3_32):
739 /* lwz 6,0(12) */
740 lwz 6,-3(4)
741 cmplwi cr6,31,4
742 srwi 8,31,5 /* calculate the 32 byte loop count */
743 #ifdef __LITTLE_ENDIAN__
744 srwi 6,6,24
745 #else
746 slwi 6,6,24
747 #endif
748 clrlwi 31,31,27 /* The remaining bytes, < 32. */
749 blt cr5,L(wdu3_32tail)
750 mtctr 8
751 cmplwi cr6,31,4
752
753 lwz 8,1(4)
754 lwz 7,4(4)
755 #ifdef __LITTLE_ENDIAN__
756 rldimi 6,8,8,32
757 #else
758 rlwimi 6,8,24,(32-24),31
759 #endif
760 b L(wdu3_loop32x)
761 .align 4
762 L(wdu3_loop32):
763 /* copy 32 bytes at a time */
764 lwz 8,1(4)
765 lwz 7,4(4)
766 stw 10,-8(3)
767 stw 11,-4(3)
768 #ifdef __LITTLE_ENDIAN__
769 rldimi 6,8,8,32
770 #else
771 rlwimi 6,8,24,(32-24),31
772 #endif
773 L(wdu3_loop32x):
774 lwz 10,8(4)
775 lwz 11,12(4)
776 stw 6,0(3)
777 stw 7,4(3)
778 lwz 6,16(4)
779 lwz 7,20(4)
780 stw 10,8(3)
781 stw 11,12(3)
782 lwz 10,24(4)
783 lwz 11,28(4)
784 lwz 8,32-3(4)
785 addi 4,4,32
786 stw 6,16(3)
787 stw 7,20(3)
788 addi 3,3,32
789 #ifdef __LITTLE_ENDIAN__
790 srwi 6,8,24
791 #else
792 slwi 6,8,24
793 #endif
794 bdnz+ L(wdu3_loop32)
795 stw 10,-8(3)
796 stw 11,-4(3)
797
798 L(wdu3_32tail):
799 mtcrf 0x01,31
800 cmplwi cr5,31,16
801 blt cr6,L(wdu_4tail)
802 /* calculate and store the final word */
803 lwz 8,1(4)
804 #ifdef __LITTLE_ENDIAN__
805 rldimi 6,8,8,32
806 #else
807 rlwimi 6,8,24,(32-24),31
808 #endif
809 b L(wdu_32tailx)
810 .align 4
811 L(wdu_32tailx):
812 blt cr5,L(wdu_t32_8)
813 lwz 7,4(4)
814 addi 12,4,16 /* generate alternate pointers to avoid agen */
815 addi 11,3,16 /* timing issues downstream. */
816 stw 6,0(3)
817 stw 7,4(3)
818 subi 31,31,16
819 lwz 6,8(4)
820 lwz 7,12(4)
821 addi 4,4,16
822 stw 6,8(3)
823 stw 7,12(3)
824 addi 3,3,16
825 bf 28,L(wdu_t32_4x)
826 lwz 6,0(12)
827 lwz 7,4(12)
828 addi 4,4,8
829 subi 31,31,8
830 stw 6,0(11)
831 stw 7,4(11)
832 addi 3,3,8
833 bf 29,L(wdu_t32_0)
834 lwz 6,8(12)
835 addi 4,4,4
836 subi 31,31,4
837 stw 6,8(11)
838 addi 3,3,4
839 b L(wdu_t32_0)
840 .align 4
841 L(wdu_t32_4x):
842 bf 29,L(wdu_t32_0)
843 lwz 6,0(4)
844 addi 4,4,4
845 subi 31,31,4
846 stw 6,0(3)
847 addi 3,3,4
848 b L(wdu_t32_0)
849 .align 4
850 L(wdu_t32_8):
851 bf 28,L(wdu_t32_4)
852 lwz 7,4(4)
853 subi 31,31,8
854 bf 29,L(wdu_t32_8x)
855 stw 6,0(3)
856 stw 7,4(3)
857 lwz 6,8(4)
858 subi 31,31,4
859 addi 4,4,12
860 stw 6,8(3)
861 addi 3,3,12
862 b L(wdu_t32_0)
863 .align 4
864 L(wdu_t32_8x):
865 addi 4,4,8
866 stw 6,0(3)
867 stw 7,4(3)
868 addi 3,3,8
869 b L(wdu_t32_0)
870 .align 4
871 L(wdu_t32_4):
872 subi 31,31,4
873 stw 6,0(3)
874 addi 4,4,4
875 addi 3,3,4
876 .align 4
877 L(wdu_t32_0):
878 L(wdu_4tail):
879 cmplwi cr6,31,0
880 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
881 bf 30,L(wdus_3)
882 lhz 7,0(4)
883 sth 7,0(3)
884 bf 31,L(wdus_0)
885 lbz 8,2(4)
886 stb 8,2(3)
887 mr 3,30
888 lwz 30,20(1)
889 lwz 31,24(1)
890 addi 1,1,32
891 blr
892 .align 4
893 L(wdus_3):
894 bf 31,L(wus_0)
895 lbz 6,0(4)
896 stb 6,0(3)
897 .align 4
898 L(wdus_0):
899 /* Return original dst pointer. */
900 mr 3,30
901 lwz 30,20(1)
902 lwz 31,24(1)
903 addi 1,1,32
904 blr
905 END (memcpy)
906
907 libc_hidden_builtin_def (memcpy)