sysdeps/powerpc/powerpc64/power6/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.
  35
  36    For POWER6 unaligned loads will take a 20+ cycle hiccup for any
  37    L1 cache miss that crosses a 32- or 128-byte boundary.  Store
  38    is more forgiving and does not take a hiccup until page or
  39    segment boundaries.  So we require doubleword alignment for
  40    the source but may take a risk and only require word alignment
  41    for the destination.  */
  42
  43 #ifndef MEMCPY
  44 # define MEMCPY memcpy
  45 #endif
  46         .machine        "power6"
  47 ENTRY_TOCLESS (MEMCPY, 7)
  48         CALL_MCOUNT 3
  49
  50     cmpldi cr1,5,31
  51     neg   0,3
  52     std   3,-16(1)
  53     std   31,-8(1)
  54     andi. 11,3,7        /* check alignment of dst.  */
  55     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  56     clrldi 10,4,61      /* check alignment of src.  */
  57     cmpldi cr6,5,8
  58     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  59     mtcrf 0x01,0
  60     cmpld cr6,10,11
  61     srdi  9,5,3         /* Number of full double words remaining.  */
  62     beq   .L0
  63
  64     subf  5,0,5
  65   /* Move 0-7 bytes as needed to get the destination doubleword aligned.
  66      Duplicate some code to maximize fall-through and minimize agen delays.  */
  67 1:  bf    31,2f
  68     lbz   6,0(4)
  69     stb   6,0(3)
  70     bf    30,5f
  71     lhz   6,1(4)
  72     sth   6,1(3)
  73     bf    29,0f
  74     lwz   6,3(4)
  75     stw   6,3(3)
  76     b     0f
  77 5:
  78     bf    29,0f
  79     lwz   6,1(4)
  80     stw   6,1(3)
  81     b     0f
  82
  83 2:  bf    30,4f
  84     lhz   6,0(4)
  85     sth   6,0(3)
  86     bf    29,0f
  87     lwz   6,2(4)
  88     stw   6,2(3)
  89     b     0f
  90
  91 4:  bf    29,0f
  92     lwz   6,0(4)
  93     stw   6,0(3)
  94 0:
  95 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
  96     add   4,4,0
  97     add   3,3,0
  98
  99     clrldi 10,4,61      /* check alignment of src again.  */
 100     srdi  9,5,3 /* Number of full double words remaining.  */
 101
 102   /* Copy doublewords from source to destination, assuming the
 103      destination is aligned on a doubleword boundary.
 104
 105      At this point we know there are at least 25 bytes left (32-7) to copy.
 106      The next step is to determine if the source is also doubleword aligned.
 107      If not, branch to the unaligned move code at .L6, which uses
 108      a load, shift, store strategy.
 109
 110      Otherwise source and destination are doubleword aligned, and we can
 111      use the optimized doubleword copy loop.  */
 112     .align  4
 113 .L0:
 114     clrldi  11,5,61
 115     andi.   0,5,0x78
 116     srdi    12,5,7      /* Number of 128-byte blocks to move.  */
 117     cmpldi  cr1,11,0    /* If the tail is 0 bytes  */
 118     bne-    cr6,.L6     /* If source is not DW aligned.  */
 119
 120   /* Move doublewords where destination and source are DW aligned.
 121      Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
 122      If the copy is not an exact multiple of 128 bytes, 1-15
 123      doublewords are copied as needed to set up the main loop.  After
 124      the main loop exits there may be a tail of 1-7 bytes. These bytes
 125      are copied a word/halfword/byte at a time as needed to preserve
 126      alignment.
 127
 128      For POWER6 the L1 is store-through and the L2 is store-in.  The
 129      L2 is clocked at half CPU clock so we can store 16 bytes every
 130      other cycle.  POWER6 also has a load/store bypass so we can do
 131      load, load, store, store every 2 cycles.
 132
 133      The following code is sensitive to cache line alignment.  Do not
 134      make any changes without first making sure they don't result in
 135      splitting ld/std pairs across a cache line.  */
 136
 137     mtcrf 0x02,5
 138     mtcrf 0x01,5
 139     cmpldi  cr5,12,1
 140     beq   L(das_loop)
 141
 142     bf    25,4f
 143     .align  3
 144     ld    6,0(4)
 145     ld    7,8(4)
 146     mr    11,4
 147     mr    10,3
 148     std   6,0(3)
 149     std   7,8(3)
 150     ld    6,16(4)
 151     ld    7,24(4)
 152     std   6,16(3)
 153     std   7,24(3)
 154     ld    6,0+32(4)
 155     ld    7,8+32(4)
 156     addi  4,4,64
 157     addi  3,3,64
 158     std   6,0+32(10)
 159     std   7,8+32(10)
 160     ld    6,16+32(11)
 161     ld    7,24+32(11)
 162     std   6,16+32(10)
 163     std   7,24+32(10)
 164 4:
 165     mr    10,3
 166     bf    26,2f
 167     ld    6,0(4)
 168     ld    7,8(4)
 169     mr    11,4
 170     nop
 171     std   6,0(3)
 172     std   7,8(3)
 173     ld    6,16(4)
 174     ld    7,24(4)
 175     addi  4,4,32
 176     std   6,16(3)
 177     std   7,24(3)
 178     addi  3,3,32
 179 6:
 180     nop
 181     bf    27,5f
 182     ld    6,0+32(11)
 183     ld    7,8+32(11)
 184     addi  4,4,16
 185     addi  3,3,16
 186     std   6,0+32(10)
 187     std   7,8+32(10)
 188     bf    28,L(das_loop_s)
 189     ld    0,16+32(11)
 190     addi  4,4,8
 191     addi  3,3,8
 192     std   0,16+32(10)
 193     blt   cr5,L(das_tail)
 194     b     L(das_loop)
 195     .align  3
 196 5:
 197     nop
 198     bf    28,L(das_loop_s)
 199     ld    6,32(11)
 200     addi  4,4,8
 201     addi  3,3,8
 202     std   6,32(10)
 203     blt   cr5,L(das_tail)
 204     b     L(das_loop)
 205     .align  3
 206 2:
 207     mr    11,4
 208     bf    27,1f
 209     ld    6,0(4)
 210     ld    7,8(4)
 211     addi  4,4,16
 212     addi  3,3,16
 213     std   6,0(10)
 214     std   7,8(10)
 215     bf    28,L(das_loop_s)
 216     ld    0,16(11)
 217     addi  4,11,24
 218     addi  3,10,24
 219     std   0,16(10)
 220     blt   cr5,L(das_tail)
 221     b     L(das_loop)
 222     .align  3
 223 1:
 224     nop
 225     bf    28,L(das_loop_s)
 226     ld    6,0(4)
 227     addi  4,4,8
 228     addi  3,3,8
 229     std   6,0(10)
 230 L(das_loop_s):
 231     nop
 232     blt   cr5,L(das_tail)
 233     .align  4
 234 L(das_loop):
 235     ld    6,0(4)
 236     ld    7,8(4)
 237     mr    10,3
 238     mr    11,4
 239     std   6,0(3)
 240     std   7,8(3)
 241     addi  12,12,-1
 242     nop
 243     ld    8,16(4)
 244     ld    0,24(4)
 245     std   8,16(3)
 246     std   0,24(3)
 247
 248     ld    6,0+32(4)
 249     ld    7,8+32(4)
 250     std   6,0+32(3)
 251     std   7,8+32(3)
 252     ld    8,16+32(4)
 253     ld    0,24+32(4)
 254     std   8,16+32(3)
 255     std   0,24+32(3)
 256
 257     ld    6,0+64(11)
 258     ld    7,8+64(11)
 259     std   6,0+64(10)
 260     std   7,8+64(10)
 261     ld    8,16+64(11)
 262     ld    0,24+64(11)
 263     std   8,16+64(10)
 264     std   0,24+64(10)
 265
 266     ld    6,0+96(11)
 267     ld    7,8+96(11)
 268     addi  4,4,128
 269     addi  3,3,128
 270     std   6,0+96(10)
 271     std   7,8+96(10)
 272     ld    8,16+96(11)
 273     ld    0,24+96(11)
 274     std   8,16+96(10)
 275     std   0,24+96(10)
 276     ble   cr5,L(das_loop_e)
 277
 278     mtctr   12
 279     .align  4
 280 L(das_loop2):
 281     ld    6,0(4)
 282     ld    7,8(4)
 283     mr    10,3
 284     mr    11,4
 285     std   6,0(3)
 286     std   7,8(3)
 287     ld    8,16(4)
 288     ld    0,24(4)
 289     std   8,16(3)
 290     std   0,24(3)
 291
 292     ld    6,0+32(4)
 293     ld    7,8+32(4)
 294     std   6,0+32(3)
 295     std   7,8+32(3)
 296     ld    8,16+32(4)
 297     ld    0,24+32(4)
 298     std   8,16+32(3)
 299     std   0,24+32(3)
 300
 301     ld    6,0+64(11)
 302     ld    7,8+64(11)
 303     std   6,0+64(10)
 304     std   7,8+64(10)
 305     ld    8,16+64(11)
 306     ld    0,24+64(11)
 307     std   8,16+64(10)
 308     std   0,24+64(10)
 309
 310     ld    6,0+96(11)
 311     ld    7,8+96(11)
 312     addi  4,4,128
 313     addi  3,3,128
 314     std   6,0+96(10)
 315     std   7,8+96(10)
 316     ld    8,16+96(11)
 317     ld    0,24+96(11)
 318     std   8,16+96(10)
 319     std   0,24+96(10)
 320     bdnz  L(das_loop2)
 321 L(das_loop_e):
 322 /* Check for a 1-7 byte tail, return if none.  */
 323     bne   cr1,L(das_tail2)
 324 /* Return original dst pointer.  */
 325     ld 3,-16(1)
 326     blr
 327     .align  4
 328 L(das_tail):
 329     beq   cr1,0f
 330
 331 L(das_tail2):
 332 /*  At this point we have a tail of 0-7 bytes and we know that the
 333     destination is double word aligned.  */
 334 4:  bf    29,2f
 335     lwz   6,0(4)
 336     stw   6,0(3)
 337     bf    30,5f
 338     lhz   6,4(4)
 339     sth   6,4(3)
 340     bf    31,0f
 341     lbz   6,6(4)
 342     stb   6,6(3)
 343     b     0f
 344 5:  bf    31,0f
 345     lbz   6,4(4)
 346     stb   6,4(3)
 347     b     0f
 348
 349 2:  bf    30,1f
 350     lhz   6,0(4)
 351     sth   6,0(3)
 352     bf    31,0f
 353     lbz   6,2(4)
 354     stb   6,2(3)
 355     b     0f
 356
 357 1:  bf    31,0f
 358     lbz   6,0(4)
 359     stb   6,0(3)
 360 0:
 361   /* Return original dst pointer.  */
 362     ld 3,-16(1)
 363     blr
 364
 365 /* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
 366    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 367    tests.
 368
 369    In the short (0-8 byte) case no attempt is made to force alignment
 370    of either source or destination.  The hardware will handle the
 371    unaligned load/stores with small delays for crossing 32-, 128-byte,
 372    and 4096-byte boundaries. Since these short moves are unlikely to be
 373    unaligned or cross these boundaries, the overhead to force
 374    alignment is not justified.
 375
 376    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
 377    boundaries.  Since only loads are sensitive to the 32-/128-byte
 378    boundaries it is more important to align the source than the
 379    destination.  If the source is not already word aligned, we first
 380    move 1-3 bytes as needed.  Since we are only word aligned we don't
 381    use double word load/stores to ensure that all loads are aligned.
 382    While the destination and stores may still be unaligned, this
 383    is only an issue for page (4096 byte boundary) crossing, which
 384    should be rare for these short moves.  The hardware handles this
 385    case automatically with a small (~20 cycle) delay.  */
 386     .align  4
 387 .L2:
 388     mtcrf 0x01,5
 389     neg   8,4
 390     clrrdi      11,4,2
 391     andi. 0,8,3
 392     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 393 /* At least 9 bytes left.  Get the source word aligned.  */
 394     cmpldi      cr1,5,16
 395     mr    10,5
 396     mr    12,4
 397     cmpldi      cr6,0,2
 398     beq   L(dus_tail)   /* If the source is already word aligned skip this.  */
 399 /* Copy 1-3 bytes to get source address word aligned.  */
 400     lwz   6,0(11)
 401     subf  10,0,5
 402     add   12,4,0
 403     blt   cr6,5f
 404     srdi  7,6,16
 405     bgt   cr6,3f
 406 #ifdef __LITTLE_ENDIAN__
 407     sth   7,0(3)
 408 #else
 409     sth   6,0(3)
 410 #endif
 411     b     7f
 412     .align  4
 413 3:
 414 #ifdef __LITTLE_ENDIAN__
 415     rotlwi 6,6,24
 416     stb   6,0(3)
 417     sth   7,1(3)
 418 #else
 419     stb   7,0(3)
 420     sth   6,1(3)
 421 #endif
 422     b     7f
 423     .align  4
 424 5:
 425 #ifdef __LITTLE_ENDIAN__
 426     rotlwi 6,6,8
 427 #endif
 428     stb   6,0(3)
 429 7:
 430     cmpldi      cr1,10,16
 431     add   3,3,0
 432     mtcrf 0x01,10
 433     .align  4
 434 L(dus_tail):
 435 /* At least 6 bytes left and the source is word aligned.  This allows
 436    some speculative loads up front.  */
 437 /* We need to special case the fall-through because the biggest delays
 438    are due to address computation not being ready in time for the
 439    AGEN.  */
 440     lwz   6,0(12)
 441     lwz   7,4(12)
 442     blt   cr1,L(dus_tail8)
 443     cmpldi      cr0,10,24
 444 L(dus_tail16): /* Move 16 bytes.  */
 445     stw   6,0(3)
 446     stw   7,4(3)
 447     lwz   6,8(12)
 448     lwz   7,12(12)
 449     stw   6,8(3)
 450     stw   7,12(3)
 451 /* Move 8 bytes more.  */
 452     bf    28,L(dus_tail16p8)
 453     cmpldi      cr1,10,28
 454     lwz   6,16(12)
 455     lwz   7,20(12)
 456     stw   6,16(3)
 457     stw   7,20(3)
 458 /* Move 4 bytes more.  */
 459     bf    29,L(dus_tail16p4)
 460     lwz   6,24(12)
 461     stw   6,24(3)
 462     addi  12,12,28
 463     addi  3,3,28
 464     bgt   cr1,L(dus_tail2)
 465  /* exactly 28 bytes.  Return original dst pointer and exit.  */
 466     ld    3,-16(1)
 467     blr
 468     .align  4
 469 L(dus_tail16p8):  /* less than 8 bytes left.  */
 470     beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
 471     cmpldi      cr1,10,20
 472     bf    29,L(dus_tail16p2)
 473 /* Move 4 bytes more.  */
 474     lwz   6,16(12)
 475     stw   6,16(3)
 476     addi  12,12,20
 477     addi  3,3,20
 478     bgt   cr1,L(dus_tail2)
 479  /* exactly 20 bytes.  Return original dst pointer and exit.  */
 480     ld    3,-16(1)
 481     blr
 482     .align  4
 483 L(dus_tail16p4):  /* less than 4 bytes left.  */
 484     addi  12,12,24
 485     addi  3,3,24
 486     bgt   cr0,L(dus_tail2)
 487  /* exactly 24 bytes.  Return original dst pointer and exit.  */
 488     ld    3,-16(1)
 489     blr
 490     .align  4
 491 L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
 492     addi  12,12,16
 493     addi  3,3,16
 494     b     L(dus_tail2)
 495
 496     .align  4
 497 L(dus_tail8):  /* Move 8 bytes.  */
 498 /*  r6, r7 already loaded speculatively.  */
 499     cmpldi      cr1,10,8
 500     cmpldi      cr0,10,12
 501     bf    28,L(dus_tail4)
 502     .align  2
 503     stw   6,0(3)
 504     stw   7,4(3)
 505 /* Move 4 bytes more.  */
 506     bf    29,L(dus_tail8p4)
 507     lwz   6,8(12)
 508     stw   6,8(3)
 509     addi  12,12,12
 510     addi  3,3,12
 511     bgt   cr0,L(dus_tail2)
 512  /* exactly 12 bytes.  Return original dst pointer and exit.  */
 513     ld    3,-16(1)
 514     blr
 515     .align  4
 516 L(dus_tail8p4):  /* less than 4 bytes left.  */
 517     addi  12,12,8
 518     addi  3,3,8
 519     bgt   cr1,L(dus_tail2)
 520  /* exactly 8 bytes.  Return original dst pointer and exit.  */
 521     ld    3,-16(1)
 522     blr
 523
 524     .align  4
 525 L(dus_tail4):  /* Move 4 bytes.  */
 526 /*  r6 already loaded speculatively.  If we are here we know there are
 527     more than 4 bytes left.  So there is no need to test.  */
 528     addi  12,12,4
 529     stw   6,0(3)
 530     addi  3,3,4
 531 L(dus_tail2):  /* Move 2-3 bytes.  */
 532     bf    30,L(dus_tail1)
 533     lhz   6,0(12)
 534     sth   6,0(3)
 535     bf    31,L(dus_tailX)
 536     lbz   7,2(12)
 537     stb   7,2(3)
 538     ld 3,-16(1)
 539     blr
 540 L(dus_tail1):  /* Move 1 byte.  */
 541     bf    31,L(dus_tailX)
 542     lbz   6,0(12)
 543     stb   6,0(3)
 544 L(dus_tailX):
 545   /* Return original dst pointer.  */
 546     ld    3,-16(1)
 547     blr
 548
 549 /* Special case to copy 0-8 bytes.  */
 550     .align  4
 551 .LE8:
 552     mr    12,4
 553     bne   cr6,L(dus_4)
 554 /* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
 555    cycle delay.  This case should be rare and any attempt to avoid this
 556    would take most of 20 cycles anyway.  */
 557     ld   6,0(4)
 558     std   6,0(3)
 559   /* Return original dst pointer.  */
 560     ld    3,-16(1)
 561     blr
 562     .align  4
 563 L(dus_4):
 564     bf    29,L(dus_tail2)
 565     lwz   6,0(4)
 566     stw   6,0(3)
 567     bf    30,L(dus_5)
 568     lhz   7,4(4)
 569     sth   7,4(3)
 570     bf    31,L(dus_0)
 571     lbz   8,6(4)
 572     stb   8,6(3)
 573     ld 3,-16(1)
 574     blr
 575     .align  4
 576 L(dus_5):
 577     bf    31,L(dus_0)
 578     lbz   6,4(4)
 579     stb   6,4(3)
 580 L(dus_0):
 581   /* Return original dst pointer.  */
 582     ld    3,-16(1)
 583     blr
 584
 585     .align  4
 586 .L6:
 587     cfi_offset(31,-8)
 588     mr    12,4
 589     mr    31,5
 590   /* Copy doublewords where the destination is aligned but the source is
 591      not.  Use aligned doubleword loads from the source, shifted to realign
 592      the data, to allow aligned destination stores.  */
 593     addi    11,9,-1  /* loop DW count is one less than total */
 594     subf    5,10,12  /* Move source addr to previous full double word.  */
 595     cmpldi  cr5, 10, 2
 596     cmpldi  cr0, 10, 4
 597     mr      4,3
 598     srdi    8,11,2   /* calculate the 32 byte loop count */
 599     ld      6,0(5)   /* pre load 1st full doubleword.  */
 600     mtcrf   0x01,11
 601     cmpldi  cr6,9,4
 602     mtctr   8
 603     ld      7,8(5)   /* pre load 2nd full doubleword.  */
 604     bge     cr0, L(du4_do)
 605     blt     cr5, L(du1_do)
 606     beq     cr5, L(du2_do)
 607     b       L(du3_do)
 608
 609     .align 4
 610 L(du1_do):
 611     bf      30,L(du1_1dw)
 612
 613     /* there are at least two DWs to copy */
 614     /* FIXME: can combine last shift and "or" into "rldimi" */
 615 #ifdef __LITTLE_ENDIAN__
 616     srdi     0,6, 8
 617     sldi     8,7, 64-8
 618 #else
 619     sldi     0,6, 8
 620     srdi     8,7, 64-8
 621 #endif
 622     or      0,0,8
 623     ld      6,16(5)
 624     std     0,0(4)
 625 #ifdef __LITTLE_ENDIAN__
 626     srdi     0,7, 8
 627     sldi     8,6, 64-8
 628 #else
 629     sldi     0,7, 8
 630     srdi     8,6, 64-8
 631 #endif
 632     or      0,0,8
 633     ld      7,24(5)
 634     std     0,8(4)
 635     addi    4,4,16
 636     addi    5,5,32
 637     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
 638     bf      31,L(du1_loop)
 639     /* there is a third DW to copy */
 640 #ifdef __LITTLE_ENDIAN__
 641     srdi     0,6, 8
 642     sldi     8,7, 64-8
 643 #else
 644     sldi     0,6, 8
 645     srdi     8,7, 64-8
 646 #endif
 647     or      0,0,8
 648     std     0,0(4)
 649     mr      6,7
 650     ld      7,0(5)
 651     addi    5,5,8
 652     addi    4,4,8
 653     beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
 654     b       L(du1_loop)
 655     .align 4
 656 L(du1_1dw):
 657 #ifdef __LITTLE_ENDIAN__
 658     srdi     0,6, 8
 659     sldi     8,7, 64-8
 660 #else
 661     sldi     0,6, 8
 662     srdi     8,7, 64-8
 663 #endif
 664     addi    5,5,16
 665     or      0,0,8
 666     bf      31,L(du1_loop)
 667     mr      6,7
 668     ld      7,0(5)
 669     addi    5,5,8
 670     std     0,0(4)
 671     addi    4,4,8
 672     .align 4
 673 /* copy 32 bytes at a time */
 674 L(du1_loop):
 675 #ifdef __LITTLE_ENDIAN__
 676     srdi   0,6, 8
 677     sldi   8,7, 64-8
 678 #else
 679     sldi   0,6, 8
 680     srdi   8,7, 64-8
 681 #endif
 682     or    0,0,8
 683     ld    6,0(5)
 684     std   0,0(4)
 685 #ifdef __LITTLE_ENDIAN__
 686     srdi   0,7, 8
 687     sldi   8,6, 64-8
 688 #else
 689     sldi   0,7, 8
 690     srdi   8,6, 64-8
 691 #endif
 692     or    0,0,8
 693     ld    7,8(5)
 694     std   0,8(4)
 695 #ifdef __LITTLE_ENDIAN__
 696     srdi   0,6, 8
 697     sldi   8,7, 64-8
 698 #else
 699     sldi   0,6, 8
 700     srdi   8,7, 64-8
 701 #endif
 702     or    0,0,8
 703     ld    6,16(5)
 704     std   0,16(4)
 705 #ifdef __LITTLE_ENDIAN__
 706     srdi   0,7, 8
 707     sldi   8,6, 64-8
 708 #else
 709     sldi   0,7, 8
 710     srdi   8,6, 64-8
 711 #endif
 712     or    0,0,8
 713     ld    7,24(5)
 714     std   0,24(4)
 715     addi  5,5,32
 716     addi  4,4,32
 717     bdnz+ L(du1_loop)
 718     .align 4
 719 L(du1_fini):
 720     /* calculate and store the final DW */
 721 #ifdef __LITTLE_ENDIAN__
 722     srdi   0,6, 8
 723     sldi   8,7, 64-8
 724 #else
 725     sldi   0,6, 8
 726     srdi   8,7, 64-8
 727 #endif
 728     or    0,0,8
 729     std   0,0(4)
 730     b     L(du_done)
 731
 732     .align 4
 733 L(du2_do):
 734     bf      30,L(du2_1dw)
 735
 736     /* there are at least two DWs to copy */
 737 #ifdef __LITTLE_ENDIAN__
 738     srdi     0,6, 16
 739     sldi     8,7, 64-16
 740 #else
 741     sldi     0,6, 16
 742     srdi     8,7, 64-16
 743 #endif
 744     or      0,0,8
 745     ld      6,16(5)
 746     std     0,0(4)
 747 #ifdef __LITTLE_ENDIAN__
 748     srdi     0,7, 16
 749     sldi     8,6, 64-16
 750 #else
 751     sldi     0,7, 16
 752     srdi     8,6, 64-16
 753 #endif
 754     or      0,0,8
 755     ld      7,24(5)
 756     std     0,8(4)
 757     addi    4,4,16
 758     addi    5,5,32
 759     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
 760     bf      31,L(du2_loop)
 761     /* there is a third DW to copy */
 762 #ifdef __LITTLE_ENDIAN__
 763     srdi     0,6, 16
 764     sldi     8,7, 64-16
 765 #else
 766     sldi     0,6, 16
 767     srdi     8,7, 64-16
 768 #endif
 769     or      0,0,8
 770     std     0,0(4)
 771     mr      6,7
 772     ld      7,0(5)
 773     addi    5,5,8
 774     addi    4,4,8
 775     beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
 776     b       L(du2_loop)
 777     .align 4
 778 L(du2_1dw):
 779 #ifdef __LITTLE_ENDIAN__
 780     srdi     0,6, 16
 781     sldi     8,7, 64-16
 782 #else
 783     sldi     0,6, 16
 784     srdi     8,7, 64-16
 785 #endif
 786     addi    5,5,16
 787     or      0,0,8
 788     bf      31,L(du2_loop)
 789     mr      6,7
 790     ld      7,0(5)
 791     addi    5,5,8
 792     std     0,0(4)
 793     addi    4,4,8
 794     .align 4
 795 /* copy 32 bytes at a time */
 796 L(du2_loop):
 797 #ifdef __LITTLE_ENDIAN__
 798     srdi   0,6, 16
 799     sldi   8,7, 64-16
 800 #else
 801     sldi   0,6, 16
 802     srdi   8,7, 64-16
 803 #endif
 804     or    0,0,8
 805     ld    6,0(5)
 806     std   0,0(4)
 807 #ifdef __LITTLE_ENDIAN__
 808     srdi   0,7, 16
 809     sldi   8,6, 64-16
 810 #else
 811     sldi   0,7, 16
 812     srdi   8,6, 64-16
 813 #endif
 814     or    0,0,8
 815     ld    7,8(5)
 816     std   0,8(4)
 817 #ifdef __LITTLE_ENDIAN__
 818     srdi   0,6, 16
 819     sldi   8,7, 64-16
 820 #else
 821     sldi   0,6, 16
 822     srdi   8,7, 64-16
 823 #endif
 824     or    0,0,8
 825     ld    6,16(5)
 826     std   0,16(4)
 827 #ifdef __LITTLE_ENDIAN__
 828     srdi   0,7, 16
 829     sldi   8,6, 64-16
 830 #else
 831     sldi   0,7, 16
 832     srdi   8,6, 64-16
 833 #endif
 834     or    0,0,8
 835     ld    7,24(5)
 836     std   0,24(4)
 837     addi  5,5,32
 838     addi  4,4,32
 839     bdnz+ L(du2_loop)
 840     .align 4
 841 L(du2_fini):
 842     /* calculate and store the final DW */
 843 #ifdef __LITTLE_ENDIAN__
 844     srdi   0,6, 16
 845     sldi   8,7, 64-16
 846 #else
 847     sldi   0,6, 16
 848     srdi   8,7, 64-16
 849 #endif
 850     or    0,0,8
 851     std   0,0(4)
 852     b     L(du_done)
 853
 854     .align 4
 855 L(du3_do):
 856     bf      30,L(du3_1dw)
 857
 858     /* there are at least two DWs to copy */
 859 #ifdef __LITTLE_ENDIAN__
 860     srdi     0,6, 24
 861     sldi     8,7, 64-24
 862 #else
 863     sldi     0,6, 24
 864     srdi     8,7, 64-24
 865 #endif
 866     or      0,0,8
 867     ld      6,16(5)
 868     std     0,0(4)
 869 #ifdef __LITTLE_ENDIAN__
 870     srdi     0,7, 24
 871     sldi     8,6, 64-24
 872 #else
 873     sldi     0,7, 24
 874     srdi     8,6, 64-24
 875 #endif
 876     or      0,0,8
 877     ld      7,24(5)
 878     std     0,8(4)
 879     addi    4,4,16
 880     addi    5,5,32
 881     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
 882     bf      31,L(du3_loop)
 883     /* there is a third DW to copy */
 884 #ifdef __LITTLE_ENDIAN__
 885     srdi     0,6, 24
 886     sldi     8,7, 64-24
 887 #else
 888     sldi     0,6, 24
 889     srdi     8,7, 64-24
 890 #endif
 891     or      0,0,8
 892     std     0,0(4)
 893     mr      6,7
 894     ld      7,0(5)
 895     addi    5,5,8
 896     addi    4,4,8
 897     beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
 898     b       L(du3_loop)
 899     .align 4
 900 L(du3_1dw):
 901 #ifdef __LITTLE_ENDIAN__
 902     srdi     0,6, 24
 903     sldi     8,7, 64-24
 904 #else
 905     sldi     0,6, 24
 906     srdi     8,7, 64-24
 907 #endif
 908     addi    5,5,16
 909     or      0,0,8
 910     bf      31,L(du3_loop)
 911     mr      6,7
 912     ld      7,0(5)
 913     addi    5,5,8
 914     std     0,0(4)
 915     addi    4,4,8
 916     .align 4
 917 /* copy 32 bytes at a time */
 918 L(du3_loop):
 919 #ifdef __LITTLE_ENDIAN__
 920     srdi   0,6, 24
 921     sldi   8,7, 64-24
 922 #else
 923     sldi   0,6, 24
 924     srdi   8,7, 64-24
 925 #endif
 926     or    0,0,8
 927     ld    6,0(5)
 928     std   0,0(4)
 929 #ifdef __LITTLE_ENDIAN__
 930     srdi   0,7, 24
 931     sldi   8,6, 64-24
 932 #else
 933     sldi   0,7, 24
 934     srdi   8,6, 64-24
 935 #endif
 936     or    0,0,8
 937     ld    7,8(5)
 938     std   0,8(4)
 939 #ifdef __LITTLE_ENDIAN__
 940     srdi   0,6, 24
 941     sldi   8,7, 64-24
 942 #else
 943     sldi   0,6, 24
 944     srdi   8,7, 64-24
 945 #endif
 946     or    0,0,8
 947     ld    6,16(5)
 948     std   0,16(4)
 949 #ifdef __LITTLE_ENDIAN__
 950     srdi   0,7, 24
 951     sldi   8,6, 64-24
 952 #else
 953     sldi   0,7, 24
 954     srdi   8,6, 64-24
 955 #endif
 956     or    0,0,8
 957     ld    7,24(5)
 958     std   0,24(4)
 959     addi  5,5,32
 960     addi  4,4,32
 961     bdnz+ L(du3_loop)
 962     .align 4
 963 L(du3_fini):
 964     /* calculate and store the final DW */
 965 #ifdef __LITTLE_ENDIAN__
 966     srdi   0,6, 24
 967     sldi   8,7, 64-24
 968 #else
 969     sldi   0,6, 24
 970     srdi   8,7, 64-24
 971 #endif
 972     or    0,0,8
 973     std   0,0(4)
 974     b     L(du_done)
 975
 976     .align 4
 977 L(du4_do):
 978     cmpldi  cr5, 10, 6
 979     beq     cr0, L(du4_dox)
 980     blt     cr5, L(du5_do)
 981     beq     cr5, L(du6_do)
 982     b       L(du7_do)
 983 L(du4_dox):
 984     bf      30,L(du4_1dw)
 985
 986     /* there are at least two DWs to copy */
 987 #ifdef __LITTLE_ENDIAN__
 988     srdi     0,6, 32
 989     sldi     8,7, 64-32
 990 #else
 991     sldi     0,6, 32
 992     srdi     8,7, 64-32
 993 #endif
 994     or      0,0,8
 995     ld      6,16(5)
 996     std     0,0(4)
 997 #ifdef __LITTLE_ENDIAN__
 998     srdi     0,7, 32
 999     sldi     8,6, 64-32
1000 #else
1001     sldi     0,7, 32
1002     srdi     8,6, 64-32
1003 #endif
1004     or      0,0,8
1005     ld      7,24(5)
1006     std     0,8(4)
1007     addi    4,4,16
1008     addi    5,5,32
1009     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
1010     bf      31,L(du4_loop)
1011     /* there is a third DW to copy */
1012 #ifdef __LITTLE_ENDIAN__
1013     srdi     0,6, 32
1014     sldi     8,7, 64-32
1015 #else
1016     sldi     0,6, 32
1017     srdi     8,7, 64-32
1018 #endif
1019     or      0,0,8
1020     std     0,0(4)
1021     mr      6,7
1022     ld      7,0(5)
1023     addi    5,5,8
1024     addi    4,4,8
1025     beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
1026     b       L(du4_loop)
1027     .align 4
1028 L(du4_1dw):
1029 #ifdef __LITTLE_ENDIAN__
1030     srdi     0,6, 32
1031     sldi     8,7, 64-32
1032 #else
1033     sldi     0,6, 32
1034     srdi     8,7, 64-32
1035 #endif
1036     addi    5,5,16
1037     or      0,0,8
1038     bf      31,L(du4_loop)
1039     mr      6,7
1040     ld      7,0(5)
1041     addi    5,5,8
1042     std     0,0(4)
1043     addi    4,4,8
1044     .align 4
1045 /* copy 32 bytes at a time */
1046 L(du4_loop):
1047 #ifdef __LITTLE_ENDIAN__
1048     srdi   0,6, 32
1049     sldi   8,7, 64-32
1050 #else
1051     sldi   0,6, 32
1052     srdi   8,7, 64-32
1053 #endif
1054     or    0,0,8
1055     ld    6,0(5)
1056     std   0,0(4)
1057 #ifdef __LITTLE_ENDIAN__
1058     srdi   0,7, 32
1059     sldi   8,6, 64-32
1060 #else
1061     sldi   0,7, 32
1062     srdi   8,6, 64-32
1063 #endif
1064     or    0,0,8
1065     ld    7,8(5)
1066     std   0,8(4)
1067 #ifdef __LITTLE_ENDIAN__
1068     srdi   0,6, 32
1069     sldi   8,7, 64-32
1070 #else
1071     sldi   0,6, 32
1072     srdi   8,7, 64-32
1073 #endif
1074     or    0,0,8
1075     ld    6,16(5)
1076     std   0,16(4)
1077 #ifdef __LITTLE_ENDIAN__
1078     srdi   0,7, 32
1079     sldi   8,6, 64-32
1080 #else
1081     sldi   0,7, 32
1082     srdi   8,6, 64-32
1083 #endif
1084     or    0,0,8
1085     ld    7,24(5)
1086     std   0,24(4)
1087     addi  5,5,32
1088     addi  4,4,32
1089     bdnz+ L(du4_loop)
1090     .align 4
1091 L(du4_fini):
1092     /* calculate and store the final DW */
1093 #ifdef __LITTLE_ENDIAN__
1094     srdi   0,6, 32
1095     sldi   8,7, 64-32
1096 #else
1097     sldi   0,6, 32
1098     srdi   8,7, 64-32
1099 #endif
1100     or    0,0,8
1101     std   0,0(4)
1102     b     L(du_done)
1103
1104     .align 4
1105 L(du5_do):
1106     bf      30,L(du5_1dw)
1107
1108     /* there are at least two DWs to copy */
1109 #ifdef __LITTLE_ENDIAN__
1110     srdi     0,6, 40
1111     sldi     8,7, 64-40
1112 #else
1113     sldi     0,6, 40
1114     srdi     8,7, 64-40
1115 #endif
1116     or      0,0,8
1117     ld      6,16(5)
1118     std     0,0(4)
1119 #ifdef __LITTLE_ENDIAN__
1120     srdi     0,7, 40
1121     sldi     8,6, 64-40
1122 #else
1123     sldi     0,7, 40
1124     srdi     8,6, 64-40
1125 #endif
1126     or      0,0,8
1127     ld      7,24(5)
1128     std     0,8(4)
1129     addi    4,4,16
1130     addi    5,5,32
1131     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
1132     bf      31,L(du5_loop)
1133     /* there is a third DW to copy */
1134 #ifdef __LITTLE_ENDIAN__
1135     srdi     0,6, 40
1136     sldi     8,7, 64-40
1137 #else
1138     sldi     0,6, 40
1139     srdi     8,7, 64-40
1140 #endif
1141     or      0,0,8
1142     std     0,0(4)
1143     mr      6,7
1144     ld      7,0(5)
1145     addi    5,5,8
1146     addi    4,4,8
1147     beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
1148     b       L(du5_loop)
1149     .align 4
         /* L(du5_1dw): branched to from L(du5_do) when CR bit 30 is clear.  */
         /* Merges r6/r7 with the 40-bit shift and advances r5 by 16.  If CR bit 31  */
         /* is set, the merged DW is stored at 0(r4) and one more DW is loaded before  */
         /* falling through into L(du5_loop).  If bit 31 is clear nothing is stored  */
         /* here; the loop recomputes the same merge from the unchanged r6/r7.  */
1150 L(du5_1dw):
1151 #ifdef __LITTLE_ENDIAN__
1152     srdi     0,6, 40             /* LE: r0 = r6 >> 40 */
1153     sldi     8,7, 64-40          /* LE: r8 = r7 << 24 */
1154 #else
1155     sldi     0,6, 40             /* BE: r0 = r6 << 40 */
1156     srdi     8,7, 64-40          /* BE: r8 = r7 >> 24 */
1157 #endif
1158     addi    5,5,16               /* r5 += 16 */
1159     or      0,0,8                /* r0 = merged doubleword */
1160     bf      31,L(du5_loop)       /* CR bit 31 clear: go straight to the loop */
1161     mr      6,7                  /* r6 = r7: newest DW becomes the older one */
1162     ld      7,0(5)               /* load the next DW into r7 */
1163     addi    5,5,8                /* r5 += 8 */
1164     std     0,0(4)               /* store the merged DW at 0(r4) */
1165     addi    4,4,8                /* r4 += 8 */
1166     .align 4
1167 /* copy 32 bytes at a time */
         /* Each pass stores four merged DWs (40-bit shift) at 0,8,16,24(r4) and loads  */
         /* four new DWs from 0,8,16,24(r5), alternating r6 and r7 as the newer value.  */
         /* Each load is placed between a merge and the store of its result.  */
         /* r4 and r5 then advance by 32; bdnz+ decrements CTR and repeats while it is  */
         /* non-zero.  NOTE(review): CTR is loaded before this chunk -- confirm.  */
1168 L(du5_loop):
1169 #ifdef __LITTLE_ENDIAN__
1170     srdi   0,6, 40        /* LE: r0 = r6 >> 40 */
1171     sldi   8,7, 64-40     /* LE: r8 = r7 << 24 */
1172 #else
1173     sldi   0,6, 40        /* BE: r0 = r6 << 40 */
1174     srdi   8,7, 64-40     /* BE: r8 = r7 >> 24 */
1175 #endif
1176     or    0,0,8           /* merged DW 0 (r6 then r7) */
1177     ld    6,0(5)          /* r6 = DW at r5+0 */
1178     std   0,0(4)
1179 #ifdef __LITTLE_ENDIAN__
1180     srdi   0,7, 40
1181     sldi   8,6, 64-40
1182 #else
1183     sldi   0,7, 40
1184     srdi   8,6, 64-40
1185 #endif
1186     or    0,0,8           /* merged DW 1 (r7 then r6) */
1187     ld    7,8(5)          /* r7 = DW at r5+8 */
1188     std   0,8(4)
1189 #ifdef __LITTLE_ENDIAN__
1190     srdi   0,6, 40
1191     sldi   8,7, 64-40
1192 #else
1193     sldi   0,6, 40
1194     srdi   8,7, 64-40
1195 #endif
1196     or    0,0,8           /* merged DW 2 (r6 then r7) */
1197     ld    6,16(5)         /* r6 = DW at r5+16 */
1198     std   0,16(4)
1199 #ifdef __LITTLE_ENDIAN__
1200     srdi   0,7, 40
1201     sldi   8,6, 64-40
1202 #else
1203     sldi   0,7, 40
1204     srdi   8,6, 64-40
1205 #endif
1206     or    0,0,8           /* merged DW 3 (r7 then r6) */
1207     ld    7,24(5)         /* r7 = DW at r5+24 */
1208     std   0,24(4)
1209     addi  5,5,32          /* advance load pointer by 32 bytes */
1210     addi  4,4,32          /* advance store pointer by 32 bytes */
1211     bdnz+ L(du5_loop)     /* CTR -= 1; loop while CTR != 0 (hinted taken) */
1212     .align 4
         /* L(du5_fini): last step of the 40-bit (5-byte) shift path.  Entered by  */
         /* fall-through when the bdnz+ of L(du5_loop) is not taken, or by the  */
         /* blt/beq cr6 branches in L(du5_do).  Merges r6 and r7 (the two most  */
         /* recently loaded DWs) into r0, stores it at 0(r4), then goes to L(du_done).  */
1213 L(du5_fini):
1214     /* calculate and store the final DW */
1215 #ifdef __LITTLE_ENDIAN__
1216     srdi   0,6, 40        /* LE: r0 = r6 >> 40 */
1217     sldi   8,7, 64-40     /* LE: r8 = r7 << 24 */
1218 #else
1219     sldi   0,6, 40        /* BE: r0 = r6 << 40 */
1220     srdi   8,7, 64-40     /* BE: r8 = r7 >> 24 */
1221 #endif
1222     or    0,0,8           /* r0 = merged doubleword */
1223     std   0,0(4)          /* store it at 0(r4) */
1224     b     L(du_done)      /* continue with the common tail handling */
1225
1226     .align 4
         /* L(du6_do): pre-loop part of the path that merges neighbouring source  */
         /* doublewords (DWs) with a 48-bit (6-byte) shift.  Loads go through r5,  */
         /* stores through r4; r6/r7 carry the two newest loaded DWs and r0/r8 are  */
         /* scratch for the merge.  */
         /* NOTE(review): CR bits 30/31, cr6 and the initial r6/r7 contents are set  */
         /* before this chunk; presumably r6/r7 hold the DWs at 0(r5)/8(r5) -- confirm.  */
1227 L(du6_do):
1228     bf      30,L(du6_1dw)        /* CR bit 30 clear: take the L(du6_1dw) path */
1229
1230     /* there are at least two DWs to copy */
1231 #ifdef __LITTLE_ENDIAN__
1232     srdi     0,6, 48             /* LE: r0 = r6 >> 48 */
1233     sldi     8,7, 64-48          /* LE: r8 = r7 << 16 */
1234 #else
1235     sldi     0,6, 48             /* BE: r0 = r6 << 48 */
1236     srdi     8,7, 64-48          /* BE: r8 = r7 >> 16 */
1237 #endif
1238     or      0,0,8                /* r0 = merged doubleword */
1239     ld      6,16(5)              /* load the DW at r5+16 into r6 */
1240     std     0,0(4)               /* store 1st merged DW at 0(r4) */
1241 #ifdef __LITTLE_ENDIAN__
1242     srdi     0,7, 48
1243     sldi     8,6, 64-48
1244 #else
1245     sldi     0,7, 48
1246     srdi     8,6, 64-48
1247 #endif
1248     or      0,0,8                /* same merge, r7 first and the new r6 second */
1249     ld      7,24(5)              /* load the DW at r5+24 into r7 */
1250     std     0,8(4)               /* store 2nd merged DW at 8(r4) */
1251     addi    4,4,16               /* r4 += 16 (two DWs stored) */
1252     addi    5,5,32               /* r5 += 32 (just past the last DW loaded) */
1253     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
1254     bf      31,L(du6_loop)       /* CR bit 31 clear: enter the loop directly */
1255     /* there is a third DW to copy */
1256 #ifdef __LITTLE_ENDIAN__
1257     srdi     0,6, 48
1258     sldi     8,7, 64-48
1259 #else
1260     sldi     0,6, 48
1261     srdi     8,7, 64-48
1262 #endif
1263     or      0,0,8                /* merge r6 then r7 again */
1264     std     0,0(4)               /* store 3rd merged DW at 0(r4) */
1265     mr      6,7                  /* r6 = r7: newest DW becomes the older one */
1266     ld      7,0(5)               /* load the next DW into r7 */
1267     addi    5,5,8                /* r5 += 8 */
1268     addi    4,4,8                /* r4 += 8 */
1269     beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
1270     b       L(du6_loop)
1271     .align 4
         /* L(du6_1dw): branched to from L(du6_do) when CR bit 30 is clear.  */
         /* Merges r6/r7 with the 48-bit shift and advances r5 by 16.  If CR bit 31  */
         /* is set, the merged DW is stored at 0(r4) and one more DW is loaded before  */
         /* falling through into L(du6_loop).  If bit 31 is clear nothing is stored  */
         /* here; the loop recomputes the same merge from the unchanged r6/r7.  */
1272 L(du6_1dw):
1273 #ifdef __LITTLE_ENDIAN__
1274     srdi     0,6, 48             /* LE: r0 = r6 >> 48 */
1275     sldi     8,7, 64-48          /* LE: r8 = r7 << 16 */
1276 #else
1277     sldi     0,6, 48             /* BE: r0 = r6 << 48 */
1278     srdi     8,7, 64-48          /* BE: r8 = r7 >> 16 */
1279 #endif
1280     addi    5,5,16               /* r5 += 16 */
1281     or      0,0,8                /* r0 = merged doubleword */
1282     bf      31,L(du6_loop)       /* CR bit 31 clear: go straight to the loop */
1283     mr      6,7                  /* r6 = r7: newest DW becomes the older one */
1284     ld      7,0(5)               /* load the next DW into r7 */
1285     addi    5,5,8                /* r5 += 8 */
1286     std     0,0(4)               /* store the merged DW at 0(r4) */
1287     addi    4,4,8                /* r4 += 8 */
1288     .align 4
1289 /* copy 32 bytes at a time */
         /* Each pass stores four merged DWs (48-bit shift) at 0,8,16,24(r4) and loads  */
         /* four new DWs from 0,8,16,24(r5), alternating r6 and r7 as the newer value.  */
         /* Each load is placed between a merge and the store of its result.  */
         /* r4 and r5 then advance by 32; bdnz+ decrements CTR and repeats while it is  */
         /* non-zero.  NOTE(review): CTR is loaded before this chunk -- confirm.  */
1290 L(du6_loop):
1291 #ifdef __LITTLE_ENDIAN__
1292     srdi   0,6, 48        /* LE: r0 = r6 >> 48 */
1293     sldi   8,7, 64-48     /* LE: r8 = r7 << 16 */
1294 #else
1295     sldi   0,6, 48        /* BE: r0 = r6 << 48 */
1296     srdi   8,7, 64-48     /* BE: r8 = r7 >> 16 */
1297 #endif
1298     or    0,0,8           /* merged DW 0 (r6 then r7) */
1299     ld    6,0(5)          /* r6 = DW at r5+0 */
1300     std   0,0(4)
1301 #ifdef __LITTLE_ENDIAN__
1302     srdi   0,7, 48
1303     sldi   8,6, 64-48
1304 #else
1305     sldi   0,7, 48
1306     srdi   8,6, 64-48
1307 #endif
1308     or    0,0,8           /* merged DW 1 (r7 then r6) */
1309     ld    7,8(5)          /* r7 = DW at r5+8 */
1310     std   0,8(4)
1311 #ifdef __LITTLE_ENDIAN__
1312     srdi   0,6, 48
1313     sldi   8,7, 64-48
1314 #else
1315     sldi   0,6, 48
1316     srdi   8,7, 64-48
1317 #endif
1318     or    0,0,8           /* merged DW 2 (r6 then r7) */
1319     ld    6,16(5)         /* r6 = DW at r5+16 */
1320     std   0,16(4)
1321 #ifdef __LITTLE_ENDIAN__
1322     srdi   0,7, 48
1323     sldi   8,6, 64-48
1324 #else
1325     sldi   0,7, 48
1326     srdi   8,6, 64-48
1327 #endif
1328     or    0,0,8           /* merged DW 3 (r7 then r6) */
1329     ld    7,24(5)         /* r7 = DW at r5+24 */
1330     std   0,24(4)
1331     addi  5,5,32          /* advance load pointer by 32 bytes */
1332     addi  4,4,32          /* advance store pointer by 32 bytes */
1333     bdnz+ L(du6_loop)     /* CTR -= 1; loop while CTR != 0 (hinted taken) */
1334     .align 4
         /* L(du6_fini): last step of the 48-bit (6-byte) shift path.  Entered by  */
         /* fall-through when the bdnz+ of L(du6_loop) is not taken, or by the  */
         /* blt/beq cr6 branches in L(du6_do).  Merges r6 and r7 (the two most  */
         /* recently loaded DWs) into r0, stores it at 0(r4), then goes to L(du_done).  */
1335 L(du6_fini):
1336     /* calculate and store the final DW */
1337 #ifdef __LITTLE_ENDIAN__
1338     srdi   0,6, 48        /* LE: r0 = r6 >> 48 */
1339     sldi   8,7, 64-48     /* LE: r8 = r7 << 16 */
1340 #else
1341     sldi   0,6, 48        /* BE: r0 = r6 << 48 */
1342     srdi   8,7, 64-48     /* BE: r8 = r7 >> 16 */
1343 #endif
1344     or    0,0,8           /* r0 = merged doubleword */
1345     std   0,0(4)          /* store it at 0(r4) */
1346     b     L(du_done)      /* continue with the common tail handling */
1347
1348     .align 4
         /* L(du7_do): pre-loop part of the path that merges neighbouring source  */
         /* doublewords (DWs) with a 56-bit (7-byte) shift.  Loads go through r5,  */
         /* stores through r4; r6/r7 carry the two newest loaded DWs and r0/r8 are  */
         /* scratch for the merge.  */
         /* NOTE(review): CR bits 30/31, cr6 and the initial r6/r7 contents are set  */
         /* before this chunk; presumably r6/r7 hold the DWs at 0(r5)/8(r5) -- confirm.  */
1349 L(du7_do):
1350     bf      30,L(du7_1dw)        /* CR bit 30 clear: take the L(du7_1dw) path */
1351
1352     /* there are at least two DWs to copy */
1353 #ifdef __LITTLE_ENDIAN__
1354     srdi     0,6, 56             /* LE: r0 = r6 >> 56 */
1355     sldi     8,7, 64-56          /* LE: r8 = r7 << 8 */
1356 #else
1357     sldi     0,6, 56             /* BE: r0 = r6 << 56 */
1358     srdi     8,7, 64-56          /* BE: r8 = r7 >> 8 */
1359 #endif
1360     or      0,0,8                /* r0 = merged doubleword */
1361     ld      6,16(5)              /* load the DW at r5+16 into r6 */
1362     std     0,0(4)               /* store 1st merged DW at 0(r4) */
1363 #ifdef __LITTLE_ENDIAN__
1364     srdi     0,7, 56
1365     sldi     8,6, 64-56
1366 #else
1367     sldi     0,7, 56
1368     srdi     8,6, 64-56
1369 #endif
1370     or      0,0,8                /* same merge, r7 first and the new r6 second */
1371     ld      7,24(5)              /* load the DW at r5+24 into r7 */
1372     std     0,8(4)               /* store 2nd merged DW at 8(r4) */
1373     addi    4,4,16               /* r4 += 16 (two DWs stored) */
1374     addi    5,5,32               /* r5 += 32 (just past the last DW loaded) */
1375     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
1376     bf      31,L(du7_loop)       /* CR bit 31 clear: enter the loop directly */
1377     /* there is a third DW to copy */
1378 #ifdef __LITTLE_ENDIAN__
1379     srdi     0,6, 56
1380     sldi     8,7, 64-56
1381 #else
1382     sldi     0,6, 56
1383     srdi     8,7, 64-56
1384 #endif
1385     or      0,0,8                /* merge r6 then r7 again */
1386     std     0,0(4)               /* store 3rd merged DW at 0(r4) */
1387     mr      6,7                  /* r6 = r7: newest DW becomes the older one */
1388     ld      7,0(5)               /* load the next DW into r7 */
1389     addi    5,5,8                /* r5 += 8 */
1390     addi    4,4,8                /* r4 += 8 */
1391     beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
1392     b       L(du7_loop)
1393     .align 4
         /* L(du7_1dw): branched to from L(du7_do) when CR bit 30 is clear.  */
         /* Merges r6/r7 with the 56-bit shift and advances r5 by 16.  If CR bit 31  */
         /* is set, the merged DW is stored at 0(r4) and one more DW is loaded before  */
         /* falling through into L(du7_loop).  If bit 31 is clear nothing is stored  */
         /* here; the loop recomputes the same merge from the unchanged r6/r7.  */
1394 L(du7_1dw):
1395 #ifdef __LITTLE_ENDIAN__
1396     srdi     0,6, 56             /* LE: r0 = r6 >> 56 */
1397     sldi     8,7, 64-56          /* LE: r8 = r7 << 8 */
1398 #else
1399     sldi     0,6, 56             /* BE: r0 = r6 << 56 */
1400     srdi     8,7, 64-56          /* BE: r8 = r7 >> 8 */
1401 #endif
1402     addi    5,5,16               /* r5 += 16 */
1403     or      0,0,8                /* r0 = merged doubleword */
1404     bf      31,L(du7_loop)       /* CR bit 31 clear: go straight to the loop */
1405     mr      6,7                  /* r6 = r7: newest DW becomes the older one */
1406     ld      7,0(5)               /* load the next DW into r7 */
1407     addi    5,5,8                /* r5 += 8 */
1408     std     0,0(4)               /* store the merged DW at 0(r4) */
1409     addi    4,4,8                /* r4 += 8 */
1410     .align 4
1411 /* copy 32 bytes at a time */
         /* Each pass stores four merged DWs (56-bit shift) at 0,8,16,24(r4) and loads  */
         /* four new DWs from 0,8,16,24(r5), alternating r6 and r7 as the newer value.  */
         /* Each load is placed between a merge and the store of its result.  */
         /* r4 and r5 then advance by 32; bdnz+ decrements CTR and repeats while it is  */
         /* non-zero.  NOTE(review): CTR is loaded before this chunk -- confirm.  */
1412 L(du7_loop):
1413 #ifdef __LITTLE_ENDIAN__
1414     srdi   0,6, 56        /* LE: r0 = r6 >> 56 */
1415     sldi   8,7, 64-56     /* LE: r8 = r7 << 8 */
1416 #else
1417     sldi   0,6, 56        /* BE: r0 = r6 << 56 */
1418     srdi   8,7, 64-56     /* BE: r8 = r7 >> 8 */
1419 #endif
1420     or    0,0,8           /* merged DW 0 (r6 then r7) */
1421     ld    6,0(5)          /* r6 = DW at r5+0 */
1422     std   0,0(4)
1423 #ifdef __LITTLE_ENDIAN__
1424     srdi   0,7, 56
1425     sldi   8,6, 64-56
1426 #else
1427     sldi   0,7, 56
1428     srdi   8,6, 64-56
1429 #endif
1430     or    0,0,8           /* merged DW 1 (r7 then r6) */
1431     ld    7,8(5)          /* r7 = DW at r5+8 */
1432     std   0,8(4)
1433 #ifdef __LITTLE_ENDIAN__
1434     srdi   0,6, 56
1435     sldi   8,7, 64-56
1436 #else
1437     sldi   0,6, 56
1438     srdi   8,7, 64-56
1439 #endif
1440     or    0,0,8           /* merged DW 2 (r6 then r7) */
1441     ld    6,16(5)         /* r6 = DW at r5+16 */
1442     std   0,16(4)
1443 #ifdef __LITTLE_ENDIAN__
1444     srdi   0,7, 56
1445     sldi   8,6, 64-56
1446 #else
1447     sldi   0,7, 56
1448     srdi   8,6, 64-56
1449 #endif
1450     or    0,0,8           /* merged DW 3 (r7 then r6) */
1451     ld    7,24(5)         /* r7 = DW at r5+24 */
1452     std   0,24(4)
1453     addi  5,5,32          /* advance load pointer by 32 bytes */
1454     addi  4,4,32          /* advance store pointer by 32 bytes */
1455     bdnz+ L(du7_loop)     /* CTR -= 1; loop while CTR != 0 (hinted taken) */
1456     .align 4
         /* L(du7_fini): last step of the 56-bit (7-byte) shift path.  Entered by  */
         /* fall-through when the bdnz+ of L(du7_loop) is not taken, or by the  */
         /* blt/beq cr6 branches in L(du7_do).  Merges r6 and r7 (the two most  */
         /* recently loaded DWs) into r0, stores it at 0(r4), then goes to L(du_done).  */
1457 L(du7_fini):
1458     /* calculate and store the final DW */
1459 #ifdef __LITTLE_ENDIAN__
1460     srdi   0,6, 56        /* LE: r0 = r6 >> 56 */
1461     sldi   8,7, 64-56     /* LE: r8 = r7 << 8 */
1462 #else
1463     sldi   0,6, 56        /* BE: r0 = r6 << 56 */
1464     srdi   8,7, 64-56     /* BE: r8 = r7 >> 8 */
1465 #endif
1466     or    0,0,8           /* r0 = merged doubleword */
1467     std   0,0(4)          /* store it at 0(r4) */
1468     b     L(du_done)      /* continue with the common tail handling */
1469
1470     .align 4
         /* L(du_done): common exit of the L(duN_fini) blocks.  Advances r3 and r12  */
         /* by r31 rounded down to a multiple of 8, copies the remaining bytes  */
         /* selected by the low three bits of r31 (word, halfword, byte), then  */
         /* reloads r31 and r3 from the stack and returns.  */
         /* NOTE(review): r31 is presumably the byte count left after the alignment  */
         /* prologue, r12 the source pointer and r3 the destination pointer; they and  */
         /* cr1 are set before this chunk -- confirm.  */
1471 L(du_done):
1472     rldicr 0,31,0,60      /* r0 = r31 with the low 3 bits cleared */
1473     mtcrf 0x01,31         /* CR bits 28-31 = low 4 bits of r31 */
1474     beq   cr1,0f        /* If the tail is 0 bytes we are done!  */
1475
1476     add   3,3,0           /* r3 += r0 */
1477     add   12,12,0         /* r12 += r0 */
1478 /*  At this point we have a tail of 0-7 bytes and we know that the
1479     destination is double word aligned.  */
1480 4:  bf    29,2f           /* CR bit 29 (r31 bit with value 4) clear: skip the word copy */
1481     lwz   6,0(12)         /* copy 4 bytes from 0(r12) ... */
1482     addi  12,12,4
1483     stw   6,0(3)          /* ... to 0(r3) */
1484     addi  3,3,4
1485 2:  bf    30,1f           /* CR bit 30 (value 2) clear: skip the halfword copy */
1486     lhz   6,0(12)         /* copy 2 bytes */
1487     addi  12,12,2
1488     sth   6,0(3)
1489     addi  3,3,2
1490 1:  bf    31,0f           /* CR bit 31 (value 1) clear: skip the byte copy */
1491     lbz   6,0(12)         /* copy the last byte */
1492     stb   6,0(3)
1493 0:
1494   /* Return original dst pointer.  */
1495     ld 31,-8(1)           /* reload r31 from -8(r1); presumably saved there on entry */
1496     ld 3,-16(1)           /* reload r3 (the return value) from -16(r1) */
1497     blr                   /* return to the caller */
1498 END_GEN_TB (MEMCPY,TB_TOCLESS)
1499 libc_hidden_builtin_def (memcpy)