Prefer https to http for gnu.org and fsf.org URLs

[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power6 / memcpy.S
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S

index d105f8302ed8d63a844bf0192d2aa041e284f73d..89d36316e6e7f5ab522a93e88aacfd2823667f0e 100644 (file)
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@@ -1,5 +1,5 @@
  /* Optimized memcpy implementation for PowerPC64.
-   Copyright (C) 2003, 2006, 2007 Free Software Foundation, Inc.
+   Copyright (C) 2003-2019 Free Software Foundation, Inc.
     This file is part of the GNU C Library.
  
     The GNU C Library is free software; you can redistribute it and/or
@@ -13,57 +13,57 @@
     Lesser General Public License for more details.
  
     You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
-   02110-1301 USA.  */
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
  
  #include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
  
-/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
     Returns 'dst'.
  
-   Memcpy handles short copies (< 32-bytes) using a binary move blocks 
-   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled 
-   with the appropriate combination of byte and halfword load/stores. 
-   There is minimal effort to optimize the alignment of short moves.  
+   Memcpy handles short copies (< 32-bytes) using binary move blocks
+   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
+   with the appropriate combination of byte and halfword load/stores.
+   There is minimal effort to optimize the alignment of short moves.
     The 64-bit implementations of POWER3 and POWER4 do a reasonable job
-   of handling unligned load/stores that do not cross 32-byte boundries.
+   of handling unaligned load/stores that do not cross 32-byte boundaries.
  
     Longer moves (>= 32-bytes) justify the effort to get at least the
     destination doubleword (8-byte) aligned.  Further optimization is
-   posible when both source and destination are doubleword aligned.
-   Each case has a optimized unrolled loop.  
-     
-   For POWER6 unaligned loads will take a 20+ cycle hicup for any
+   possible when both source and destination are doubleword aligned.
+   Each case has an optimized unrolled loop.
+
+   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
     L1 cache miss that crosses a 32- or 128-byte boundary.  Store
-   is more forgiving and does not take a hicup until page or 
-   segment boundaries.  So we require doubleword alignment for 
+   is more forgiving and does not take a hiccup until page or
+   segment boundaries.  So we require doubleword alignment for
     the source but may take a risk and only require word alignment
     for the destination.  */
  
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
         .machine        "power6"
-EALIGN (BP_SYM (memcpy), 7, 0)
+ENTRY_TOCLESS (MEMCPY, 7)
         CALL_MCOUNT 3
  
      cmpldi cr1,5,31
      neg   0,3
      std   3,-16(1)
      std   31,-8(1)
-    andi. 11,3,7       /* check alignement of dst.  */
+    andi. 11,3,7       /* check alignment of dst.  */
      clrldi 0,0,61      /* Number of bytes until the 1st doubleword of dst.  */
-    clrldi 10,4,61     /* check alignement of src.  */
+    clrldi 10,4,61     /* check alignment of src.  */
      cmpldi cr6,5,8
      ble-  cr1,.L2      /* If move < 32 bytes use short move code.  */
      mtcrf 0x01,0
-    cmpld cr6,10,11  
+    cmpld cr6,10,11
      srdi  9,5,3                /* Number of full double words remaining.  */
      beq   .L0
-  
+
      subf  5,0,5
-  /* Move 0-7 bytes as needed to get the destination doubleword alligned.
-     Duplicate some code to maximize fall-throught and minimize agen delays.  */
+  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
+     Duplicate some code to maximize fall-through and minimize agen delays.  */
  1:  bf    31,2f
      lbz   6,0(4)
      stb   6,0(3)
@@ -79,7 +79,7 @@ EALIGN (BP_SYM (memcpy), 7, 0)
      lwz   6,1(4)
      stw   6,1(3)
      b     0f
-    
+
  2:  bf    30,4f
      lhz   6,0(4)
      sth   6,0(3)
@@ -87,26 +87,26 @@ EALIGN (BP_SYM (memcpy), 7, 0)
      lwz   6,2(4)
      stw   6,2(3)
      b     0f
-    
+
  4:  bf    29,0f
      lwz   6,0(4)
      stw   6,0(3)
-0: 
+0:
  /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
      add   4,4,0
      add   3,3,0
-    
-    clrldi 10,4,61     /* check alignement of src again.  */     
+
+    clrldi 10,4,61     /* check alignment of src again.  */
      srdi  9,5,3        /* Number of full double words remaining.  */
-    
-  /* Copy doublewords from source to destination, assumpting the
+
+  /* Copy doublewords from source to destination, assuming the
       destination is aligned on a doubleword boundary.
  
       At this point we know there are at least 25 bytes left (32-7) to copy.
-     The next step is to determine if the source is also doubleword aligned. 
+     The next step is to determine if the source is also doubleword aligned.
       If not branch to the unaligned move code at .L6. which uses
       a load, shift, store strategy.
-     
+
       Otherwise source and destination are doubleword aligned, and we can
       the optimized doubleword copy loop.  */
      .align  4
@@ -119,19 +119,19 @@ EALIGN (BP_SYM (memcpy), 7, 0)
  
    /* Move doublewords where destination and source are DW aligned.
       Use a unrolled loop to copy 16 doublewords (128-bytes) per iteration.
-     If the the copy is not an exact multiple of 128 bytes, 1-15
+     If the copy is not an exact multiple of 128 bytes, 1-15
       doublewords are copied as needed to set up the main loop.  After
       the main loop exits there may be a tail of 1-7 bytes. These byte
       are copied a word/halfword/byte at a time as needed to preserve
       alignment.
-     
+
       For POWER6 the L1 is store-through and the L2 is store-in.  The
       L2 is clocked at half CPU clock so we can store 16 bytes every
       other cycle.  POWER6 also has a load/store bypass so we can do
-     load, load, store, store every 2 cycles.  
-     
+     load, load, store, store every 2 cycles.
+
       The following code is sensitive to cache line alignment.  Do not
-     make any change with out first making sure thay don't result in
+     make any change without first making sure it doesn't result in
       splitting ld/std pairs across a cache line.  */
  
      mtcrf 0x02,5
@@ -274,7 +274,7 @@ L(das_loop):
      std   8,16+96(10)
      std   0,24+96(10)
      ble   cr5,L(das_loop_e)
-    
+
      mtctr   12
      .align  4
  L(das_loop2):
@@ -327,10 +327,10 @@ L(das_loop_e):
      .align  4
  L(das_tail):
      beq   cr1,0f
-    
+
  L(das_tail2):
  /*  At this point we have a tail of 0-7 bytes and we know that the
-    destiniation is double word aligned.  */
+    destination is double word aligned.  */
  4:  bf    29,2f
      lwz   6,0(4)
      stw   6,0(3)
@@ -345,7 +345,7 @@ L(das_tail2):
      lbz   6,4(4)
      stb   6,4(3)
      b     0f
-  
+
  2:  bf    30,1f
      lhz   6,0(4)
      sth   6,0(3)
@@ -353,7 +353,7 @@ L(das_tail2):
      lbz   6,2(4)
      stb   6,2(3)
      b     0f
-    
+
  1:  bf    31,0f
      lbz   6,0(4)
      stb   6,0(3)
@@ -362,7 +362,7 @@ L(das_tail2):
      ld 3,-16(1)
      blr
  
-/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31 
+/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
     bytes.  Each case is handled without loops, using binary (1,2,4,8)
     tests.
  
@@ -403,15 +403,28 @@ L(das_tail2):
      blt   cr6,5f
      srdi  7,6,16
      bgt          cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
      sth   6,0(3)
+#endif
      b     7f
      .align  4
  3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
      stb   7,0(3)
      sth   6,1(3)
+#endif
      b     7f
      .align  4
  5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
      stb   6,0(3)
  7:
      cmpldi     cr1,10,16
@@ -422,7 +435,7 @@ L(dus_tail):
  /* At least 6 bytes left and the source is word aligned.  This allows
     some speculative loads up front.  */
  /* We need to special case the fall-through because the biggest delays
-   are due to address computation not being ready in time for the 
+   are due to address computation not being ready in time for the
     AGEN.  */
      lwz   6,0(12)
      lwz   7,4(12)
@@ -453,7 +466,7 @@ L(dus_tail16): /* Move 16 bytes.  */
      ld    3,-16(1)
      blr
      .align  4
-L(dus_tail16p8):  /* less then 8 bytes left.  */
+L(dus_tail16p8):  /* less than 8 bytes left.  */
      beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
      cmpldi     cr1,10,20
      bf    29,L(dus_tail16p2)
@@ -467,7 +480,7 @@ L(dus_tail16p8):  /* less then 8 bytes left.  */
      ld    3,-16(1)
      blr
      .align  4
-L(dus_tail16p4):  /* less then 4 bytes left.  */
+L(dus_tail16p4):  /* less than 4 bytes left.  */
      addi  12,12,24
      addi  3,3,24
      bgt   cr0,L(dus_tail2)
@@ -475,7 +488,7 @@ L(dus_tail16p4):  /* less then 4 bytes left.  */
      ld    3,-16(1)
      blr
      .align  4
-L(dus_tail16p2):  /* 16 bytes moved, less then 4 bytes left.  */
+L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
      addi  12,12,16
      addi  3,3,16
      b     L(dus_tail2)
@@ -500,7 +513,7 @@ L(dus_tail8):  /* Move 8 bytes.  */
      ld    3,-16(1)
      blr
      .align  4
-L(dus_tail8p4):  /* less then 4 bytes left.  */
+L(dus_tail8p4):  /* less than 4 bytes left.  */
      addi  12,12,8
      addi  3,3,8
      bgt   cr1,L(dus_tail2)
@@ -511,14 +524,14 @@ L(dus_tail8p4):  /* less then 4 bytes left.  */
      .align  4
  L(dus_tail4):  /* Move 4 bytes.  */
  /*  r6 already loaded speculatively.  If we are here we know there is
-    more then 4 bytes left.  So there is no need to test.  */
+    more than 4 bytes left.  So there is no need to test.  */
      addi  12,12,4
      stw   6,0(3)
      addi  3,3,4
  L(dus_tail2):  /* Move 2-3 bytes.  */
      bf    30,L(dus_tail1)
      lhz   6,0(12)
-    sth   6,0(3) 
+    sth   6,0(3)
      bf    31,L(dus_tailX)
      lbz   7,2(12)
      stb   7,2(3)
@@ -538,7 +551,7 @@ L(dus_tailX):
  .LE8:
      mr    12,4
      bne   cr6,L(dus_4)
-/* Exactly 8 bytes.  We may cross a 32-/128-byte boundry and take a ~20
+/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
     cycle delay.  This case should be rare and any attempt to avoid this
     would take most of 20 cycles any way.  */
      ld   6,0(4)
@@ -553,7 +566,7 @@ L(dus_4):
      stw   6,0(3)
      bf    30,L(dus_5)
      lhz   7,4(4)
-    sth   7,4(3) 
+    sth   7,4(3)
      bf    31,L(dus_0)
      lbz   8,6(4)
      stb   8,6(3)
@@ -591,20 +604,31 @@ L(dus_0):
      bge     cr0, L(du4_do)
      blt     cr5, L(du1_do)
      beq     cr5, L(du2_do)
-    b       L(du3_do) 
-       
+    b       L(du3_do)
+
      .align 4
  L(du1_do):
      bf      30,L(du1_1dw)
  
      /* there are at least two DWs to copy */
+    /* FIXME: can combine last shift and "or" into "rldimi" */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
      sldi     0,6, 8
      srdi     8,7, 64-8
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 8
+    sldi     8,6, 64-8
+#else
      sldi     0,7, 8
      srdi     8,6, 64-8
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -613,8 +637,13 @@ L(du1_do):
      blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du1_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
      sldi     0,6, 8
      srdi     8,7, 64-8
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -625,8 +654,13 @@ L(du1_do):
      b       L(du1_loop)
      .align 4
  L(du1_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
      sldi     0,6, 8
      srdi     8,7, 64-8
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du1_loop)
@@ -638,23 +672,43 @@ L(du1_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du1_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
      sldi   0,6, 8
      srdi   8,7, 64-8
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
      sldi   0,7, 8
      srdi   8,6, 64-8
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
      sldi   0,6, 8
      srdi   8,7, 64-8
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
      sldi   0,7, 8
      srdi   8,6, 64-8
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -664,9 +718,14 @@ L(du1_loop):
      .align 4
  L(du1_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
      sldi   0,6, 8
      srdi   8,7, 64-8
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -675,13 +734,23 @@ L(du2_do):
      bf      30,L(du2_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
      sldi     0,6, 16
      srdi     8,7, 64-16
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 16
+    sldi     8,6, 64-16
+#else
      sldi     0,7, 16
      srdi     8,6, 64-16
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -690,8 +759,13 @@ L(du2_do):
      blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du2_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
      sldi     0,6, 16
      srdi     8,7, 64-16
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -702,8 +776,13 @@ L(du2_do):
      b       L(du2_loop)
      .align 4
  L(du2_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
      sldi     0,6, 16
      srdi     8,7, 64-16
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du2_loop)
@@ -715,23 +794,43 @@ L(du2_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du2_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
      sldi   0,6, 16
      srdi   8,7, 64-16
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
      sldi   0,7, 16
      srdi   8,6, 64-16
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
      sldi   0,6, 16
      srdi   8,7, 64-16
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
      sldi   0,7, 16
      srdi   8,6, 64-16
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -741,9 +840,14 @@ L(du2_loop):
      .align 4
  L(du2_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
      sldi   0,6, 16
      srdi   8,7, 64-16
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -752,13 +856,23 @@ L(du3_do):
      bf      30,L(du3_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
      sldi     0,6, 24
      srdi     8,7, 64-24
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 24
+    sldi     8,6, 64-24
+#else
      sldi     0,7, 24
      srdi     8,6, 64-24
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -767,8 +881,13 @@ L(du3_do):
      blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du3_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
      sldi     0,6, 24
      srdi     8,7, 64-24
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -779,8 +898,13 @@ L(du3_do):
      b       L(du3_loop)
      .align 4
  L(du3_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
      sldi     0,6, 24
      srdi     8,7, 64-24
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du3_loop)
@@ -792,23 +916,43 @@ L(du3_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du3_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
      sldi   0,6, 24
      srdi   8,7, 64-24
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
      sldi   0,7, 24
      srdi   8,6, 64-24
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
      sldi   0,6, 24
      srdi   8,7, 64-24
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
      sldi   0,7, 24
      srdi   8,6, 64-24
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -818,9 +962,14 @@ L(du3_loop):
      .align 4
  L(du3_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
      sldi   0,6, 24
      srdi   8,7, 64-24
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -835,13 +984,23 @@ L(du4_dox):
      bf      30,L(du4_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
      sldi     0,6, 32
      srdi     8,7, 64-32
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 32
+    sldi     8,6, 64-32
+#else
      sldi     0,7, 32
      srdi     8,6, 64-32
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -850,8 +1009,13 @@ L(du4_dox):
      blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du4_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
      sldi     0,6, 32
      srdi     8,7, 64-32
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -862,8 +1026,13 @@ L(du4_dox):
      b       L(du4_loop)
      .align 4
  L(du4_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
      sldi     0,6, 32
      srdi     8,7, 64-32
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du4_loop)
@@ -875,23 +1044,43 @@ L(du4_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du4_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
      sldi   0,6, 32
      srdi   8,7, 64-32
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
      sldi   0,7, 32
      srdi   8,6, 64-32
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
      sldi   0,6, 32
      srdi   8,7, 64-32
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
      sldi   0,7, 32
      srdi   8,6, 64-32
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -901,9 +1090,14 @@ L(du4_loop):
      .align 4
  L(du4_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
      sldi   0,6, 32
      srdi   8,7, 64-32
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -912,13 +1106,23 @@ L(du5_do):
      bf      30,L(du5_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
      sldi     0,6, 40
      srdi     8,7, 64-40
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 40
+    sldi     8,6, 64-40
+#else
      sldi     0,7, 40
      srdi     8,6, 64-40
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -927,8 +1131,13 @@ L(du5_do):
      blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du5_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
      sldi     0,6, 40
      srdi     8,7, 64-40
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -939,8 +1148,13 @@ L(du5_do):
      b       L(du5_loop)
      .align 4
  L(du5_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
      sldi     0,6, 40
      srdi     8,7, 64-40
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du5_loop)
@@ -952,23 +1166,43 @@ L(du5_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du5_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
      sldi   0,6, 40
      srdi   8,7, 64-40
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
      sldi   0,7, 40
      srdi   8,6, 64-40
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
      sldi   0,6, 40
      srdi   8,7, 64-40
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
      sldi   0,7, 40
      srdi   8,6, 64-40
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -978,9 +1212,14 @@ L(du5_loop):
      .align 4
  L(du5_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
      sldi   0,6, 40
      srdi   8,7, 64-40
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -989,13 +1228,23 @@ L(du6_do):
      bf      30,L(du6_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
      sldi     0,6, 48
      srdi     8,7, 64-48
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 48
+    sldi     8,6, 64-48
+#else
      sldi     0,7, 48
      srdi     8,6, 64-48
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -1004,8 +1253,13 @@ L(du6_do):
      blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du6_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
      sldi     0,6, 48
      srdi     8,7, 64-48
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -1016,8 +1270,13 @@ L(du6_do):
      b       L(du6_loop)
      .align 4
  L(du6_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
      sldi     0,6, 48
      srdi     8,7, 64-48
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du6_loop)
@@ -1029,23 +1288,43 @@ L(du6_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du6_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
      sldi   0,6, 48
      srdi   8,7, 64-48
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
      sldi   0,7, 48
      srdi   8,6, 64-48
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
      sldi   0,6, 48
      srdi   8,7, 64-48
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
      sldi   0,7, 48
      srdi   8,6, 64-48
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -1055,9 +1334,14 @@ L(du6_loop):
      .align 4
  L(du6_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
      sldi   0,6, 48
      srdi   8,7, 64-48
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
  
@@ -1066,13 +1350,23 @@ L(du7_do):
      bf      30,L(du7_1dw)
  
      /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
      sldi     0,6, 56
      srdi     8,7, 64-56
+#endif
      or      0,0,8
      ld      6,16(5)
      std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 56
+    sldi     8,6, 64-56
+#else
      sldi     0,7, 56
      srdi     8,6, 64-56
+#endif
      or      0,0,8
      ld      7,24(5)
      std     0,8(4)
@@ -1081,8 +1375,13 @@ L(du7_do):
      blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
      bf      31,L(du7_loop)
      /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
      sldi     0,6, 56
      srdi     8,7, 64-56
+#endif
      or      0,0,8
      std     0,0(4)
      mr      6,7
@@ -1093,8 +1392,13 @@ L(du7_do):
      b       L(du7_loop)
      .align 4
  L(du7_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
      sldi     0,6, 56
      srdi     8,7, 64-56
+#endif
      addi    5,5,16
      or      0,0,8
      bf      31,L(du7_loop)
@@ -1106,23 +1410,43 @@ L(du7_1dw):
      .align 4
  /* copy 32 bytes at a time */
  L(du7_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
      sldi   0,6, 56
      srdi   8,7, 64-56
+#endif
      or    0,0,8
      ld    6,0(5)
      std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
      sldi   0,7, 56
      srdi   8,6, 64-56
+#endif
      or    0,0,8
      ld    7,8(5)
      std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
      sldi   0,6, 56
      srdi   8,7, 64-56
+#endif
      or    0,0,8
      ld    6,16(5)
      std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
      sldi   0,7, 56
      srdi   8,6, 64-56
+#endif
      or    0,0,8
      ld    7,24(5)
      std   0,24(4)
@@ -1132,12 +1456,17 @@ L(du7_loop):
      .align 4
  L(du7_fini):
      /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
      sldi   0,6, 56
      srdi   8,7, 64-56
-    or    0,0,8  
+#endif
+    or    0,0,8
      std   0,0(4)
      b     L(du_done)
-    
+
      .align 4
  L(du_done):
      rldicr 0,31,0,60
@@ -1145,9 +1474,9 @@ L(du_done):
      beq   cr1,0f       /* If the tail is 0 bytes we are done!  */
  
      add   3,3,0
-    add   12,12,0    
+    add   12,12,0
  /*  At this point we have a tail of 0-7 bytes and we know that the
-    destiniation is double word aligned.  */
+    destination is double word aligned.  */
  4:  bf    29,2f
      lwz   6,0(12)
      addi  12,12,4
@@ -1166,5 +1495,5 @@ L(du_done):
      ld 31,-8(1)
      ld 3,-16(1)
      blr
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+END_GEN_TB (MEMCPY,TB_TOCLESS)
  libc_hidden_builtin_def (memcpy)