/* Optimized memcpy implementation for PowerPC64.
- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
+ Copyright (C) 2003-2019 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
+ <https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
-/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
Returns 'dst'.
Memcpy handles short copies (< 32-bytes) using binary move blocks
with the appropriate combination of byte and halfword load/stores.
There is minimal effort to optimize the alignment of short moves.
The 64-bit implementations of POWER3 and POWER4 do a reasonable job
- of handling unligned load/stores that do not cross 32-byte boundries.
+ of handling unaligned load/stores that do not cross 32-byte boundaries.
Longer moves (>= 32-bytes) justify the effort to get at least the
destination doubleword (8-byte) aligned. Further optimization is
- posible when both source and destination are doubleword aligned.
+ possible when both source and destination are doubleword aligned.
Each case has an optimized unrolled loop. */
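
As a reading aid rather than part of the patch, here is a rough C-level sketch of the strategy this comment describes: short copies take a direct byte/halfword path, longer copies first align the destination to a doubleword boundary and then move doublewords. All names are invented for this sketch, and the misaligned-source shifting path handled later in the file is omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch only, not the glibc implementation.  */
static void *
sketch_memcpy (void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (len < 32)
    {
      /* Short copy: the assembly uses branch-selected byte/halfword/word
         moves instead of this loop.  */
      while (len--)
        *d++ = *s++;
      return dst;
    }

  /* Long copy: first move 0-7 bytes so the destination is doubleword
     (8-byte) aligned.  */
  while (((uintptr_t) d & 7) != 0)
    {
      *d++ = *s++;
      len--;
    }

  /* Copy full doublewords; a fixed-size memcpy is just portable C for a
     single 8-byte move.  */
  while (len >= 8)
    {
      memcpy (d, s, 8);
      d += 8;
      s += 8;
      len -= 8;
    }

  /* Tail of 0-7 bytes.  */
  while (len--)
    *d++ = *s++;
  return dst;
}
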
-EALIGN (BP_SYM (memcpy), 5, 0)
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+ENTRY_TOCLESS (MEMCPY, 5)
CALL_MCOUNT 3
cmpldi cr1,5,31
std 3,-16(1)
std 31,-8(1)
cfi_offset(31,-8)
- andi. 11,3,7 /* check alignement of dst. */
+ andi. 11,3,7 /* check alignment of dst. */
clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
- clrldi 10,4,61 /* check alignement of src. */
+ clrldi 10,4,61 /* check alignment of src. */
cmpldi cr6,5,8
ble- cr1,.L2 /* If move < 32 bytes use short move code. */
cmpld cr6,10,11
beq .L0
subf 31,0,5
- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */
+ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */
1: bf 31,2f
lbz 6,0(12)
addi 12,12,1
stw 6,0(3)
addi 3,3,4
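
The `bf 31,2f` test above skips the single-byte move when the low bit of the remaining-bytes count is clear; together with the companion halfword and word tests (elided here), the 0-7 alignment bytes are moved with at most one byte, one halfword and one word access. A hedged C sketch of the same decomposition, with invented names and assuming unaligned 2- and 4-byte accesses are acceptable, as the surrounding comments state for these CPUs:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: move n (0-7) bytes using at most one byte, one
   halfword and one word move, mirroring the condition-register bit tests
   in the assembly.  */
static void
copy_head (unsigned char **dp, const unsigned char **sp, size_t n)
{
  unsigned char *d = *dp;
  const unsigned char *s = *sp;
  uint16_t h;
  uint32_t w;

  if (n & 1)            /* bit tested by bf 31: one byte */
    *d++ = *s++;
  if (n & 2)            /* one halfword */
    {
      memcpy (&h, s, 2);
      memcpy (d, &h, 2);
      d += 2;
      s += 2;
    }
  if (n & 4)            /* one word */
    {
      memcpy (&w, s, 4);
      memcpy (d, &w, 4);
      d += 4;
      s += 4;
    }
  *dp = d;
  *sp = s;
}
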
0:
- clrldi 10,12,61 /* check alignement of src again. */
+ clrldi 10,12,61 /* check alignment of src again. */
srdi 9,31,3 /* Number of full double words remaining. */
- /* Copy doublewords from source to destination, assumpting the
+ /* Copy doublewords from source to destination, assuming the
destination is aligned on a doubleword boundary.
At this point we know there are at least 25 bytes left (32-7) to copy.
add 12,12,0
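
Once the destination is doubleword aligned, and when the source turns out to be doubleword aligned as well, the bulk of the data moves through an unrolled doubleword loop, as the header comment promises. A minimal C sketch under that alignment assumption; the names and the unroll factor of four are invented for illustration:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: copy 'words' full doublewords, both pointers assumed
   8-byte aligned.  */
static void
copy_doublewords (uint64_t *dst, const uint64_t *src, size_t words)
{
  while (words >= 4)
    {
      /* Load a group, then store it, to give the CPU independent work.  */
      uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
      dst[0] = a;
      dst[1] = b;
      dst[2] = c;
      dst[3] = d;
      src += 4;
      dst += 4;
      words -= 4;
    }
  while (words--)
    *dst++ = *src++;
}
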
/* At this point we have a tail of 0-7 bytes and we know that the
- destiniation is double word aligned. */
+ destination is double word aligned. */
4: bf 29,2f
lwz 6,0(12)
addi 12,12,4
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
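
The `__LITTLE_ENDIAN__` blocks above exist because `stb` and `sth` always store the low-order end of a register, while the bytes that `lwz` places there depend on byte order: the trailing bytes of the loaded word sit at the low end on big-endian but at the high end on little-endian, hence the extra `rotlwi`/shift before the partial stores. A small self-contained C illustration of where a loaded byte lands (values invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const unsigned char mem[4] = { 0x11, 0x22, 0x33, 0x44 };
  uint32_t w;

  memcpy (&w, mem, 4);          /* one word load, like lwz 6,0(12) */

  /* Big-endian prints 0x11223344 (low byte 0x44, the last memory byte);
     little-endian prints 0x44332211 (low byte 0x11, the first one).  */
  printf ("loaded word     : 0x%08x\n", (unsigned) w);
  printf ("low byte  (stb) : 0x%02x\n", (unsigned) (w & 0xff));
  printf ("low half  (sth) : 0x%04x\n", (unsigned) (w & 0xffff));
  return 0;
}
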
cmpldi cr1,10,16
bne cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
slow for load/store doubles that are not at least word aligned.
- Unaligned Load/Store word execute with only a 1 cycle penaltity. */
+ Unaligned Load/Store word executes with only a 1 cycle penalty. */
lwz 6,0(4)
lwz 7,4(4)
stw 6,0(3)
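
A hedged C rendering of the trade-off this comment describes: one doubleword is moved as two word-sized accesses, because on the CPUs in question an unaligned word load or store costs only about a cycle while an unaligned doubleword access is slow. The function name is invented; fixed-size memcpy calls stand in for single word loads and stores:

#include <stdint.h>
#include <string.h>

/* Illustrative only: move 8 bytes as two 4-byte loads and stores, like
   the lwz/stw pairs in the assembly above.  */
static void
copy_doubleword_as_words (unsigned char *dst, const unsigned char *src)
{
  uint32_t w0, w1;

  memcpy (&w0, src, 4);
  memcpy (&w1, src + 4, 4);
  memcpy (dst, &w0, 4);
  memcpy (dst + 4, &w1, 4);
}
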
ld 7,8(5)
subfic 9,10,64
beq 2f
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+#else
sld 0,6,10
+#endif
cmpldi 11,1
mr 6,7
addi 4,4,-8
b 1f
2: addi 5,5,8
.align 4
+#ifdef __LITTLE_ENDIAN__
+0: srd 0,6,10
+ sld 8,7,9
+#else
0: sld 0,6,10
srd 8,7,9
+#endif
cmpldi 11,2
ld 6,8(5)
or 0,0,8
addi 11,11,-2
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+1: sld 8,6,9
+#else
sld 0,7,10
1: srd 8,6,9
+#endif
or 0,0,8
beq 8f
ld 7,16(5)
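
The `sld`/`srd` pairs above implement a shift-and-merge copy for a source that is not doubleword aligned: doublewords are loaded from aligned source addresses, and each aligned destination doubleword is assembled by OR-ing pieces of two consecutive source doublewords. On little-endian the two shift directions swap, which is exactly what the `__LITTLE_ENDIAN__` conditionals select. A little-endian C sketch of the idea, with invented names; the caller must guarantee that src_aligned[0..words] are all readable:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: form 'words' destination doublewords from a source
   that starts 'misalign' (1-7) bytes past the aligned pointer
   'src_aligned'.  Little-endian byte order is assumed; on big-endian the
   two shifts trade places, as in the assembly.  */
static void
copy_shifted (uint64_t *dst, const uint64_t *src_aligned,
              unsigned misalign, size_t words)
{
  unsigned rshift = misalign * 8;       /* bits to drop from the current dw */
  unsigned lshift = 64 - rshift;        /* bits to take from the next dw */
  uint64_t cur = src_aligned[0];

  for (size_t i = 0; i < words; i++)
    {
      uint64_t next = src_aligned[i + 1];
      dst[i] = (cur >> rshift) | (next << lshift);
      cur = next;
    }
}
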
ld 31,-8(1)
ld 3,-16(1)
blr
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)