/* Optimized memcpy implementation for PowerPC64.
- Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
+ Copyright (C) 2003-2019 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
+ <https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
-/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
Returns 'dst'.
Memcpy handles short copies (< 32-bytes) using binary move blocks
with the appropriate combination of byte and halfword load/stores.
There is minimal effort to optimize the alignment of short moves.
The 64-bit implementations of POWER3 and POWER4 do a reasonable job
- of handling unligned load/stores that do not cross 32-byte boundries.
+ of handling unaligned load/stores that do not cross 32-byte boundaries.
Longer moves (>= 32-bytes) justify the effort to get at least the
destination doubleword (8-byte) aligned. Further optimization is
- posible when both source and destination are doubleword aligned.
+ possible when both source and destination are doubleword aligned.
Each case has an optimized unrolled loop. */
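
As a reading aid rather than part of the patch, here is a rough C-level sketch of the strategy this comment describes: short copies take a direct byte/halfword path, longer copies first align the destination to a doubleword boundary and then move doublewords. All names are invented for this sketch, and the misaligned-source shifting path handled later in the file is omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative sketch only, not the glibc implementation.  */
static void *
sketch_memcpy (void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (len < 32)
    {
      /* Short copy: the assembly uses branch-selected byte/halfword/word
         moves instead of this loop.  */
      while (len--)
        *d++ = *s++;
      return dst;
    }

  /* Long copy: first move 0-7 bytes so the destination is doubleword
     (8-byte) aligned.  */
  while (((uintptr_t) d & 7) != 0)
    {
      *d++ = *s++;
      len--;
    }

  /* Copy full doublewords; a fixed-size memcpy is just portable C for a
     single 8-byte move.  */
  while (len >= 8)
    {
      memcpy (d, s, 8);
      d += 8;
      s += 8;
      len -= 8;
    }

  /* Tail of 0-7 bytes.  */
  while (len--)
    *d++ = *s++;
  return dst;
}
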
-EALIGN (BP_SYM (memcpy), 5, 0)
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+ENTRY_TOCLESS (MEMCPY, 5)
CALL_MCOUNT 3
cmpldi cr1,5,31
std 3,-16(1)
std 31,-8(1)
cfi_offset(31,-8)
- andi. 11,3,7 /* check alignement of dst. */
+ andi. 11,3,7 /* check alignment of dst. */
clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
- clrldi 10,4,61 /* check alignement of src. */
+ clrldi 10,4,61 /* check alignment of src. */
cmpldi cr6,5,8
ble- cr1,.L2 /* If move < 32 bytes use short move code. */
cmpld cr6,10,11
beq .L0
subf 31,0,5
- /* Move 0-7 bytes as needed to get the destination doubleword alligned. */
+ /* Move 0-7 bytes as needed to get the destination doubleword aligned. */
1: bf 31,2f
lbz 6,0(12)
addi 12,12,1
stw 6,0(3)
addi 3,3,4
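
The `bf 31,2f` test above skips the single-byte move when the low bit of the remaining-bytes count is clear; together with the companion halfword and word tests (elided here), the 0-7 alignment bytes are moved with at most one byte, one halfword and one word access. A hedged C sketch of the same decomposition, with invented names and assuming unaligned 2- and 4-byte accesses are acceptable, as the surrounding comments state for these CPUs:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only: move n (0-7) bytes using at most one byte, one
   halfword and one word move, mirroring the condition-register bit tests
   in the assembly.  */
static void
copy_head (unsigned char **dp, const unsigned char **sp, size_t n)
{
  unsigned char *d = *dp;
  const unsigned char *s = *sp;
  uint16_t h;
  uint32_t w;

  if (n & 1)            /* bit tested by bf 31: one byte */
    *d++ = *s++;
  if (n & 2)            /* one halfword */
    {
      memcpy (&h, s, 2);
      memcpy (d, &h, 2);
      d += 2;
      s += 2;
    }
  if (n & 4)            /* one word */
    {
      memcpy (&w, s, 4);
      memcpy (d, &w, 4);
      d += 4;
      s += 4;
    }
  *dp = d;
  *sp = s;
}
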
0:
- clrldi 10,12,61 /* check alignement of src again. */
+ clrldi 10,12,61 /* check alignment of src again. */
srdi 9,31,3 /* Number of full double words remaining. */
- /* Copy doublewords from source to destination, assumpting the
+ /* Copy doublewords from source to destination, assuming the
destination is aligned on a doubleword boundary.
At this point we know there are at least 25 bytes left (32-7) to copy.
add 12,12,0
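
Once the destination is doubleword aligned, and when the source turns out to be doubleword aligned as well, the bulk of the data moves through an unrolled doubleword loop, as the header comment promises. A minimal C sketch under that alignment assumption; the names and the unroll factor of four are invented for illustration:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: copy 'words' full doublewords, both pointers assumed
   8-byte aligned.  */
static void
copy_doublewords (uint64_t *dst, const uint64_t *src, size_t words)
{
  while (words >= 4)
    {
      /* Load a group, then store it, to give the CPU independent work.  */
      uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
      dst[0] = a;
      dst[1] = b;
      dst[2] = c;
      dst[3] = d;
      src += 4;
      dst += 4;
      words -= 4;
    }
  while (words--)
    *dst++ = *src++;
}
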
/* At this point we have a tail of 0-7 bytes and we know that the
- destiniation is double word aligned. */
+ destination is double word aligned. */
4: bf 29,2f
lwz 6,0(12)
addi 12,12,4
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
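
The `__LITTLE_ENDIAN__` blocks above exist because `stb` and `sth` always store the low-order end of a register, while the bytes that `lwz` places there depend on byte order: the trailing bytes of the loaded word sit at the low end on big-endian but at the high end on little-endian, hence the extra `rotlwi`/shift before the partial stores. A small self-contained C illustration of where a loaded byte lands (values invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const unsigned char mem[4] = { 0x11, 0x22, 0x33, 0x44 };
  uint32_t w;

  memcpy (&w, mem, 4);          /* one word load, like lwz 6,0(12) */

  /* Big-endian prints 0x11223344 (low byte 0x44, the last memory byte);
     little-endian prints 0x44332211 (low byte 0x11, the first one).  */
  printf ("loaded word     : 0x%08x\n", (unsigned) w);
  printf ("low byte  (stb) : 0x%02x\n", (unsigned) (w & 0xff));
  printf ("low half  (sth) : 0x%04x\n", (unsigned) (w & 0xffff));
  return 0;
}
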
cmpldi cr1,10,16
bne cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
slow for load/store doubles that are not at least word aligned.
- Unaligned Load/Store word execute with only a 1 cycle penaltity. */
+ Unaligned Load/Store word executes with only a 1 cycle penalty. */
lwz 6,0(4)
lwz 7,4(4)
stw 6,0(3)
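
A hedged C rendering of the trade-off this comment describes: one doubleword is moved as two word-sized accesses, because on the CPUs in question an unaligned word load or store costs only about a cycle while an unaligned doubleword access is slow. The function name is invented; fixed-size memcpy calls stand in for single word loads and stores:

#include <stdint.h>
#include <string.h>

/* Illustrative only: move 8 bytes as two 4-byte loads and stores, like
   the lwz/stw pairs in the assembly above.  */
static void
copy_doubleword_as_words (unsigned char *dst, const unsigned char *src)
{
  uint32_t w0, w1;

  memcpy (&w0, src, 4);
  memcpy (&w1, src + 4, 4);
  memcpy (dst, &w0, 4);
  memcpy (dst + 4, &w1, 4);
}
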
ld 7,8(5)
subfic 9,10,64
beq 2f
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+#else
sld 0,6,10
+#endif
cmpldi 11,1
mr 6,7
addi 4,4,-8
b 1f
2: addi 5,5,8
.align 4
+#ifdef __LITTLE_ENDIAN__
+0: srd 0,6,10
+ sld 8,7,9
+#else
0: sld 0,6,10
srd 8,7,9
+#endif
cmpldi 11,2
ld 6,8(5)
or 0,0,8
addi 11,11,-2
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+1: sld 8,6,9
+#else
sld 0,7,10
1: srd 8,6,9
+#endif
or 0,0,8
beq 8f
ld 7,16(5)
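
The `sld`/`srd` pairs above implement a shift-and-merge copy for a source that is not doubleword aligned: doublewords are loaded from aligned source addresses, and each aligned destination doubleword is assembled by OR-ing pieces of two consecutive source doublewords. On little-endian the two shift directions swap, which is exactly what the `__LITTLE_ENDIAN__` conditionals select. A little-endian C sketch of the idea, with invented names; the caller must guarantee that src_aligned[0..words] are all readable:

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: form 'words' destination doublewords from a source
   that starts 'misalign' (1-7) bytes past the aligned pointer
   'src_aligned'.  Little-endian byte order is assumed; on big-endian the
   two shifts trade places, as in the assembly.  */
static void
copy_shifted (uint64_t *dst, const uint64_t *src_aligned,
              unsigned misalign, size_t words)
{
  unsigned rshift = misalign * 8;       /* bits to drop from the current dw */
  unsigned lshift = 64 - rshift;        /* bits to take from the next dw */
  uint64_t cur = src_aligned[0];

  for (size_t i = 0; i < words; i++)
    {
      uint64_t next = src_aligned[i + 1];
      dst[i] = (cur >> rshift) | (next << lshift);
      cur = next;
    }
}
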
ld 31,-8(1)
ld 3,-16(1)
blr
-END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)