[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / memcpy.S

/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.   */

/* The entry point is emitted under the name MEMCPY.  A definition of
   MEMCPY made before this point is kept; otherwise the routine is
   named memcpy.  */
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* MEMCPY: copy len (r5) bytes from src (r4) to dst (r3); return dst in r3.

   Conventions used throughout the routine:
     - The caller's dst is stored at -16(r1) on entry and reloaded into
       r3 before every blr, so r3 is free to serve as a store cursor.
     - r31 is stored at -8(r1) on entry.  Only the long (>= 32 byte)
       path writes r31 (bytes remaining once dst is aligned), and only
       the long-path returns reload it; the short paths never modify it.
     - r12 is the source cursor.
     - "mtcrf 0x01,rN" copies the low four bits of rN into cr7, so the
       CR bits 28, 29, 30 and 31 tested by "bf" stand for the 8, 4, 2 and
       1 bits of a byte or doubleword count.
     - No stack frame is allocated; both saves are made below r1.
       NOTE(review): this presumably relies on the ABI's protected area
       below the stack pointer -- confirm against the ELF ABI.  */
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

  /* Classify the copy.  cr1 compares len with 31 (short versus long
     move) and cr6 compares len with 8 (consumed by the short path).
     r0 = -dst, reduced below to the byte count needed to align dst.  */
    cmpldi cr1,5,31
    neg   0,3
    std   3,-16(1)
    std   31,-8(1)
    cfi_offset(31,-8)
    andi. 11,3,7	/* check alignment of dst.  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    cmpldi cr6,5,8
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
  /* Long move.  cr6 is reloaded with the comparison of the src and dst
     offsets within a doubleword (r10 versus r11); it is tested at .L0.
     cr7 receives the dst alignment byte count, r12 the source cursor
     and r31 the length.  The beq below tests cr0 from the andi. above:
     when dst is already doubleword aligned the alignment moves are
     skipped.  */
    cmpld cr6,10,11
    mr    12,4
    srdi  9,5,3		/* Number of full double words remaining.  */
    mtcrf 0x01,0
    mr    31,5
    beq   .L0

  /* r31 = len minus the alignment bytes about to be copied.  */
    subf  31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
1:  bf    31,2f
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,4f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
4:  bf    29,0f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
0:
    clrldi 10,12,61	/* check alignment of src again.  */
    srdi  9,31,3	/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not branch to the unaligned move code at .L6. which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
  /* r11 = tail byte count (r31 modulo 8); cr7 = low four bits of the
     doubleword count in r9.  cr6 still holds the src/dst offset
     comparison made on entry.  */
    clrldi	11,31,61
    mtcrf 0x01,9
    bne-  cr6,.L6   /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doubleword (32-bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes. These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.  */

  /* r8 = number of 32-byte iterations (moved to CTR below), cr1 =
     doubleword count compared with 4, cr6 = tail count compared with 0,
     r11 = load cursor.  r3 and r12 are left unchanged here; the loop
     uses r10/r11 as its store/load cursors.  */
    srdi  8,31,5
    cmpldi	cr1,9,4
    cmpldi	cr6,11,0
    mr    11,12

  /* If the 2 bit of the doubleword count is set, copy two doublewords
     up front, plus a third when the 1 bit is also set.  The blt skips
     the main loop when there are fewer than 4 doublewords in total.  */
    bf    30,1f
    ld    6,0(12)
    ld    7,8(12)
    addi  11,12,16
    mtctr 8
    std   6,0(3)
    std   7,8(3)
    addi  10,3,16
    bf    31,4f
    ld    0,16(12)
    std   0,16(3)
    blt   cr1,3f
    addi  11,12,24
    addi  10,3,24
    b     4f
    .align  4
1:
  /* 2 bit clear: r10 = store cursor; copy a single leading doubleword
     when the 1 bit is set.  */
    mr    10,3
    mtctr 8
    bf    31,4f
    ld    6,0(12)
    addi  11,12,8
    std   6,0(3)
    addi  10,3,8

  /* Main loop: four loads then four stores, 32 bytes per iteration,
     repeated CTR times.  */
    .align  4
4:
    ld    6,0(11)
    ld    7,8(11)
    ld    8,16(11)
    ld    0,24(11)
    addi  11,11,32
2:
    std   6,0(10)
    std   7,8(10)
    std   8,16(10)
    std   0,24(10)
    addi  10,10,32
    bdnz  4b
3:

  /* r0 = r31 rounded down to a multiple of 8, i.e. the bytes copied as
     doublewords; cr7 = low four bits of r31 to select the tail moves.
     Return straight away when cr6 says the tail is empty.  */
    rldicr 0,31,0,60
    mtcrf 0x01,31
    beq   cr6,0f
.L9:
  /* Step r3 and r12 past the doubleword-copied region.  The unaligned
     path at .L6 also branches here to copy its tail.  */
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  */
4:  bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr

/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-byte, and
   4096-byte boundaries. Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */

    .align  4
.L2:
  /* cr7 = low four bits of len; r11 = src rounded down to a word
     boundary; r0 = number of bytes needed to word-align src (andi.
     also sets cr0).  cr6 still holds the len-versus-8 comparison made
     on entry.  */
    mtcrf 0x01,5
    neg   8,4
    clrrdi	11,4,2
    andi. 0,8,3
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
  /* cr1 = len compared with 16, r10 = len, r12 = src.  cr6 is reused
     for r0 compared with 2, which selects the 1, 2 or 3 byte case.  */
    cmpldi	cr1,5,16
    mr    10,5
    mr    12,4
    cmpldi	cr6,0,2
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
  /* Load the aligned word that contains the leading source bytes; the
     bytes to copy are the last r0 bytes of that word in memory order.
     r10 = len - r0 and r12 = src + r0 describe what is left afterwards.
     r0 == 1 goes to label 5, r0 == 3 to label 3, r0 == 2 falls
     through.  r7 = upper 16 bits of the loaded word.  */
    lwz   6,0(11)
    subf  10,0,5
    add   12,4,0
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
  /* Two bytes: the halfword that holds them depends on endianness.  */
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
  /* Three bytes: one byte followed by a halfword.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
  /* One byte: on little endian it is first rotated into the low byte.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
  /* Refresh cr1 and cr7 from the remaining length in r10 and step dst
     past the bytes just stored.  */
    cmpldi	cr1,10,16
    add   3,3,0
    mtcrf 0x01,10
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  /* Move 2-3 bytes.  */
    bf    30,1f
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f
    lbz   7,2(12)
    stb   7,2(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
1:  /* Move 1 byte.  */
    bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

/* Special case to copy 0-8 bytes.  */
    .align  4
.LE8:
  /* r12 = src so that the shared 2-3 byte / 1 byte code above (reached
     through "2b") can be reused.  cr6 is the len-versus-8 comparison
     from entry: equal means exactly 8 bytes, copied as two words.  */
    mr    12,4
    bne   cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
  /* Fewer than 8 bytes.  With the 4 bit clear (0-3 bytes) branch back to
     the "Move 2-3 bytes" code above; otherwise copy one word and then
     the remaining 0-3 bytes at offset 4.  */
4:  bf    29,2b
    lwz   6,0(4)
    stw   6,0(3)
6:
    bf    30,5f
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f
    lbz   8,6(4)
    stb   8,6(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
    .align  4
5:
    bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

    .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
  /* Setup, with r10 = src offset within a doubleword and r9 = doubleword
     count on entry:
       r5  = src rounded down to a doubleword boundary (load cursor)
       cr0 = parity of the doubleword count (eq when even)
       cr6 = tail byte count compared with 0
       r10 = offset * 8, the shift count in bits
       r11 = doubleword count, r4 = store cursor
       r6, r7 = first two aligned source doublewords
       r9  = 64 - r10, the complementary shift count
     An even count enters the two-doubleword loop at label 0; an odd
     count first produces a single doubleword by entering at label 1.  */
    subf  5,10,12
    andi. 0,9,1
    cmpldi cr6,11,0
    sldi  10,10,3
    mr    11,9
    mr    4,3
    ld    6,0(5)
    ld    7,8(5)
    subfic  9,10,64
    beq   2f
  /* Odd count: start the first output doubleword from r6, make r7 the
     current doubleword, and bias r4 by -8 so that the store at 8(4)
     below lands on the first destination doubleword.  cr0 = count
     compared with 1 for the beq after label 1.  */
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
#else
    sld   0,6,10
#endif
    cmpldi  11,1
    mr    6,7
    addi  4,4,-8
    addi  11,11,-1
    b     1f
2:  addi  5,5,8
    .align  4
  /* Loop: each output doubleword is the OR of two adjacent aligned
     source doublewords, one shifted by r10 and the other by r9 in the
     opposite direction; the shift directions are swapped for little
     endian.  cr0 = remaining count compared with 2, so the beq below
     exits once the final pair has been combined.  */
#ifdef __LITTLE_ENDIAN__
0:  srd   0,6,10
    sld   8,7,9
#else
0:  sld   0,6,10
    srd   8,7,9
#endif
    cmpldi  11,2
    ld    6,8(5)
    or    0,0,8
    addi  11,11,-2
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
1:  sld   8,6,9
#else
    sld   0,7,10
1:  srd   8,6,9
#endif
    or    0,0,8
    beq   8f
    ld    7,16(5)
    std   0,8(4)
    addi  5,5,16
    addi  4,4,16
    b     0b
    .align 4
8:
  /* Store the last combined doubleword, then set up r0 and cr7 exactly
     as the aligned path does before .L9.  */
    std   0,8(4)
    rldicr 0,31,0,60
    mtcrf 0x01,31
    bne   cr6,.L9	/* Non-empty tail: copy it at .L9.  A 0 byte tail falls through and we are done.  */
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
Commit	Line	Data
a14b373c	1	/* Optimized memcpy implementation for PowerPC64.
688903eb	2	Copyright (C) 2003-2018 Free Software Foundation, Inc.
a14b373c UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
a14b373c UD	18
a14b373c UD	19	#include <sysdep.h>
a14b373c	20
f17a4233	21	/* void * [r3] memcpy (void dst [r3], void src [r4], size_t len [r5]);
a14b373c UD	22	Returns 'dst'.
a14b373c UD	23
7a41d99a UD	24	Memcpy handles short copies (< 32-bytes) using a binary move blocks
	25	(no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
	26	with the appropriate combination of byte and halfword load/stores.
	27	There is minimal effort to optimize the alignment of short moves.
a8870a61	28	The 64-bit implementations of POWER3 and POWER4 do a reasonable job
2ccdea26	29	of handling unaligned load/stores that do not cross 32-byte boundaries.
a14b373c UD	30
	31	Longer moves (>= 32-bytes) justify the effort to get at least the
	32	destination doubleword (8-byte) aligned. Further optimization is
2ccdea26	33	possible when both source and destination are doubleword aligned.
a14b373c UD	34	Each case has a optimized unrolled loop. */
a14b373c UD	35
72fd128a WSM	36	#ifndef MEMCPY
	37	# define MEMCPY memcpy
	38	#endif
	39
d5b41185	40	ENTRY_TOCLESS (MEMCPY, 5)
d7d06f79 UD	41	CALL_MCOUNT 3
d7d06f79 UD	42
a14b373c UD	43	cmpldi cr1,5,31
a14b373c UD	44	neg 0,3
a8870a61	45	std 3,-16(1)
a14b373c	46	std 31,-8(1)
3e7e947f	47	cfi_offset(31,-8)
2ccdea26	48	andi. 11,3,7 /* check alignment of dst. */
a8870a61	49	clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
2ccdea26	50	clrldi 10,4,61 /* check alignment of src. */
a8870a61 UD	51	cmpldi cr6,5,8
a8870a61 UD	52	ble- cr1,.L2 /* If move < 32 bytes use short move code. */
7a41d99a	53	cmpld cr6,10,11
a14b373c	54	mr 12,4
a8870a61 UD	55	srdi 9,5,3 /* Number of full double words remaining. */
a8870a61 UD	56	mtcrf 0x01,0
a14b373c	57	mr 31,5
a8870a61	58	beq .L0
7a41d99a	59
a14b373c	60	subf 31,0,5
2ccdea26	61	/* Move 0-7 bytes as needed to get the destination doubleword aligned. */
a14b373c UD	62	1: bf 31,2f
	63	lbz 6,0(12)
	64	addi 12,12,1
	65	stb 6,0(3)
	66	addi 3,3,1
	67	2: bf 30,4f
	68	lhz 6,0(12)
	69	addi 12,12,2
	70	sth 6,0(3)
	71	addi 3,3,2
	72	4: bf 29,0f
	73	lwz 6,0(12)
	74	addi 12,12,4
	75	stw 6,0(3)
	76	addi 3,3,4
	77	0:
2ccdea26	78	clrldi 10,12,61 /* check alignment of src again. */
a8870a61	79	srdi 9,31,3 /* Number of full double words remaining. */
7a41d99a	80
2ccdea26	81	/* Copy doublewords from source to destination, assuming the
a14b373c UD	82	destination is aligned on a doubleword boundary.
a14b373c UD	83
a8870a61	84	At this point we know there are at least 25 bytes left (32-7) to copy.
7a41d99a	85	The next step is to determine if the source is also doubleword aligned.
a8870a61 UD	86	If not branch to the unaligned move code at .L6. which uses
a8870a61 UD	87	a load, shift, store strategy.
7a41d99a	88
a8870a61 UD	89	Otherwise source and destination are doubleword aligned, and we can
	90	the optimized doubleword copy loop. */
	91	.L0:
	92	clrldi 11,31,61
	93	mtcrf 0x01,9
	94	bne- cr6,.L6 /* If source is not DW aligned. */
a14b373c	95
a8870a61	96	/* Move doublewords where destination and source are DW aligned.
a14b373c	97	Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
7a41d99a	98	If the copy is not an exact multiple of 32 bytes, 1-3
a8870a61	99	doublewords are copied as needed to set up the main loop. After
7a41d99a	100	the main loop exits there may be a tail of 1-7 bytes. These byte are
a8870a61 UD	101	copied a word/halfword/byte at a time as needed to preserve alignment. */
	102
	103	srdi 8,31,5
a14b373c	104	cmpldi cr1,9,4
a8870a61 UD	105	cmpldi cr6,11,0
a8870a61 UD	106	mr 11,12
7a41d99a	107
a8870a61 UD	108	bf 30,1f
	109	ld 6,0(12)
	110	ld 7,8(12)
	111	addi 11,12,16
	112	mtctr 8
	113	std 6,0(3)
	114	std 7,8(3)
	115	addi 10,3,16
	116	bf 31,4f
	117	ld 0,16(12)
7a41d99a	118	std 0,16(3)
a8870a61 UD	119	blt cr1,3f
	120	addi 11,12,24
	121	addi 10,3,24
	122	b 4f
	123	.align 4
	124	1:
	125	mr 10,3
	126	mtctr 8
	127	bf 31,4f
	128	ld 6,0(12)
	129	addi 11,12,8
	130	std 6,0(3)
	131	addi 10,3,8
7a41d99a	132
a14b373c UD	133	.align 4
a14b373c UD	134	4:
a14b373c UD	135	ld 6,0(11)
a14b373c UD	136	ld 7,8(11)
a8870a61 UD	137	ld 8,16(11)
	138	ld 0,24(11)
	139	addi 11,11,32
	140	2:
a14b373c UD	141	std 6,0(10)
a14b373c UD	142	std 7,8(10)
a8870a61 UD	143	std 8,16(10)
	144	std 0,24(10)
	145	addi 10,10,32
	146	bdnz 4b
7a41d99a	147	3:
a14b373c	148
a14b373c	149	rldicr 0,31,0,60
a8870a61 UD	150	mtcrf 0x01,31
	151	beq cr6,0f
	152	.L9:
a14b373c UD	153	add 3,3,0
a14b373c UD	154	add 12,12,0
7a41d99a	155
a8870a61	156	/* At this point we have a tail of 0-7 bytes and we know that the
2ccdea26	157	destination is double word aligned. */
a8870a61 UD	158	4: bf 29,2f
a8870a61 UD	159	lwz 6,0(12)
a14b373c	160	addi 12,12,4
a14b373c	161	stw 6,0(3)
a14b373c	162	addi 3,3,4
a14b373c UD	163	2: bf 30,1f
	164	lhz 6,0(12)
	165	addi 12,12,2
	166	sth 6,0(3)
	167	addi 3,3,2
	168	1: bf 31,0f
	169	lbz 6,0(12)
a14b373c	170	stb 6,0(3)
a14b373c UD	171	0:
	172	/* Return original dst pointer. */
	173	ld 31,-8(1)
a8870a61 UD	174	ld 3,-16(1)
a8870a61 UD	175	blr
7a41d99a UD	176
	177	/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
	178	bytes. Each case is handled without loops, using binary (1,2,4,8)
	179	tests.
	180
a8870a61	181	In the short (0-8 byte) case no attempt is made to force alignment
7a41d99a UD	182	of either source or destination. The hardware will handle the
7a41d99a UD	183	unaligned load/stores with small delays for crossing 32- 64-byte, and
a8870a61	184	4096-byte boundaries. Since these short moves are unlikely to be
7a41d99a	185	unaligned or cross these boundaries, the overhead to force
a8870a61	186	alignment is not justified.
7a41d99a	187
a8870a61 UD	188	The longer (9-31 byte) move is more likely to cross 32- or 64-byte
a8870a61 UD	189	boundaries. Since only loads are sensitive to the 32-/64-byte
7a41d99a	190	boundaries it is more important to align the source then the
a8870a61	191	destination. If the source is not already word aligned, we first
7a41d99a UD	192	move 1-3 bytes as needed. Since we are only word aligned we don't
7a41d99a UD	193	use double word load/stores to insure that all loads are aligned.
a8870a61 UD	194	While the destination and stores may still be unaligned, this
	195	is only an issue for page (4096 byte boundary) crossing, which
	196	should be rare for these short moves. The hardware handles this
7a41d99a UD	197	case automatically with a small delay. */
7a41d99a UD	198
a8870a61 UD	199	.align 4
	200	.L2:
	201	mtcrf 0x01,5
	202	neg 8,4
	203	clrrdi 11,4,2
	204	andi. 0,8,3
	205	ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
	206	/* At least 9 bytes left. Get the source word aligned. */
	207	cmpldi cr1,5,16
	208	mr 10,5
	209	mr 12,4
	210	cmpldi cr6,0,2
	211	beq .L3 /* If the source is already word aligned skip this. */
	212	/* Copy 1-3 bytes to get source address word aligned. */
	213	lwz 6,0(11)
	214	subf 10,0,5
	215	add 12,4,0
	216	blt cr6,5f
	217	srdi 7,6,16
	218	bgt cr6,3f
759cfef3 AM	219	#ifdef __LITTLE_ENDIAN__
	220	sth 7,0(3)
	221	#else
a8870a61	222	sth 6,0(3)
759cfef3	223	#endif
a8870a61 UD	224	b 7f
	225	.align 4
	226	3:
759cfef3 AM	227	#ifdef __LITTLE_ENDIAN__
	228	rotlwi 6,6,24
	229	stb 6,0(3)
	230	sth 7,1(3)
	231	#else
a8870a61 UD	232	stb 7,0(3)
a8870a61 UD	233	sth 6,1(3)
759cfef3	234	#endif
a8870a61 UD	235	b 7f
	236	.align 4
	237	5:
759cfef3 AM	238	#ifdef __LITTLE_ENDIAN__
	239	rotlwi 6,6,8
	240	#endif
a8870a61 UD	241	stb 6,0(3)
	242	7:
	243	cmpldi cr1,10,16
	244	add 3,3,0
	245	mtcrf 0x01,10
	246	.align 4
	247	.L3:
	248	/* At least 6 bytes left and the source is word aligned. */
	249	blt cr1,8f
	250	16: /* Move 16 bytes. */
	251	lwz 6,0(12)
	252	lwz 7,4(12)
	253	stw 6,0(3)
	254	lwz 6,8(12)
	255	stw 7,4(3)
	256	lwz 7,12(12)
	257	addi 12,12,16
	258	stw 6,8(3)
	259	stw 7,12(3)
	260	addi 3,3,16
	261	8: /* Move 8 bytes. */
	262	bf 28,4f
	263	lwz 6,0(12)
	264	lwz 7,4(12)
	265	addi 12,12,8
	266	stw 6,0(3)
	267	stw 7,4(3)
	268	addi 3,3,8
	269	4: /* Move 4 bytes. */
	270	bf 29,2f
	271	lwz 6,0(12)
	272	addi 12,12,4
	273	stw 6,0(3)
7a41d99a	274	addi 3,3,4
a8870a61 UD	275	2: /* Move 2-3 bytes. */
	276	bf 30,1f
	277	lhz 6,0(12)
7a41d99a	278	sth 6,0(3)
a8870a61 UD	279	bf 31,0f
	280	lbz 7,2(12)
	281	stb 7,2(3)
	282	ld 3,-16(1)
	283	blr
	284	1: /* Move 1 byte. */
	285	bf 31,0f
	286	lbz 6,0(12)
	287	stb 6,0(3)
	288	0:
	289	/* Return original dst pointer. */
	290	ld 3,-16(1)
	291	blr
	292
	293	/* Special case to copy 0-8 bytes. */
	294	.align 4
	295	.LE8:
	296	mr 12,4
	297	bne cr6,4f
	298	/* Would have liked to use use ld/std here but the 630 processors are
7a41d99a	299	slow for load/store doubles that are not at least word aligned.
2ccdea26	300	Unaligned Load/Store word execute with only a 1 cycle penalty. */
a8870a61 UD	301	lwz 6,0(4)
	302	lwz 7,4(4)
	303	stw 6,0(3)
	304	stw 7,4(3)
	305	/* Return original dst pointer. */
	306	ld 3,-16(1)
	307	blr
	308	.align 4
	309	4: bf 29,2b
	310	lwz 6,0(4)
	311	stw 6,0(3)
	312	6:
	313	bf 30,5f
	314	lhz 7,4(4)
7a41d99a	315	sth 7,4(3)
a8870a61 UD	316	bf 31,0f
	317	lbz 8,6(4)
	318	stb 8,6(3)
	319	ld 3,-16(1)
	320	blr
	321	.align 4
7a41d99a	322	5:
a8870a61 UD	323	bf 31,0f
	324	lbz 6,4(4)
	325	stb 6,4(3)
	326	.align 4
	327	0:
	328	/* Return original dst pointer. */
	329	ld 3,-16(1)
a14b373c UD	330	blr
a14b373c UD	331
7c3164bc	332	.align 4
a14b373c	333	.L6:
a14b373c UD	334
	335	/* Copy doublewords where the destination is aligned but the source is
	336	not. Use aligned doubleword loads from the source, shifted to realign
	337	the data, to allow aligned destination stores. */
7c3164bc	338	subf 5,10,12
a8870a61 UD	339	andi. 0,9,1
a8870a61 UD	340	cmpldi cr6,11,0
a14b373c	341	sldi 10,10,3
a8870a61	342	mr 11,9
7c3164bc UD	343	mr 4,3
7c3164bc UD	344	ld 6,0(5)
a14b373c UD	345	ld 7,8(5)
	346	subfic 9,10,64
	347	beq 2f
759cfef3 AM	348	#ifdef __LITTLE_ENDIAN__
	349	srd 0,6,10
	350	#else
a14b373c	351	sld 0,6,10
759cfef3	352	#endif
a8870a61	353	cmpldi 11,1
a14b373c UD	354	mr 6,7
a14b373c UD	355	addi 4,4,-8
a8870a61	356	addi 11,11,-1
a14b373c UD	357	b 1f
	358	2: addi 5,5,8
	359	.align 4
759cfef3 AM	360	#ifdef __LITTLE_ENDIAN__
	361	0: srd 0,6,10
	362	sld 8,7,9
	363	#else
a14b373c UD	364	0: sld 0,6,10
a14b373c UD	365	srd 8,7,9
759cfef3	366	#endif
a8870a61	367	cmpldi 11,2
a14b373c UD	368	ld 6,8(5)
a14b373c UD	369	or 0,0,8
a8870a61	370	addi 11,11,-2
a14b373c	371	std 0,0(4)
759cfef3 AM	372	#ifdef __LITTLE_ENDIAN__
	373	srd 0,7,10
	374	1: sld 8,6,9
	375	#else
a14b373c UD	376	sld 0,7,10
a14b373c UD	377	1: srd 8,6,9
759cfef3	378	#endif
a14b373c UD	379	or 0,0,8
	380	beq 8f
	381	ld 7,16(5)
	382	std 0,8(4)
	383	addi 5,5,16
	384	addi 4,4,16
	385	b 0b
a8870a61	386	.align 4
a14b373c UD	387	8:
a14b373c UD	388	std 0,8(4)
a8870a61 UD	389	rldicr 0,31,0,60
	390	mtcrf 0x01,31
	391	bne cr6,.L9 /* If the tail is 0 bytes we are done! */
	392	/* Return original dst pointer. */
	393	ld 31,-8(1)
	394	ld 3,-16(1)
	395	blr
72fd128a	396	END_GEN_TB (MEMCPY,TB_TOCLESS)
85dd1003	397	libc_hidden_builtin_def (memcpy)