/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

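/* For reference, the semantics are those of this C sketch (illustrative
   only, not part of the build):

     void *
     mempcpy (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);
       return (char *) dst + len;
     }
*/
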
        .machine  power7
EALIGN (__mempcpy, 5, 0)
        CALL_MCOUNT

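/* The prologue sets up a 32-byte stack frame: r30 and r31 are saved at
   offsets 20 and 24, and r30 keeps the original DST so every exit path
   can compute the return value as DST + LEN.  */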
        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        stw     31,24(1)
        mr      30,3
        cmplwi  cr1,5,31
        neg     0,3
        cfi_offset(31,-8)
        ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
                                      code.  */

        andi.   11,3,7        /* Check alignment of DST.  */
        clrlwi  10,4,29       /* Check alignment of SRC.  */
        cmplw   cr6,10,11     /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srwi    9,5,3         /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrlwi  0,0,29
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

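/* CR7 (set from the low bits of r0, the byte count needed to reach
   8-byte alignment) drives the moves below: bf 31, 30 and 29 test the
   1-, 2- and 4-byte bits respectively.  */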
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrlwi  10,12,29      /* Check alignment of SRC again.  */
        srwi    9,31,3        /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrlwi  11,31,29
        mtcrf   0x01,9

        srwi    8,31,5
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

        bf      30,1f
        lfd     6,0(12)
        lfd     7,8(12)
        addi    11,12,16
        mtctr   8
        stfd    6,0(3)
        stfd    7,8(3)
        addi    10,3,16
        bf      31,4f
        lfd     0,16(12)
        stfd    0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        lfd     6,0(12)
        addi    11,12,8
        stfd    6,0(3)
        addi    10,3,8

        .align  4
4:      /* Main aligned copy loop.  Copies 32 bytes at a time.  */
        lfd     6,0(11)
        lfd     7,8(11)
        lfd     8,16(11)
        lfd     0,24(11)
        addi    11,11,32

        stfd    6,0(10)
        stfd    7,8(10)
        stfd    8,16(10)
        stfd    0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        /* Check for tail bytes.  */

        clrrwi  0,31,3
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

/* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmplwi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        cmplwi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmplwi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

/* Handle copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
4:      /* Copy 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
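
/* How the realignment works (big-endian case; the little-endian path
   mirrors it with lvsr and swapped vperm operands): lvx ignores the
   low 4 bits of the address and always loads an aligned quadword.
   lvsl yields the permute control {sh, sh+1, ..., sh+15} with
   sh = SRC & 0xF, so vperm picks bytes sh..sh+15 out of two
   consecutive aligned quadwords.  E.g. for SRC % 16 == 3, bytes 3..18
   of each aligned 32-byte window are extracted, which is exactly the
   misaligned source data.  */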
        .align  4
L(copy_GE_32_unaligned):
        andi.   11,3,15       /* Check alignment of DST.  */
        clrlwi  0,0,28        /* Number of bytes until the 1st
                                 quadword of DST.  */
        srwi    9,5,4         /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST
           quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        lfd     6,0(12)
        addi    12,12,8
        stfd    6,0(3)
        addi    3,3,8
0:
        clrlwi  10,12,28      /* Check alignment of SRC.  */
        srwi    9,31,4        /* Number of full quadwords remaining.  */

        /* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two indexes to speed up the indexed vector operations.  */
        clrlwi  11,31,28
        li      6,16          /* Index for 16-byte offsets.  */
        li      7,32          /* Index for 32-byte offsets.  */
        cmplwi  cr1,11,0
        srwi    8,31,5        /* Set up the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmplwi  cr6,9,1
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,12
#else
        lvsl    5,0,12
#endif
        lvx     3,0,12
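        /* At this point vr5 holds the permute control and vr3 the first
           aligned quadword of SRC; in the loop below the last quadword
           loaded carries over so each vperm combines two consecutive
           aligned quadwords.  */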
        bf      31,L(setup_unaligned_loop)

        /* Copy one quadword up front so the loop below can copy
           32 bytes per iteration.  */
        lvx     4,12,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

/* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is still faster than using
           unaligned vector instructions, though.  */

        lvx     4,11,6        /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,11,7        /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi  0,31,4
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

END (__mempcpy)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)