[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / mempcpy.S

/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
    Returns 'dst' + 'len'.

   Copies 'len' bytes from 'src' to 'dst'.  The incoming 'dst' is stored at
   -16(r1) on entry; every exit path reloads it and adds r5 (which is never
   modified) to form the return value.

   Dispatch:
     len <= 8                            L(copy_LE_8)
     9 <= len <= 31                      L(copy_LT_32)
     len >= 32, (src & 7) == (dst & 7)   doubleword ld/std loop
     len >= 32, (src & 7) != (dst & 7)   L(copy_GE_32_unaligned),
                                         lvx/vperm/stvx loop

   Register roles on the 32+ byte paths:
     r3    DST base; advanced by the alignment prologue and the tail code
     r12   SRC base; advanced in step with r3
     r10   running DST pointer inside the main loops
     r11   running SRC pointer inside the main loops
     r31   bytes remaining after the alignment prologue (saved at -8(r1)
           on entry and reloaded before returning from these paths)
     r9    number of full doublewords / quadwords remaining
     r8    main loop iteration count (moved to CTR)
     cr7   loaded with mtcrf 0x01 from the low four bits of a count so that
           'bf 28/29/30/31' test the 8/4/2/1 bits of that count.  */

#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	/* cr1 = len compared with 31.  r0 = -dst; its low bits later give
	   the distance from DST to the next alignment boundary.  The
	   incoming dst and r31 are stored below the stack pointer (r1 itself
	   is not adjusted).  */
	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
				       code.  */

	andi.	11,3,7	      /* Check alignment of DST (sets cr0).  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	/* r12 = SRC pointer, r31 = bytes remaining.  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	/* cr0 still holds the result of the andi. above: if DST is already
	   doubleword-aligned (and SRC with it, since the low three address
	   bits match) skip the alignment prologue.  */
	beq	L(copy_GE_32_aligned_cont)

	/* r0 = (-dst) & 7 = bytes needed to reach the next doubleword
	   boundary.  Its low bits go to cr7 to steer the bf tests below;
	   r31 = bytes left once those have been copied.  */
	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get DST aligned to 8 bytes by copying 1, 2 and/or 4 bytes.  SRC
	   shares the same low three address bits, so it ends up aligned
	   too.  */

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* SRC & 7 (r10 is reassigned below before
				 it is read).  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	/* r11 = remaining & 7 (tail bytes), recorded in cr6 for the check
	   after the loop.  cr7 = low bits of the doubleword count, r8 =
	   number of 32-byte loop iterations, cr1 = doubleword count compared
	   with 4.  r11 is then reused as the SRC read pointer.  */
	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 0~3 doublewords (doubleword count modulo 4) so that what is
	   left for the main loop is a whole number of 32-byte blocks, plus
	   the tail bytes.  */

	bf	30,1f
	/* (count & 2) is set: copy two doublewords.  */
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	/* (count & 1) is set as well: copy a third doubleword.  */
	ld	0,16(12)
	std	0,16(3)
	/* Fewer than 4 doublewords in total: nothing left for the 32-byte
	   loop, go straight to the tail handling.  */
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop. Copies 32-bytes at a time.  r11/r10 are
	   the running SRC/DST pointers; CTR holds the iteration count.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  r0 = remaining bytes rounded down to a
	   multiple of 8, i.e. the amount covered by the doubleword copies
	   above; cr7 = low bits of the remaining count.  cr6 (set before
	   the loop) says whether there are any tail bytes at all.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
	/* Step r3/r12 past the region copied as doublewords.  */
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  Reload r31 and the incoming dst from
	   below the stack pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  r31 is not modified on this path,
	   so the exits below do not reload it.  */
	.align	4
L(copy_LT_32):
	/* cr6 = len compared with 8, r12 = SRC pointer, cr7 = low four bits
	   of len.  */
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  r0 = (-src) & 3 = bytes needed to
	   word-align SRC (sets cr0), cr1 = len compared with 16, r10 = len.
	   r11 is written here but not read anywhere on this path.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  cr7 = low bits of the byte
	   count needed, r10 = bytes left once they have been copied.  */
	mtocrf  0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	/* Recompute cr1 and cr7 from the reduced byte count in r10.  */
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  cr1 tells
	   whether 16 or more bytes remain; cr7 holds the low four bits of
	   the remaining count.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  Also entered from L(copy_LE_8) (via 'bf 29,2b')
	   when fewer than 4 bytes are to be copied.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  On entry cr6 = len compared with 8,
	   cr7 = low four bits of len, and r12 == r4 == SRC.  */
	.align	4
L(copy_LE_8):
	/* Not exactly 8 bytes?  */
	bne	cr6,4f

	/* Though we could've used ld/std here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  With fewer than 4 bytes, branch back to the
	   2-3 byte code in L(copy_LT_32_aligned).  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where SRC and DST do not share the same
	alignment within a doubleword.  DST is first brought to quadword
	alignment; the main loop then uses quadword loads from SRC, permuted
	to realign the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword boundary of DST (r0 was
				 -dst).  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  cr7 = low bits of
	   the byte count needed, r31 = bytes left once they are copied.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC (r10 is
				 reassigned below before it is read).  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.
	   Also: r11 = remaining & 15 (tail bytes), recorded in cr1 and then
	   reused as the SRC read pointer; r10 = DST write pointer; cr7 = low
	   bits of the quadword count; cr6 = quadword count compared with
	   1.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
	/* v5 = permute control vector derived from the SRC address; v3 =
	   first quadword loaded for SRC.  The lvx/vperm pairs below combine
	   two neighbouring quadwords into one realigned 16-byte value.  */
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy one 16-byte block now, because the
	   loop below handles 32 bytes per iteration.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	/* v3 = v4: the quadword just loaded becomes the "previous" one for
	   the next permute.  */
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	/* At most one quadword in total: nothing left for the loop.  */
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = quadword at r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = quadword at r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  r0 = remaining bytes rounded down to a
	   multiple of 16, i.e. the amount covered by the vector copies;
	   cr7 = low four bits of the remaining count.  cr1 (set before the
	   loop) says whether there are any tail bytes at all.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

	/* Step r3/r12 past the region copied as quadwords.  */
	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  Reload r31 and the incoming dst from
	   below the stack pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
Commit	Line	Data
344d0b54	1	/* Optimized mempcpy implementation for POWER7.
04277e02	2	Copyright (C) 2010-2019 Free Software Foundation, Inc.
344d0b54 LM	3	Contributed by Luis Machado <luisgpm@br.ibm.com>.
	4	This file is part of the GNU C Library.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	17	License along with the GNU C Library; if not, see
59ba27a6 PE	18	<http://www.gnu.org/licenses/>. */
344d0b54 LM	19
344d0b54 LM	20	#include <sysdep.h>
344d0b54 LM	21
344d0b54 LM	22
f17a4233	23	/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
344d0b54 LM	24	Returns 'dst' + 'len'. */
344d0b54 LM	25
72fd128a WSM	26	#ifndef MEMPCPY
	27	# define MEMPCPY __mempcpy
	28	#endif
344d0b54	29	.machine power7
d5b41185	30	ENTRY_TOCLESS (MEMPCPY, 5)
344d0b54 LM	31	CALL_MCOUNT 3
	32
	33	cmpldi cr1,5,31
	34	neg 0,3
	35	std 3,-16(1)
	36	std 31,-8(1)
	37	cfi_offset(31,-8)
	38	ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move
	39	code. */
	40
	41	andi. 11,3,7 /* Check alignment of DST. */
	42
	43
	44	clrldi 10,4,61 /* Check alignment of SRC. */
	45	cmpld cr6,10,11 /* SRC and DST alignments match? */
	46	mr 12,4
	47	mr 31,5
	48	bne cr6,L(copy_GE_32_unaligned)
	49
	50	srdi 9,5,3 /* Number of full quadwords remaining. */
	51
	52	beq L(copy_GE_32_aligned_cont)
	53
	54	clrldi 0,0,61
	55	mtcrf 0x01,0
	56	subf 31,0,5
	57
	58	/* Get the SRC aligned to 8 bytes. */
	59
	60	1: bf 31,2f
	61	lbz 6,0(12)
	62	addi 12,12,1
	63	stb 6,0(3)
	64	addi 3,3,1
	65	2: bf 30,4f
	66	lhz 6,0(12)
	67	addi 12,12,2
	68	sth 6,0(3)
	69	addi 3,3,2
	70	4: bf 29,0f
	71	lwz 6,0(12)
	72	addi 12,12,4
	73	stw 6,0(3)
	74	addi 3,3,4
	75	0:
	76	clrldi 10,12,61 /* Check alignment of SRC again. */
	77	srdi 9,31,3 /* Number of full doublewords remaining. */
	78
	79	L(copy_GE_32_aligned_cont):
	80
	81	clrldi 11,31,61
	82	mtcrf 0x01,9
	83
	84	srdi 8,31,5
	85	cmpldi cr1,9,4
	86	cmpldi cr6,11,0
	87	mr 11,12
	88
	89	/* Copy 1~3 doublewords so the main loop starts
	90	at a multiple of 32 bytes. */
	91
	92	bf 30,1f
	93	ld 6,0(12)
	94	ld 7,8(12)
95	addi 11,12,16
96	mtctr 8
97	std 6,0(3)
98	std 7,8(3)
99	addi 10,3,16
100	bf 31,4f
101	ld 0,16(12)
102	std 0,16(3)
103	blt cr1,3f
104	addi 11,12,24
105	addi 10,3,24
106	b 4f
107
108	.align 4
109	1: /* Copy 1 doubleword and set the counter. */
110	mr 10,3
111	mtctr 8
112	bf 31,4f
113	ld 6,0(12)
114	addi 11,12,8
115	std 6,0(3)
116	addi 10,3,8
117
118	/* Main aligned copy loop. Copies 32-bytes at a time. */
119	.align 4
120	4:
121	ld 6,0(11)
122	ld 7,8(11)
123	ld 8,16(11)
124	ld 0,24(11)
125	addi 11,11,32
126
127	std 6,0(10)
128	std 7,8(10)
129	std 8,16(10)
130	std 0,24(10)
131	addi 10,10,32
132	bdnz 4b
133	3:
134
135	/* Check for tail bytes. */
136	rldicr 0,31,0,60
137	mtcrf 0x01,31
138	beq cr6,0f
139
140	.L9:
141	add 3,3,0
142	add 12,12,0
143
144	/* At this point we have a tail of 0-7 bytes and we know that the
145	destination is doubleword-aligned. */
146	4: /* Copy 4 bytes. */
147	bf 29,2f
148
149	lwz 6,0(12)
150	addi 12,12,4
151	stw 6,0(3)
152	addi 3,3,4
153	2: /* Copy 2 bytes. */
154	bf 30,1f
155
156	lhz 6,0(12)
157	addi 12,12,2
158	sth 6,0(3)
159	addi 3,3,2
160	1: /* Copy 1 byte. */
161	bf 31,0f
162
163	lbz 6,0(12)
164	stb 6,0(3)
165	0: /* Return DST + LEN pointer. */
166	ld 31,-8(1)
167	ld 3,-16(1)
168	add 3,3,5
169	blr
170
171	/* Handle copies of 0~31 bytes. */
172	.align 4
173	L(copy_LT_32):
174	cmpldi cr6,5,8
175	mr 12,4
176	mtcrf 0x01,5
177	ble cr6,L(copy_LE_8)
178
179	/* At least 9 bytes to go. */
180	neg 8,4
181	clrrdi 11,4,2
182	andi. 0,8,3
183	cmpldi cr1,5,16
184	mr 10,5
185	beq L(copy_LT_32_aligned)
186
187	/* Force 4-bytes alignment for SRC. */
188	mtocrf 0x01,0
189	subf 10,0,5
190	2: bf 30,1f
191
192	lhz 6,0(12)
193	addi 12,12,2
194	sth 6,0(3)
195	addi 3,3,2
196	1: bf 31,L(end_4bytes_alignment)
197
198	lbz 6,0(12)
199	addi 12,12,1
200	stb 6,0(3)
201	addi 3,3,1
202
203	.align 4
204	L(end_4bytes_alignment):
205	cmpldi cr1,10,16
206	mtcrf 0x01,10
207
208	L(copy_LT_32_aligned):
209	/* At least 6 bytes to go, and SRC is word-aligned. */
210	blt cr1,8f
211
212	/* Copy 16 bytes. */
213	lwz 6,0(12)
214	lwz 7,4(12)
215	stw 6,0(3)
216	lwz 8,8(12)
217	stw 7,4(3)
218	lwz 6,12(12)
219	addi 12,12,16
220	stw 8,8(3)
221	stw 6,12(3)
222	addi 3,3,16
223	8: /* Copy 8 bytes. */
224	bf 28,4f
225
226	lwz 6,0(12)
227	lwz 7,4(12)
228	addi 12,12,8
229	stw 6,0(3)
230	stw 7,4(3)
231	addi 3,3,8
232	4: /* Copy 4 bytes. */
233	bf 29,2f
234
235	lwz 6,0(12)
236	addi 12,12,4
237	stw 6,0(3)
238	addi 3,3,4
239	2: /* Copy 2-3 bytes. */
240	bf 30,1f
241
242	lhz 6,0(12)
243	sth 6,0(3)
244	bf 31,0f
245	lbz 7,2(12)
246	stb 7,2(3)
247	ld 3,-16(1)
248	add 3,3,5
249	blr
250
251	.align 4
252	1: /* Copy 1 byte. */
253	bf 31,0f
254
255	lbz 6,0(12)
256	stb 6,0(3)
257	0: /* Return DST + LEN pointer. */
258	ld 3,-16(1)
259	add 3,3,5
260	blr
261
262	/* Handles copies of 0~8 bytes. */
263	.align 4
264	L(copy_LE_8):
265	bne cr6,4f
266
267	/* Though we could've used ld/std here, they are still
268	slow for unaligned cases. */
269
270	lwz 6,0(4)
271	lwz 7,4(4)
272	stw 6,0(3)
273	stw 7,4(3)
274	ld 3,-16(1) /* Return DST + LEN pointer. */
275	add 3,3,5
276	blr
277
278	.align 4
279	4: /* Copies 4~7 bytes. */
280	bf 29,2b
281
282	lwz 6,0(4)
283	stw 6,0(3)
284	bf 30,5f
285	lhz 7,4(4)
286	sth 7,4(3)
287	bf 31,0f
288	lbz 8,6(4)
289	stb 8,6(3)
290	ld 3,-16(1)
291	add 3,3,5
292	blr
293
294	.align 4
295	5: /* Copy 1 byte. */
296	bf 31,0f
297
298	lbz 6,4(4)
299	stb 6,4(3)
300
301	0: /* Return DST + LEN pointer. */
302	ld 3,-16(1)
303	add 3,3,5
304	blr
305
306	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
307	SRC is not. Use aligned quadword loads from SRC, shifted to realign
308	the data, allowing for aligned DST stores. */
309	.align 4
310	L(copy_GE_32_unaligned):
311	clrldi 0,0,60 /* Number of bytes until the 1st
312	quadword. */
313	andi. 11,3,15 /* Check alignment of DST (against
314	quadwords). */
315	srdi 9,5,4 /* Number of full quadwords remaining. */
316
317	beq L(copy_GE_32_unaligned_cont)
318
319	/* SRC is not quadword aligned, get it aligned. */
320
321	mtcrf 0x01,0
322	subf 31,0,5
323
324	/* Vector instructions work best when proper alignment (16-bytes)
325	is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
326	1: /* Copy 1 byte. */
327	bf 31,2f
328
329	lbz 6,0(12)
330	addi 12,12,1
331	stb 6,0(3)
332	addi 3,3,1
333	2: /* Copy 2 bytes. */
334	bf 30,4f
335
336	lhz 6,0(12)
337	addi 12,12,2
338	sth 6,0(3)
339	addi 3,3,2
340	4: /* Copy 4 bytes. */
341	bf 29,8f
342
343	lwz 6,0(12)
344	addi 12,12,4
345	stw 6,0(3)
346	addi 3,3,4
347	8: /* Copy 8 bytes. */
348	bf 28,0f
349
350	ld 6,0(12)
351	addi 12,12,8
352	std 6,0(3)
353	addi 3,3,8
354	0:
355	clrldi 10,12,60 /* Check alignment of SRC. */
356	srdi 9,31,4 /* Number of full quadwords remaining. */
357
358	/* The proper alignment is present, it is OK to copy the bytes now. */
359	L(copy_GE_32_unaligned_cont):
360
361	/* Setup two indexes to speed up the indexed vector operations. */
362	clrldi 11,31,60
363	li 6,16 /* Index for 16-bytes offsets. */
364	li 7,32 /* Index for 32-bytes offsets. */
365	cmpldi cr1,11,0
366	srdi 8,31,5 /* Setup the loop counter. */
367	mr 10,3
368	mr 11,12
369	mtcrf 0x01,9
370	cmpldi cr6,9,1
759cfef3 AM	371	#ifdef __LITTLE_ENDIAN__
	372	lvsr 5,0,12
	373	#else
	374	lvsl 5,0,12
	375	#endif
344d0b54 LM	376	lvx 3,0,12
	377	bf 31,L(setup_unaligned_loop)
	378
	379	/* Copy another 16 bytes to align to 32-bytes due to the loop . */
	380	lvx 4,12,6
759cfef3 AM	381	#ifdef __LITTLE_ENDIAN__
	382	vperm 6,4,3,5
	383	#else
	384	vperm 6,3,4,5
	385	#endif
344d0b54 LM	386	addi 11,12,16
	387	addi 10,3,16
	388	stvx 6,0,3
	389	vor 3,4,4
	390
	391	L(setup_unaligned_loop):
	392	mtctr 8
	393	ble cr6,L(end_unaligned_loop)
	394
	395	/* Copy 32 bytes at a time using vector instructions. */
	396	.align 4
	397	L(unaligned_loop):
	398
	399	/* Note: vr6/vr10 may contain data that was already copied,
	400	but in order to get proper alignment, we may have to copy
	401	some portions again. This is faster than having unaligned
	402	vector instructions though. */
	403
	404	lvx 4,11,6 /* vr4 = r11+16. */
759cfef3 AM	405	#ifdef __LITTLE_ENDIAN__
	406	vperm 6,4,3,5
	407	#else
	408	vperm 6,3,4,5
	409	#endif
344d0b54	410	lvx 3,11,7 /* vr3 = r11+32. */
759cfef3 AM	411	#ifdef __LITTLE_ENDIAN__
	412	vperm 10,3,4,5
	413	#else
	414	vperm 10,4,3,5
	415	#endif
344d0b54 LM	416	addi 11,11,32
	417	stvx 6,0,10
	418	stvx 10,10,6
	419	addi 10,10,32
	420
	421	bdnz L(unaligned_loop)
	422
	423	.align 4
	424	L(end_unaligned_loop):
	425
	426	/* Check for tail bytes. */
	427	rldicr 0,31,0,59
	428	mtcrf 0x01,31
	429	beq cr1,0f
	430
	431	add 3,3,0
	432	add 12,12,0
	433
	434	/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
	435	8: /* Copy 8 bytes. */
	436	bf 28,4f
	437
	438	lwz 6,0(12)
	439	lwz 7,4(12)
	440	addi 12,12,8
	441	stw 6,0(3)
	442	stw 7,4(3)
	443	addi 3,3,8
	444	4: /* Copy 4 bytes. */
	445	bf 29,2f
	446
	447	lwz 6,0(12)
	448	addi 12,12,4
	449	stw 6,0(3)
	450	addi 3,3,4
	451	2: /* Copy 2~3 bytes. */
	452	bf 30,1f
	453
	454	lhz 6,0(12)
	455	addi 12,12,2
	456	sth 6,0(3)
	457	addi 3,3,2
	458	1: /* Copy 1 byte. */
	459	bf 31,0f
	460
	461	lbz 6,0(12)
	462	stb 6,0(3)
	463	0: /* Return DST + LEN pointer. */
	464	ld 31,-8(1)
	465	ld 3,-16(1)
	466	add 3,3,5
	467	blr
	468
72fd128a	469	END_GEN_TB (MEMPCPY,TB_TOCLESS)
2d67d91a JM	470	libc_hidden_def (__mempcpy)
2d67d91a JM	471	weak_alias (__mempcpy, mempcpy)
344d0b54	472	libc_hidden_builtin_def (mempcpy)