[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / mempcpy.S

/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2015 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
    Returns 'dst' + 'len'.

   Copies LEN bytes from SRC to DST and returns the address just past the
   last byte written.  Three strategies are used:
     - LEN <= 31: L(copy_LT_32), straight-line word/halfword/byte moves.
     - LEN >= 32 and SRC/DST agree in their low three address bits:
       doubleword (ld/std) copy, 32 bytes per loop iteration.
     - LEN >= 32 otherwise: L(copy_GE_32_unaligned), vector copy built
       from lvx, vperm and stvx.

   Register usage, as established by the code below:
     r3   working DST pointer; the incoming DST is kept at -16(r1).
     r4   incoming SRC; r12 is the working SRC pointer.
     r5   LEN, never modified; added to the saved DST to form the result.
     r31  bytes still to copy after the alignment prologue (LEN >= 32
	  paths only); the incoming r31 is kept at -8(r1) and reloaded
	  before returning.
     r10, r11  DST and SRC cursors inside the main loops.
     cr7  low four bits of a count, loaded with mtcrf 0x01 and tested
	  with "bf 28..31" (bit 28 = 8, bit 29 = 4, bit 30 = 2,
	  bit 31 = 1).  */

	.machine  power7
EALIGN (__mempcpy, 5, 0)
	CALL_MCOUNT 3

	/* cr1 = LEN compared with 31.  r0 = -DST, whose low bits give the
	   distance from DST to the next alignment boundary.  DST and r31
	   are stored below the stack pointer; r1 itself is not changed.  */
	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
				       code.  */

	/* andi. also records (DST & 7) == 0 in cr0; the plain "beq" further
	   down tests that result, nothing in between rewrites cr0.  */
	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	/* cr0 (from andi. above): DST, and SRC with it, is already 8-byte
	   aligned, so skip the alignment prologue.  */
	beq	L(copy_GE_32_aligned_cont)

	/* r0 = number of bytes (1~7) up to the next 8-byte boundary of DST,
	   cr7 = that count, r31 = LEN minus that count.  */
	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Copy 1~7 bytes so that DST becomes 8-byte aligned.  SRC has the
	   same low three address bits, so it becomes aligned as well.  */

	/* Move 1 byte if bit 31 of cr7 is set.  */
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
	/* Move 2 bytes if bit 30 of cr7 is set.  */
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
	/* Move 4 bytes if bit 29 of cr7 is set.  */
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Low 3 bits of SRC; r10 is overwritten below
				 before this value is read.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	/* r11 = tail byte count (r31 & 7), cr7 = low bits of the doubleword
	   count.  */
	clrldi	11,31,61
	mtcrf	0x01,9

	/* r8 = number of 32-byte blocks, cr1 = doubleword count vs 4,
	   cr6 = "tail is empty", r11 = SRC cursor (the tail count held in
	   r11 is no longer needed once cr6 is set).  */
	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  LEN >= 32 and the prologue removed at
	most 7 bytes, so at least 3 doublewords remain; as a result every
	path that enters the loop at label 4 does so with CTR >= 1.  */

	/* Bit 30 clear: no pair of doublewords to copy, go to label 1.  */
	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	/* Bit 31 clear: no third doubleword, enter the loop.  */
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	/* Fewer than 4 doublewords in total: the 3 just copied were all of
	   them, skip the loop.  */
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop. Copies 32-bytes at a time.  r11 and r10
	are the SRC and DST cursors; r12 and r3 are left untouched so the
	tail code can compute its start from them.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  r0 = r31 with its low 3 bits cleared, i.e.
	the number of bytes copied as doublewords; cr7 = low bits of r31;
	cr6 (set above) says whether the tail is empty.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

	/* Advance DST and SRC past the doubleword-copied region.  The .L9
	label is not referenced anywhere in this file as shown.  */
.L9:
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 1-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  Reload the incoming r31 and the
	   original DST saved at entry.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  Reached from the entry code when
	LEN <= 31.  r31 is not modified anywhere on this path, so the
	returns below do not reload it.  On entry here: cr6 = LEN vs 8,
	r12 = working SRC pointer, cr7 = low four bits of LEN.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  r0 = (-SRC) & 3 is the number of bytes
	up to the next word boundary of SRC, and andi. records in cr0
	whether that is zero.  cr1 = LEN vs 16, r10 = bytes remaining.
	The value placed in r11 by clrrdi is not read on this path.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  cr7 = the 1~3 byte alignment
	count, r10 = LEN minus that count.  */
	mtocrf  0x01,0
	subf	10,0,5
	/* Move 2 bytes if bit 30 of cr7 is set.  */
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
	/* Move 1 byte if bit 31 of cr7 is set.  */
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	/* Recompute cr1 and cr7 from the reduced byte count in r10.  */
	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  cr1 tells
	whether 16 or more bytes remain; cr7 holds the low four bits of
	the remaining count.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  Also entered from the "bf 29,2b" in
	   L(copy_LE_8) when LEN is below 4.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	/* Bit 31 clear: exactly 2 bytes were left, return.  Otherwise copy
	   the third byte from offset 2.  */
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  cr6 still holds LEN vs 8 and cr7
	the low four bits of LEN; r3, r4 and r12 have not been advanced.  */
	.align	4
L(copy_LE_8):
	/* LEN != 8: go to the 0~7 byte code.  */
	bne	cr6,4f

	/* Though we could've used ld/std here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  If bit 29 is clear LEN is 0~3: branch back to
	   label 2 above, which handles that case through r12 and r3.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	/* Bit 30 clear: no halfword at offset 4, check for a single byte.  */
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	/* Bit 31 clear: LEN was 6, return.  Otherwise copy the seventh
	   byte from offset 6.  */
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte (the fifth, at offset 4).  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where SRC and DST differ in their low
	three address bits.  DST is first brought to a quadword boundary
	with scalar moves.  After that, use aligned quadword loads from SRC,
	shifted to realign the data, allowing for aligned DST stores.
	On entry: r0 = -DST, r12 = SRC, r31 = LEN (all set by the entry
	code).  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	/* cr0 from andi.: DST is already quadword aligned.  */
	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  cr7 = the 1~15
	byte count in r0, r31 = LEN minus that count.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Low 4 bits of SRC; r10 is overwritten below
				 before this value is read.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.
	Also: r11 = tail byte count (r31 & 15) used only to set cr1 =
	"tail is empty"; r8 = number of 32-byte blocks; r10 and r11 then
	become the DST and SRC cursors; cr7 = low bits of the quadword
	count; cr6 = quadword count vs 1.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
	/* v5 = permute control derived from the SRC address.  The
	   little-endian build uses lvsr here and swaps the two data inputs
	   of every vperm below.  */
#ifdef __LITTLE_ENDIAN__
	lvsr    5,0,12
#else
	lvsl    5,0,12
#endif
	/* v3 = first source quadword.  Bit 31 of cr7 clear means the
	   quadword count is even, so no extra quadword is needed before
	   the 32-byte loop.  */
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop .
	v4 = next source quadword, v6 = realigned data stored at DST; both
	cursors move on by 16 and v3 takes over the contents of v4.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	/* CTR = number of 32-byte blocks.  Skip the loop when at most one
	   quadword was available (cr6), since it has been handled above.  */
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm   6,4,3,5
#else
	vperm   6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm   10,3,4,5
#else
	vperm   10,4,3,5
#endif
	/* Store the two realigned quadwords at r10 and r10+16.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  r0 = r31 with its low 4 bits cleared, i.e.
	the number of bytes copied as quadwords; cr7 = low bits of r31;
	cr1 (set above) says whether the tail is empty.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

	/* Advance DST and SRC past the quadword-copied region (r3 and r12
	were not moved by the vector code, which used r10 and r11).  */
	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  Reload the incoming r31 and the
	   original DST saved at entry.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

/* The macros below are defined outside this file.  Going by their names,
   they close the function opened by EALIGN and publish it both as
   __mempcpy and as the weak alias mempcpy -- NOTE(review): confirm
   against the macro definitions.  */
END_GEN_TB (__mempcpy,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)
Commit	Line	Data
344d0b54	1	/* Optimized mempcpy implementation for POWER7.
b168057a	2	Copyright (C) 2010-2015 Free Software Foundation, Inc.
344d0b54 LM	3	Contributed by Luis Machado <luisgpm@br.ibm.com>.
	4	This file is part of the GNU C Library.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	17	License along with the GNU C Library; if not, see
59ba27a6 PE	18	<http://www.gnu.org/licenses/>. */
344d0b54 LM	19
344d0b54 LM	20	#include <sysdep.h>
344d0b54 LM	21
	22
	23	/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
	24	Returns 'dst' + 'len'. */
	25
	26	.machine power7
2d67d91a	27	EALIGN (__mempcpy, 5, 0)
344d0b54 LM	28	CALL_MCOUNT 3
	29
	30	cmpldi cr1,5,31
	31	neg 0,3
	32	std 3,-16(1)
	33	std 31,-8(1)
	34	cfi_offset(31,-8)
	35	ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move
	36	code. */
	37
	38	andi. 11,3,7 /* Check alignment of DST. */
	39
	40
	41	clrldi 10,4,61 /* Check alignment of SRC. */
	42	cmpld cr6,10,11 /* SRC and DST alignments match? */
	43	mr 12,4
	44	mr 31,5
	45	bne cr6,L(copy_GE_32_unaligned)
	46
	47	srdi 9,5,3 /* Number of full quadwords remaining. */
	48
	49	beq L(copy_GE_32_aligned_cont)
	50
	51	clrldi 0,0,61
	52	mtcrf 0x01,0
	53	subf 31,0,5
	54
	55	/* Get the SRC aligned to 8 bytes. */
	56
	57	1: bf 31,2f
	58	lbz 6,0(12)
	59	addi 12,12,1
	60	stb 6,0(3)
	61	addi 3,3,1
	62	2: bf 30,4f
	63	lhz 6,0(12)
	64	addi 12,12,2
	65	sth 6,0(3)
	66	addi 3,3,2
	67	4: bf 29,0f
	68	lwz 6,0(12)
	69	addi 12,12,4
	70	stw 6,0(3)
	71	addi 3,3,4
	72	0:
	73	clrldi 10,12,61 /* Check alignment of SRC again. */
	74	srdi 9,31,3 /* Number of full doublewords remaining. */
	75
	76	L(copy_GE_32_aligned_cont):
	77
	78	clrldi 11,31,61
	79	mtcrf 0x01,9
	80
	81	srdi 8,31,5
	82	cmpldi cr1,9,4
	83	cmpldi cr6,11,0
	84	mr 11,12
	85
	86	/* Copy 1~3 doublewords so the main loop starts
	87	at a multiple of 32 bytes. */
	88
	89	bf 30,1f
	90	ld 6,0(12)
	91	ld 7,8(12)
92	addi 11,12,16
93	mtctr 8
94	std 6,0(3)
95	std 7,8(3)
96	addi 10,3,16
97	bf 31,4f
98	ld 0,16(12)
99	std 0,16(3)
100	blt cr1,3f
101	addi 11,12,24
102	addi 10,3,24
103	b 4f
104
105	.align 4
106	1: /* Copy 1 doubleword and set the counter. */
107	mr 10,3
108	mtctr 8
109	bf 31,4f
110	ld 6,0(12)
111	addi 11,12,8
112	std 6,0(3)
113	addi 10,3,8
114
115	/* Main aligned copy loop. Copies 32-bytes at a time. */
116	.align 4
117	4:
118	ld 6,0(11)
119	ld 7,8(11)
120	ld 8,16(11)
121	ld 0,24(11)
122	addi 11,11,32
123
124	std 6,0(10)
125	std 7,8(10)
126	std 8,16(10)
127	std 0,24(10)
128	addi 10,10,32
129	bdnz 4b
130	3:
131
132	/* Check for tail bytes. */
133	rldicr 0,31,0,60
134	mtcrf 0x01,31
135	beq cr6,0f
136
137	.L9:
138	add 3,3,0
139	add 12,12,0
140
141	/* At this point we have a tail of 0-7 bytes and we know that the
142	destination is doubleword-aligned. */
143	4: /* Copy 4 bytes. */
144	bf 29,2f
145
146	lwz 6,0(12)
147	addi 12,12,4
148	stw 6,0(3)
149	addi 3,3,4
150	2: /* Copy 2 bytes. */
151	bf 30,1f
152
153	lhz 6,0(12)
154	addi 12,12,2
155	sth 6,0(3)
156	addi 3,3,2
157	1: /* Copy 1 byte. */
158	bf 31,0f
159
160	lbz 6,0(12)
161	stb 6,0(3)
162	0: /* Return DST + LEN pointer. */
163	ld 31,-8(1)
164	ld 3,-16(1)
165	add 3,3,5
166	blr
167
168	/* Handle copies of 0~31 bytes. */
169	.align 4
170	L(copy_LT_32):
171	cmpldi cr6,5,8
172	mr 12,4
173	mtcrf 0x01,5
174	ble cr6,L(copy_LE_8)
175
176	/* At least 9 bytes to go. */
177	neg 8,4
178	clrrdi 11,4,2
179	andi. 0,8,3
180	cmpldi cr1,5,16
181	mr 10,5
182	beq L(copy_LT_32_aligned)
183
184	/* Force 4-bytes alignment for SRC. */
185	mtocrf 0x01,0
186	subf 10,0,5
187	2: bf 30,1f
188
189	lhz 6,0(12)
190	addi 12,12,2
191	sth 6,0(3)
192	addi 3,3,2
193	1: bf 31,L(end_4bytes_alignment)
194
195	lbz 6,0(12)
196	addi 12,12,1
197	stb 6,0(3)
198	addi 3,3,1
199
200	.align 4
201	L(end_4bytes_alignment):
202	cmpldi cr1,10,16
203	mtcrf 0x01,10
204
205	L(copy_LT_32_aligned):
206	/* At least 6 bytes to go, and SRC is word-aligned. */
207	blt cr1,8f
208
209	/* Copy 16 bytes. */
210	lwz 6,0(12)
211	lwz 7,4(12)
212	stw 6,0(3)
213	lwz 8,8(12)
214	stw 7,4(3)
215	lwz 6,12(12)
216	addi 12,12,16
217	stw 8,8(3)
218	stw 6,12(3)
219	addi 3,3,16
220	8: /* Copy 8 bytes. */
221	bf 28,4f
222
223	lwz 6,0(12)
224	lwz 7,4(12)
225	addi 12,12,8
226	stw 6,0(3)
227	stw 7,4(3)
228	addi 3,3,8
229	4: /* Copy 4 bytes. */
230	bf 29,2f
231
232	lwz 6,0(12)
233	addi 12,12,4
234	stw 6,0(3)
235	addi 3,3,4
236	2: /* Copy 2-3 bytes. */
237	bf 30,1f
238
239	lhz 6,0(12)
240	sth 6,0(3)
241	bf 31,0f
242	lbz 7,2(12)
243	stb 7,2(3)
244	ld 3,-16(1)
245	add 3,3,5
246	blr
247
248	.align 4
249	1: /* Copy 1 byte. */
250	bf 31,0f
251
252	lbz 6,0(12)
253	stb 6,0(3)
254	0: /* Return DST + LEN pointer. */
255	ld 3,-16(1)
256	add 3,3,5
257	blr
258
259	/* Handles copies of 0~8 bytes. */
260	.align 4
261	L(copy_LE_8):
262	bne cr6,4f
263
264	/* Though we could've used ld/std here, they are still
265	slow for unaligned cases. */
266
267	lwz 6,0(4)
268	lwz 7,4(4)
269	stw 6,0(3)
270	stw 7,4(3)
271	ld 3,-16(1) /* Return DST + LEN pointer. */
272	add 3,3,5
273	blr
274
275	.align 4
276	4: /* Copies 4~7 bytes. */
277	bf 29,2b
278
279	lwz 6,0(4)
280	stw 6,0(3)
281	bf 30,5f
282	lhz 7,4(4)
283	sth 7,4(3)
284	bf 31,0f
285	lbz 8,6(4)
286	stb 8,6(3)
287	ld 3,-16(1)
288	add 3,3,5
289	blr
290
291	.align 4
292	5: /* Copy 1 byte. */
293	bf 31,0f
294
295	lbz 6,4(4)
296	stb 6,4(3)
297
298	0: /* Return DST + LEN pointer. */
299	ld 3,-16(1)
300	add 3,3,5
301	blr
302
303	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
304	SRC is not. Use aligned quadword loads from SRC, shifted to realign
305	the data, allowing for aligned DST stores. */
306	.align 4
307	L(copy_GE_32_unaligned):
308	clrldi 0,0,60 /* Number of bytes until the 1st
309	quadword. */
310	andi. 11,3,15 /* Check alignment of DST (against
311	quadwords). */
312	srdi 9,5,4 /* Number of full quadwords remaining. */
313
314	beq L(copy_GE_32_unaligned_cont)
315
316	/* SRC is not quadword aligned, get it aligned. */
317
318	mtcrf 0x01,0
319	subf 31,0,5
320
321	/* Vector instructions work best when proper alignment (16-bytes)
322	is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
323	1: /* Copy 1 byte. */
324	bf 31,2f
325
326	lbz 6,0(12)
327	addi 12,12,1
328	stb 6,0(3)
329	addi 3,3,1
330	2: /* Copy 2 bytes. */
331	bf 30,4f
332
333	lhz 6,0(12)
334	addi 12,12,2
335	sth 6,0(3)
336	addi 3,3,2
337	4: /* Copy 4 bytes. */
338	bf 29,8f
339
340	lwz 6,0(12)
341	addi 12,12,4
342	stw 6,0(3)
343	addi 3,3,4
344	8: /* Copy 8 bytes. */
345	bf 28,0f
346
347	ld 6,0(12)
348	addi 12,12,8
349	std 6,0(3)
350	addi 3,3,8
351	0:
352	clrldi 10,12,60 /* Check alignment of SRC. */
353	srdi 9,31,4 /* Number of full quadwords remaining. */
354
355	/* The proper alignment is present, it is OK to copy the bytes now. */
356	L(copy_GE_32_unaligned_cont):
357
358	/* Setup two indexes to speed up the indexed vector operations. */
359	clrldi 11,31,60
360	li 6,16 /* Index for 16-bytes offsets. */
361	li 7,32 /* Index for 32-bytes offsets. */
362	cmpldi cr1,11,0
363	srdi 8,31,5 /* Setup the loop counter. */
364	mr 10,3
365	mr 11,12
366	mtcrf 0x01,9
367	cmpldi cr6,9,1
759cfef3 AM	368	#ifdef __LITTLE_ENDIAN__
	369	lvsr 5,0,12
	370	#else
	371	lvsl 5,0,12
	372	#endif
344d0b54 LM	373	lvx 3,0,12
	374	bf 31,L(setup_unaligned_loop)
	375
	376	/* Copy another 16 bytes to align to 32-bytes due to the loop . */
	377	lvx 4,12,6
759cfef3 AM	378	#ifdef __LITTLE_ENDIAN__
	379	vperm 6,4,3,5
	380	#else
	381	vperm 6,3,4,5
	382	#endif
344d0b54 LM	383	addi 11,12,16
	384	addi 10,3,16
	385	stvx 6,0,3
	386	vor 3,4,4
	387
	388	L(setup_unaligned_loop):
	389	mtctr 8
	390	ble cr6,L(end_unaligned_loop)
	391
	392	/* Copy 32 bytes at a time using vector instructions. */
	393	.align 4
	394	L(unaligned_loop):
	395
	396	/* Note: vr6/vr10 may contain data that was already copied,
	397	but in order to get proper alignment, we may have to copy
	398	some portions again. This is faster than having unaligned
	399	vector instructions though. */
	400
	401	lvx 4,11,6 /* vr4 = r11+16. */
759cfef3 AM	402	#ifdef __LITTLE_ENDIAN__
	403	vperm 6,4,3,5
	404	#else
	405	vperm 6,3,4,5
	406	#endif
344d0b54	407	lvx 3,11,7 /* vr3 = r11+32. */
759cfef3 AM	408	#ifdef __LITTLE_ENDIAN__
	409	vperm 10,3,4,5
	410	#else
	411	vperm 10,4,3,5
	412	#endif
344d0b54 LM	413	addi 11,11,32
	414	stvx 6,0,10
	415	stvx 10,10,6
	416	addi 10,10,32
	417
	418	bdnz L(unaligned_loop)
	419
	420	.align 4
	421	L(end_unaligned_loop):
	422
	423	/* Check for tail bytes. */
	424	rldicr 0,31,0,59
	425	mtcrf 0x01,31
	426	beq cr1,0f
	427
	428	add 3,3,0
	429	add 12,12,0
	430
	431	/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
	432	8: /* Copy 8 bytes. */
	433	bf 28,4f
	434
	435	lwz 6,0(12)
	436	lwz 7,4(12)
	437	addi 12,12,8
	438	stw 6,0(3)
	439	stw 7,4(3)
	440	addi 3,3,8
	441	4: /* Copy 4 bytes. */
	442	bf 29,2f
	443
	444	lwz 6,0(12)
	445	addi 12,12,4
	446	stw 6,0(3)
	447	addi 3,3,4
	448	2: /* Copy 2~3 bytes. */
	449	bf 30,1f
	450
	451	lhz 6,0(12)
	452	addi 12,12,2
	453	sth 6,0(3)
	454	addi 3,3,2
	455	1: /* Copy 1 byte. */
	456	bf 31,0f
	457
	458	lbz 6,0(12)
	459	stb 6,0(3)
	460	0: /* Return DST + LEN pointer. */
	461	ld 31,-8(1)
	462	ld 3,-16(1)
	463	add 3,3,5
	464	blr
	465
2d67d91a JM	466	END_GEN_TB (__mempcpy,TB_TOCLESS)
	467	libc_hidden_def (__mempcpy)
	468	weak_alias (__mempcpy, mempcpy)
344d0b54	469	libc_hidden_builtin_def (mempcpy)