[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power7 / mempcpy.S

/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>


/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
    Returns 'dst' + 'len'.

   Overview of the paths below:
     len <= 31			L(copy_LT_32) / L(copy_LE_8): scalar copies.
     len >= 32, (src & 7) == (dst & 7)
				8-byte aligned ld/std loop, 32 bytes per
				iteration.
     len >= 32, (src & 7) != (dst & 7)
				L(copy_GE_32_unaligned): DST is brought to a
				16-byte boundary, then aligned vector loads
				from SRC are merged with vperm and stored
				with aligned vector stores.

   Register usage, as established by the code below:
     r3		running DST pointer.  The original DST is saved at
		-16(r1) on entry and reloaded before every blr.
     r4		original SRC; never written.
     r5		original LEN; never written.  It is added to the reloaded
		DST to form the return value.
     r12	running SRC pointer.
     r31	(len >= 32 paths only) bytes still to copy once DST has been
		aligned.  The caller's r31 is saved at -8(r1) on entry and
		reloaded on those paths; the short paths never write r31.
     r10, r11	(len >= 32 paths) DST and SRC cursors used by the loops.
     r0, r6-r9	scratch.
     cr7	loaded by mtcrf/mtocrf 0x01 with the low four bits of a
		count; CR bits 28/29/30/31 then mean "8/4/2/1 units
		present" and are tested with bf.

   Both save slots are at negative offsets from r1; no instruction in this
   routine adjusts r1.  */

	.machine  power7
EALIGN (BP_SYM (__mempcpy), 5, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31      /* cr1: LEN compared against 31.  */
	neg	0,3	      /* r0 = -DST; its low bits give the distance
				 from DST to the next aligned boundary.  */
	std	3,-16(1)      /* Save the original DST for the return value.  */
	std	31,-8(1)      /* Save the caller's r31.  */
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
				       code.  */

	andi.	11,3,7	      /* Check alignment of DST; cr0.eq is set when DST
				 is already 8-byte aligned.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4	      /* r12 = running SRC pointer.  */
	mr	31,5	      /* r31 = bytes remaining (all of LEN so far).  */
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	/* cr0 still holds the result of the andi. above: skip the prefix
	   copy when DST (and therefore SRC) is already 8-byte aligned.  */
	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61	      /* r0 = (-DST) & 7: bytes up to the next 8-byte
				 boundary of DST.  */
	mtcrf	0x01,0	      /* cr7 = that byte count.  */
	subf	31,0,5	      /* r31 = LEN minus the prefix bytes.  */

	/* Copy 1~7 bytes so that DST becomes 8-byte aligned.  SRC has the
	   same low three address bits on this path, so it becomes 8-byte
	   aligned as well.  */

1:	bf	31,2f	      /* 1 byte?  */
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f	      /* 2 bytes?  */
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f	      /* 4 bytes?  */
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  Note: r10 is
				 rewritten below (mr/addi) before it is ever
				 read, so this value is not used.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	/* Here: r3/r12 = 8-byte aligned DST/SRC, r31 = bytes remaining,
	   r9 = doublewords remaining.  */
	clrldi	11,31,61      /* r11 = r31 & 7: tail bytes after the last
				 doubleword.  */
	mtcrf	0x01,9	      /* cr7 = low bits of the doubleword count.  */

	srdi	8,31,5	      /* r8 = number of 32-byte loop iterations.  */
	cmpldi	cr1,9,4	      /* cr1: fewer than 4 doublewords in total?  */
	cmpldi	cr6,11,0      /* cr6.eq: no tail bytes.  */
	mr	11,12	      /* r11 is now the SRC cursor for the loop.  */

	/* Copy 1~3 doublewords first so that the number of doublewords left
	   for the main loop is a multiple of four (32 bytes).  */

	bf	30,1f	      /* Count has no "2 doublewords" bit.  */
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f	      /* No third doubleword: enter the loop.  */
	ld	0,16(12)      /* r0 is used as a data temporary here; it is
				 recomputed after the loop.  */
	std	0,16(3)
	blt	cr1,3f	      /* Only 3 doublewords in total: skip the loop.  */
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3	      /* r10 is the DST cursor for the loop.  */
	mtctr	8
	bf	31,4f	      /* No odd doubleword: enter the loop directly.  */
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop. Copies 32-bytes at a time.  All four loads
	   are issued before the four stores.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60     /* r0 = r31 with the low 3 bits cleared: the
				 bytes copied as doublewords.  */
	mtcrf	0x01,31	      /* cr7 = low bits of the remaining byte count.  */
	beq	cr6,0f	      /* cr6 was set before the loop: no tail.  */

	/* The label below is not referenced anywhere in this file.  */
.L9:
	/* r3 and r12 were not advanced by the doubleword copies (r10/r11
	   were used as cursors), so step them past those bytes now.  */
	add	3,3,0
	add	12,12,0

	/*  At this point we have a tail of 1-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)      /* Restore the caller's r31.  */
	ld	3,-16(1)      /* Reload the original DST.  */
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  r31 is not written anywhere on this
	   path, so the exits below only reload the saved DST.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8	      /* cr6: LEN compared against 8.  */
	mr	12,4	      /* r12 = running SRC pointer.  */
	mtcrf	0x01,5	      /* cr7 = low four bits of LEN.  */
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2	      /* r11 = SRC rounded down to a word boundary.
				 r11 is not read again on this path.  */
	andi.	0,8,3	      /* r0 = (-SRC) & 3: bytes up to the next word
				 boundary of SRC; cr0.eq if already aligned.  */
	cmpldi	cr1,5,16      /* cr1: fewer than 16 bytes?  */
	mr	10,5	      /* r10 = bytes remaining.  */
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf  0x01,0	      /* cr7 = number of prefix bytes (1~3).  */
	subf	10,0,5	      /* r10 = LEN minus the prefix bytes.  */
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	/* Redo the size test and reload cr7 with the reduced count.  */
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  This label is also the target of the "bf 29,2b"
	   in L(copy_LE_8) below, which arrives with r12 == SRC and r3 == DST
	   for lengths 0~3.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  On entry cr6 holds LEN compared
	   against 8 and cr7 holds the low four bits of LEN.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f	      /* LEN != 8.  */

	/* Exactly 8 bytes.  Though we could've used ld/std here, they are
	still slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  Lengths 0~3 branch back to the shared "2:"
	   tail in L(copy_LT_32).  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f	      /* No 2-byte piece.  */
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f	      /* No final byte.  */
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where the low three address bits of SRC
	and DST differ.  DST is first brought to a quadword (16-byte)
	boundary; SRC is then read with aligned quadword loads whose results
	are shifted/merged to realign the data, allowing for aligned DST
	stores.  On entry r0 = -DST, r12 = SRC, r31 = LEN.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* r0 = (-DST) & 15: number of bytes until DST
				 reaches the 1st quadword boundary.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0	      /* cr7 = number of prefix bytes (1~15).  */
	subf	31,0,5	      /* r31 = LEN minus the prefix bytes.  */

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  Note: r10 is
				 overwritten by "mr 10,3" below before it is
				 read, so this value is not used.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	11,31,60      /* r11 = r31 & 15: tail bytes after the last
				 quadword.  */
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,11,0      /* cr1.eq: no tail bytes.  */
	srdi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3	      /* r10 = DST cursor.  */
	mr	11,12	      /* r11 = SRC cursor.  */
	mtcrf	0x01,9	      /* cr7 = low bits of the quadword count.  */
	cmpldi	cr6,9,1	      /* cr6: at most one quadword to copy?  */
	/* NOTE(review): lvsl with this vperm operand order looks like the
	   big-endian realignment idiom -- confirm before reusing for a
	   little-endian build.  */
	lvsl	5,0,12	      /* vr5 = permute control derived from the low
				 bits of the SRC address.  */
	lvx	3,0,12	      /* vr3 = aligned quadword containing SRC.  */
	bf	31,L(setup_unaligned_loop)

	/* Odd number of quadwords: copy one quadword now so that the loop
	   below can proceed in 32-byte steps.  */
	lvx	4,12,6
	vperm	6,3,4,5
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4	      /* vr3 = vr4: keep the most recently loaded
				 quadword for the next merge.  */

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)   /* 0 or 1 quadwords: no loop
					       iterations are needed.  */

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again. This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6	      /* vr4 = quadword loaded from r11+16.  */
	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr6.  */
	lvx	3,11,7	      /* vr3 = quadword loaded from r11+32.  */
	vperm	10,4,3,5      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr10.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59     /* r0 = r31 with the low 4 bits cleared: the
				 bytes copied as quadwords.  */
	mtcrf	0x01,31	      /* cr7 = low four bits of the remaining count.  */
	beq	cr1,0f	      /* cr1 was set before the loop: no tail.  */

	/* r3 and r12 were not advanced by the vector copies (r10/r11 were
	   used as cursors), so step them past those bytes now.  */
	add	3,3,0
	add	12,12,0

	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)      /* Restore the caller's r31.  */
	ld	3,-16(1)      /* Reload the original DST.  */
	add	3,3,5
	blr

END_GEN_TB (BP_SYM (__mempcpy),TB_TOCLESS)
libc_hidden_def (BP_SYM (__mempcpy))
weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
libc_hidden_builtin_def (mempcpy)
Commit	Line	Data
344d0b54	1	/* Optimized mempcpy implementation for POWER7.
568035b7	2	Copyright (C) 2010-2013 Free Software Foundation, Inc.
344d0b54 LM	3	Contributed by Luis Machado <luisgpm@br.ibm.com>.
	4	This file is part of the GNU C Library.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	17	License along with the GNU C Library; if not, see
59ba27a6 PE	18	<http://www.gnu.org/licenses/>. */
344d0b54 LM	19
	20	#include <sysdep.h>
	21	#include <bp-sym.h>
	22	#include <bp-asm.h>
	23
	24
	25	/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
	26	Returns 'dst' + 'len'. */
	27
	28	.machine power7
	29	EALIGN (BP_SYM (__mempcpy), 5, 0)
	30	CALL_MCOUNT 3
	31
	32	cmpldi cr1,5,31
	33	neg 0,3
	34	std 3,-16(1)
	35	std 31,-8(1)
	36	cfi_offset(31,-8)
	37	ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move
	38	code. */
	39
	40	andi. 11,3,7 /* Check alignment of DST. */
	41
	42
	43	clrldi 10,4,61 /* Check alignment of SRC. */
	44	cmpld cr6,10,11 /* SRC and DST alignments match? */
	45	mr 12,4
	46	mr 31,5
	47	bne cr6,L(copy_GE_32_unaligned)
	48
	49	srdi 9,5,3 /* Number of full quadwords remaining. */
	50
	51	beq L(copy_GE_32_aligned_cont)
	52
	53	clrldi 0,0,61
	54	mtcrf 0x01,0
	55	subf 31,0,5
	56
	57	/* Get the SRC aligned to 8 bytes. */
	58
	59	1: bf 31,2f
	60	lbz 6,0(12)
	61	addi 12,12,1
	62	stb 6,0(3)
	63	addi 3,3,1
	64	2: bf 30,4f
	65	lhz 6,0(12)
	66	addi 12,12,2
	67	sth 6,0(3)
	68	addi 3,3,2
	69	4: bf 29,0f
	70	lwz 6,0(12)
	71	addi 12,12,4
	72	stw 6,0(3)
	73	addi 3,3,4
	74	0:
	75	clrldi 10,12,61 /* Check alignment of SRC again. */
	76	srdi 9,31,3 /* Number of full doublewords remaining. */
	77
	78	L(copy_GE_32_aligned_cont):
	79
	80	clrldi 11,31,61
	81	mtcrf 0x01,9
	82
83	srdi 8,31,5
84	cmpldi cr1,9,4
85	cmpldi cr6,11,0
86	mr 11,12
87
88	/* Copy 1~3 doublewords so the main loop starts
89	at a multiple of 32 bytes. */
90
91	bf 30,1f
92	ld 6,0(12)
93	ld 7,8(12)
94	addi 11,12,16
95	mtctr 8
96	std 6,0(3)
97	std 7,8(3)
98	addi 10,3,16
99	bf 31,4f
100	ld 0,16(12)
101	std 0,16(3)
102	blt cr1,3f
103	addi 11,12,24
104	addi 10,3,24
105	b 4f
106
107	.align 4
108	1: /* Copy 1 doubleword and set the counter. */
109	mr 10,3
110	mtctr 8
111	bf 31,4f
112	ld 6,0(12)
113	addi 11,12,8
114	std 6,0(3)
115	addi 10,3,8
116
117	/* Main aligned copy loop. Copies 32-bytes at a time. */
118	.align 4
119	4:
120	ld 6,0(11)
121	ld 7,8(11)
122	ld 8,16(11)
123	ld 0,24(11)
124	addi 11,11,32
125
126	std 6,0(10)
127	std 7,8(10)
128	std 8,16(10)
129	std 0,24(10)
130	addi 10,10,32
131	bdnz 4b
132	3:
133
134	/* Check for tail bytes. */
135	rldicr 0,31,0,60
136	mtcrf 0x01,31
137	beq cr6,0f
138
139	.L9:
140	add 3,3,0
141	add 12,12,0
142
143	/* At this point we have a tail of 0-7 bytes and we know that the
144	destination is doubleword-aligned. */
145	4: /* Copy 4 bytes. */
146	bf 29,2f
147
148	lwz 6,0(12)
149	addi 12,12,4
150	stw 6,0(3)
151	addi 3,3,4
152	2: /* Copy 2 bytes. */
153	bf 30,1f
154
155	lhz 6,0(12)
156	addi 12,12,2
157	sth 6,0(3)
158	addi 3,3,2
159	1: /* Copy 1 byte. */
160	bf 31,0f
161
162	lbz 6,0(12)
163	stb 6,0(3)
164	0: /* Return DST + LEN pointer. */
165	ld 31,-8(1)
166	ld 3,-16(1)
167	add 3,3,5
168	blr
169
170	/* Handle copies of 0~31 bytes. */
171	.align 4
172	L(copy_LT_32):
173	cmpldi cr6,5,8
174	mr 12,4
175	mtcrf 0x01,5
176	ble cr6,L(copy_LE_8)
177
178	/* At least 9 bytes to go. */
179	neg 8,4
180	clrrdi 11,4,2
181	andi. 0,8,3
182	cmpldi cr1,5,16
183	mr 10,5
184	beq L(copy_LT_32_aligned)
185
186	/* Force 4-bytes alignment for SRC. */
187	mtocrf 0x01,0
188	subf 10,0,5
189	2: bf 30,1f
190
191	lhz 6,0(12)
192	addi 12,12,2
193	sth 6,0(3)
194	addi 3,3,2
195	1: bf 31,L(end_4bytes_alignment)
196
197	lbz 6,0(12)
198	addi 12,12,1
199	stb 6,0(3)
200	addi 3,3,1
201
202	.align 4
203	L(end_4bytes_alignment):
204	cmpldi cr1,10,16
205	mtcrf 0x01,10
206
207	L(copy_LT_32_aligned):
208	/* At least 6 bytes to go, and SRC is word-aligned. */
209	blt cr1,8f
210
211	/* Copy 16 bytes. */
212	lwz 6,0(12)
213	lwz 7,4(12)
214	stw 6,0(3)
215	lwz 8,8(12)
216	stw 7,4(3)
217	lwz 6,12(12)
218	addi 12,12,16
219	stw 8,8(3)
220	stw 6,12(3)
221	addi 3,3,16
222	8: /* Copy 8 bytes. */
223	bf 28,4f
224
225	lwz 6,0(12)
226	lwz 7,4(12)
227	addi 12,12,8
228	stw 6,0(3)
229	stw 7,4(3)
230	addi 3,3,8
231	4: /* Copy 4 bytes. */
232	bf 29,2f
233
234	lwz 6,0(12)
235	addi 12,12,4
236	stw 6,0(3)
237	addi 3,3,4
238	2: /* Copy 2-3 bytes. */
239	bf 30,1f
240
241	lhz 6,0(12)
242	sth 6,0(3)
243	bf 31,0f
244	lbz 7,2(12)
245	stb 7,2(3)
246	ld 3,-16(1)
247	add 3,3,5
248	blr
249
250	.align 4
251	1: /* Copy 1 byte. */
252	bf 31,0f
253
254	lbz 6,0(12)
255	stb 6,0(3)
256	0: /* Return DST + LEN pointer. */
257	ld 3,-16(1)
258	add 3,3,5
259	blr
260
261	/* Handles copies of 0~8 bytes. */
262	.align 4
263	L(copy_LE_8):
264	bne cr6,4f
265
266	/* Though we could've used ld/std here, they are still
267	slow for unaligned cases. */
268
269	lwz 6,0(4)
270	lwz 7,4(4)
271	stw 6,0(3)
272	stw 7,4(3)
273	ld 3,-16(1) /* Return DST + LEN pointer. */
274	add 3,3,5
275	blr
276
277	.align 4
278	4: /* Copies 4~7 bytes. */
279	bf 29,2b
280
281	lwz 6,0(4)
282	stw 6,0(3)
283	bf 30,5f
284	lhz 7,4(4)
285	sth 7,4(3)
286	bf 31,0f
287	lbz 8,6(4)
288	stb 8,6(3)
289	ld 3,-16(1)
290	add 3,3,5
291	blr
292
293	.align 4
294	5: /* Copy 1 byte. */
295	bf 31,0f
296
297	lbz 6,4(4)
298	stb 6,4(3)
299
300	0: /* Return DST + LEN pointer. */
301	ld 3,-16(1)
302	add 3,3,5
303	blr
304
305	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
306	SRC is not. Use aligned quadword loads from SRC, shifted to realign
307	the data, allowing for aligned DST stores. */
308	.align 4
309	L(copy_GE_32_unaligned):
310	clrldi 0,0,60 /* Number of bytes until the 1st
311	quadword. */
312	andi. 11,3,15 /* Check alignment of DST (against
313	quadwords). */
314	srdi 9,5,4 /* Number of full quadwords remaining. */
315
316	beq L(copy_GE_32_unaligned_cont)
317
318	/* SRC is not quadword aligned, get it aligned. */
319
320	mtcrf 0x01,0
321	subf 31,0,5
322
323	/* Vector instructions work best when proper alignment (16-bytes)
324	is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
325	1: /* Copy 1 byte. */
326	bf 31,2f
327
328	lbz 6,0(12)
329	addi 12,12,1
330	stb 6,0(3)
331	addi 3,3,1
332	2: /* Copy 2 bytes. */
333	bf 30,4f
334
335	lhz 6,0(12)
336	addi 12,12,2
337	sth 6,0(3)
338	addi 3,3,2
339	4: /* Copy 4 bytes. */
340	bf 29,8f
341
342	lwz 6,0(12)
343	addi 12,12,4
344	stw 6,0(3)
345	addi 3,3,4
346	8: /* Copy 8 bytes. */
347	bf 28,0f
348
349	ld 6,0(12)
350	addi 12,12,8
351	std 6,0(3)
352	addi 3,3,8
353	0:
354	clrldi 10,12,60 /* Check alignment of SRC. */
355	srdi 9,31,4 /* Number of full quadwords remaining. */
356
357	/* The proper alignment is present, it is OK to copy the bytes now. */
358	L(copy_GE_32_unaligned_cont):
359
360	/* Setup two indexes to speed up the indexed vector operations. */
361	clrldi 11,31,60
362	li 6,16 /* Index for 16-bytes offsets. */
363	li 7,32 /* Index for 32-bytes offsets. */
364	cmpldi cr1,11,0
365	srdi 8,31,5 /* Setup the loop counter. */
366	mr 10,3
367	mr 11,12
368	mtcrf 0x01,9
369	cmpldi cr6,9,1
370	lvsl 5,0,12
371	lvx 3,0,12
372	bf 31,L(setup_unaligned_loop)
373
374	/* Copy another 16 bytes to align to 32-bytes due to the loop . */
375	lvx 4,12,6
376	vperm 6,3,4,5
377	addi 11,12,16
378	addi 10,3,16
379	stvx 6,0,3
380	vor 3,4,4
381
382	L(setup_unaligned_loop):
383	mtctr 8
384	ble cr6,L(end_unaligned_loop)
385
386	/* Copy 32 bytes at a time using vector instructions. */
387	.align 4
388	L(unaligned_loop):
389
390	/* Note: vr6/vr10 may contain data that was already copied,
391	but in order to get proper alignment, we may have to copy
392	some portions again. This is faster than having unaligned
393	vector instructions though. */
394
395	lvx 4,11,6 /* vr4 = r11+16. */
396	vperm 6,3,4,5 /* Merge the correctly-aligned portions
397	of vr3/vr4 into vr6. */
398	lvx 3,11,7 /* vr3 = r11+32. */
399	vperm 10,4,3,5 /* Merge the correctly-aligned portions
400	of vr3/vr4 into vr10. */
401	addi 11,11,32
402	stvx 6,0,10
403	stvx 10,10,6
404	addi 10,10,32
405
406	bdnz L(unaligned_loop)
407
408	.align 4
409	L(end_unaligned_loop):
410
411	/* Check for tail bytes. */
412	rldicr 0,31,0,59
413	mtcrf 0x01,31
414	beq cr1,0f
415
416	add 3,3,0
417	add 12,12,0
418
419	/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
420	8: /* Copy 8 bytes. */
421	bf 28,4f
422
423	lwz 6,0(12)
424	lwz 7,4(12)
425	addi 12,12,8
426	stw 6,0(3)
427	stw 7,4(3)
428	addi 3,3,8
429	4: /* Copy 4 bytes. */
430	bf 29,2f
431
432	lwz 6,0(12)
433	addi 12,12,4
434	stw 6,0(3)
435	addi 3,3,4
436	2: /* Copy 2~3 bytes. */
437	bf 30,1f
438
439	lhz 6,0(12)
440	addi 12,12,2
441	sth 6,0(3)
442	addi 3,3,2
443	1: /* Copy 1 byte. */
444	bf 31,0f
445
446	lbz 6,0(12)
447	stb 6,0(3)
448	0: /* Return DST + LEN pointer. */
449	ld 31,-8(1)
450	ld 3,-16(1)
451	add 3,3,5
452	blr
453
454	END_GEN_TB (BP_SYM (__mempcpy),TB_TOCLESS)
455	libc_hidden_def (BP_SYM (__mempcpy))
456	weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
457	libc_hidden_builtin_def (mempcpy)