/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */
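
/* For reference, the contract above amounts to the following C sketch
   (illustrative only; 'mempcpy_ref' is a hypothetical name, not part of
   this file):

     #include <stddef.h>
     #include <string.h>

     static void *mempcpy_ref (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);        // copy LEN bytes from SRC to DST
       return (char *) dst + len;     // return DST + LEN instead of DST
     }

   The assembly below implements the same semantics by hand.  */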

        .machine power7
EALIGN (BP_SYM (__mempcpy), 5, 0)
        CALL_MCOUNT

        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        stw     31,24(1)
        mr      30,3
        cmplwi  cr1,5,31
        neg     0,3
        cfi_offset(31,-8)
        ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
                                      code.  */

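/* Rough C model of the three-way dispatch below (a sketch; the helper
   bodies are hypothetical stand-ins for the labelled paths in this
   file, each of which copies LEN bytes):

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void copy_lt_32 (char *d, const char *s, size_t n)           { memcpy (d, s, n); }
     static void copy_ge_32_aligned (char *d, const char *s, size_t n)   { memcpy (d, s, n); }
     static void copy_ge_32_unaligned (char *d, const char *s, size_t n) { memcpy (d, s, n); }

     static void *dispatch_sketch (char *dst, const char *src, size_t len)
     {
       if (len < 32)                                              // ble cr1,L(copy_LT_32)
         copy_lt_32 (dst, src, len);
       else if (((uintptr_t) dst & 7) == ((uintptr_t) src & 7))   // cmplw cr6,10,11
         copy_ge_32_aligned (dst, src, len);
       else                                                       // bne cr6,L(copy_GE_32_unaligned)
         copy_ge_32_unaligned (dst, src, len);
       return dst + len;
     }
*/
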
        andi.   11,3,7        /* Check alignment of DST.  */
        clrlwi  10,4,29       /* Check alignment of SRC.  */
        cmplw   cr6,10,11     /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srwi    9,5,3         /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrlwi  0,0,29
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

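/* The mtcrf/bf sequence below is the branchy equivalent of this C
   sketch, where 'pad' is the 0..7 byte count needed to reach 8-byte
   alignment and 'd'/'s' stand for the running DST/SRC char pointers
   (names are illustrative only):

     if (pad & 1) { *d = *s; d += 1; s += 1; }           // bf 31 -> 1-byte copy
     if (pad & 2) { memcpy (d, s, 2); d += 2; s += 2; }  // bf 30 -> 2-byte copy
     if (pad & 4) { memcpy (d, s, 4); d += 4; s += 4; }  // bf 29 -> 4-byte copy

   mtcrf 0x01,0 places those low bits of r0 into cr7, so each 'bf'
   skips its copy when the corresponding bit is clear.  */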
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrlwi  10,12,29      /* Check alignment of SRC again.  */
        srwi    9,31,3        /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrlwi  11,31,29
        mtcrf   0x01,9

        srwi    8,31,5
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

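/* In C terms, the peel plus the main loop below behave roughly like
   this sketch, where 'ndw' is the doubleword count in r9, 'n32' the
   32-byte block count placed in CTR, and 'd'/'s' the running DST/SRC
   char pointers (illustrative only):

     if (ndw & 2) { memcpy (d, s, 16); d += 16; s += 16; }   // two doublewords
     if (ndw & 1) { memcpy (d, s, 8);  d += 8;  s += 8;  }   // one doubleword
     while (n32--) { memcpy (d, s, 32); d += 32; s += 32; }  // main loop

   Each pass of the main loop issues four lfd/stfd pairs, i.e. four
   8-byte floating-point loads and stores.  */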
        bf      30,1f
        lfd     6,0(12)
        lfd     7,8(12)
        addi    11,12,16
        mtctr   8
        stfd    6,0(3)
        stfd    7,8(3)
        addi    10,3,16
        bf      31,4f
        lfd     0,16(12)
        stfd    0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        lfd     6,0(12)
        addi    11,12,8
        stfd    6,0(3)
        addi    10,3,8

        .align  4
4:      /* Main aligned copy loop.  Copies 32-bytes at a time.  */
        lfd     6,0(11)
        lfd     7,8(11)
        lfd     8,16(11)
        lfd     0,24(11)
        addi    11,11,32

        stfd    6,0(10)
        stfd    7,8(10)
        stfd    8,16(10)
        stfd    0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        /* Check for tail bytes.  */

        clrrwi  0,31,3
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
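
/* Equivalently, with 'tail' holding those low 3 bits of the remaining
   length (a sketch; same conventions as the peel sketch above):

     if (tail & 4) { memcpy (d, s, 4); d += 4; s += 4; }
     if (tail & 2) { memcpy (d, s, 2); d += 2; s += 2; }
     if (tail & 1) { *d = *s; }

   i.e. the same bit-driven pattern as the alignment peel, largest
   piece first.  */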
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

        /* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmplwi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        cmplwi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-bytes alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmplwi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
4:      /* Copies 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
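
/* Conceptually, when SRC sits 'off' bytes past a 16-byte boundary,
   every aligned 16-byte store is assembled from two aligned 16-byte
   loads; that is what the lvsl/lvx/vperm sequence below does in
   registers.  A scalar C sketch of one such merge step (illustrative
   only; 'merge_block' and its arguments are hypothetical):

     #include <stddef.h>
     #include <string.h>

     // abase = SRC rounded down to a 16-byte boundary, off = SRC % 16.
     static void merge_block (char *dst, const char *abase, size_t off)
     {
       char window[32];
       memcpy (window, abase, 32);       // two aligned 16-byte loads
       memcpy (dst, window + off, 16);   // select the misaligned 16 bytes
     }

   lvsl derives the byte-selection pattern from the SRC address, and
   vperm applies it to a pair of aligned vector registers.  */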
        .align  4
L(copy_GE_32_unaligned):
        andi.   11,3,15       /* Check alignment of DST.  */
        clrlwi  0,0,28        /* Number of bytes until the 1st
                                 quadword of DST.  */
        srwi    9,5,4         /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16-bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        lfd     6,0(12)
        addi    12,12,8
        stfd    6,0(3)
        addi    3,3,8
0:
        clrlwi  10,12,28      /* Check alignment of SRC.  */
        srwi    9,31,4        /* Number of full quadwords remaining.  */

        /* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Setup two indexes to speed up the indexed vector operations.  */
        clrlwi  11,31,28
        li      6,16          /* Index for 16-bytes offsets.  */
        li      7,32          /* Index for 32-bytes offsets.  */
        cmplwi  cr1,11,0
        srwi    8,31,5        /* Setup the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmplwi  cr6,9,1
        lvsl    5,0,12
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
        lvx     4,12,6
        vperm   6,3,4,5
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than having unaligned
           vector instructions though.  */

        lvx     4,11,6        /* vr4 = r11+16.  */
        vperm   6,3,4,5       /* Merge the correctly-aligned portions
                                 of vr3/vr4 into vr6.  */
        lvx     3,11,7        /* vr3 = r11+32.  */
        vperm   10,4,3,5      /* Merge the correctly-aligned portions
                                 of vr3/vr4 into vr10.  */
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi  0,31,4
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

END (BP_SYM (__mempcpy))
libc_hidden_def (BP_SYM (__mempcpy))
weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
libc_hidden_builtin_def (mempcpy)