/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

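/* Register use below: r3 is the running DST pointer (and the return
   value), r4/r12 the SRC pointer and r5 the length; the nonvolatile
   r30 preserves the original DST and r31 the remaining length across
   the copy loops.  */
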
	.machine  power7
EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
					code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */
	clrlwi	10,4,29	      /* Check alignment of SRC.  */
	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

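	/* mtcrf 0x01,0 placed the low four bits of r0 (the byte count
	   needed to align SRC) into CR7, so each bf below tests a single
	   bit: bit 31 selects a 1-byte move, bit 30 a 2-byte move and
	   bit 29 a 4-byte move.  */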
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29      /* Check alignment of SRC again.  */
	srwi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

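	/* r8 holds the 32-byte iteration count for the main loop, and
	   CR7 now holds the low bits of the doubleword count: bit 30 set
	   means two leading doublewords to copy, bit 31 one more.  cr1
	   records whether at least four doublewords remain, i.e. whether
	   the main loop runs at all.  */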
	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:
	/* Check for tail bytes.  */
	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
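	/* r0 is the remaining length rounded down to a multiple of 8,
	   i.e. the bytes already moved by the doubleword code above;
	   step DST and SRC past them before handling the tail.  */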
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)
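	/* r8 = -SRC, so r8 & 3 (in r0) is the byte count needed to
	   word-align SRC; r10 will hold the length left once those
	   bytes are moved.  */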

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	   slow for unaligned cases.  */

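	/* cr6 compared len against 8 above, so this fall-through path
	   moves exactly 8 bytes; word loads/stores handle any SRC/DST
	   alignment here.  */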
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
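	/* The realignment trick: lvx ignores the low four bits of the
	   effective address, so two consecutive aligned loads bracket
	   any misaligned quadword; lvsl turns the misalignment into a
	   permute control vector and vperm then extracts the wanted
	   bytes from the load pair.  */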
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15	      /* Check alignment of DST.  */
	clrlwi	0,0,28	      /* Number of bytes until the 1st
				 quadword of DST.  */
	srwi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
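	/* As in the aligned path, CR7 holds the low bits of the byte
	   count needed to align DST: bits 31/30/29/28 select 1-, 2-,
	   4- and 8-byte moves respectively.  */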
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28      /* Check alignment of SRC.  */
	srwi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
	lvsl	5,0,12
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes up front so the main
	   loop below can move 32 bytes per iteration.  */
	lvx	4,12,6
	vperm	6,3,4,5
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4
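	/* lvsl 5,0,12 derived the permute control from SRC's low four
	   address bits and lvx 3,0,12 preloaded the first aligned
	   quadword; vor 3,4,4 carries the newest quadword into vr3 so
	   the next vperm can combine it with the following load.  */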

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr6.  */
	lvx	3,11,7	      /* vr3 = r11+32.  */
	vperm	10,4,3,5      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr10.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f
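	/* cr1 was set from r31 & 15: if no tail bytes remain, return.
	   Otherwise r0 (the length rounded down to a multiple of 16)
	   steps DST and SRC past the quadwords the vector code copied.  */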

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)