/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2014 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

#define dst 11		/* Use r11 so that r3 is kept unchanged.  */
#define src 4
#define cnt 5

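/* The code below dispatches on size and relative alignment: copies of
   less than 32 bytes use scalar loads and stores (L(copy_LT_32));
   larger copies whose SRC and DST share the same alignment are first
   aligned and then use the 128-bytes-per-iteration lxvd2x/stxvd2x loop
   (L(aligned_copy)); mismatched alignments take the lvx/vperm path
   (L(copy_GE_32_unaligned)).  */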
	.machine power7
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

#ifdef __LITTLE_ENDIAN__
/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
   loop is only used for quadword aligned copies.  */
	andi.	10,3,15
	clrldi	11,4,60
#else
	andi.	10,3,7		/* Check alignment of DST.  */
	clrldi	11,4,61		/* Check alignment of SRC.  */
#endif
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
#ifdef __LITTLE_ENDIAN__
	clrldi	0,0,60
#else
	clrldi	0,0,61
#endif

/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
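/* r0 holds -DST; its low bits are the number of bytes needed to reach
   the next alignment boundary, and the mtocrf above copied them into
   cr7, so bf 31/30/29 (and 28 on little-endian) skip the 1-, 2-, 4-
   (and 8-) byte moves below when they are not needed.  */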
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
#ifdef __LITTLE_ENDIAN__
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
#endif
	subf	cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* Loads for the second and later iterations of the loop; the
	   first iteration's loads were issued before entering at
	   L(aligned_128loop).  */
	lxvd2x	6,0,src
	lxvd2x	7,src,6
L(aligned_128loop):
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	stxvd2x	6,0,dst
	addi	src,src,64
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	dst,dst,64
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

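/* Tail of the aligned path: the 64/32/16 bits of cnt are tested from
   cr6 (set by the mtocrf 0x02 above) and its 8/4/2/1 bits go into cr7
   below, so the bf tests copy the remaining 0~127 bytes in decreasing
   power-of-two chunks.  */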
L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	src,src,32
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lxvd2x	6,0,src
	addi	src,src,16
	stxvd2x	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
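/* cnt is at most 31 here; cr6 below compares it with 8 so that copies
   of 0~8 bytes take L(copy_LE_8) while copies of 9~31 bytes fall
   through.  */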
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
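	/* cr6 still holds the comparison of cnt with 8 made at
	   L(copy_LT_32): anything other than exactly 8 bytes is handled
	   at L(tail4), while exactly 8 bytes falls through.  */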
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
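/* The realignment uses lvsl (lvsr and swapped vperm inputs on
   little-endian) to build a permute control vector from SRC's low
   address bits; each vperm then merges two aligned lvx loads into one
   16-byte value that can be stored with an aligned stvx.  */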
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
#ifndef __LITTLE_ENDIAN__
	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
#endif
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
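	/* r10 is the 0~15 byte tail (tested via cr1), r9 the number of
	   full quadwords (cr6 checks whether more than one remains) and
	   r8 the number of 32-byte loop iterations; the mtocrf puts the
	   low bits of r9 into cr7 so that bf 31 below can test whether
	   the quadword count is odd.  */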
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so that the loop below can always move
	   32 bytes per iteration.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector instructions.  */

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)