/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

	.machine  power7
EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1, L(copy_LT_32)	/* If move < 32 bytes use short move
					   code.  */

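	/* From here on r30 preserves the original DST for the return value,
	   r12 is the SRC cursor, r31 the number of bytes still to copy, and
	   r3 keeps tracking the DST cursor.  */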
	andi.	11,3,7		/* Check alignment of DST.  */
	clrlwi	10,4,29		/* Check alignment of SRC.  */
	cmplw	cr6,10,11	/* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3		/* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

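	/* CR7 holds the 0~7 byte adjustment: bit 31 selects a 1-byte copy,
	   bit 30 a 2-byte copy and bit 29 a 4-byte copy.  Since SRC and DST
	   share the same low-order alignment here, aligning DST also aligns
	   SRC.  */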
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29	/* Check alignment of SRC again.  */
	srwi	9,31,3		/* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

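	/* At this point r9 = remaining doublewords, r8 = remaining 32-byte
	   blocks and r11 = tail bytes (len & 7); cr1 records whether at
	   least four doublewords remain and cr6 whether there is a
	   sub-doubleword tail.  r11 is then reused as the SRC cursor and
	   r10 becomes the DST cursor for the copy loops below.  */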
	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

L(aligned_copy):
	/* Main aligned copy loop.  Copies up to 128-bytes at a time.  */
	.align	4
4:
	/* check for any 32-byte or 64-byte lumps that are outside of a
	   nice 128-byte range.  R8 contains the number of 32-byte
	   lumps, so drop this into the CR, and use the SO/EQ bits to help
	   handle the 32- or 64- byte lumps.  Then handle the rest with an
	   unrolled 128-bytes-at-a-time copy loop. */
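	/* r10 and r11 are now the DST and SRC cursors.  mtocrf below drops
	   the low bits of r8 into CR7: the SO bit means one odd 32-byte
	   block is pending and the EQ bit means a 64-byte block is;
	   everything else is moved 128 bytes per iteration with
	   lxvd2x/stxvd2x (16 bytes per instruction).  */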
	mtocrf	1,8
	li	6,16	# 16() index
	li	7,32	# 32() index
	li	8,48	# 48() index

L(aligned_32byte):
	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
	bns	cr7,L(aligned_64byte)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	11,11,32
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	addi	10,10,32

L(aligned_64byte):
	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
	bne	cr7,L(aligned_128setup)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64

L(aligned_128setup):
	/* Set up for the 128-byte at a time copy loop.  */
	srwi	8,31,7
	cmpwi	8,0	# Any 4x lumps left?
	beq	3f	# if not, move along.
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	mtctr	8	# otherwise, load the ctr and begin.
	li	8,48	# 48() index
	b	L(aligned_128loop)

L(aligned_128head):
	/* for the 2nd + iteration of this loop. */
	lxvd2x	6,0,11
	lxvd2x	7,11,6
L(aligned_128loop):
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	stxvd2x	6,0,10
	addi	11,11,64
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	10,10,64
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64
	bdnz	L(aligned_128head)

3:
	/* Check for tail bytes.  */
	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
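	/* r0 is the byte count already moved above (the length rounded
	   down to a multiple of 8); advance the DST and SRC pointers past
	   it before handling the 0-7 byte tail.  */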
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
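	/* CR7 now holds the low bits of the length: bit 28 selects an
	   8-byte copy, bit 29 a 4-byte copy, bit 30 a 2-byte copy and
	   bit 31 a 1-byte copy.  */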
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
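	/* cr6 still holds the 'len vs. 8' compare from L(copy_LT_32):
	   fall through for exactly 8 bytes, branch for 0~7 bytes.  */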
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15		/* Check alignment of DST.  */
	clrlwi	0,0,28		/* Number of bytes until the 1st
				   quadword of DST.  */
	srwi	9,5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* SRC is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28	/* Check alignment of SRC.  */
	srwi	9,31,4		/* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16		/* Index for 16-bytes offsets.  */
	li	7,32		/* Index for 32-bytes offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5		/* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
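	/* lvsl builds a permute control vector from the SRC misalignment;
	   the vperms below use it to splice each pair of aligned 16-byte
	   loads into one correctly shifted 16-byte chunk for the aligned
	   stores.  */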
	lvsl	5,0,12
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the remaining length is a multiple of
	   32 bytes for the loop.  */
	lvx	4,12,6
	vperm	6,3,4,5
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again.  This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6		/* vr4 = r11+16.  */
	vperm	6,3,4,5		/* Merge the correctly-aligned portions
				   of vr3/vr4 into vr6.  */
	lvx	3,11,7		/* vr3 = r11+32.  */
	vperm	10,4,3,5	/* Merge the correctly-aligned portions
				   of vr3/vr4 into vr10.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f

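	/* r0 is the byte count the vector code handled (the remaining
	   length rounded down to a multiple of 16); step DST and SRC
	   past it.  */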
	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)