[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power4 / memcpy.S

/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (the remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.   */

	.machine power4
/* Entry and dispatch.  On entry r3 = dst, r4 = src, r5 = len.

   The original dst is stored at -16(r1) so that every exit path can
   reload it into r3 as the return value.  r31 is stored at -8(r1)
   because the long-copy path uses it to hold the number of bytes still
   to copy; that path reloads it before returning.

   Copies of 31 bytes or fewer branch to .L2.  Longer copies first move
   0-7 bytes so that dst becomes doubleword aligned and then continue
   at .L0.  */
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT 3

    /* cr1 = len compared with 31; consumed by the ble- below.  */
    cmpldi cr1,5,31
    /* r0 = -dst; its low 3 bits are extracted by the clrldi below.  */
    neg   0,3
    /* Save the return value (dst) and r31 below r1.  */
    std   3,-16(1)
    std   31,-8(1)
    cfi_offset(31,-8)
    andi. 11,3,7	/* check alignment of dst.  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    /* cr6 = len compared with 8; only the short path at .L2 uses this
       value, the long path overwrites cr6 just below.  */
    cmpldi cr6,5,8
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    /* cr6 = (src & 7) compared with (dst & 7).  When they are equal, src
       becomes doubleword aligned at the same time as dst; .L0 tests this
       to choose between the aligned loop and the shifting loop at .L6.  */
    cmpld cr6,10,11
    /* r12 is the working source pointer from here on.  */
    mr    12,4
    srdi  9,5,3		/* Number of full double words remaining.  */
    /* cr7 = low 4 bits of r0 (the 0-7 alignment bytes): CR bit 31 means
       1 byte, bit 30 means 2 bytes, bit 29 means 4 bytes.  */
    mtcrf 0x01,0
    /* r31 = bytes left to copy.  */
    mr    31,5
    /* cr0 was set by the andi. above: dst is already doubleword aligned.  */
    beq   .L0

    /* r31 = len minus the bytes consumed by the alignment moves below.  */
    subf  31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
1:  bf    31,2f
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,4f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
4:  bf    29,0f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
0:
    /* r3 is now doubleword aligned; recompute the values that depended on
       the old pointers and length.  */
    clrldi 10,12,61	/* check alignment of src again.  */
    srdi  9,31,3	/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.

     Register state on arrival: r3 = doubleword-aligned dst, r12 = src,
     r31 = bytes left, r9 = r31 / 8 (doubleword count), r10 = src & 7.
     cr6 still holds the entry comparison of (src & 7) with (dst & 7).  */
.L0:
    clrldi  11,31,61	/* r11 = r31 & 7, the length of the byte tail.  */
    mtcrf   0x01,9	/* cr7 = low 4 bits of the doubleword count.  */
    cmpldi  cr1,11,0	/* cr1 = tail compared with 0; tested at the end of the .L6 path.  */
    bne-    cr6,.L6   /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.

     r11 and r10 are the moving source and destination pointers for the
     doubleword copies; r12 and r3 stay at the start of the doubleword
     region and are advanced in one step at .L9.  */

    srdi  8,31,5	/* r8 = number of 32-byte iterations.  */
    cmpldi	cr1,9,4	/* cr1 = doubleword count compared with 4.  */
    cmpldi	cr6,11,0	/* cr6 = tail compared with 0.  */
    mr    11,12

    /* CR bit 30 is the "2" bit of the doubleword count: when it is set,
       copy two doublewords before entering the loop.  */
    bf    30,1f
    ld    6,0(12)
    ld    7,8(12)
    addi  11,12,16
    mtctr 8
    std   6,0(3)
    std   7,8(3)
    addi  10,3,16
    /* CR bit 31 is the "1" bit: when it is set, copy a third doubleword.  */
    bf    31,4f
    ld    0,16(12)
    std   0,16(3)
    /* Fewer than 4 doublewords in total means these three were all of
       them, so skip the main loop.  */
    blt   cr1,3f
    addi  11,12,24
    addi  10,3,24
    b     4f
    .align  4
1:
    /* The "2" bit is clear: copy zero or one leading doubleword.  */
    mr    10,3
    mtctr 8
    bf    31,4f
    ld    6,0(12)
    addi  11,12,8
    std   6,0(3)
    addi  10,3,8

    .align  4
  /* Main loop: 32 bytes per iteration, CTR iterations.  */
4:
    ld    6,0(11)
    ld    7,8(11)
    ld    8,16(11)
    ld    0,24(11)
    addi  11,11,32
  /* NOTE(review): no branch to this "2:" label is visible in this file.  */
2:
    std   6,0(10)
    std   7,8(10)
    std   8,16(10)
    std   0,24(10)
    addi  10,10,32
    bdnz  4b
3:

    /* r0 = r31 with the low 3 bits cleared: the number of bytes that were
       copied as doublewords.  */
    rldicr 0,31,0,60
    /* cr7 = low 4 bits of the remaining length, selecting the 4/2/1 byte
       tail moves below.  */
    mtcrf 0x01,31
    /* No tail bytes: go straight to the return sequence.  */
    beq   cr6,0f
  /* Shared tail entry; the .L6 path also branches here.  Step r3 and r12
     over the doublewords already copied.  */
.L9:
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  */
4:  bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Restore r31 and return the original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr

/* Copy up to 31 bytes.  This is divided into two cases, 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-byte, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.

   On arrival r3 = dst, r4 = src, r5 = len (0-31) and cr6 holds the
   entry comparison of len with 8.  r31 is not modified on this path, so
   the exits below reload only r3.  */

    .align  4
.L2:
    mtcrf 0x01,5	/* cr7 = low 4 bits of len (the 8/4/2/1 bits).  */
    neg   8,4
    clrrdi	11,4,2	/* r11 = src rounded down to a word boundary.  */
    andi. 0,8,3	/* r0 = (-src) & 3: bytes needed to word align src; sets cr0.  */
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmpldi	cr1,5,16	/* cr1 = len compared with 16.  */
    mr    10,5
    mr    12,4
    cmpldi	cr6,0,2	/* cr6 = r0 compared with 2: lt is 1 byte, eq is 2, gt is 3.  */
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The bytes come from
   one aligned word load of the word that contains src; they are the last
   r0 bytes of that word in memory order.  */
    lwz   6,0(11)
    subf  10,0,5	/* r10 = bytes left after the alignment bytes.  */
    add   12,4,0	/* r12 = word-aligned source pointer.  */
    blt   cr6,5f
    srdi  7,6,16	/* r7 = upper halfword of the loaded word.  */
    bgt	  cr6,3f
    /* Two bytes: they are the last two bytes of the word in memory,
       which sit in the upper register half on little endian and in the
       lower half on big endian.  */
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
    /* Three bytes: store the second byte of the word, then its last two
       bytes as a halfword.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
    /* One byte: the last byte of the word in memory.  On little endian it
       is the most significant register byte, so rotate it down first.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    /* Recompute the tests for the reduced length and advance dst past the
       alignment bytes.  */
    cmpldi	cr1,10,16
    add   3,3,0
    mtcrf 0x01,10
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
/* The 0-8 byte code at .LE8 also branches back to this label ("2b") when
   the 4 bit of the length is clear.  */
2:  /* Move 2-3 bytes.  */
    bf    30,1f
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f
    lbz   7,2(12)
    stb   7,2(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
1:  /* Move 1 byte.  */
    bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

/* Special case to copy 0-8 bytes.  On arrival r3 = dst, r4 = src, cr6
   holds the entry comparison of len with 8 and cr7 holds the low 4 bits
   of len.  Neither pointer is advanced here; fixed offsets are used.  */
    .align  4
.LE8:
    /* r12 = src, needed when this code branches back to the shared 2-3 byte
       move ("2b"), which addresses the source through r12.  */
    mr    12,4
    /* Not exactly 8 bytes: go handle 0-7 bytes.  */
    bne   cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
/* 0-7 bytes.  When the 4 bit is clear the length is 0-3, which the
   preceding "2:" label (the 2-3 byte move of the short path) handles.  */
4:  bf    29,2b
    lwz   6,0(4)
    stw   6,0(3)
/* NOTE(review): no branch to this "6:" label is visible in this file.  */
6:
    /* After the word at offset 0, copy a halfword at offset 4 and/or a
       final byte, as selected by the 2 and 1 bits of the length.  */
    bf    30,5f
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f
    lbz   8,6(4)
    stb   8,6(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
    .align  4
5:
    bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

    .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.

     On arrival r3 = doubleword-aligned dst, r12 = src, r10 = src & 7,
     r9 = doubleword count, r31 = bytes left, and cr1 holds the comparison
     of the tail length (r31 & 7) with 0.

     Inside this block r5 is the aligned source read pointer, r4 is the
     destination write pointer, r6 and r7 hold two consecutive aligned
     source doublewords, r10 is the misalignment in bits and r9 is
     64 - r10.  Each output doubleword is built from r6 and r7:
       big endian:     (r6 << r10) | (r7 >> r9)
       little endian:  (r6 >> r10) | (r7 << r9)
     The roles of r6 and r7 alternate as new doublewords are loaded.

     r3 and r12 are left unchanged so that .L9 can step them over the
     doubleword region in one go.  */
    addi    11,9,-1  /* loop DW count is one less than total */
    subf    5,10,12	/* r5 = src rounded down to a doubleword boundary.  */
    sldi    10,10,3	/* r10 = source misalignment in bits.  */
    mr      4,3
    srdi    8,11,2   /* calculate the 32 byte loop count */
    ld      6,0(5)
    mtcrf   0x01,11	/* cr7 = low 4 bits of (DW count - 1).  */
    cmpldi  cr6,9,4	/* cr6 = DW count compared with 4.  */
    mtctr   8
    ld      7,8(5)
    subfic  9,10,64	/* r9 = 64 - r10, the complementary shift.  */
    /* CR bit 30 is the "2" bit of (DW count - 1).  */
    bf      30,1f

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd     0,7,10
    sld     8,6,9
#else
    sld     0,7,10
    srd     8,6,9
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,8f  /* if total DWs = 3, then bypass loop */
    /* CR bit 31 is the "1" bit of (DW count - 1).  */
    bf      31,4f
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,8f  /* if total DWs = 4, then bypass loop */
    b       4f
    .align 4
1:
    /* The "2" bit is clear.  Build the first output doubleword in r0; it
       is stored below only when the "1" bit is set, otherwise the loop
       at 4: rebuilds it from the same r6 and r7.  */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,4f
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
4:
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ 4b
    .align 4
8:
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    std   0,0(4)
/* NOTE(review): no branch to this "3:" label is visible in this file.  */
3:
    /* r0 = bytes copied as doublewords (r31 with its low 3 bits cleared);
       cr7 = low 4 bits of r31 for the tail moves at .L9.  */
    rldicr 0,31,0,60
    mtcrf 0x01,31
    /* A non-zero tail is finished by the shared code at .L9, which also
       restores r31 and returns.  */
    bne   cr1,.L9	/* If the tail is 0 bytes we are done!  */
  /* Restore r31 and return the original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr
END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
Commit	Line	Data
04067002	1	/* Optimized memcpy implementation for PowerPC64.
b168057a	2	Copyright (C) 2003-2015 Free Software Foundation, Inc.
04067002 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
04067002 UD	18
04067002 UD	19	#include <sysdep.h>
04067002 UD	20
	21	/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
	22	Returns 'dst'.
	23
9c84384c JM	24	Memcpy handles short copies (< 32-bytes) using a binary move blocks
	25	(no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
	26	with the appropriate combination of byte and halfword load/stores.
	27	There is minimal effort to optimize the alignment of short moves.
04067002	28	The 64-bit implementations of POWER3 and POWER4 do a reasonable job
2ccdea26	29	of handling unaligned load/stores that do not cross 32-byte boundaries.
04067002 UD	30
	31	Longer moves (>= 32-bytes) justify the effort to get at least the
	32	destination doubleword (8-byte) aligned. Further optimization is
2ccdea26	33	possible when both source and destination are doubleword aligned.
04067002 UD	34	Each case has a optimized unrolled loop. */
04067002 UD	35
a88f47a7	36	.machine power4
2d67d91a	37	EALIGN (memcpy, 5, 0)
04067002 UD	38	CALL_MCOUNT 3
	39
	40	cmpldi cr1,5,31
	41	neg 0,3
	42	std 3,-16(1)
	43	std 31,-8(1)
	44	cfi_offset(31,-8)
2ccdea26	45	andi. 11,3,7 /* check alignment of dst. */
04067002	46	clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
2ccdea26	47	clrldi 10,4,61 /* check alignment of src. */
04067002 UD	48	cmpldi cr6,5,8
04067002 UD	49	ble- cr1,.L2 /* If move < 32 bytes use short move code. */
9c84384c	50	cmpld cr6,10,11
04067002 UD	51	mr 12,4
	52	srdi 9,5,3 /* Number of full double words remaining. */
	53	mtcrf 0x01,0
	54	mr 31,5
	55	beq .L0
9c84384c	56
04067002	57	subf 31,0,5
2ccdea26	58	/* Move 0-7 bytes as needed to get the destination doubleword aligned. */
04067002 UD	59	1: bf 31,2f
	60	lbz 6,0(12)
	61	addi 12,12,1
	62	stb 6,0(3)
	63	addi 3,3,1
	64	2: bf 30,4f
	65	lhz 6,0(12)
	66	addi 12,12,2
	67	sth 6,0(3)
	68	addi 3,3,2
	69	4: bf 29,0f
	70	lwz 6,0(12)
	71	addi 12,12,4
	72	stw 6,0(3)
	73	addi 3,3,4
	74	0:
2ccdea26	75	clrldi 10,12,61 /* check alignment of src again. */
04067002	76	srdi 9,31,3 /* Number of full double words remaining. */
9c84384c	77
2ccdea26	78	/* Copy doublewords from source to destination, assuming the
04067002 UD	79	destination is aligned on a doubleword boundary.
	80
	81	At this point we know there are at least 25 bytes left (32-7) to copy.
9c84384c	82	The next step is to determine if the source is also doubleword aligned.
04067002 UD	83	If not branch to the unaligned move code at .L6. which uses
04067002 UD	84	a load, shift, store strategy.
9c84384c	85
04067002 UD	86	Otherwise source and destination are doubleword aligned, and we can
	87	the optimized doubleword copy loop. */
	88	.L0:
	89	clrldi 11,31,61
	90	mtcrf 0x01,9
	91	cmpldi cr1,11,0
	92	bne- cr6,.L6 /* If source is not DW aligned. */
	93
	94	/* Move doublewords where destination and source are DW aligned.
	95	Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
ded5b9b7	96	If the copy is not an exact multiple of 32 bytes, 1-3
04067002	97	doublewords are copied as needed to set up the main loop. After
9c84384c	98	the main loop exits there may be a tail of 1-7 bytes. These byte are
04067002 UD	99	copied a word/halfword/byte at a time as needed to preserve alignment. */
	100
	101	srdi 8,31,5
	102	cmpldi cr1,9,4
	103	cmpldi cr6,11,0
	104	mr 11,12
9c84384c	105
04067002 UD	106	bf 30,1f
	107	ld 6,0(12)
	108	ld 7,8(12)
	109	addi 11,12,16
	110	mtctr 8
	111	std 6,0(3)
	112	std 7,8(3)
	113	addi 10,3,16
	114	bf 31,4f
	115	ld 0,16(12)
9c84384c	116	std 0,16(3)
04067002 UD	117	blt cr1,3f
	118	addi 11,12,24
	119	addi 10,3,24
	120	b 4f
	121	.align 4
	122	1:
	123	mr 10,3
	124	mtctr 8
	125	bf 31,4f
	126	ld 6,0(12)
	127	addi 11,12,8
	128	std 6,0(3)
	129	addi 10,3,8
9c84384c	130
04067002 UD	131	.align 4
	132	4:
	133	ld 6,0(11)
	134	ld 7,8(11)
	135	ld 8,16(11)
	136	ld 0,24(11)
	137	addi 11,11,32
	138	2:
	139	std 6,0(10)
	140	std 7,8(10)
	141	std 8,16(10)
	142	std 0,24(10)
	143	addi 10,10,32
	144	bdnz 4b
9c84384c	145	3:
04067002 UD	146
	147	rldicr 0,31,0,60
	148	mtcrf 0x01,31
	149	beq cr6,0f
	150	.L9:
	151	add 3,3,0
	152	add 12,12,0
9c84384c	153
04067002	154	/* At this point we have a tail of 0-7 bytes and we know that the
2ccdea26	155	destination is double word aligned. */
04067002 UD	156	4: bf 29,2f
	157	lwz 6,0(12)
	158	addi 12,12,4
	159	stw 6,0(3)
	160	addi 3,3,4
	161	2: bf 30,1f
	162	lhz 6,0(12)
	163	addi 12,12,2
	164	sth 6,0(3)
	165	addi 3,3,2
	166	1: bf 31,0f
	167	lbz 6,0(12)
	168	stb 6,0(3)
	169	0:
	170	/* Return original dst pointer. */
	171	ld 31,-8(1)
	172	ld 3,-16(1)
	173	blr
9c84384c JM	174
	175	/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
	176	bytes. Each case is handled without loops, using binary (1,2,4,8)
	177	tests.
	178
04067002	179	In the short (0-8 byte) case no attempt is made to force alignment
9c84384c JM	180	of either source or destination. The hardware will handle the
9c84384c JM	181	unaligned load/stores with small delays for crossing 32- 64-byte, and
04067002	182	4096-byte boundaries. Since these short moves are unlikely to be
9c84384c	183	unaligned or cross these boundaries, the overhead to force
04067002	184	alignment is not justified.
9c84384c	185
04067002 UD	186	The longer (9-31 byte) move is more likely to cross 32- or 64-byte
04067002 UD	187	boundaries. Since only loads are sensitive to the 32-/64-byte
9c84384c	188	boundaries it is more important to align the source then the
04067002	189	destination. If the source is not already word aligned, we first
9c84384c JM	190	move 1-3 bytes as needed. Since we are only word aligned we don't
9c84384c JM	191	use double word load/stores to insure that all loads are aligned.
04067002 UD	192	While the destination and stores may still be unaligned, this
	193	is only an issue for page (4096 byte boundary) crossing, which
	194	should be rare for these short moves. The hardware handles this
9c84384c JM	195	case automatically with a small delay. */
9c84384c JM	196
04067002 UD	197	.align 4
	198	.L2:
	199	mtcrf 0x01,5
	200	neg 8,4
	201	clrrdi 11,4,2
	202	andi. 0,8,3
	203	ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
	204	/* At least 9 bytes left. Get the source word aligned. */
	205	cmpldi cr1,5,16
	206	mr 10,5
	207	mr 12,4
	208	cmpldi cr6,0,2
	209	beq .L3 /* If the source is already word aligned skip this. */
	210	/* Copy 1-3 bytes to get source address word aligned. */
	211	lwz 6,0(11)
	212	subf 10,0,5
	213	add 12,4,0
	214	blt cr6,5f
	215	srdi 7,6,16
	216	bgt cr6,3f
759cfef3 AM	217	#ifdef __LITTLE_ENDIAN__
	218	sth 7,0(3)
	219	#else
04067002	220	sth 6,0(3)
759cfef3	221	#endif
04067002 UD	222	b 7f
	223	.align 4
	224	3:
759cfef3 AM	225	#ifdef __LITTLE_ENDIAN__
	226	rotlwi 6,6,24
	227	stb 6,0(3)
	228	sth 7,1(3)
	229	#else
04067002 UD	230	stb 7,0(3)
04067002 UD	231	sth 6,1(3)
759cfef3	232	#endif
04067002 UD	233	b 7f
	234	.align 4
	235	5:
759cfef3 AM	236	#ifdef __LITTLE_ENDIAN__
	237	rotlwi 6,6,8
	238	#endif
04067002 UD	239	stb 6,0(3)
	240	7:
	241	cmpldi cr1,10,16
	242	add 3,3,0
	243	mtcrf 0x01,10
	244	.align 4
	245	.L3:
	246	/* At least 6 bytes left and the source is word aligned. */
	247	blt cr1,8f
	248	16: /* Move 16 bytes. */
	249	lwz 6,0(12)
	250	lwz 7,4(12)
	251	stw 6,0(3)
	252	lwz 6,8(12)
	253	stw 7,4(3)
	254	lwz 7,12(12)
	255	addi 12,12,16
	256	stw 6,8(3)
	257	stw 7,12(3)
	258	addi 3,3,16
	259	8: /* Move 8 bytes. */
	260	bf 28,4f
	261	lwz 6,0(12)
	262	lwz 7,4(12)
	263	addi 12,12,8
	264	stw 6,0(3)
	265	stw 7,4(3)
	266	addi 3,3,8
	267	4: /* Move 4 bytes. */
	268	bf 29,2f
	269	lwz 6,0(12)
	270	addi 12,12,4
	271	stw 6,0(3)
9c84384c	272	addi 3,3,4
04067002 UD	273	2: /* Move 2-3 bytes. */
	274	bf 30,1f
	275	lhz 6,0(12)
9c84384c	276	sth 6,0(3)
04067002 UD	277	bf 31,0f
	278	lbz 7,2(12)
	279	stb 7,2(3)
	280	ld 3,-16(1)
	281	blr
	282	1: /* Move 1 byte. */
	283	bf 31,0f
	284	lbz 6,0(12)
	285	stb 6,0(3)
	286	0:
	287	/* Return original dst pointer. */
	288	ld 3,-16(1)
	289	blr
	290
	291	/* Special case to copy 0-8 bytes. */
	292	.align 4
	293	.LE8:
	294	mr 12,4
	295	bne cr6,4f
	296	/* Would have liked to use use ld/std here but the 630 processors are
9c84384c	297	slow for load/store doubles that are not at least word aligned.
2ccdea26	298	Unaligned Load/Store word execute with only a 1 cycle penalty. */
04067002 UD	299	lwz 6,0(4)
	300	lwz 7,4(4)
	301	stw 6,0(3)
	302	stw 7,4(3)
	303	/* Return original dst pointer. */
	304	ld 3,-16(1)
	305	blr
	306	.align 4
	307	4: bf 29,2b
	308	lwz 6,0(4)
	309	stw 6,0(3)
	310	6:
	311	bf 30,5f
	312	lhz 7,4(4)
9c84384c	313	sth 7,4(3)
04067002 UD	314	bf 31,0f
	315	lbz 8,6(4)
	316	stb 8,6(3)
	317	ld 3,-16(1)
	318	blr
	319	.align 4
9c84384c	320	5:
04067002 UD	321	bf 31,0f
	322	lbz 6,4(4)
	323	stb 6,4(3)
	324	.align 4
	325	0:
	326	/* Return original dst pointer. */
	327	ld 3,-16(1)
	328	blr
	329
	330	.align 4
	331	.L6:
	332
	333	/* Copy doublewords where the destination is aligned but the source is
	334	not. Use aligned doubleword loads from the source, shifted to realign
	335	the data, to allow aligned destination stores. */
	336	addi 11,9,-1 /* loop DW count is one less than total */
	337	subf 5,10,12
	338	sldi 10,10,3
	339	mr 4,3
	340	srdi 8,11,2 /* calculate the 32 byte loop count */
	341	ld 6,0(5)
	342	mtcrf 0x01,11
	343	cmpldi cr6,9,4
	344	mtctr 8
	345	ld 7,8(5)
	346	subfic 9,10,64
	347	bf 30,1f
	348
	349	/* there are at least two DWs to copy */
759cfef3 AM	350	#ifdef __LITTLE_ENDIAN__
	351	srd 0,6,10
	352	sld 8,7,9
	353	#else
04067002 UD	354	sld 0,6,10
04067002 UD	355	srd 8,7,9
759cfef3	356	#endif
04067002 UD	357	or 0,0,8
	358	ld 6,16(5)
	359	std 0,0(4)
759cfef3 AM	360	#ifdef __LITTLE_ENDIAN__
	361	srd 0,7,10
	362	sld 8,6,9
	363	#else
04067002 UD	364	sld 0,7,10
04067002 UD	365	srd 8,6,9
759cfef3	366	#endif
04067002 UD	367	or 0,0,8
	368	ld 7,24(5)
	369	std 0,8(4)
	370	addi 4,4,16
	371	addi 5,5,32
	372	blt cr6,8f /* if total DWs = 3, then bypass loop */
	373	bf 31,4f
	374	/* there is a third DW to copy */
759cfef3 AM	375	#ifdef __LITTLE_ENDIAN__
	376	srd 0,6,10
	377	sld 8,7,9
	378	#else
04067002 UD	379	sld 0,6,10
04067002 UD	380	srd 8,7,9
759cfef3	381	#endif
04067002 UD	382	or 0,0,8
	383	std 0,0(4)
	384	mr 6,7
	385	ld 7,0(5)
	386	addi 5,5,8
	387	addi 4,4,8
	388	beq cr6,8f /* if total DWs = 4, then bypass loop */
	389	b 4f
	390	.align 4
	391	1:
759cfef3 AM	392	#ifdef __LITTLE_ENDIAN__
	393	srd 0,6,10
	394	sld 8,7,9
	395	#else
04067002 UD	396	sld 0,6,10
04067002 UD	397	srd 8,7,9
759cfef3	398	#endif
04067002 UD	399	addi 5,5,16
	400	or 0,0,8
	401	bf 31,4f
	402	mr 6,7
	403	ld 7,0(5)
	404	addi 5,5,8
	405	std 0,0(4)
	406	addi 4,4,8
	407	.align 4
	408	/* copy 32 bytes at a time */
759cfef3 AM	409	4:
	410	#ifdef __LITTLE_ENDIAN__
	411	srd 0,6,10
	412	sld 8,7,9
	413	#else
	414	sld 0,6,10
04067002	415	srd 8,7,9
759cfef3	416	#endif
04067002 UD	417	or 0,0,8
	418	ld 6,0(5)
	419	std 0,0(4)
759cfef3 AM	420	#ifdef __LITTLE_ENDIAN__
	421	srd 0,7,10
	422	sld 8,6,9
	423	#else
04067002 UD	424	sld 0,7,10
04067002 UD	425	srd 8,6,9
759cfef3	426	#endif
04067002 UD	427	or 0,0,8
	428	ld 7,8(5)
	429	std 0,8(4)
759cfef3 AM	430	#ifdef __LITTLE_ENDIAN__
	431	srd 0,6,10
	432	sld 8,7,9
	433	#else
04067002 UD	434	sld 0,6,10
04067002 UD	435	srd 8,7,9
759cfef3	436	#endif
04067002 UD	437	or 0,0,8
	438	ld 6,16(5)
	439	std 0,16(4)
759cfef3 AM	440	#ifdef __LITTLE_ENDIAN__
	441	srd 0,7,10
	442	sld 8,6,9
	443	#else
04067002 UD	444	sld 0,7,10
04067002 UD	445	srd 8,6,9
759cfef3	446	#endif
04067002 UD	447	or 0,0,8
	448	ld 7,24(5)
	449	std 0,24(4)
	450	addi 5,5,32
	451	addi 4,4,32
	452	bdnz+ 4b
	453	.align 4
	454	8:
	455	/* calculate and store the final DW */
759cfef3 AM	456	#ifdef __LITTLE_ENDIAN__
	457	srd 0,6,10
	458	sld 8,7,9
	459	#else
04067002 UD	460	sld 0,6,10
04067002 UD	461	srd 8,7,9
759cfef3	462	#endif
9c84384c	463	or 0,0,8
04067002 UD	464	std 0,0(4)
	465	3:
	466	rldicr 0,31,0,60
	467	mtcrf 0x01,31
	468	bne cr1,.L9 /* If the tail is 0 bytes we are done! */
	469	/* Return original dst pointer. */
	470	ld 31,-8(1)
	471	ld 3,-16(1)
	472	blr
2d67d91a	473	END_GEN_TB (memcpy,TB_TOCLESS)
04067002	474	libc_hidden_builtin_def (memcpy)