/* [thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power4 / memcpy.S */

/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (the remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.   */

/* MEMCPY names the entry point defined below; it defaults to memcpy
   unless a definition is already in effect.  */
#ifndef MEMCPY
# define MEMCPY memcpy
#endif
	.machine power4
/* Entry point and long-copy (>= 32 byte) path.

   Arguments (see the prototype comment above): r3 = dst, r4 = src,
   r5 = len.  Every exit reloads the saved dst into r3 before blr.

   Register roles on the long path, as used below:
     r3   dst cursor (original dst is saved at -16(r1))
     r12  src cursor
     r31  bytes still to copy once dst is doubleword aligned
          (the incoming r31 is saved at -8(r1) and reloaded on exit)
     r9   number of whole doublewords in r31
     r0, r6-r8, r10, r11  scratch

   Condition-register use: "mtcrf 0x01,rX" copies the low four bits of rX
   into cr7, so bf 28/29/30/31 test the 8/4/2/1 bits of that count.
   cr0, cr1 and cr6 carry comparison results across several instructions;
   each is noted where it is set.  */
ENTRY_TOCLESS (MEMCPY, 5)
	/* NOTE(review): CALL_MCOUNT is defined outside this file; presumably
	   the profiling hook -- confirm in sysdep.h.  */
	CALL_MCOUNT 3

    cmpldi cr1,5,31	/* cr1: len vs 31, consumed by the ble- below.  */
    neg   0,3
    /* dst and r31 are stored at negative offsets from r1; no frame is set
       up in the visible code.  NOTE(review): presumably relies on the
       ABI's protected area below the stack pointer -- confirm.  */
    std   3,-16(1)
    std   31,-8(1)
    cfi_offset(31,-8)
    andi. 11,3,7	/* check alignment of dst (sets cr0 for the beq below).  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    cmpldi cr6,5,8	/* cr6: len vs 8, consumed by the short path at .L2.  */
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    cmpld cr6,10,11	/* cr6: (src & 7) vs (dst & 7); tested at .L0.  */
    mr    12,4
    srdi  9,5,3		/* Number of full double words remaining.  */
    mtcrf 0x01,0	/* cr7 = low bits of the dst alignment byte count.  */
    mr    31,5
    beq   .L0		/* cr0 from andi.: dst is already doubleword aligned.  */

    subf  31,0,5	/* r31 = len minus the bytes moved to align dst.  */
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
     cr7 bits 31, 30 and 29 select the 1-, 2- and 4-byte moves.  */
1:  bf    31,2f
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,4f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
4:  bf    29,0f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
0:
    clrldi 10,12,61	/* check alignment of src again.  */
    srdi  9,31,3	/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.

     cr6 still holds the (src & 7) vs (dst & 7) comparison made at entry;
     src and dst have been advanced by the same amount since then.  */
.L0:
    clrldi  11,31,61	/* r11 = byte count modulo 8 (the tail).  */
    mtcrf   0x01,9	/* cr7 = low four bits of the doubleword count.  */
    cmpldi  cr1,11,0	/* cr1: tail vs 0; the .L6 path tests this on exit.  */
    bne-    cr6,.L6   /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.

     r11 and r10 are the src and dst cursors for this section; r3 and r12
     stay at the start of the doubleword data until .L9.  */

    srdi  8,31,5	/* r8 = number of 32-byte loop iterations.  */
    cmpldi	cr1,9,4	/* cr1: doubleword count vs 4.  */
    cmpldi	cr6,11,0	/* cr6: tail vs 0, tested after the loop.  */
    mr    11,12

    bf    30,1f		/* 2s bit of the doubleword count clear.  */
    /* Pre-copy two doublewords.  */
    ld    6,0(12)
    ld    7,8(12)
    addi  11,12,16
    mtctr 8
    std   6,0(3)
    std   7,8(3)
    addi  10,3,16
    bf    31,4f		/* 1s bit clear: enter the main loop.  */
    /* Pre-copy a third doubleword.  */
    ld    0,16(12)
    std   0,16(3)
    blt   cr1,3f	/* Fewer than 4 doublewords in total: skip the loop.  */
    addi  11,12,24
    addi  10,3,24
    b     4f
    .align  4
1:
    mr    10,3
    mtctr 8
    bf    31,4f		/* 1s bit clear: enter the main loop.  */
    /* Pre-copy a single doubleword.  */
    ld    6,0(12)
    addi  11,12,8
    std   6,0(3)
    addi  10,3,8

    .align  4
  /* Main loop: 32 bytes per iteration, ctr = iteration count.  */
4:
    ld    6,0(11)
    ld    7,8(11)
    ld    8,16(11)
    ld    0,24(11)
    addi  11,11,32
  /* NOTE(review): no branch visible in this file targets this "2" label.  */
2:
    std   6,0(10)
    std   7,8(10)
    std   8,16(10)
    std   0,24(10)
    addi  10,10,32
    bdnz  4b
3:

    rldicr 0,31,0,60	/* r0 = byte count rounded down to a multiple of 8.  */
    mtcrf 0x01,31	/* cr7 = low four bits of the byte count (tail bits).  */
    beq   cr6,0f	/* No tail bytes: return.  */
  /* Also entered from the unaligned-source path (.L6).  r3 and r12 were
     not advanced during the doubleword copies, so step both past the
     doubleword data (r0 bytes) before moving the tail.  */
.L9:
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  cr7 bits 29, 30 and 31 select
    the 4-, 2- and 1-byte moves.  */
4:  bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Restore r31 and return the original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr

/* Copy up to 31 bytes.  This is divided into two cases, 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-byte, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores, to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.

   On entry: r3 = dst, r4 = src, r5 = len, cr6 = len compared with 8.
   r31 has not been modified on this path, so the exits here reload
   only r3.  Within the 9-31 byte code r12 is the src cursor and r10 the
   remaining byte count; cr7 holds the low four bits of that count.  */

    .align  4
.L2:
    mtcrf 0x01,5	/* cr7 = low four bits of len.  */
    neg   8,4
    clrrdi	11,4,2	/* r11 = src rounded down to a word boundary.  */
    andi. 0,8,3		/* r0 = bytes from src to the next word boundary.  */
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmpldi	cr1,5,16	/* cr1: len vs 16, used at .L3 if src is aligned.  */
    mr    10,5
    mr    12,4
    cmpldi	cr6,0,2	/* cr6 separates the 1-, 2- and 3-byte cases.  */
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   at r11 that contains those bytes is loaded once (so the load also
   covers the bytes of that word that precede src) and the wanted bytes
   are stored from registers.  Which part of the register they occupy
   depends on endianness, hence the conditional code below.  */
    lwz   6,0(11)
    subf  10,0,5	/* r10 = bytes left after the alignment bytes.  */
    add   12,4,0	/* r12 = word-aligned src cursor.  */
    blt   cr6,5f	/* r0 == 1.  */
    srdi  7,6,16	/* r7 = upper halfword of the loaded word.  */
    bgt	  cr6,3f	/* r0 == 3.  */
  /* r0 == 2: store the two bytes as one halfword.  */
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
  /* r0 == 3: store one byte followed by one halfword.  */
3:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
  /* r0 == 1: store a single byte.  */
5:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmpldi	cr1,10,16	/* cr1: remaining bytes vs 16.  */
    add   3,3,0		/* Advance dst past the alignment bytes.  */
    mtcrf 0x01,10	/* cr7 = low four bits of the remaining count.  */
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f	/* Fewer than 16 bytes left: skip the 16-byte move.  */
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  /* Move 2-3 bytes.  Also reached from .LE8 (via "bf 29,2b") with
       r12 = src and r3 = dst when len is 0-3.  */
    bf    30,1f
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f
    lbz   7,2(12)
    stb   7,2(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
1:  /* Move 1 byte.  */
    bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

/* Special case to copy 0-8 bytes.
   On entry: r3 = dst, r4 = src, cr6 = len compared with 8, and cr7 holds
   the low four bits of len (set at .L2).  Nothing has been copied yet.  */
    .align  4
.LE8:
    mr    12,4		/* The shared "2" tail in .L2 addresses src via r12.  */
    bne   cr6,4f	/* len != 8, i.e. 0-7 bytes.  */
/* Exactly 8 bytes.
   Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
  /* 0-7 bytes.  If the 4s bit of len is clear only 0-3 bytes remain;
     "2b" resolves to the "Move 2-3 bytes" label in the .L2 code above,
     which finishes the copy and returns.  */
4:  bf    29,2b
    lwz   6,0(4)
    stw   6,0(3)
  /* 4 bytes moved; the remaining 0-3 bytes are at offset 4.
     NOTE(review): no branch visible in this file targets this "6" label.  */
6:
    bf    30,5f
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f
    lbz   8,6(4)
    stb   8,6(3)
  /* Return original dst pointer.  */
    ld 3,-16(1)
    blr
    .align  4
5:
    bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

    .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.

     On entry (from .L0): r3 = dst, r12 = src, r10 = src & 7,
     r9 = number of doublewords to store, r31 = byte count,
     cr1 = (byte count & 7) compared with 0.

     Within this path:
       r4     dst cursor
       r5     src cursor, rounded down to a doubleword boundary
       r10    shift count in bits, 8 * (src & 7)
       r9     64 - r10
       r6,r7  the two most recently loaded source doublewords
       r0,r8  scratch for the merged result
       ctr    number of 32-byte loop iterations
     Each output doubleword is r6 shifted by r10 OR-ed with r7 shifted by
     r9 (or r7 with r6, alternating); the shift directions are swapped
     under __LITTLE_ENDIAN__.  r3 and r12 are left untouched here and are
     advanced at .L9 if a tail remains.  */
    addi    11,9,-1  /* loop DW count is one less than total */
    subf    5,10,12
    sldi    10,10,3
    mr      4,3
    srdi    8,11,2   /* calculate the 32 byte loop count */
    ld      6,0(5)
    mtcrf   0x01,11  /* cr7 = low four bits of (DW count - 1) */
    cmpldi  cr6,9,4  /* cr6: total DW count vs 4 */
    mtctr   8
    ld      7,8(5)
    subfic  9,10,64
    bf      30,1f    /* 2s bit of (DW count - 1) clear */

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd     0,7,10
    sld     8,6,9
#else
    sld     0,7,10
    srd     8,6,9
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,8f  /* if total DWs = 3, then bypass loop */
    bf      31,4f
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,8f  /* if total DWs = 4, then bypass loop */
    b       4f
    .align 4
    /* 2s bit of (DW count - 1) is clear.  The first merged DW is formed
       here; it is stored only if the 1s bit is set.  Otherwise the loop
       at 4 recomputes the same value from the unchanged r6/r7.  */
1:
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,4f
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
4:
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ 4b
    .align 4
8:
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    std   0,0(4)
    /* NOTE(review): no branch visible in this file targets this "3" label.  */
3:
    rldicr 0,31,0,60	/* r0 = byte count rounded down to a multiple of 8.  */
    mtcrf 0x01,31	/* cr7 = low four bits of the byte count (tail bits).  */
    bne   cr1,.L9	/* If the tail is 0 bytes we are done!  */
  /* Restore r31 and return the original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
/* NOTE(review): this names memcpy literally rather than via the MEMCPY
   macro; libc_hidden_builtin_def is defined outside this file.  */
libc_hidden_builtin_def (memcpy)
Commit	Line	Data
04067002	1	/* Optimized memcpy implementation for PowerPC64.
bfff8b1b	2	Copyright (C) 2003-2017 Free Software Foundation, Inc.
04067002 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
04067002 UD	18
04067002 UD	19	#include <sysdep.h>
04067002	20
f17a4233	21	/* void * [r3] memcpy (void dst [r3], void src [r4], size_t len [r5]);
04067002 UD	22	Returns 'dst'.
04067002 UD	23
9c84384c JM	24	Memcpy handles short copies (< 32-bytes) using a binary move blocks
	25	(no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
	26	with the appropriate combination of byte and halfword load/stores.
	27	There is minimal effort to optimize the alignment of short moves.
04067002	28	The 64-bit implementations of POWER3 and POWER4 do a reasonable job
2ccdea26	29	of handling unaligned load/stores that do not cross 32-byte boundaries.
04067002 UD	30
	31	Longer moves (>= 32-bytes) justify the effort to get at least the
	32	destination doubleword (8-byte) aligned. Further optimization is
2ccdea26	33	possible when both source and destination are doubleword aligned.
04067002 UD	34	Each case has a optimized unrolled loop. */
04067002 UD	35
72fd128a WSM	36	#ifndef MEMCPY
	37	# define MEMCPY memcpy
	38	#endif
a88f47a7	39	.machine power4
d5b41185	40	ENTRY_TOCLESS (MEMCPY, 5)
04067002 UD	41	CALL_MCOUNT 3
	42
	43	cmpldi cr1,5,31
	44	neg 0,3
	45	std 3,-16(1)
	46	std 31,-8(1)
	47	cfi_offset(31,-8)
2ccdea26	48	andi. 11,3,7 /* check alignment of dst. */
04067002	49	clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
2ccdea26	50	clrldi 10,4,61 /* check alignment of src. */
04067002 UD	51	cmpldi cr6,5,8
04067002 UD	52	ble- cr1,.L2 /* If move < 32 bytes use short move code. */
9c84384c	53	cmpld cr6,10,11
04067002 UD	54	mr 12,4
	55	srdi 9,5,3 /* Number of full double words remaining. */
	56	mtcrf 0x01,0
	57	mr 31,5
	58	beq .L0
9c84384c	59
04067002	60	subf 31,0,5
2ccdea26	61	/* Move 0-7 bytes as needed to get the destination doubleword aligned. */
04067002 UD	62	1: bf 31,2f
	63	lbz 6,0(12)
	64	addi 12,12,1
	65	stb 6,0(3)
	66	addi 3,3,1
	67	2: bf 30,4f
	68	lhz 6,0(12)
	69	addi 12,12,2
	70	sth 6,0(3)
	71	addi 3,3,2
	72	4: bf 29,0f
	73	lwz 6,0(12)
	74	addi 12,12,4
	75	stw 6,0(3)
	76	addi 3,3,4
	77	0:
2ccdea26	78	clrldi 10,12,61 /* check alignment of src again. */
04067002	79	srdi 9,31,3 /* Number of full double words remaining. */
9c84384c	80
2ccdea26	81	/* Copy doublewords from source to destination, assuming the
04067002 UD	82	destination is aligned on a doubleword boundary.
	83
	84	At this point we know there are at least 25 bytes left (32-7) to copy.
9c84384c	85	The next step is to determine if the source is also doubleword aligned.
04067002 UD	86	If not branch to the unaligned move code at .L6. which uses
04067002 UD	87	a load, shift, store strategy.
9c84384c	88
04067002 UD	89	Otherwise source and destination are doubleword aligned, and we can
	90	the optimized doubleword copy loop. */
	91	.L0:
	92	clrldi 11,31,61
	93	mtcrf 0x01,9
	94	cmpldi cr1,11,0
	95	bne- cr6,.L6 /* If source is not DW aligned. */
	96
	97	/* Move doublewords where destination and source are DW aligned.
	98	Use a unrolled loop to copy 4 doubleword (32-bytes) per iteration.
ded5b9b7	99	If the copy is not an exact multiple of 32 bytes, 1-3
04067002	100	doublewords are copied as needed to set up the main loop. After
9c84384c	101	the main loop exits there may be a tail of 1-7 bytes. These byte are
04067002 UD	102	copied a word/halfword/byte at a time as needed to preserve alignment. */
	103
	104	srdi 8,31,5
	105	cmpldi cr1,9,4
	106	cmpldi cr6,11,0
	107	mr 11,12
9c84384c	108
04067002 UD	109	bf 30,1f
	110	ld 6,0(12)
	111	ld 7,8(12)
	112	addi 11,12,16
	113	mtctr 8
	114	std 6,0(3)
	115	std 7,8(3)
	116	addi 10,3,16
	117	bf 31,4f
	118	ld 0,16(12)
9c84384c	119	std 0,16(3)
04067002 UD	120	blt cr1,3f
	121	addi 11,12,24
	122	addi 10,3,24
	123	b 4f
	124	.align 4
	125	1:
	126	mr 10,3
	127	mtctr 8
	128	bf 31,4f
	129	ld 6,0(12)
	130	addi 11,12,8
	131	std 6,0(3)
	132	addi 10,3,8
9c84384c	133
04067002 UD	134	.align 4
	135	4:
	136	ld 6,0(11)
	137	ld 7,8(11)
	138	ld 8,16(11)
	139	ld 0,24(11)
	140	addi 11,11,32
	141	2:
	142	std 6,0(10)
	143	std 7,8(10)
	144	std 8,16(10)
	145	std 0,24(10)
	146	addi 10,10,32
	147	bdnz 4b
9c84384c	148	3:
04067002 UD	149
	150	rldicr 0,31,0,60
	151	mtcrf 0x01,31
	152	beq cr6,0f
	153	.L9:
	154	add 3,3,0
	155	add 12,12,0
9c84384c	156
04067002	157	/* At this point we have a tail of 0-7 bytes and we know that the
2ccdea26	158	destination is double word aligned. */
04067002 UD	159	4: bf 29,2f
	160	lwz 6,0(12)
	161	addi 12,12,4
	162	stw 6,0(3)
	163	addi 3,3,4
	164	2: bf 30,1f
	165	lhz 6,0(12)
	166	addi 12,12,2
	167	sth 6,0(3)
	168	addi 3,3,2
	169	1: bf 31,0f
	170	lbz 6,0(12)
	171	stb 6,0(3)
	172	0:
	173	/* Return original dst pointer. */
	174	ld 31,-8(1)
	175	ld 3,-16(1)
	176	blr
9c84384c JM	177
	178	/* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
	179	bytes. Each case is handled without loops, using binary (1,2,4,8)
	180	tests.
	181
04067002	182	In the short (0-8 byte) case no attempt is made to force alignment
9c84384c JM	183	of either source or destination. The hardware will handle the
9c84384c JM	184	unaligned load/stores with small delays for crossing 32- 64-byte, and
04067002	185	4096-byte boundaries. Since these short moves are unlikely to be
9c84384c	186	unaligned or cross these boundaries, the overhead to force
04067002	187	alignment is not justified.
9c84384c	188
04067002 UD	189	The longer (9-31 byte) move is more likely to cross 32- or 64-byte
04067002 UD	190	boundaries. Since only loads are sensitive to the 32-/64-byte
9c84384c	191	boundaries it is more important to align the source then the
04067002	192	destination. If the source is not already word aligned, we first
9c84384c JM	193	move 1-3 bytes as needed. Since we are only word aligned we don't
9c84384c JM	194	use double word load/stores to insure that all loads are aligned.
04067002 UD	195	While the destination and stores may still be unaligned, this
	196	is only an issue for page (4096 byte boundary) crossing, which
	197	should be rare for these short moves. The hardware handles this
9c84384c JM	198	case automatically with a small delay. */
9c84384c JM	199
04067002 UD	200	.align 4
	201	.L2:
	202	mtcrf 0x01,5
	203	neg 8,4
	204	clrrdi 11,4,2
	205	andi. 0,8,3
	206	ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
	207	/* At least 9 bytes left. Get the source word aligned. */
	208	cmpldi cr1,5,16
	209	mr 10,5
	210	mr 12,4
	211	cmpldi cr6,0,2
	212	beq .L3 /* If the source is already word aligned skip this. */
	213	/* Copy 1-3 bytes to get source address word aligned. */
	214	lwz 6,0(11)
	215	subf 10,0,5
	216	add 12,4,0
	217	blt cr6,5f
	218	srdi 7,6,16
	219	bgt cr6,3f
759cfef3 AM	220	#ifdef __LITTLE_ENDIAN__
	221	sth 7,0(3)
	222	#else
04067002	223	sth 6,0(3)
759cfef3	224	#endif
04067002 UD	225	b 7f
	226	.align 4
	227	3:
759cfef3 AM	228	#ifdef __LITTLE_ENDIAN__
	229	rotlwi 6,6,24
	230	stb 6,0(3)
	231	sth 7,1(3)
	232	#else
04067002 UD	233	stb 7,0(3)
04067002 UD	234	sth 6,1(3)
759cfef3	235	#endif
04067002 UD	236	b 7f
	237	.align 4
	238	5:
759cfef3 AM	239	#ifdef __LITTLE_ENDIAN__
	240	rotlwi 6,6,8
	241	#endif
04067002 UD	242	stb 6,0(3)
	243	7:
	244	cmpldi cr1,10,16
	245	add 3,3,0
	246	mtcrf 0x01,10
	247	.align 4
	248	.L3:
	249	/* At least 6 bytes left and the source is word aligned. */
	250	blt cr1,8f
	251	16: /* Move 16 bytes. */
	252	lwz 6,0(12)
	253	lwz 7,4(12)
	254	stw 6,0(3)
	255	lwz 6,8(12)
	256	stw 7,4(3)
	257	lwz 7,12(12)
	258	addi 12,12,16
	259	stw 6,8(3)
	260	stw 7,12(3)
	261	addi 3,3,16
	262	8: /* Move 8 bytes. */
	263	bf 28,4f
	264	lwz 6,0(12)
	265	lwz 7,4(12)
	266	addi 12,12,8
	267	stw 6,0(3)
	268	stw 7,4(3)
	269	addi 3,3,8
	270	4: /* Move 4 bytes. */
	271	bf 29,2f
	272	lwz 6,0(12)
	273	addi 12,12,4
	274	stw 6,0(3)
9c84384c	275	addi 3,3,4
04067002 UD	276	2: /* Move 2-3 bytes. */
	277	bf 30,1f
	278	lhz 6,0(12)
9c84384c	279	sth 6,0(3)
04067002 UD	280	bf 31,0f
	281	lbz 7,2(12)
	282	stb 7,2(3)
	283	ld 3,-16(1)
	284	blr
	285	1: /* Move 1 byte. */
	286	bf 31,0f
	287	lbz 6,0(12)
	288	stb 6,0(3)
	289	0:
	290	/* Return original dst pointer. */
	291	ld 3,-16(1)
	292	blr
	293
	294	/* Special case to copy 0-8 bytes. */
	295	.align 4
	296	.LE8:
	297	mr 12,4
	298	bne cr6,4f
	299	/* Would have liked to use use ld/std here but the 630 processors are
9c84384c	300	slow for load/store doubles that are not at least word aligned.
2ccdea26	301	Unaligned Load/Store word execute with only a 1 cycle penalty. */
04067002 UD	302	lwz 6,0(4)
	303	lwz 7,4(4)
	304	stw 6,0(3)
	305	stw 7,4(3)
	306	/* Return original dst pointer. */
	307	ld 3,-16(1)
	308	blr
	309	.align 4
	310	4: bf 29,2b
	311	lwz 6,0(4)
	312	stw 6,0(3)
	313	6:
	314	bf 30,5f
	315	lhz 7,4(4)
9c84384c	316	sth 7,4(3)
04067002 UD	317	bf 31,0f
	318	lbz 8,6(4)
	319	stb 8,6(3)
	320	ld 3,-16(1)
	321	blr
	322	.align 4
9c84384c	323	5:
04067002 UD	324	bf 31,0f
	325	lbz 6,4(4)
	326	stb 6,4(3)
	327	.align 4
	328	0:
	329	/* Return original dst pointer. */
	330	ld 3,-16(1)
	331	blr
	332
	333	.align 4
	334	.L6:
	335
	336	/* Copy doublewords where the destination is aligned but the source is
	337	not. Use aligned doubleword loads from the source, shifted to realign
	338	the data, to allow aligned destination stores. */
	339	addi 11,9,-1 /* loop DW count is one less than total */
	340	subf 5,10,12
	341	sldi 10,10,3
	342	mr 4,3
	343	srdi 8,11,2 /* calculate the 32 byte loop count */
	344	ld 6,0(5)
	345	mtcrf 0x01,11
	346	cmpldi cr6,9,4
	347	mtctr 8
	348	ld 7,8(5)
	349	subfic 9,10,64
	350	bf 30,1f
	351
	352	/* there are at least two DWs to copy */
759cfef3 AM	353	#ifdef __LITTLE_ENDIAN__
	354	srd 0,6,10
	355	sld 8,7,9
	356	#else
04067002 UD	357	sld 0,6,10
04067002 UD	358	srd 8,7,9
759cfef3	359	#endif
04067002 UD	360	or 0,0,8
	361	ld 6,16(5)
	362	std 0,0(4)
759cfef3 AM	363	#ifdef __LITTLE_ENDIAN__
	364	srd 0,7,10
	365	sld 8,6,9
	366	#else
04067002 UD	367	sld 0,7,10
04067002 UD	368	srd 8,6,9
759cfef3	369	#endif
04067002 UD	370	or 0,0,8
	371	ld 7,24(5)
	372	std 0,8(4)
	373	addi 4,4,16
	374	addi 5,5,32
	375	blt cr6,8f /* if total DWs = 3, then bypass loop */
	376	bf 31,4f
	377	/* there is a third DW to copy */
759cfef3 AM	378	#ifdef __LITTLE_ENDIAN__
	379	srd 0,6,10
	380	sld 8,7,9
	381	#else
04067002 UD	382	sld 0,6,10
04067002 UD	383	srd 8,7,9
759cfef3	384	#endif
04067002 UD	385	or 0,0,8
	386	std 0,0(4)
	387	mr 6,7
	388	ld 7,0(5)
	389	addi 5,5,8
	390	addi 4,4,8
	391	beq cr6,8f /* if total DWs = 4, then bypass loop */
	392	b 4f
	393	.align 4
	394	1:
759cfef3 AM	395	#ifdef __LITTLE_ENDIAN__
	396	srd 0,6,10
	397	sld 8,7,9
	398	#else
04067002 UD	399	sld 0,6,10
04067002 UD	400	srd 8,7,9
759cfef3	401	#endif
04067002 UD	402	addi 5,5,16
	403	or 0,0,8
	404	bf 31,4f
	405	mr 6,7
	406	ld 7,0(5)
	407	addi 5,5,8
	408	std 0,0(4)
	409	addi 4,4,8
	410	.align 4
	411	/* copy 32 bytes at a time */
759cfef3 AM	412	4:
	413	#ifdef __LITTLE_ENDIAN__
	414	srd 0,6,10
	415	sld 8,7,9
	416	#else
	417	sld 0,6,10
04067002	418	srd 8,7,9
759cfef3	419	#endif
04067002 UD	420	or 0,0,8
	421	ld 6,0(5)
	422	std 0,0(4)
759cfef3 AM	423	#ifdef __LITTLE_ENDIAN__
	424	srd 0,7,10
	425	sld 8,6,9
	426	#else
04067002 UD	427	sld 0,7,10
04067002 UD	428	srd 8,6,9
759cfef3	429	#endif
04067002 UD	430	or 0,0,8
	431	ld 7,8(5)
	432	std 0,8(4)
759cfef3 AM	433	#ifdef __LITTLE_ENDIAN__
	434	srd 0,6,10
	435	sld 8,7,9
	436	#else
04067002 UD	437	sld 0,6,10
04067002 UD	438	srd 8,7,9
759cfef3	439	#endif
04067002 UD	440	or 0,0,8
	441	ld 6,16(5)
	442	std 0,16(4)
759cfef3 AM	443	#ifdef __LITTLE_ENDIAN__
	444	srd 0,7,10
	445	sld 8,6,9
	446	#else
04067002 UD	447	sld 0,7,10
04067002 UD	448	srd 8,6,9
759cfef3	449	#endif
04067002 UD	450	or 0,0,8
	451	ld 7,24(5)
	452	std 0,24(4)
	453	addi 5,5,32
	454	addi 4,4,32
	455	bdnz+ 4b
	456	.align 4
	457	8:
	458	/* calculate and store the final DW */
759cfef3 AM	459	#ifdef __LITTLE_ENDIAN__
	460	srd 0,6,10
	461	sld 8,7,9
	462	#else
04067002 UD	463	sld 0,6,10
04067002 UD	464	srd 8,7,9
759cfef3	465	#endif
9c84384c	466	or 0,0,8
04067002 UD	467	std 0,0(4)
	468	3:
	469	rldicr 0,31,0,60
	470	mtcrf 0x01,31
	471	bne cr1,.L9 /* If the tail is 0 bytes we are done! */
	472	/* Return original dst pointer. */
	473	ld 31,-8(1)
	474	ld 3,-16(1)
	475	blr
72fd128a	476	END_GEN_TB (MEMCPY,TB_TOCLESS)
04067002	477	libc_hidden_builtin_def (memcpy)