/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

#define PREFETCH_AHEAD 6	/* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 4		/* number of cache lines to dcbz ahead of DST  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * void *memcpy (void *dst [r3], const void *src [r4], size_t n [r5])
 * Returns the original DST pointer (r3 is never modified below).
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 *
 * Register usage throughout:
 *   r3        = original DST, preserved as the return value
 *   r4        = current SRC pointer
 *   r5        = bytes remaining
 *   r6        = current DST pointer
 *   r7        = SRC-DST delta, then loop counts (scratch)
 *   r8-r12    = scratch: counts, prefetch/dcbz distances, copy data
 *   r0, r9    = copy data
 */

	.align  7

EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* work on r6 so r3 survives as return value  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrldi	r8,r8,64-4	/* align to 16-byte boundary  */
	sub	r7,r4,r3	/* r7 = SRC - DST, constant for the whole copy  */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

/* Copy 1/2/4/8 bytes (per the bits of r8 loaded into cr7) until DST is
   16-byte aligned.  SRC is always addressed as r7+r6 (= old r4 position).  */
.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* resynchronize SRC with the advanced DST  */

.Ldst_aligned:

	cmpdi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu  */
	addi	r4,r4,-8	/* prepare for ldu  */

	clrldi	r7,r7,64-7	/* align to cacheline boundary  */
	ble+	cr5,.Llessthancacheline

	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16  */
	srdi	r10,r5,7	/* number of cache lines to copy  */

	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: split the count so r10 lines warm
	   up the prefetch window and r11 lines run with in-loop prefetch.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

/* Touch the first r10 SRC cache lines ahead of the copy loop.  */
.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmpldi	cr1,r5,128
	clrldi	r5,r5,64-7	/* r5 = bytes left over after whole cachelines  */
	beq	cr6,.Lcachelinealigned

/* Copy 16 bytes per iteration until DST reaches a 128-byte line boundary.  */
.Laligntocacheline:
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128  */

.Louterloop:
	cmpdi	r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6		/* establish DST line in cache w/o fetching it  */
	ld	r7, 0x10(r4)	/* 4 register stride copy is optimal  */
	ld	r8, 0x18(r4)	/* to hide 1st level cache latency.  */
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ldu	r0, 0x80(r4)
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	.Lloop

.Lendloop:
	/* r10 cachelines remain without prefetch (the lines whose dcbt was
	   issued in .LprefetchSRC, or all lines when there were at most
	   PREFETCH_AHEAD of them); copy them 32 bytes per iteration.  */
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ldu	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than one cache line to do ?  */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16  */
	blt-	.Ldo_lt16
	mtctr	r7

/* Copy 16 bytes per iteration while at least 16 bytes remain.  */
.Lcopy_remaining:
	ld	r8,0x08(r4)
	ldu	r7,0x10(r4)
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 ldu/stdu bias from above  */
	addi	r6,r6,8

/* SIMPLE COPY to handle size =< 15 bytes: one conditional move per
   power-of-two size, per the low bits of r5 loaded into cr7.  */
.Lshortcopy:
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* r7 = SRC - DST; SRC addressed as r7+r6  */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)