1 /* Optimized memcpy implementation for CELL BE PowerPC.
2 Copyright (C) 2010-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* Name handed to ENTRY_TOCLESS / END_GEN_TB for the routine defined in
   this file.  */
22 # define MEMCPY memcpy
/* Tuning constants, both counted in cache lines.  The copy code treats a
   cache line as 128 bytes (it shifts the size right by 7 and multiplies
   ZERO_AHEAD by 128).  */
25 #define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
26 #define ZERO_AHEAD 4 /* number of DST cache lines to zero ahead */
28 /* memcpy routine optimized for CELL-BE-PPC v2.0
30 * The CELL PPC core has 1 integer unit and 1 load/store unit
32 * 1st level data cache = 32K
33 * 2nd level data cache = 512K
34 * 3rd level data cache = 0K
35 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
36 * latency to memory is >400 clocks
37 * To improve copy performance we need to prefetch source data
38 * far ahead to hide this latency
39 * For best performance instruction forms ending in "." like "andi."
40 * should be avoided as they are implemented in microcode on CELL.
41 * The below code is loop unrolled for the CELL cache line of 128 bytes. */
/* MEMCPY entry point.

   Register roles, as used by the instructions below:
     r3  = destination pointer (its negation gives the distance to the
           next 16-byte destination boundary)
     r4  = source pointer (target of the dcbt prefetches)
     r5  = size in bytes
     r10 = number of 128-byte cache lines to copy (r5 >> 7), capped at
           PREFETCH_AHEAD when there are more lines than that
     r11 = number of cache lines beyond the first PREFETCH_AHEAD; later
           reloaded with the DCBZ distance
     r12 = source prefetch distance used by dcbt in .Lloop

   Stages, in the order their labels appear:
     - destination alignment steps of 1, 2, 4 and 8 bytes
     - .Lcachelinealigned / .Lloop: whole cache lines, prefetching SRC
     - .Lloop2: second aligned-copy loop, entered after r10 is rescaled
       from a 128-byte to a 32-byte stride
     - .Llessthancacheline: 16-byte chunks (r5 >> 4)
     - .Ldo_lt16 / .Lshortcopy: tail of 0-15 bytes in 8/4/2/1-byte steps  */
46 ENTRY_TOCLESS (MEMCPY, 5)
49 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
50 cmpldi cr1,r5,16 /* cr1 = unsigned compare of size with 16 (is size < 16 ?) */
55 neg r8,r3 /* r8 = -dest; low 4 bits = # bytes to next 16-byte dest bdry */
56 clrldi r8,r8,64-4 /* keep only the low 4 bits: 0..15 bytes to 16-byte boundary */
62 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
/* Alignment steps: one load per power-of-two size.  Presumably each step
   is selected by one bit of cr7 -- TODO confirm against the branches
   around these loads.  */
66 lbzx r0,r7,r6 /* copy 1 byte */
70 lhzx r0,r7,r6 /* copy 2 byte */
74 lwzx r0,r7,r6 /* copy 4 byte */
78 ldx r0,r7,r6 /* copy 8 byte */
89 addi r6,r6,-8 /* bias r6 by -8: prepare for stdu */
90 addi r4,r4,-8 /* bias r4 by -8: prepare for ldu */
92 clrldi r7,r7,64-7 /* keep low 7 bits of r7: value modulo the 128-byte cache line */
93 ble+ cr5,.Llessthancacheline
97 srdi r7,r7,4 /* r7 = r7 / 16: number of 16-byte chunks */
98 srdi r10,r5,7 /* r10 = size / 128: number of cache lines to copy */
101 li r11,0 /* number of cachelines to copy with prefetch */
102 beq .Lnocacheprefetch
/* If there are more than PREFETCH_AHEAD cache lines, split the count:
   r11 = r10 - PREFETCH_AHEAD and r10 = PREFETCH_AHEAD.  Otherwise r10 is
   left unchanged and r11 stays 0.  */
104 cmpldi r10,PREFETCH_AHEAD
105 li r12,128+8 /* prefetch distance: 128 + 8 (the +8 presumably offsets the -8 bias on r4 -- confirm) */
106 ble .Llessthanmaxprefetch
108 subi r11,r10,PREFETCH_AHEAD
109 li r10,PREFETCH_AHEAD
111 .Llessthanmaxprefetch:
123 beq cr6,.Lcachelinealigned
130 bdnz .Laligntocacheline
133 .Lcachelinealigned: /* copy while cache lines */
135 blt- cr1,.Llessthancacheline /* size <128 */
142 li r11,128*ZERO_AHEAD +8 /* DCBZ distance: ZERO_AHEAD cache lines of 128 bytes, plus 8 */
145 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
146 .Lloop: /* Copy aligned body */
147 dcbt r12,r4 /* PREFETCH SOURCE at r4 + r12, some cache lines ahead */
150 ld r7, 0x10(r4) /* 4 register stride copy is optimal */
151 ld r8, 0x18(r4) /* to hide 1st level cache latency. */
186 sldi r10,r10,2 /* r10 *= 4: adjust from 128 to 32 byte stride */
190 .Lloop2: /* Copy aligned body */
203 .Llessthancacheline: /* less than a cache line left to do ? */
205 srdi r7,r5,4 /* r7 = r5 / 16: number of 16-byte chunks */
214 bdnz .Lcopy_remaining
216 .Ldo_lt16: /* less than 16 ? */
217 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
218 beqlr+ /* return if r5 == 0: no rest to copy */
222 .Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
226 ldx r0,r7,r6 /* copy 8 byte */
231 lwzx r0,r7,r6 /* copy 4 byte */
236 lhzx r0,r7,r6 /* copy 2 byte */
241 lbzx r0,r7,r6 /* copy 1 byte */
245 END_GEN_TB (MEMCPY,TB_TOCLESS)
246 libc_hidden_builtin_def (memcpy)