/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

#define PREFETCH_AHEAD 6	/* number of cache lines to prefetch ahead of SRC  */
#define ZERO_AHEAD 4		/* number of cache lines to dcbz ahead of DST  */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * void *memcpy (void *dst [r3], const void *src [r4], size_t n [r5])
 * Returns the original DST pointer (r3 is never modified below).
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 *
 * Register usage throughout:
 *   r3        = original DST, preserved as the return value
 *   r4        = current SRC pointer
 *   r5        = bytes remaining
 *   r6        = current DST pointer
 *   r7        = SRC-DST delta, then loop counts (scratch)
 *   r8-r12    = scratch: counts, prefetch/dcbz distances, copy data
 *   r0, r9    = copy data
 */

	.align  7

EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* work on r6 so r3 survives as return value  */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	clrldi	r8,r8,64-4	/* align to 16-byte boundary  */
	sub	r7,r4,r3	/* r7 = SRC - DST, constant for the whole copy  */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

/* Copy 1/2/4/8 bytes (per the bits of r8 loaded into cr7) until DST is
   16-byte aligned.  SRC is always addressed as r7+r6 (= old r4 position).  */
.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* resynchronize SRC with the advanced DST  */

.Ldst_aligned:

	cmpdi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu  */
	addi	r4,r4,-8	/* prepare for ldu  */

	clrldi	r7,r7,64-7	/* align to cacheline boundary  */
	ble+	cr5,.Llessthancacheline

	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16  */
	srdi	r10,r5,7	/* number of cache lines to copy  */

	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: split the count so r10 lines warm
	   up the prefetch window and r11 lines run with in-loop prefetch.  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

/* Touch the first r10 SRC cache lines ahead of the copy loop.  */
.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7
	cmpldi	cr1,r5,128
	clrldi	r5,r5,64-7	/* r5 = bytes left over after whole cachelines  */
	beq	cr6,.Lcachelinealigned

/* Copy 16 bytes per iteration until DST reaches a 128-byte line boundary.  */
.Laligntocacheline:
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128  */

.Louterloop:
	cmpdi	r11,0
	mtctr	r11
	beq-	.Lendloop

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist  */

	.align	4
/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
.Lloop:				/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6		/* establish DST line in cache w/o fetching it  */
	ld	r7, 0x10(r4)	/* 4 register stride copy is optimal  */
	ld	r8, 0x18(r4)	/* to hide 1st level cache latency.  */
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ldu	r0, 0x80(r4)
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	.Lloop

.Lendloop:
	/* r10 cachelines remain without prefetch (the lines whose dcbt was
	   issued in .LprefetchSRC, or all lines when there were at most
	   PREFETCH_AHEAD of them); copy them 32 bytes per iteration.  */
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride  */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ldu	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than one cache line to do ?  */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16  */
	blt-	.Ldo_lt16
	mtctr	r7

/* Copy 16 bytes per iteration while at least 16 bytes remain.  */
.Lcopy_remaining:
	ld	r8,0x08(r4)
	ldu	r7,0x10(r4)
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8		/* undo the -8 ldu/stdu bias from above  */
	addi	r6,r6,8

/* SIMPLE COPY to handle size =< 15 bytes: one conditional move per
   power-of-two size, per the low bits of r5 loaded into cr7.  */
.Lshortcopy:
	mtcrf	0x01,r5
	sub	r7,r4,r6	/* r7 = SRC - DST; SRC addressed as r7+r6  */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:	blr

END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
libc_hidden_builtin_def (memcpy)