/* Copyright (C) 2011-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <arch/chip.h>
#include <sysdep.h>

        .text
ENTRY (__memcpy)
        FEEDBACK_ENTER(__memcpy)

        /* r0 is the dest, r1 is the source, r2 is the size. */

        /* Save aside original dest so we can return it at the end. */
        { sw sp, lr; move r23, r0; or r4, r0, r1 }
        cfi_offset (lr, 0)
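        /* (Since r4 = r0 | r1, the single "andi r4, r4, 3" below tests
           the low bits of source and dest at once: it is nonzero iff
           either pointer is not word-aligned.) */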

        /* Check for an empty size. */
        { bz r2, .Ldone; andi r4, r4, 3 }

        /* Check for an unaligned source or dest. */
        { bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }

.Lcheck_aligned_copy_size:
        /* If we are copying < 256 bytes, branch to simple case. */
        { blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }

        /* Copying >= 256 bytes, so jump to complex prefetching loop. */
        { andi r6, r1, 63; j .Lcopy_many }
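        /* (The "& 63" masks and the wh64 below assume TILEPro's 64-byte
           cache lines; r6 = r1 & 63 is the source's offset within its
           line, consumed at .Lcopy_many.) */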

/* Aligned 4 byte at a time copy loop. */

.Lcopy_8_loop:
        /* Copy two words at a time to hide load latency. */
        { lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
        { lw r4, r1; addi r1, r1, 4 }
        { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
        { sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_8_check:
        { bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }

        /* Copy odd leftover word, if any. */
        { bnzt r4, .Lcheck_odd_stragglers }
        { lw r3, r1; addi r1, r1, 4 }
        { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }

.Lcheck_odd_stragglers:
        { bnz r2, .Lcopy_unaligned_few }

.Ldone:
        { move r0, r23; jrp lr }

/* Prefetching multiple cache line copy handler (for large transfers). */

        /* Copy words until r1 is cache-line-aligned. */
.Lalign_loop:
        { lw r3, r1; addi r1, r1, 4 }
        { andi r6, r1, 63 }
        { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_many:
        { bnzt r6, .Lalign_loop; addi r9, r0, 63 }

        { addi r3, r1, 60; andi r9, r9, -64 }

        /* No need to prefetch dst; we'll just do the wh64
           right before we copy a line. */
        { lw r5, r3; addi r3, r3, 64; movei r4, 1 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, .; move r27, lr }
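        /* (These "bnzt zero, ." bundles are never-taken branches whose
           static hint says "taken", so each costs a few squashed pipeline
           cycles; the idle time spaces out the three prefetching loads so
           they do not contend for the L2.) */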
        { lw r6, r3; addi r3, r3, 64 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bnzt zero, . }
        { lw r7, r3; addi r3, r3, 64 }
        /* Intentionally stall for a few cycles to leave L2 cache alone. */
        { bz zero, .Lbig_loop2 }

        /* On entry to this loop:
           - r0 points to the start of dst line 0.
           - r1 points to the start of src line 0.
           - r2 >= (256 - 60), only the first time the loop trips.
           - r3 contains r1 + 128 + 60 [pointer to the end of source
             line 2]. This is our prefetch address; when we get near the
             end, rather than prefetching off the end, it is redirected
             to point to some "safe" recently loaded address.
           - r5 contains *(r1 + 60) [i.e. the last word of source line 0].
           - r6 contains *(r1 + 64 + 60) [i.e. the last word of source line 1].
           - r9 contains ((r0 + 63) & -64)
             [the start of the next dst cache line]. */
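        /* (Sketch of the steady state, inferred from the code below:
           each .Lbig_loop trip copies three 64-byte lines via
           .Lcopy_line, keeping three prefetches in flight roughly three
           lines ahead of the copy and issuing wh64 on each destination
           line just before writing it.) */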

.Lbig_loop:
        { jal .Lcopy_line2; add r15, r1, r2 }

.Lbig_loop2:
        /* Copy line 0, first stalling until r5 is ready. */
        { move r12, r5; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
        { lw r5, r3; addi r3, r3, 64 }
        { jal .Lcopy_line }

        /* Copy line 1, first stalling until r6 is ready. */
        { move r12, r6; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
        { lw r6, r3; addi r3, r3, 64 }
        { jal .Lcopy_line }

        /* Copy line 2, first stalling until r7 is ready. */
        { move r12, r7; lw r16, r1 }
        { bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
        /* Prefetch several lines ahead. */
        { lw r7, r3; addi r3, r3, 64 }
        /* Use up a caches-busy cycle by jumping back to the top of the
           loop. Might as well get it out of the way now. */
        { j .Lbig_loop }


        /* On entry:
           - r0 points to the destination line.
           - r1 points to the source line.
           - r3 is the next prefetch address.
           - r9 holds the last address used for wh64.
           - r12 = WORD_15.
           - r16 = WORD_0.
           - r17 = r1 + 16.
           - r27 holds the saved lr to restore.

           On exit:
           - r0 is incremented by 64.
           - r1 is incremented by 64, unless that would point to a word
             beyond the end of the source array, in which case it is
             redirected to point to an arbitrary word already in the cache.
           - r2 is decremented by 64.
           - r3 is unchanged, unless it points to a word beyond the
             end of the source array, in which case it is redirected
             to point to an arbitrary word already in the cache.
             Redirecting is OK, since if we are that close to the end
             of the array we will not come back to this subroutine
             and use the contents of the prefetched address.
           - r4 is nonzero iff r2 >= 64.
           - r9 is incremented by 64, unless it points beyond the
             end of the last full destination cache line, in which
             case it is redirected to a "safe" address that can be
             clobbered (sp - 64).
           - lr contains the value in r27. */

        /* r26 unused */

.Lcopy_line:
        /* TODO: when r3 goes past the end, we would like to redirect it
           to prefetch the last partial cache line (if any) just once, for
           the benefit of the final cleanup loop. But we don't want to
           prefetch that line more than once, or subsequent prefetches
           will go into the RTF. But then .Lbig_loop should unconditionally
           branch to the top of the loop to execute a final prefetch, and
           its nop should become a conditional branch. */

        /* We need two non-memory cycles here to cover the resources
           used by the loads initiated by the caller. */
        { add r15, r1, r2 }
.Lcopy_line2:
        { slt_u r13, r3, r15; addi r17, r1, 16 }

        /* NOTE: this will stall for one cycle as L1 is busy. */

        /* Fill second L1D line. */
        { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */
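        /* (r13 = (r3 < r15) tests whether the prefetch address is still
           inside the source; mvz writes its third operand when its second
           is zero, so "mvz r3, r13, r1" redirects r3 to r1, a word already
           in the cache, once the prefetch would run past the end. This is
           the redirect described in the contract above.) */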
182 | ||
183 | /* Prepare destination line for writing. */ | |
184 | { wh64 r9; addi r9, r9, 64 } | |
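        /* (wh64 is a "write hint": it establishes the 64-byte line in
           the cache without fetching its old contents from memory, which
           is safe here because every byte of the line is about to be
           overwritten.) */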
185 | ||
186 | /* Load seven words that are L1D hits to cover wh64 L2 usage. */ | |
187 | ||
188 | /* Load the three remaining words from the last L1D line, which | |
189 | we know has already filled the L1D. */ | |
190 | { lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 } /* r4 = WORD_12 */ | |
191 | { lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }/* r8 = WORD_13 */ | |
192 | { lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 } /* r11 = WORD_14 */ | |
193 | ||
194 | /* Load the three remaining words from the first L1D line, first | |
195 | stalling until it has filled by "looking at" r16. */ | |
196 | { lw r13, r1; addi r1, r1, 4; move zero, r16 } /* r13 = WORD_1 */ | |
197 | { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_2 */ | |
198 | { lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 } /* r15 = WORD_3 */ | |
199 | ||
200 | /* Load second word from the second L1D line, first | |
201 | stalling until it has filled by "looking at" r17. */ | |
202 | { lw r19, r1; addi r1, r1, 4; move zero, r17 } /* r19 = WORD_5 */ | |
203 | ||
204 | /* Store last word to the destination line, potentially dirtying it | |
205 | for the first time, which keeps the L2 busy for two cycles. */ | |
206 | { sw r10, r12 } /* store(WORD_15) */ | |
207 | ||
208 | /* Use two L1D hits to cover the sw L2 access above. */ | |
209 | { lw r10, r1; addi r1, r1, 4 } /* r10 = WORD_6 */ | |
210 | { lw r12, r1; addi r1, r1, 4 } /* r12 = WORD_7 */ | |
211 | ||
212 | /* Fill third L1D line. */ | |
213 | { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ | |
214 | ||
215 | /* Store first L1D line. */ | |
216 | { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ | |
217 | { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ | |
218 | { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ | |
219 | { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ | |
220 | ||
221 | /* Store second L1D line. */ | |
222 | { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ | |
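        /* (Interleaved with the stores above: r16 becomes (r0 + r2) & -64,
           the copy's end rounded down to a line boundary, and then the
           flag (r9 < r16). When that flag is zero, the mvz parks r9 at
           sp - 64, a line that is safe to clobber with wh64, per the exit
           conditions above.) */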
        { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */
        { sw r0, r10; addi r0, r0, 4 } /* store(WORD_6) */
        { sw r0, r12; addi r0, r0, 4 } /* store(WORD_7) */

        { lw r13, r1; addi r1, r1, 4; move zero, r18 } /* r13 = WORD_9 */
        { lw r14, r1; addi r1, r1, 4 } /* r14 = WORD_10 */
        { lw r15, r1; move r1, r20 } /* r15 = WORD_11 */

        /* Store third L1D line. */
        { sw r0, r18; addi r0, r0, 4 } /* store(WORD_8) */
        { sw r0, r13; addi r0, r0, 4 } /* store(WORD_9) */
        { sw r0, r14; addi r0, r0, 4 } /* store(WORD_10) */
        { sw r0, r15; addi r0, r0, 4 } /* store(WORD_11) */

        /* Store rest of fourth L1D line. */
        { sw r0, r4; addi r0, r0, 4 } /* store(WORD_12) */
        {
          sw r0, r8 /* store(WORD_13) */
          addi r0, r0, 4
          /* Will r2 be >= 64 after we subtract 64 below? */
          shri r4, r2, 7
        }
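        /* (r2 >> 7 is nonzero exactly when r2 >= 128, i.e. when at least
           64 bytes will remain after the subtraction below, matching the
           exit condition "r4 is nonzero iff r2 >= 64".) */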
        {
          sw r0, r11 /* store(WORD_14) */
          addi r0, r0, 8
          /* Record 64 bytes successfully copied. */
          addi r2, r2, -64
        }

        { jrp lr; move lr, r27 }

        /* Convey to the backtrace library that the stack frame is
           size zero, and the real return address is on the stack
           rather than in 'lr'. */
        { info 8 }

        .align 64
.Lcopy_unaligned_maybe_many:
        /* Skip the setup overhead if we aren't copying many bytes. */
        { slti_u r8, r2, 20; sub r4, zero, r0 }
        { bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
        { bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }
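        /* (r4 = (-r0) & 3 is the number of single bytes needed to bring
           the destination up to word alignment, and r18 = r1 + r2 points
           one byte past the end of the source.) */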

/* Unaligned 4 byte at a time copy handler. */

        /* Copy single bytes until r0 == 0 mod 4, so we can store words. */
.Lalign_dest_loop:
        { lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
        { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }

        /* If source and dest are now *both* aligned, do an aligned copy. */
        { bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }

.Ldest_is_word_aligned:

        { andi r8, r0, 63; lwadd_na r6, r1, 4 }
        { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }
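        /* (lwadd_na is the "no alignment" load-with-post-increment: it
           ignores the low bits of the address and loads the enclosing
           aligned word. The dword_align ops below then shift consecutive
           aligned words against each other to reconstruct the unaligned
           source stream.) */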

        /* This copies unaligned words until either there are fewer
           than 4 bytes left to copy, or until the destination pointer
           is cache-aligned, whichever comes first.

           On entry:
           - r0 is the next store address.
           - r1 points 4 bytes past the load address corresponding to r0.
           - r2 >= 4.
           - r6 is the next aligned word loaded. */
.Lcopy_unaligned_src_words:
        { lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
        /* stall */
        { dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
        { swadd r0, r6, 4; addi r2, r2, -4 }
        { bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
        { bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

        /* On entry:
           - r0 is the next store address.
           - r1 points 4 bytes past the load address corresponding to r0.
           - r2 >= 4 (# of bytes left to store).
           - r6 is the next aligned src word value.
           - r9 = (r2 < 64U).
           - r18 points one byte past the end of source memory. */
.Ldest_is_L2_line_aligned:

        {
          /* Less than a full cache line remains. */
          bnz r9, .Lcleanup_unaligned_words
          move r7, r6
        }

        /* r2 >= 64 */

        /* Kick off two prefetches, but don't go past the end. */
        { addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
        { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
        { mvz r3, r8, r1; addi r8, r3, 64 }
        { prefetch r3; move r3, r8; slt_u r8, r8, r18 }
        { mvz r3, r8, r1; movei r17, 0 }

.Lcopy_unaligned_line:
        /* Prefetch another line. */
        { prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
        /* Fire off a load of the last word we are about to copy. */
        { lw_na r15, r15; slt_u r8, r3, r18 }

        { mvz r3, r8, r1; wh64 r0 }

        /* This loop runs twice.

           On entry:
           - r17 is even before the first iteration, and odd before
             the second. It is incremented inside the loop. Encountering
             an even value at the end of the loop makes it stop. */
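        /* (bbst branches when the low bit of r17 is set, with a "taken"
           hint: r17 goes from 0 to 1 on the first pass, so the branch is
           taken exactly once and falls through on the second pass, giving
           two trips of 32 bytes each.) */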
.Lcopy_half_an_unaligned_line:
        {
          /* Stall until the last byte is ready. In the steady state this
             guarantees all words to load below will be in the L2 cache,
             which avoids shunting the loads to the RTF. */
          move zero, r15
          lwadd_na r7, r1, 16
        }
        { lwadd_na r11, r1, 12 }
        { lwadd_na r14, r1, -24 }
        { lwadd_na r8, r1, 4 }
        { lwadd_na r9, r1, 4 }
        {
          lwadd_na r10, r1, 8
          /* r16 = (r2 < 64), after we subtract 32 from r2 below. */
          slti_u r16, r2, 64 + 32
        }
        { lwadd_na r12, r1, 4; addi r17, r17, 1 }
        { lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
        { swadd r0, r6, 4; dword_align r7, r8, r1 }
        { swadd r0, r7, 4; dword_align r8, r9, r1 }
        { swadd r0, r8, 4; dword_align r9, r10, r1 }
        { swadd r0, r9, 4; dword_align r10, r11, r1 }
        { swadd r0, r10, 4; dword_align r11, r12, r1 }
        { swadd r0, r11, 4; dword_align r12, r13, r1 }
        { swadd r0, r12, 4; dword_align r13, r14, r1 }
        { swadd r0, r13, 4; addi r2, r2, -32 }
        { move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }

        { bzt r16, .Lcopy_unaligned_line; move r7, r6 }

        /* On entry:
           - r0 is the next store address.
           - r1 points 4 bytes past the load address corresponding to r0.
           - r2 >= 0 (# of bytes left to store).
           - r7 is the next aligned src word value. */
.Lcleanup_unaligned_words:
        /* Handle any trailing bytes. */
        { bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
        { bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

        /* Move r1 back to the point where it corresponds to r0. */
        { addi r1, r1, -4 }

        /* Fall through */

/* 1 byte at a time copy handler. */

.Lcopy_unaligned_few:
        { lb_u r3, r1; addi r1, r1, 1 }
        { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
        { bnzt r2, .Lcopy_unaligned_few }

.Lcopy_unaligned_done:

        { move r0, r23; jrp lr }

END (__memcpy)

weak_alias (__memcpy, memcpy)
libc_hidden_builtin_def (memcpy)