/* Optimized memcpy implementation for PowerPC32 on POWER6.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

f17a4233 | 21 | /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]); |
04067002 UD |
22 | Returns 'dst'. |
23 | ||
9c84384c | 24 | Memcpy handles short copies (< 32-bytes) using a binary move blocks |
25bfbb9e RA |
25 | (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled |
26 | with the appropriate combination of byte and halfword load/stores. | |
27 | There is minimal effort to optimize the alignment of short moves. | |
04067002 UD |
28 | |
29 | Longer moves (>= 32-bytes) justify the effort to get at least the | |
30 | destination word (4-byte) aligned. Further optimization is | |
31 | possible when both source and destination are word aligned. | |
32 | Each case has an optimized unrolled loop. */ | |
33 | ||
a88f47a7 | 34 | .machine power6 |
b5510883 | 35 | EALIGN (memcpy, 5, 0) |
04067002 UD |
36 | CALL_MCOUNT |
37 | ||
38 | stwu 1,-32(1) | |
39 | cfi_adjust_cfa_offset(32) | |
40 | cmplwi cr1,5,31 /* check for short move. */ | |
41 | neg 0,3 | |
42 | cmplwi cr1,5,31 | |
43 | clrlwi 10,4,30 /* check alignment of src. */ | |
44 | andi. 11,3,3 /* check alignment of dst. */ | |
45 | clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */ | |
46 | ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */ | |
47 | cmplw cr6,10,11 | |
48 | stw 31,24(1) | |
04067002 | 49 | stw 30,20(1) |
869d7180 | 50 | cfi_offset(31,(24-32)) |
04067002 UD |
51 | cfi_offset(30,(20-32)) |
52 | mr 30,3 | |
53 | beq .L0 | |
54 | mtcrf 0x01,0 | |
55 | subf 31,0,5 /* Length after alignment. */ | |
56 | add 12,4,0 /* Compute src addr after alignment. */ | |
57 | /* Move 0-3 bytes as needed to get the destination word aligned. */ | |
58 | 1: bf 31,2f | |
59 | lbz 6,0(4) | |
60 | bf 30,3f | |
61 | lhz 7,1(4) | |
62 | stb 6,0(3) | |
63 | sth 7,1(3) | |
64 | addi 3,3,3 | |
65 | b 0f | |
66 | 3: | |
67 | stb 6,0(3) | |
68 | addi 3,3,1 | |
69 | b 0f | |
70 | 2: bf 30,0f | |
71 | lhz 6,0(4) | |
72 | sth 6,0(3) | |
73 | addi 3,3,2 | |
74 | 0: | |
75 | clrlwi 10,12,30 /* check alignment of src again. */ | |
76 | srwi 9,31,2 /* Number of full words remaining. */ | |
77 | bne- cr6,L(wdu) /* If source is not word aligned. .L6 */ | |
78 | clrlwi 11,31,30 /* calculate the number of tail bytes */ | |
79 | b L(word_aligned) | |
25bfbb9e | 80 | /* Copy words from source to destination, assuming the destination is |
04067002 UD |
81 | aligned on a word boundary. |
82 | ||
83 | At this point we know there are at least 29 bytes left (32-3) to copy. | |
25bfbb9e | 84 | The next step is to determine if the source is also word aligned. |
04067002 UD |
85 | If not branch to the unaligned move code at .L6. which uses |
86 | a load, shift, store strategy. | |
87 | ||
88 | Otherwise source and destination are word aligned, and we can use | |
89 | the optimized word copy loop. */ | |
90 | .align 4 | |
91 | .L0: | |
92 | mr 31,5 | |
93 | mr 12,4 | |
94 | bne- cr6,L(wdu) /* If source is not word aligned. .L6 */ | |
95 | srwi 9,5,2 /* Number of full words remaining. */ | |
96 | clrlwi 11,5,30 /* calculate the number of tail bytes */ | |
97 | ||
98 | /* Move words where destination and source are word aligned. | |
99 | Use an unrolled loop to copy 4 words (16-bytes) per iteration. | |
ded5b9b7 | 100 | If the copy is not an exact multiple of 16 bytes, 1-3 |
04067002 | 101 | words are copied as needed to set up the main loop. After |
25bfbb9e | 102 | the main loop exits there may be a tail of 1-3 bytes. These bytes are |
04067002 UD |
103 | copied a halfword/byte at a time as needed to preserve alignment. */ |
104 | L(word_aligned): | |
105 | mtcrf 0x01,9 | |
106 | srwi 8,31,4 /* calculate the 16 byte loop count */ | |
107 | cmplwi cr1,9,4 | |
108 | cmplwi cr6,11,0 | |
109 | mr 11,12 | |
110 | ||
111 | bf 30,1f | |
112 | lwz 6,0(12) | |
113 | lwz 7,4(12) | |
114 | addi 11,12,8 | |
115 | mtctr 8 | |
116 | stw 6,0(3) | |
117 | stw 7,4(3) | |
118 | addi 10,3,8 | |
119 | bf 31,4f | |
120 | lwz 0,8(12) | |
25bfbb9e | 121 | stw 0,8(3) |
04067002 UD |
122 | blt cr1,3f |
123 | addi 11,12,12 | |
124 | addi 10,3,12 | |
125 | b 4f | |
126 | .align 4 | |
127 | 1: | |
128 | mr 10,3 | |
129 | mtctr 8 | |
130 | bf 31,4f | |
131 | lwz 6,0(12) | |
132 | addi 11,12,4 | |
133 | stw 6,0(3) | |
134 | addi 10,3,4 | |
25bfbb9e | 135 | |
04067002 UD |
136 | .align 4 |
137 | 4: | |
138 | lwz 6,0(11) | |
139 | lwz 7,4(11) | |
140 | lwz 8,8(11) | |
141 | lwz 0,12(11) | |
142 | stw 6,0(10) | |
143 | stw 7,4(10) | |
144 | stw 8,8(10) | |
145 | stw 0,12(10) | |
146 | addi 11,11,16 | |
147 | addi 10,10,16 | |
148 | bdnz 4b | |
25bfbb9e | 149 | 3: |
04067002 UD |
150 | clrrwi 0,31,2 |
151 | mtcrf 0x01,31 | |
152 | beq cr6,0f | |
153 | .L9: | |
154 | add 3,3,0 | |
155 | add 12,12,0 | |
25bfbb9e | 156 | |
04067002 UD |
157 | /* At this point we have a tail of 0-3 bytes and we know that the |
158 | destination is word aligned. */ | |
159 | 2: bf 30,1f | |
160 | lhz 6,0(12) | |
161 | addi 12,12,2 | |
162 | sth 6,0(3) | |
163 | addi 3,3,2 | |
164 | 1: bf 31,0f | |
165 | lbz 6,0(12) | |
166 | stb 6,0(3) | |
167 | 0: | |
168 | /* Return original dst pointer. */ | |
169 | mr 3,30 | |
170 | lwz 30,20(1) | |
171 | lwz 31,24(1) | |
172 | addi 1,1,32 | |
173 | blr | |
174 | ||
25bfbb9e | 175 | /* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 |
04067002 UD |
176 | bytes. Each case is handled without loops, using binary (1,2,4,8) |
177 | tests. | |
178 | ||
179 | In the short (0-8 byte) case no attempt is made to force alignment | |
180 | of either source or destination. The hardware will handle the | |
181 | unaligned load/stores with small delays for crossing 32- 128-byte, | |
182 | and 4096-byte boundaries. Since these short moves are unlikely to be | |
183 | unaligned or cross these boundaries, the overhead to force | |
184 | alignment is not justified. | |
185 | ||
186 | The longer (9-31 byte) move is more likely to cross 32- or 128-byte | |
187 | boundaries. Since only loads are sensitive to the 32-/128-byte | |
188 | boundaries it is more important to align the source then the | |
189 | destination. If the source is not already word aligned, we first | |
190 | move 1-3 bytes as needed. Since we are only word aligned we don't | |
191 | use double word load/stores to insure that all loads are aligned. | |
192 | While the destination and stores may still be unaligned, this | |
193 | is only an issue for page (4096 byte boundary) crossing, which | |
194 | should be rare for these short moves. The hardware handles this | |
195 | case automatically with a small (~20 cycle) delay. */ | |
196 | .align 4 | |
197 | ||
198 | cfi_same_value (31) | |
199 | cfi_same_value (30) | |
200 | L(word_unaligned_short): | |
201 | mtcrf 0x01,5 | |
202 | cmplwi cr6,5,8 | |
203 | neg 8,4 | |
204 | clrrwi 9,4,2 | |
205 | andi. 0,8,3 | |
206 | beq cr6,L(wus_8) /* Handle moves of 8 bytes. */ | |
207 | /* At least 9 bytes left. Get the source word aligned. */ | |
25bfbb9e | 208 | cmplwi cr1,5,16 |
04067002 UD |
209 | mr 12,4 |
210 | ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */ | |
211 | mr 11,3 | |
212 | mr 10,5 | |
213 | cmplwi cr6,0,2 | |
214 | beq L(wus_tail) /* If the source is already word aligned skip this. */ | |
215 | /* Copy 1-3 bytes to get source address word aligned. */ | |
216 | lwz 6,0(9) | |
217 | subf 10,0,5 | |
218 | add 12,4,0 | |
219 | blt cr6,5f | |
a050d2a5 | 220 | srwi 7,6,16 |
04067002 | 221 | bgt cr6,3f |
759cfef3 AM |
222 | #ifdef __LITTLE_ENDIAN__ |
223 | sth 7,0(3) | |
224 | #else | |
04067002 | 225 | sth 6,0(3) |
759cfef3 | 226 | #endif |
04067002 UD |
227 | b 7f |
228 | .align 4 | |
229 | 3: | |
759cfef3 AM |
230 | #ifdef __LITTLE_ENDIAN__ |
231 | rotlwi 6,6,24 | |
232 | stb 6,0(3) | |
233 | sth 7,1(3) | |
234 | #else | |
04067002 UD |
235 | stb 7,0(3) |
236 | sth 6,1(3) | |
759cfef3 | 237 | #endif |
04067002 UD |
238 | b 7f |
239 | .align 4 | |
240 | 5: | |
759cfef3 AM |
241 | #ifdef __LITTLE_ENDIAN__ |
242 | rotlwi 6,6,8 | |
243 | #endif | |
04067002 UD |
244 | stb 6,0(3) |
245 | 7: | |
246 | cmplwi cr1,10,16 | |
247 | add 11,3,0 | |
248 | mtcrf 0x01,10 | |
249 | .align 4 | |
250 | L(wus_tail): | |
251 | /* At least 6 bytes left and the source is word aligned. This allows | |
252 | some speculative loads up front. */ | |
253 | /* We need to special case the fall-through because the biggest delays | |
25bfbb9e | 254 | are due to address computation not being ready in time for the |
04067002 UD |
255 | AGEN. */ |
256 | lwz 6,0(12) | |
257 | lwz 7,4(12) | |
258 | blt cr1,L(wus_tail8) | |
259 | cmplwi cr0,10,24 | |
260 | L(wus_tail16): /* Move 16 bytes. */ | |
261 | stw 6,0(11) | |
262 | stw 7,4(11) | |
263 | lwz 6,8(12) | |
264 | lwz 7,12(12) | |
265 | stw 6,8(11) | |
266 | stw 7,12(11) | |
267 | /* Move 8 bytes more. */ | |
268 | bf 28,L(wus_tail16p8) | |
269 | cmplwi cr1,10,28 | |
270 | lwz 6,16(12) | |
271 | lwz 7,20(12) | |
272 | stw 6,16(11) | |
273 | stw 7,20(11) | |
274 | /* Move 4 bytes more. */ | |
275 | bf 29,L(wus_tail16p4) | |
276 | lwz 6,24(12) | |
277 | stw 6,24(11) | |
278 | addi 12,12,28 | |
279 | addi 11,11,28 | |
280 | bgt cr1,L(wus_tail2) | |
281 | /* exactly 28 bytes. Return original dst pointer and exit. */ | |
282 | addi 1,1,32 | |
283 | blr | |
284 | .align 4 | |
f24a6d08 | 285 | L(wus_tail16p8): /* less than 8 bytes left. */ |
04067002 UD |
286 | beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */ |
287 | cmplwi cr1,10,20 | |
288 | bf 29,L(wus_tail16p2) | |
289 | /* Move 4 bytes more. */ | |
290 | lwz 6,16(12) | |
291 | stw 6,16(11) | |
292 | addi 12,12,20 | |
293 | addi 11,11,20 | |
294 | bgt cr1,L(wus_tail2) | |
295 | /* exactly 20 bytes. Return original dst pointer and exit. */ | |
296 | addi 1,1,32 | |
297 | blr | |
298 | .align 4 | |
f24a6d08 | 299 | L(wus_tail16p4): /* less than 4 bytes left. */ |
04067002 UD |
300 | addi 12,12,24 |
301 | addi 11,11,24 | |
302 | bgt cr0,L(wus_tail2) | |
303 | /* exactly 24 bytes. Return original dst pointer and exit. */ | |
304 | addi 1,1,32 | |
305 | blr | |
306 | .align 4 | |
f24a6d08 | 307 | L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */ |
04067002 UD |
308 | addi 12,12,16 |
309 | addi 11,11,16 | |
310 | b L(wus_tail2) | |
311 | ||
312 | .align 4 | |
313 | L(wus_tail8): /* Move 8 bytes. */ | |
314 | /* r6, r7 already loaded speculatively. */ | |
315 | cmplwi cr1,10,8 | |
316 | cmplwi cr0,10,12 | |
317 | bf 28,L(wus_tail4) | |
318 | stw 6,0(11) | |
319 | stw 7,4(11) | |
320 | /* Move 4 bytes more. */ | |
321 | bf 29,L(wus_tail8p4) | |
322 | lwz 6,8(12) | |
323 | stw 6,8(11) | |
324 | addi 12,12,12 | |
325 | addi 11,11,12 | |
326 | bgt cr0,L(wus_tail2) | |
327 | /* exactly 12 bytes. Return original dst pointer and exit. */ | |
328 | addi 1,1,32 | |
329 | blr | |
330 | .align 4 | |
f24a6d08 | 331 | L(wus_tail8p4): /* less than 4 bytes left. */ |
04067002 UD |
332 | addi 12,12,8 |
333 | addi 11,11,8 | |
334 | bgt cr1,L(wus_tail2) | |
335 | /* exactly 8 bytes. Return original dst pointer and exit. */ | |
336 | addi 1,1,32 | |
337 | blr | |
338 | ||
339 | .align 4 | |
340 | L(wus_tail4): /* Move 4 bytes. */ | |
341 | /* r6 already loaded speculatively. If we are here we know there is | |
f24a6d08 | 342 | more than 4 bytes left. So there is no need to test. */ |
04067002 UD |
343 | addi 12,12,4 |
344 | stw 6,0(11) | |
345 | addi 11,11,4 | |
346 | L(wus_tail2): /* Move 2-3 bytes. */ | |
347 | bf 30,L(wus_tail1) | |
348 | lhz 6,0(12) | |
25bfbb9e | 349 | sth 6,0(11) |
04067002 UD |
350 | bf 31,L(wus_tailX) |
351 | lbz 7,2(12) | |
352 | stb 7,2(11) | |
353 | addi 1,1,32 | |
354 | blr | |
355 | L(wus_tail1): /* Move 1 byte. */ | |
356 | bf 31,L(wus_tailX) | |
357 | lbz 6,0(12) | |
358 | stb 6,0(11) | |
359 | L(wus_tailX): | |
360 | /* Return original dst pointer. */ | |
361 | addi 1,1,32 | |
362 | blr | |
363 | ||
364 | /* Special case to copy 0-8 bytes. */ | |
365 | .align 4 | |
366 | L(wus_8): | |
367 | lwz 6,0(4) | |
368 | lwz 7,4(4) | |
369 | stw 6,0(3) | |
370 | stw 7,4(3) | |
371 | /* Return original dst pointer. */ | |
372 | addi 1,1,32 | |
373 | blr | |
374 | .align 4 | |
375 | L(wus_4): | |
376 | bf 29,L(wus_2) | |
377 | lwz 6,0(4) | |
378 | stw 6,0(3) | |
379 | bf 30,L(wus_5) | |
380 | lhz 7,4(4) | |
25bfbb9e | 381 | sth 7,4(3) |
04067002 UD |
382 | bf 31,L(wus_0) |
383 | lbz 8,6(4) | |
384 | stb 8,6(3) | |
385 | addi 1,1,32 | |
386 | blr | |
387 | .align 4 | |
388 | L(wus_5): | |
389 | bf 31,L(wus_0) | |
390 | lbz 6,4(4) | |
391 | stb 6,4(3) | |
392 | /* Return original dst pointer. */ | |
393 | addi 1,1,32 | |
394 | blr | |
395 | .align 4 | |
396 | L(wus_2): /* Move 2-3 bytes. */ | |
397 | bf 30,L(wus_1) | |
398 | lhz 6,0(4) | |
25bfbb9e | 399 | sth 6,0(3) |
04067002 UD |
400 | bf 31,L(wus_0) |
401 | lbz 7,2(4) | |
402 | stb 7,2(3) | |
403 | addi 1,1,32 | |
404 | blr | |
405 | .align 4 | |
406 | L(wus_1): /* Move 1 byte. */ | |
407 | bf 31,L(wus_0) | |
408 | lbz 6,0(4) | |
409 | stb 6,0(3) | |
410 | .align 3 | |
411 | L(wus_0): | |
412 | /* Return original dst pointer. */ | |
413 | addi 1,1,32 | |
414 | blr | |
415 | ||
416 | .align 4 | |
417 | cfi_offset(31,(24-32)) | |
418 | cfi_offset(30,(20-32)) | |
419 | L(wdu): | |
420 | ||
421 | /* Copy words where the destination is aligned but the source is | |
422 | not. For power4, power5 and power6 machines there is penalty for | |
25bfbb9e | 423 | unaligned loads (src) that cross 32-byte, cacheline, or page |
04067002 | 424 | boundaries. So we want to use simple (unaligned) loads where |
2ccdea26 | 425 | possible but avoid them where we know the load would span a 32-byte |
25bfbb9e | 426 | boundary. |
04067002 UD |
427 | |
428 | At this point we know we have at least 29 (32-3) bytes to copy | |
25bfbb9e | 429 | the src is unaligned. and we may cross at least one 32-byte |
2ccdea26 | 430 | boundary. Also we have the following register values: |
04067002 UD |
431 | r3 == adjusted dst, word aligned |
432 | r4 == unadjusted src | |
433 | r5 == unadjusted len | |
434 | r9 == adjusted Word length | |
435 | r10 == src alignment (1-3) | |
2ccdea26 | 436 | r12 == adjusted src, not aligned |
04067002 UD |
437 | r31 == adjusted len |
438 | ||
2ccdea26 | 439 | First we need to copy word up to but not crossing the next 32-byte |
25bfbb9e | 440 | boundary. Then perform aligned loads just before and just after |
2ccdea26 | 441 | the boundary and use shifts and or to generate the next aligned |
f24a6d08 OB |
442 | word for dst. If more than 32 bytes remain we copy (unaligned src) |
443 | the next 7 words and repeat the loop until less than 32-bytes | |
2ccdea26 | 444 | remain. |
04067002 | 445 | |
f24a6d08 | 446 | Then if more than 4 bytes remain we again use aligned loads, |
04067002 UD |
447 | shifts and or to generate the next dst word. We then process the |
448 | remaining words using unaligned loads as needed. Finally we check | |
8220f4f0 | 449 | if there are more than 0 bytes (1-3) bytes remaining and use |
04067002 UD |
450 | halfword and or byte load/stores to complete the copy. |
451 | */ | |
452 | mr 4,12 /* restore unaligned adjusted src ptr */ | |
453 | clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */ | |
454 | slwi 10,10,3 /* calculate number of bits to shift 1st word left */ | |
25bfbb9e | 455 | cmplwi cr5,0,16 |
04067002 UD |
456 | subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */ |
457 | ||
458 | mtcrf 0x01,8 | |
459 | cmplwi cr1,10,16 | |
460 | subfic 9,10,32 /* number of bits to shift 2nd word right */ | |
461 | /* This test is reversed because the timing to compare the bytes to | |
462 | 32-byte boundary could not be meet. So we compare the bytes from | |
463 | previous 32-byte boundary and invert the test. */ | |
464 | bge cr5,L(wdu_h32_8) | |
465 | .align 4 | |
466 | lwz 6,0(4) | |
467 | lwz 7,4(4) | |
468 | addi 12,4,16 /* generate alternate pointers to avoid agen */ | |
469 | addi 11,3,16 /* timing issues downstream. */ | |
470 | stw 6,0(3) | |
471 | stw 7,4(3) | |
472 | subi 31,31,16 | |
473 | lwz 6,8(4) | |
474 | lwz 7,12(4) | |
475 | addi 4,4,16 | |
476 | stw 6,8(3) | |
477 | stw 7,12(3) | |
478 | addi 3,3,16 | |
479 | bf 28,L(wdu_h32_4) | |
480 | lwz 6,0(12) | |
481 | lwz 7,4(12) | |
482 | subi 31,31,8 | |
483 | addi 4,4,8 | |
484 | stw 6,0(11) | |
485 | stw 7,4(11) | |
486 | addi 3,3,8 | |
487 | bf 29,L(wdu_h32_0) | |
488 | lwz 6,8(12) | |
489 | addi 4,4,4 | |
490 | subi 31,31,4 | |
491 | stw 6,8(11) | |
492 | addi 3,3,4 | |
493 | b L(wdu_h32_0) | |
494 | .align 4 | |
495 | L(wdu_h32_8): | |
496 | bf 28,L(wdu_h32_4) | |
497 | lwz 6,0(4) | |
498 | lwz 7,4(4) | |
499 | subi 31,31,8 | |
500 | bf 29,L(wdu_h32_8x) | |
501 | stw 6,0(3) | |
502 | stw 7,4(3) | |
503 | lwz 6,8(4) | |
504 | addi 4,4,12 | |
505 | subi 31,31,4 | |
506 | stw 6,8(3) | |
507 | addi 3,3,12 | |
508 | b L(wdu_h32_0) | |
509 | .align 4 | |
510 | L(wdu_h32_8x): | |
511 | addi 4,4,8 | |
512 | stw 6,0(3) | |
513 | stw 7,4(3) | |
514 | addi 3,3,8 | |
515 | b L(wdu_h32_0) | |
516 | .align 4 | |
517 | L(wdu_h32_4): | |
518 | bf 29,L(wdu_h32_0) | |
519 | lwz 6,0(4) | |
520 | subi 31,31,4 | |
521 | addi 4,4,4 | |
522 | stw 6,0(3) | |
523 | addi 3,3,4 | |
524 | .align 4 | |
525 | L(wdu_h32_0): | |
2ccdea26 | 526 | /* set up for 32-byte boundary crossing word move and possibly 32-byte |
04067002 UD |
527 | move loop. */ |
528 | clrrwi 12,4,2 | |
529 | cmplwi cr5,31,32 | |
530 | bge cr1,L(wdu2_32) | |
531 | #if 0 | |
532 | b L(wdu1_32) | |
533 | /* | |
534 | cmplwi cr1,10,8 | |
535 | beq cr1,L(wdu1_32) | |
536 | cmplwi cr1,10,16 | |
537 | beq cr1,L(wdu2_32) | |
538 | cmplwi cr1,10,24 | |
539 | beq cr1,L(wdu3_32) | |
540 | */ | |
541 | L(wdu_32): | |
542 | lwz 6,0(12) | |
543 | cmplwi cr6,31,4 | |
544 | srwi 8,31,5 /* calculate the 32 byte loop count */ | |
25bfbb9e | 545 | slw 0,6,10 |
04067002 UD |
546 | clrlwi 31,31,27 /* The remaining bytes, < 32. */ |
547 | blt cr5,L(wdu_32tail) | |
548 | mtctr 8 | |
549 | cmplwi cr6,31,4 | |
550 | .align 4 | |
551 | L(wdu_loop32): | |
552 | /* copy 32 bytes at a time */ | |
553 | lwz 8,4(12) | |
554 | addi 12,12,32 | |
555 | lwz 7,4(4) | |
25bfbb9e | 556 | srw 8,8,9 |
04067002 UD |
557 | or 0,0,8 |
558 | stw 0,0(3) | |
559 | stw 7,4(3) | |
560 | lwz 6,8(4) | |
561 | lwz 7,12(4) | |
562 | stw 6,8(3) | |
563 | stw 7,12(3) | |
564 | lwz 6,16(4) | |
565 | lwz 7,20(4) | |
566 | stw 6,16(3) | |
567 | stw 7,20(3) | |
568 | lwz 6,24(4) | |
569 | lwz 7,28(4) | |
570 | lwz 8,0(12) | |
571 | addi 4,4,32 | |
572 | stw 6,24(3) | |
573 | stw 7,28(3) | |
574 | addi 3,3,32 | |
25bfbb9e | 575 | slw 0,8,10 |
04067002 UD |
576 | bdnz+ L(wdu_loop32) |
577 | ||
578 | L(wdu_32tail): | |
579 | mtcrf 0x01,31 | |
580 | cmplwi cr5,31,16 | |
581 | blt cr6,L(wdu_4tail) | |
582 | /* calculate and store the final word */ | |
583 | lwz 8,4(12) | |
25bfbb9e | 584 | srw 8,8,9 |
04067002 UD |
585 | or 6,0,8 |
586 | b L(wdu_32tailx) | |
587 | #endif | |
588 | .align 4 | |
589 | L(wdu1_32): | |
590 | lwz 6,-1(4) | |
591 | cmplwi cr6,31,4 | |
592 | srwi 8,31,5 /* calculate the 32 byte loop count */ | |
759cfef3 AM |
593 | #ifdef __LITTLE_ENDIAN__ |
594 | srwi 6,6,8 | |
595 | #else | |
04067002 | 596 | slwi 6,6,8 |
759cfef3 | 597 | #endif |
04067002 UD |
598 | clrlwi 31,31,27 /* The remaining bytes, < 32. */ |
599 | blt cr5,L(wdu1_32tail) | |
600 | mtctr 8 | |
601 | cmplwi cr6,31,4 | |
602 | ||
603 | lwz 8,3(4) | |
604 | lwz 7,4(4) | |
759cfef3 AM |
605 | #ifdef __LITTLE_ENDIAN__ |
606 | rldimi 6,8,24,32 | |
607 | #else | |
04067002 UD |
608 | /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ |
609 | rlwimi 6,8,8,(32-8),31 | |
759cfef3 | 610 | #endif |
04067002 UD |
611 | b L(wdu1_loop32x) |
612 | .align 4 | |
613 | L(wdu1_loop32): | |
614 | /* copy 32 bytes at a time */ | |
615 | lwz 8,3(4) | |
616 | lwz 7,4(4) | |
617 | stw 10,-8(3) | |
618 | stw 11,-4(3) | |
759cfef3 AM |
619 | #ifdef __LITTLE_ENDIAN__ |
620 | rldimi 6,8,24,32 | |
621 | #else | |
04067002 UD |
622 | /* Equivalent to srwi 8,8,32-8; or 6,6,8 */ |
623 | rlwimi 6,8,8,(32-8),31 | |
759cfef3 | 624 | #endif |
04067002 UD |
625 | L(wdu1_loop32x): |
626 | lwz 10,8(4) | |
627 | lwz 11,12(4) | |
628 | stw 6,0(3) | |
629 | stw 7,4(3) | |
630 | lwz 6,16(4) | |
631 | lwz 7,20(4) | |
632 | stw 10,8(3) | |
633 | stw 11,12(3) | |
634 | lwz 10,24(4) | |
635 | lwz 11,28(4) | |
636 | lwz 8,32-1(4) | |
637 | addi 4,4,32 | |
638 | stw 6,16(3) | |
639 | stw 7,20(3) | |
640 | addi 3,3,32 | |
759cfef3 AM |
641 | #ifdef __LITTLE_ENDIAN__ |
642 | srwi 6,8,8 | |
643 | #else | |
04067002 | 644 | slwi 6,8,8 |
759cfef3 | 645 | #endif |
04067002 UD |
646 | bdnz+ L(wdu1_loop32) |
647 | stw 10,-8(3) | |
648 | stw 11,-4(3) | |
649 | ||
650 | L(wdu1_32tail): | |
651 | mtcrf 0x01,31 | |
652 | cmplwi cr5,31,16 | |
653 | blt cr6,L(wdu_4tail) | |
654 | /* calculate and store the final word */ | |
655 | lwz 8,3(4) | |
759cfef3 AM |
656 | #ifdef __LITTLE_ENDIAN__ |
657 | rldimi 6,8,24,32 | |
658 | #else | |
659 | /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */ | |
04067002 | 660 | rlwimi 6,8,8,(32-8),31 |
759cfef3 | 661 | #endif |
04067002 UD |
662 | b L(wdu_32tailx) |
663 | ||
664 | L(wdu2_32): | |
665 | bgt cr1,L(wdu3_32) | |
666 | lwz 6,-2(4) | |
667 | cmplwi cr6,31,4 | |
668 | srwi 8,31,5 /* calculate the 32 byte loop count */ | |
759cfef3 AM |
669 | #ifdef __LITTLE_ENDIAN__ |
670 | srwi 6,6,16 | |
671 | #else | |
04067002 | 672 | slwi 6,6,16 |
759cfef3 | 673 | #endif |
04067002 UD |
674 | clrlwi 31,31,27 /* The remaining bytes, < 32. */ |
675 | blt cr5,L(wdu2_32tail) | |
676 | mtctr 8 | |
677 | cmplwi cr6,31,4 | |
678 | ||
679 | lwz 8,2(4) | |
680 | lwz 7,4(4) | |
759cfef3 AM |
681 | #ifdef __LITTLE_ENDIAN__ |
682 | rldimi 6,8,16,32 | |
683 | #else | |
04067002 | 684 | rlwimi 6,8,16,(32-16),31 |
759cfef3 | 685 | #endif |
04067002 UD |
686 | b L(wdu2_loop32x) |
687 | .align 4 | |
688 | L(wdu2_loop32): | |
689 | /* copy 32 bytes at a time */ | |
690 | lwz 8,2(4) | |
691 | lwz 7,4(4) | |
692 | stw 10,-8(3) | |
693 | stw 11,-4(3) | |
759cfef3 AM |
694 | #ifdef __LITTLE_ENDIAN__ |
695 | rldimi 6,8,16,32 | |
696 | #else | |
04067002 | 697 | rlwimi 6,8,16,(32-16),31 |
759cfef3 | 698 | #endif |
04067002 UD |
699 | L(wdu2_loop32x): |
700 | lwz 10,8(4) | |
701 | lwz 11,12(4) | |
702 | stw 6,0(3) | |
703 | stw 7,4(3) | |
704 | lwz 6,16(4) | |
705 | lwz 7,20(4) | |
706 | stw 10,8(3) | |
707 | stw 11,12(3) | |
708 | lwz 10,24(4) | |
709 | lwz 11,28(4) | |
710 | /* lwz 8,0(12) */ | |
711 | lwz 8,32-2(4) | |
712 | addi 4,4,32 | |
713 | stw 6,16(3) | |
714 | stw 7,20(3) | |
715 | addi 3,3,32 | |
759cfef3 AM |
716 | #ifdef __LITTLE_ENDIAN__ |
717 | srwi 6,8,16 | |
718 | #else | |
04067002 | 719 | slwi 6,8,16 |
759cfef3 | 720 | #endif |
04067002 UD |
721 | bdnz+ L(wdu2_loop32) |
722 | stw 10,-8(3) | |
723 | stw 11,-4(3) | |
724 | ||
725 | L(wdu2_32tail): | |
726 | mtcrf 0x01,31 | |
727 | cmplwi cr5,31,16 | |
728 | blt cr6,L(wdu_4tail) | |
729 | /* calculate and store the final word */ | |
730 | lwz 8,2(4) | |
759cfef3 AM |
731 | #ifdef __LITTLE_ENDIAN__ |
732 | rldimi 6,8,16,32 | |
733 | #else | |
04067002 | 734 | rlwimi 6,8,16,(32-16),31 |
759cfef3 | 735 | #endif |
04067002 UD |
736 | b L(wdu_32tailx) |
737 | ||
738 | L(wdu3_32): | |
739 | /* lwz 6,0(12) */ | |
740 | lwz 6,-3(4) | |
741 | cmplwi cr6,31,4 | |
742 | srwi 8,31,5 /* calculate the 32 byte loop count */ | |
759cfef3 AM |
743 | #ifdef __LITTLE_ENDIAN__ |
744 | srwi 6,6,24 | |
745 | #else | |
04067002 | 746 | slwi 6,6,24 |
759cfef3 | 747 | #endif |
04067002 UD |
748 | clrlwi 31,31,27 /* The remaining bytes, < 32. */ |
749 | blt cr5,L(wdu3_32tail) | |
750 | mtctr 8 | |
751 | cmplwi cr6,31,4 | |
752 | ||
753 | lwz 8,1(4) | |
754 | lwz 7,4(4) | |
759cfef3 AM |
755 | #ifdef __LITTLE_ENDIAN__ |
756 | rldimi 6,8,8,32 | |
757 | #else | |
04067002 | 758 | rlwimi 6,8,24,(32-24),31 |
759cfef3 | 759 | #endif |
04067002 UD |
760 | b L(wdu3_loop32x) |
761 | .align 4 | |
762 | L(wdu3_loop32): | |
763 | /* copy 32 bytes at a time */ | |
764 | lwz 8,1(4) | |
765 | lwz 7,4(4) | |
766 | stw 10,-8(3) | |
767 | stw 11,-4(3) | |
759cfef3 AM |
768 | #ifdef __LITTLE_ENDIAN__ |
769 | rldimi 6,8,8,32 | |
770 | #else | |
04067002 | 771 | rlwimi 6,8,24,(32-24),31 |
759cfef3 | 772 | #endif |
04067002 UD |
773 | L(wdu3_loop32x): |
774 | lwz 10,8(4) | |
775 | lwz 11,12(4) | |
776 | stw 6,0(3) | |
777 | stw 7,4(3) | |
778 | lwz 6,16(4) | |
779 | lwz 7,20(4) | |
780 | stw 10,8(3) | |
781 | stw 11,12(3) | |
782 | lwz 10,24(4) | |
783 | lwz 11,28(4) | |
784 | lwz 8,32-3(4) | |
785 | addi 4,4,32 | |
786 | stw 6,16(3) | |
787 | stw 7,20(3) | |
788 | addi 3,3,32 | |
759cfef3 AM |
789 | #ifdef __LITTLE_ENDIAN__ |
790 | srwi 6,8,24 | |
791 | #else | |
04067002 | 792 | slwi 6,8,24 |
759cfef3 | 793 | #endif |
04067002 UD |
794 | bdnz+ L(wdu3_loop32) |
795 | stw 10,-8(3) | |
796 | stw 11,-4(3) | |
797 | ||
798 | L(wdu3_32tail): | |
799 | mtcrf 0x01,31 | |
800 | cmplwi cr5,31,16 | |
801 | blt cr6,L(wdu_4tail) | |
802 | /* calculate and store the final word */ | |
803 | lwz 8,1(4) | |
759cfef3 AM |
804 | #ifdef __LITTLE_ENDIAN__ |
805 | rldimi 6,8,8,32 | |
806 | #else | |
04067002 | 807 | rlwimi 6,8,24,(32-24),31 |
759cfef3 | 808 | #endif |
04067002 UD |
809 | b L(wdu_32tailx) |
810 | .align 4 | |
811 | L(wdu_32tailx): | |
812 | blt cr5,L(wdu_t32_8) | |
813 | lwz 7,4(4) | |
814 | addi 12,4,16 /* generate alternate pointers to avoid agen */ | |
815 | addi 11,3,16 /* timing issues downstream. */ | |
816 | stw 6,0(3) | |
817 | stw 7,4(3) | |
818 | subi 31,31,16 | |
819 | lwz 6,8(4) | |
820 | lwz 7,12(4) | |
821 | addi 4,4,16 | |
822 | stw 6,8(3) | |
823 | stw 7,12(3) | |
824 | addi 3,3,16 | |
825 | bf 28,L(wdu_t32_4x) | |
826 | lwz 6,0(12) | |
827 | lwz 7,4(12) | |
828 | addi 4,4,8 | |
829 | subi 31,31,8 | |
830 | stw 6,0(11) | |
831 | stw 7,4(11) | |
832 | addi 3,3,8 | |
833 | bf 29,L(wdu_t32_0) | |
834 | lwz 6,8(12) | |
835 | addi 4,4,4 | |
836 | subi 31,31,4 | |
837 | stw 6,8(11) | |
838 | addi 3,3,4 | |
839 | b L(wdu_t32_0) | |
840 | .align 4 | |
841 | L(wdu_t32_4x): | |
842 | bf 29,L(wdu_t32_0) | |
843 | lwz 6,0(4) | |
844 | addi 4,4,4 | |
845 | subi 31,31,4 | |
846 | stw 6,0(3) | |
847 | addi 3,3,4 | |
848 | b L(wdu_t32_0) | |
849 | .align 4 | |
850 | L(wdu_t32_8): | |
851 | bf 28,L(wdu_t32_4) | |
852 | lwz 7,4(4) | |
853 | subi 31,31,8 | |
854 | bf 29,L(wdu_t32_8x) | |
855 | stw 6,0(3) | |
856 | stw 7,4(3) | |
857 | lwz 6,8(4) | |
858 | subi 31,31,4 | |
859 | addi 4,4,12 | |
860 | stw 6,8(3) | |
861 | addi 3,3,12 | |
862 | b L(wdu_t32_0) | |
863 | .align 4 | |
864 | L(wdu_t32_8x): | |
865 | addi 4,4,8 | |
866 | stw 6,0(3) | |
867 | stw 7,4(3) | |
868 | addi 3,3,8 | |
869 | b L(wdu_t32_0) | |
870 | .align 4 | |
871 | L(wdu_t32_4): | |
872 | subi 31,31,4 | |
873 | stw 6,0(3) | |
874 | addi 4,4,4 | |
875 | addi 3,3,4 | |
876 | .align 4 | |
877 | L(wdu_t32_0): | |
878 | L(wdu_4tail): | |
879 | cmplwi cr6,31,0 | |
880 | beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */ | |
881 | bf 30,L(wdus_3) | |
882 | lhz 7,0(4) | |
25bfbb9e | 883 | sth 7,0(3) |
04067002 UD |
884 | bf 31,L(wdus_0) |
885 | lbz 8,2(4) | |
886 | stb 8,2(3) | |
887 | mr 3,30 | |
888 | lwz 30,20(1) | |
889 | lwz 31,24(1) | |
890 | addi 1,1,32 | |
891 | blr | |
892 | .align 4 | |
893 | L(wdus_3): | |
894 | bf 31,L(wus_0) | |
895 | lbz 6,0(4) | |
896 | stb 6,0(3) | |
897 | .align 4 | |
898 | L(wdus_0): | |
899 | /* Return original dst pointer. */ | |
900 | mr 3,30 | |
901 | lwz 30,20(1) | |
902 | lwz 31,24(1) | |
903 | addi 1,1,32 | |
904 | blr | |
b5510883 | 905 | END (memcpy) |
04067002 UD |
906 | |
907 | libc_hidden_builtin_def (memcpy) |