sysdeps/powerpc/powerpc32/power6/memcpy.S
1 /* Optimized memcpy implementation for PowerPC32 on POWER6.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
22 Returns 'dst'.
23
24 Memcpy handles short copies (< 32 bytes) using binary move blocks
25 (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
28
29 Longer moves (>= 32-bytes) justify the effort to get at least the
30 destination word (4-byte) aligned. Further optimization is
31 possible when both source and destination are word aligned.
32 Each case has an optimized unrolled loop. */
33
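/* For orientation only: the flow implemented below corresponds roughly to
   the following C sketch.  The identifiers are illustrative and are not
   part of this file; the real code keeps everything in registers.

       void *memcpy (void *dst, const void *src, size_t len)
       {
         if (len < 32)
           copy_short (dst, src, len);              // branchy lwz/stw, no loops
         else
           {
             size_t pre = (-(uintptr_t) dst) & 3;   // 0-3 bytes to word-align dst
             copy_bytes (dst, src, pre);
             if (((uintptr_t) src + pre) & 3)
               copy_src_unaligned ((char *) dst + pre,
                                   (const char *) src + pre, len - pre);
             else
               copy_word_aligned ((char *) dst + pre,
                                  (const char *) src + pre, len - pre);
           }
         return dst;
       }  */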
34 .machine power6
35 EALIGN (memcpy, 5, 0)
36 CALL_MCOUNT
37
38 stwu 1,-32(1)
39 cfi_adjust_cfa_offset(32)
40 cmplwi cr1,5,31 /* check for short move. */
41 neg 0,3
42 cmplwi cr1,5,31
43 clrlwi 10,4,30 /* check alignment of src. */
44 andi. 11,3,3 /* check alignment of dst. */
45 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
46 ble- cr1,L(word_unaligned_short) /* If move < 32 bytes. */
47 cmplw cr6,10,11
48 stw 31,24(1)
49 cfi_offset(31,(24-32))
50 stw 30,20(1)
51 cfi_offset(30,(20-32))
52 mr 30,3
53 beq .L0
54 mtcrf 0x01,0
55 subf 31,0,5 /* Length after alignment. */
56 add 12,4,0 /* Compute src addr after alignment. */
57 /* Move 0-3 bytes as needed to get the destination word aligned. */
58 1: bf 31,2f
59 lbz 6,0(4)
60 bf 30,3f
61 lhz 7,1(4)
62 stb 6,0(3)
63 sth 7,1(3)
64 addi 3,3,3
65 b 0f
66 3:
67 stb 6,0(3)
68 addi 3,3,1
69 b 0f
70 2: bf 30,0f
71 lhz 6,0(4)
72 sth 6,0(3)
73 addi 3,3,2
74 0:
75 clrlwi 10,12,30 /* check alignment of src again. */
76 srwi 9,31,2 /* Number of full words remaining. */
77 bne- cr6,L(wdu) /* If source is not word aligned. */
78 clrlwi 11,31,30 /* calculate the number of tail bytes */
79 b L(word_aligned)
80 /* Copy words from source to destination, assuming the destination is
81 aligned on a word boundary.
82
83 At this point we know there are at least 29 bytes left (32-3) to copy.
84 The next step is to determine if the source is also word aligned.
85 If not, branch to the unaligned move code at L(wdu), which uses
86 a load, shift, store strategy.
87
88 Otherwise source and destination are word aligned, and we can use
89 the optimized word copy loop. */
90 .align 4
91 .L0:
92 mr 31,5
93 mr 12,4
94 bne- cr6,L(wdu) /* If source is not word aligned. */
95 srwi 9,5,2 /* Number of full words remaining. */
96 clrlwi 11,5,30 /* calculate the number of tail bytes */
97
98 /* Move words where destination and source are word aligned.
99 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
100 If the copy is not an exact multiple of 16 bytes, 1-3
101 words are copied as needed to set up the main loop. After
102 the main loop exits there may be a tail of 1-3 bytes. These bytes are
103 copied a halfword/byte at a time as needed to preserve alignment. */
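/* A minimal C sketch of this word-aligned path, assuming both pointers are
   4-byte aligned on entry (illustrative only; the code below drives the
   same steps from CR bits and the CTR):

       uint32_t *d = dst4;  const uint32_t *s = src4;
       size_t words = len >> 2, tail = len & 3;
       if (words & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (words & 1) *d++ = *s++;
       for (size_t n = len >> 4; n > 0; n--)        // 4 words == 16 bytes per pass
         { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d += 4; s += 4; }
       unsigned char *db = (unsigned char *) d;
       const unsigned char *sb = (const unsigned char *) s;
       if (tail & 2) { *(uint16_t *) db = *(const uint16_t *) sb; db += 2; sb += 2; }
       if (tail & 1) *db = *sb;  */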
104 L(word_aligned):
105 mtcrf 0x01,9
106 srwi 8,31,4 /* calculate the 16 byte loop count */
107 cmplwi cr1,9,4
108 cmplwi cr6,11,0
109 mr 11,12
110
111 bf 30,1f
112 lwz 6,0(12)
113 lwz 7,4(12)
114 addi 11,12,8
115 mtctr 8
116 stw 6,0(3)
117 stw 7,4(3)
118 addi 10,3,8
119 bf 31,4f
120 lwz 0,8(12)
121 stw 0,8(3)
122 blt cr1,3f
123 addi 11,12,12
124 addi 10,3,12
125 b 4f
126 .align 4
127 1:
128 mr 10,3
129 mtctr 8
130 bf 31,4f
131 lwz 6,0(12)
132 addi 11,12,4
133 stw 6,0(3)
134 addi 10,3,4
135
136 .align 4
137 4:
138 lwz 6,0(11)
139 lwz 7,4(11)
140 lwz 8,8(11)
141 lwz 0,12(11)
142 stw 6,0(10)
143 stw 7,4(10)
144 stw 8,8(10)
145 stw 0,12(10)
146 addi 11,11,16
147 addi 10,10,16
148 bdnz 4b
149 3:
150 clrrwi 0,31,2
151 mtcrf 0x01,31
152 beq cr6,0f
153 .L9:
154 add 3,3,0
155 add 12,12,0
156
157 /* At this point we have a tail of 0-3 bytes and we know that the
158 destination is word aligned. */
159 2: bf 30,1f
160 lhz 6,0(12)
161 addi 12,12,2
162 sth 6,0(3)
163 addi 3,3,2
164 1: bf 31,0f
165 lbz 6,0(12)
166 stb 6,0(3)
167 0:
168 /* Return original dst pointer. */
169 mr 3,30
170 lwz 30,20(1)
171 lwz 31,24(1)
172 addi 1,1,32
173 blr
174
175 /* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
176 bytes. Each case is handled without loops, using binary (1,2,4,8)
177 tests.
178
179 In the short (0-8 byte) case no attempt is made to force alignment
180 of either source or destination. The hardware will handle the
181 unaligned load/stores with small delays for crossing 32-, 128-, and
182 and 4096-byte boundaries. Since these short moves are unlikely to be
183 unaligned or cross these boundaries, the overhead to force
184 alignment is not justified.
185
186 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
187 boundaries. Since only loads are sensitive to the 32-/128-byte
188 boundaries, it is more important to align the source than the
189 destination. If the source is not already word aligned, we first
190 move 1-3 bytes as needed. Since we are only word aligned we don't
191 use doubleword load/stores, to ensure that all loads are aligned.
192 While the destination and stores may still be unaligned, this
193 is only an issue for page (4096-byte boundary) crossings, which
194 should be rare for these short moves. The hardware handles this
195 case automatically with a small (~20 cycle) delay. */
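/* The 0-8 byte case decomposes the length into its binary components and
   performs at most one move of each size, roughly as in this C sketch
   (illustrative only; like the code below it relies on the hardware
   tolerating the unaligned word/halfword accesses, so it is not portable C):

       unsigned char *d = dst;  const unsigned char *s = src;
       if (len == 8)                                // exactly 8: two word moves
         {
           ((uint32_t *) d)[0] = ((const uint32_t *) s)[0];
           ((uint32_t *) d)[1] = ((const uint32_t *) s)[1];
         }
       else
         {
           if (len & 4) { *(uint32_t *) d = *(const uint32_t *) s; d += 4; s += 4; }
           if (len & 2) { *(uint16_t *) d = *(const uint16_t *) s; d += 2; s += 2; }
           if (len & 1) *d = *s;
         }

   The 9-31 byte case below first moves 1-3 bytes to word-align the source,
   then selects word-sized chunks in the same binary fashion.  */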
196 .align 4
197
198 cfi_same_value (31)
199 cfi_same_value (30)
200 L(word_unaligned_short):
201 mtcrf 0x01,5
202 cmplwi cr6,5,8
203 neg 8,4
204 clrrwi 9,4,2
205 andi. 0,8,3
206 beq cr6,L(wus_8) /* Handle moves of 8 bytes. */
207 /* At least 9 bytes left. Get the source word aligned. */
208 cmplwi cr1,5,16
209 mr 12,4
210 ble cr6,L(wus_4) /* Handle moves of 0-8 bytes. */
211 mr 11,3
212 mr 10,5
213 cmplwi cr6,0,2
214 beq L(wus_tail) /* If the source is already word aligned skip this. */
215 /* Copy 1-3 bytes to get source address word aligned. */
216 lwz 6,0(9)
217 subf 10,0,5
218 add 12,4,0
219 blt cr6,5f
220 srwi 7,6,16
221 bgt cr6,3f
222 #ifdef __LITTLE_ENDIAN__
223 sth 7,0(3)
224 #else
225 sth 6,0(3)
226 #endif
227 b 7f
228 .align 4
229 3:
230 #ifdef __LITTLE_ENDIAN__
231 rotlwi 6,6,24
232 stb 6,0(3)
233 sth 7,1(3)
234 #else
235 stb 7,0(3)
236 sth 6,1(3)
237 #endif
238 b 7f
239 .align 4
240 5:
241 #ifdef __LITTLE_ENDIAN__
242 rotlwi 6,6,8
243 #endif
244 stb 6,0(3)
245 7:
246 cmplwi cr1,10,16
247 add 11,3,0
248 mtcrf 0x01,10
249 .align 4
250 L(wus_tail):
251 /* At least 6 bytes left and the source is word aligned. This allows
252 some speculative loads up front. */
253 /* We need to special case the fall-through because the biggest delays
254 are due to address computation not being ready in time for the
255 AGEN. */
256 lwz 6,0(12)
257 lwz 7,4(12)
258 blt cr1,L(wus_tail8)
259 cmplwi cr0,10,24
260 L(wus_tail16): /* Move 16 bytes. */
261 stw 6,0(11)
262 stw 7,4(11)
263 lwz 6,8(12)
264 lwz 7,12(12)
265 stw 6,8(11)
266 stw 7,12(11)
267 /* Move 8 bytes more. */
268 bf 28,L(wus_tail16p8)
269 cmplwi cr1,10,28
270 lwz 6,16(12)
271 lwz 7,20(12)
272 stw 6,16(11)
273 stw 7,20(11)
274 /* Move 4 bytes more. */
275 bf 29,L(wus_tail16p4)
276 lwz 6,24(12)
277 stw 6,24(11)
278 addi 12,12,28
279 addi 11,11,28
280 bgt cr1,L(wus_tail2)
281 /* exactly 28 bytes. Return original dst pointer and exit. */
282 addi 1,1,32
283 blr
284 .align 4
285 L(wus_tail16p8): /* less than 8 bytes left. */
286 beq cr1,L(wus_tailX) /* exactly 16 bytes, early exit. */
287 cmplwi cr1,10,20
288 bf 29,L(wus_tail16p2)
289 /* Move 4 bytes more. */
290 lwz 6,16(12)
291 stw 6,16(11)
292 addi 12,12,20
293 addi 11,11,20
294 bgt cr1,L(wus_tail2)
295 /* exactly 20 bytes. Return original dst pointer and exit. */
296 addi 1,1,32
297 blr
298 .align 4
299 L(wus_tail16p4): /* less than 4 bytes left. */
300 addi 12,12,24
301 addi 11,11,24
302 bgt cr0,L(wus_tail2)
303 /* exactly 24 bytes. Return original dst pointer and exit. */
304 addi 1,1,32
305 blr
306 .align 4
307 L(wus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */
308 addi 12,12,16
309 addi 11,11,16
310 b L(wus_tail2)
311
312 .align 4
313 L(wus_tail8): /* Move 8 bytes. */
314 /* r6, r7 already loaded speculatively. */
315 cmplwi cr1,10,8
316 cmplwi cr0,10,12
317 bf 28,L(wus_tail4)
318 stw 6,0(11)
319 stw 7,4(11)
320 /* Move 4 bytes more. */
321 bf 29,L(wus_tail8p4)
322 lwz 6,8(12)
323 stw 6,8(11)
324 addi 12,12,12
325 addi 11,11,12
326 bgt cr0,L(wus_tail2)
327 /* exactly 12 bytes. Return original dst pointer and exit. */
328 addi 1,1,32
329 blr
330 .align 4
331 L(wus_tail8p4): /* less than 4 bytes left. */
332 addi 12,12,8
333 addi 11,11,8
334 bgt cr1,L(wus_tail2)
335 /* exactly 8 bytes. Return original dst pointer and exit. */
336 addi 1,1,32
337 blr
338
339 .align 4
340 L(wus_tail4): /* Move 4 bytes. */
341 /* r6 already loaded speculatively.  If we are here we know there are
342 more than 4 bytes left, so there is no need to test. */
343 addi 12,12,4
344 stw 6,0(11)
345 addi 11,11,4
346 L(wus_tail2): /* Move 2-3 bytes. */
347 bf 30,L(wus_tail1)
348 lhz 6,0(12)
349 sth 6,0(11)
350 bf 31,L(wus_tailX)
351 lbz 7,2(12)
352 stb 7,2(11)
353 addi 1,1,32
354 blr
355 L(wus_tail1): /* Move 1 byte. */
356 bf 31,L(wus_tailX)
357 lbz 6,0(12)
358 stb 6,0(11)
359 L(wus_tailX):
360 /* Return original dst pointer. */
361 addi 1,1,32
362 blr
363
364 /* Special case to copy 0-8 bytes. */
365 .align 4
366 L(wus_8):
367 lwz 6,0(4)
368 lwz 7,4(4)
369 stw 6,0(3)
370 stw 7,4(3)
371 /* Return original dst pointer. */
372 addi 1,1,32
373 blr
374 .align 4
375 L(wus_4):
376 bf 29,L(wus_2)
377 lwz 6,0(4)
378 stw 6,0(3)
379 bf 30,L(wus_5)
380 lhz 7,4(4)
381 sth 7,4(3)
382 bf 31,L(wus_0)
383 lbz 8,6(4)
384 stb 8,6(3)
385 addi 1,1,32
386 blr
387 .align 4
388 L(wus_5):
389 bf 31,L(wus_0)
390 lbz 6,4(4)
391 stb 6,4(3)
392 /* Return original dst pointer. */
393 addi 1,1,32
394 blr
395 .align 4
396 L(wus_2): /* Move 2-3 bytes. */
397 bf 30,L(wus_1)
398 lhz 6,0(4)
399 sth 6,0(3)
400 bf 31,L(wus_0)
401 lbz 7,2(4)
402 stb 7,2(3)
403 addi 1,1,32
404 blr
405 .align 4
406 L(wus_1): /* Move 1 byte. */
407 bf 31,L(wus_0)
408 lbz 6,0(4)
409 stb 6,0(3)
410 .align 3
411 L(wus_0):
412 /* Return original dst pointer. */
413 addi 1,1,32
414 blr
415
416 .align 4
417 cfi_offset(31,(24-32))
418 cfi_offset(30,(20-32))
419 L(wdu):
420
421 /* Copy words where the destination is aligned but the source is
422 not.  For power4, power5 and power6 machines there is a penalty for
423 unaligned loads (src) that cross 32-byte, cacheline, or page
424 boundaries. So we want to use simple (unaligned) loads where
425 possible but avoid them where we know the load would span a 32-byte
426 boundary.
427
428 At this point we know we have at least 29 (32-3) bytes to copy, the
429 src is unaligned, and we may cross at least one 32-byte
430 boundary. Also we have the following register values:
431 r3 == adjusted dst, word aligned
432 r4 == unadjusted src
433 r5 == unadjusted len
434 r9 == adjusted Word length
435 r10 == src alignment (1-3)
436 r12 == adjusted src, not aligned
437 r31 == adjusted len
438
439 First we need to copy words up to but not crossing the next 32-byte
440 boundary. Then perform aligned loads just before and just after
441 the boundary and use shifts and or to generate the next aligned
442 word for dst. If more than 32 bytes remain we copy (unaligned src)
443 the next 7 words and repeat the loop until less than 32-bytes
444 remain.
445
446 Then if more than 4 bytes remain we again use aligned loads,
447 shifts and or to generate the next dst word. We then process the
448 remaining words using unaligned loads as needed. Finally we check
449 if there are any (1-3) bytes remaining and use
450 halfword and/or byte load/stores to complete the copy.
451 */
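/* The aligned-load plus shift/or step used at the 32-byte boundary can be
   pictured as in this C sketch (big-endian shown; the little-endian build
   swaps the shift directions).  'a' and 'sa' are illustrative names, not
   registers used below:

       unsigned int a  = (uintptr_t) src & 3;       // misalignment, 1-3 here
       unsigned int sa = 8 * a;                     // slwi 10,10,3
       const uint32_t *s4 = (const uint32_t *) ((uintptr_t) src - a);
       uint32_t lo = s4[0];                         // aligned load before boundary
       uint32_t hi = s4[1];                         // aligned load after boundary
       *dst4 = (lo << sa) | (hi >> (32 - sa));      // slw/srw/or, one aligned store
   */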
452 mr 4,12 /* restore unaligned adjusted src ptr */
453 clrlwi 0,12,27 /* Find dist from previous 32-byte boundary. */
454 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
455 cmplwi cr5,0,16
456 subfic 8,0,32 /* Number of bytes to next 32-byte boundary. */
457
458 mtcrf 0x01,8
459 cmplwi cr1,10,16
460 subfic 9,10,32 /* number of bits to shift 2nd word right */
461 /* This test is reversed because the timing to compare the bytes to the
462 next 32-byte boundary could not be met, so we compare the bytes from the
463 previous 32-byte boundary and invert the test.  E.g. src & 31 == 20 means
only 12 bytes remain before the next boundary, so the shorter
L(wdu_h32_8) path is taken.  */
464 bge cr5,L(wdu_h32_8)
465 .align 4
466 lwz 6,0(4)
467 lwz 7,4(4)
468 addi 12,4,16 /* generate alternate pointers to avoid agen */
469 addi 11,3,16 /* timing issues downstream. */
470 stw 6,0(3)
471 stw 7,4(3)
472 subi 31,31,16
473 lwz 6,8(4)
474 lwz 7,12(4)
475 addi 4,4,16
476 stw 6,8(3)
477 stw 7,12(3)
478 addi 3,3,16
479 bf 28,L(wdu_h32_4)
480 lwz 6,0(12)
481 lwz 7,4(12)
482 subi 31,31,8
483 addi 4,4,8
484 stw 6,0(11)
485 stw 7,4(11)
486 addi 3,3,8
487 bf 29,L(wdu_h32_0)
488 lwz 6,8(12)
489 addi 4,4,4
490 subi 31,31,4
491 stw 6,8(11)
492 addi 3,3,4
493 b L(wdu_h32_0)
494 .align 4
495 L(wdu_h32_8):
496 bf 28,L(wdu_h32_4)
497 lwz 6,0(4)
498 lwz 7,4(4)
499 subi 31,31,8
500 bf 29,L(wdu_h32_8x)
501 stw 6,0(3)
502 stw 7,4(3)
503 lwz 6,8(4)
504 addi 4,4,12
505 subi 31,31,4
506 stw 6,8(3)
507 addi 3,3,12
508 b L(wdu_h32_0)
509 .align 4
510 L(wdu_h32_8x):
511 addi 4,4,8
512 stw 6,0(3)
513 stw 7,4(3)
514 addi 3,3,8
515 b L(wdu_h32_0)
516 .align 4
517 L(wdu_h32_4):
518 bf 29,L(wdu_h32_0)
519 lwz 6,0(4)
520 subi 31,31,4
521 addi 4,4,4
522 stw 6,0(3)
523 addi 3,3,4
524 .align 4
525 L(wdu_h32_0):
526 /* Set up for the word move that crosses the 32-byte boundary, and
527 possibly for the 32-byte move loop. */
528 clrrwi 12,4,2
529 cmplwi cr5,31,32
530 bge cr1,L(wdu2_32)
531 #if 0
532 b L(wdu1_32)
533 /*
534 cmplwi cr1,10,8
535 beq cr1,L(wdu1_32)
536 cmplwi cr1,10,16
537 beq cr1,L(wdu2_32)
538 cmplwi cr1,10,24
539 beq cr1,L(wdu3_32)
540 */
541 L(wdu_32):
542 lwz 6,0(12)
543 cmplwi cr6,31,4
544 srwi 8,31,5 /* calculate the 32 byte loop count */
545 slw 0,6,10
546 clrlwi 31,31,27 /* The remaining bytes, < 32. */
547 blt cr5,L(wdu_32tail)
548 mtctr 8
549 cmplwi cr6,31,4
550 .align 4
551 L(wdu_loop32):
552 /* copy 32 bytes at a time */
553 lwz 8,4(12)
554 addi 12,12,32
555 lwz 7,4(4)
556 srw 8,8,9
557 or 0,0,8
558 stw 0,0(3)
559 stw 7,4(3)
560 lwz 6,8(4)
561 lwz 7,12(4)
562 stw 6,8(3)
563 stw 7,12(3)
564 lwz 6,16(4)
565 lwz 7,20(4)
566 stw 6,16(3)
567 stw 7,20(3)
568 lwz 6,24(4)
569 lwz 7,28(4)
570 lwz 8,0(12)
571 addi 4,4,32
572 stw 6,24(3)
573 stw 7,28(3)
574 addi 3,3,32
575 slw 0,8,10
576 bdnz+ L(wdu_loop32)
577
578 L(wdu_32tail):
579 mtcrf 0x01,31
580 cmplwi cr5,31,16
581 blt cr6,L(wdu_4tail)
582 /* calculate and store the final word */
583 lwz 8,4(12)
584 srw 8,8,9
585 or 6,0,8
586 b L(wdu_32tailx)
587 #endif
588 .align 4
589 L(wdu1_32):
590 lwz 6,-1(4)
591 cmplwi cr6,31,4
592 srwi 8,31,5 /* calculate the 32 byte loop count */
593 #ifdef __LITTLE_ENDIAN__
594 srwi 6,6,8
595 #else
596 slwi 6,6,8
597 #endif
598 clrlwi 31,31,27 /* The remaining bytes, < 32. */
599 blt cr5,L(wdu1_32tail)
600 mtctr 8
601 cmplwi cr6,31,4
602
603 lwz 8,3(4)
604 lwz 7,4(4)
605 #ifdef __LITTLE_ENDIAN__
606 rldimi 6,8,24,32
607 #else
608 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
609 rlwimi 6,8,8,(32-8),31
610 #endif
611 b L(wdu1_loop32x)
612 .align 4
613 L(wdu1_loop32):
614 /* copy 32 bytes at a time */
615 lwz 8,3(4)
616 lwz 7,4(4)
617 stw 10,-8(3)
618 stw 11,-4(3)
619 #ifdef __LITTLE_ENDIAN__
620 rldimi 6,8,24,32
621 #else
622 /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
623 rlwimi 6,8,8,(32-8),31
624 #endif
625 L(wdu1_loop32x):
626 lwz 10,8(4)
627 lwz 11,12(4)
628 stw 6,0(3)
629 stw 7,4(3)
630 lwz 6,16(4)
631 lwz 7,20(4)
632 stw 10,8(3)
633 stw 11,12(3)
634 lwz 10,24(4)
635 lwz 11,28(4)
636 lwz 8,32-1(4)
637 addi 4,4,32
638 stw 6,16(3)
639 stw 7,20(3)
640 addi 3,3,32
641 #ifdef __LITTLE_ENDIAN__
642 srwi 6,8,8
643 #else
644 slwi 6,8,8
645 #endif
646 bdnz+ L(wdu1_loop32)
647 stw 10,-8(3)
648 stw 11,-4(3)
649
650 L(wdu1_32tail):
651 mtcrf 0x01,31
652 cmplwi cr5,31,16
653 blt cr6,L(wdu_4tail)
654 /* calculate and store the final word */
655 lwz 8,3(4)
656 #ifdef __LITTLE_ENDIAN__
657 rldimi 6,8,24,32
658 #else
659 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
660 rlwimi 6,8,8,(32-8),31
661 #endif
662 b L(wdu_32tailx)
663
664 L(wdu2_32):
665 bgt cr1,L(wdu3_32)
666 lwz 6,-2(4)
667 cmplwi cr6,31,4
668 srwi 8,31,5 /* calculate the 32 byte loop count */
669 #ifdef __LITTLE_ENDIAN__
670 srwi 6,6,16
671 #else
672 slwi 6,6,16
673 #endif
674 clrlwi 31,31,27 /* The remaining bytes, < 32. */
675 blt cr5,L(wdu2_32tail)
676 mtctr 8
677 cmplwi cr6,31,4
678
679 lwz 8,2(4)
680 lwz 7,4(4)
681 #ifdef __LITTLE_ENDIAN__
682 rldimi 6,8,16,32
683 #else
684 rlwimi 6,8,16,(32-16),31
685 #endif
686 b L(wdu2_loop32x)
687 .align 4
688 L(wdu2_loop32):
689 /* copy 32 bytes at a time */
690 lwz 8,2(4)
691 lwz 7,4(4)
692 stw 10,-8(3)
693 stw 11,-4(3)
694 #ifdef __LITTLE_ENDIAN__
695 rldimi 6,8,16,32
696 #else
697 rlwimi 6,8,16,(32-16),31
698 #endif
699 L(wdu2_loop32x):
700 lwz 10,8(4)
701 lwz 11,12(4)
702 stw 6,0(3)
703 stw 7,4(3)
704 lwz 6,16(4)
705 lwz 7,20(4)
706 stw 10,8(3)
707 stw 11,12(3)
708 lwz 10,24(4)
709 lwz 11,28(4)
710 /* lwz 8,0(12) */
711 lwz 8,32-2(4)
712 addi 4,4,32
713 stw 6,16(3)
714 stw 7,20(3)
715 addi 3,3,32
716 #ifdef __LITTLE_ENDIAN__
717 srwi 6,8,16
718 #else
719 slwi 6,8,16
720 #endif
721 bdnz+ L(wdu2_loop32)
722 stw 10,-8(3)
723 stw 11,-4(3)
724
725 L(wdu2_32tail):
726 mtcrf 0x01,31
727 cmplwi cr5,31,16
728 blt cr6,L(wdu_4tail)
729 /* calculate and store the final word */
730 lwz 8,2(4)
731 #ifdef __LITTLE_ENDIAN__
732 rldimi 6,8,16,32
733 #else
734 rlwimi 6,8,16,(32-16),31
735 #endif
736 b L(wdu_32tailx)
737
738 L(wdu3_32):
739 /* lwz 6,0(12) */
740 lwz 6,-3(4)
741 cmplwi cr6,31,4
742 srwi 8,31,5 /* calculate the 32 byte loop count */
743 #ifdef __LITTLE_ENDIAN__
744 srwi 6,6,24
745 #else
746 slwi 6,6,24
747 #endif
748 clrlwi 31,31,27 /* The remaining bytes, < 32. */
749 blt cr5,L(wdu3_32tail)
750 mtctr 8
751 cmplwi cr6,31,4
752
753 lwz 8,1(4)
754 lwz 7,4(4)
755 #ifdef __LITTLE_ENDIAN__
756 rldimi 6,8,8,32
757 #else
758 rlwimi 6,8,24,(32-24),31
759 #endif
760 b L(wdu3_loop32x)
761 .align 4
762 L(wdu3_loop32):
763 /* copy 32 bytes at a time */
764 lwz 8,1(4)
765 lwz 7,4(4)
766 stw 10,-8(3)
767 stw 11,-4(3)
768 #ifdef __LITTLE_ENDIAN__
769 rldimi 6,8,8,32
770 #else
771 rlwimi 6,8,24,(32-24),31
772 #endif
773 L(wdu3_loop32x):
774 lwz 10,8(4)
775 lwz 11,12(4)
776 stw 6,0(3)
777 stw 7,4(3)
778 lwz 6,16(4)
779 lwz 7,20(4)
780 stw 10,8(3)
781 stw 11,12(3)
782 lwz 10,24(4)
783 lwz 11,28(4)
784 lwz 8,32-3(4)
785 addi 4,4,32
786 stw 6,16(3)
787 stw 7,20(3)
788 addi 3,3,32
789 #ifdef __LITTLE_ENDIAN__
790 srwi 6,8,24
791 #else
792 slwi 6,8,24
793 #endif
794 bdnz+ L(wdu3_loop32)
795 stw 10,-8(3)
796 stw 11,-4(3)
797
798 L(wdu3_32tail):
799 mtcrf 0x01,31
800 cmplwi cr5,31,16
801 blt cr6,L(wdu_4tail)
802 /* calculate and store the final word */
803 lwz 8,1(4)
804 #ifdef __LITTLE_ENDIAN__
805 rldimi 6,8,8,32
806 #else
807 rlwimi 6,8,24,(32-24),31
808 #endif
809 b L(wdu_32tailx)
810 .align 4
811 L(wdu_32tailx):
812 blt cr5,L(wdu_t32_8)
813 lwz 7,4(4)
814 addi 12,4,16 /* generate alternate pointers to avoid agen */
815 addi 11,3,16 /* timing issues downstream. */
816 stw 6,0(3)
817 stw 7,4(3)
818 subi 31,31,16
819 lwz 6,8(4)
820 lwz 7,12(4)
821 addi 4,4,16
822 stw 6,8(3)
823 stw 7,12(3)
824 addi 3,3,16
825 bf 28,L(wdu_t32_4x)
826 lwz 6,0(12)
827 lwz 7,4(12)
828 addi 4,4,8
829 subi 31,31,8
830 stw 6,0(11)
831 stw 7,4(11)
832 addi 3,3,8
833 bf 29,L(wdu_t32_0)
834 lwz 6,8(12)
835 addi 4,4,4
836 subi 31,31,4
837 stw 6,8(11)
838 addi 3,3,4
839 b L(wdu_t32_0)
840 .align 4
841 L(wdu_t32_4x):
842 bf 29,L(wdu_t32_0)
843 lwz 6,0(4)
844 addi 4,4,4
845 subi 31,31,4
846 stw 6,0(3)
847 addi 3,3,4
848 b L(wdu_t32_0)
849 .align 4
850 L(wdu_t32_8):
851 bf 28,L(wdu_t32_4)
852 lwz 7,4(4)
853 subi 31,31,8
854 bf 29,L(wdu_t32_8x)
855 stw 6,0(3)
856 stw 7,4(3)
857 lwz 6,8(4)
858 subi 31,31,4
859 addi 4,4,12
860 stw 6,8(3)
861 addi 3,3,12
862 b L(wdu_t32_0)
863 .align 4
864 L(wdu_t32_8x):
865 addi 4,4,8
866 stw 6,0(3)
867 stw 7,4(3)
868 addi 3,3,8
869 b L(wdu_t32_0)
870 .align 4
871 L(wdu_t32_4):
872 subi 31,31,4
873 stw 6,0(3)
874 addi 4,4,4
875 addi 3,3,4
876 .align 4
877 L(wdu_t32_0):
878 L(wdu_4tail):
879 cmplwi cr6,31,0
880 beq cr6,L(wdus_0) /* If the tail is 0 bytes we are done! */
881 bf 30,L(wdus_3)
882 lhz 7,0(4)
883 sth 7,0(3)
884 bf 31,L(wdus_0)
885 lbz 8,2(4)
886 stb 8,2(3)
887 mr 3,30
888 lwz 30,20(1)
889 lwz 31,24(1)
890 addi 1,1,32
891 blr
892 .align 4
893 L(wdus_3):
894 bf 31,L(wus_0)
895 lbz 6,0(4)
896 stb 6,0(3)
897 .align 4
898 L(wdus_0):
899 /* Return original dst pointer. */
900 mr 3,30
901 lwz 30,20(1)
902 lwz 31,24(1)
903 addi 1,1,32
904 blr
905 END (memcpy)
906
907 libc_hidden_builtin_def (memcpy)