]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power6/memcpy.S
Fix spelling errors in sysdeps/powerpc files.
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power6 / memcpy.S
1 /* Optimized memcpy implementation for PowerPC64.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include <bp-sym.h>
21 #include <bp-asm.h>
22
23 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
24 Returns 'dst'.
25
26 Memcpy handles short copies (< 32-bytes) using binary move blocks
27 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
28 with the appropriate combination of byte and halfword load/stores.
29 There is minimal effort to optimize the alignment of short moves.
30 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
31 of handling unaligned load/stores that do not cross 32-byte boundaries.
32
33 Longer moves (>= 32-bytes) justify the effort to get at least the
34 destination doubleword (8-byte) aligned. Further optimization is
35 possible when both source and destination are doubleword aligned.
36 Each case has an optimized unrolled loop.
37
38 For POWER6 unaligned loads will take a 20+ cycle hiccup for any
39 L1 cache miss that crosses a 32- or 128-byte boundary. Store
40 is more forgiving and does not take a hiccup until page or
41 segment boundaries. So we require doubleword alignment for
42 the source but may take a risk and only require word alignment
43 for the destination. */
44
45 .machine "power6"
46 EALIGN (BP_SYM (memcpy), 7, 0)
47 CALL_MCOUNT 3
48
49 cmpldi cr1,5,31
50 neg 0,3
51 std 3,-16(1)
52 std 31,-8(1)
53 andi. 11,3,7 /* check alignment of dst. */
54 clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */
55 clrldi 10,4,61 /* check alignment of src. */
56 cmpldi cr6,5,8
57 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
58 mtcrf 0x01,0
59 cmpld cr6,10,11
60 srdi 9,5,3 /* Number of full double words remaining. */
61 beq .L0
62
63 subf 5,0,5
64 /* Move 0-7 bytes as needed to get the destination doubleword aligned.
65 Duplicate some code to maximize fall-through and minimize agen delays. */
66 1: bf 31,2f
67 lbz 6,0(4)
68 stb 6,0(3)
69 bf 30,5f
70 lhz 6,1(4)
71 sth 6,1(3)
72 bf 29,0f
73 lwz 6,3(4)
74 stw 6,3(3)
75 b 0f
76 5:
77 bf 29,0f
78 lwz 6,1(4)
79 stw 6,1(3)
80 b 0f
81
82 2: bf 30,4f
83 lhz 6,0(4)
84 sth 6,0(3)
85 bf 29,0f
86 lwz 6,2(4)
87 stw 6,2(3)
88 b 0f
89
90 4: bf 29,0f
91 lwz 6,0(4)
92 stw 6,0(3)
93 0:
94 /* Add the number of bytes until the 1st doubleword of dst to src and dst. */
95 add 4,4,0
96 add 3,3,0
97
98 clrldi 10,4,61 /* check alignment of src again. */
99 srdi 9,5,3 /* Number of full double words remaining. */
100
101 /* Copy doublewords from source to destination, assuming the
102 destination is aligned on a doubleword boundary.
103
104 At this point we know there are at least 25 bytes left (32-7) to copy.
105 The next step is to determine if the source is also doubleword aligned.
106 If not branch to the unaligned move code at .L6. which uses
107 a load, shift, store strategy.
108
109 Otherwise source and destination are doubleword aligned, and we can
110 the optimized doubleword copy loop. */
111 .align 4
112 .L0:
113 clrldi 11,5,61
114 andi. 0,5,0x78
115 srdi 12,5,7 /* Number of 128-byte blocks to move. */
116 cmpldi cr1,11,0 /* If the tail is 0 bytes */
117 bne- cr6,.L6 /* If source is not DW aligned. */
118
119 /* Move doublewords where destination and source are DW aligned.
120 Use a unrolled loop to copy 16 doublewords (128-bytes) per iteration.
121 If the copy is not an exact multiple of 128 bytes, 1-15
122 doublewords are copied as needed to set up the main loop. After
123 the main loop exits there may be a tail of 1-7 bytes. These byte
124 are copied a word/halfword/byte at a time as needed to preserve
125 alignment.
126
127 For POWER6 the L1 is store-through and the L2 is store-in. The
128 L2 is clocked at half CPU clock so we can store 16 bytes every
129 other cycle. POWER6 also has a load/store bypass so we can do
130 load, load, store, store every 2 cycles.
131
132 The following code is sensitive to cache line alignment. Do not
133 make any change with out first making sure they don't result in
134 splitting ld/std pairs across a cache line. */
135
136 mtcrf 0x02,5
137 mtcrf 0x01,5
138 cmpldi cr5,12,1
139 beq L(das_loop)
140
141 bf 25,4f
142 .align 3
143 ld 6,0(4)
144 ld 7,8(4)
145 mr 11,4
146 mr 10,3
147 std 6,0(3)
148 std 7,8(3)
149 ld 6,16(4)
150 ld 7,24(4)
151 std 6,16(3)
152 std 7,24(3)
153 ld 6,0+32(4)
154 ld 7,8+32(4)
155 addi 4,4,64
156 addi 3,3,64
157 std 6,0+32(10)
158 std 7,8+32(10)
159 ld 6,16+32(11)
160 ld 7,24+32(11)
161 std 6,16+32(10)
162 std 7,24+32(10)
163 4:
164 mr 10,3
165 bf 26,2f
166 ld 6,0(4)
167 ld 7,8(4)
168 mr 11,4
169 nop
170 std 6,0(3)
171 std 7,8(3)
172 ld 6,16(4)
173 ld 7,24(4)
174 addi 4,4,32
175 std 6,16(3)
176 std 7,24(3)
177 addi 3,3,32
178 6:
179 nop
180 bf 27,5f
181 ld 6,0+32(11)
182 ld 7,8+32(11)
183 addi 4,4,16
184 addi 3,3,16
185 std 6,0+32(10)
186 std 7,8+32(10)
187 bf 28,L(das_loop_s)
188 ld 0,16+32(11)
189 addi 4,4,8
190 addi 3,3,8
191 std 0,16+32(10)
192 blt cr5,L(das_tail)
193 b L(das_loop)
194 .align 3
195 5:
196 nop
197 bf 28,L(das_loop_s)
198 ld 6,32(11)
199 addi 4,4,8
200 addi 3,3,8
201 std 6,32(10)
202 blt cr5,L(das_tail)
203 b L(das_loop)
204 .align 3
205 2:
206 mr 11,4
207 bf 27,1f
208 ld 6,0(4)
209 ld 7,8(4)
210 addi 4,4,16
211 addi 3,3,16
212 std 6,0(10)
213 std 7,8(10)
214 bf 28,L(das_loop_s)
215 ld 0,16(11)
216 addi 4,11,24
217 addi 3,10,24
218 std 0,16(10)
219 blt cr5,L(das_tail)
220 b L(das_loop)
221 .align 3
222 1:
223 nop
224 bf 28,L(das_loop_s)
225 ld 6,0(4)
226 addi 4,4,8
227 addi 3,3,8
228 std 6,0(10)
229 L(das_loop_s):
230 nop
231 blt cr5,L(das_tail)
232 .align 4
233 L(das_loop):
234 ld 6,0(4)
235 ld 7,8(4)
236 mr 10,3
237 mr 11,4
238 std 6,0(3)
239 std 7,8(3)
240 addi 12,12,-1
241 nop
242 ld 8,16(4)
243 ld 0,24(4)
244 std 8,16(3)
245 std 0,24(3)
246
247 ld 6,0+32(4)
248 ld 7,8+32(4)
249 std 6,0+32(3)
250 std 7,8+32(3)
251 ld 8,16+32(4)
252 ld 0,24+32(4)
253 std 8,16+32(3)
254 std 0,24+32(3)
255
256 ld 6,0+64(11)
257 ld 7,8+64(11)
258 std 6,0+64(10)
259 std 7,8+64(10)
260 ld 8,16+64(11)
261 ld 0,24+64(11)
262 std 8,16+64(10)
263 std 0,24+64(10)
264
265 ld 6,0+96(11)
266 ld 7,8+96(11)
267 addi 4,4,128
268 addi 3,3,128
269 std 6,0+96(10)
270 std 7,8+96(10)
271 ld 8,16+96(11)
272 ld 0,24+96(11)
273 std 8,16+96(10)
274 std 0,24+96(10)
275 ble cr5,L(das_loop_e)
276
277 mtctr 12
278 .align 4
279 L(das_loop2):
280 ld 6,0(4)
281 ld 7,8(4)
282 mr 10,3
283 mr 11,4
284 std 6,0(3)
285 std 7,8(3)
286 ld 8,16(4)
287 ld 0,24(4)
288 std 8,16(3)
289 std 0,24(3)
290
291 ld 6,0+32(4)
292 ld 7,8+32(4)
293 std 6,0+32(3)
294 std 7,8+32(3)
295 ld 8,16+32(4)
296 ld 0,24+32(4)
297 std 8,16+32(3)
298 std 0,24+32(3)
299
300 ld 6,0+64(11)
301 ld 7,8+64(11)
302 std 6,0+64(10)
303 std 7,8+64(10)
304 ld 8,16+64(11)
305 ld 0,24+64(11)
306 std 8,16+64(10)
307 std 0,24+64(10)
308
309 ld 6,0+96(11)
310 ld 7,8+96(11)
311 addi 4,4,128
312 addi 3,3,128
313 std 6,0+96(10)
314 std 7,8+96(10)
315 ld 8,16+96(11)
316 ld 0,24+96(11)
317 std 8,16+96(10)
318 std 0,24+96(10)
319 bdnz L(das_loop2)
320 L(das_loop_e):
321 /* Check of a 1-7 byte tail, return if none. */
322 bne cr1,L(das_tail2)
323 /* Return original dst pointer. */
324 ld 3,-16(1)
325 blr
326 .align 4
327 L(das_tail):
328 beq cr1,0f
329
330 L(das_tail2):
331 /* At this point we have a tail of 0-7 bytes and we know that the
332 destination is double word aligned. */
333 4: bf 29,2f
334 lwz 6,0(4)
335 stw 6,0(3)
336 bf 30,5f
337 lhz 6,4(4)
338 sth 6,4(3)
339 bf 31,0f
340 lbz 6,6(4)
341 stb 6,6(3)
342 b 0f
343 5: bf 31,0f
344 lbz 6,4(4)
345 stb 6,4(3)
346 b 0f
347
348 2: bf 30,1f
349 lhz 6,0(4)
350 sth 6,0(3)
351 bf 31,0f
352 lbz 6,2(4)
353 stb 6,2(3)
354 b 0f
355
356 1: bf 31,0f
357 lbz 6,0(4)
358 stb 6,0(3)
359 0:
360 /* Return original dst pointer. */
361 ld 3,-16(1)
362 blr
363
364 /* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31
365 bytes. Each case is handled without loops, using binary (1,2,4,8)
366 tests.
367
368 In the short (0-8 byte) case no attempt is made to force alignment
369 of either source or destination. The hardware will handle the
370 unaligned load/stores with small delays for crossing 32- 128-byte,
371 and 4096-byte boundaries. Since these short moves are unlikely to be
372 unaligned or cross these boundaries, the overhead to force
373 alignment is not justified.
374
375 The longer (9-31 byte) move is more likely to cross 32- or 128-byte
376 boundaries. Since only loads are sensitive to the 32-/128-byte
377 boundaries it is more important to align the source then the
378 destination. If the source is not already word aligned, we first
379 move 1-3 bytes as needed. Since we are only word aligned we don't
380 use double word load/stores to insure that all loads are aligned.
381 While the destination and stores may still be unaligned, this
382 is only an issue for page (4096 byte boundary) crossing, which
383 should be rare for these short moves. The hardware handles this
384 case automatically with a small (~20 cycle) delay. */
385 .align 4
386 .L2:
387 mtcrf 0x01,5
388 neg 8,4
389 clrrdi 11,4,2
390 andi. 0,8,3
391 ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
392 /* At least 9 bytes left. Get the source word aligned. */
393 cmpldi cr1,5,16
394 mr 10,5
395 mr 12,4
396 cmpldi cr6,0,2
397 beq L(dus_tail) /* If the source is already word aligned skip this. */
398 /* Copy 1-3 bytes to get source address word aligned. */
399 lwz 6,0(11)
400 subf 10,0,5
401 add 12,4,0
402 blt cr6,5f
403 srdi 7,6,16
404 bgt cr6,3f
405 sth 6,0(3)
406 b 7f
407 .align 4
408 3:
409 stb 7,0(3)
410 sth 6,1(3)
411 b 7f
412 .align 4
413 5:
414 stb 6,0(3)
415 7:
416 cmpldi cr1,10,16
417 add 3,3,0
418 mtcrf 0x01,10
419 .align 4
420 L(dus_tail):
421 /* At least 6 bytes left and the source is word aligned. This allows
422 some speculative loads up front. */
423 /* We need to special case the fall-through because the biggest delays
424 are due to address computation not being ready in time for the
425 AGEN. */
426 lwz 6,0(12)
427 lwz 7,4(12)
428 blt cr1,L(dus_tail8)
429 cmpldi cr0,10,24
430 L(dus_tail16): /* Move 16 bytes. */
431 stw 6,0(3)
432 stw 7,4(3)
433 lwz 6,8(12)
434 lwz 7,12(12)
435 stw 6,8(3)
436 stw 7,12(3)
437 /* Move 8 bytes more. */
438 bf 28,L(dus_tail16p8)
439 cmpldi cr1,10,28
440 lwz 6,16(12)
441 lwz 7,20(12)
442 stw 6,16(3)
443 stw 7,20(3)
444 /* Move 4 bytes more. */
445 bf 29,L(dus_tail16p4)
446 lwz 6,24(12)
447 stw 6,24(3)
448 addi 12,12,28
449 addi 3,3,28
450 bgt cr1,L(dus_tail2)
451 /* exactly 28 bytes. Return original dst pointer and exit. */
452 ld 3,-16(1)
453 blr
454 .align 4
455 L(dus_tail16p8): /* less then 8 bytes left. */
456 beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */
457 cmpldi cr1,10,20
458 bf 29,L(dus_tail16p2)
459 /* Move 4 bytes more. */
460 lwz 6,16(12)
461 stw 6,16(3)
462 addi 12,12,20
463 addi 3,3,20
464 bgt cr1,L(dus_tail2)
465 /* exactly 20 bytes. Return original dst pointer and exit. */
466 ld 3,-16(1)
467 blr
468 .align 4
469 L(dus_tail16p4): /* less then 4 bytes left. */
470 addi 12,12,24
471 addi 3,3,24
472 bgt cr0,L(dus_tail2)
473 /* exactly 24 bytes. Return original dst pointer and exit. */
474 ld 3,-16(1)
475 blr
476 .align 4
477 L(dus_tail16p2): /* 16 bytes moved, less then 4 bytes left. */
478 addi 12,12,16
479 addi 3,3,16
480 b L(dus_tail2)
481
482 .align 4
483 L(dus_tail8): /* Move 8 bytes. */
484 /* r6, r7 already loaded speculatively. */
485 cmpldi cr1,10,8
486 cmpldi cr0,10,12
487 bf 28,L(dus_tail4)
488 .align 2
489 stw 6,0(3)
490 stw 7,4(3)
491 /* Move 4 bytes more. */
492 bf 29,L(dus_tail8p4)
493 lwz 6,8(12)
494 stw 6,8(3)
495 addi 12,12,12
496 addi 3,3,12
497 bgt cr0,L(dus_tail2)
498 /* exactly 12 bytes. Return original dst pointer and exit. */
499 ld 3,-16(1)
500 blr
501 .align 4
502 L(dus_tail8p4): /* less then 4 bytes left. */
503 addi 12,12,8
504 addi 3,3,8
505 bgt cr1,L(dus_tail2)
506 /* exactly 8 bytes. Return original dst pointer and exit. */
507 ld 3,-16(1)
508 blr
509
510 .align 4
511 L(dus_tail4): /* Move 4 bytes. */
512 /* r6 already loaded speculatively. If we are here we know there is
513 more then 4 bytes left. So there is no need to test. */
514 addi 12,12,4
515 stw 6,0(3)
516 addi 3,3,4
517 L(dus_tail2): /* Move 2-3 bytes. */
518 bf 30,L(dus_tail1)
519 lhz 6,0(12)
520 sth 6,0(3)
521 bf 31,L(dus_tailX)
522 lbz 7,2(12)
523 stb 7,2(3)
524 ld 3,-16(1)
525 blr
526 L(dus_tail1): /* Move 1 byte. */
527 bf 31,L(dus_tailX)
528 lbz 6,0(12)
529 stb 6,0(3)
530 L(dus_tailX):
531 /* Return original dst pointer. */
532 ld 3,-16(1)
533 blr
534
535 /* Special case to copy 0-8 bytes. */
536 .align 4
537 .LE8:
538 mr 12,4
539 bne cr6,L(dus_4)
540 /* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20
541 cycle delay. This case should be rare and any attempt to avoid this
542 would take most of 20 cycles any way. */
543 ld 6,0(4)
544 std 6,0(3)
545 /* Return original dst pointer. */
546 ld 3,-16(1)
547 blr
548 .align 4
549 L(dus_4):
550 bf 29,L(dus_tail2)
551 lwz 6,0(4)
552 stw 6,0(3)
553 bf 30,L(dus_5)
554 lhz 7,4(4)
555 sth 7,4(3)
556 bf 31,L(dus_0)
557 lbz 8,6(4)
558 stb 8,6(3)
559 ld 3,-16(1)
560 blr
561 .align 4
562 L(dus_5):
563 bf 31,L(dus_0)
564 lbz 6,4(4)
565 stb 6,4(3)
566 L(dus_0):
567 /* Return original dst pointer. */
568 ld 3,-16(1)
569 blr
570
571 .align 4
572 .L6:
573 cfi_offset(31,-8)
574 mr 12,4
575 mr 31,5
576 /* Copy doublewords where the destination is aligned but the source is
577 not. Use aligned doubleword loads from the source, shifted to realign
578 the data, to allow aligned destination stores. */
579 addi 11,9,-1 /* loop DW count is one less than total */
580 subf 5,10,12 /* Move source addr to previous full double word. */
581 cmpldi cr5, 10, 2
582 cmpldi cr0, 10, 4
583 mr 4,3
584 srdi 8,11,2 /* calculate the 32 byte loop count */
585 ld 6,0(5) /* pre load 1st full doubleword. */
586 mtcrf 0x01,11
587 cmpldi cr6,9,4
588 mtctr 8
589 ld 7,8(5) /* pre load 2nd full doubleword. */
590 bge cr0, L(du4_do)
591 blt cr5, L(du1_do)
592 beq cr5, L(du2_do)
593 b L(du3_do)
594
595 .align 4
596 L(du1_do):
597 bf 30,L(du1_1dw)
598
599 /* there are at least two DWs to copy */
600 sldi 0,6, 8
601 srdi 8,7, 64-8
602 or 0,0,8
603 ld 6,16(5)
604 std 0,0(4)
605 sldi 0,7, 8
606 srdi 8,6, 64-8
607 or 0,0,8
608 ld 7,24(5)
609 std 0,8(4)
610 addi 4,4,16
611 addi 5,5,32
612 blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */
613 bf 31,L(du1_loop)
614 /* there is a third DW to copy */
615 sldi 0,6, 8
616 srdi 8,7, 64-8
617 or 0,0,8
618 std 0,0(4)
619 mr 6,7
620 ld 7,0(5)
621 addi 5,5,8
622 addi 4,4,8
623 beq cr6,L(du1_fini) /* if total DWs = 4, then bypass loop */
624 b L(du1_loop)
625 .align 4
626 L(du1_1dw):
627 sldi 0,6, 8
628 srdi 8,7, 64-8
629 addi 5,5,16
630 or 0,0,8
631 bf 31,L(du1_loop)
632 mr 6,7
633 ld 7,0(5)
634 addi 5,5,8
635 std 0,0(4)
636 addi 4,4,8
637 .align 4
638 /* copy 32 bytes at a time */
639 L(du1_loop):
640 sldi 0,6, 8
641 srdi 8,7, 64-8
642 or 0,0,8
643 ld 6,0(5)
644 std 0,0(4)
645 sldi 0,7, 8
646 srdi 8,6, 64-8
647 or 0,0,8
648 ld 7,8(5)
649 std 0,8(4)
650 sldi 0,6, 8
651 srdi 8,7, 64-8
652 or 0,0,8
653 ld 6,16(5)
654 std 0,16(4)
655 sldi 0,7, 8
656 srdi 8,6, 64-8
657 or 0,0,8
658 ld 7,24(5)
659 std 0,24(4)
660 addi 5,5,32
661 addi 4,4,32
662 bdnz+ L(du1_loop)
663 .align 4
664 L(du1_fini):
665 /* calculate and store the final DW */
666 sldi 0,6, 8
667 srdi 8,7, 64-8
668 or 0,0,8
669 std 0,0(4)
670 b L(du_done)
671
672 .align 4
673 L(du2_do):
674 bf 30,L(du2_1dw)
675
676 /* there are at least two DWs to copy */
677 sldi 0,6, 16
678 srdi 8,7, 64-16
679 or 0,0,8
680 ld 6,16(5)
681 std 0,0(4)
682 sldi 0,7, 16
683 srdi 8,6, 64-16
684 or 0,0,8
685 ld 7,24(5)
686 std 0,8(4)
687 addi 4,4,16
688 addi 5,5,32
689 blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */
690 bf 31,L(du2_loop)
691 /* there is a third DW to copy */
692 sldi 0,6, 16
693 srdi 8,7, 64-16
694 or 0,0,8
695 std 0,0(4)
696 mr 6,7
697 ld 7,0(5)
698 addi 5,5,8
699 addi 4,4,8
700 beq cr6,L(du2_fini) /* if total DWs = 4, then bypass loop */
701 b L(du2_loop)
702 .align 4
703 L(du2_1dw):
704 sldi 0,6, 16
705 srdi 8,7, 64-16
706 addi 5,5,16
707 or 0,0,8
708 bf 31,L(du2_loop)
709 mr 6,7
710 ld 7,0(5)
711 addi 5,5,8
712 std 0,0(4)
713 addi 4,4,8
714 .align 4
715 /* copy 32 bytes at a time */
716 L(du2_loop):
717 sldi 0,6, 16
718 srdi 8,7, 64-16
719 or 0,0,8
720 ld 6,0(5)
721 std 0,0(4)
722 sldi 0,7, 16
723 srdi 8,6, 64-16
724 or 0,0,8
725 ld 7,8(5)
726 std 0,8(4)
727 sldi 0,6, 16
728 srdi 8,7, 64-16
729 or 0,0,8
730 ld 6,16(5)
731 std 0,16(4)
732 sldi 0,7, 16
733 srdi 8,6, 64-16
734 or 0,0,8
735 ld 7,24(5)
736 std 0,24(4)
737 addi 5,5,32
738 addi 4,4,32
739 bdnz+ L(du2_loop)
740 .align 4
741 L(du2_fini):
742 /* calculate and store the final DW */
743 sldi 0,6, 16
744 srdi 8,7, 64-16
745 or 0,0,8
746 std 0,0(4)
747 b L(du_done)
748
749 .align 4
750 L(du3_do):
751 bf 30,L(du3_1dw)
752
753 /* there are at least two DWs to copy */
754 sldi 0,6, 24
755 srdi 8,7, 64-24
756 or 0,0,8
757 ld 6,16(5)
758 std 0,0(4)
759 sldi 0,7, 24
760 srdi 8,6, 64-24
761 or 0,0,8
762 ld 7,24(5)
763 std 0,8(4)
764 addi 4,4,16
765 addi 5,5,32
766 blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */
767 bf 31,L(du3_loop)
768 /* there is a third DW to copy */
769 sldi 0,6, 24
770 srdi 8,7, 64-24
771 or 0,0,8
772 std 0,0(4)
773 mr 6,7
774 ld 7,0(5)
775 addi 5,5,8
776 addi 4,4,8
777 beq cr6,L(du3_fini) /* if total DWs = 4, then bypass loop */
778 b L(du3_loop)
779 .align 4
780 L(du3_1dw):
781 sldi 0,6, 24
782 srdi 8,7, 64-24
783 addi 5,5,16
784 or 0,0,8
785 bf 31,L(du3_loop)
786 mr 6,7
787 ld 7,0(5)
788 addi 5,5,8
789 std 0,0(4)
790 addi 4,4,8
791 .align 4
792 /* copy 32 bytes at a time */
793 L(du3_loop):
794 sldi 0,6, 24
795 srdi 8,7, 64-24
796 or 0,0,8
797 ld 6,0(5)
798 std 0,0(4)
799 sldi 0,7, 24
800 srdi 8,6, 64-24
801 or 0,0,8
802 ld 7,8(5)
803 std 0,8(4)
804 sldi 0,6, 24
805 srdi 8,7, 64-24
806 or 0,0,8
807 ld 6,16(5)
808 std 0,16(4)
809 sldi 0,7, 24
810 srdi 8,6, 64-24
811 or 0,0,8
812 ld 7,24(5)
813 std 0,24(4)
814 addi 5,5,32
815 addi 4,4,32
816 bdnz+ L(du3_loop)
817 .align 4
818 L(du3_fini):
819 /* calculate and store the final DW */
820 sldi 0,6, 24
821 srdi 8,7, 64-24
822 or 0,0,8
823 std 0,0(4)
824 b L(du_done)
825
826 .align 4
827 L(du4_do):
828 cmpldi cr5, 10, 6
829 beq cr0, L(du4_dox)
830 blt cr5, L(du5_do)
831 beq cr5, L(du6_do)
832 b L(du7_do)
833 L(du4_dox):
834 bf 30,L(du4_1dw)
835
836 /* there are at least two DWs to copy */
837 sldi 0,6, 32
838 srdi 8,7, 64-32
839 or 0,0,8
840 ld 6,16(5)
841 std 0,0(4)
842 sldi 0,7, 32
843 srdi 8,6, 64-32
844 or 0,0,8
845 ld 7,24(5)
846 std 0,8(4)
847 addi 4,4,16
848 addi 5,5,32
849 blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */
850 bf 31,L(du4_loop)
851 /* there is a third DW to copy */
852 sldi 0,6, 32
853 srdi 8,7, 64-32
854 or 0,0,8
855 std 0,0(4)
856 mr 6,7
857 ld 7,0(5)
858 addi 5,5,8
859 addi 4,4,8
860 beq cr6,L(du4_fini) /* if total DWs = 4, then bypass loop */
861 b L(du4_loop)
862 .align 4
863 L(du4_1dw):
864 sldi 0,6, 32
865 srdi 8,7, 64-32
866 addi 5,5,16
867 or 0,0,8
868 bf 31,L(du4_loop)
869 mr 6,7
870 ld 7,0(5)
871 addi 5,5,8
872 std 0,0(4)
873 addi 4,4,8
874 .align 4
875 /* copy 32 bytes at a time */
876 L(du4_loop):
877 sldi 0,6, 32
878 srdi 8,7, 64-32
879 or 0,0,8
880 ld 6,0(5)
881 std 0,0(4)
882 sldi 0,7, 32
883 srdi 8,6, 64-32
884 or 0,0,8
885 ld 7,8(5)
886 std 0,8(4)
887 sldi 0,6, 32
888 srdi 8,7, 64-32
889 or 0,0,8
890 ld 6,16(5)
891 std 0,16(4)
892 sldi 0,7, 32
893 srdi 8,6, 64-32
894 or 0,0,8
895 ld 7,24(5)
896 std 0,24(4)
897 addi 5,5,32
898 addi 4,4,32
899 bdnz+ L(du4_loop)
900 .align 4
901 L(du4_fini):
902 /* calculate and store the final DW */
903 sldi 0,6, 32
904 srdi 8,7, 64-32
905 or 0,0,8
906 std 0,0(4)
907 b L(du_done)
908
909 .align 4
910 L(du5_do):
911 bf 30,L(du5_1dw)
912
913 /* there are at least two DWs to copy */
914 sldi 0,6, 40
915 srdi 8,7, 64-40
916 or 0,0,8
917 ld 6,16(5)
918 std 0,0(4)
919 sldi 0,7, 40
920 srdi 8,6, 64-40
921 or 0,0,8
922 ld 7,24(5)
923 std 0,8(4)
924 addi 4,4,16
925 addi 5,5,32
926 blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */
927 bf 31,L(du5_loop)
928 /* there is a third DW to copy */
929 sldi 0,6, 40
930 srdi 8,7, 64-40
931 or 0,0,8
932 std 0,0(4)
933 mr 6,7
934 ld 7,0(5)
935 addi 5,5,8
936 addi 4,4,8
937 beq cr6,L(du5_fini) /* if total DWs = 4, then bypass loop */
938 b L(du5_loop)
939 .align 4
940 L(du5_1dw):
941 sldi 0,6, 40
942 srdi 8,7, 64-40
943 addi 5,5,16
944 or 0,0,8
945 bf 31,L(du5_loop)
946 mr 6,7
947 ld 7,0(5)
948 addi 5,5,8
949 std 0,0(4)
950 addi 4,4,8
951 .align 4
952 /* copy 32 bytes at a time */
953 L(du5_loop):
954 sldi 0,6, 40
955 srdi 8,7, 64-40
956 or 0,0,8
957 ld 6,0(5)
958 std 0,0(4)
959 sldi 0,7, 40
960 srdi 8,6, 64-40
961 or 0,0,8
962 ld 7,8(5)
963 std 0,8(4)
964 sldi 0,6, 40
965 srdi 8,7, 64-40
966 or 0,0,8
967 ld 6,16(5)
968 std 0,16(4)
969 sldi 0,7, 40
970 srdi 8,6, 64-40
971 or 0,0,8
972 ld 7,24(5)
973 std 0,24(4)
974 addi 5,5,32
975 addi 4,4,32
976 bdnz+ L(du5_loop)
977 .align 4
978 L(du5_fini):
979 /* calculate and store the final DW */
980 sldi 0,6, 40
981 srdi 8,7, 64-40
982 or 0,0,8
983 std 0,0(4)
984 b L(du_done)
985
986 .align 4
987 L(du6_do):
988 bf 30,L(du6_1dw)
989
990 /* there are at least two DWs to copy */
991 sldi 0,6, 48
992 srdi 8,7, 64-48
993 or 0,0,8
994 ld 6,16(5)
995 std 0,0(4)
996 sldi 0,7, 48
997 srdi 8,6, 64-48
998 or 0,0,8
999 ld 7,24(5)
1000 std 0,8(4)
1001 addi 4,4,16
1002 addi 5,5,32
1003 blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */
1004 bf 31,L(du6_loop)
1005 /* there is a third DW to copy */
1006 sldi 0,6, 48
1007 srdi 8,7, 64-48
1008 or 0,0,8
1009 std 0,0(4)
1010 mr 6,7
1011 ld 7,0(5)
1012 addi 5,5,8
1013 addi 4,4,8
1014 beq cr6,L(du6_fini) /* if total DWs = 4, then bypass loop */
1015 b L(du6_loop)
1016 .align 4
1017 L(du6_1dw):
1018 sldi 0,6, 48
1019 srdi 8,7, 64-48
1020 addi 5,5,16
1021 or 0,0,8
1022 bf 31,L(du6_loop)
1023 mr 6,7
1024 ld 7,0(5)
1025 addi 5,5,8
1026 std 0,0(4)
1027 addi 4,4,8
1028 .align 4
1029 /* copy 32 bytes at a time */
1030 L(du6_loop):
1031 sldi 0,6, 48
1032 srdi 8,7, 64-48
1033 or 0,0,8
1034 ld 6,0(5)
1035 std 0,0(4)
1036 sldi 0,7, 48
1037 srdi 8,6, 64-48
1038 or 0,0,8
1039 ld 7,8(5)
1040 std 0,8(4)
1041 sldi 0,6, 48
1042 srdi 8,7, 64-48
1043 or 0,0,8
1044 ld 6,16(5)
1045 std 0,16(4)
1046 sldi 0,7, 48
1047 srdi 8,6, 64-48
1048 or 0,0,8
1049 ld 7,24(5)
1050 std 0,24(4)
1051 addi 5,5,32
1052 addi 4,4,32
1053 bdnz+ L(du6_loop)
1054 .align 4
1055 L(du6_fini):
1056 /* calculate and store the final DW */
1057 sldi 0,6, 48
1058 srdi 8,7, 64-48
1059 or 0,0,8
1060 std 0,0(4)
1061 b L(du_done)
1062
1063 .align 4
1064 L(du7_do):
1065 bf 30,L(du7_1dw)
1066
1067 /* there are at least two DWs to copy */
1068 sldi 0,6, 56
1069 srdi 8,7, 64-56
1070 or 0,0,8
1071 ld 6,16(5)
1072 std 0,0(4)
1073 sldi 0,7, 56
1074 srdi 8,6, 64-56
1075 or 0,0,8
1076 ld 7,24(5)
1077 std 0,8(4)
1078 addi 4,4,16
1079 addi 5,5,32
1080 blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */
1081 bf 31,L(du7_loop)
1082 /* there is a third DW to copy */
1083 sldi 0,6, 56
1084 srdi 8,7, 64-56
1085 or 0,0,8
1086 std 0,0(4)
1087 mr 6,7
1088 ld 7,0(5)
1089 addi 5,5,8
1090 addi 4,4,8
1091 beq cr6,L(du7_fini) /* if total DWs = 4, then bypass loop */
1092 b L(du7_loop)
1093 .align 4
1094 L(du7_1dw):
1095 sldi 0,6, 56
1096 srdi 8,7, 64-56
1097 addi 5,5,16
1098 or 0,0,8
1099 bf 31,L(du7_loop)
1100 mr 6,7
1101 ld 7,0(5)
1102 addi 5,5,8
1103 std 0,0(4)
1104 addi 4,4,8
1105 .align 4
1106 /* copy 32 bytes at a time */
1107 L(du7_loop):
1108 sldi 0,6, 56
1109 srdi 8,7, 64-56
1110 or 0,0,8
1111 ld 6,0(5)
1112 std 0,0(4)
1113 sldi 0,7, 56
1114 srdi 8,6, 64-56
1115 or 0,0,8
1116 ld 7,8(5)
1117 std 0,8(4)
1118 sldi 0,6, 56
1119 srdi 8,7, 64-56
1120 or 0,0,8
1121 ld 6,16(5)
1122 std 0,16(4)
1123 sldi 0,7, 56
1124 srdi 8,6, 64-56
1125 or 0,0,8
1126 ld 7,24(5)
1127 std 0,24(4)
1128 addi 5,5,32
1129 addi 4,4,32
1130 bdnz+ L(du7_loop)
1131 .align 4
1132 L(du7_fini):
1133 /* calculate and store the final DW */
1134 sldi 0,6, 56
1135 srdi 8,7, 64-56
1136 or 0,0,8
1137 std 0,0(4)
1138 b L(du_done)
1139
1140 .align 4
1141 L(du_done):
1142 rldicr 0,31,0,60
1143 mtcrf 0x01,31
1144 beq cr1,0f /* If the tail is 0 bytes we are done! */
1145
1146 add 3,3,0
1147 add 12,12,0
1148 /* At this point we have a tail of 0-7 bytes and we know that the
1149 destination is double word aligned. */
1150 4: bf 29,2f
1151 lwz 6,0(12)
1152 addi 12,12,4
1153 stw 6,0(3)
1154 addi 3,3,4
1155 2: bf 30,1f
1156 lhz 6,0(12)
1157 addi 12,12,2
1158 sth 6,0(3)
1159 addi 3,3,2
1160 1: bf 31,0f
1161 lbz 6,0(12)
1162 stb 6,0(3)
1163 0:
1164 /* Return original dst pointer. */
1165 ld 31,-8(1)
1166 ld 3,-16(1)
1167 blr
1168 END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
1169 libc_hidden_builtin_def (memcpy)