/* sysdeps/powerpc/powerpc64/power6/memcpy.S
   (glibc source; snapshot taken after "Remove powerpc64 bounded-pointers
   code".)  */
/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using a binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.

   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
   L1 cache miss that crosses a 32- or 128-byte boundary.  Store
   is more forgiving and does not take a hiccup until page or
   segment boundaries.  So we require doubleword alignment for
   the source but may take a risk and only require word alignment
   for the destination.

   Register usage (as established by the code below):
     r3  - dst (and return value; original dst is spilled to -16(r1)
           and reloaded on every exit path)
     r4  - src
     r5  - len; in the unaligned path it is retargeted to the
           rounded-down source address
     r0, r6-r8 - scratch / data registers
     r9  - number of full doublewords to move (len >> 3)
     r10,r11 - alignment of src/dst on entry; later shadow copies of
           dst/src used to break address-generation dependencies
     r12 - number of 128-byte blocks (aligned path) or a copy of src
           (short and unaligned paths)
     r31 - saved len for the unaligned (.L6) path; spilled to -8(r1).  */

	.machine "power6"
EALIGN (memcpy, 7, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	mtcrf	0x01,0
	cmpld	cr6,10,11	/* cr6 := (src align == dst align)?  */
	srdi	9,5,3		/* Number of full double words remaining.  */
	beq	.L0		/* dst already DW aligned.  */

	subf	5,0,5		/* len -= bytes needed to align dst.  */
/* Move 0-7 bytes as needed to get the destination doubleword aligned.
   Duplicate some code to maximize fall-through and minimize agen delays.
   CR bits 29/30/31 (set from r0 by the mtcrf above) select the
   4/2/1-byte pieces.  */
1:	bf	31,2f
	lbz	6,0(4)
	stb	6,0(3)
	bf	30,5f
	lhz	6,1(4)
	sth	6,1(3)
	bf	29,0f
	lwz	6,3(4)
	stw	6,3(3)
	b	0f
5:
	bf	29,0f
	lwz	6,1(4)
	stw	6,1(3)
	b	0f

2:	bf	30,4f
	lhz	6,0(4)
	sth	6,0(3)
	bf	29,0f
	lwz	6,2(4)
	stw	6,2(3)
	b	0f

4:	bf	29,0f
	lwz	6,0(4)
	stw	6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
	add	4,4,0
	add	3,3,0

	clrldi	10,4,61		/* check alignment of src again.  */
	srdi	9,5,3		/* Number of full double words remaining.  */

/* Copy doublewords from source to destination, assuming the
   destination is aligned on a doubleword boundary.

   At this point we know there are at least 25 bytes left (32-7) to copy.
   The next step is to determine if the source is also doubleword aligned.
   If not branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are doubleword aligned, and we can
   use the optimized doubleword copy loop.  */
	.align	4
.L0:
	clrldi	11,5,61		/* r11 = tail length (len mod 8).  */
	andi.	0,5,0x78	/* r0 = leftover DW bytes below 128.  */
	srdi	12,5,7		/* Number of 128-byte blocks to move.  */
	cmpldi	cr1,11,0	/* If the tail is 0 bytes.  */
	bne-	cr6,.L6		/* If source is not DW aligned.  */

/* Move doublewords where destination and source are DW aligned.
   Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
   If the copy is not an exact multiple of 128 bytes, 1-15
   doublewords are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-7 bytes.  These bytes
   are copied a word/halfword/byte at a time as needed to preserve
   alignment.

   For POWER6 the L1 is store-through and the L2 is store-in.  The
   L2 is clocked at half CPU clock so we can store 16 bytes every
   other cycle.  POWER6 also has a load/store bypass so we can do
   load, load, store, store every 2 cycles.

   The following code is sensitive to cache line alignment.  Do not
   make any change without first making sure they don't result in
   splitting ld/std pairs across a cache line.  */

	mtcrf	0x02,5		/* CR bits 25-28 select 64/32/16/8-byte setup moves.  */
	mtcrf	0x01,5
	cmpldi	cr5,12,1	/* cr5: is there more than one 128-byte block?  */
	beq	L(das_loop)	/* Exact multiple of 128 bytes: straight to loop.  */

	bf	25,4f		/* No 64-byte piece?  */
	.align	3
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4		/* Shadow src/dst so later addresses are ready early.  */
	mr	10,3
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	std	6,16(3)
	std	7,24(3)
	ld	6,0+32(4)
	ld	7,8+32(4)
	addi	4,4,64
	addi	3,3,64
	std	6,0+32(10)
	std	7,8+32(10)
	ld	6,16+32(11)
	ld	7,24+32(11)
	std	6,16+32(10)
	std	7,24+32(10)
4:
	mr	10,3
	bf	26,2f		/* No 32-byte piece?  */
	ld	6,0(4)
	ld	7,8(4)
	mr	11,4
	nop
	std	6,0(3)
	std	7,8(3)
	ld	6,16(4)
	ld	7,24(4)
	addi	4,4,32
	std	6,16(3)
	std	7,24(3)
	addi	3,3,32
6:
	nop
	bf	27,5f		/* No 16-byte piece?  */
	ld	6,0+32(11)
	ld	7,8+32(11)
	addi	4,4,16
	addi	3,3,16
	std	6,0+32(10)
	std	7,8+32(10)
	bf	28,L(das_loop_s)
	ld	0,16+32(11)
	addi	4,4,8
	addi	3,3,8
	std	0,16+32(10)
	blt	cr5,L(das_tail)	/* Fewer than one more 128-byte block.  */
	b	L(das_loop)
	.align	3
5:
	nop
	bf	28,L(das_loop_s)
	ld	6,32(11)
	addi	4,4,8
	addi	3,3,8
	std	6,32(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
2:
	mr	11,4
	bf	27,1f
	ld	6,0(4)
	ld	7,8(4)
	addi	4,4,16
	addi	3,3,16
	std	6,0(10)
	std	7,8(10)
	bf	28,L(das_loop_s)
	ld	0,16(11)
	addi	4,11,24
	addi	3,10,24
	std	0,16(10)
	blt	cr5,L(das_tail)
	b	L(das_loop)
	.align	3
1:
	nop
	bf	28,L(das_loop_s)
	ld	6,0(4)
	addi	4,4,8
	addi	3,3,8
	std	6,0(10)
L(das_loop_s):
	nop
	blt	cr5,L(das_tail)
	.align	4
/* First 128-byte block: also decrements the block count (r12) so the
   ctr-driven L(das_loop2) below runs the remaining blocks.  */
L(das_loop):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3		/* Shadow dst/src for the second half of the block.  */
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	addi	12,12,-1
	nop
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	ble	cr5,L(das_loop_e)	/* Only one block: skip the ctr loop.  */

	mtctr	12
	.align	4
L(das_loop2):
	ld	6,0(4)
	ld	7,8(4)
	mr	10,3
	mr	11,4
	std	6,0(3)
	std	7,8(3)
	ld	8,16(4)
	ld	0,24(4)
	std	8,16(3)
	std	0,24(3)

	ld	6,0+32(4)
	ld	7,8+32(4)
	std	6,0+32(3)
	std	7,8+32(3)
	ld	8,16+32(4)
	ld	0,24+32(4)
	std	8,16+32(3)
	std	0,24+32(3)

	ld	6,0+64(11)
	ld	7,8+64(11)
	std	6,0+64(10)
	std	7,8+64(10)
	ld	8,16+64(11)
	ld	0,24+64(11)
	std	8,16+64(10)
	std	0,24+64(10)

	ld	6,0+96(11)
	ld	7,8+96(11)
	addi	4,4,128
	addi	3,3,128
	std	6,0+96(10)
	std	7,8+96(10)
	ld	8,16+96(11)
	ld	0,24+96(11)
	std	8,16+96(10)
	std	0,24+96(10)
	bdnz	L(das_loop2)
L(das_loop_e):
/* Check for a 1-7 byte tail, return if none.  */
	bne	cr1,L(das_tail2)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(das_tail):
	beq	cr1,0f		/* No tail: fall through to return.  */

L(das_tail2):
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  CR bits 29/30/31 (from len)
   select the 4/2/1-byte pieces.  */
4:	bf	29,2f
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	6,4(4)
	sth	6,4(3)
	bf	31,0f
	lbz	6,6(4)
	stb	6,6(3)
	b	0f
5:	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	b	0f

2:	bf	30,1f
	lhz	6,0(4)
	sth	6,0(3)
	bf	31,0f
	lbz	6,2(4)
	stb	6,2(3)
	b	0f

1:	bf	31,0f
	lbz	6,0(4)
	stb	6,0(3)
0:
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr

/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 128-byte,
   and 4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
	.align	4
.L2:
	mtcrf	0x01,5
	neg	8,4
	clrrdi	11,4,2		/* r11 = src rounded down to word boundary.  */
	andi.	0,8,3		/* r0 = bytes to reach word alignment of src.  */
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5		/* r10 = remaining length.  */
	mr	12,4		/* r12 = working src pointer.  */
	cmpldi	cr6,0,2
	beq	L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   load below fetches all the bytes we need; cr6 selects how many.  */
	lwz	6,0(11)
	subf	10,0,5
	add	12,4,0
	blt	cr6,5f		/* 1 byte.  */
	srdi	7,6,16
	bgt	cr6,3f		/* 3 bytes.  */
	sth	6,0(3)		/* 2 bytes.  */
	b	7f
	.align	4
3:
	stb	7,0(3)
	sth	6,1(3)
	b	7f
	.align	4
5:
	stb	6,0(3)
7:
	cmpldi	cr1,10,16
	add	3,3,0
	mtcrf	0x01,10
	.align	4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz	6,0(12)
	lwz	7,4(12)
	blt	cr1,L(dus_tail8)
	cmpldi	cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
	stw	6,0(3)
	stw	7,4(3)
	lwz	6,8(12)
	lwz	7,12(12)
	stw	6,8(3)
	stw	7,12(3)
/* Move 8 bytes more.  */
	bf	28,L(dus_tail16p8)
	cmpldi	cr1,10,28
	lwz	6,16(12)
	lwz	7,20(12)
	stw	6,16(3)
	stw	7,20(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail16p4)
	lwz	6,24(12)
	stw	6,24(3)
	addi	12,12,28
	addi	3,3,28
	bgt	cr1,L(dus_tail2)
/* exactly 28 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p8):  /* less than 8 bytes left.  */
	beq	cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
	cmpldi	cr1,10,20
	bf	29,L(dus_tail16p2)
/* Move 4 bytes more.  */
	lwz	6,16(12)
	stw	6,16(3)
	addi	12,12,20
	addi	3,3,20
	bgt	cr1,L(dus_tail2)
/* exactly 20 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p4):  /* less than 4 bytes left.  */
	addi	12,12,24
	addi	3,3,24
	bgt	cr0,L(dus_tail2)
/* exactly 24 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
	addi	12,12,16
	addi	3,3,16
	b	L(dus_tail2)

	.align	4
L(dus_tail8):  /* Move 8 bytes.  */
/* r6, r7 already loaded speculatively.  */
	cmpldi	cr1,10,8
	cmpldi	cr0,10,12
	bf	28,L(dus_tail4)
	.align	2
	stw	6,0(3)
	stw	7,4(3)
/* Move 4 bytes more.  */
	bf	29,L(dus_tail8p4)
	lwz	6,8(12)
	stw	6,8(3)
	addi	12,12,12
	addi	3,3,12
	bgt	cr0,L(dus_tail2)
/* exactly 12 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_tail8p4):  /* less than 4 bytes left.  */
	addi	12,12,8
	addi	3,3,8
	bgt	cr1,L(dus_tail2)
/* exactly 8 bytes.  Return original dst pointer and exit.  */
	ld	3,-16(1)
	blr

	.align	4
L(dus_tail4):  /* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there is
   more than 4 bytes left.  So there is no need to test.  */
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
	bf	30,L(dus_tail1)
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,L(dus_tailX)
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
L(dus_tail1):  /* Move 1 byte.  */
	bf	31,L(dus_tailX)
	lbz	6,0(12)
	stb	6,0(3)
L(dus_tailX):
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr

/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,L(dus_4)	/* Not exactly 8 bytes (cr6 from cmpldi cr6,5,8).  */
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles anyway.  */
	ld	6,0(4)
	std	6,0(3)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
L(dus_4):
	bf	29,L(dus_tail2)
	lwz	6,0(4)
	stw	6,0(3)
	bf	30,L(dus_5)
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,L(dus_0)
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
L(dus_5):
	bf	31,L(dus_0)
	lbz	6,4(4)
	stb	6,4(3)
L(dus_0):
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr

	.align	4
.L6:
	cfi_offset(31,-8)
	mr	12,4		/* r12 = original (unaligned) src.  */
	mr	31,5		/* r31 = len, survives to L(du_done).  */
/* Copy doublewords where the destination is aligned but the source is
   not.  Use aligned doubleword loads from the source, shifted to realign
   the data, to allow aligned destination stores.  One of seven shift
   variants (du1..du7) is selected by the source misalignment in r10.  */
	addi	11,9,-1	/* loop DW count is one less than total */
	subf	5,10,12	/* Move source addr to previous full double word.  */
	cmpldi	cr5, 10, 2
	cmpldi	cr0, 10, 4
	mr	4,3		/* r4 becomes the store (dst) pointer.  */
	srdi	8,11,2	/* calculate the 32 byte loop count */
	ld	6,0(5)	/* pre load 1st full doubleword.  */
	mtcrf	0x01,11
	cmpldi	cr6,9,4
	mtctr	8
	ld	7,8(5)	/* pre load 2nd full doubleword.  */
	bge	cr0, L(du4_do)
	blt	cr5, L(du1_do)
	beq	cr5, L(du2_do)
	b	L(du3_do)

	.align	4
L(du1_do):
	bf	30,L(du1_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 8
	srdi	8,7, 64-8
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 8
	srdi	8,6, 64-8
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du1_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du1_loop)
	/* there is a third DW to copy */
	sldi	0,6, 8
	srdi	8,7, 64-8
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du1_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du1_loop)
	.align	4
L(du1_1dw):
	sldi	0,6, 8
	srdi	8,7, 64-8
	addi	5,5,16
	or	0,0,8
	bf	31,L(du1_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du1_loop):
	sldi	0,6, 8
	srdi	8,7, 64-8
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 8
	srdi	8,6, 64-8
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 8
	srdi	8,7, 64-8
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 8
	srdi	8,6, 64-8
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du1_loop)
	.align	4
L(du1_fini):
	/* calculate and store the final DW */
	sldi	0,6, 8
	srdi	8,7, 64-8
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du2_do):
	bf	30,L(du2_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 16
	srdi	8,7, 64-16
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 16
	srdi	8,6, 64-16
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du2_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du2_loop)
	/* there is a third DW to copy */
	sldi	0,6, 16
	srdi	8,7, 64-16
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du2_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du2_loop)
	.align	4
L(du2_1dw):
	sldi	0,6, 16
	srdi	8,7, 64-16
	addi	5,5,16
	or	0,0,8
	bf	31,L(du2_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du2_loop):
	sldi	0,6, 16
	srdi	8,7, 64-16
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 16
	srdi	8,6, 64-16
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 16
	srdi	8,7, 64-16
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 16
	srdi	8,6, 64-16
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du2_loop)
	.align	4
L(du2_fini):
	/* calculate and store the final DW */
	sldi	0,6, 16
	srdi	8,7, 64-16
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du3_do):
	bf	30,L(du3_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 24
	srdi	8,7, 64-24
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 24
	srdi	8,6, 64-24
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du3_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du3_loop)
	/* there is a third DW to copy */
	sldi	0,6, 24
	srdi	8,7, 64-24
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du3_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du3_loop)
	.align	4
L(du3_1dw):
	sldi	0,6, 24
	srdi	8,7, 64-24
	addi	5,5,16
	or	0,0,8
	bf	31,L(du3_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du3_loop):
	sldi	0,6, 24
	srdi	8,7, 64-24
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 24
	srdi	8,6, 64-24
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 24
	srdi	8,7, 64-24
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 24
	srdi	8,6, 64-24
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du3_loop)
	.align	4
L(du3_fini):
	/* calculate and store the final DW */
	sldi	0,6, 24
	srdi	8,7, 64-24
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du4_do):
	/* Second-level dispatch for misalignments 4-7.  */
	cmpldi	cr5, 10, 6
	beq	cr0, L(du4_dox)
	blt	cr5, L(du5_do)
	beq	cr5, L(du6_do)
	b	L(du7_do)
L(du4_dox):
	bf	30,L(du4_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 32
	srdi	8,7, 64-32
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 32
	srdi	8,6, 64-32
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du4_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du4_loop)
	/* there is a third DW to copy */
	sldi	0,6, 32
	srdi	8,7, 64-32
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du4_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du4_loop)
	.align	4
L(du4_1dw):
	sldi	0,6, 32
	srdi	8,7, 64-32
	addi	5,5,16
	or	0,0,8
	bf	31,L(du4_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du4_loop):
	sldi	0,6, 32
	srdi	8,7, 64-32
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 32
	srdi	8,6, 64-32
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 32
	srdi	8,7, 64-32
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 32
	srdi	8,6, 64-32
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du4_loop)
	.align	4
L(du4_fini):
	/* calculate and store the final DW */
	sldi	0,6, 32
	srdi	8,7, 64-32
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du5_do):
	bf	30,L(du5_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 40
	srdi	8,7, 64-40
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 40
	srdi	8,6, 64-40
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du5_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du5_loop)
	/* there is a third DW to copy */
	sldi	0,6, 40
	srdi	8,7, 64-40
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du5_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du5_loop)
	.align	4
L(du5_1dw):
	sldi	0,6, 40
	srdi	8,7, 64-40
	addi	5,5,16
	or	0,0,8
	bf	31,L(du5_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du5_loop):
	sldi	0,6, 40
	srdi	8,7, 64-40
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 40
	srdi	8,6, 64-40
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 40
	srdi	8,7, 64-40
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 40
	srdi	8,6, 64-40
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du5_loop)
	.align	4
L(du5_fini):
	/* calculate and store the final DW */
	sldi	0,6, 40
	srdi	8,7, 64-40
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du6_do):
	bf	30,L(du6_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 48
	srdi	8,7, 64-48
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 48
	srdi	8,6, 64-48
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du6_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du6_loop)
	/* there is a third DW to copy */
	sldi	0,6, 48
	srdi	8,7, 64-48
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du6_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du6_loop)
	.align	4
L(du6_1dw):
	sldi	0,6, 48
	srdi	8,7, 64-48
	addi	5,5,16
	or	0,0,8
	bf	31,L(du6_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du6_loop):
	sldi	0,6, 48
	srdi	8,7, 64-48
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 48
	srdi	8,6, 64-48
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 48
	srdi	8,7, 64-48
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 48
	srdi	8,6, 64-48
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du6_loop)
	.align	4
L(du6_fini):
	/* calculate and store the final DW */
	sldi	0,6, 48
	srdi	8,7, 64-48
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du7_do):
	bf	30,L(du7_1dw)

	/* there are at least two DWs to copy */
	sldi	0,6, 56
	srdi	8,7, 64-56
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
	sldi	0,7, 56
	srdi	8,6, 64-56
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,L(du7_fini)	/* if total DWs = 3, then bypass loop */
	bf	31,L(du7_loop)
	/* there is a third DW to copy */
	sldi	0,6, 56
	srdi	8,7, 64-56
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,L(du7_fini)	/* if total DWs = 4, then bypass loop */
	b	L(du7_loop)
	.align	4
L(du7_1dw):
	sldi	0,6, 56
	srdi	8,7, 64-56
	addi	5,5,16
	or	0,0,8
	bf	31,L(du7_loop)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
L(du7_loop):
	sldi	0,6, 56
	srdi	8,7, 64-56
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
	sldi	0,7, 56
	srdi	8,6, 64-56
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
	sldi	0,6, 56
	srdi	8,7, 64-56
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
	sldi	0,7, 56
	srdi	8,6, 64-56
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	L(du7_loop)
	.align	4
L(du7_fini):
	/* calculate and store the final DW */
	sldi	0,6, 56
	srdi	8,7, 64-56
	or	0,0,8
	std	0,0(4)
	b	L(du_done)

	.align	4
L(du_done):
	rldicr	0,31,0,60	/* r0 = len rounded down to a DW multiple.  */
	mtcrf	0x01,31
	beq	cr1,0f		/* If the tail is 0 bytes we are done!  */

	add	3,3,0
	add	12,12,0
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is double word aligned.  */
4:	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
/* Restore r31 and return original dst pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	blr
END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)