]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/powerpc/powerpc64/power6/memcpy.S
9711810caf78933cf08906d18730c41867cefe43
[thirdparty/glibc.git] / sysdeps / powerpc / powerpc64 / power6 / memcpy.S
1 /* Optimized memcpy implementation for PowerPC64.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
22 Returns 'dst'.
23
24 Memcpy handles short copies (< 32-bytes) using a binary move blocks
25 (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled
26 with the appropriate combination of byte and halfword load/stores.
27 There is minimal effort to optimize the alignment of short moves.
28 The 64-bit implementations of POWER3 and POWER4 do a reasonable job
29 of handling unaligned load/stores that do not cross 32-byte boundaries.
30
31 Longer moves (>= 32-bytes) justify the effort to get at least the
32 destination doubleword (8-byte) aligned. Further optimization is
33 possible when both source and destination are doubleword aligned.
34    Each case has an optimized unrolled loop.
35
36 For POWER6 unaligned loads will take a 20+ cycle hiccup for any
37 L1 cache miss that crosses a 32- or 128-byte boundary. Store
38 is more forgiving and does not take a hiccup until page or
39 segment boundaries. So we require doubleword alignment for
40 the source but may take a risk and only require word alignment
41 for the destination. */
42
43         .machine "power6"
44 EALIGN (memcpy, 7, 0)
45         CALL_MCOUNT 3
/* Entry: r3 = dst, r4 = src, r5 = len.  The original dst is stashed at
   -16(r1) so every exit path can reload it as the return value.
   cr1 = len vs 31 (short-move cutoff), cr6 = len vs 8 (exactly-8 case).  */
46
47         cmpldi cr1,5,31
48         neg   0,3
49         std   3,-16(1)
50         std   31,-8(1)          /* Save callee-saved r31 (used by the unaligned path .L6). */
51         andi. 11,3,7        /* check alignment of dst.  */
52         clrldi        0,0,61  /* Number of bytes until the 1st doubleword of dst.  */
53         clrldi  10,4,61       /* check alignment of src.  */
54         cmpldi        cr6,5,8
55         ble-  cr1,.L2 /* If move < 32 bytes use short move code.  */
56         mtcrf 0x01,0
57         cmpld cr6,10,11
58         srdi  9,5,3           /* Number of full double words remaining.  */
59         beq   .L0
60
61         subf  5,0,5
62   /* Move 0-7 bytes as needed to get the destination doubleword aligned.
63      Duplicate some code to maximize fall-through and minimize agen delays.  */
64 1:      bf    31,2f
65         lbz   6,0(4)
66         stb   6,0(3)
67         bf    30,5f
68         lhz   6,1(4)
69         sth   6,1(3)
70         bf    29,0f
71         lwz   6,3(4)
72         stw   6,3(3)
73         b     0f
74 5:
75         bf    29,0f
76         lwz   6,1(4)
77         stw   6,1(3)
78         b     0f
79
80 2:      bf    30,4f
81         lhz   6,0(4)
82         sth   6,0(3)
83         bf    29,0f
84         lwz   6,2(4)
85         stw   6,2(3)
86         b     0f
87
88 4:      bf    29,0f
89         lwz   6,0(4)
90         stw   6,0(3)
91 0:
92 /* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
93         add   4,4,0
94         add   3,3,0
95
96         clrldi        10,4,61 /* check alignment of src again.  */
97         srdi  9,5,3   /* Number of full double words remaining.  */
99   /* Copy doublewords from source to destination, assuming the
100      destination is aligned on a doubleword boundary.
101
102      At this point we know there are at least 25 bytes left (32-7) to copy.
103      The next step is to determine if the source is also doubleword aligned.
104      If not branch to the unaligned move code at .L6. which uses
105      a load, shift, store strategy.
106
107      Otherwise source and destination are doubleword aligned, and we can
108      use the optimized doubleword copy loop.  */
109         .align  4
110 .L0:
111         clrldi  11,5,61
112         andi.   0,5,0x78
113         srdi    12,5,7    /* Number of 128-byte blocks to move.  */
114         cmpldi  cr1,11,0  /* If the tail is 0 bytes  */
115         bne-    cr6,.L6   /* If source is not DW aligned.  */
116
117   /* Move doublewords where destination and source are DW aligned.
118      Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration.
119      If the copy is not an exact multiple of 128 bytes, 1-15
120      doublewords are copied as needed to set up the main loop.  After
121      the main loop exits there may be a tail of 1-7 bytes.  These bytes
122      are copied a word/halfword/byte at a time as needed to preserve
123      alignment.
124
125      For POWER6 the L1 is store-through and the L2 is store-in.  The
126      L2 is clocked at half CPU clock so we can store 16 bytes every
127      other cycle.  POWER6 also has a load/store bypass so we can do
128      load, load, store, store every 2 cycles.
129
130      The following code is sensitive to cache line alignment.  Do not
131      make any change without first making sure they don't result in
132      splitting ld/std pairs across a cache line.  */
133
134         mtcrf 0x02,5
135         mtcrf 0x01,5
136         cmpldi  cr5,12,1
137         beq   L(das_loop)
138
139         bf    25,4f
140         .align  3
141         ld    6,0(4)
142         ld    7,8(4)
143         mr    11,4
144         mr    10,3
145         std   6,0(3)
146         std   7,8(3)
147         ld    6,16(4)
148         ld    7,24(4)
149         std   6,16(3)
150         std   7,24(3)
151         ld    6,0+32(4)
152         ld    7,8+32(4)
153         addi  4,4,64
154         addi  3,3,64
155         std   6,0+32(10)
156         std   7,8+32(10)
157         ld    6,16+32(11)
158         ld    7,24+32(11)
159         std   6,16+32(10)
160         std   7,24+32(10)
161 4:
162         mr    10,3
163         bf    26,2f
164         ld    6,0(4)
165         ld    7,8(4)
166         mr    11,4
167         nop
168         std   6,0(3)
169         std   7,8(3)
170         ld    6,16(4)
171         ld    7,24(4)
172         addi  4,4,32
173         std   6,16(3)
174         std   7,24(3)
175         addi  3,3,32
176 6:
177         nop
178         bf    27,5f
179         ld    6,0+32(11)
180         ld    7,8+32(11)
181         addi  4,4,16
182         addi  3,3,16
183         std   6,0+32(10)
184         std   7,8+32(10)
185         bf    28,L(das_loop_s)
186         ld    0,16+32(11)
187         addi  4,4,8
188         addi  3,3,8
189         std   0,16+32(10)
190         blt   cr5,L(das_tail)
191         b     L(das_loop)
192         .align  3
193 5:
194         nop
195         bf    28,L(das_loop_s)
196         ld    6,32(11)
197         addi  4,4,8
198         addi  3,3,8
199         std   6,32(10)
200         blt   cr5,L(das_tail)
201         b     L(das_loop)
202         .align  3
203 2:
204         mr    11,4
205         bf    27,1f
206         ld    6,0(4)
207         ld    7,8(4)
208         addi  4,4,16
209         addi  3,3,16
210         std   6,0(10)
211         std   7,8(10)
212         bf    28,L(das_loop_s)
213         ld    0,16(11)
214         addi  4,11,24
215         addi  3,10,24
216         std   0,16(10)
217         blt   cr5,L(das_tail)
218         b     L(das_loop)
219         .align  3
220 1:
221         nop
222         bf    28,L(das_loop_s)
223         ld    6,0(4)
224         addi  4,4,8
225         addi  3,3,8
226         std   6,0(10)
227 L(das_loop_s):
228         nop
229         blt   cr5,L(das_tail)
230         .align  4
/* Main DW-aligned copy loop: 128 bytes (16 doublewords) per iteration,
   first iteration unrolled here, remaining iterations counted in CTR
   (r12, loaded below) via L(das_loop2).  */
231 L(das_loop):
232         ld    6,0(4)
233         ld    7,8(4)
234         mr    10,3
235         mr    11,4
236         std   6,0(3)
237         std   7,8(3)
238         addi  12,12,-1
239         nop
240         ld    8,16(4)
241         ld    0,24(4)
242         std   8,16(3)
243         std   0,24(3)
244
245         ld    6,0+32(4)
246         ld    7,8+32(4)
247         std   6,0+32(3)
248         std   7,8+32(3)
249         ld    8,16+32(4)
250         ld    0,24+32(4)
251         std   8,16+32(3)
252         std   0,24+32(3)
253
254         ld    6,0+64(11)
255         ld    7,8+64(11)
256         std   6,0+64(10)
257         std   7,8+64(10)
258         ld    8,16+64(11)
259         ld    0,24+64(11)
260         std   8,16+64(10)
261         std   0,24+64(10)
262
263         ld    6,0+96(11)
264         ld    7,8+96(11)
265         addi  4,4,128
266         addi  3,3,128
267         std   6,0+96(10)
268         std   7,8+96(10)
269         ld    8,16+96(11)
270         ld    0,24+96(11)
271         std   8,16+96(10)
272         std   0,24+96(10)
273         ble   cr5,L(das_loop_e)
274
275         mtctr   12
276         .align  4
277 L(das_loop2):
278         ld    6,0(4)
279         ld    7,8(4)
280         mr    10,3
281         mr    11,4
282         std   6,0(3)
283         std   7,8(3)
284         ld    8,16(4)
285         ld    0,24(4)
286         std   8,16(3)
287         std   0,24(3)
288
289         ld    6,0+32(4)
290         ld    7,8+32(4)
291         std   6,0+32(3)
292         std   7,8+32(3)
293         ld    8,16+32(4)
294         ld    0,24+32(4)
295         std   8,16+32(3)
296         std   0,24+32(3)
297
298         ld    6,0+64(11)
299         ld    7,8+64(11)
300         std   6,0+64(10)
301         std   7,8+64(10)
302         ld    8,16+64(11)
303         ld    0,24+64(11)
304         std   8,16+64(10)
305         std   0,24+64(10)
306
307         ld    6,0+96(11)
308         ld    7,8+96(11)
309         addi  4,4,128
310         addi  3,3,128
311         std   6,0+96(10)
312         std   7,8+96(10)
313         ld    8,16+96(11)
314         ld    0,24+96(11)
315         std   8,16+96(10)
316         std   0,24+96(10)
317         bdnz  L(das_loop2)
318 L(das_loop_e):
319 /* Check for a 1-7 byte tail, return if none.  */
320         bne   cr1,L(das_tail2)
321 /* Return original dst pointer.  */
322         ld    3,-16(1)
323         blr
324         .align  4
325 L(das_tail):
326         beq   cr1,0f
327
328 L(das_tail2):
329 /*  At this point we have a tail of 0-7 bytes and we know that the
330     destination is double word aligned.  */
331 4:      bf    29,2f
332         lwz   6,0(4)
333         stw   6,0(3)
334         bf    30,5f
335         lhz   6,4(4)
336         sth   6,4(3)
337         bf    31,0f
338         lbz   6,6(4)
339         stb   6,6(3)
340         b     0f
341 5:      bf    31,0f
342         lbz   6,4(4)
343         stb   6,4(3)
344         b     0f
345
346 2:      bf    30,1f
347         lhz   6,0(4)
348         sth   6,0(3)
349         bf    31,0f
350         lbz   6,2(4)
351         stb   6,2(3)
352         b     0f
353
354 1:      bf    31,0f
355         lbz   6,0(4)
356         stb   6,0(3)
357 0:
358   /* Return original dst pointer.  */
359         ld    3,-16(1)
360         blr
361
362 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
363    bytes.  Each case is handled without loops, using binary (1,2,4,8)
364    tests.
365
366    In the short (0-8 byte) case no attempt is made to force alignment
367    of either source or destination.  The hardware will handle the
368    unaligned load/stores with small delays for crossing 32- 128-byte,
369    and 4096-byte boundaries.  Since these short moves are unlikely to be
370    unaligned or cross these boundaries, the overhead to force
371    alignment is not justified.
372
373    The longer (9-31 byte) move is more likely to cross 32- or 128-byte
374    boundaries.  Since only loads are sensitive to the 32-/128-byte
375    boundaries it is more important to align the source than the
376    destination.  If the source is not already word aligned, we first
377    move 1-3 bytes as needed.  Since we are only word aligned we don't
378    use double word load/stores to ensure that all loads are aligned.
379    While the destination and stores may still be unaligned, this
380    is only an issue for page (4096 byte boundary) crossing, which
381    should be rare for these short moves.  The hardware handles this
382    case automatically with a small (~20 cycle) delay.  */
383         .align  4
384 .L2:
385         mtcrf 0x01,5
386         neg   8,4
387         clrrdi        11,4,2
388         andi. 0,8,3
389         ble   cr6,.LE8        /* Handle moves of 0-8 bytes.  */
390 /* At least 9 bytes left.  Get the source word aligned.  */
391         cmpldi        cr1,5,16
392         mr    10,5
393         mr    12,4
394         cmpldi        cr6,0,2
395         beq   L(dus_tail)     /* If the source is already word aligned skip this.  */
396 /* Copy 1-3 bytes to get source address word aligned.  */
397         lwz   6,0(11)
398         subf  10,0,5
399         add   12,4,0
400         blt   cr6,5f
401         srdi  7,6,16
402         bgt   cr6,3f
403 #ifdef __LITTLE_ENDIAN__
404         sth   7,0(3)
405 #else
406         sth   6,0(3)
407 #endif
408         b     7f
409         .align  4
410 3:
411 #ifdef __LITTLE_ENDIAN__
412         rotlwi        6,6,24
413         stb   6,0(3)
414         sth   7,1(3)
415 #else
416         stb   7,0(3)
417         sth   6,1(3)
418 #endif
419         b     7f
420         .align  4
421 5:
422 #ifdef __LITTLE_ENDIAN__
423         rotlwi        6,6,8
424 #endif
425         stb   6,0(3)
426 7:
427         cmpldi        cr1,10,16
428         add   3,3,0
429         mtcrf 0x01,10
430         .align  4
431 L(dus_tail):
432 /*  At least 6 bytes left and the source is word aligned.  This allows
433     some speculative loads up front.  */
434 /* We need to special case the fall-through because the biggest delays
435    are due to address computation not being ready in time for the
436    AGEN.  */
437         lwz   6,0(12)
438         lwz   7,4(12)
439         blt   cr1,L(dus_tail8)
440         cmpldi        cr0,10,24
441 L(dus_tail16): /* Move 16 bytes.  */
442         stw   6,0(3)
443         stw   7,4(3)
444         lwz   6,8(12)
445         lwz   7,12(12)
446         stw   6,8(3)
447         stw   7,12(3)
448 /* Move 8 bytes more.  */
449         bf    28,L(dus_tail16p8)
450         cmpldi        cr1,10,28
451         lwz   6,16(12)
452         lwz   7,20(12)
453         stw   6,16(3)
454         stw   7,20(3)
455 /* Move 4 bytes more.  */
456         bf    29,L(dus_tail16p4)
457         lwz   6,24(12)
458         stw   6,24(3)
459         addi  12,12,28
460         addi  3,3,28
461         bgt   cr1,L(dus_tail2)
462  /* exactly 28 bytes.  Return original dst pointer and exit.  */
463         ld    3,-16(1)
464         blr
465         .align  4
466 L(dus_tail16p8):  /* less than 8 bytes left.  */
467         beq   cr1,L(dus_tailX) /* exactly 16 bytes, early exit.  */
468         cmpldi        cr1,10,20
469         bf    29,L(dus_tail16p2)
470 /* Move 4 bytes more.  */
471         lwz   6,16(12)
472         stw   6,16(3)
473         addi  12,12,20
474         addi  3,3,20
475         bgt   cr1,L(dus_tail2)
476  /* exactly 20 bytes.  Return original dst pointer and exit.  */
477         ld    3,-16(1)
478         blr
479         .align  4
480 L(dus_tail16p4):  /* less than 4 bytes left.  */
481         addi  12,12,24
482         addi  3,3,24
483         bgt   cr0,L(dus_tail2)
484  /* exactly 24 bytes.  Return original dst pointer and exit.  */
485         ld    3,-16(1)
486         blr
487         .align  4
488 L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
489         addi  12,12,16
490         addi  3,3,16
491         b     L(dus_tail2)
492
493         .align  4
494 L(dus_tail8):  /* Move 8 bytes.  */
495 /*  r6, r7 already loaded speculatively.  */
496         cmpldi        cr1,10,8
497         cmpldi        cr0,10,12
498         bf    28,L(dus_tail4)
499         .align  2
500         stw   6,0(3)
501         stw   7,4(3)
502 /* Move 4 bytes more.  */
503         bf    29,L(dus_tail8p4)
504         lwz   6,8(12)
505         stw   6,8(3)
506         addi  12,12,12
507         addi  3,3,12
508         bgt   cr0,L(dus_tail2)
509  /* exactly 12 bytes.  Return original dst pointer and exit.  */
510         ld    3,-16(1)
511         blr
512         .align  4
513 L(dus_tail8p4):  /* less than 4 bytes left.  */
514         addi  12,12,8
515         addi  3,3,8
516         bgt   cr1,L(dus_tail2)
517  /* exactly 8 bytes.  Return original dst pointer and exit.  */
518         ld    3,-16(1)
519         blr
520
521         .align  4
522 L(dus_tail4):  /* Move 4 bytes.  */
523 /*  r6 already loaded speculatively.  If we are here we know there is
524     more than 4 bytes left.  So there is no need to test.  */
525         addi  12,12,4
526         stw   6,0(3)
527         addi  3,3,4
528 L(dus_tail2):  /* Move 2-3 bytes.  */
529         bf    30,L(dus_tail1)
530         lhz   6,0(12)
531         sth   6,0(3)
532         bf    31,L(dus_tailX)
533         lbz   7,2(12)
534         stb   7,2(3)
535         ld    3,-16(1)
536         blr
537 L(dus_tail1):  /* Move 1 byte.  */
538         bf    31,L(dus_tailX)
539         lbz   6,0(12)
540         stb   6,0(3)
541 L(dus_tailX):
542   /* Return original dst pointer.  */
543         ld    3,-16(1)
544         blr
545
546 /* Special case to copy 0-8 bytes.  */
547         .align  4
548 .LE8:
549         mr    12,4
550         bne   cr6,L(dus_4)
551 /* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
552    cycle delay.  This case should be rare and any attempt to avoid this
553    would take most of 20 cycles any way.  */
554         ld   6,0(4)
555         std   6,0(3)
556   /* Return original dst pointer.  */
557         ld    3,-16(1)
558         blr
559         .align  4
560 L(dus_4):
561         bf    29,L(dus_tail2)
562         lwz   6,0(4)
563         stw   6,0(3)
564         bf    30,L(dus_5)
565         lhz   7,4(4)
566         sth   7,4(3)
567         bf    31,L(dus_0)
568         lbz   8,6(4)
569         stb   8,6(3)
570         ld    3,-16(1)
571         blr
572         .align  4
573 L(dus_5):
574         bf    31,L(dus_0)
575         lbz   6,4(4)
576         stb   6,4(3)
577 L(dus_0):
578   /* Return original dst pointer.  */
579         ld    3,-16(1)
580         blr
581
582         .align  4
583 .L6:
584         cfi_offset(31,-8)
585         mr    12,4
586         mr    31,5              /* Save len in callee-saved r31; L(du_done) uses it for the tail.  */
587   /* Copy doublewords where the destination is aligned but the source is
588      not.  Use aligned doubleword loads from the source, shifted to realign
589      the data, to allow aligned destination stores.  */
590         addi    11,9,-1  /* loop DW count is one less than total */
591         subf    5,10,12  /* Move source addr to previous full double word.  */
592         cmpldi  cr5, 10, 2
593         cmpldi  cr0, 10, 4
594         mr      4,3
595         srdi    8,11,2   /* calculate the 32 byte loop count */
596         ld      6,0(5)   /* pre load 1st full doubleword.  */
597         mtcrf   0x01,11
598         cmpldi  cr6,9,4
599         mtctr   8
600         ld      7,8(5)   /* pre load 2nd full doubleword.  */
601         bge     cr0, L(du4_do)
602         blt     cr5, L(du1_do)
603         beq     cr5, L(du2_do)
604         b       L(du3_do)
605
606         .align 4
/* Source is 1 byte past a DW boundary: combine each output DW from two
   aligned source DWs with shift-by-8 / shift-by-56 and or.  */
607 L(du1_do):
608         bf      30,L(du1_1dw)
609
610         /* there are at least two DWs to copy */
611         /* FIXME: can combine last shift and "or" into "rldimi" */
612 #ifdef __LITTLE_ENDIAN__
613         srdi     0,6, 8
614         sldi     8,7, 64-8
615 #else
616         sldi     0,6, 8
617         srdi     8,7, 64-8
618 #endif
619         or      0,0,8
620         ld      6,16(5)
621         std     0,0(4)
622 #ifdef __LITTLE_ENDIAN__
623         srdi     0,7, 8
624         sldi     8,6, 64-8
625 #else
626         sldi     0,7, 8
627         srdi     8,6, 64-8
628 #endif
629         or      0,0,8
630         ld      7,24(5)
631         std     0,8(4)
632         addi    4,4,16
633         addi    5,5,32
634         blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
635         bf      31,L(du1_loop)
636         /* there is a third DW to copy */
637 #ifdef __LITTLE_ENDIAN__
638         srdi     0,6, 8
639         sldi     8,7, 64-8
640 #else
641         sldi     0,6, 8
642         srdi     8,7, 64-8
643 #endif
644         or      0,0,8
645         std     0,0(4)
646         mr      6,7
647         ld      7,0(5)
648         addi    5,5,8
649         addi    4,4,8
650         beq     cr6,L(du1_fini)  /* if total DWs = 4, then bypass loop */
651         b       L(du1_loop)
652         .align 4
653 L(du1_1dw):
654 #ifdef __LITTLE_ENDIAN__
655         srdi     0,6, 8
656         sldi     8,7, 64-8
657 #else
658         sldi     0,6, 8
659         srdi     8,7, 64-8
660 #endif
661         addi    5,5,16
662         or      0,0,8
663         bf      31,L(du1_loop)
664         mr      6,7
665         ld      7,0(5)
666         addi    5,5,8
667         std     0,0(4)
668         addi    4,4,8
669         .align 4
670 /* copy 32 bytes at a time */
671 L(du1_loop):
672 #ifdef __LITTLE_ENDIAN__
673         srdi   0,6, 8
674         sldi   8,7, 64-8
675 #else
676         sldi   0,6, 8
677         srdi   8,7, 64-8
678 #endif
679         or    0,0,8
680         ld    6,0(5)
681         std   0,0(4)
682 #ifdef __LITTLE_ENDIAN__
683         srdi   0,7, 8
684         sldi   8,6, 64-8
685 #else
686         sldi   0,7, 8
687         srdi   8,6, 64-8
688 #endif
689         or    0,0,8
690         ld    7,8(5)
691         std   0,8(4)
692 #ifdef __LITTLE_ENDIAN__
693         srdi   0,6, 8
694         sldi   8,7, 64-8
695 #else
696         sldi   0,6, 8
697         srdi   8,7, 64-8
698 #endif
699         or    0,0,8
700         ld    6,16(5)
701         std   0,16(4)
702 #ifdef __LITTLE_ENDIAN__
703         srdi   0,7, 8
704         sldi   8,6, 64-8
705 #else
706         sldi   0,7, 8
707         srdi   8,6, 64-8
708 #endif
709         or    0,0,8
710         ld    7,24(5)
711         std   0,24(4)
712         addi  5,5,32
713         addi  4,4,32
714         bdnz+ L(du1_loop)
715         .align 4
716 L(du1_fini):
717         /* calculate and store the final DW */
718 #ifdef __LITTLE_ENDIAN__
719         srdi   0,6, 8
720         sldi   8,7, 64-8
721 #else
722         sldi   0,6, 8
723         srdi   8,7, 64-8
724 #endif
725         or    0,0,8
726         std   0,0(4)
727         b     L(du_done)
728
729         .align 4
/* Source is 2 bytes past a DW boundary: same realignment scheme as
   L(du1_do) but with shift-by-16 / shift-by-48.  */
730 L(du2_do):
731         bf      30,L(du2_1dw)
732
733         /* there are at least two DWs to copy */
734 #ifdef __LITTLE_ENDIAN__
735         srdi     0,6, 16
736         sldi     8,7, 64-16
737 #else
738         sldi     0,6, 16
739         srdi     8,7, 64-16
740 #endif
741         or      0,0,8
742         ld      6,16(5)
743         std     0,0(4)
744 #ifdef __LITTLE_ENDIAN__
745         srdi     0,7, 16
746         sldi     8,6, 64-16
747 #else
748         sldi     0,7, 16
749         srdi     8,6, 64-16
750 #endif
751         or      0,0,8
752         ld      7,24(5)
753         std     0,8(4)
754         addi    4,4,16
755         addi    5,5,32
756         blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
757         bf      31,L(du2_loop)
758         /* there is a third DW to copy */
759 #ifdef __LITTLE_ENDIAN__
760         srdi     0,6, 16
761         sldi     8,7, 64-16
762 #else
763         sldi     0,6, 16
764         srdi     8,7, 64-16
765 #endif
766         or      0,0,8
767         std     0,0(4)
768         mr      6,7
769         ld      7,0(5)
770         addi    5,5,8
771         addi    4,4,8
772         beq     cr6,L(du2_fini)  /* if total DWs = 4, then bypass loop */
773         b       L(du2_loop)
774         .align 4
775 L(du2_1dw):
776 #ifdef __LITTLE_ENDIAN__
777         srdi     0,6, 16
778         sldi     8,7, 64-16
779 #else
780         sldi     0,6, 16
781         srdi     8,7, 64-16
782 #endif
783         addi    5,5,16
784         or      0,0,8
785         bf      31,L(du2_loop)
786         mr      6,7
787         ld      7,0(5)
788         addi    5,5,8
789         std     0,0(4)
790         addi    4,4,8
791         .align 4
792 /* copy 32 bytes at a time */
793 L(du2_loop):
794 #ifdef __LITTLE_ENDIAN__
795         srdi   0,6, 16
796         sldi   8,7, 64-16
797 #else
798         sldi   0,6, 16
799         srdi   8,7, 64-16
800 #endif
801         or    0,0,8
802         ld    6,0(5)
803         std   0,0(4)
804 #ifdef __LITTLE_ENDIAN__
805         srdi   0,7, 16
806         sldi   8,6, 64-16
807 #else
808         sldi   0,7, 16
809         srdi   8,6, 64-16
810 #endif
811         or    0,0,8
812         ld    7,8(5)
813         std   0,8(4)
814 #ifdef __LITTLE_ENDIAN__
815         srdi   0,6, 16
816         sldi   8,7, 64-16
817 #else
818         sldi   0,6, 16
819         srdi   8,7, 64-16
820 #endif
821         or    0,0,8
822         ld    6,16(5)
823         std   0,16(4)
824 #ifdef __LITTLE_ENDIAN__
825         srdi   0,7, 16
826         sldi   8,6, 64-16
827 #else
828         sldi   0,7, 16
829         srdi   8,6, 64-16
830 #endif
831         or    0,0,8
832         ld    7,24(5)
833         std   0,24(4)
834         addi  5,5,32
835         addi  4,4,32
836         bdnz+ L(du2_loop)
837         .align 4
838 L(du2_fini):
839         /* calculate and store the final DW */
840 #ifdef __LITTLE_ENDIAN__
841         srdi   0,6, 16
842         sldi   8,7, 64-16
843 #else
844         sldi   0,6, 16
845         srdi   8,7, 64-16
846 #endif
847         or    0,0,8
848         std   0,0(4)
849         b     L(du_done)
850
851         .align 4
/* Source is 3 bytes past a DW boundary: shift-by-24 / shift-by-40.  */
852 L(du3_do):
853         bf      30,L(du3_1dw)
854
855         /* there are at least two DWs to copy */
856 #ifdef __LITTLE_ENDIAN__
857         srdi     0,6, 24
858         sldi     8,7, 64-24
859 #else
860         sldi     0,6, 24
861         srdi     8,7, 64-24
862 #endif
863         or      0,0,8
864         ld      6,16(5)
865         std     0,0(4)
866 #ifdef __LITTLE_ENDIAN__
867         srdi     0,7, 24
868         sldi     8,6, 64-24
869 #else
870         sldi     0,7, 24
871         srdi     8,6, 64-24
872 #endif
873         or      0,0,8
874         ld      7,24(5)
875         std     0,8(4)
876         addi    4,4,16
877         addi    5,5,32
878         blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
879         bf      31,L(du3_loop)
880         /* there is a third DW to copy */
881 #ifdef __LITTLE_ENDIAN__
882         srdi     0,6, 24
883         sldi     8,7, 64-24
884 #else
885         sldi     0,6, 24
886         srdi     8,7, 64-24
887 #endif
888         or      0,0,8
889         std     0,0(4)
890         mr      6,7
891         ld      7,0(5)
892         addi    5,5,8
893         addi    4,4,8
894         beq     cr6,L(du3_fini)  /* if total DWs = 4, then bypass loop */
895         b       L(du3_loop)
896         .align 4
897 L(du3_1dw):
898 #ifdef __LITTLE_ENDIAN__
899         srdi     0,6, 24
900         sldi     8,7, 64-24
901 #else
902         sldi     0,6, 24
903         srdi     8,7, 64-24
904 #endif
905         addi    5,5,16
906         or      0,0,8
907         bf      31,L(du3_loop)
908         mr      6,7
909         ld      7,0(5)
910         addi    5,5,8
911         std     0,0(4)
912         addi    4,4,8
913         .align 4
914 /* copy 32 bytes at a time */
915 L(du3_loop):
916 #ifdef __LITTLE_ENDIAN__
917         srdi   0,6, 24
918         sldi   8,7, 64-24
919 #else
920         sldi   0,6, 24
921         srdi   8,7, 64-24
922 #endif
923         or    0,0,8
924         ld    6,0(5)
925         std   0,0(4)
926 #ifdef __LITTLE_ENDIAN__
927         srdi   0,7, 24
928         sldi   8,6, 64-24
929 #else
930         sldi   0,7, 24
931         srdi   8,6, 64-24
932 #endif
933         or    0,0,8
934         ld    7,8(5)
935         std   0,8(4)
936 #ifdef __LITTLE_ENDIAN__
937         srdi   0,6, 24
938         sldi   8,7, 64-24
939 #else
940         sldi   0,6, 24
941         srdi   8,7, 64-24
942 #endif
943         or    0,0,8
944         ld    6,16(5)
945         std   0,16(4)
946 #ifdef __LITTLE_ENDIAN__
947         srdi   0,7, 24
948         sldi   8,6, 64-24
949 #else
950         sldi   0,7, 24
951         srdi   8,6, 64-24
952 #endif
953         or    0,0,8
954         ld    7,24(5)
955         std   0,24(4)
956         addi  5,5,32
957         addi  4,4,32
958         bdnz+ L(du3_loop)
959         .align 4
960 L(du3_fini):
961         /* calculate and store the final DW */
962 #ifdef __LITTLE_ENDIAN__
963         srdi   0,6, 24
964         sldi   8,7, 64-24
965 #else
966         sldi   0,6, 24
967         srdi   8,7, 64-24
968 #endif
969         or    0,0,8
970         std   0,0(4)
971         b     L(du_done)
972
973         .align 4
/* Dispatch for source offsets 4-7 (cr0/cr5 set at .L6), then the
   offset-4 copy path: shift-by-32 both ways.  */
974 L(du4_do):
975         cmpldi  cr5, 10, 6
976         beq     cr0, L(du4_dox)
977         blt     cr5, L(du5_do)
978         beq     cr5, L(du6_do)
979         b       L(du7_do)
980 L(du4_dox):
981         bf      30,L(du4_1dw)
982
983         /* there are at least two DWs to copy */
984 #ifdef __LITTLE_ENDIAN__
985         srdi     0,6, 32
986         sldi     8,7, 64-32
987 #else
988         sldi     0,6, 32
989         srdi     8,7, 64-32
990 #endif
991         or      0,0,8
992         ld      6,16(5)
993         std     0,0(4)
994 #ifdef __LITTLE_ENDIAN__
995         srdi     0,7, 32
996         sldi     8,6, 64-32
997 #else
998         sldi     0,7, 32
999         srdi     8,6, 64-32
1000 #endif
1001         or      0,0,8
1002         ld      7,24(5)
1003         std     0,8(4)
1004         addi    4,4,16
1005         addi    5,5,32
1006         blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
1007         bf      31,L(du4_loop)
1008         /* there is a third DW to copy */
1009 #ifdef __LITTLE_ENDIAN__
1010         srdi     0,6, 32
1011         sldi     8,7, 64-32
1012 #else
1013         sldi     0,6, 32
1014         srdi     8,7, 64-32
1015 #endif
1016         or      0,0,8
1017         std     0,0(4)
1018         mr      6,7
1019         ld      7,0(5)
1020         addi    5,5,8
1021         addi    4,4,8
1022         beq     cr6,L(du4_fini)  /* if total DWs = 4, then bypass loop */
1023         b       L(du4_loop)
1024         .align 4
1025 L(du4_1dw):
1026 #ifdef __LITTLE_ENDIAN__
1027         srdi     0,6, 32
1028         sldi     8,7, 64-32
1029 #else
1030         sldi     0,6, 32
1031         srdi     8,7, 64-32
1032 #endif
1033         addi    5,5,16
1034         or      0,0,8
1035         bf      31,L(du4_loop)
1036         mr      6,7
1037         ld      7,0(5)
1038         addi    5,5,8
1039         std     0,0(4)
1040         addi    4,4,8
1041         .align 4
1042 /* copy 32 bytes at a time */
1043 L(du4_loop):
1044 #ifdef __LITTLE_ENDIAN__
1045         srdi   0,6, 32
1046         sldi   8,7, 64-32
1047 #else
1048         sldi   0,6, 32
1049         srdi   8,7, 64-32
1050 #endif
1051         or    0,0,8
1052         ld    6,0(5)
1053         std   0,0(4)
1054 #ifdef __LITTLE_ENDIAN__
1055         srdi   0,7, 32
1056         sldi   8,6, 64-32
1057 #else
1058         sldi   0,7, 32
1059         srdi   8,6, 64-32
1060 #endif
1061         or    0,0,8
1062         ld    7,8(5)
1063         std   0,8(4)
1064 #ifdef __LITTLE_ENDIAN__
1065         srdi   0,6, 32
1066         sldi   8,7, 64-32
1067 #else
1068         sldi   0,6, 32
1069         srdi   8,7, 64-32
1070 #endif
1071         or    0,0,8
1072         ld    6,16(5)
1073         std   0,16(4)
1074 #ifdef __LITTLE_ENDIAN__
1075         srdi   0,7, 32
1076         sldi   8,6, 64-32
1077 #else
1078         sldi   0,7, 32
1079         srdi   8,6, 64-32
1080 #endif
1081         or    0,0,8
1082         ld    7,24(5)
1083         std   0,24(4)
1084         addi  5,5,32
1085         addi  4,4,32
1086         bdnz+ L(du4_loop)
1087         .align 4
1088 L(du4_fini):
1089         /* calculate and store the final DW */
1090 #ifdef __LITTLE_ENDIAN__
1091         srdi   0,6, 32
1092         sldi   8,7, 64-32
1093 #else
1094         sldi   0,6, 32
1095         srdi   8,7, 64-32
1096 #endif
1097         or    0,0,8
1098         std   0,0(4)
1099         b     L(du_done)
1100
1101         .align 4
/* Source is 5 bytes past a DW boundary: shift-by-40 / shift-by-24.  */
1102 L(du5_do):
1103         bf      30,L(du5_1dw)
1104
1105         /* there are at least two DWs to copy */
1106 #ifdef __LITTLE_ENDIAN__
1107         srdi     0,6, 40
1108         sldi     8,7, 64-40
1109 #else
1110         sldi     0,6, 40
1111         srdi     8,7, 64-40
1112 #endif
1113         or      0,0,8
1114         ld      6,16(5)
1115         std     0,0(4)
1116 #ifdef __LITTLE_ENDIAN__
1117         srdi     0,7, 40
1118         sldi     8,6, 64-40
1119 #else
1120         sldi     0,7, 40
1121         srdi     8,6, 64-40
1122 #endif
1123         or      0,0,8
1124         ld      7,24(5)
1125         std     0,8(4)
1126         addi    4,4,16
1127         addi    5,5,32
1128         blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
1129         bf      31,L(du5_loop)
1130         /* there is a third DW to copy */
1131 #ifdef __LITTLE_ENDIAN__
1132         srdi     0,6, 40
1133         sldi     8,7, 64-40
1134 #else
1135         sldi     0,6, 40
1136         srdi     8,7, 64-40
1137 #endif
1138         or      0,0,8
1139         std     0,0(4)
1140         mr      6,7
1141         ld      7,0(5)
1142         addi    5,5,8
1143         addi    4,4,8
1144         beq     cr6,L(du5_fini)  /* if total DWs = 4, then bypass loop */
1145         b       L(du5_loop)
1146         .align 4
1147 L(du5_1dw):
1148 #ifdef __LITTLE_ENDIAN__
1149         srdi     0,6, 40
1150         sldi     8,7, 64-40
1151 #else
1152         sldi     0,6, 40
1153         srdi     8,7, 64-40
1154 #endif
1155         addi    5,5,16
1156         or      0,0,8
1157         bf      31,L(du5_loop)
1158         mr      6,7
1159         ld      7,0(5)
1160         addi    5,5,8
1161         std     0,0(4)
1162         addi    4,4,8
1163         .align 4
1164 /* copy 32 bytes at a time */
1165 L(du5_loop):
1166 #ifdef __LITTLE_ENDIAN__
1167         srdi   0,6, 40
1168         sldi   8,7, 64-40
1169 #else
1170         sldi   0,6, 40
1171         srdi   8,7, 64-40
1172 #endif
1173         or    0,0,8
1174         ld    6,0(5)
1175         std   0,0(4)
1176 #ifdef __LITTLE_ENDIAN__
1177         srdi   0,7, 40
1178         sldi   8,6, 64-40
1179 #else
1180         sldi   0,7, 40
1181         srdi   8,6, 64-40
1182 #endif
1183         or    0,0,8
1184         ld    7,8(5)
1185         std   0,8(4)
1186 #ifdef __LITTLE_ENDIAN__
1187         srdi   0,6, 40
1188         sldi   8,7, 64-40
1189 #else
1190         sldi   0,6, 40
1191         srdi   8,7, 64-40
1192 #endif
1193         or    0,0,8
1194         ld    6,16(5)
1195         std   0,16(4)
1196 #ifdef __LITTLE_ENDIAN__
1197         srdi   0,7, 40
1198         sldi   8,6, 64-40
1199 #else
1200         sldi   0,7, 40
1201         srdi   8,6, 64-40
1202 #endif
1203         or    0,0,8
1204         ld    7,24(5)
1205         std   0,24(4)
1206         addi  5,5,32
1207         addi  4,4,32
1208         bdnz+ L(du5_loop)
1209         .align 4
1210 L(du5_fini):
1211         /* calculate and store the final DW */
1212 #ifdef __LITTLE_ENDIAN__
1213         srdi   0,6, 40
1214         sldi   8,7, 64-40
1215 #else
1216         sldi   0,6, 40
1217         srdi   8,7, 64-40
1218 #endif
1219         or    0,0,8
1220         std   0,0(4)
1221         b     L(du_done)
1222
1223         .align 4
/* Source is 6 bytes past a DW boundary: shift-by-48 / shift-by-16.  */
1224 L(du6_do):
1225         bf      30,L(du6_1dw)
1226
1227         /* there are at least two DWs to copy */
1228 #ifdef __LITTLE_ENDIAN__
1229         srdi     0,6, 48
1230         sldi     8,7, 64-48
1231 #else
1232         sldi     0,6, 48
1233         srdi     8,7, 64-48
1234 #endif
1235         or      0,0,8
1236         ld      6,16(5)
1237         std     0,0(4)
1238 #ifdef __LITTLE_ENDIAN__
1239         srdi     0,7, 48
1240         sldi     8,6, 64-48
1241 #else
1242         sldi     0,7, 48
1243         srdi     8,6, 64-48
1244 #endif
1245         or      0,0,8
1246         ld      7,24(5)
1247         std     0,8(4)
1248         addi    4,4,16
1249         addi    5,5,32
1250         blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
1251         bf      31,L(du6_loop)
1252         /* there is a third DW to copy */
1253 #ifdef __LITTLE_ENDIAN__
1254         srdi     0,6, 48
1255         sldi     8,7, 64-48
1256 #else
1257         sldi     0,6, 48
1258         srdi     8,7, 64-48
1259 #endif
1260         or      0,0,8
1261         std     0,0(4)
1262         mr      6,7
1263         ld      7,0(5)
1264         addi    5,5,8
1265         addi    4,4,8
1266         beq     cr6,L(du6_fini)  /* if total DWs = 4, then bypass loop */
1267         b       L(du6_loop)
1268         .align 4
1269 L(du6_1dw):
1270 #ifdef __LITTLE_ENDIAN__
1271         srdi     0,6, 48
1272         sldi     8,7, 64-48
1273 #else
1274         sldi     0,6, 48
1275         srdi     8,7, 64-48
1276 #endif
1277         addi    5,5,16
1278         or      0,0,8
1279         bf      31,L(du6_loop)
1280         mr      6,7
1281         ld      7,0(5)
1282         addi    5,5,8
1283         std     0,0(4)
1284         addi    4,4,8
1285         .align 4
1286 /* copy 32 bytes at a time */
1287 L(du6_loop):
1288 #ifdef __LITTLE_ENDIAN__
1289         srdi   0,6, 48
1290         sldi   8,7, 64-48
1291 #else
1292         sldi   0,6, 48
1293         srdi   8,7, 64-48
1294 #endif
1295         or    0,0,8
1296         ld    6,0(5)
1297         std   0,0(4)
1298 #ifdef __LITTLE_ENDIAN__
1299         srdi   0,7, 48
1300         sldi   8,6, 64-48
1301 #else
1302         sldi   0,7, 48
1303         srdi   8,6, 64-48
1304 #endif
1305         or    0,0,8
1306         ld    7,8(5)
1307         std   0,8(4)
1308 #ifdef __LITTLE_ENDIAN__
1309         srdi   0,6, 48
1310         sldi   8,7, 64-48
1311 #else
1312         sldi   0,6, 48
1313         srdi   8,7, 64-48
1314 #endif
1315         or    0,0,8
1316         ld    6,16(5)
1317         std   0,16(4)
1318 #ifdef __LITTLE_ENDIAN__
1319         srdi   0,7, 48
1320         sldi   8,6, 64-48
1321 #else
1322         sldi   0,7, 48
1323         srdi   8,6, 64-48
1324 #endif
1325         or    0,0,8
1326         ld    7,24(5)
1327         std   0,24(4)
1328         addi  5,5,32
1329         addi  4,4,32
1330         bdnz+ L(du6_loop)
1331         .align 4
1332 L(du6_fini):
1333         /* calculate and store the final DW */
1334 #ifdef __LITTLE_ENDIAN__
1335         srdi   0,6, 48
1336         sldi   8,7, 64-48
1337 #else
1338         sldi   0,6, 48
1339         srdi   8,7, 64-48
1340 #endif
1341         or    0,0,8
1342         std   0,0(4)
1343         b     L(du_done)
1344
1345         .align 4
/* Source is 7 bytes past a DW boundary: shift-by-56 / shift-by-8.  */
1346 L(du7_do):
1347         bf      30,L(du7_1dw)
1348
1349         /* there are at least two DWs to copy */
1350 #ifdef __LITTLE_ENDIAN__
1351         srdi     0,6, 56
1352         sldi     8,7, 64-56
1353 #else
1354         sldi     0,6, 56
1355         srdi     8,7, 64-56
1356 #endif
1357         or      0,0,8
1358         ld      6,16(5)
1359         std     0,0(4)
1360 #ifdef __LITTLE_ENDIAN__
1361         srdi     0,7, 56
1362         sldi     8,6, 64-56
1363 #else
1364         sldi     0,7, 56
1365         srdi     8,6, 64-56
1366 #endif
1367         or      0,0,8
1368         ld      7,24(5)
1369         std     0,8(4)
1370         addi    4,4,16
1371         addi    5,5,32
1372         blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
1373         bf      31,L(du7_loop)
1374         /* there is a third DW to copy */
1375 #ifdef __LITTLE_ENDIAN__
1376         srdi     0,6, 56
1377         sldi     8,7, 64-56
1378 #else
1379         sldi     0,6, 56
1380         srdi     8,7, 64-56
1381 #endif
1382         or      0,0,8
1383         std     0,0(4)
1384         mr      6,7
1385         ld      7,0(5)
1386         addi    5,5,8
1387         addi    4,4,8
1388         beq     cr6,L(du7_fini)  /* if total DWs = 4, then bypass loop */
1389         b       L(du7_loop)
1390         .align 4
1391 L(du7_1dw):
1392 #ifdef __LITTLE_ENDIAN__
1393         srdi     0,6, 56
1394         sldi     8,7, 64-56
1395 #else
1396         sldi     0,6, 56
1397         srdi     8,7, 64-56
1398 #endif
1399         addi    5,5,16
1400         or      0,0,8
1401         bf      31,L(du7_loop)
1402         mr      6,7
1403         ld      7,0(5)
1404         addi    5,5,8
1405         std     0,0(4)
1406         addi    4,4,8
1407         .align 4
1408 /* copy 32 bytes at a time */
1409 L(du7_loop):
1410 #ifdef __LITTLE_ENDIAN__
1411         srdi   0,6, 56
1412         sldi   8,7, 64-56
1413 #else
1414         sldi   0,6, 56
1415         srdi   8,7, 64-56
1416 #endif
1417         or    0,0,8
1418         ld    6,0(5)
1419         std   0,0(4)
1420 #ifdef __LITTLE_ENDIAN__
1421         srdi   0,7, 56
1422         sldi   8,6, 64-56
1423 #else
1424         sldi   0,7, 56
1425         srdi   8,6, 64-56
1426 #endif
1427         or    0,0,8
1428         ld    7,8(5)
1429         std   0,8(4)
1430 #ifdef __LITTLE_ENDIAN__
1431         srdi   0,6, 56
1432         sldi   8,7, 64-56
1433 #else
1434         sldi   0,6, 56
1435         srdi   8,7, 64-56
1436 #endif
1437         or    0,0,8
1438         ld    6,16(5)
1439         std   0,16(4)
1440 #ifdef __LITTLE_ENDIAN__
1441         srdi   0,7, 56
1442         sldi   8,6, 64-56
1443 #else
1444         sldi   0,7, 56
1445         srdi   8,6, 64-56
1446 #endif
1447         or    0,0,8
1448         ld    7,24(5)
1449         std   0,24(4)
1450         addi  5,5,32
1451         addi  4,4,32
1452         bdnz+ L(du7_loop)
1453         .align 4
1454 L(du7_fini):
1455         /* calculate and store the final DW */
1456 #ifdef __LITTLE_ENDIAN__
1457         srdi   0,6, 56
1458         sldi   8,7, 64-56
1459 #else
1460         sldi   0,6, 56
1461         srdi   8,7, 64-56
1462 #endif
1463         or    0,0,8
1464         std   0,0(4)
1465         b     L(du_done)
1466
1467         .align 4
/* Common tail for all unaligned (.L6) paths: r31 still holds the total
   len; advance dst (r3) and the saved src (r12) past the doublewords
   already copied, then move the remaining 0-7 bytes.  */
1468 L(du_done):
1469         rldicr 0,31,0,60
1470         mtcrf 0x01,31
1471         beq   cr1,0f  /* If the tail is 0 bytes we are done!  */
1472
1473         add   3,3,0
1474         add   12,12,0
1475 /*  At this point we have a tail of 0-7 bytes and we know that the
1476     destination is double word aligned.  */
1477 4:      bf    29,2f
1478         lwz   6,0(12)
1479         addi  12,12,4
1480         stw   6,0(3)
1481         addi  3,3,4
1482 2:      bf    30,1f
1483         lhz   6,0(12)
1484         addi  12,12,2
1485         sth   6,0(3)
1486         addi  3,3,2
1487 1:      bf    31,0f
1488         lbz   6,0(12)
1489         stb   6,0(3)
1490 0:
1491   /* Return original dst pointer.  */
1492         ld 31,-8(1)             /* Restore callee-saved r31 (saved at entry).  */
1493         ld 3,-16(1)
1494         blr
1495 END_GEN_TB (memcpy,TB_TOCLESS)
1496 libc_hidden_builtin_def (memcpy)