/* Optimized memcpy implementation for PowerPC32 on POWER6.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.  */
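
/* For reference, a rough C sketch of the dispatch described above.
   Purely illustrative (not part of the build; the helper names are
   invented) -- the assembly below is the real implementation:

     // Hypothetical helpers: short_copy, align_copy_1_3,
     // copy_words_unaligned, copy_words_aligned.
     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       if (len < 32)
         return short_copy (dst, src, len);          // binary move blocks
       size_t adj = (-(uintptr_t) dst) & 3;          // bytes to word-align dst
       align_copy_1_3 (dst, src, adj);               // move 0-3 leading bytes
       if ((((uintptr_t) src + adj) & 3) != 0)
         copy_words_unaligned (dst, src, len, adj);  // load/shift/store path
       else
         copy_words_aligned (dst, src, len, adj);    // unrolled word loop
       return dst;                                   // always return dst
     }
*/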

	.machine power6
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	stwu   1,-32(1)
	cfi_adjust_cfa_offset(32)
	cmplwi cr1,5,31	/* check for short move.  */
	neg    0,3
	cmplwi cr1,5,31
	clrlwi 10,4,30	/* check alignment of src.  */
	andi.  11,3,3	/* check alignment of dst.  */
	clrlwi 0,0,30	/* Number of bytes until the 1st word of dst.  */
	ble-   cr1,L(word_unaligned_short)	/* If move < 32 bytes.  */
	cmplw  cr6,10,11
	stw    31,24(1)
	stw    30,20(1)
	cfi_offset(31,(24-32))
	cfi_offset(30,(20-32))
	mr     30,3
	beq    .L0
	mtcrf  0x01,0
	subf   31,0,5	/* Length after alignment.  */
	add    12,4,0	/* Compute src addr after alignment.  */
/* Move 0-3 bytes as needed to get the destination word aligned.  */
1:	bf     31,2f
	lbz    6,0(4)
	bf     30,3f
	lhz    7,1(4)
	stb    6,0(3)
	sth    7,1(3)
	addi   3,3,3
	b      0f
3:
	stb    6,0(3)
	addi   3,3,1
	b      0f
2:	bf     30,0f
	lhz    6,0(4)
	sth    6,0(3)
	addi   3,3,2
0:
	clrlwi 10,12,30	/* check alignment of src again.  */
	srwi   9,31,2	/* Number of full words remaining.  */
	bne-   cr6,L(wdu)	/* If source is not word aligned.  */
	clrlwi 11,31,30	/* calculate the number of tail bytes */
	b      L(word_aligned)
/* Copy words from source to destination, assuming the destination is
   aligned on a word boundary.

   At this point we know there are at least 29 bytes left (32-3) to copy.
   The next step is to determine if the source is also word aligned.
   If not, branch to the unaligned move code at L(wdu), which uses
   a load, shift, store strategy.

   Otherwise source and destination are word aligned, and we can use
   the optimized word copy loop.  */
	.align 4
.L0:
	mr     31,5
	mr     12,4
	bne-   cr6,L(wdu)	/* If source is not word aligned.  */
	srwi   9,5,2	/* Number of full words remaining.  */
	clrlwi 11,5,30	/* calculate the number of tail bytes */

/* Move words where destination and source are word aligned.
   Use an unrolled loop to copy 4 words (16 bytes) per iteration.
   If the copy is not an exact multiple of 16 bytes, 1-3
   words are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
   copied a halfword/byte at a time as needed to preserve alignment.  */
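
/* A minimal, self-contained C analogue of this aligned-word loop
   (illustrative only; the assembly additionally schedules loads ahead
   of stores and uses the CTR register for the loop count):

     #include <stddef.h>
     #include <stdint.h>

     static void
     word_aligned_copy (uint32_t *d, const uint32_t *s,
                        size_t words, size_t tail)
     {
       while (words & 3)                  // 1-3 words to set up the loop
         { *d++ = *s++; words--; }
       for (; words != 0; words -= 4)     // 4 words (16 bytes) per pass
         {
           d[0] = s[0]; d[1] = s[1];
           d[2] = s[2]; d[3] = s[3];
           d += 4; s += 4;
         }
       unsigned char *db = (unsigned char *) d;
       const unsigned char *sb = (const unsigned char *) s;
       if (tail & 2)                      // halfword first: dst is still
         { db[0] = sb[0]; db[1] = sb[1];  // word aligned, so this store
           db += 2; sb += 2; }            // stays aligned
       if (tail & 1)
         db[0] = sb[0];
     }
*/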
L(word_aligned):
	mtcrf  0x01,9
	srwi   8,31,4	/* calculate the 16 byte loop count */
	cmplwi cr1,9,4
	cmplwi cr6,11,0
	mr     11,12

	bf     30,1f
	lwz    6,0(12)
	lwz    7,4(12)
	addi   11,12,8
	mtctr  8
	stw    6,0(3)
	stw    7,4(3)
	addi   10,3,8
	bf     31,4f
	lwz    0,8(12)
	stw    0,8(3)
	blt    cr1,3f
	addi   11,12,12
	addi   10,3,12
	b      4f
	.align 4
1:
	mr     10,3
	mtctr  8
	bf     31,4f
	lwz    6,0(12)
	addi   11,12,4
	stw    6,0(3)
	addi   10,3,4

	.align 4
4:
	lwz    6,0(11)
	lwz    7,4(11)
	lwz    8,8(11)
	lwz    0,12(11)
	stw    6,0(10)
	stw    7,4(10)
	stw    8,8(10)
	stw    0,12(10)
	addi   11,11,16
	addi   10,10,16
	bdnz   4b
3:
	clrrwi 0,31,2
	mtcrf  0x01,31
	beq    cr6,0f
.L9:
	add    3,3,0
	add    12,12,0

/* At this point we have a tail of 0-3 bytes and we know that the
   destination is word aligned.  */
2:	bf     30,1f
	lhz    6,0(12)
	addi   12,12,2
	sth    6,0(3)
	addi   3,3,2
1:	bf     31,0f
	lbz    6,0(12)
	stb    6,0(3)
0:
	/* Return original dst pointer.  */
	mr     3,30
	lwz    30,20(1)
	lwz    31,24(1)
	addi   1,1,32
	blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-byte,
   128-byte, and 4096-byte boundaries.  Since these short moves are
   unlikely to be unaligned or cross these boundaries, the overhead to
   force alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned, we don't
   use doubleword load/stores, to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
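
/* In C terms the 0-8 byte case amounts to binary size tests with no
   loops (a sketch under the same assumptions as above; the fixed-size
   memcpy calls stand in for the single lwz/stw, lhz/sth, and lbz/stb
   pairs the assembly uses):

     #include <stddef.h>
     #include <string.h>

     static void
     short_copy_0_8 (unsigned char *d, const unsigned char *s, size_t len)
     {
       if (len == 8)                      // like L(wus_8): two word moves
         { memcpy (d, s, 8); return; }
       if (len & 4) { memcpy (d, s, 4); d += 4; s += 4; }
       if (len & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (len & 1) d[0] = s[0];
     }
*/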
	.align 4

	cfi_same_value (31)
	cfi_same_value (30)
L(word_unaligned_short):
	mtcrf  0x01,5
	cmplwi cr6,5,8
	neg    8,4
	clrrwi 9,4,2
	andi.  0,8,3
	beq    cr6,L(wus_8)	/* Handle moves of 8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmplwi cr1,5,16
	mr     12,4
	ble    cr6,L(wus_4)	/* Handle moves of 0-8 bytes.  */
	mr     11,3
	mr     10,5
	cmplwi cr6,0,2
	beq    L(wus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
	lwz    6,0(9)
	subf   10,0,5
	add    12,4,0
	blt    cr6,5f
	srwi   7,6,16
	bgt    cr6,3f
#ifdef __LITTLE_ENDIAN__
	sth    7,0(3)
#else
	sth    6,0(3)
#endif
	b      7f
	.align 4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,24
	stb    6,0(3)
	sth    7,1(3)
#else
	stb    7,0(3)
	sth    6,1(3)
#endif
	b      7f
	.align 4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,8
#endif
	stb    6,0(3)
7:
	cmplwi cr1,10,16
	add    11,3,0
	mtcrf  0x01,10
	.align 4
L(wus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz    6,0(12)
	lwz    7,4(12)
	blt    cr1,L(wus_tail8)
	cmplwi cr0,10,24
L(wus_tail16):	/* Move 16 bytes.  */
	stw    6,0(11)
	stw    7,4(11)
	lwz    6,8(12)
	lwz    7,12(12)
	stw    6,8(11)
	stw    7,12(11)
/* Move 8 bytes more.  */
	bf     28,L(wus_tail16p8)
	cmplwi cr1,10,28
	lwz    6,16(12)
	lwz    7,20(12)
	stw    6,16(11)
	stw    7,20(11)
/* Move 4 bytes more.  */
	bf     29,L(wus_tail16p4)
	lwz    6,24(12)
	stw    6,24(11)
	addi   12,12,28
	addi   11,11,28
	bgt    cr1,L(wus_tail2)
	/* exactly 28 bytes.  Return original dst pointer and exit.  */
	addi   1,1,32
	blr
	.align 4
L(wus_tail16p8):	/* less than 8 bytes left.  */
	beq    cr1,L(wus_tailX)	/* exactly 16 bytes, early exit.  */
	cmplwi cr1,10,20
	bf     29,L(wus_tail16p2)
/* Move 4 bytes more.  */
	lwz    6,16(12)
	stw    6,16(11)
	addi   12,12,20
	addi   11,11,20
	bgt    cr1,L(wus_tail2)
	/* exactly 20 bytes.  Return original dst pointer and exit.  */
	addi   1,1,32
	blr
	.align 4
L(wus_tail16p4):	/* less than 4 bytes left.  */
	addi   12,12,24
	addi   11,11,24
	bgt    cr0,L(wus_tail2)
	/* exactly 24 bytes.  Return original dst pointer and exit.  */
	addi   1,1,32
	blr
	.align 4
L(wus_tail16p2):	/* 16 bytes moved, less than 4 bytes left.  */
	addi   12,12,16
	addi   11,11,16
	b      L(wus_tail2)

	.align 4
L(wus_tail8):	/* Move 8 bytes.  */
/* r6, r7 already loaded speculatively.  */
	cmplwi cr1,10,8
	cmplwi cr0,10,12
	bf     28,L(wus_tail4)
	stw    6,0(11)
	stw    7,4(11)
/* Move 4 bytes more.  */
	bf     29,L(wus_tail8p4)
	lwz    6,8(12)
	stw    6,8(11)
	addi   12,12,12
	addi   11,11,12
	bgt    cr0,L(wus_tail2)
	/* exactly 12 bytes.  Return original dst pointer and exit.  */
	addi   1,1,32
	blr
	.align 4
L(wus_tail8p4):	/* less than 4 bytes left.  */
	addi   12,12,8
	addi   11,11,8
	bgt    cr1,L(wus_tail2)
	/* exactly 8 bytes.  Return original dst pointer and exit.  */
	addi   1,1,32
	blr
	.align 4
L(wus_tail4):	/* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there are
   more than 4 bytes left, so there is no need to test.  */
	addi   12,12,4
	stw    6,0(11)
	addi   11,11,4
L(wus_tail2):	/* Move 2-3 bytes.  */
	bf     30,L(wus_tail1)
	lhz    6,0(12)
	sth    6,0(11)
	bf     31,L(wus_tailX)
	lbz    7,2(12)
	stb    7,2(11)
	addi   1,1,32
	blr
L(wus_tail1):	/* Move 1 byte.  */
	bf     31,L(wus_tailX)
	lbz    6,0(12)
	stb    6,0(11)
L(wus_tailX):
	/* Return original dst pointer.  */
	addi   1,1,32
	blr

/* Special case to copy 0-8 bytes.  */
	.align 4
L(wus_8):
	lwz    6,0(4)
	lwz    7,4(4)
	stw    6,0(3)
	stw    7,4(3)
	/* Return original dst pointer.  */
	addi   1,1,32
	blr
	.align 4
L(wus_4):
	bf     29,L(wus_2)
	lwz    6,0(4)
	stw    6,0(3)
	bf     30,L(wus_5)
	lhz    7,4(4)
	sth    7,4(3)
	bf     31,L(wus_0)
	lbz    8,6(4)
	stb    8,6(3)
	addi   1,1,32
	blr
	.align 4
L(wus_5):
	bf     31,L(wus_0)
	lbz    6,4(4)
	stb    6,4(3)
	/* Return original dst pointer.  */
	addi   1,1,32
	blr
	.align 4
L(wus_2):	/* Move 2-3 bytes.  */
	bf     30,L(wus_1)
	lhz    6,0(4)
	sth    6,0(3)
	bf     31,L(wus_0)
	lbz    7,2(4)
	stb    7,2(3)
	addi   1,1,32
	blr
	.align 4
L(wus_1):	/* Move 1 byte.  */
	bf     31,L(wus_0)
	lbz    6,0(4)
	stb    6,0(3)
	.align 3
L(wus_0):
	/* Return original dst pointer.  */
	addi   1,1,32
	blr

	.align 4
	cfi_offset(31,(24-32))
	cfi_offset(30,(20-32))
L(wdu):

/* Copy words where the destination is aligned but the source is
   not.  For power4, power5 and power6 machines there is a penalty for
   unaligned loads (src) that cross 32-byte, cacheline, or page
   boundaries.  So we want to use simple (unaligned) loads where
   possible but avoid them where we know the load would span a 32-byte
   boundary.

   At this point we know we have at least 29 (32-3) bytes to copy, the
   src is unaligned, and we may cross at least one 32-byte
   boundary.  Also we have the following register values:
   r3 == adjusted dst, word aligned
   r4 == unadjusted src
   r5 == unadjusted len
   r9 == adjusted Word length
   r10 == src alignment (1-3)
   r12 == adjusted src, not aligned
   r31 == adjusted len

   First we need to copy words up to but not crossing the next 32-byte
   boundary.  Then perform aligned loads just before and just after
   the boundary and use shifts and or to generate the next aligned
   word for dst.  If more than 32 bytes remain we copy (unaligned src)
   the next 7 words and repeat the loop until less than 32 bytes
   remain.

   Then if more than 4 bytes remain we again use aligned loads,
   shifts and or to generate the next dst word.  We then process the
   remaining words using unaligned loads as needed.  Finally we check
   if there are more than 0 bytes (1-3 bytes) remaining and use
   halfword and/or byte load/stores to complete the copy.
*/
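
/* The load/shift/or step in C terms (big-endian shown, matching the
   non-__LITTLE_ENDIAN__ paths below; little-endian swaps the shift
   directions).  Illustrative only: src is known not to be word
   aligned here, so sh is 8, 16 or 24 and neither shift count can
   reach 32:

     #include <stddef.h>
     #include <stdint.h>

     static void
     shift_or_copy (uint32_t *d, const unsigned char *src, size_t words)
     {
       const uint32_t *s
         = (const uint32_t *) ((uintptr_t) src & ~(uintptr_t) 3);
       unsigned sh = 8 * ((uintptr_t) src & 3);  // bits to shift 1st word
       unsigned rsh = 32 - sh;                   // bits to shift 2nd word
       uint32_t left = *s++ << sh;               // slw: high part of word
       while (words--)
         {
           uint32_t right = *s++;                // aligned lwz
           *d++ = left | (right >> rsh);         // srw + or, then stw
           left = right << sh;                   // carry into next word
         }
     }
*/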
	mr     4,12	/* restore unaligned adjusted src ptr */
	clrlwi 0,12,27	/* Find dist from previous 32-byte boundary.  */
	slwi   10,10,3	/* calculate number of bits to shift 1st word left */
	cmplwi cr5,0,16
	subfic 8,0,32	/* Number of bytes to next 32-byte boundary.  */

	mtcrf  0x01,8
	cmplwi cr1,10,16
	subfic 9,10,32	/* number of bits to shift 2nd word right */
/* This test is reversed because the timing to compare the bytes to the
   32-byte boundary could not be met.  So we compare the bytes from the
   previous 32-byte boundary and invert the test.  */
	bge    cr5,L(wdu_h32_8)
	.align 4
	lwz    6,0(4)
	lwz    7,4(4)
	addi   12,4,16	/* generate alternate pointers to avoid agen */
	addi   11,3,16	/* timing issues downstream.  */
	stw    6,0(3)
	stw    7,4(3)
	subi   31,31,16
	lwz    6,8(4)
	lwz    7,12(4)
	addi   4,4,16
	stw    6,8(3)
	stw    7,12(3)
	addi   3,3,16
	bf     28,L(wdu_h32_4)
	lwz    6,0(12)
	lwz    7,4(12)
	subi   31,31,8
	addi   4,4,8
	stw    6,0(11)
	stw    7,4(11)
	addi   3,3,8
	bf     29,L(wdu_h32_0)
	lwz    6,8(12)
	addi   4,4,4
	subi   31,31,4
	stw    6,8(11)
	addi   3,3,4
	b      L(wdu_h32_0)
	.align 4
L(wdu_h32_8):
	bf     28,L(wdu_h32_4)
	lwz    6,0(4)
	lwz    7,4(4)
	subi   31,31,8
	bf     29,L(wdu_h32_8x)
	stw    6,0(3)
	stw    7,4(3)
	lwz    6,8(4)
	addi   4,4,12
	subi   31,31,4
	stw    6,8(3)
	addi   3,3,12
	b      L(wdu_h32_0)
	.align 4
L(wdu_h32_8x):
	addi   4,4,8
	stw    6,0(3)
	stw    7,4(3)
	addi   3,3,8
	b      L(wdu_h32_0)
	.align 4
L(wdu_h32_4):
	bf     29,L(wdu_h32_0)
	lwz    6,0(4)
	subi   31,31,4
	addi   4,4,4
	stw    6,0(3)
	addi   3,3,4
	.align 4
L(wdu_h32_0):
/* set up for 32-byte boundary crossing word move and possibly 32-byte
   move loop.  */
	clrrwi 12,4,2
	cmplwi cr5,31,32
	bge    cr1,L(wdu2_32)
#if 0
	b      L(wdu1_32)
/*
	cmplwi cr1,10,8
	beq    cr1,L(wdu1_32)
	cmplwi cr1,10,16
	beq    cr1,L(wdu2_32)
	cmplwi cr1,10,24
	beq    cr1,L(wdu3_32)
*/
L(wdu_32):
	lwz    6,0(12)
	cmplwi cr6,31,4
	srwi   8,31,5	/* calculate the 32 byte loop count */
	slw    0,6,10
	clrlwi 31,31,27	/* The remaining bytes, < 32.  */
	blt    cr5,L(wdu_32tail)
	mtctr  8
	cmplwi cr6,31,4
	.align 4
L(wdu_loop32):
	/* copy 32 bytes at a time */
	lwz    8,4(12)
	addi   12,12,32
	lwz    7,4(4)
	srw    8,8,9
	or     0,0,8
	stw    0,0(3)
	stw    7,4(3)
	lwz    6,8(4)
	lwz    7,12(4)
	stw    6,8(3)
	stw    7,12(3)
	lwz    6,16(4)
	lwz    7,20(4)
	stw    6,16(3)
	stw    7,20(3)
	lwz    6,24(4)
	lwz    7,28(4)
	lwz    8,0(12)
	addi   4,4,32
	stw    6,24(3)
	stw    7,28(3)
	addi   3,3,32
	slw    0,8,10
	bdnz+  L(wdu_loop32)

L(wdu_32tail):
	mtcrf  0x01,31
	cmplwi cr5,31,16
	blt    cr6,L(wdu_4tail)
	/* calculate and store the final word */
	lwz    8,4(12)
	srw    8,8,9
	or     6,0,8
	b      L(wdu_32tailx)
#endif
	.align 4
L(wdu1_32):
	lwz    6,-1(4)
	cmplwi cr6,31,4
	srwi   8,31,5	/* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
	srwi   6,6,8
#else
	slwi   6,6,8
#endif
	clrlwi 31,31,27	/* The remaining bytes, < 32.  */
	blt    cr5,L(wdu1_32tail)
	mtctr  8
	cmplwi cr6,31,4

	lwz    8,3(4)
	lwz    7,4(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,24,32
#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8  */
	rlwimi 6,8,8,(32-8),31
#endif
	b      L(wdu1_loop32x)
	.align 4
L(wdu1_loop32):
	/* copy 32 bytes at a time */
	lwz    8,3(4)
	lwz    7,4(4)
	stw    10,-8(3)
	stw    11,-4(3)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,24,32
#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8  */
	rlwimi 6,8,8,(32-8),31
#endif
L(wdu1_loop32x):
	lwz    10,8(4)
	lwz    11,12(4)
	stw    6,0(3)
	stw    7,4(3)
	lwz    6,16(4)
	lwz    7,20(4)
	stw    10,8(3)
	stw    11,12(3)
	lwz    10,24(4)
	lwz    11,28(4)
	lwz    8,32-1(4)
	addi   4,4,32
	stw    6,16(3)
	stw    7,20(3)
	addi   3,3,32
#ifdef __LITTLE_ENDIAN__
	srwi   6,8,8
#else
	slwi   6,8,8
#endif
	bdnz+  L(wdu1_loop32)
	stw    10,-8(3)
	stw    11,-4(3)

L(wdu1_32tail):
	mtcrf  0x01,31
	cmplwi cr5,31,16
	blt    cr6,L(wdu_4tail)
	/* calculate and store the final word */
	lwz    8,3(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,24,32
#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8  */
	rlwimi 6,8,8,(32-8),31
#endif
	b      L(wdu_32tailx)

L(wdu2_32):
	bgt    cr1,L(wdu3_32)
	lwz    6,-2(4)
	cmplwi cr6,31,4
	srwi   8,31,5	/* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
	srwi   6,6,16
#else
	slwi   6,6,16
#endif
	clrlwi 31,31,27	/* The remaining bytes, < 32.  */
	blt    cr5,L(wdu2_32tail)
	mtctr  8
	cmplwi cr6,31,4

	lwz    8,2(4)
	lwz    7,4(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,16,32
#else
	rlwimi 6,8,16,(32-16),31
#endif
	b      L(wdu2_loop32x)
	.align 4
L(wdu2_loop32):
	/* copy 32 bytes at a time */
	lwz    8,2(4)
	lwz    7,4(4)
	stw    10,-8(3)
	stw    11,-4(3)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,16,32
#else
	rlwimi 6,8,16,(32-16),31
#endif
L(wdu2_loop32x):
	lwz    10,8(4)
	lwz    11,12(4)
	stw    6,0(3)
	stw    7,4(3)
	lwz    6,16(4)
	lwz    7,20(4)
	stw    10,8(3)
	stw    11,12(3)
	lwz    10,24(4)
	lwz    11,28(4)
/*	lwz    8,0(12) */
	lwz    8,32-2(4)
	addi   4,4,32
	stw    6,16(3)
	stw    7,20(3)
	addi   3,3,32
#ifdef __LITTLE_ENDIAN__
	srwi   6,8,16
#else
	slwi   6,8,16
#endif
	bdnz+  L(wdu2_loop32)
	stw    10,-8(3)
	stw    11,-4(3)

L(wdu2_32tail):
	mtcrf  0x01,31
	cmplwi cr5,31,16
	blt    cr6,L(wdu_4tail)
	/* calculate and store the final word */
	lwz    8,2(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,16,32
#else
	rlwimi 6,8,16,(32-16),31
#endif
	b      L(wdu_32tailx)

L(wdu3_32):
/*	lwz    6,0(12) */
	lwz    6,-3(4)
	cmplwi cr6,31,4
	srwi   8,31,5	/* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
	srwi   6,6,24
#else
	slwi   6,6,24
#endif
	clrlwi 31,31,27	/* The remaining bytes, < 32.  */
	blt    cr5,L(wdu3_32tail)
	mtctr  8
	cmplwi cr6,31,4

	lwz    8,1(4)
	lwz    7,4(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,8,32
#else
	rlwimi 6,8,24,(32-24),31
#endif
	b      L(wdu3_loop32x)
	.align 4
L(wdu3_loop32):
	/* copy 32 bytes at a time */
	lwz    8,1(4)
	lwz    7,4(4)
	stw    10,-8(3)
	stw    11,-4(3)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,8,32
#else
	rlwimi 6,8,24,(32-24),31
#endif
L(wdu3_loop32x):
	lwz    10,8(4)
	lwz    11,12(4)
	stw    6,0(3)
	stw    7,4(3)
	lwz    6,16(4)
	lwz    7,20(4)
	stw    10,8(3)
	stw    11,12(3)
	lwz    10,24(4)
	lwz    11,28(4)
	lwz    8,32-3(4)
	addi   4,4,32
	stw    6,16(3)
	stw    7,20(3)
	addi   3,3,32
#ifdef __LITTLE_ENDIAN__
	srwi   6,8,24
#else
	slwi   6,8,24
#endif
	bdnz+  L(wdu3_loop32)
	stw    10,-8(3)
	stw    11,-4(3)

L(wdu3_32tail):
	mtcrf  0x01,31
	cmplwi cr5,31,16
	blt    cr6,L(wdu_4tail)
	/* calculate and store the final word */
	lwz    8,1(4)
#ifdef __LITTLE_ENDIAN__
	rldimi 6,8,8,32
#else
	rlwimi 6,8,24,(32-24),31
#endif
	b      L(wdu_32tailx)
	.align 4
L(wdu_32tailx):
	blt    cr5,L(wdu_t32_8)
	lwz    7,4(4)
	addi   12,4,16	/* generate alternate pointers to avoid agen */
	addi   11,3,16	/* timing issues downstream.  */
	stw    6,0(3)
	stw    7,4(3)
	subi   31,31,16
	lwz    6,8(4)
	lwz    7,12(4)
	addi   4,4,16
	stw    6,8(3)
	stw    7,12(3)
	addi   3,3,16
	bf     28,L(wdu_t32_4x)
	lwz    6,0(12)
	lwz    7,4(12)
	addi   4,4,8
	subi   31,31,8
	stw    6,0(11)
	stw    7,4(11)
	addi   3,3,8
	bf     29,L(wdu_t32_0)
	lwz    6,8(12)
	addi   4,4,4
	subi   31,31,4
	stw    6,8(11)
	addi   3,3,4
	b      L(wdu_t32_0)
	.align 4
L(wdu_t32_4x):
	bf     29,L(wdu_t32_0)
	lwz    6,0(4)
	addi   4,4,4
	subi   31,31,4
	stw    6,0(3)
	addi   3,3,4
	b      L(wdu_t32_0)
	.align 4
L(wdu_t32_8):
	bf     28,L(wdu_t32_4)
	lwz    7,4(4)
	subi   31,31,8
	bf     29,L(wdu_t32_8x)
	stw    6,0(3)
	stw    7,4(3)
	lwz    6,8(4)
	subi   31,31,4
	addi   4,4,12
	stw    6,8(3)
	addi   3,3,12
	b      L(wdu_t32_0)
	.align 4
L(wdu_t32_8x):
	addi   4,4,8
	stw    6,0(3)
	stw    7,4(3)
	addi   3,3,8
	b      L(wdu_t32_0)
	.align 4
L(wdu_t32_4):
	subi   31,31,4
	stw    6,0(3)
	addi   4,4,4
	addi   3,3,4
	.align 4
L(wdu_t32_0):
L(wdu_4tail):
	cmplwi cr6,31,0
	beq    cr6,L(wdus_0)	/* If the tail is 0 bytes we are done!  */
	bf     30,L(wdus_3)
	lhz    7,0(4)
	sth    7,0(3)
	bf     31,L(wdus_0)
	lbz    8,2(4)
	stb    8,2(3)
	mr     3,30
	lwz    30,20(1)
	lwz    31,24(1)
	addi   1,1,32
	blr
	.align 4
L(wdus_3):
	bf     31,L(wus_0)
	lbz    6,0(4)
	stb    6,0(3)
	.align 4
L(wdus_0):
	/* Return original dst pointer.  */
	mr     3,30
	lwz    30,20(1)
	lwz    31,24(1)
	addi   1,1,32
	blr
END (memcpy)

libc_hidden_builtin_def (memcpy)