]>
Commit | Line | Data |
---|---|---|
04067002 | 1 | /* Optimized memcpy implementation for PowerPC64. |
bfff8b1b | 2 | Copyright (C) 2003-2017 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 | 20 | |
f17a4233 | 21 | /* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]); |
04067002 UD |
22 | Returns 'dst'. |
23 | ||
9c84384c JM |
24 | Memcpy handles short copies (< 32-bytes) using binary move blocks
25 | (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled | |
26 | with the appropriate combination of byte and halfword load/stores. | |
27 | There is minimal effort to optimize the alignment of short moves. | |
04067002 | 28 | The 64-bit implementations of POWER3 and POWER4 do a reasonable job |
2ccdea26 | 29 | of handling unaligned load/stores that do not cross 32-byte boundaries. |
04067002 UD |
30 | |
31 | Longer moves (>= 32-bytes) justify the effort to get at least the | |
32 | destination doubleword (8-byte) aligned. Further optimization is | |
2ccdea26 | 33 | possible when both source and destination are doubleword aligned. |
9c84384c JM |
34 | Each case has an optimized unrolled loop.
35 | ||
2ccdea26 | 36 | For POWER6 unaligned loads will take a 20+ cycle hiccup for any |
04067002 | 37 | L1 cache miss that crosses a 32- or 128-byte boundary. Store |
2ccdea26 | 38 | is more forgiving and does not take a hiccup until page or |
9c84384c | 39 | segment boundaries. So we require doubleword alignment for |
04067002 UD |
40 | the source but may take a risk and only require word alignment |
41 | for the destination. */ | |
42 | ||
72fd128a WSM |
43 | #ifndef MEMCPY |
44 | # define MEMCPY memcpy | |
45 | #endif | |
04067002 | 46 | .machine "power6" |
d5b41185 | 47 | ENTRY_TOCLESS (MEMCPY, 7) |
04067002 UD |
48 | CALL_MCOUNT 3 |
49 | ||
50 | cmpldi cr1,5,31 | |
51 | neg 0,3 | |
52 | std 3,-16(1) | |
53 | std 31,-8(1) | |
2ccdea26 | 54 | andi. 11,3,7 /* check alignment of dst. */ |
04067002 | 55 | clrldi 0,0,61 /* Number of bytes until the 1st doubleword of dst. */ |
2ccdea26 | 56 | clrldi 10,4,61 /* check alignment of src. */ |
04067002 UD |
57 | cmpldi cr6,5,8 |
58 | ble- cr1,.L2 /* If move < 32 bytes use short move code. */ | |
59 | mtcrf 0x01,0 | |
9c84384c | 60 | cmpld cr6,10,11 |
04067002 UD |
61 | srdi 9,5,3 /* Number of full double words remaining. */ |
62 | beq .L0 | |
9c84384c | 63 | |
04067002 | 64 | subf 5,0,5 |
2ccdea26 AB |
65 | /* Move 0-7 bytes as needed to get the destination doubleword aligned. |
66 | Duplicate some code to maximize fall-through and minimize agen delays. */ | |
04067002 UD |
67 | 1: bf 31,2f |
68 | lbz 6,0(4) | |
69 | stb 6,0(3) | |
70 | bf 30,5f | |
71 | lhz 6,1(4) | |
72 | sth 6,1(3) | |
73 | bf 29,0f | |
74 | lwz 6,3(4) | |
75 | stw 6,3(3) | |
76 | b 0f | |
77 | 5: | |
78 | bf 29,0f | |
79 | lwz 6,1(4) | |
80 | stw 6,1(3) | |
81 | b 0f | |
9c84384c | 82 | |
04067002 UD |
83 | 2: bf 30,4f |
84 | lhz 6,0(4) | |
85 | sth 6,0(3) | |
86 | bf 29,0f | |
87 | lwz 6,2(4) | |
88 | stw 6,2(3) | |
89 | b 0f | |
9c84384c | 90 | |
04067002 UD |
91 | 4: bf 29,0f |
92 | lwz 6,0(4) | |
93 | stw 6,0(3) | |
9c84384c | 94 | 0: |
04067002 UD |
95 | /* Add the number of bytes until the 1st doubleword of dst to src and dst. */ |
96 | add 4,4,0 | |
97 | add 3,3,0 | |
9c84384c | 98 | |
2ccdea26 | 99 | clrldi 10,4,61 /* check alignment of src again. */ |
04067002 | 100 | srdi 9,5,3 /* Number of full double words remaining. */ |
9c84384c | 101 | |
2ccdea26 | 102 | /* Copy doublewords from source to destination, assuming the |
04067002 UD |
103 | destination is aligned on a doubleword boundary. |
104 | ||
105 | At this point we know there are at least 25 bytes left (32-7) to copy. | |
9c84384c | 106 | The next step is to determine if the source is also doubleword aligned. |
04067002 UD |
107 | If not branch to the unaligned move code at .L6. which uses |
108 | a load, shift, store strategy. | |
9c84384c | 109 | |
04067002 UD |
110 | Otherwise source and destination are doubleword aligned, and we can |
111 | use the optimized doubleword copy loop. */ | |
112 | .align 4 | |
113 | .L0: | |
114 | clrldi 11,5,61 | |
115 | andi. 0,5,0x78 | |
116 | srdi 12,5,7 /* Number of 128-byte blocks to move. */ | |
117 | cmpldi cr1,11,0 /* If the tail is 0 bytes */ | |
118 | bne- cr6,.L6 /* If source is not DW aligned. */ | |
119 | ||
120 | /* Move doublewords where destination and source are DW aligned. | |
121 | Use an unrolled loop to copy 16 doublewords (128-bytes) per iteration. | |
ded5b9b7 | 122 | If the copy is not an exact multiple of 128 bytes, 1-15 |
04067002 UD |
123 | doublewords are copied as needed to set up the main loop. After |
124 | the main loop exits there may be a tail of 1-7 bytes. These byte | |
125 | are copied a word/halfword/byte at a time as needed to preserve | |
126 | alignment. | |
9c84384c | 127 | |
04067002 UD |
128 | For POWER6 the L1 is store-through and the L2 is store-in. The |
129 | L2 is clocked at half CPU clock so we can store 16 bytes every | |
130 | other cycle. POWER6 also has a load/store bypass so we can do | |
9c84384c JM |
131 | load, load, store, store every 2 cycles. |
132 | ||
04067002 | 133 | The following code is sensitive to cache line alignment. Do not |
2ccdea26 | 134 | make any change without first making sure they don't result in
04067002 UD |
135 | splitting ld/std pairs across a cache line. */ |
136 | ||
137 | mtcrf 0x02,5 | |
138 | mtcrf 0x01,5 | |
139 | cmpldi cr5,12,1 | |
140 | beq L(das_loop) | |
141 | ||
142 | bf 25,4f | |
143 | .align 3 | |
144 | ld 6,0(4) | |
145 | ld 7,8(4) | |
146 | mr 11,4 | |
147 | mr 10,3 | |
148 | std 6,0(3) | |
149 | std 7,8(3) | |
150 | ld 6,16(4) | |
151 | ld 7,24(4) | |
152 | std 6,16(3) | |
153 | std 7,24(3) | |
154 | ld 6,0+32(4) | |
155 | ld 7,8+32(4) | |
156 | addi 4,4,64 | |
157 | addi 3,3,64 | |
158 | std 6,0+32(10) | |
159 | std 7,8+32(10) | |
160 | ld 6,16+32(11) | |
161 | ld 7,24+32(11) | |
162 | std 6,16+32(10) | |
163 | std 7,24+32(10) | |
164 | 4: | |
165 | mr 10,3 | |
166 | bf 26,2f | |
167 | ld 6,0(4) | |
168 | ld 7,8(4) | |
169 | mr 11,4 | |
170 | nop | |
171 | std 6,0(3) | |
172 | std 7,8(3) | |
173 | ld 6,16(4) | |
174 | ld 7,24(4) | |
175 | addi 4,4,32 | |
176 | std 6,16(3) | |
177 | std 7,24(3) | |
178 | addi 3,3,32 | |
179 | 6: | |
180 | nop | |
181 | bf 27,5f | |
182 | ld 6,0+32(11) | |
183 | ld 7,8+32(11) | |
184 | addi 4,4,16 | |
185 | addi 3,3,16 | |
186 | std 6,0+32(10) | |
187 | std 7,8+32(10) | |
188 | bf 28,L(das_loop_s) | |
189 | ld 0,16+32(11) | |
190 | addi 4,4,8 | |
191 | addi 3,3,8 | |
192 | std 0,16+32(10) | |
193 | blt cr5,L(das_tail) | |
194 | b L(das_loop) | |
195 | .align 3 | |
196 | 5: | |
197 | nop | |
198 | bf 28,L(das_loop_s) | |
199 | ld 6,32(11) | |
200 | addi 4,4,8 | |
201 | addi 3,3,8 | |
202 | std 6,32(10) | |
203 | blt cr5,L(das_tail) | |
204 | b L(das_loop) | |
205 | .align 3 | |
206 | 2: | |
207 | mr 11,4 | |
208 | bf 27,1f | |
209 | ld 6,0(4) | |
210 | ld 7,8(4) | |
211 | addi 4,4,16 | |
212 | addi 3,3,16 | |
213 | std 6,0(10) | |
214 | std 7,8(10) | |
215 | bf 28,L(das_loop_s) | |
216 | ld 0,16(11) | |
217 | addi 4,11,24 | |
218 | addi 3,10,24 | |
219 | std 0,16(10) | |
220 | blt cr5,L(das_tail) | |
221 | b L(das_loop) | |
222 | .align 3 | |
223 | 1: | |
224 | nop | |
225 | bf 28,L(das_loop_s) | |
226 | ld 6,0(4) | |
227 | addi 4,4,8 | |
228 | addi 3,3,8 | |
229 | std 6,0(10) | |
230 | L(das_loop_s): | |
231 | nop | |
232 | blt cr5,L(das_tail) | |
233 | .align 4 | |
234 | L(das_loop): | |
235 | ld 6,0(4) | |
236 | ld 7,8(4) | |
237 | mr 10,3 | |
238 | mr 11,4 | |
239 | std 6,0(3) | |
240 | std 7,8(3) | |
241 | addi 12,12,-1 | |
242 | nop | |
243 | ld 8,16(4) | |
244 | ld 0,24(4) | |
245 | std 8,16(3) | |
246 | std 0,24(3) | |
247 | ||
248 | ld 6,0+32(4) | |
249 | ld 7,8+32(4) | |
250 | std 6,0+32(3) | |
251 | std 7,8+32(3) | |
252 | ld 8,16+32(4) | |
253 | ld 0,24+32(4) | |
254 | std 8,16+32(3) | |
255 | std 0,24+32(3) | |
256 | ||
257 | ld 6,0+64(11) | |
258 | ld 7,8+64(11) | |
259 | std 6,0+64(10) | |
260 | std 7,8+64(10) | |
261 | ld 8,16+64(11) | |
262 | ld 0,24+64(11) | |
263 | std 8,16+64(10) | |
264 | std 0,24+64(10) | |
265 | ||
266 | ld 6,0+96(11) | |
267 | ld 7,8+96(11) | |
268 | addi 4,4,128 | |
269 | addi 3,3,128 | |
270 | std 6,0+96(10) | |
271 | std 7,8+96(10) | |
272 | ld 8,16+96(11) | |
273 | ld 0,24+96(11) | |
274 | std 8,16+96(10) | |
275 | std 0,24+96(10) | |
276 | ble cr5,L(das_loop_e) | |
9c84384c | 277 | |
04067002 UD |
278 | mtctr 12 |
279 | .align 4 | |
280 | L(das_loop2): | |
281 | ld 6,0(4) | |
282 | ld 7,8(4) | |
283 | mr 10,3 | |
284 | mr 11,4 | |
285 | std 6,0(3) | |
286 | std 7,8(3) | |
287 | ld 8,16(4) | |
288 | ld 0,24(4) | |
289 | std 8,16(3) | |
290 | std 0,24(3) | |
291 | ||
292 | ld 6,0+32(4) | |
293 | ld 7,8+32(4) | |
294 | std 6,0+32(3) | |
295 | std 7,8+32(3) | |
296 | ld 8,16+32(4) | |
297 | ld 0,24+32(4) | |
298 | std 8,16+32(3) | |
299 | std 0,24+32(3) | |
300 | ||
301 | ld 6,0+64(11) | |
302 | ld 7,8+64(11) | |
303 | std 6,0+64(10) | |
304 | std 7,8+64(10) | |
305 | ld 8,16+64(11) | |
306 | ld 0,24+64(11) | |
307 | std 8,16+64(10) | |
308 | std 0,24+64(10) | |
309 | ||
310 | ld 6,0+96(11) | |
311 | ld 7,8+96(11) | |
312 | addi 4,4,128 | |
313 | addi 3,3,128 | |
314 | std 6,0+96(10) | |
315 | std 7,8+96(10) | |
316 | ld 8,16+96(11) | |
317 | ld 0,24+96(11) | |
318 | std 8,16+96(10) | |
319 | std 0,24+96(10) | |
320 | bdnz L(das_loop2) | |
321 | L(das_loop_e): | |
322 | /* Check for a 1-7 byte tail, return if none. */ | |
323 | bne cr1,L(das_tail2) | |
324 | /* Return original dst pointer. */ | |
325 | ld 3,-16(1) | |
326 | blr | |
327 | .align 4 | |
328 | L(das_tail): | |
329 | beq cr1,0f | |
9c84384c | 330 | |
04067002 UD |
331 | L(das_tail2): |
332 | /* At this point we have a tail of 0-7 bytes and we know that the | |
2ccdea26 | 333 | destination is double word aligned. */ |
04067002 UD |
334 | 4: bf 29,2f |
335 | lwz 6,0(4) | |
336 | stw 6,0(3) | |
337 | bf 30,5f | |
338 | lhz 6,4(4) | |
339 | sth 6,4(3) | |
340 | bf 31,0f | |
341 | lbz 6,6(4) | |
342 | stb 6,6(3) | |
343 | b 0f | |
344 | 5: bf 31,0f | |
345 | lbz 6,4(4) | |
346 | stb 6,4(3) | |
347 | b 0f | |
9c84384c | 348 | |
04067002 UD |
349 | 2: bf 30,1f |
350 | lhz 6,0(4) | |
351 | sth 6,0(3) | |
352 | bf 31,0f | |
353 | lbz 6,2(4) | |
354 | stb 6,2(3) | |
355 | b 0f | |
9c84384c | 356 | |
04067002 UD |
357 | 1: bf 31,0f |
358 | lbz 6,0(4) | |
359 | stb 6,0(3) | |
360 | 0: | |
361 | /* Return original dst pointer. */ | |
362 | ld 3,-16(1) | |
363 | blr | |
364 | ||
9c84384c | 365 | /* Copy up to 31 bytes. This divided into two cases 0-8 bytes and 9-31 |
04067002 UD |
366 | bytes. Each case is handled without loops, using binary (1,2,4,8) |
367 | tests. | |
368 | ||
369 | In the short (0-8 byte) case no attempt is made to force alignment | |
370 | of either source or destination. The hardware will handle the | |
371 | unaligned load/stores with small delays for crossing 32- 128-byte, | |
372 | and 4096-byte boundaries. Since these short moves are unlikely to be | |
373 | unaligned or cross these boundaries, the overhead to force | |
374 | alignment is not justified. | |
375 | ||
376 | The longer (9-31 byte) move is more likely to cross 32- or 128-byte | |
377 | boundaries. Since only loads are sensitive to the 32-/128-byte | |
378 | boundaries it is more important to align the source then the | |
379 | destination. If the source is not already word aligned, we first | |
380 | move 1-3 bytes as needed. Since we are only word aligned we don't | |
381 | use double word load/stores to insure that all loads are aligned. | |
382 | While the destination and stores may still be unaligned, this | |
383 | is only an issue for page (4096 byte boundary) crossing, which | |
384 | should be rare for these short moves. The hardware handles this | |
385 | case automatically with a small (~20 cycle) delay. */ | |
386 | .align 4 | |
387 | .L2: | |
388 | mtcrf 0x01,5 | |
389 | neg 8,4 | |
390 | clrrdi 11,4,2 | |
391 | andi. 0,8,3 | |
392 | ble cr6,.LE8 /* Handle moves of 0-8 bytes. */ | |
393 | /* At least 9 bytes left. Get the source word aligned. */ | |
394 | cmpldi cr1,5,16 | |
395 | mr 10,5 | |
396 | mr 12,4 | |
397 | cmpldi cr6,0,2 | |
398 | beq L(dus_tail) /* If the source is already word aligned skip this. */ | |
399 | /* Copy 1-3 bytes to get source address word aligned. */ | |
400 | lwz 6,0(11) | |
401 | subf 10,0,5 | |
402 | add 12,4,0 | |
403 | blt cr6,5f | |
404 | srdi 7,6,16 | |
405 | bgt cr6,3f | |
759cfef3 AM |
406 | #ifdef __LITTLE_ENDIAN__ |
407 | sth 7,0(3) | |
408 | #else | |
04067002 | 409 | sth 6,0(3) |
759cfef3 | 410 | #endif |
04067002 UD |
411 | b 7f |
412 | .align 4 | |
413 | 3: | |
759cfef3 AM |
414 | #ifdef __LITTLE_ENDIAN__ |
415 | rotlwi 6,6,24 | |
416 | stb 6,0(3) | |
417 | sth 7,1(3) | |
418 | #else | |
04067002 UD |
419 | stb 7,0(3) |
420 | sth 6,1(3) | |
759cfef3 | 421 | #endif |
04067002 UD |
422 | b 7f |
423 | .align 4 | |
424 | 5: | |
759cfef3 AM |
425 | #ifdef __LITTLE_ENDIAN__ |
426 | rotlwi 6,6,8 | |
427 | #endif | |
04067002 UD |
428 | stb 6,0(3) |
429 | 7: | |
430 | cmpldi cr1,10,16 | |
431 | add 3,3,0 | |
432 | mtcrf 0x01,10 | |
433 | .align 4 | |
434 | L(dus_tail): | |
435 | /* At least 6 bytes left and the source is word aligned. This allows | |
436 | some speculative loads up front. */ | |
437 | /* We need to special case the fall-through because the biggest delays | |
9c84384c | 438 | are due to address computation not being ready in time for the |
04067002 UD |
439 | AGEN. */ |
440 | lwz 6,0(12) | |
441 | lwz 7,4(12) | |
442 | blt cr1,L(dus_tail8) | |
443 | cmpldi cr0,10,24 | |
444 | L(dus_tail16): /* Move 16 bytes. */ | |
445 | stw 6,0(3) | |
446 | stw 7,4(3) | |
447 | lwz 6,8(12) | |
448 | lwz 7,12(12) | |
449 | stw 6,8(3) | |
450 | stw 7,12(3) | |
451 | /* Move 8 bytes more. */ | |
452 | bf 28,L(dus_tail16p8) | |
453 | cmpldi cr1,10,28 | |
454 | lwz 6,16(12) | |
455 | lwz 7,20(12) | |
456 | stw 6,16(3) | |
457 | stw 7,20(3) | |
458 | /* Move 4 bytes more. */ | |
459 | bf 29,L(dus_tail16p4) | |
460 | lwz 6,24(12) | |
461 | stw 6,24(3) | |
462 | addi 12,12,28 | |
463 | addi 3,3,28 | |
464 | bgt cr1,L(dus_tail2) | |
465 | /* exactly 28 bytes. Return original dst pointer and exit. */ | |
466 | ld 3,-16(1) | |
467 | blr | |
468 | .align 4 | |
f24a6d08 | 469 | L(dus_tail16p8): /* less than 8 bytes left. */ |
04067002 UD |
470 | beq cr1,L(dus_tailX) /* exactly 16 bytes, early exit. */ |
471 | cmpldi cr1,10,20 | |
472 | bf 29,L(dus_tail16p2) | |
473 | /* Move 4 bytes more. */ | |
474 | lwz 6,16(12) | |
475 | stw 6,16(3) | |
476 | addi 12,12,20 | |
477 | addi 3,3,20 | |
478 | bgt cr1,L(dus_tail2) | |
479 | /* exactly 20 bytes. Return original dst pointer and exit. */ | |
480 | ld 3,-16(1) | |
481 | blr | |
482 | .align 4 | |
f24a6d08 | 483 | L(dus_tail16p4): /* less than 4 bytes left. */ |
04067002 UD |
484 | addi 12,12,24 |
485 | addi 3,3,24 | |
486 | bgt cr0,L(dus_tail2) | |
487 | /* exactly 24 bytes. Return original dst pointer and exit. */ | |
488 | ld 3,-16(1) | |
489 | blr | |
490 | .align 4 | |
f24a6d08 | 491 | L(dus_tail16p2): /* 16 bytes moved, less than 4 bytes left. */ |
04067002 UD |
492 | addi 12,12,16 |
493 | addi 3,3,16 | |
494 | b L(dus_tail2) | |
495 | ||
496 | .align 4 | |
497 | L(dus_tail8): /* Move 8 bytes. */ | |
498 | /* r6, r7 already loaded speculatively. */ | |
499 | cmpldi cr1,10,8 | |
500 | cmpldi cr0,10,12 | |
501 | bf 28,L(dus_tail4) | |
502 | .align 2 | |
503 | stw 6,0(3) | |
504 | stw 7,4(3) | |
505 | /* Move 4 bytes more. */ | |
506 | bf 29,L(dus_tail8p4) | |
507 | lwz 6,8(12) | |
508 | stw 6,8(3) | |
509 | addi 12,12,12 | |
510 | addi 3,3,12 | |
511 | bgt cr0,L(dus_tail2) | |
512 | /* exactly 12 bytes. Return original dst pointer and exit. */ | |
513 | ld 3,-16(1) | |
514 | blr | |
515 | .align 4 | |
f24a6d08 | 516 | L(dus_tail8p4): /* less than 4 bytes left. */ |
04067002 UD |
517 | addi 12,12,8 |
518 | addi 3,3,8 | |
519 | bgt cr1,L(dus_tail2) | |
520 | /* exactly 8 bytes. Return original dst pointer and exit. */ | |
521 | ld 3,-16(1) | |
522 | blr | |
523 | ||
524 | .align 4 | |
525 | L(dus_tail4): /* Move 4 bytes. */ | |
526 | /* r6 already loaded speculatively. If we are here we know there is | |
f24a6d08 | 527 | more than 4 bytes left. So there is no need to test. */ |
04067002 UD |
528 | addi 12,12,4 |
529 | stw 6,0(3) | |
530 | addi 3,3,4 | |
531 | L(dus_tail2): /* Move 2-3 bytes. */ | |
532 | bf 30,L(dus_tail1) | |
533 | lhz 6,0(12) | |
9c84384c | 534 | sth 6,0(3) |
04067002 UD |
535 | bf 31,L(dus_tailX) |
536 | lbz 7,2(12) | |
537 | stb 7,2(3) | |
538 | ld 3,-16(1) | |
539 | blr | |
540 | L(dus_tail1): /* Move 1 byte. */ | |
541 | bf 31,L(dus_tailX) | |
542 | lbz 6,0(12) | |
543 | stb 6,0(3) | |
544 | L(dus_tailX): | |
545 | /* Return original dst pointer. */ | |
546 | ld 3,-16(1) | |
547 | blr | |
548 | ||
549 | /* Special case to copy 0-8 bytes. */ | |
550 | .align 4 | |
551 | .LE8: | |
552 | mr 12,4 | |
553 | bne cr6,L(dus_4) | |
2ccdea26 | 554 | /* Exactly 8 bytes. We may cross a 32-/128-byte boundary and take a ~20 |
04067002 UD |
555 | cycle delay. This case should be rare and any attempt to avoid this |
556 | would take most of 20 cycles any way. */ | |
557 | ld 6,0(4) | |
558 | std 6,0(3) | |
559 | /* Return original dst pointer. */ | |
560 | ld 3,-16(1) | |
561 | blr | |
562 | .align 4 | |
563 | L(dus_4): | |
564 | bf 29,L(dus_tail2) | |
565 | lwz 6,0(4) | |
566 | stw 6,0(3) | |
567 | bf 30,L(dus_5) | |
568 | lhz 7,4(4) | |
9c84384c | 569 | sth 7,4(3) |
04067002 UD |
570 | bf 31,L(dus_0) |
571 | lbz 8,6(4) | |
572 | stb 8,6(3) | |
573 | ld 3,-16(1) | |
574 | blr | |
575 | .align 4 | |
576 | L(dus_5): | |
577 | bf 31,L(dus_0) | |
578 | lbz 6,4(4) | |
579 | stb 6,4(3) | |
580 | L(dus_0): | |
581 | /* Return original dst pointer. */ | |
582 | ld 3,-16(1) | |
583 | blr | |
584 | ||
585 | .align 4 | |
586 | .L6: | |
587 | cfi_offset(31,-8) | |
588 | mr 12,4 | |
589 | mr 31,5 | |
590 | /* Copy doublewords where the destination is aligned but the source is | |
591 | not. Use aligned doubleword loads from the source, shifted to realign | |
592 | the data, to allow aligned destination stores. */ | |
593 | addi 11,9,-1 /* loop DW count is one less than total */ | |
594 | subf 5,10,12 /* Move source addr to previous full double word. */ | |
595 | cmpldi cr5, 10, 2 | |
596 | cmpldi cr0, 10, 4 | |
597 | mr 4,3 | |
598 | srdi 8,11,2 /* calculate the 32 byte loop count */ | |
599 | ld 6,0(5) /* pre load 1st full doubleword. */ | |
600 | mtcrf 0x01,11 | |
601 | cmpldi cr6,9,4 | |
602 | mtctr 8 | |
603 | ld 7,8(5) /* pre load 2nd full doubleword. */ | |
604 | bge cr0, L(du4_do) | |
605 | blt cr5, L(du1_do) | |
606 | beq cr5, L(du2_do) | |
9c84384c JM |
607 | b L(du3_do) |
608 | ||
04067002 UD |
609 | .align 4 |
610 | L(du1_do): | |
611 | bf 30,L(du1_1dw) | |
612 | ||
613 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
614 | /* FIXME: can combine last shift and "or" into "rldimi" */ |
615 | #ifdef __LITTLE_ENDIAN__ | |
616 | srdi 0,6, 8 | |
617 | sldi 8,7, 64-8 | |
618 | #else | |
04067002 UD |
619 | sldi 0,6, 8 |
620 | srdi 8,7, 64-8 | |
759cfef3 | 621 | #endif |
04067002 UD |
622 | or 0,0,8 |
623 | ld 6,16(5) | |
624 | std 0,0(4) | |
759cfef3 AM |
625 | #ifdef __LITTLE_ENDIAN__ |
626 | srdi 0,7, 8 | |
627 | sldi 8,6, 64-8 | |
628 | #else | |
04067002 UD |
629 | sldi 0,7, 8 |
630 | srdi 8,6, 64-8 | |
759cfef3 | 631 | #endif |
04067002 UD |
632 | or 0,0,8 |
633 | ld 7,24(5) | |
634 | std 0,8(4) | |
635 | addi 4,4,16 | |
636 | addi 5,5,32 | |
637 | blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */ | |
638 | bf 31,L(du1_loop) | |
639 | /* there is a third DW to copy */ | |
759cfef3 AM |
640 | #ifdef __LITTLE_ENDIAN__ |
641 | srdi 0,6, 8 | |
642 | sldi 8,7, 64-8 | |
643 | #else | |
04067002 UD |
644 | sldi 0,6, 8 |
645 | srdi 8,7, 64-8 | |
759cfef3 | 646 | #endif |
04067002 UD |
647 | or 0,0,8 |
648 | std 0,0(4) | |
649 | mr 6,7 | |
650 | ld 7,0(5) | |
651 | addi 5,5,8 | |
652 | addi 4,4,8 | |
653 | beq cr6,L(du1_fini) /* if total DWs = 4, then bypass loop */ | |
654 | b L(du1_loop) | |
655 | .align 4 | |
656 | L(du1_1dw): | |
759cfef3 AM |
657 | #ifdef __LITTLE_ENDIAN__ |
658 | srdi 0,6, 8 | |
659 | sldi 8,7, 64-8 | |
660 | #else | |
04067002 UD |
661 | sldi 0,6, 8 |
662 | srdi 8,7, 64-8 | |
759cfef3 | 663 | #endif |
04067002 UD |
664 | addi 5,5,16 |
665 | or 0,0,8 | |
666 | bf 31,L(du1_loop) | |
667 | mr 6,7 | |
668 | ld 7,0(5) | |
669 | addi 5,5,8 | |
670 | std 0,0(4) | |
671 | addi 4,4,8 | |
672 | .align 4 | |
673 | /* copy 32 bytes at a time */ | |
674 | L(du1_loop): | |
759cfef3 AM |
675 | #ifdef __LITTLE_ENDIAN__ |
676 | srdi 0,6, 8 | |
677 | sldi 8,7, 64-8 | |
678 | #else | |
04067002 UD |
679 | sldi 0,6, 8 |
680 | srdi 8,7, 64-8 | |
759cfef3 | 681 | #endif |
04067002 UD |
682 | or 0,0,8 |
683 | ld 6,0(5) | |
684 | std 0,0(4) | |
759cfef3 AM |
685 | #ifdef __LITTLE_ENDIAN__ |
686 | srdi 0,7, 8 | |
687 | sldi 8,6, 64-8 | |
688 | #else | |
04067002 UD |
689 | sldi 0,7, 8 |
690 | srdi 8,6, 64-8 | |
759cfef3 | 691 | #endif |
04067002 UD |
692 | or 0,0,8 |
693 | ld 7,8(5) | |
694 | std 0,8(4) | |
759cfef3 AM |
695 | #ifdef __LITTLE_ENDIAN__ |
696 | srdi 0,6, 8 | |
697 | sldi 8,7, 64-8 | |
698 | #else | |
04067002 UD |
699 | sldi 0,6, 8 |
700 | srdi 8,7, 64-8 | |
759cfef3 | 701 | #endif |
04067002 UD |
702 | or 0,0,8 |
703 | ld 6,16(5) | |
704 | std 0,16(4) | |
759cfef3 AM |
705 | #ifdef __LITTLE_ENDIAN__ |
706 | srdi 0,7, 8 | |
707 | sldi 8,6, 64-8 | |
708 | #else | |
04067002 UD |
709 | sldi 0,7, 8 |
710 | srdi 8,6, 64-8 | |
759cfef3 | 711 | #endif |
04067002 UD |
712 | or 0,0,8 |
713 | ld 7,24(5) | |
714 | std 0,24(4) | |
715 | addi 5,5,32 | |
716 | addi 4,4,32 | |
717 | bdnz+ L(du1_loop) | |
718 | .align 4 | |
719 | L(du1_fini): | |
720 | /* calculate and store the final DW */ | |
759cfef3 AM |
721 | #ifdef __LITTLE_ENDIAN__ |
722 | srdi 0,6, 8 | |
723 | sldi 8,7, 64-8 | |
724 | #else | |
04067002 UD |
725 | sldi 0,6, 8 |
726 | srdi 8,7, 64-8 | |
759cfef3 | 727 | #endif |
9c84384c | 728 | or 0,0,8 |
04067002 UD |
729 | std 0,0(4) |
730 | b L(du_done) | |
731 | ||
732 | .align 4 | |
733 | L(du2_do): | |
734 | bf 30,L(du2_1dw) | |
735 | ||
736 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
737 | #ifdef __LITTLE_ENDIAN__ |
738 | srdi 0,6, 16 | |
739 | sldi 8,7, 64-16 | |
740 | #else | |
04067002 UD |
741 | sldi 0,6, 16 |
742 | srdi 8,7, 64-16 | |
759cfef3 | 743 | #endif |
04067002 UD |
744 | or 0,0,8 |
745 | ld 6,16(5) | |
746 | std 0,0(4) | |
759cfef3 AM |
747 | #ifdef __LITTLE_ENDIAN__ |
748 | srdi 0,7, 16 | |
749 | sldi 8,6, 64-16 | |
750 | #else | |
04067002 UD |
751 | sldi 0,7, 16 |
752 | srdi 8,6, 64-16 | |
759cfef3 | 753 | #endif |
04067002 UD |
754 | or 0,0,8 |
755 | ld 7,24(5) | |
756 | std 0,8(4) | |
757 | addi 4,4,16 | |
758 | addi 5,5,32 | |
759 | blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */ | |
760 | bf 31,L(du2_loop) | |
761 | /* there is a third DW to copy */ | |
759cfef3 AM |
762 | #ifdef __LITTLE_ENDIAN__ |
763 | srdi 0,6, 16 | |
764 | sldi 8,7, 64-16 | |
765 | #else | |
04067002 UD |
766 | sldi 0,6, 16 |
767 | srdi 8,7, 64-16 | |
759cfef3 | 768 | #endif |
04067002 UD |
769 | or 0,0,8 |
770 | std 0,0(4) | |
771 | mr 6,7 | |
772 | ld 7,0(5) | |
773 | addi 5,5,8 | |
774 | addi 4,4,8 | |
775 | beq cr6,L(du2_fini) /* if total DWs = 4, then bypass loop */ | |
776 | b L(du2_loop) | |
777 | .align 4 | |
778 | L(du2_1dw): | |
759cfef3 AM |
779 | #ifdef __LITTLE_ENDIAN__ |
780 | srdi 0,6, 16 | |
781 | sldi 8,7, 64-16 | |
782 | #else | |
04067002 UD |
783 | sldi 0,6, 16 |
784 | srdi 8,7, 64-16 | |
759cfef3 | 785 | #endif |
04067002 UD |
786 | addi 5,5,16 |
787 | or 0,0,8 | |
788 | bf 31,L(du2_loop) | |
789 | mr 6,7 | |
790 | ld 7,0(5) | |
791 | addi 5,5,8 | |
792 | std 0,0(4) | |
793 | addi 4,4,8 | |
794 | .align 4 | |
795 | /* copy 32 bytes at a time */ | |
796 | L(du2_loop): | |
759cfef3 AM |
797 | #ifdef __LITTLE_ENDIAN__ |
798 | srdi 0,6, 16 | |
799 | sldi 8,7, 64-16 | |
800 | #else | |
04067002 UD |
801 | sldi 0,6, 16 |
802 | srdi 8,7, 64-16 | |
759cfef3 | 803 | #endif |
04067002 UD |
804 | or 0,0,8 |
805 | ld 6,0(5) | |
806 | std 0,0(4) | |
759cfef3 AM |
807 | #ifdef __LITTLE_ENDIAN__ |
808 | srdi 0,7, 16 | |
809 | sldi 8,6, 64-16 | |
810 | #else | |
04067002 UD |
811 | sldi 0,7, 16 |
812 | srdi 8,6, 64-16 | |
759cfef3 | 813 | #endif |
04067002 UD |
814 | or 0,0,8 |
815 | ld 7,8(5) | |
816 | std 0,8(4) | |
759cfef3 AM |
817 | #ifdef __LITTLE_ENDIAN__ |
818 | srdi 0,6, 16 | |
819 | sldi 8,7, 64-16 | |
820 | #else | |
04067002 UD |
821 | sldi 0,6, 16 |
822 | srdi 8,7, 64-16 | |
759cfef3 | 823 | #endif |
04067002 UD |
824 | or 0,0,8 |
825 | ld 6,16(5) | |
826 | std 0,16(4) | |
759cfef3 AM |
827 | #ifdef __LITTLE_ENDIAN__ |
828 | srdi 0,7, 16 | |
829 | sldi 8,6, 64-16 | |
830 | #else | |
04067002 UD |
831 | sldi 0,7, 16 |
832 | srdi 8,6, 64-16 | |
759cfef3 | 833 | #endif |
04067002 UD |
834 | or 0,0,8 |
835 | ld 7,24(5) | |
836 | std 0,24(4) | |
837 | addi 5,5,32 | |
838 | addi 4,4,32 | |
839 | bdnz+ L(du2_loop) | |
840 | .align 4 | |
841 | L(du2_fini): | |
842 | /* calculate and store the final DW */ | |
759cfef3 AM |
843 | #ifdef __LITTLE_ENDIAN__ |
844 | srdi 0,6, 16 | |
845 | sldi 8,7, 64-16 | |
846 | #else | |
04067002 UD |
847 | sldi 0,6, 16 |
848 | srdi 8,7, 64-16 | |
759cfef3 | 849 | #endif |
9c84384c | 850 | or 0,0,8 |
04067002 UD |
851 | std 0,0(4) |
852 | b L(du_done) | |
853 | ||
854 | .align 4 | |
855 | L(du3_do): | |
856 | bf 30,L(du3_1dw) | |
857 | ||
858 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
859 | #ifdef __LITTLE_ENDIAN__ |
860 | srdi 0,6, 24 | |
861 | sldi 8,7, 64-24 | |
862 | #else | |
04067002 UD |
863 | sldi 0,6, 24 |
864 | srdi 8,7, 64-24 | |
759cfef3 | 865 | #endif |
04067002 UD |
866 | or 0,0,8 |
867 | ld 6,16(5) | |
868 | std 0,0(4) | |
759cfef3 AM |
869 | #ifdef __LITTLE_ENDIAN__ |
870 | srdi 0,7, 24 | |
871 | sldi 8,6, 64-24 | |
872 | #else | |
04067002 UD |
873 | sldi 0,7, 24 |
874 | srdi 8,6, 64-24 | |
759cfef3 | 875 | #endif |
04067002 UD |
876 | or 0,0,8 |
877 | ld 7,24(5) | |
878 | std 0,8(4) | |
879 | addi 4,4,16 | |
880 | addi 5,5,32 | |
881 | blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */ | |
882 | bf 31,L(du3_loop) | |
883 | /* there is a third DW to copy */ | |
759cfef3 AM |
884 | #ifdef __LITTLE_ENDIAN__ |
885 | srdi 0,6, 24 | |
886 | sldi 8,7, 64-24 | |
887 | #else | |
04067002 UD |
888 | sldi 0,6, 24 |
889 | srdi 8,7, 64-24 | |
759cfef3 | 890 | #endif |
04067002 UD |
891 | or 0,0,8 |
892 | std 0,0(4) | |
893 | mr 6,7 | |
894 | ld 7,0(5) | |
895 | addi 5,5,8 | |
896 | addi 4,4,8 | |
897 | beq cr6,L(du3_fini) /* if total DWs = 4, then bypass loop */ | |
898 | b L(du3_loop) | |
899 | .align 4 | |
900 | L(du3_1dw): | |
759cfef3 AM |
901 | #ifdef __LITTLE_ENDIAN__ |
902 | srdi 0,6, 24 | |
903 | sldi 8,7, 64-24 | |
904 | #else | |
04067002 UD |
905 | sldi 0,6, 24 |
906 | srdi 8,7, 64-24 | |
759cfef3 | 907 | #endif |
04067002 UD |
908 | addi 5,5,16 |
909 | or 0,0,8 | |
910 | bf 31,L(du3_loop) | |
911 | mr 6,7 | |
912 | ld 7,0(5) | |
913 | addi 5,5,8 | |
914 | std 0,0(4) | |
915 | addi 4,4,8 | |
916 | .align 4 | |
917 | /* copy 32 bytes at a time */ | |
918 | L(du3_loop): | |
759cfef3 AM |
919 | #ifdef __LITTLE_ENDIAN__ |
920 | srdi 0,6, 24 | |
921 | sldi 8,7, 64-24 | |
922 | #else | |
04067002 UD |
923 | sldi 0,6, 24 |
924 | srdi 8,7, 64-24 | |
759cfef3 | 925 | #endif |
04067002 UD |
926 | or 0,0,8 |
927 | ld 6,0(5) | |
928 | std 0,0(4) | |
759cfef3 AM |
929 | #ifdef __LITTLE_ENDIAN__ |
930 | srdi 0,7, 24 | |
931 | sldi 8,6, 64-24 | |
932 | #else | |
04067002 UD |
933 | sldi 0,7, 24 |
934 | srdi 8,6, 64-24 | |
759cfef3 | 935 | #endif |
04067002 UD |
936 | or 0,0,8 |
937 | ld 7,8(5) | |
938 | std 0,8(4) | |
759cfef3 AM |
939 | #ifdef __LITTLE_ENDIAN__ |
940 | srdi 0,6, 24 | |
941 | sldi 8,7, 64-24 | |
942 | #else | |
04067002 UD |
943 | sldi 0,6, 24 |
944 | srdi 8,7, 64-24 | |
759cfef3 | 945 | #endif |
04067002 UD |
946 | or 0,0,8 |
947 | ld 6,16(5) | |
948 | std 0,16(4) | |
759cfef3 AM |
949 | #ifdef __LITTLE_ENDIAN__ |
950 | srdi 0,7, 24 | |
951 | sldi 8,6, 64-24 | |
952 | #else | |
04067002 UD |
953 | sldi 0,7, 24 |
954 | srdi 8,6, 64-24 | |
759cfef3 | 955 | #endif |
04067002 UD |
956 | or 0,0,8 |
957 | ld 7,24(5) | |
958 | std 0,24(4) | |
959 | addi 5,5,32 | |
960 | addi 4,4,32 | |
961 | bdnz+ L(du3_loop) | |
962 | .align 4 | |
963 | L(du3_fini): | |
964 | /* calculate and store the final DW */ | |
759cfef3 AM |
965 | #ifdef __LITTLE_ENDIAN__ |
966 | srdi 0,6, 24 | |
967 | sldi 8,7, 64-24 | |
968 | #else | |
04067002 UD |
969 | sldi 0,6, 24 |
970 | srdi 8,7, 64-24 | |
759cfef3 | 971 | #endif |
9c84384c | 972 | or 0,0,8 |
04067002 UD |
973 | std 0,0(4) |
974 | b L(du_done) | |
975 | ||
976 | .align 4 | |
977 | L(du4_do): | |
978 | cmpldi cr5, 10, 6 | |
979 | beq cr0, L(du4_dox) | |
980 | blt cr5, L(du5_do) | |
981 | beq cr5, L(du6_do) | |
982 | b L(du7_do) | |
983 | L(du4_dox): | |
984 | bf 30,L(du4_1dw) | |
985 | ||
986 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
987 | #ifdef __LITTLE_ENDIAN__ |
988 | srdi 0,6, 32 | |
989 | sldi 8,7, 64-32 | |
990 | #else | |
04067002 UD |
991 | sldi 0,6, 32 |
992 | srdi 8,7, 64-32 | |
759cfef3 | 993 | #endif |
04067002 UD |
994 | or 0,0,8 |
995 | ld 6,16(5) | |
996 | std 0,0(4) | |
759cfef3 AM |
997 | #ifdef __LITTLE_ENDIAN__ |
998 | srdi 0,7, 32 | |
999 | sldi 8,6, 64-32 | |
1000 | #else | |
04067002 UD |
1001 | sldi 0,7, 32 |
1002 | srdi 8,6, 64-32 | |
759cfef3 | 1003 | #endif |
04067002 UD |
1004 | or 0,0,8 |
1005 | ld 7,24(5) | |
1006 | std 0,8(4) | |
1007 | addi 4,4,16 | |
1008 | addi 5,5,32 | |
1009 | blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */ | |
1010 | bf 31,L(du4_loop) | |
1011 | /* there is a third DW to copy */ | |
759cfef3 AM |
1012 | #ifdef __LITTLE_ENDIAN__ |
1013 | srdi 0,6, 32 | |
1014 | sldi 8,7, 64-32 | |
1015 | #else | |
04067002 UD |
1016 | sldi 0,6, 32 |
1017 | srdi 8,7, 64-32 | |
759cfef3 | 1018 | #endif |
04067002 UD |
1019 | or 0,0,8 |
1020 | std 0,0(4) | |
1021 | mr 6,7 | |
1022 | ld 7,0(5) | |
1023 | addi 5,5,8 | |
1024 | addi 4,4,8 | |
1025 | beq cr6,L(du4_fini) /* if total DWs = 4, then bypass loop */ | |
1026 | b L(du4_loop) | |
1027 | .align 4 | |
1028 | L(du4_1dw): | |
759cfef3 AM |
1029 | #ifdef __LITTLE_ENDIAN__ |
1030 | srdi 0,6, 32 | |
1031 | sldi 8,7, 64-32 | |
1032 | #else | |
04067002 UD |
1033 | sldi 0,6, 32 |
1034 | srdi 8,7, 64-32 | |
759cfef3 | 1035 | #endif |
04067002 UD |
1036 | addi 5,5,16 |
1037 | or 0,0,8 | |
1038 | bf 31,L(du4_loop) | |
1039 | mr 6,7 | |
1040 | ld 7,0(5) | |
1041 | addi 5,5,8 | |
1042 | std 0,0(4) | |
1043 | addi 4,4,8 | |
1044 | .align 4 | |
1045 | /* copy 32 bytes at a time */ | |
1046 | L(du4_loop): | |
759cfef3 AM |
1047 | #ifdef __LITTLE_ENDIAN__ |
1048 | srdi 0,6, 32 | |
1049 | sldi 8,7, 64-32 | |
1050 | #else | |
04067002 UD |
1051 | sldi 0,6, 32 |
1052 | srdi 8,7, 64-32 | |
759cfef3 | 1053 | #endif |
04067002 UD |
1054 | or 0,0,8 |
1055 | ld 6,0(5) | |
1056 | std 0,0(4) | |
759cfef3 AM |
1057 | #ifdef __LITTLE_ENDIAN__ |
1058 | srdi 0,7, 32 | |
1059 | sldi 8,6, 64-32 | |
1060 | #else | |
04067002 UD |
1061 | sldi 0,7, 32 |
1062 | srdi 8,6, 64-32 | |
759cfef3 | 1063 | #endif |
04067002 UD |
1064 | or 0,0,8 |
1065 | ld 7,8(5) | |
1066 | std 0,8(4) | |
759cfef3 AM |
1067 | #ifdef __LITTLE_ENDIAN__ |
1068 | srdi 0,6, 32 | |
1069 | sldi 8,7, 64-32 | |
1070 | #else | |
04067002 UD |
1071 | sldi 0,6, 32 |
1072 | srdi 8,7, 64-32 | |
759cfef3 | 1073 | #endif |
04067002 UD |
1074 | or 0,0,8 |
1075 | ld 6,16(5) | |
1076 | std 0,16(4) | |
759cfef3 AM |
1077 | #ifdef __LITTLE_ENDIAN__ |
1078 | srdi 0,7, 32 | |
1079 | sldi 8,6, 64-32 | |
1080 | #else | |
04067002 UD |
1081 | sldi 0,7, 32 |
1082 | srdi 8,6, 64-32 | |
759cfef3 | 1083 | #endif |
04067002 UD |
1084 | or 0,0,8 |
1085 | ld 7,24(5) | |
1086 | std 0,24(4) | |
1087 | addi 5,5,32 | |
1088 | addi 4,4,32 | |
1089 | bdnz+ L(du4_loop) | |
1090 | .align 4 | |
1091 | L(du4_fini): | |
1092 | /* calculate and store the final DW */ | |
759cfef3 AM |
1093 | #ifdef __LITTLE_ENDIAN__ |
1094 | srdi 0,6, 32 | |
1095 | sldi 8,7, 64-32 | |
1096 | #else | |
04067002 UD |
1097 | sldi 0,6, 32 |
1098 | srdi 8,7, 64-32 | |
759cfef3 | 1099 | #endif |
9c84384c | 1100 | or 0,0,8 |
04067002 UD |
1101 | std 0,0(4) |
1102 | b L(du_done) | |
1103 | ||
1104 | .align 4 | |
1105 | L(du5_do): | |
1106 | bf 30,L(du5_1dw) | |
1107 | ||
1108 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
1109 | #ifdef __LITTLE_ENDIAN__ |
1110 | srdi 0,6, 40 | |
1111 | sldi 8,7, 64-40 | |
1112 | #else | |
04067002 UD |
1113 | sldi 0,6, 40 |
1114 | srdi 8,7, 64-40 | |
759cfef3 | 1115 | #endif |
04067002 UD |
1116 | or 0,0,8 |
1117 | ld 6,16(5) | |
1118 | std 0,0(4) | |
759cfef3 AM |
1119 | #ifdef __LITTLE_ENDIAN__ |
1120 | srdi 0,7, 40 | |
1121 | sldi 8,6, 64-40 | |
1122 | #else | |
04067002 UD |
1123 | sldi 0,7, 40 |
1124 | srdi 8,6, 64-40 | |
759cfef3 | 1125 | #endif |
04067002 UD |
1126 | or 0,0,8 |
1127 | ld 7,24(5) | |
1128 | std 0,8(4) | |
1129 | addi 4,4,16 | |
1130 | addi 5,5,32 | |
1131 | blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */ | |
1132 | bf 31,L(du5_loop) | |
1133 | /* there is a third DW to copy */ | |
759cfef3 AM |
1134 | #ifdef __LITTLE_ENDIAN__ |
1135 | srdi 0,6, 40 | |
1136 | sldi 8,7, 64-40 | |
1137 | #else | |
04067002 UD |
1138 | sldi 0,6, 40 |
1139 | srdi 8,7, 64-40 | |
759cfef3 | 1140 | #endif |
04067002 UD |
1141 | or 0,0,8 |
1142 | std 0,0(4) | |
1143 | mr 6,7 | |
1144 | ld 7,0(5) | |
1145 | addi 5,5,8 | |
1146 | addi 4,4,8 | |
1147 | beq cr6,L(du5_fini) /* if total DWs = 4, then bypass loop */ | |
1148 | b L(du5_loop) | |
1149 | .align 4 | |
1150 | L(du5_1dw): | |
759cfef3 AM |
1151 | #ifdef __LITTLE_ENDIAN__ |
1152 | srdi 0,6, 40 | |
1153 | sldi 8,7, 64-40 | |
1154 | #else | |
04067002 UD |
1155 | sldi 0,6, 40 |
1156 | srdi 8,7, 64-40 | |
759cfef3 | 1157 | #endif |
04067002 UD |
1158 | addi 5,5,16 |
1159 | or 0,0,8 | |
1160 | bf 31,L(du5_loop) | |
1161 | mr 6,7 | |
1162 | ld 7,0(5) | |
1163 | addi 5,5,8 | |
1164 | std 0,0(4) | |
1165 | addi 4,4,8 | |
1166 | .align 4 | |
1167 | /* copy 32 bytes at a time */ | |
1168 | L(du5_loop): | |
759cfef3 AM |
1169 | #ifdef __LITTLE_ENDIAN__ |
1170 | srdi 0,6, 40 | |
1171 | sldi 8,7, 64-40 | |
1172 | #else | |
04067002 UD |
1173 | sldi 0,6, 40 |
1174 | srdi 8,7, 64-40 | |
759cfef3 | 1175 | #endif |
04067002 UD |
1176 | or 0,0,8 |
1177 | ld 6,0(5) | |
1178 | std 0,0(4) | |
759cfef3 AM |
1179 | #ifdef __LITTLE_ENDIAN__ |
1180 | srdi 0,7, 40 | |
1181 | sldi 8,6, 64-40 | |
1182 | #else | |
04067002 UD |
1183 | sldi 0,7, 40 |
1184 | srdi 8,6, 64-40 | |
759cfef3 | 1185 | #endif |
04067002 UD |
1186 | or 0,0,8 |
1187 | ld 7,8(5) | |
1188 | std 0,8(4) | |
759cfef3 AM |
1189 | #ifdef __LITTLE_ENDIAN__ |
1190 | srdi 0,6, 40 | |
1191 | sldi 8,7, 64-40 | |
1192 | #else | |
04067002 UD |
1193 | sldi 0,6, 40 |
1194 | srdi 8,7, 64-40 | |
759cfef3 | 1195 | #endif |
04067002 UD |
1196 | or 0,0,8 |
1197 | ld 6,16(5) | |
1198 | std 0,16(4) | |
759cfef3 AM |
1199 | #ifdef __LITTLE_ENDIAN__ |
1200 | srdi 0,7, 40 | |
1201 | sldi 8,6, 64-40 | |
1202 | #else | |
04067002 UD |
1203 | sldi 0,7, 40 |
1204 | srdi 8,6, 64-40 | |
759cfef3 | 1205 | #endif |
04067002 UD |
1206 | or 0,0,8 |
1207 | ld 7,24(5) | |
1208 | std 0,24(4) | |
1209 | addi 5,5,32 | |
1210 | addi 4,4,32 | |
1211 | bdnz+ L(du5_loop) | |
1212 | .align 4 | |
1213 | L(du5_fini): | |
1214 | /* calculate and store the final DW */ | |
759cfef3 AM |
1215 | #ifdef __LITTLE_ENDIAN__ |
1216 | srdi 0,6, 40 | |
1217 | sldi 8,7, 64-40 | |
1218 | #else | |
04067002 UD |
1219 | sldi 0,6, 40 |
1220 | srdi 8,7, 64-40 | |
759cfef3 | 1221 | #endif |
9c84384c | 1222 | or 0,0,8 |
04067002 UD |
1223 | std 0,0(4) |
1224 | b L(du_done) | |
1225 | ||
1226 | .align 4 | |
1227 | L(du6_do): | |
1228 | bf 30,L(du6_1dw) | |
1229 | ||
1230 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
1231 | #ifdef __LITTLE_ENDIAN__ |
1232 | srdi 0,6, 48 | |
1233 | sldi 8,7, 64-48 | |
1234 | #else | |
04067002 UD |
1235 | sldi 0,6, 48 |
1236 | srdi 8,7, 64-48 | |
759cfef3 | 1237 | #endif |
04067002 UD |
1238 | or 0,0,8 |
1239 | ld 6,16(5) | |
1240 | std 0,0(4) | |
759cfef3 AM |
1241 | #ifdef __LITTLE_ENDIAN__ |
1242 | srdi 0,7, 48 | |
1243 | sldi 8,6, 64-48 | |
1244 | #else | |
04067002 UD |
1245 | sldi 0,7, 48 |
1246 | srdi 8,6, 64-48 | |
759cfef3 | 1247 | #endif |
04067002 UD |
1248 | or 0,0,8 |
1249 | ld 7,24(5) | |
1250 | std 0,8(4) | |
1251 | addi 4,4,16 | |
1252 | addi 5,5,32 | |
1253 | blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */ | |
1254 | bf 31,L(du6_loop) | |
1255 | /* there is a third DW to copy */ | |
759cfef3 AM |
1256 | #ifdef __LITTLE_ENDIAN__ |
1257 | srdi 0,6, 48 | |
1258 | sldi 8,7, 64-48 | |
1259 | #else | |
04067002 UD |
1260 | sldi 0,6, 48 |
1261 | srdi 8,7, 64-48 | |
759cfef3 | 1262 | #endif |
04067002 UD |
1263 | or 0,0,8 |
1264 | std 0,0(4) | |
1265 | mr 6,7 | |
1266 | ld 7,0(5) | |
1267 | addi 5,5,8 | |
1268 | addi 4,4,8 | |
1269 | beq cr6,L(du6_fini) /* if total DWs = 4, then bypass loop */ | |
1270 | b L(du6_loop) | |
1271 | .align 4 | |
1272 | L(du6_1dw): | |
759cfef3 AM |
1273 | #ifdef __LITTLE_ENDIAN__ |
1274 | srdi 0,6, 48 | |
1275 | sldi 8,7, 64-48 | |
1276 | #else | |
04067002 UD |
1277 | sldi 0,6, 48 |
1278 | srdi 8,7, 64-48 | |
759cfef3 | 1279 | #endif |
04067002 UD |
1280 | addi 5,5,16 |
1281 | or 0,0,8 | |
1282 | bf 31,L(du6_loop) | |
1283 | mr 6,7 | |
1284 | ld 7,0(5) | |
1285 | addi 5,5,8 | |
1286 | std 0,0(4) | |
1287 | addi 4,4,8 | |
1288 | .align 4 | |
1289 | /* copy 32 bytes at a time */ | |
1290 | L(du6_loop): | |
759cfef3 AM |
1291 | #ifdef __LITTLE_ENDIAN__ |
1292 | srdi 0,6, 48 | |
1293 | sldi 8,7, 64-48 | |
1294 | #else | |
04067002 UD |
1295 | sldi 0,6, 48 |
1296 | srdi 8,7, 64-48 | |
759cfef3 | 1297 | #endif |
04067002 UD |
1298 | or 0,0,8 |
1299 | ld 6,0(5) | |
1300 | std 0,0(4) | |
759cfef3 AM |
1301 | #ifdef __LITTLE_ENDIAN__ |
1302 | srdi 0,7, 48 | |
1303 | sldi 8,6, 64-48 | |
1304 | #else | |
04067002 UD |
1305 | sldi 0,7, 48 |
1306 | srdi 8,6, 64-48 | |
759cfef3 | 1307 | #endif |
04067002 UD |
1308 | or 0,0,8 |
1309 | ld 7,8(5) | |
1310 | std 0,8(4) | |
759cfef3 AM |
1311 | #ifdef __LITTLE_ENDIAN__ |
1312 | srdi 0,6, 48 | |
1313 | sldi 8,7, 64-48 | |
1314 | #else | |
04067002 UD |
1315 | sldi 0,6, 48 |
1316 | srdi 8,7, 64-48 | |
759cfef3 | 1317 | #endif |
04067002 UD |
1318 | or 0,0,8 |
1319 | ld 6,16(5) | |
1320 | std 0,16(4) | |
759cfef3 AM |
1321 | #ifdef __LITTLE_ENDIAN__ |
1322 | srdi 0,7, 48 | |
1323 | sldi 8,6, 64-48 | |
1324 | #else | |
04067002 UD |
1325 | sldi 0,7, 48 |
1326 | srdi 8,6, 64-48 | |
759cfef3 | 1327 | #endif |
04067002 UD |
1328 | or 0,0,8 |
1329 | ld 7,24(5) | |
1330 | std 0,24(4) | |
1331 | addi 5,5,32 | |
1332 | addi 4,4,32 | |
1333 | bdnz+ L(du6_loop) | |
1334 | .align 4 | |
1335 | L(du6_fini): | |
1336 | /* calculate and store the final DW */ | |
759cfef3 AM |
1337 | #ifdef __LITTLE_ENDIAN__ |
1338 | srdi 0,6, 48 | |
1339 | sldi 8,7, 64-48 | |
1340 | #else | |
04067002 UD |
1341 | sldi 0,6, 48 |
1342 | srdi 8,7, 64-48 | |
759cfef3 | 1343 | #endif |
9c84384c | 1344 | or 0,0,8 |
04067002 UD |
1345 | std 0,0(4) |
1346 | b L(du_done) | |
1347 | ||
1348 | .align 4 | |
1349 | L(du7_do): | |
1350 | bf 30,L(du7_1dw) | |
1351 | ||
1352 | /* there are at least two DWs to copy */ | |
759cfef3 AM |
1353 | #ifdef __LITTLE_ENDIAN__ |
1354 | srdi 0,6, 56 | |
1355 | sldi 8,7, 64-56 | |
1356 | #else | |
04067002 UD |
1357 | sldi 0,6, 56 |
1358 | srdi 8,7, 64-56 | |
759cfef3 | 1359 | #endif |
04067002 UD |
1360 | or 0,0,8 |
1361 | ld 6,16(5) | |
1362 | std 0,0(4) | |
759cfef3 AM |
1363 | #ifdef __LITTLE_ENDIAN__ |
1364 | srdi 0,7, 56 | |
1365 | sldi 8,6, 64-56 | |
1366 | #else | |
04067002 UD |
1367 | sldi 0,7, 56 |
1368 | srdi 8,6, 64-56 | |
759cfef3 | 1369 | #endif |
04067002 UD |
1370 | or 0,0,8 |
1371 | ld 7,24(5) | |
1372 | std 0,8(4) | |
1373 | addi 4,4,16 | |
1374 | addi 5,5,32 | |
1375 | blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */ | |
1376 | bf 31,L(du7_loop) | |
1377 | /* there is a third DW to copy */ | |
759cfef3 AM |
1378 | #ifdef __LITTLE_ENDIAN__ |
1379 | srdi 0,6, 56 | |
1380 | sldi 8,7, 64-56 | |
1381 | #else | |
04067002 UD |
1382 | sldi 0,6, 56 |
1383 | srdi 8,7, 64-56 | |
759cfef3 | 1384 | #endif |
04067002 UD |
1385 | or 0,0,8 |
1386 | std 0,0(4) | |
1387 | mr 6,7 | |
1388 | ld 7,0(5) | |
1389 | addi 5,5,8 | |
1390 | addi 4,4,8 | |
1391 | beq cr6,L(du7_fini) /* if total DWs = 4, then bypass loop */ | |
1392 | b L(du7_loop) | |
1393 | .align 4 | |
1394 | L(du7_1dw): | |
759cfef3 AM |
1395 | #ifdef __LITTLE_ENDIAN__ |
1396 | srdi 0,6, 56 | |
1397 | sldi 8,7, 64-56 | |
1398 | #else | |
04067002 UD |
1399 | sldi 0,6, 56 |
1400 | srdi 8,7, 64-56 | |
759cfef3 | 1401 | #endif |
04067002 UD |
1402 | addi 5,5,16 |
1403 | or 0,0,8 | |
1404 | bf 31,L(du7_loop) | |
1405 | mr 6,7 | |
1406 | ld 7,0(5) | |
1407 | addi 5,5,8 | |
1408 | std 0,0(4) | |
1409 | addi 4,4,8 | |
1410 | .align 4 | |
1411 | /* copy 32 bytes at a time */ | |
1412 | L(du7_loop): | |
759cfef3 AM |
1413 | #ifdef __LITTLE_ENDIAN__ |
1414 | srdi 0,6, 56 | |
1415 | sldi 8,7, 64-56 | |
1416 | #else | |
04067002 UD |
1417 | sldi 0,6, 56 |
1418 | srdi 8,7, 64-56 | |
759cfef3 | 1419 | #endif |
04067002 UD |
1420 | or 0,0,8 |
1421 | ld 6,0(5) | |
1422 | std 0,0(4) | |
759cfef3 AM |
1423 | #ifdef __LITTLE_ENDIAN__ |
1424 | srdi 0,7, 56 | |
1425 | sldi 8,6, 64-56 | |
1426 | #else | |
04067002 UD |
1427 | sldi 0,7, 56 |
1428 | srdi 8,6, 64-56 | |
759cfef3 | 1429 | #endif |
04067002 UD |
1430 | or 0,0,8 |
1431 | ld 7,8(5) | |
1432 | std 0,8(4) | |
759cfef3 AM |
1433 | #ifdef __LITTLE_ENDIAN__ |
1434 | srdi 0,6, 56 | |
1435 | sldi 8,7, 64-56 | |
1436 | #else | |
04067002 UD |
1437 | sldi 0,6, 56 |
1438 | srdi 8,7, 64-56 | |
759cfef3 | 1439 | #endif |
04067002 UD |
1440 | or 0,0,8 |
1441 | ld 6,16(5) | |
1442 | std 0,16(4) | |
759cfef3 AM |
1443 | #ifdef __LITTLE_ENDIAN__ |
1444 | srdi 0,7, 56 | |
1445 | sldi 8,6, 64-56 | |
1446 | #else | |
04067002 UD |
1447 | sldi 0,7, 56 |
1448 | srdi 8,6, 64-56 | |
759cfef3 | 1449 | #endif |
04067002 UD |
1450 | or 0,0,8 |
1451 | ld 7,24(5) | |
1452 | std 0,24(4) | |
1453 | addi 5,5,32 | |
1454 | addi 4,4,32 | |
1455 | bdnz+ L(du7_loop) | |
1456 | .align 4 | |
1457 | L(du7_fini): | |
1458 | /* calculate and store the final DW */ | |
759cfef3 AM |
1459 | #ifdef __LITTLE_ENDIAN__ |
1460 | srdi 0,6, 56 | |
1461 | sldi 8,7, 64-56 | |
1462 | #else | |
04067002 UD |
1463 | sldi 0,6, 56 |
1464 | srdi 8,7, 64-56 | |
759cfef3 | 1465 | #endif |
9c84384c | 1466 | or 0,0,8 |
04067002 UD |
1467 | std 0,0(4) |
1468 | b L(du_done) | |
9c84384c | 1469 | |
04067002 UD |
1470 | .align 4 |
1471 | L(du_done): | |
1472 | rldicr 0,31,0,60 | |
1473 | mtcrf 0x01,31 | |
1474 | beq cr1,0f /* If the tail is 0 bytes we are done! */ | |
1475 | ||
1476 | add 3,3,0 | |
9c84384c | 1477 | add 12,12,0 |
04067002 | 1478 | /* At this point we have a tail of 0-7 bytes and we know that the |
2ccdea26 | 1479 | destination is double word aligned. */ |
04067002 UD |
1480 | 4: bf 29,2f |
1481 | lwz 6,0(12) | |
1482 | addi 12,12,4 | |
1483 | stw 6,0(3) | |
1484 | addi 3,3,4 | |
1485 | 2: bf 30,1f | |
1486 | lhz 6,0(12) | |
1487 | addi 12,12,2 | |
1488 | sth 6,0(3) | |
1489 | addi 3,3,2 | |
1490 | 1: bf 31,0f | |
1491 | lbz 6,0(12) | |
1492 | stb 6,0(3) | |
1493 | 0: | |
1494 | /* Return original dst pointer. */ | |
1495 | ld 31,-8(1) | |
1496 | ld 3,-16(1) | |
1497 | blr | |
72fd128a | 1498 | END_GEN_TB (MEMCPY,TB_TOCLESS) |
04067002 | 1499 | libc_hidden_builtin_def (memcpy) |