]>
Commit | Line | Data |
---|---|---|
04067002 | 1 | /* Optimized memcpy implementation for PowerPC64. |
568035b7 | 2 | Copyright (C) 2003-2013 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 UD |
20 | |
21 | /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); | |
22 | Returns 'dst'. | |
23 | ||
9c84384c JM |
24 | Memcpy handles short copies (< 32-bytes) using a binary move blocks |
25 | (no loops) of lwz/stw. The tail (remaining 1-3) bytes is handled | |
26 | with the appropriate combination of byte and halfword load/stores. | |
27 | There is minimal effort to optimize the alignment of short moves. | |
04067002 | 28 | The 64-bit implementations of POWER3 and POWER4 do a reasonable job |
2ccdea26 | 29 | of handling unaligned load/stores that do not cross 32-byte boundaries. |
04067002 UD |
30 | |
31 | Longer moves (>= 32-bytes) justify the effort to get at least the | |
32 | destination doubleword (8-byte) aligned. Further optimization is | |
2ccdea26 | 33 | possible when both source and destination are doubleword aligned. |
04067002 UD |
34 | Each case has a optimized unrolled loop. */ |
35 | ||
/* void *memcpy (void *dst [r3], const void *src [r4], size_t len [r5])
   ABI:  PowerPC64 ELF.  Returns the original dst in r3.

   Register roles (reviewer notes, grounded in the code below):
     r3  = current dst pointer (original dst is spilled to -16(r1) and
           reloaded before every blr so the return value is preserved)
     r4/r12 = src (r12 is the working src copy on the long-move path)
     r5  = len; r31 = remaining length after dst alignment (callee-saved,
           spilled to -8(r1) with a cfi_offset note)
     r0  = bytes needed to doubleword-align dst; later the tail offset
     r9  = full doublewords remaining; r10/r11 = alignment scratch
     cr1/cr6 = length/alignment comparisons; CR bits 28-31 (via mtcrf 0x01)
           drive the binary move trees.  */
	.machine power4
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	andi.	11,3,7		/* check alignment of dst.  */
	clrldi	0,0,61		/* Number of bytes until the 1st doubleword of dst.  */
	clrldi	10,4,61		/* check alignment of src.  */
	cmpldi	cr6,5,8
	ble-	cr1,.L2		/* If move < 32 bytes use short move code.  */
	cmpld	cr6,10,11	/* cr6 = (src align == dst align)?  */
	mr	12,4
	srdi	9,5,3		/* Number of full double words remaining.  */
	mtcrf	0x01,0
	mr	31,5
	beq	.L0		/* dst already doubleword aligned.  */

	subf	31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.
     CR bits 29-31 hold the low 3 bits of the alignment gap (r0).  */
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61	/* check alignment of src again.  */
	srdi	9,31,3		/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not branch to the unaligned move code at .L6. which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
	clrldi	11,31,61
	mtcrf	0x01,9
	cmpldi	cr1,11,0
	bne-	cr6,.L6		/* If source is not DW aligned.  */

/* Move doublewords where destination and source are DW aligned.
   Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
   If the copy is not an exact multiple of 32 bytes, 1-3
   doublewords are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-7 bytes.  These bytes are
   copied a word/halfword/byte at a time as needed to preserve alignment.  */

	srdi	8,31,5		/* r8 = 32-byte iterations for the main loop.  */
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	bf	30,1f		/* CR bit 30: DW count bit 1 — pre-copy 2 DWs?  */
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f		/* CR bit 31: DW count bit 0 — pre-copy a 3rd DW?  */
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f		/* fewer than 4 DWs total: skip main loop.  */
	addi	11,12,24
	addi	10,3,24
	b	4f
	.align	4
1:
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main loop: 4 aligned doublewords (32 bytes) per iteration.  */
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32
2:
	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	rldicr	0,31,0,60	/* r0 = remaining length rounded down to 8.  */
	mtcrf	0x01,31
	beq	cr6,0f		/* no tail bytes: go straight to return.  */
.L9:
	add	3,3,0
	add	12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  */
4:	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
  /* Return original dst pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	blr

/* Copy up to 31 bytes.  This divided into two cases 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 64-byte, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use double word load/stores to insure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */

	.align	4
.L2:
	mtcrf	0x01,5
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3		/* r0 = bytes to word-align src.  */
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi	cr1,5,16
	mr	10,5
	mr	12,4
	cmpldi	cr6,0,2
	beq	.L3		/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  The aligned word
   containing them is loaded whole (r11 = src rounded down to 4) and the
   wanted bytes are extracted, with endian-specific shuffles.  */
	lwz	6,0(11)
	subf	10,0,5
	add	12,4,0
	blt	cr6,5f		/* exactly 1 byte to move.  */
	srdi	7,6,16
	bgt	cr6,3f		/* 3 bytes to move.  */
	/* 2 bytes to move.  */
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:	/* 3 bytes: one byte then a halfword.  */
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:	/* 1 byte.  */
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmpldi	cr1,10,16
	add	3,3,0
	mtcrf	0x01,10
	.align	4
.L3:
/* At least 6 bytes left and the source is word aligned.  Binary move
   tree on the remaining length (CR bits 28-31 from mtcrf 0x01).  */
	blt	cr1,8f
16:	/* Move 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	6,8(12)
	stw	7,4(3)
	lwz	7,12(12)
	addi	12,12,16
	stw	6,8(3)
	stw	7,12(3)
	addi	3,3,16
8:	/* Move 8 bytes.  */
	bf	28,4f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Move 4 bytes.  */
	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Move 2-3 bytes.  */
	bf	30,1f
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	blr
1:	/* Move 1 byte.  */
	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
  /* Return original dst pointer.  */
	ld	3,-16(1)
	blr

/* Special case to copy 0-8 bytes.  */
	.align	4
.LE8:
	mr	12,4
	bne	cr6,4f		/* len != 8: go to the binary tree.  */
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned Load/Store word execute with only a 1 cycle penalty.  */
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
/* Return original dst pointer.  */
	ld	3,-16(1)
	blr
	.align	4
4:	bf	29,2b		/* < 4 bytes: reuse the 2-3/1-byte tree above.  */
	lwz	6,0(4)
	stw	6,0(3)
6:
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	blr
	.align	4
5:
	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	.align	4
0:
  /* Return original dst pointer.  */
	ld	3,-16(1)
	blr

	.align	4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     r5 = aligned src base, r10 = shift count (src misalignment * 8),
     r9 = 64 - shift, r6/r7 = doubleword pipeline.  The LE/BE variants
     swap shift directions because the byte order within each DW differs.  */
	addi	11,9,-1		/* loop DW count is one less than total */
	subf	5,10,12
	sldi	10,10,3
	mr	4,3
	srdi	8,11,2		/* calculate the 32 byte loop count */
	ld	6,0(5)
	mtcrf	0x01,11
	cmpldi	cr6,9,4
	mtctr	8
	ld	7,8(5)
	subfic	9,10,64
	bf	30,1f

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srd	0,7,10
	sld	8,6,9
#else
	sld	0,7,10
	srd	8,6,9
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,8(4)
	addi	4,4,16
	addi	5,5,32
	blt	cr6,8f		/* if total DWs = 3, then bypass loop */
	bf	31,4f
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	or	0,0,8
	std	0,0(4)
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	addi	4,4,8
	beq	cr6,8f		/* if total DWs = 4, then bypass loop */
	b	4f
	.align	4
1:
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	addi	5,5,16
	or	0,0,8
	bf	31,4f
	mr	6,7
	ld	7,0(5)
	addi	5,5,8
	std	0,0(4)
	addi	4,4,8
	.align	4
/* copy 32 bytes at a time */
4:
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	or	0,0,8
	ld	6,0(5)
	std	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srd	0,7,10
	sld	8,6,9
#else
	sld	0,7,10
	srd	8,6,9
#endif
	or	0,0,8
	ld	7,8(5)
	std	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	or	0,0,8
	ld	6,16(5)
	std	0,16(4)
#ifdef __LITTLE_ENDIAN__
	srd	0,7,10
	sld	8,6,9
#else
	sld	0,7,10
	srd	8,6,9
#endif
	or	0,0,8
	ld	7,24(5)
	std	0,24(4)
	addi	5,5,32
	addi	4,4,32
	bdnz+	4b
	.align	4
8:
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srd	0,6,10
	sld	8,7,9
#else
	sld	0,6,10
	srd	8,7,9
#endif
	or	0,0,8
	std	0,0(4)
3:
	rldicr	0,31,0,60	/* r0 = remaining length rounded down to 8.  */
	mtcrf	0x01,31
	bne	cr1,.L9		/* If the tail is 0 bytes we are done!  */
  /* Return original dst pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	blr
END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)