/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.  */

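  /* As a rough C sketch, the overall strategy looks like this
     (illustrative only; short_copy, align_dst_to_word, aligned_word_loop
     and shifted_word_loop are hypothetical names for the .L2, prologue,
     .L0 and .L6 paths below, and the real code keeps everything in
     registers):

       void *memcpy (void *dst, const void *src, size_t len)
       {
         void *ret = dst;                          // saved in r30
         if (len < 32)
           short_copy (dst, src, len);             // .L2: no loops
         else
           {
             align_dst_to_word (&dst, &src, &len); // move 0-3 bytes
             if (((uintptr_t) src & 3) == 0)
               aligned_word_loop (dst, src, len);  // 4 words/iteration
             else
               shifted_word_loop (dst, src, len);  // .L6: load/shift/store
           }
         return ret;
       }
  */
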
        .machine power4
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        stwu  1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw   30,20(1)
        cfi_offset(30,(20-32))
        mr    30,3
        cmplwi cr1,5,31
        stw   31,24(1)
        cfi_offset(31,(24-32))
        neg   0,3
        andi. 11,3,3            /* check alignment of dst.  */
        clrlwi 0,0,30           /* Number of bytes until the 1st word of dst.  */
        clrlwi 10,4,30          /* check alignment of src.  */
        cmplwi cr6,5,8
        ble-  cr1,.L2           /* If move < 32 bytes use short move code.  */
        cmplw cr6,10,11
        mr    12,4
        srwi  9,5,2             /* Number of full words remaining.  */
        mtcrf 0x01,0
        mr    31,5
        beq   .L0

        subf  31,0,5
  /* Move 0-3 bytes as needed to get the destination word aligned.  */
1:      bf    31,2f
        lbz   6,0(12)
        addi  12,12,1
        stb   6,0(3)
        addi  3,3,1
2:      bf    30,0f
        lhz   6,0(12)
        addi  12,12,2
        sth   6,0(3)
        addi  3,3,2
0:
        clrlwi 10,12,30         /* check alignment of src again.  */
        srwi  9,31,2            /* Number of full words remaining.  */
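
  /* A C sketch of the prologue above (illustrative only).  mtcrf 0x01,0
     loads the low bits of the byte count into CR7, so "bf 31" tests the
     1-byte bit and "bf 30" the 2-byte bit:

       unsigned char *d = dst;
       const unsigned char *s = src;
       size_t pre = (-(uintptr_t) d) & 3;  // bytes until d is word aligned
       if (pre & 1)                        // CR bit 31: move one byte
         *d++ = *s++;
       if (pre & 2)                        // CR bit 30: move a halfword
         { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       len -= pre;
  */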

  /* Copy words from source to destination, assuming the destination is
     aligned on a word boundary.

     At this point we know there are at least 29 bytes left (32-3) to copy.
     The next step is to determine if the source is also word aligned.
     If not, branch to the unaligned move code at .L6, which uses a
     load, shift, store strategy.

     Otherwise source and destination are word aligned, and we can use
     the optimized word copy loop.  */
.L0:
        clrlwi 11,31,30         /* calculate the number of tail bytes */
        mtcrf 0x01,9
        bne-  cr6,.L6           /* If source is not word aligned.  */

  /* Move words where destination and source are word aligned.
     Use an unrolled loop to copy 4 words (16 bytes) per iteration.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes.  These bytes
     are copied a halfword/byte at a time as needed to preserve
     alignment.  */

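  /* A C sketch of the aligned main loop (illustrative only; the assembly
     first copies 1-3 words so the remaining count is a multiple of four
     words, then runs the unrolled loop under the CTR register):

       unsigned int *d = dst;              // both word aligned here
       const unsigned int *s = src;
       size_t n = len / 16;                // 16 bytes per iteration
       while (n--)
         {
           d[0] = s[0];
           d[1] = s[1];
           d[2] = s[2];
           d[3] = s[3];
           d += 4;
           s += 4;
         }
       // a 0-3 byte tail is then moved by halfword/byte stores
  */
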
        srwi  8,31,4            /* calculate the 16 byte loop count */
        cmplwi cr1,9,4
        cmplwi cr6,11,0
        mr    11,12

        bf    30,1f
        lwz   6,0(12)
        lwz   7,4(12)
        addi  11,12,8
        mtctr 8
        stw   6,0(3)
        stw   7,4(3)
        addi  10,3,8
        bf    31,4f
        lwz   0,8(12)
        stw   0,8(3)
        blt   cr1,3f
        addi  11,12,12
        addi  10,3,12
        b     4f
        .align 4
1:
        mr    10,3
        mtctr 8
        bf    31,4f
        lwz   6,0(12)
        addi  11,12,4
        stw   6,0(3)
        addi  10,3,4

        .align 4
4:
        lwz   6,0(11)
        lwz   7,4(11)
        lwz   8,8(11)
        lwz   0,12(11)
        stw   6,0(10)
        stw   7,4(10)
        stw   8,8(10)
        stw   0,12(10)
        addi  11,11,16
        addi  10,10,16
        bdnz  4b
3:
        clrrwi 0,31,2
        mtcrf 0x01,31
        beq   cr6,0f
.L9:
        add   3,3,0
        add   12,12,0

  /* At this point we have a tail of 0-3 bytes and we know that the
     destination is word aligned.  */
2:      bf    30,1f
        lhz   6,0(12)
        addi  12,12,2
        sth   6,0(3)
        addi  3,3,2
1:      bf    31,0f
        lbz   6,0(12)
        stb   6,0(3)
0:
  /* Return original dst pointer.  */
        mr    3,30
        lwz   30,20(1)
        lwz   31,24(1)
        addi  1,1,32
        blr

  /* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
     9-31 bytes.  Each case is handled without loops, using binary
     (1,2,4,8) tests.

     In the short (0-8 byte) case no attempt is made to force alignment
     of either source or destination.  The hardware will handle the
     unaligned load/stores with small delays for crossing 32-, 64-, and
     4096-byte boundaries.  Since these short moves are unlikely to be
     unaligned or cross these boundaries, the overhead to force
     alignment is not justified.

     The longer (9-31 byte) move is more likely to cross 32- or 64-byte
     boundaries.  Since only loads are sensitive to the 32-/64-byte
     boundaries it is more important to align the source than the
     destination.  If the source is not already word aligned, we first
     move 1-3 bytes as needed.  While the destination and stores may
     still be unaligned, this is only an issue for page (4096-byte
     boundary) crossing, which should be rare for these short moves.
     The hardware handles this case automatically with a small delay.  */

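  /* A C sketch of the 9-31 byte case (illustrative only; the assembly
     tests the length bits via CR bits 28-31 loaded with mtcrf instead
     of masking, and the copy_* names are hypothetical stand-ins for the
     straight-line move sequences at .L3 below):

       // the source has already been word aligned by moving 0-3 bytes
       if (len >= 16) copy_16_bytes ();    // cr1 test
       if (len & 8)   copy_8_bytes ();     // CR bit 28
       if (len & 4)   copy_4_bytes ();     // CR bit 29
       if (len & 2)   copy_2_bytes ();     // CR bit 30
       if (len & 1)   copy_1_byte ();      // CR bit 31
  */
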
        .align 4
.L2:
        mtcrf 0x01,5
        neg   8,4
        clrrwi 11,4,2
        andi. 0,8,3
        ble   cr6,.LE8          /* Handle moves of 0-8 bytes.  */
  /* At least 9 bytes left.  Get the source word aligned.  */
        cmplwi cr1,5,16
        mr    10,5
        mr    12,4
        cmplwi cr6,0,2
        beq   .L3       /* If the source is already word aligned skip this.  */
  /* Copy 1-3 bytes to get source address word aligned.  */
        lwz   6,0(11)
        subf  10,0,5
        add   12,4,0
        blt   cr6,5f
        srwi  7,6,16
        bgt   cr6,3f
#ifdef __LITTLE_ENDIAN__
        sth   7,0(3)
#else
        sth   6,0(3)
#endif
        b     7f
        .align 4
3:
#ifdef __LITTLE_ENDIAN__
        rotlwi 6,6,24
        stb   6,0(3)
        sth   7,1(3)
#else
        stb   7,0(3)
        sth   6,1(3)
#endif
        b     7f
        .align 4
5:
#ifdef __LITTLE_ENDIAN__
        rotlwi 6,6,8
#endif
        stb   6,0(3)
7:
        cmplwi cr1,10,16
        add   3,3,0
        mtcrf 0x01,10
        .align 4
.L3:
  /* At least 6 bytes left and the source is word aligned.  */
        blt   cr1,8f
16:     /* Move 16 bytes.  */
        lwz   6,0(12)
        lwz   7,4(12)
        stw   6,0(3)
        lwz   6,8(12)
        stw   7,4(3)
        lwz   7,12(12)
        addi  12,12,16
        stw   6,8(3)
        stw   7,12(3)
        addi  3,3,16
8:      /* Move 8 bytes.  */
        bf    28,4f
        lwz   6,0(12)
        lwz   7,4(12)
        addi  12,12,8
        stw   6,0(3)
        stw   7,4(3)
        addi  3,3,8
4:      /* Move 4 bytes.  */
        bf    29,2f
        lwz   6,0(12)
        addi  12,12,4
        stw   6,0(3)
        addi  3,3,4
2:      /* Move 2-3 bytes.  */
        bf    30,1f
        lhz   6,0(12)
        sth   6,0(3)
        bf    31,0f
        lbz   7,2(12)
        stb   7,2(3)
        mr    3,30
        lwz   30,20(1)
        addi  1,1,32
        blr
1:      /* Move 1 byte.  */
        bf    31,0f
        lbz   6,0(12)
        stb   6,0(3)
0:
  /* Return original dst pointer.  */
        mr    3,30
        lwz   30,20(1)
        addi  1,1,32
        blr

  /* Special case to copy 0-8 bytes.  */
        .align 4
.LE8:
        mr    12,4
        bne   cr6,4f
        lwz   6,0(4)
        lwz   7,4(4)
        stw   6,0(3)
        stw   7,4(3)
  /* Return original dst pointer.  */
        mr    3,30
        lwz   30,20(1)
        addi  1,1,32
        blr
        .align 4
4:      bf    29,2b
        lwz   6,0(4)
        stw   6,0(3)
6:
        bf    30,5f
        lhz   7,4(4)
        sth   7,4(3)
        bf    31,0f
        lbz   8,6(4)
        stb   8,6(3)
        mr    3,30
        lwz   30,20(1)
        addi  1,1,32
        blr
        .align 4
5:
        bf    31,0f
        lbz   6,4(4)
        stb   6,4(3)
        .align 4
0:
  /* Return original dst pointer.  */
        mr    3,30
        lwz   30,20(1)
        addi  1,1,32
        blr

        .align 4
.L6:

  /* Copy words where the destination is aligned but the source is
     not.  Use aligned word loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     Use an unrolled loop to copy 4 words (16 bytes) per iteration.
     A single word is retained for storing at loop exit to avoid walking
     off the end of a page within the loop.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes.  These bytes
     are copied a halfword/byte at a time as needed to preserve
     alignment.  */

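  /* A C sketch of the shift-and-merge technique (illustrative only;
     shown for big-endian; little-endian swaps the shift directions,
     which is what the __LITTLE_ENDIAN__ blocks below do.  n, d, s and
     off are stand-ins for the register assignments below; off is
     nonzero because .L6 is only reached for an unaligned source):

       unsigned int *d;               // word-aligned destination
       const unsigned int *s;         // source rounded down to a word
       unsigned int off;              // (src & 3) * 8 bits
       size_t n;                      // number of whole words to store
       unsigned int w0 = s[0], w1 = s[1];
       s += 2;
       while (--n)                    // the last word is stored after
         {                            // the loop, so the loop never
           *d++ = (w0 << off)         // loads past the end of the source
                  | (w1 >> (32 - off));
           w0 = w1;
           w1 = *s++;
         }
       *d = (w0 << off) | (w1 >> (32 - off));
  */
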
        cmplwi cr6,11,0         /* are there tail bytes left ? */
        subf  5,10,12           /* back up src pointer to prev word alignment */
        slwi  10,10,3           /* calculate number of bits to shift 1st word left */
        addi  11,9,-1           /* we move one word after the loop */
        srwi  8,11,2            /* calculate the 16 byte loop count */
        lwz   6,0(5)            /* load 1st src word into R6 */
        mr    4,3
        lwz   7,4(5)            /* load 2nd src word into R7 */
        mtcrf 0x01,11
        subfic 9,10,32          /* number of bits to shift 2nd word right */
        mtctr 8
        bf    30,1f

        /* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10            /* shift 1st src word right to align it in R0 */
        slw   8,7,9             /* shift 2nd src word left to align it in R8 */
#else
        slw   0,6,10            /* shift 1st src word to left align it in R0 */
        srw   8,7,9             /* shift 2nd src word to right align it in R8 */
#endif
        or    0,0,8             /* or them to get word to store */
        lwz   6,8(5)            /* load the 3rd src word */
        stw   0,0(4)            /* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
        srw   0,7,10
        slw   8,6,9
#else
        slw   0,7,10            /* now left align 2nd src word into R0 */
        srw   8,6,9             /* shift 3rd src word to right align it in R8 */
#endif
        or    0,0,8             /* or them to get word to store */
        lwz   7,12(5)
        stw   0,4(4)            /* store the 2nd dst word */
        addi  4,4,8
        addi  5,5,16
        bf    31,4f
        /* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10
        slw   8,7,9
#else
        slw   0,6,10            /* shift 3rd src word to left align it in R0 */
        srw   8,7,9             /* shift 4th src word to right align it in R8 */
#endif
        or    0,0,8             /* or them to get word to store */
        stw   0,0(4)            /* store 3rd dst word */
        mr    6,7
        lwz   7,0(5)
        addi  5,5,4
        addi  4,4,4
        b     4f
        .align 4
1:
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10
        slw   8,7,9
#else
        slw   0,6,10            /* shift 1st src word to left align it in R0 */
        srw   8,7,9             /* shift 2nd src word to right align it in R8 */
#endif
        addi  5,5,8
        or    0,0,8             /* or them to get word to store */
        bf    31,4f
        mr    6,7
        lwz   7,0(5)
        addi  5,5,4
        stw   0,0(4)            /* store the 1st dst word */
        addi  4,4,4

        .align 4
4:
        /* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10
        slw   8,7,9
#else
        slw   0,6,10
        srw   8,7,9
#endif
        or    0,0,8
        lwz   6,0(5)
        stw   0,0(4)
#ifdef __LITTLE_ENDIAN__
        srw   0,7,10
        slw   8,6,9
#else
        slw   0,7,10
        srw   8,6,9
#endif
        or    0,0,8
        lwz   7,4(5)
        stw   0,4(4)
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10
        slw   8,7,9
#else
        slw   0,6,10
        srw   8,7,9
#endif
        or    0,0,8
        lwz   6,8(5)
        stw   0,8(4)
#ifdef __LITTLE_ENDIAN__
        srw   0,7,10
        slw   8,6,9
#else
        slw   0,7,10
        srw   8,6,9
#endif
        or    0,0,8
        lwz   7,12(5)
        stw   0,12(4)
        addi  5,5,16
        addi  4,4,16
        bdnz+ 4b
8:
        /* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
        srw   0,6,10
        slw   8,7,9
#else
        slw   0,6,10
        srw   8,7,9
#endif
        or    0,0,8
        stw   0,0(4)
3:
        clrrwi 0,31,2
        mtcrf 0x01,31
        bne   cr6,.L9           /* If the tail is 0 bytes we are done!  */

  /* Return original dst pointer.  */
        mr    3,30
        lwz   30,20(1)
        lwz   31,24(1)
        addi  1,1,32
        blr
END (memcpy)

libc_hidden_builtin_def (memcpy)