/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination word (4 bytes) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.  */

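/* A rough C model of the dispatch implemented below (an illustrative
   sketch, not part of the build; short_move, word_loop and shift_loop
   are hypothetical helpers standing in for the labeled code paths):

     void *my_memcpy (void *dst, const void *src, size_t len)
     {
       char *d = dst;
       const char *s = src;
       if (len < 32)
         return short_move (d, s, len);      // the .L2 path
       while (((uintptr_t) d & 3) && len)    // 0-3 byte prologue
         { *d++ = *s++; len--; }
       if (((uintptr_t) s & 3) == 0)
         word_loop (d, s, len);              // aligned word copy loop
       else
         shift_loop (d, s, len);             // .L6 load/shift/store
       return dst;
     }  */
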
	.machine power4
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	mr	30,3
	cmplwi	cr1,5,31
	stw	31,24(1)
	cfi_offset(31,(24-32))
	neg	0,3
	andi.	11,3,3	/* check alignment of dst.  */
	clrlwi	0,0,30	/* Number of bytes until the 1st word of dst.  */
	clrlwi	10,4,30	/* check alignment of src.  */
	cmplwi	cr6,5,8
	ble-	cr1,.L2	/* If move < 32 bytes use short move code.  */
	cmplw	cr6,10,11
	mr	12,4
	srwi	9,5,2	/* Number of full words remaining.  */
	mtcrf	0x01,0
	mr	31,5
	beq	.L0

	subf	31,0,5
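/* The low two bits of r0 (bytes needed to align dst) were moved into
   CR7 by the mtcrf above; bf 31 tests the 1-byte bit and bf 30 the
   2-byte bit.  Illustration: dst = 0x1003 gives r0 = 1, so one byte
   is copied and the halfword move is skipped.  */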
/* Move 0-3 bytes as needed to get the destination word aligned.  */
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,0f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
0:
	clrlwi	10,12,30	/* check alignment of src again.  */
	srwi	9,31,2	/* Number of full words remaining.  */

/* Copy words from source to destination, assuming the destination is
   aligned on a word boundary.

   At this point we know there are at least 25 bytes left (32-7) to copy.
   The next step is to determine if the source is also word aligned.
   If not, branch to the unaligned move code at .L6, which uses
   a load, shift, store strategy.

   Otherwise source and destination are word aligned, and we can use
   the optimized word copy loop.  */
.L0:
	clrlwi	11,31,30	/* calculate the number of tail bytes */
	mtcrf	0x01,9
	bne-	cr6,.L6	/* If source is not word aligned.  */

/* Move words where destination and source are word aligned.
   Use an unrolled loop to copy 4 words (16 bytes) per iteration.
   If the copy is not an exact multiple of 16 bytes, 1-3
   words are copied as needed to set up the main loop.  After
   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
   copied a halfword/byte at a time as needed to preserve alignment.  */

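/* A C sketch of the aligned 4-words-per-iteration loop that follows
   (illustrative only):

     uint32_t *d = ..., *s = ...;
     size_t loops = bytes >> 4;          // srwi 8,31,4 below
     // peel 1-3 leading words so the rest is a multiple of 16 bytes
     while (loops--)                     // the "4:" loop below
       {
         d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
         d += 4; s += 4;
       }  */
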
	srwi	8,31,4	/* calculate the 16 byte loop count */
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	bf	30,1f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	11,12,8
	mtctr	8
	stw	6,0(3)
	stw	7,4(3)
	addi	10,3,8
	bf	31,4f
	lwz	0,8(12)
	stw	0,8(3)
	blt	cr1,3f
	addi	11,12,12
	addi	10,3,12
	b	4f
	.align	4
1:
	mr	10,3
	mtctr	8
	bf	31,4f
	lwz	6,0(12)
	addi	11,12,4
	stw	6,0(3)
	addi	10,3,4

	.align	4
4:
	lwz	6,0(11)
	lwz	7,4(11)
	lwz	8,8(11)
	lwz	0,12(11)
	stw	6,0(10)
	stw	7,4(10)
	stw	8,8(10)
	stw	0,12(10)
	addi	11,11,16
	addi	10,10,16
	bdnz	4b
3:
	clrrwi	0,31,2
	mtcrf	0x01,31
	beq	cr6,0f
.L9:
	add	3,3,0
	add	12,12,0

/* At this point we have a tail of 0-3 bytes and we know that the
   destination is word aligned.  */
2:	bf	30,1f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  While the destination and stores may
   still be unaligned, this is only an issue for page (4096-byte
   boundary) crossing, which should be rare for these short moves.
   The hardware handles this case automatically with a small delay.  */

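/* A rough C sketch of the binary (1,2,4,8) decomposition used on the
   9-31 byte path (illustrative only; copy16 ... copy1 are hypothetical
   fixed-size helpers):

     if (len >= 16) { copy16 (d, s); d += 16; s += 16; }  // blt cr1,8f
     if (len & 8)   { copy8 (d, s);  d += 8;  s += 8;  }  // bf 28,4f
     if (len & 4)   { copy4 (d, s);  d += 4;  s += 4;  }  // bf 29,2f
     if (len & 2)   { copy2 (d, s);  d += 2;  s += 2;  }  // bf 30,1f
     if (len & 1)   copy1 (d, s);                         // bf 31,0f

   The bf tests read bits 28-31 of CR, loaded from the length by
   mtcrf 0x01,5 (or 0x01,10 once the source has been aligned).  */
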
	.align	4
.L2:
	mtcrf	0x01,5
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	ble	cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmplwi	cr1,5,16
	mr	10,5
	mr	12,4
	cmplwi	cr6,0,2
	beq	.L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
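/* r6 is loaded from the word-aligned address at or below src, so the
   1-3 bytes to copy are its trailing bytes.  On big-endian they occupy
   the low-order end of r6 and are stored directly (plus the srwi below
   for the 3-byte case); the __LITTLE_ENDIAN__ variants rotate them
   into place first.  */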
	lwz	6,0(11)
	subf	10,0,5
	add	12,4,0
	blt	cr6,5f
	srwi	7,6,16
	bgt	cr6,3f
#ifdef __LITTLE_ENDIAN__
	sth	7,0(3)
#else
	sth	6,0(3)
#endif
	b	7f
	.align	4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,24
	stb	6,0(3)
	sth	7,1(3)
#else
	stb	7,0(3)
	sth	6,1(3)
#endif
	b	7f
	.align	4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi	6,6,8
#endif
	stb	6,0(3)
7:
	cmplwi	cr1,10,16
	add	3,3,0
	mtcrf	0x01,10
	.align	4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
	blt	cr1,8f
16:	/* Move 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	6,8(12)
	stw	7,4(3)
	lwz	7,12(12)
	addi	12,12,16
	stw	6,8(3)
	stw	7,12(3)
	addi	3,3,16
8:	/* Move 8 bytes.  */
	bf	28,4f
	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Move 4 bytes.  */
	bf	29,2f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Move 2-3 bytes.  */
	bf	30,1f
	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
1:	/* Move 1 byte.  */
	bf	31,0f
	lbz	6,0(12)
	stb	6,0(3)
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

/* Special case to copy 0-8 bytes.  */
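/* cr6 still holds the "cmplwi cr6,5,8" result from the prologue:
   falling through the bne below means len == 8 exactly, so two word
   copies suffice with no further tests.  */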
	.align	4
.LE8:
	mr	12,4
	bne	cr6,4f
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
	.align	4
4:	bf	29,2b
	lwz	6,0(4)
	stw	6,0(3)
6:
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr
	.align	4
5:
	bf	31,0f
	lbz	6,4(4)
	stb	6,4(3)
	.align	4
0:
	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
.L6:

	/* Copy words where the destination is aligned but the source is
	   not.  Use aligned word loads from the source, shifted to realign
	   the data, to allow aligned destination stores.
	   Use an unrolled loop to copy 4 words (16 bytes) per iteration.
	   A single word is retained for storing at loop exit to avoid walking
	   off the end of a page within the loop.
	   If the copy is not an exact multiple of 16 bytes, 1-3
	   words are copied as needed to set up the main loop.  After
	   the main loop exits there may be a tail of 1-3 bytes.  These bytes are
	   copied a halfword/byte at a time as needed to preserve alignment.  */

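	/* Worked example (big-endian, source misaligned by 1 byte, so
	   r10 = 8 and r9 = 24): if the two aligned source words are
	   0xAABBCCDD and 0xEEFF0011, the four bytes to store are
	   BB CC DD EE.  slw 0,6,10 gives 0xBBCCDD00, srw 8,7,9 gives
	   0x000000EE, and or 0,0,8 produces 0xBBCCDDEE, the realigned
	   destination word.  On little-endian the byte order within
	   registers is reversed, so the shift directions are swapped
	   (srw then slw).  */
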
	cmplwi	cr6,11,0	/* are there tail bytes left ? */
	subf	5,10,12	/* back up src pointer to prev word alignment */
	slwi	10,10,3	/* calculate number of bits to shift 1st word left */
	addi	11,9,-1	/* we move one word after the loop */
	srwi	8,11,2	/* calculate the 16 byte loop count */
	lwz	6,0(5)	/* load 1st src word into R6 */
	mr	4,3
	lwz	7,4(5)	/* load 2nd src word into R7 */
	mtcrf	0x01,11
	subfic	9,10,32	/* number of bits to shift 2nd word right */
	mtctr	8
	bf	30,1f

	/* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 1st src word to left align it in R0 */
	srw	8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	lwz	6,8(5)	/* load the 3rd src word */
	stw	0,0(4)	/* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10	/* now left align 2nd src word into R0 */
	srw	8,6,9	/* shift 3rd src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	lwz	7,12(5)
	stw	0,4(4)	/* store the 2nd dst word */
	addi	4,4,8
	addi	5,5,16
	bf	31,4f
	/* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 3rd src word to left align it in R0 */
	srw	8,7,9	/* shift 4th src word to right align it in R8 */
#endif
	or	0,0,8	/* or them to get word to store */
	stw	0,0(4)	/* store 3rd dst word */
	mr	6,7
	lwz	7,0(5)
	addi	5,5,4
	addi	4,4,4
	b	4f
	.align	4
1:
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10	/* shift 1st src word to left align it in R0 */
	srw	8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	addi	5,5,8
	or	0,0,8	/* or them to get word to store */
	bf	31,4f
	mr	6,7
	lwz	7,0(5)
	addi	5,5,4
	stw	0,0(4)	/* store the 1st dst word */
	addi	4,4,4

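	/* The main loop keeps two source words in flight in r6 and r7,
	   alternating their roles, so each destination word is formed
	   from one word loaded in the previous step and one fresh
	   load.  */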
	.align	4
4:
	/* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	lwz	6,0(5)
	stw	0,0(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10
	srw	8,6,9
#endif
	or	0,0,8
	lwz	7,4(5)
	stw	0,4(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	lwz	6,8(5)
	stw	0,8(4)
#ifdef __LITTLE_ENDIAN__
	srw	0,7,10
	slw	8,6,9
#else
	slw	0,7,10
	srw	8,6,9
#endif
	or	0,0,8
	lwz	7,12(5)
	stw	0,12(4)
	addi	5,5,16
	addi	4,4,16
	bdnz+	4b
8:
	/* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
	srw	0,6,10
	slw	8,7,9
#else
	slw	0,6,10
	srw	8,7,9
#endif
	or	0,0,8
	stw	0,0(4)
3:
	clrrwi	0,31,2
	mtcrf	0x01,31
	bne	cr6,.L9	/* If the tail is not 0 bytes, copy it at .L9.  */

	/* Return original dst pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr
END (memcpy)

libc_hidden_builtin_def (memcpy)