/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */
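
/* Overview of the dispatch below (informal sketch, not literal code):

     if (len < 32)
       copy with scalar loads/stores               -> L(copy_LT_32)
     else if ((src & 0xf) == (dst & 0xf))
       align to 8 bytes, then copy 32/64/128-byte
       blocks with VSX lxvd2x/stxvd2x              -> L(aligned_copy)
     else
       align DST to 16 bytes, then merge pairs of
       aligned quadwords with lvx/vperm/stvx       -> L(unaligned_loop)
     and finish any remaining tail with scalar code.  */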

        .machine power7
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

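        /* Prologue: create a 32-byte stack frame and save r30/r31.
           r30 keeps the original DST so it can be returned; r31 holds
           the remaining length on the >= 32-byte paths.  */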
        stwu 1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw 30,20(1)
        cfi_offset(30,(20-32))
        stw 31,24(1)
        mr 30,3
        cmplwi cr1,5,31
        neg 0,3
        cfi_offset(31,-8)
        ble cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
                                   code.  */

        andi. 11,3,15   /* Check alignment of DST.  */
        clrlwi 10,4,28  /* Check alignment of SRC.  */
        cmplw cr6,10,11 /* SRC and DST alignments match?  */
        mr 12,4
        mr 31,5
        bne cr6,L(copy_GE_32_unaligned)

        srwi 9,5,3      /* Number of full doublewords remaining.  */

        beq L(copy_GE_32_aligned_cont)

        clrlwi 0,0,29
        mtcrf 0x01,0
        subf 31,0,5

        /* Get the SRC aligned to 8 bytes.  */
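        /* The low three bits of the adjustment count are in CR7 (via
           the mtcrf above): bit 31 selects the 1-byte move, bit 30 the
           2-byte move and bit 29 the 4-byte move.  */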

1:      bf 31,2f
        lbz 6,0(12)
        addi 12,12,1
        stb 6,0(3)
        addi 3,3,1
2:      bf 30,4f
        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
4:      bf 29,0f
        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
0:
        clrlwi 10,12,29 /* Check alignment of SRC again.  */
        srwi 9,31,3     /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrlwi 11,31,29
        mtcrf 0x01,9

        srwi 8,31,5
        cmplwi cr1,9,4
        cmplwi cr6,11,0
        mr 11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

        bf 30,1f
        lfd 6,0(12)
        lfd 7,8(12)
        addi 11,12,16
        mtctr 8
        stfd 6,0(3)
        stfd 7,8(3)
        addi 10,3,16
        bf 31,4f
        lfd 0,16(12)
        stfd 0,16(3)
        blt cr1,3f
        addi 11,12,24
        addi 10,3,24
        b 4f

        .align 4
1:      /* Copy 1 doubleword and set the counter.  */
        mr 10,3
        mtctr 8
        bf 31,4f
        lfd 6,0(12)
        addi 11,12,8
        stfd 6,0(3)
        addi 10,3,8

L(aligned_copy):
        /* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
        .align 4
4:
        /* Check for any 32-byte or 64-byte lumps that are outside of a
           nice 128-byte range.  R8 contains the number of 32-byte
           lumps, so drop this into the CR, and use the SO/EQ bits to
           help handle the 32- or 64-byte lumps.  Then handle the rest
           with an unrolled 128-bytes-at-a-time copy loop.  */
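        /* Each lxvd2x/stxvd2x below moves 16 bytes, so a 32-byte lump
           is two vector load/store pairs and a 64-byte lump is four.
           r6/r7/r8 hold the 16/32/48 byte offsets used by the indexed
           forms of those instructions.  */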
        mtocrf 1,8
        li 6,16         # 16() index
        li 7,32         # 32() index
        li 8,48         # 48() index

L(aligned_32byte):
        /* If the SO bit (indicating a 32-byte lump) is not set, move along.  */
        bns cr7,L(aligned_64byte)
        lxvd2x 6,0,11
        lxvd2x 7,11,6
        addi 11,11,32
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        addi 10,10,32

L(aligned_64byte):
        /* If the EQ bit (indicating a 64-byte lump) is not set, move along.  */
        bne cr7,L(aligned_128setup)
        lxvd2x 6,0,11
        lxvd2x 7,11,6
        lxvd2x 8,11,7
        lxvd2x 9,11,8
        addi 11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi 10,10,64

L(aligned_128setup):
        /* Set up for the 128-byte at a time copy loop.  */
        srwi 8,31,7
        cmpwi 8,0       # Any 4x lumps left?
        beq 3f          # If not, move along.
        lxvd2x 6,0,11
        lxvd2x 7,11,6
        mtctr 8         # Otherwise, load the ctr and begin.
        li 8,48         # 48() index
        b L(aligned_128loop)

L(aligned_128head):
        /* For the second and subsequent iterations of this loop.  */
        lxvd2x 6,0,11
        lxvd2x 7,11,6
L(aligned_128loop):
        lxvd2x 8,11,7
        lxvd2x 9,11,8
        stxvd2x 6,0,10
        addi 11,11,64
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        lxvd2x 6,0,11
        lxvd2x 7,11,6
        addi 10,10,64
        lxvd2x 8,11,7
        lxvd2x 9,11,8
        addi 11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi 10,10,64
        bdnz L(aligned_128head)

3:
        /* Check for tail bytes.  */
        clrrwi 0,31,3
        mtcrf 0x01,31
        beq cr6,0f

.L9:
        add 3,3,0
        add 12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
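        /* The low three bits of the length were placed in CR7 by the
           mtcrf above, so bits 29/30/31 select the 4-, 2- and 1-byte
           tail moves below.  */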
4:      /* Copy 4 bytes.  */
        bf 29,2f

        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
2:      /* Copy 2 bytes.  */
        bf 30,1f

        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
1:      /* Copy 1 byte.  */
        bf 31,0f

        lbz 6,0(12)
        stb 6,0(3)
0:      /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        lwz 31,24(1)
        addi 1,1,32
        blr

        /* Handle copies of 0~31 bytes.  */
        .align 4
L(copy_LT_32):
        cmplwi cr6,5,8
        mr 12,4
        mtcrf 0x01,5
        ble cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg 8,4
        clrrwi 11,4,2
        andi. 0,8,3
        cmplwi cr1,5,16
        mr 10,5
        beq L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
        mtocrf 0x01,0
        subf 10,0,5
2:      bf 30,1f

        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
1:      bf 31,L(end_4bytes_alignment)

        lbz 6,0(12)
        addi 12,12,1
        stb 6,0(3)
        addi 3,3,1

        .align 4
L(end_4bytes_alignment):
        cmplwi cr1,10,16
        mtcrf 0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt cr1,8f

        /* Copy 16 bytes.  */
        lwz 6,0(12)
        lwz 7,4(12)
        stw 6,0(3)
        lwz 8,8(12)
        stw 7,4(3)
        lwz 6,12(12)
        addi 12,12,16
        stw 8,8(3)
        stw 6,12(3)
        addi 3,3,16
8:      /* Copy 8 bytes.  */
        bf 28,4f

        lwz 6,0(12)
        lwz 7,4(12)
        addi 12,12,8
        stw 6,0(3)
        stw 7,4(3)
        addi 3,3,8
4:      /* Copy 4 bytes.  */
        bf 29,2f

        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
2:      /* Copy 2-3 bytes.  */
        bf 30,1f

        lhz 6,0(12)
        sth 6,0(3)
        bf 31,0f
        lbz 7,2(12)
        stb 7,2(3)

        /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        addi 1,1,32
        blr

        .align 4
1:      /* Copy 1 byte.  */
        bf 31,0f

        lbz 6,0(12)
        stb 6,0(3)
0:      /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        addi 1,1,32
        blr

        /* Handles copies of 0~8 bytes.  */
        .align 4
L(copy_LE_8):
        bne cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */
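        /* We only fall through to here when len == 8 (the bne above
           handled len < 8), so copy it as two word load/store pairs.  */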

        lwz 6,0(4)
        lwz 7,4(4)
        stw 6,0(3)
        stw 7,4(3)

        /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        addi 1,1,32
        blr

        .align 4
4:      /* Copies 4~7 bytes.  */
        bf 29,2b

        lwz 6,0(4)
        stw 6,0(3)
        bf 30,5f
        lhz 7,4(4)
        sth 7,4(3)
        bf 31,0f
        lbz 8,6(4)
        stb 8,6(3)

        /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        addi 1,1,32
        blr

        .align 4
5:      /* Copy 1 byte.  */
        bf 31,0f

        lbz 6,4(4)
        stb 6,4(3)

0:      /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        addi 1,1,32
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
        .align 4
L(copy_GE_32_unaligned):
        andi. 11,3,15   /* Check alignment of DST.  */
        clrlwi 0,0,28   /* Number of bytes until the 1st
                           quadword of DST.  */
        srwi 9,5,4      /* Number of full quadwords remaining.  */

        beq L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf 0x01,0
        subf 31,0,5

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf 31,2f

        lbz 6,0(12)
        addi 12,12,1
        stb 6,0(3)
        addi 3,3,1
2:      /* Copy 2 bytes.  */
        bf 30,4f

        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
4:      /* Copy 4 bytes.  */
        bf 29,8f

        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
8:      /* Copy 8 bytes.  */
        bf 28,0f

        lfd 6,0(12)
        addi 12,12,8
        stfd 6,0(3)
        addi 3,3,8
0:
        clrlwi 10,12,28 /* Check alignment of SRC.  */
        srwi 9,31,4     /* Number of full quadwords remaining.  */

        /* Proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two indexes to speed up the indexed vector operations.  */
        clrlwi 11,31,28
        li 6,16         /* Index for 16-byte offsets.  */
        li 7,32         /* Index for 32-byte offsets.  */
        cmplwi cr1,11,0
        srwi 8,31,5     /* Setup the loop counter.  */
        mr 10,3
        mr 11,12
        mtcrf 0x01,9
        cmplwi cr6,9,1
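        /* lvsl/lvsr build a permute control vector from the low bits of
           the SRC address; each vperm below merges two consecutive
           aligned quadword loads into one correctly realigned quadword
           for the aligned stvx stores.  */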
#ifdef __LITTLE_ENDIAN__
        lvsr 5,0,12
#else
        lvsl 5,0,12
#endif
        lvx 3,0,12
        bf 31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32 bytes due to the loop.  */
        lvx 4,12,6
#ifdef __LITTLE_ENDIAN__
        vperm 6,4,3,5
#else
        vperm 6,3,4,5
#endif
        addi 11,12,16
        addi 10,3,16
        stvx 6,0,3
        vor 3,4,4

L(setup_unaligned_loop):
        mtctr 8
        ble cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align 4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than using unaligned
           vector instructions, though.  */
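        /* Each iteration loads the next two aligned quadwords from SRC
           and produces 32 realigned bytes via two vperms, keeping the
           last quadword in vr3 for the following iteration.  */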

        lvx 4,11,6      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
        vperm 6,4,3,5
#else
        vperm 6,3,4,5
#endif
        lvx 3,11,7      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
        vperm 10,3,4,5
#else
        vperm 10,4,3,5
#endif
        addi 11,11,32
        stvx 6,0,10
        stvx 10,10,6
        addi 10,10,32

        bdnz L(unaligned_loop)

        .align 4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi 0,31,4
        mtcrf 0x01,31
        beq cr1,0f

        add 3,3,0
        add 12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf 28,4f

        lwz 6,0(12)
        lwz 7,4(12)
        addi 12,12,8
        stw 6,0(3)
        stw 7,4(3)
        addi 3,3,8
4:      /* Copy 4 bytes.  */
        bf 29,2f

        lwz 6,0(12)
        addi 12,12,4
        stw 6,0(3)
        addi 3,3,4
2:      /* Copy 2~3 bytes.  */
        bf 30,1f

        lhz 6,0(12)
        addi 12,12,2
        sth 6,0(3)
        addi 3,3,2
1:      /* Copy 1 byte.  */
        bf 31,0f

        lbz 6,0(12)
        stb 6,0(3)
0:      /* Return original DST pointer.  */
        mr 3,30
        lwz 30,20(1)
        lwz 31,24(1)
        addi 1,1,32
        blr

END (memcpy)
libc_hidden_builtin_def (memcpy)