/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

	.machine  power7
EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1, L(copy_LT_32)	/* If move < 32 bytes use short move
					   code.  */

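	/* From here on r30 preserves the original DST for the return value,
	   r12 is the SRC cursor, r31 the number of bytes still to copy, and
	   r3 keeps tracking the DST cursor.  */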
	andi.	11,3,7		/* Check alignment of DST.  */
	clrlwi	10,4,29		/* Check alignment of SRC.  */
	cmplw	cr6,10,11	/* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3		/* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

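	/* CR7 holds the 0~7 byte adjustment: bit 31 selects a 1-byte copy,
	   bit 30 a 2-byte copy and bit 29 a 4-byte copy.  Since SRC and DST
	   share the same low-order alignment here, aligning DST also aligns
	   SRC.  */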
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29	/* Check alignment of SRC again.  */
	srwi	9,31,3		/* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

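	/* At this point r9 = remaining doublewords, r8 = remaining 32-byte
	   blocks and r11 = tail bytes (len & 7); cr1 records whether at
	   least four doublewords remain and cr6 whether there is a
	   sub-doubleword tail.  r11 is then reused as the SRC cursor and
	   r10 becomes the DST cursor for the copy loops below.  */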
	/* Copy 1~3 doublewords so the main loop starts
	at a multiple of 32 bytes.  */

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

L(aligned_copy):
	/* Main aligned copy loop.  Copies up to 128-bytes at a time.  */
	.align	4
4:
	/* check for any 32-byte or 64-byte lumps that are outside of a
	   nice 128-byte range.  R8 contains the number of 32-byte
	   lumps, so drop this into the CR, and use the SO/EQ bits to help
	   handle the 32- or 64- byte lumps.  Then handle the rest with an
	   unrolled 128-bytes-at-a-time copy loop. */
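	/* r10 and r11 are now the DST and SRC cursors.  mtocrf below drops
	   the low bits of r8 into CR7: the SO bit means one odd 32-byte
	   block is pending and the EQ bit means a 64-byte block is;
	   everything else is moved 128 bytes per iteration with
	   lxvd2x/stxvd2x (16 bytes per instruction).  */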
	mtocrf	1,8
	li	6,16	# 16() index
	li	7,32	# 32() index
	li	8,48	# 48() index

L(aligned_32byte):
	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
	bns	cr7,L(aligned_64byte)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	11,11,32
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	addi	10,10,32

L(aligned_64byte):
	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
	bne	cr7,L(aligned_128setup)
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64

L(aligned_128setup):
	/* Set up for the 128-byte at a time copy loop.  */
	srwi	8,31,7
	cmpwi	8,0	# Any 4x lumps left?
	beq	3f	# if not, move along.
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	mtctr	8	# otherwise, load the ctr and begin.
	li	8,48	# 48() index
	b	L(aligned_128loop)

L(aligned_128head):
	/* for the 2nd + iteration of this loop. */
	lxvd2x	6,0,11
	lxvd2x	7,11,6
L(aligned_128loop):
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	stxvd2x	6,0,10
	addi	11,11,64
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	lxvd2x	6,0,11
	lxvd2x	7,11,6
	addi	10,10,64
	lxvd2x	8,11,7
	lxvd2x	9,11,8
	addi	11,11,64
	stxvd2x	6,0,10
	stxvd2x	7,10,6
	stxvd2x	8,10,7
	stxvd2x	9,10,8
	addi	10,10,64
	bdnz	L(aligned_128head)

3:
	/* Check for tail bytes.  */
	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
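	/* r0 is the byte count already moved above (the length rounded
	   down to a multiple of 8); advance the DST and SRC pointers past
	   it before handling the 0-7 byte tail.  */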
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
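	/* CR7 now holds the low bits of the length: bit 28 selects an
	   8-byte copy, bit 29 a 4-byte copy, bit 30 a 2-byte copy and
	   bit 31 a 1-byte copy.  */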
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-bytes alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
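	/* cr6 still holds the 'len vs. 8' compare from L(copy_LT_32):
	   fall through for exactly 8 bytes, branch for 0~7 bytes.  */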
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15		/* Check alignment of DST.  */
	clrlwi	0,0,28		/* Number of bytes until the 1st
				   quadword of DST.  */
	srwi	9,5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* SRC is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16-bytes)
	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28	/* Check alignment of SRC.  */
	srwi	9,31,4		/* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16		/* Index for 16-bytes offsets.  */
	li	7,32		/* Index for 32-bytes offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5		/* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
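	/* lvsl builds a permute control vector from the SRC misalignment;
	   the vperms below use it to splice each pair of aligned 16-byte
	   loads into one correctly shifted 16-byte chunk for the aligned
	   stores.  */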
	lvsl	5,0,12
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the remaining length is a multiple of
	   32 bytes for the loop.  */
	lvx	4,12,6
	vperm	6,3,4,5
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	but in order to get proper alignment, we may have to copy
	some portions again.  This is faster than having unaligned
	vector instructions though.  */

	lvx	4,11,6		/* vr4 = r11+16.  */
	vperm	6,3,4,5		/* Merge the correctly-aligned portions
				   of vr3/vr4 into vr6.  */
	lvx	3,11,7		/* vr3 = r11+32.  */
	vperm	10,4,3,5	/* Merge the correctly-aligned portions
				   of vr3/vr4 into vr10.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f

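	/* r0 is the byte count the vector code handled (the remaining
	   length rounded down to a multiple of 16); step DST and SRC
	   past it.  */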
	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)