/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

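/* Register use below: r3 is the running DST pointer (and the return
   value), r4/r12 the SRC pointer and r5 the length; the nonvolatile
   r30 preserves the original DST and r31 the remaining length across
   the copy loops.  */
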
	.machine  power7
EALIGN (BP_SYM (memcpy), 5, 0)
	CALL_MCOUNT

	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
					code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */
	clrlwi	10,4,29	      /* Check alignment of SRC.  */
	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

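	/* mtcrf 0x01,0 placed the low four bits of r0 (the byte count
	   needed to align SRC) into CR7, so each bf below tests a single
	   bit: bit 31 selects a 1-byte move, bit 30 a 2-byte move and
	   bit 29 a 4-byte move.  */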
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29      /* Check alignment of SRC again.  */
	srwi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

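	/* r8 holds the 32-byte iteration count for the main loop, and
	   CR7 now holds the low bits of the doubleword count: bit 30 set
	   means two leading doublewords to copy, bit 31 one more.  cr1
	   records whether at least four doublewords remain, i.e. whether
	   the main loop runs at all.  */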
	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:
	/* Check for tail bytes.  */
	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
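	/* r0 is the remaining length rounded down to a multiple of 8,
	   i.e. the bytes already moved by the doubleword code above;
	   step DST and SRC past them before handling the tail.  */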
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)
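	/* r8 = -SRC, so r8 & 3 (in r0) is the byte count needed to
	   word-align SRC; r10 will hold the length left once those
	   bytes are moved.  */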

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	   slow for unaligned cases.  */

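	/* cr6 compared len against 8 above, so this fall-through path
	   moves exactly 8 bytes; word loads/stores handle any SRC/DST
	   alignment here.  */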
	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
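	/* The realignment trick: lvx ignores the low four bits of the
	   effective address, so two consecutive aligned loads bracket
	   any misaligned quadword; lvsl turns the misalignment into a
	   permute control vector and vperm then extracts the wanted
	   bytes from the load pair.  */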
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15	      /* Check alignment of DST.  */
	clrlwi	0,0,28	      /* Number of bytes until the 1st
				 quadword of DST.  */
	srwi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
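	/* As in the aligned path, CR7 holds the low bits of the byte
	   count needed to align DST: bits 31/30/29/28 select 1-, 2-,
	   4- and 8-byte moves respectively.  */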
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28      /* Check alignment of SRC.  */
	srwi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5	      /* Setup the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
	lvsl	5,0,12
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes up front so the main
	   loop below can move 32 bytes per iteration.  */
	lvx	4,12,6
	vperm	6,3,4,5
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4
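	/* lvsl 5,0,12 derived the permute control from SRC's low four
	   address bits and lvx 3,0,12 preloaded the first aligned
	   quadword; vor 3,4,4 carries the newest quadword into vr3 so
	   the next vperm can combine it with the following load.  */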

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr6.  */
	lvx	3,11,7	      /* vr3 = r11+32.  */
	vperm	10,4,3,5      /* Merge the correctly-aligned portions
				 of vr3/vr4 into vr10.  */
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f
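	/* cr1 was set from r31 & 15: if no tail bytes remain, return.
	   Otherwise r0 (the length rounded down to a multiple of 16)
	   steps DST and SRC past the quadwords the vector code copied.  */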

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return original DST pointer.  */
	mr	3,30
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)