/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */
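
/* For reference, the contract above amounts to the following C sketch
   (illustrative only; 'mempcpy_ref' is a hypothetical name, not part of
   this file):

     #include <stddef.h>
     #include <string.h>

     static void *mempcpy_ref (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);        // copy LEN bytes from SRC to DST
       return (char *) dst + len;     // return DST + LEN instead of DST
     }

   The assembly below implements the same semantics by hand.  */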

        .machine power7
EALIGN (BP_SYM (__mempcpy), 5, 0)
        CALL_MCOUNT

        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        stw     31,24(1)
        mr      30,3
        cmplwi  cr1,5,31
        neg     0,3
        cfi_offset(31,-8)
        ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
                                      code.  */

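/* Rough C model of the three-way dispatch below (a sketch; the helper
   bodies are hypothetical stand-ins for the labelled paths in this
   file, each of which copies LEN bytes):

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     static void copy_lt_32 (char *d, const char *s, size_t n)           { memcpy (d, s, n); }
     static void copy_ge_32_aligned (char *d, const char *s, size_t n)   { memcpy (d, s, n); }
     static void copy_ge_32_unaligned (char *d, const char *s, size_t n) { memcpy (d, s, n); }

     static void *dispatch_sketch (char *dst, const char *src, size_t len)
     {
       if (len < 32)                                              // ble cr1,L(copy_LT_32)
         copy_lt_32 (dst, src, len);
       else if (((uintptr_t) dst & 7) == ((uintptr_t) src & 7))   // cmplw cr6,10,11
         copy_ge_32_aligned (dst, src, len);
       else                                                       // bne cr6,L(copy_GE_32_unaligned)
         copy_ge_32_unaligned (dst, src, len);
       return dst + len;
     }
*/
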
        andi.   11,3,7        /* Check alignment of DST.  */
        clrlwi  10,4,29       /* Check alignment of SRC.  */
        cmplw   cr6,10,11     /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srwi    9,5,3         /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrlwi  0,0,29
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

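/* The mtcrf/bf sequence below is the branchy equivalent of this C
   sketch, where 'pad' is the 0..7 byte count needed to reach 8-byte
   alignment and 'd'/'s' stand for the running DST/SRC char pointers
   (names are illustrative only):

     if (pad & 1) { *d = *s; d += 1; s += 1; }           // bf 31 -> 1-byte copy
     if (pad & 2) { memcpy (d, s, 2); d += 2; s += 2; }  // bf 30 -> 2-byte copy
     if (pad & 4) { memcpy (d, s, 4); d += 4; s += 4; }  // bf 29 -> 4-byte copy

   mtcrf 0x01,0 places those low bits of r0 into cr7, so each 'bf'
   skips its copy when the corresponding bit is clear.  */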
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrlwi  10,12,29      /* Check alignment of SRC again.  */
        srwi    9,31,3        /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrlwi  11,31,29
        mtcrf   0x01,9

        srwi    8,31,5
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

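/* In C terms, the peel plus the main loop below behave roughly like
   this sketch, where 'ndw' is the doubleword count in r9, 'n32' the
   32-byte block count placed in CTR, and 'd'/'s' the running DST/SRC
   char pointers (illustrative only):

     if (ndw & 2) { memcpy (d, s, 16); d += 16; s += 16; }   // two doublewords
     if (ndw & 1) { memcpy (d, s, 8);  d += 8;  s += 8;  }   // one doubleword
     while (n32--) { memcpy (d, s, 32); d += 32; s += 32; }  // main loop

   Each pass of the main loop issues four lfd/stfd pairs, i.e. four
   8-byte floating-point loads and stores.  */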
        bf      30,1f
        lfd     6,0(12)
        lfd     7,8(12)
        addi    11,12,16
        mtctr   8
        stfd    6,0(3)
        stfd    7,8(3)
        addi    10,3,16
        bf      31,4f
        lfd     0,16(12)
        stfd    0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        lfd     6,0(12)
        addi    11,12,8
        stfd    6,0(3)
        addi    10,3,8

        .align  4
4:      /* Main aligned copy loop.  Copies 32-bytes at a time.  */
        lfd     6,0(11)
        lfd     7,8(11)
        lfd     8,16(11)
        lfd     0,24(11)
        addi    11,11,32

        stfd    6,0(10)
        stfd    7,8(10)
        stfd    8,16(10)
        stfd    0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        /* Check for tail bytes.  */

        clrrwi  0,31,3
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
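
/* Equivalently, with 'tail' holding those low 3 bits of the remaining
   length (a sketch; same conventions as the peel sketch above):

     if (tail & 4) { memcpy (d, s, 4); d += 4; s += 4; }
     if (tail & 2) { memcpy (d, s, 2); d += 2; s += 2; }
     if (tail & 1) { *d = *s; }

   i.e. the same bit-driven pattern as the alignment peel, largest
   piece first.  */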
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

        /* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmplwi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        cmplwi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-bytes alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmplwi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
4:      /* Copies 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)

        /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
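
/* Conceptually, when SRC sits 'off' bytes past a 16-byte boundary,
   every aligned 16-byte store is assembled from two aligned 16-byte
   loads; that is what the lvsl/lvx/vperm sequence below does in
   registers.  A scalar C sketch of one such merge step (illustrative
   only; 'merge_block' and its arguments are hypothetical):

     #include <stddef.h>
     #include <string.h>

     // abase = SRC rounded down to a 16-byte boundary, off = SRC % 16.
     static void merge_block (char *dst, const char *abase, size_t off)
     {
       char window[32];
       memcpy (window, abase, 32);       // two aligned 16-byte loads
       memcpy (dst, window + off, 16);   // select the misaligned 16 bytes
     }

   lvsl derives the byte-selection pattern from the SRC address, and
   vperm applies it to a pair of aligned vector registers.  */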
        .align  4
L(copy_GE_32_unaligned):
        andi.   11,3,15       /* Check alignment of DST.  */
        clrlwi  0,0,28        /* Number of bytes until the 1st
                                 quadword of DST.  */
        srwi    9,5,4         /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16-bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        lfd     6,0(12)
        addi    12,12,8
        stfd    6,0(3)
        addi    3,3,8
0:
        clrlwi  10,12,28      /* Check alignment of SRC.  */
        srwi    9,31,4        /* Number of full quadwords remaining.  */

        /* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Setup two indexes to speed up the indexed vector operations.  */
        clrlwi  11,31,28
        li      6,16          /* Index for 16-bytes offsets.  */
        li      7,32          /* Index for 32-bytes offsets.  */
        cmplwi  cr1,11,0
        srwi    8,31,5        /* Setup the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmplwi  cr6,9,1
        lvsl    5,0,12
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
        lvx     4,12,6
        vperm   6,3,4,5
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than having unaligned
           vector instructions though.  */

        lvx     4,11,6        /* vr4 = r11+16.  */
        vperm   6,3,4,5       /* Merge the correctly-aligned portions
                                 of vr3/vr4 into vr6.  */
        lvx     3,11,7        /* vr3 = r11+32.  */
        vperm   10,4,3,5      /* Merge the correctly-aligned portions
                                 of vr3/vr4 into vr10.  */
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi  0,31,4
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        add     3,30,5
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

END (BP_SYM (__mempcpy))
libc_hidden_def (BP_SYM (__mempcpy))
weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
libc_hidden_builtin_def (mempcpy)