sysdeps/powerpc/powerpc64/power7/mempcpy.S
/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2013 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>


/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */
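
/* For reference, the semantics are those of mempcpy(3): copy LEN bytes
   from SRC to DST and return DST + LEN.  Roughly equivalent C, for
   illustration only (a sketch with an illustrative name, not used by
   the build):

     #include <string.h>

     void *
     mempcpy_ref (void *dst, const void *src, size_t len)
     {
       memcpy (dst, src, len);
       return (char *) dst + len;
     }  */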

        .machine  power7
EALIGN (BP_SYM (__mempcpy), 5, 0)
        CALL_MCOUNT 3

        cmpldi  cr1,5,31
        neg     0,3
        std     3,-16(1)
        std     31,-8(1)
        cfi_offset(31,-8)
        ble     cr1,L(copy_LT_32)       /* If move < 32 bytes use short move
                                           code.  */

        andi.   11,3,7          /* Check alignment of DST.  */


        clrldi  10,4,61         /* Check alignment of SRC.  */
        cmpld   cr6,10,11       /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srdi    9,5,3           /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrldi  0,0,61
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

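        /* A note on the mtcrf/bf idiom used here and in the other tail
           copies below: mtcrf 0x01 places the low four bits of the byte
           count into CR7, so bf 31/30/29/28 skip the 1-, 2-, 4- and
           8-byte steps when the matching bit of the count is clear.
           Roughly, in illustrative C (a sketch; the names are ours):

             unsigned char *d = dst;  const unsigned char *s = src;
             if (n & 1) { *d++ = *s++; }
             if (n & 2) { memcpy (d, s, 2);  d += 2;  s += 2; }
             if (n & 4) { memcpy (d, s, 4);  d += 4;  s += 4; }  */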
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrldi  10,12,61        /* Check alignment of SRC again.  */
        srdi    9,31,3          /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrldi  11,31,61
        mtcrf   0x01,9

        srdi    8,31,5
        cmpldi  cr1,9,4
        cmpldi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

        bf      30,1f
        ld      6,0(12)
        ld      7,8(12)
        addi    11,12,16
        mtctr   8
        std     6,0(3)
        std     7,8(3)
        addi    10,3,16
        bf      31,4f
        ld      0,16(12)
        std     0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        ld      6,0(12)
        addi    11,12,8
        std     6,0(3)
        addi    10,3,8

        /* Main aligned copy loop.  Copies 32-bytes at a time.  */
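        /* Roughly, in illustrative C (a sketch; dst8/src8 stand for the
           now 8-byte-aligned cursors, and any 1~3 leading doublewords
           were already copied above so the iteration count is exact):

             uint64_t *d = dst8;  const uint64_t *s = src8;
             for (size_t i = remaining >> 5; i > 0; i--)
               {
                 d[0] = s[0];  d[1] = s[1];
                 d[2] = s[2];  d[3] = s[3];
                 d += 4;  s += 4;
               }  */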
        .align  4
4:
        ld      6,0(11)
        ld      7,8(11)
        ld      8,16(11)
        ld      0,24(11)
        addi    11,11,32

        std     6,0(10)
        std     7,8(10)
        std     8,16(10)
        std     0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        /* Check for tail bytes.  */
        rldicr  0,31,0,60
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmpldi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrdi  11,4,2
        andi.   0,8,3
        cmpldi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-bytes alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmpldi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        ld      3,-16(1)
        add     3,3,5
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used ld/std here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
        ld      3,-16(1)        /* Return DST + LEN pointer.  */
        add     3,3,5
        blr

        .align  4
4:      /* Copies 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        ld      3,-16(1)
        add     3,3,5
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return DST + LEN pointer.  */
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
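        /* The realignment idiom used below, roughly (an illustrative
           pseudo-C sketch using AltiVec intrinsic names; the variable
           names are ours):

             perm = vec_lvsl (0, src);         // permute control from src & 0xF
             v0   = vec_ld (0, src);           // lvx ignores the low 4 address bits
             for each 16-byte chunk:
               v1  = vec_ld (16, src);         // next aligned 16 bytes
               out = vec_perm (v0, v1, perm);  // the 16 unaligned bytes at src
               vec_st (out, 0, dst);           // dst is 16-byte aligned by now
               v0  = v1;  src += 16;  dst += 16;  */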
        .align  4
L(copy_GE_32_unaligned):
        clrldi  0,0,60          /* Number of bytes until the 1st
                                   quadword.  */
        andi.   11,3,15         /* Check alignment of DST (against
                                   quadwords).  */
        srdi    9,5,4           /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16-bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        ld      6,0(12)
        addi    12,12,8
        std     6,0(3)
        addi    3,3,8
0:
        clrldi  10,12,60        /* Check alignment of SRC.  */
        srdi    9,31,4          /* Number of full quadwords remaining.  */

        /* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Setup two indexes to speed up the indexed vector operations.  */
        clrldi  11,31,60
        li      6,16            /* Index for 16-bytes offsets.  */
        li      7,32            /* Index for 32-bytes offsets.  */
        cmpldi  cr1,11,0
        srdi    8,31,5          /* Setup the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmpldi  cr6,9,1
        lvsl    5,0,12
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
        lvx     4,12,6
        vperm   6,3,4,5
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than having unaligned
           vector instructions though.  */

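        /* One iteration in illustrative pseudo-C (a sketch; v3 holds the
           last aligned 16 bytes loaded and perm comes from the lvsl above):

             v4   = vec_ld (16, s);            // lvx 4,11,6
             out0 = vec_perm (v3, v4, perm);   // next 16 source bytes, realigned
             v3   = vec_ld (32, s);            // lvx 3,11,7
             out1 = vec_perm (v4, v3, perm);   // the following 16 bytes
             s += 32;
             vec_st (out0, 0, d);  vec_st (out1, 16, d);  d += 32;  */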
        lvx     4,11,6          /* vr4 = r11+16.  */
        vperm   6,3,4,5         /* Merge the correctly-aligned portions
                                   of vr3/vr4 into vr6.  */
        lvx     3,11,7          /* vr3 = r11+32.  */
        vperm   10,4,3,5        /* Merge the correctly-aligned portions
                                   of vr3/vr4 into vr10.  */
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        rldicr  0,31,0,59
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        add     3,3,5
        blr

END_GEN_TB (BP_SYM (__mempcpy),TB_TOCLESS)
libc_hidden_def (BP_SYM (__mempcpy))
weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy))
libc_hidden_builtin_def (mempcpy)