/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2016 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */
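
/* For reference, the interface in plain (unoptimized) C -- an
   illustrative sketch only, not part of the build; "mempcpy_ref" is a
   hypothetical name:

     #include <stddef.h>

     void *
     mempcpy_ref (void *dst, const void *src, size_t len)
     {
       char *d = dst;
       const char *s = src;
       while (len-- != 0)
         *d++ = *s++;
       return d;
     }

   i.e. copy LEN bytes from SRC to DST and return DST + LEN rather than
   DST as memcpy would.  */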

        .machine power7
EALIGN (__mempcpy, 5, 0)
        CALL_MCOUNT 3

        cmpldi  cr1,5,31
        neg     0,3             /* r0 = -DST, used for alignment counts.  */
        std     3,-16(1)        /* Save DST for the return value.  */
        std     31,-8(1)        /* Save r31 (holds the remaining length).  */
        cfi_offset(31,-8)
        ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
                                      code.  */

        andi.   11,3,7          /* Check alignment of DST.  */


        clrldi  10,4,61         /* Check alignment of SRC.  */
        cmpld   cr6,10,11       /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srdi    9,5,3           /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrldi  0,0,61
        mtcrf   0x01,0
        subf    31,0,5

        /* Get DST (and SRC, which has the same alignment here) aligned
           to 8 bytes.  */

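        /* A dispatch pattern used throughout this file: mtcrf 0x01,Rn
           copies the low four bits of Rn into CR field 7 (CR bits
           28-31), and each "bf" (branch if false) below skips its copy
           when the corresponding bit is clear -- bit 31 is the 1-byte
           place, bit 30 the 2-byte place, bit 29 the 4-byte place and
           bit 28 the 8-byte place of the count.  */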
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrldi  10,12,61        /* Check alignment of SRC again.  */
        srdi    9,31,3          /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrldi  11,31,61        /* Tail bytes (remaining length mod 8).  */
        mtcrf   0x01,9

        srdi    8,31,5          /* Number of 32-byte blocks for the main loop.  */
        cmpldi  cr1,9,4         /* At least 4 doublewords to copy?  */
        cmpldi  cr6,11,0        /* Any tail bytes at all?  */
        mr      11,12

        /* Copy 0~3 doublewords up front so the rest can be moved
           32 bytes at a time in the main loop.  */
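        /* cr7 was loaded from the low bits of the doubleword count in
           r9 (mtcrf 0x01,9 above), so bit 30 below tests the 2's place
           and bit 31 the 1's place of that count.  */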

        bf      30,1f
        ld      6,0(12)
        ld      7,8(12)
        addi    11,12,16
        mtctr   8
        std     6,0(3)
        std     7,8(3)
        addi    10,3,16
        bf      31,4f
        ld      0,16(12)
        std     0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align 4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        ld      6,0(12)
        addi    11,12,8
        std     6,0(3)
        addi    10,3,8

        /* Main aligned copy loop.  Copies 32 bytes at a time.  */
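        /* Roughly what the loop below does, shown as an illustrative C
           sketch (not part of the build); d and s are char pointers
           standing for DST in r10 and SRC in r11, n is the 32-byte
           block count loaded into the CTR (n >= 1 whenever the loop is
           entered), and uint64_t is assumed from <stdint.h>:

             do
               {
                 ((uint64_t *) d)[0] = ((const uint64_t *) s)[0];
                 ((uint64_t *) d)[1] = ((const uint64_t *) s)[1];
                 ((uint64_t *) d)[2] = ((const uint64_t *) s)[2];
                 ((uint64_t *) d)[3] = ((const uint64_t *) s)[3];
                 s += 32;
                 d += 32;
               }
             while (--n != 0);
        */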
        .align 4
4:
        ld      6,0(11)
        ld      7,8(11)
        ld      8,16(11)
        ld      0,24(11)
        addi    11,11,32

        std     6,0(10)
        std     7,8(10)
        std     8,16(10)
        std     0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        /* Check for tail bytes.  */
        rldicr  0,31,0,60
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0~7 bytes and we know that the
           destination is doubleword-aligned.  */
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handle copies of 0~31 bytes.  */
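        /* Strategy for the short path: force 4-byte alignment of SRC if
           needed, copy 16 bytes when at least 16 remain, and let the low
           bits of the remaining length (placed in cr7) select the 8-,
           4-, 2- and 1-byte copies that add up to LEN.  */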
        .align 4
L(copy_LT_32):
        cmpldi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrdi  11,4,2
        andi.   0,8,3
        cmpldi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align 4
L(end_4bytes_alignment):
        cmpldi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        ld      3,-16(1)
        add     3,3,5
        blr

        .align 4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handle copies of 0~8 bytes.  */
        .align 4
L(copy_LE_8):
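        /* cr6 still holds the "LEN == 8" comparison made at
           L(copy_LT_32), so the fall-through below handles exactly
           8 bytes.  */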
        bne     cr6,4f

        /* Though we could've used ld/std here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
        ld      3,-16(1)        /* Return DST + LEN pointer.  */
        add     3,3,5
        blr

        .align 4
4:      /* Copies 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        ld      3,-16(1)
        add     3,3,5
        blr

        .align 4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return DST + LEN pointer.  */
        ld      3,-16(1)
        add     3,3,5
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
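        /* The realignment works as follows: lvsl (big-endian) or lvsr
           (little-endian) builds a permute control vector from the low
           bits of the SRC address; the code then loads only aligned
           quadwords and uses vperm to extract, in memory order, the 16
           source bytes that start at the unaligned offset from each
           pair of adjacent quadwords.  Roughly, with sh = SRC % 16,

             out[i] = (prev ++ next)[sh + i],   i = 0 .. 15

           where prev/next are consecutive aligned source quadwords and
           "++" is byte concatenation in memory order; every store to
           DST can therefore be an aligned stvx.  */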
        .align 4
L(copy_GE_32_unaligned):
        clrldi  0,0,60          /* Number of bytes until the 1st
                                   quadword.  */
        andi.   11,3,15         /* Check alignment of DST (against
                                   quadwords).  */
        srdi    9,5,4           /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST
           quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        ld      6,0(12)
        addi    12,12,8
        std     6,0(3)
        addi    3,3,8
0:
        clrldi  10,12,60        /* Check alignment of SRC.  */
        srdi    9,31,4          /* Number of full quadwords remaining.  */

        /* DST is now quadword aligned; it is OK to copy the bytes.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two index registers to speed up the indexed vector
           operations.  */
        clrldi  11,31,60
        li      6,16            /* Index for 16-byte offsets.  */
        li      7,32            /* Index for 32-byte offsets.  */
        cmpldi  cr1,11,0
        srdi    8,31,5          /* Set up the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmpldi  cr6,9,1
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,12
#else
        lvsl    5,0,12
#endif
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* The quadword count is odd: copy one extra 16-byte block now
           so the main loop can move 32 bytes per iteration.  */
        lvx     4,12,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
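        /* vr3 always carries the most recently loaded aligned quadword
           into the next vperm, so each iteration only needs to load two
           new quadwords for the 32 bytes it stores.  */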
        .align 4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is still faster than using
           unaligned vector loads and stores.  */

        lvx     4,11,6          /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,11,7          /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align 4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        rldicr  0,31,0,59
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return DST + LEN pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        add     3,3,5
        blr

END_GEN_TB (__mempcpy,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)