/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

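/* For reference, a rough C-level sketch of the semantics implemented
   below (illustrative only; the buffer and length names are made up and
   this snippet is not part of the build):

     char *p = out;
     p = mempcpy (p, a, alen);   -> p now equals out + alen
     p = mempcpy (p, b, blen);   -> p now equals out + alen + blen

   Unlike memcpy, the value returned points one byte past the copied
   region, which is why every exit path below computes DST + LEN.  */
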
	.machine  power7
EALIGN (__mempcpy, 5, 0)
	CALL_MCOUNT

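	/* Rough structure of what follows: lengths below 32 bytes take the
	   short scalar path at L(copy_LT_32); for 32 bytes and up, matching
	   SRC/DST alignment uses a 32-bytes-per-iteration lfd/stfd loop,
	   while mismatched alignment falls through to the VMX lvsl/vperm
	   path at L(copy_GE_32_unaligned).  The nonvolatile registers saved
	   below are r30, which keeps the original DST so every exit can
	   return DST + LEN, and r31, which tracks the remaining length on
	   the >= 32-byte paths.  */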
	stwu	1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw	30,20(1)
	cfi_offset(30,(20-32))
	stw	31,24(1)
	mr	30,3
	cmplwi	cr1,5,31
	neg	0,3
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
				       code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */
	clrlwi	10,4,29	      /* Check alignment of SRC.  */
	cmplw	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srwi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrlwi	0,0,29
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */
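	/* cr7 was loaded (mtcrf above) from the low bits of r0, the number
	   of bytes needed to reach 8-byte alignment; SRC and DST share the
	   same alignment on this path, so bf 31/30/29 below test the 1-,
	   2- and 4-byte components of that count.  */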

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrlwi	10,12,29      /* Check alignment of SRC again.  */
	srwi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

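	/* r31 holds the length left to copy.  Compute the residual byte
	   count (r11, tested through cr6), load cr7 from the doubleword
	   count in r9, set the 32-byte loop trip count (r8), and set cr1
	   to tell whether at least four doublewords remain; r11 is then
	   reused as the running source pointer.  */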
	clrlwi	11,31,29
	mtcrf	0x01,9

	srwi	8,31,5
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

	bf	30,1f
	lfd	6,0(12)
	lfd	7,8(12)
	addi	11,12,16
	mtctr	8
	stfd	6,0(3)
	stfd	7,8(3)
	addi	10,3,16
	bf	31,4f
	lfd	0,16(12)
	stfd	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	lfd	6,0(12)
	addi	11,12,8
	stfd	6,0(3)
	addi	10,3,8

	.align	4
4:	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	lfd	6,0(11)
	lfd	7,8(11)
	lfd	8,16(11)
	lfd	0,24(11)
	addi	11,11,32

	stfd	6,0(10)
	stfd	7,8(10)
	stfd	8,16(10)
	stfd	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */

	clrrwi	0,31,3
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
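	/* r0 is the remaining length rounded down to a multiple of 8,
	   i.e. the bytes consumed by the doubleword copies above; advance
	   DST and SRC past them before handling the tail.  */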
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmplwi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrwi	11,4,2
	andi.	0,8,3
	cmplwi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
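	/* r0 holds the 0~3 bytes needed to word-align SRC; its low bits
	   were placed in cr7 (mtocrf above) so bf 30/31 below copy a
	   halfword and/or a byte, and r10 becomes the length that will
	   remain once SRC is aligned.  */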
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmplwi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used lfd/stfd here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)

	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	addi	1,1,32
	blr

	/* Handle copies of 32+ bytes where the SRC and DST alignments do not
	   match.  Get DST quadword-aligned, then use aligned quadword loads
	   from SRC, shifted to realign the data, allowing for aligned DST
	   stores.  */
	.align	4
L(copy_GE_32_unaligned):
	andi.	11,3,15	      /* Check alignment of DST.  */
	clrlwi	0,0,28	      /* Number of bytes until the 1st
				 quadword of DST.  */
	srwi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST
	   quadword-aligned.  */
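	/* As in the aligned path, cr7 holds the low four bits of the byte
	   count needed to reach 16-byte alignment of DST; here bf 28 also
	   tests the 8-byte component.  */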
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	lfd	6,0(12)
	addi	12,12,8
	stfd	6,0(3)
	addi	3,3,8
0:
	clrlwi	10,12,28      /* Check alignment of SRC.  */
	srwi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrlwi	11,31,28
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmplwi	cr1,11,0
	srwi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmplwi	cr6,9,1
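	/* lvsr (little-endian) or lvsl (big-endian) builds a permute
	   control vector from the low four bits of the SRC address; vperm
	   then merges two aligned lvx loads into the misaligned 16 bytes
	   we actually want.  Note that the vperm input order is swapped
	   on little-endian.  */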
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy one 16-byte block now, since the
	   loop below moves 32 bytes per iteration.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	clrrwi	0,31,4
	mtcrf	0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	add	3,30,5
	lwz	30,20(1)
	lwz	31,24(1)
	addi	1,1,32
	blr

END (__mempcpy)
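/* __mempcpy is the name used for calls from within glibc itself; the
   standard mempcpy entry point is provided as a weak alias, and the
   hidden definitions below let intra-libc references bind directly to
   this implementation rather than through the PLT.  */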
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)