/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

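/* The code below selects among three strategies:
   1) copies of less than 32 bytes use scalar loads and stores
      (L(copy_LT_32) and L(copy_LE_8));
   2) copies of 32+ bytes where SRC and DST have the same alignment
      within a doubleword align to 8 bytes and then run a 32-byte
      (4x doubleword) main loop;
   3) copies of 32+ bytes with mismatched alignment use aligned VMX
      loads plus lvsl/lvsr and vperm to realign the data
      (L(copy_GE_32_unaligned)).  */
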
#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
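/* ENTRY_TOCLESS marks an entry point that does not need the TOC pointer
   (r2); CALL_MCOUNT expands to the gprof profiling hook when glibc is
   built with profiling enabled and to nothing otherwise.  */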
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
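	/* DST (r3) is saved in the protected area below the stack pointer so
	   that the return value DST + LEN can be rebuilt at exit; r31 is a
	   non-volatile register, so it is saved before being used to track
	   the remaining length.  */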
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
				      code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

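	/* The low bits of r0 (the byte count needed to reach alignment) were
	   copied into CR7 by the mtcrf above, so each bf below skips the 1-,
	   2- or 4-byte move when the corresponding bit is clear.  */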
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

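	/* CR7 was loaded from the doubleword count (mtcrf 0x01,9 above), so
	   the bf 30/bf 31 tests below choose how many (0~3) leading
	   doublewords to copy before entering the 32-byte loop.  */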
	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

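	/* r0 is the remaining length rounded down to a multiple of 8, i.e.
	   the bytes consumed by the doubleword copies above.  SRC (r12) and
	   DST (r3) were not advanced inside the loop (r11/r10 were), so step
	   them past the copied area; CR7 now holds the low bits of the
	   remaining length for the tail tests.  */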
.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

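	/* CR7 holds the low bits of the remaining length, so the
	   bf 28/29/30/31 tests below select the 8-, 4-, 2- and 1-byte moves
	   without further compares.  */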
L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until DST is quadword
				 aligned.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
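	/* vr5 is a permute control vector derived from the SRC misalignment;
	   each vperm below merges two adjacent aligned 16-byte loads into one
	   quadword of correctly shifted data, so only aligned lvx/stvx
	   accesses are needed.  */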
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the main loop operates on 32-byte chunks.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than performing unaligned vector
	   accesses.  */

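	/* vperm numbers vector elements in big-endian order, so on
	   little-endian the source operands are swapped and lvsr (rather than
	   lvsl) supplies the control vector to produce the same result.  */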
	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

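	/* r0 is the remaining length rounded down to a multiple of 16, i.e.
	   the bytes already copied by the vector code.  SRC (r12) and DST
	   (r3) were not advanced in the loop (r11/r10 were), so step them
	   past the copied area before handling the tail.  */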
	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)