/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2014 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

#define dst 11		/* Use r11 so that r3 is kept unchanged.  */
#define src 4
#define cnt 5

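/* The code below dispatches on size and relative alignment: copies of
   less than 32 bytes use scalar loads and stores (L(copy_LT_32));
   larger copies whose SRC and DST share the same alignment are first
   aligned and then use the 128-bytes-per-iteration lxvd2x/stxvd2x loop
   (L(aligned_copy)); mismatched alignments take the lvx/vperm path
   (L(copy_GE_32_unaligned)).  */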
	.machine power7
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

#ifdef __LITTLE_ENDIAN__
/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
   loop is only used for quadword aligned copies.  */
	andi.	10,3,15
	clrldi	11,4,60
#else
	andi.	10,3,7		/* Check alignment of DST.  */
	clrldi	11,4,61		/* Check alignment of SRC.  */
#endif
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
#ifdef __LITTLE_ENDIAN__
	clrldi	0,0,60
#else
	clrldi	0,0,61
#endif

/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
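/* r0 holds -DST; its low bits are the number of bytes needed to reach
   the next alignment boundary, and the mtocrf above copied them into
   cr7, so bf 31/30/29 (and 28 on little-endian) skip the 1-, 2-, 4-
   (and 8-) byte moves below when they are not needed.  */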
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
#ifdef __LITTLE_ENDIAN__
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
#endif
	subf	cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* Loads for the second and later iterations of the loop; the
	   first iteration's loads were issued before entering at
	   L(aligned_128loop).  */
	lxvd2x	6,0,src
	lxvd2x	7,src,6
L(aligned_128loop):
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	stxvd2x	6,0,dst
	addi	src,src,64
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	dst,dst,64
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

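/* Tail of the aligned path: the 64/32/16 bits of cnt are tested from
   cr6 (set by the mtocrf 0x02 above) and its 8/4/2/1 bits go into cr7
   below, so the bf tests copy the remaining 0~127 bytes in decreasing
   power-of-two chunks.  */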
L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	src,src,32
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lxvd2x	6,0,src
	addi	src,src,16
	stxvd2x	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
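/* cnt is at most 31 here; cr6 below compares it with 8 so that copies
   of 0~8 bytes take L(copy_LE_8) while copies of 9~31 bytes fall
   through.  */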
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
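	/* cr6 still holds the comparison of cnt with 8 made at
	   L(copy_LT_32): anything other than exactly 8 bytes is handled
	   at L(tail4), while exactly 8 bytes falls through.  */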
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
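/* The realignment uses lvsl (lvsr and swapped vperm inputs on
   little-endian) to build a permute control vector from SRC's low
   address bits; each vperm then merges two aligned lvx loads into one
   16-byte value that can be stored with an aligned stvx.  */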
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
#ifndef __LITTLE_ENDIAN__
	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
#endif
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
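	/* r10 is the 0~15 byte tail (tested via cr1), r9 the number of
	   full quadwords (cr6 checks whether more than one remains) and
	   r8 the number of 32-byte loop iterations; the mtocrf puts the
	   low bits of r9 into cr7 so that bf 31 below can test whether
	   the quadword count is odd.  */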
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so that the loop below can always move
	   32 bytes per iteration.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is still faster than using
	   unaligned vector instructions.  */

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)