/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2017 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11          /* Use r11 so r3 kept unchanged.  */
#define src 4
#define cnt 5

        .machine power7
ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        cmpldi  cr1,cnt,31
        neg     0,3
        ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
                                       code.  */

/* Align the copy to a quadword before using VSX instructions.  This avoids
   alignment traps when memcpy is used on non-cacheable memory (for
   instance, memory-mapped I/O).  */
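/* r10 and r11 get the low 4 bits of DST and SRC.  If the two residues
   differ, the quadword-realigning path below is taken; if they are equal
   and DST is already 16-byte aligned, the copy can start immediately;
   otherwise both pointers are advanced together until DST (and hence SRC)
   is 16-byte aligned.  */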
        andi.   10,3,15
        clrldi  11,4,60
        cmpld   cr6,10,11       /* SRC and DST alignments match?  */

        mr      dst,3
        bne     cr6,L(copy_GE_32_unaligned)
        beq     L(aligned_copy)

        mtocrf  0x01,0
        clrldi  0,0,60

        /* Get the DST and SRC aligned to 16 bytes.  */
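        /* r0 = -DST, so its low 4 bits give the number of bytes needed to
           reach the next 16-byte boundary.  mtocrf 0x01,0 copied those bits
           into CR7: bit 31 selects a 1-byte move, bit 30 a 2-byte move,
           bit 29 a 4-byte move and bit 28 an 8-byte move.  The same value,
           masked by the clrldi above, is subtracted from CNT at 16: below.  */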
1:
        bf      31,2f
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1
2:
        bf      30,4f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
4:
        bf      29,8f
        lwz     6,0(src)
        addi    src,src,4
        stw     6,0(dst)
        addi    dst,dst,4
8:
        bf      28,16f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
16:
        subf    cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
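/* The loop is software-pipelined: the first 32 bytes of each iteration are
   loaded before entering it (and reloaded at L(aligned_128head) for the
   following iterations), so loads overlap the stores of the previous data.
   r6/r7/r8 hold the constant offsets 16/32/48 for the indexed lxvd2x and
   stxvd2x, each of which moves 16 bytes.  r12 = CNT / 128 is the iteration
   count; mtocrf 0x02,cnt saves the 64/32/16-byte remainder bits in CR6 for
   L(aligned_tail).  */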
L(aligned_copy):
        li      6,16
        li      7,32
        li      8,48
        mtocrf  0x02,cnt
        srdi    12,cnt,7
        cmpdi   12,0
        beq     L(aligned_tail)
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        mtctr   12
        b       L(aligned_128loop)

        .align  4
L(aligned_128head):
        /* For the second and later iterations of this loop.  */
        lxvd2x  6,0,src
        lxvd2x  7,src,6
L(aligned_128loop):
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        stxvd2x 6,0,dst
        addi    src,src,64
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        addi    dst,dst,64
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        addi    src,src,64
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        addi    dst,dst,64
        bdnz    L(aligned_128head)

L(aligned_tail):
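        /* Copy the remaining 0~127 bytes.  CR6 (set from CNT above) selects
           the 64-, 32- and 16-byte chunks via bits 25, 26 and 27; CR7 (set
           from CNT just below) selects the final 8-, 4-, 2- and 1-byte
           moves via bits 28 to 31.  */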
        mtocrf  0x01,cnt
        bf      25,32f
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        addi    src,src,64
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        addi    dst,dst,64
32:
        bf      26,16f
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        addi    src,src,32
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        addi    dst,dst,32
16:
        bf      27,8f
        lxvd2x  6,0,src
        addi    src,src,16
        stxvd2x 6,0,dst
        addi    dst,dst,16
8:
        bf      28,4f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
4:      /* Copies 4~7 bytes.  */
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr


/* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        mr      dst,3
        cmpldi  cr6,cnt,8
        mtocrf  0x01,cnt
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        andi.   0,8,3
        cmpldi  cr1,cnt,16
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
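        /* r8 = -SRC, so r0 = r8 & 3 is the number of bytes (0~3) needed to
           make SRC word-aligned; the andi. above also set cr0 for the beq.
           mtocrf 0x01,0 exposes those bits in CR7: bit 30 requests a 2-byte
           move and bit 31 a 1-byte move, and CNT is reduced accordingly.  */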
        mtocrf  0x01,0
        subf    cnt,0,cnt
2:
        bf      30,1f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
1:
        bf      31,L(end_4bytes_alignment)
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1

        .align  4
L(end_4bytes_alignment):
        cmpldi  cr1,cnt,16
        mtocrf  0x01,cnt

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(src)
        lwz     7,4(src)
        stw     6,0(dst)
        lwz     8,8(src)
        stw     7,4(dst)
        lwz     6,12(src)
        addi    src,src,16
        stw     8,8(dst)
        stw     6,12(dst)
        addi    dst,dst,16
8:      /* Copy 8 bytes.  */
        bf      28,L(tail4)
        lwz     6,0(src)
        lwz     7,4(src)
        addi    src,src,8
        stw     6,0(dst)
        stw     7,4(dst)
        addi    dst,dst,8

        .align  4
        /* Copies 4~7 bytes.  */
L(tail4):
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr

        .align  4
        /* Copies 2~3 bytes.  */
L(tail2):
        bf      30,1f
        lhz     6,0(src)
        sth     6,0(dst)
        bflr    31
        lbz     7,2(src)
        stb     7,2(dst)
        blr

        .align  4
L(tail5):
        bflr    31
        lbz     6,4(src)
        stb     6,4(dst)
        blr

        .align  4
1:
        bflr    31
        lbz     6,0(src)
        stb     6,0(dst)
        /* Return original DST pointer.  */
        blr


/* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,L(tail4)
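        /* cr6 still holds the comparison of CNT with 8, so the branch above
           is taken for fewer than 8 bytes; only the exact 8-byte case is
           handled here, with two word copies.  */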

        /* Though we could have used ld/std here, they would still be
           slow for unaligned accesses.  */

        lwz     6,0(src)
        lwz     7,4(src)
        stw     6,0(dst)
        stw     7,4(dst)
        blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
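/* The realignment works as follows: lvx ignores the low 4 bits of its
   effective address, so it always fetches the aligned quadword containing
   SRC.  lvsl (or lvsr on little-endian) builds a permute-control vector
   from SRC's misalignment, and vperm applied to two consecutive aligned
   quadwords extracts the 16 bytes that start at the unaligned SRC, ready
   to be stored with an aligned stvx to DST.  */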
        .align  4
L(copy_GE_32_unaligned):
        clrldi  0,0,60        /* Number of bytes until the 1st dst quadword.  */
        srdi    9,cnt,4       /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned; get it aligned.  */

        mtocrf  0x01,0
        subf    cnt,0,cnt

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
        bf      31,2f
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1
2:
        bf      30,4f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
4:
        bf      29,8f
        lwz     6,0(src)
        addi    src,src,4
        stw     6,0(dst)
        addi    dst,dst,4
8:
        bf      28,0f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
0:
        srdi    9,cnt,4       /* Number of full quadwords remaining.  */

        /* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two indexes to speed up the indexed vector operations.  */
        clrldi  10,cnt,60
        li      6,16          /* Index for 16-byte offsets.  */
        li      7,32          /* Index for 32-byte offsets.  */
        cmpldi  cr1,10,0
        srdi    8,cnt,5       /* Set up the loop counter.  */
        mtocrf  0x01,9
        cmpldi  cr6,9,1
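        /* At this point r9 holds the number of quadwords to copy, r10 the
           0~15 tail bytes and r8 the number of 32-byte loop iterations.
           cr1 records whether there is a tail at all, cr6 whether the main
           loop can be skipped (at most one quadword), and CR7 bit 31 (from
           r9) whether an odd quadword must be copied before the loop.  */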
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,src
#else
        lvsl    5,0,src
#endif
        lvx     3,0,src
        li      0,0
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes so the remaining quadwords can be moved
           32 bytes at a time by the loop below.  */
        lvx     4,src,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    src,src,16
        stvx    6,0,dst
        addi    dst,dst,16
        vor     3,4,4
        clrrdi  0,src,60

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is still faster than using
           unaligned vector accesses, though.  */

        lvx     4,src,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,src,7
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    src,src,32
        stvx    6,0,dst
        stvx    10,dst,6
        addi    dst,dst,32
        bdnz    L(unaligned_loop)

        clrrdi  0,src,60

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
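        /* cr1 (computed from CNT & 0xf above) allows an early return when
           there is no tail; otherwise CR7 is reloaded with the low bits of
           CNT so the scalar copies below can use the usual bf 28..31
           dispatch for the remaining 1~15 bytes.  */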
        mtocrf  0x01,cnt
        beqlr   cr1

        add     src,src,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
        /* Copy 8 bytes.  */
        bf      28,4f
        lwz     6,0(src)
        lwz     7,4(src)
        addi    src,src,8
        stw     6,0(dst)
        stw     7,4(dst)
        addi    dst,dst,8
4:      /* Copy 4~7 bytes.  */
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)