/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2014 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

#define dst 11		/* Use r11 so r3 kept unchanged.  */
#define src 4
#define cnt 5

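/* Overview (an informal sketch; the helper names here are hypothetical,
   the real code falls through the labels below):

     if (len < 32)
       copy_lt_32 (dst, src, len);              // scalar short-copy path
     else if ((dst & A) != (src & A))           // A = 7, or 15 on LE
       copy_ge_32_unaligned (dst, src, len);    // lvsl/lvsr + vperm path
     else
       copy_aligned (dst, src, len);            // lxvd2x/stxvd2x loop

   where the aligned path first peels 0~7 (LE: 0~15) leading bytes so
   both pointers reach a common 8-byte (LE: 16-byte) boundary.  */
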
	.machine power7
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

#ifdef __LITTLE_ENDIAN__
/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
   loop is only used for quadword aligned copies.  */
	andi.	10,3,15
	clrldi	11,4,60
#else
	andi.	10,3,7		/* Check alignment of DST.  */
	clrldi	11,4,61		/* Check alignment of SRC.  */
#endif
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
#ifdef __LITTLE_ENDIAN__
	clrldi	0,0,60
#else
	clrldi	0,0,61
#endif
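
/* Commentary: r0 = -DST from the "neg 0,3" above, so its low bits count
   the bytes needed to reach the next 8-byte (LE: 16-byte) boundary.
   mtocrf 0x01,0 copies those bits into CR7 for the "bf" tests below,
   and the clrldi masks r0 to the same bits so the peeled amount can be
   subtracted from cnt afterwards.  For example (LE), if DST ends in
   0x3 then -DST ends in 0b1101, so the 1-, 4- and 8-byte moves run,
   advancing DST by 13 to a 16-byte boundary.  */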

/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
#ifdef __LITTLE_ENDIAN__
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
#endif
	subf	cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* Loads for the 2nd and later iterations of this loop; the first
	   iteration's loads were issued before branching into
	   L(aligned_128loop).  */
	lxvd2x	6,0,src
	lxvd2x	7,src,6
L(aligned_128loop):
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	stxvd2x	6,0,dst
	addi	src,src,64
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	dst,dst,64
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

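/* Commentary: the tail handles the low seven bits of cnt as a binary
   decomposition.  CR bit 25 (set from cnt by the mtocrf 0x02 above)
   selects a 64-byte block, bit 26 a 32-byte block, bit 27 a 16-byte
   block, and bits 28~31 (mtocrf 0x01 below) the final 8/4/2/1 bytes,
   so any residue of 0~127 bytes is copied in one pass.  */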
L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	lxvd2x	8,src,7
	lxvd2x	9,src,8
	addi	src,src,64
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	stxvd2x	8,dst,7
	stxvd2x	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lxvd2x	6,0,src
	lxvd2x	7,src,6
	addi	src,src,32
	stxvd2x	6,0,dst
	stxvd2x	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lxvd2x	6,0,src
	addi	src,src,16
	stxvd2x	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

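/* Commentary: a rough C sketch of the 0~31 byte strategy below, with
   hypothetical fixed-size helpers copy1/copy2/... standing for the
   load/store pairs (the real code additionally forces 4-byte SRC
   alignment first when 9 or more bytes remain):

     if (n & 16) { copy16 (d, s); d += 16; s += 16; }
     if (n & 8)  { copy8 (d, s);  d += 8;  s += 8;  }
     if (n & 4)  { copy4 (d, s);  d += 4;  s += 4;  }
     if (n & 2)  { copy2 (d, s);  d += 2;  s += 2;  }
     if (n & 1)  copy1 (d, s);
 */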

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
	/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr
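
/* Commentary: L(tail4), L(tail2) and L(tail5) are shared exits.  They
   are reached both from the aligned paths above and from L(copy_LE_8)
   below, and rely on CR7 still holding the low bits of cnt from an
   earlier "mtocrf 0x01,cnt".  */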

	.align	4
	/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could have used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
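
/* Commentary: this is the classic lvsl/vperm realignment idiom.
   Roughly, on big-endian, with p = SRC & 15:

     v5 = lvsl (0, src)           // permute control {p, p+1, ..., p+15}
     v3 = lvx (src)               // aligned quadword containing SRC
     v4 = lvx (src + 16)          // the following aligned quadword
     v6 = vperm (v3, v4, v5)      // the 16 bytes starting at SRC

   lvx ignores the low four bits of its effective address, so every
   load is aligned and vperm stitches the two halves back together.
   Little-endian uses lvsr and swaps the vperm inputs, which accounts
   for the #ifdef pairs below.  */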
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
#ifndef __LITTLE_ENDIAN__
	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
#endif
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy one quadword here if the quadword count is odd, since the
	   loop below moves 32 bytes (two quadwords) per iteration.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

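	/* Commentary: v3 always holds the highest-addressed quadword
	   loaded so far (seeded by the lvx before the loop, or by the
	   "vor 3,4,4" in the odd-quadword pre-copy), so each iteration
	   needs only two fresh aligned loads to produce 32 bytes.  */
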
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (memcpy,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)