/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.
*/
25 | #include <sysdep.h> | |
26 | #include "asm-syntax.h" | |
78df0fcb | 27 | |
bfe6f5fa UD |
28 | /* Stack slots in the red-zone. */ |
29 | ||
30 | #ifdef USE_AS_MEMPCPY | |
31 | # define RETVAL (0) | |
32 | #else | |
33 | # define RETVAL (-8) | |
34 | #endif | |
35 | #define SAVE0 (RETVAL - 8) | |
36 | #define SAVE1 (SAVE0 - 8) | |
37 | #define SAVE2 (SAVE1 - 8) | |
38 | #define SAVE3 (SAVE2 - 8) | |
78df0fcb AJ |
39 | |
40 | .text | |
bfe6f5fa | 41 | |
b5cc329c UD |
42 | #if defined PIC && !defined NOT_IN_libc |
43 | ENTRY (__memcpy_chk) | |
bfe6f5fa | 44 | |
b5cc329c UD |
45 | cmpq %rdx, %rcx |
46 | jb HIDDEN_JUMPTARGET (__chk_fail) | |
bfe6f5fa | 47 | |
b5cc329c UD |
48 | END (__memcpy_chk) |
49 | #endif | |
bfe6f5fa UD |
50 | |
51 | ENTRY(memcpy) /* (void *, const void*, size_t) */ | |
52 | ||
53 | /* Handle tiny blocks. */ | |
54 | ||
55 | L(1try): /* up to 32B */ | |
78df0fcb | 56 | cmpq $32, %rdx |
bfe6f5fa UD |
57 | #ifndef USE_AS_MEMPCPY |
58 | movq %rdi, %rax /* save return value */ | |
78df0fcb | 59 | #endif |
bfe6f5fa | 60 | jae L(1after) |
78df0fcb | 61 | |
bfe6f5fa UD |
62 | L(1): /* 1-byte once */ |
63 | testb $1, %dl | |
64 | jz L(1a) | |
78df0fcb | 65 | |
bfe6f5fa UD |
66 | movzbl (%rsi), %ecx |
67 | movb %cl, (%rdi) | |
78df0fcb | 68 | |
bfe6f5fa UD |
69 | incq %rsi |
70 | incq %rdi | |
71 | ||
72 | .p2align 4,, 4 | |
73 | ||
74 | L(1a): /* 2-byte once */ | |
75 | testb $2, %dl | |
76 | jz L(1b) | |
77 | ||
78 | movzwl (%rsi), %ecx | |
79 | movw %cx, (%rdi) | |
80 | ||
81 | addq $2, %rsi | |
82 | addq $2, %rdi | |
83 | ||
84 | .p2align 4,, 4 | |
85 | ||
86 | L(1b): /* 4-byte once */ | |
87 | testb $4, %dl | |
88 | jz L(1c) | |
89 | ||
90 | movl (%rsi), %ecx | |
91 | movl %ecx, (%rdi) | |
92 | ||
93 | addq $4, %rsi | |
94 | addq $4, %rdi | |
95 | ||
96 | .p2align 4,, 4 | |
97 | ||
98 | L(1c): /* 8-byte once */ | |
99 | testb $8, %dl | |
100 | jz L(1d) | |
101 | ||
102 | movq (%rsi), %rcx | |
103 | movq %rcx, (%rdi) | |
104 | ||
105 | addq $8, %rsi | |
106 | addq $8, %rdi | |
107 | ||
108 | .p2align 4,, 4 | |
109 | ||
110 | L(1d): /* 16-byte loop */ | |
111 | andl $0xf0, %edx | |
112 | jz L(exit) | |
113 | ||
114 | .p2align 4 | |
115 | ||
116 | L(1loop): | |
0435403c UD |
117 | movq (%rsi), %rcx |
118 | movq 8(%rsi), %r8 | |
119 | movq %rcx, (%rdi) | |
120 | movq %r8, 8(%rdi) | |
bfe6f5fa UD |
121 | |
122 | subl $16, %edx | |
123 | ||
0435403c UD |
124 | leaq 16(%rsi), %rsi |
125 | leaq 16(%rdi), %rdi | |
bfe6f5fa UD |
126 | |
127 | jnz L(1loop) | |
128 | ||
129 | .p2align 4,, 4 | |
130 | ||
131 | L(exit): /* exit */ | |
132 | #ifdef USE_AS_MEMPCPY | |
133 | movq %rdi, %rax /* return value */ | |
134 | #else | |
135 | rep | |
136 | #endif | |
137 | retq | |
138 | ||
139 | .p2align 4 | |
140 | ||
141 | L(1after): | |
142 | #ifndef USE_AS_MEMPCPY | |
0435403c | 143 | movq %rax, RETVAL(%rsp) /* save return value */ |
bfe6f5fa UD |
144 | #endif |
145 | ||
146 | /* Align to the natural word size. */ | |
147 | ||
148 | L(aligntry): | |
0435403c | 149 | movl %esi, %ecx /* align by source */ |
bfe6f5fa UD |
150 | |
151 | andl $7, %ecx | |
152 | jz L(alignafter) /* already aligned */ | |
153 | ||
154 | L(align): /* align */ | |
0435403c | 155 | leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */ |
bfe6f5fa UD |
156 | subl $8, %ecx |
157 | ||
158 | .p2align 4 | |
159 | ||
160 | L(alignloop): /* 1-byte alignment loop */ | |
161 | movzbl (%rsi), %eax | |
162 | movb %al, (%rdi) | |
163 | ||
164 | incl %ecx | |
165 | ||
0435403c UD |
166 | leaq 1(%rsi), %rsi |
167 | leaq 1(%rdi), %rdi | |
78df0fcb | 168 | |
bfe6f5fa | 169 | jnz L(alignloop) |
78df0fcb | 170 | |
bfe6f5fa UD |
171 | .p2align 4 |
172 | ||
173 | L(alignafter): | |
174 | ||
0435403c | 175 | /* Handle mid-sized blocks. */ |
bfe6f5fa UD |
176 | |
177 | L(32try): /* up to 1KB */ | |
178 | cmpq $1024, %rdx | |
179 | ja L(32after) | |
180 | ||
181 | L(32): /* 32-byte loop */ | |
182 | movl %edx, %ecx | |
183 | shrl $5, %ecx | |
184 | jz L(32skip) | |
78df0fcb AJ |
185 | |
186 | .p2align 4 | |
78df0fcb | 187 | |
bfe6f5fa UD |
188 | L(32loop): |
189 | decl %ecx | |
78df0fcb | 190 | |
0435403c UD |
191 | movq (%rsi), %rax |
192 | movq 8(%rsi), %r8 | |
193 | movq 16(%rsi), %r9 | |
194 | movq 24(%rsi), %r10 | |
bfe6f5fa | 195 | |
0435403c UD |
196 | movq %rax, (%rdi) |
197 | movq %r8, 8(%rdi) | |
198 | movq %r9, 16(%rdi) | |
199 | movq %r10, 24(%rdi) | |
78df0fcb AJ |
200 | |
201 | leaq 32(%rsi), %rsi | |
202 | leaq 32(%rdi), %rdi | |
203 | ||
bfe6f5fa UD |
204 | jz L(32skip) /* help out smaller blocks */ |
205 | ||
206 | decl %ecx | |
207 | ||
0435403c UD |
208 | movq (%rsi), %rax |
209 | movq 8(%rsi), %r8 | |
210 | movq 16(%rsi), %r9 | |
211 | movq 24(%rsi), %r10 | |
bfe6f5fa | 212 | |
0435403c UD |
213 | movq %rax, (%rdi) |
214 | movq %r8, 8(%rdi) | |
215 | movq %r9, 16(%rdi) | |
216 | movq %r10, 24(%rdi) | |
bfe6f5fa | 217 | |
0435403c UD |
218 | leaq 32(%rsi), %rsi |
219 | leaq 32(%rdi), %rdi | |
78df0fcb | 220 | |
bfe6f5fa | 221 | jnz L(32loop) |
78df0fcb | 222 | |
bfe6f5fa UD |
223 | .p2align 4 |
224 | ||
225 | L(32skip): | |
226 | andl $31, %edx /* check for left overs */ | |
227 | #ifdef USE_AS_MEMPCPY | |
228 | jnz L(1) | |
229 | ||
230 | movq %rdi, %rax | |
78df0fcb | 231 | #else |
0435403c | 232 | movq RETVAL(%rsp), %rax |
bfe6f5fa | 233 | jnz L(1) |
0435403c | 234 | |
bfe6f5fa UD |
235 | rep |
236 | #endif | |
237 | retq /* exit */ | |
238 | ||
239 | .p2align 4 | |
240 | ||
241 | L(32after): | |
242 | ||
243 | /* | |
244 | In order to minimize code-size in RTLD, algorithms specific for | |
245 | larger blocks are excluded when building for RTLD. | |
246 | */ | |
247 | ||
0435403c | 248 | /* Handle blocks smaller than 1/2 L1. */ |
bfe6f5fa UD |
249 | |
250 | L(fasttry): /* first 1/2 L1 */ | |
251 | #ifndef NOT_IN_libc /* only up to this algorithm outside of libc.so */ | |
0435403c | 252 | movq __x86_64_data_cache_size_half(%rip), %r11 |
bfe6f5fa UD |
253 | cmpq %rdx, %r11 /* calculate the smaller of */ |
254 | cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ | |
255 | #endif | |
256 | ||
257 | L(fast): /* good ol' MOVS */ | |
258 | #ifndef NOT_IN_libc | |
259 | movq %r11, %rcx | |
260 | andq $-8, %r11 | |
261 | #else | |
262 | movq %rdx, %rcx | |
263 | #endif | |
264 | shrq $3, %rcx | |
265 | jz L(fastskip) | |
266 | ||
267 | rep | |
268 | movsq | |
269 | ||
270 | .p2align 4,, 4 | |
271 | ||
272 | L(fastskip): | |
273 | #ifndef NOT_IN_libc | |
274 | subq %r11, %rdx /* check for more */ | |
275 | testq $-8, %rdx | |
276 | jnz L(fastafter) | |
277 | #endif | |
278 | ||
279 | andl $7, %edx /* check for left overs */ | |
280 | #ifdef USE_AS_MEMPCPY | |
281 | jnz L(1) | |
282 | ||
283 | movq %rdi, %rax | |
284 | #else | |
0435403c | 285 | movq RETVAL(%rsp), %rax |
bfe6f5fa UD |
286 | jnz L(1) |
287 | ||
288 | rep | |
289 | #endif | |
290 | retq /* exit */ | |
291 | ||
292 | #ifndef NOT_IN_libc /* none of the algorithms below for RTLD */ | |
293 | ||
294 | .p2align 4 | |
295 | ||
296 | L(fastafter): | |
297 | ||
298 | /* Handle large blocks smaller than 1/2 L2. */ | |
299 | ||
300 | L(pretry): /* first 1/2 L2 */ | |
301 | movq __x86_64_shared_cache_size_half (%rip), %r8 | |
302 | cmpq %rdx, %r8 /* calculate the lesser of */ | |
303 | cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */ | |
304 | ||
305 | L(pre): /* 64-byte with prefetching */ | |
306 | movq %r8, %rcx | |
307 | andq $-64, %r8 | |
308 | shrq $6, %rcx | |
309 | jz L(preskip) | |
310 | ||
0435403c | 311 | movq %r14, SAVE0(%rsp) |
bfe6f5fa | 312 | cfi_rel_offset (%r14, SAVE0) |
0435403c | 313 | movq %r13, SAVE1(%rsp) |
bfe6f5fa | 314 | cfi_rel_offset (%r13, SAVE1) |
0435403c | 315 | movq %r12, SAVE2(%rsp) |
bfe6f5fa | 316 | cfi_rel_offset (%r12, SAVE2) |
0435403c | 317 | movq %rbx, SAVE3(%rsp) |
bfe6f5fa UD |
318 | cfi_rel_offset (%rbx, SAVE3) |
319 | ||
0435403c | 320 | cmpl $0, __x86_64_prefetchw(%rip) |
bfe6f5fa UD |
321 | jz L(preloop) /* check if PREFETCHW OK */ |
322 | ||
323 | .p2align 4 | |
324 | ||
325 | /* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */ | |
326 | ||
327 | L(prewloop): /* cache-line in state M */ | |
328 | decq %rcx | |
329 | ||
330 | movq (%rsi), %rax | |
331 | movq 8 (%rsi), %rbx | |
332 | movq 16 (%rsi), %r9 | |
333 | movq 24 (%rsi), %r10 | |
334 | movq 32 (%rsi), %r11 | |
335 | movq 40 (%rsi), %r12 | |
336 | movq 48 (%rsi), %r13 | |
337 | movq 56 (%rsi), %r14 | |
338 | ||
339 | prefetcht0 0 + 896 (%rsi) | |
340 | prefetcht0 64 + 896 (%rsi) | |
341 | ||
0435403c UD |
342 | movq %rax, (%rdi) |
343 | movq %rbx, 8(%rdi) | |
344 | movq %r9, 16(%rdi) | |
345 | movq %r10, 24(%rdi) | |
346 | movq %r11, 32(%rdi) | |
347 | movq %r12, 40(%rdi) | |
348 | movq %r13, 48(%rdi) | |
349 | movq %r14, 56(%rdi) | |
bfe6f5fa | 350 | |
0435403c UD |
351 | leaq 64(%rsi), %rsi |
352 | leaq 64(%rdi), %rdi | |
bfe6f5fa UD |
353 | |
354 | jz L(prebail) | |
355 | ||
356 | decq %rcx | |
357 | ||
0435403c UD |
358 | movq (%rsi), %rax |
359 | movq 8(%rsi), %rbx | |
360 | movq 16(%rsi), %r9 | |
361 | movq 24(%rsi), %r10 | |
362 | movq 32(%rsi), %r11 | |
363 | movq 40(%rsi), %r12 | |
364 | movq 48(%rsi), %r13 | |
365 | movq 56(%rsi), %r14 | |
366 | ||
367 | movq %rax, (%rdi) | |
368 | movq %rbx, 8(%rdi) | |
369 | movq %r9, 16(%rdi) | |
370 | movq %r10, 24(%rdi) | |
371 | movq %r11, 32(%rdi) | |
372 | movq %r12, 40(%rdi) | |
373 | movq %r13, 48(%rdi) | |
374 | movq %r14, 56(%rdi) | |
375 | ||
376 | prefetchw 896 - 64(%rdi) | |
377 | prefetchw 896 - 0(%rdi) | |
378 | ||
379 | leaq 64(%rsi), %rsi | |
380 | leaq 64(%rdi), %rdi | |
bfe6f5fa UD |
381 | |
382 | jnz L(prewloop) | |
383 | jmp L(prebail) | |
384 | ||
385 | .p2align 4 | |
386 | ||
387 | /* ... when PREFETCHW is not available. */ | |
388 | ||
389 | L(preloop): /* cache-line in state E */ | |
390 | decq %rcx | |
391 | ||
0435403c UD |
392 | movq (%rsi), %rax |
393 | movq 8(%rsi), %rbx | |
394 | movq 16(%rsi), %r9 | |
395 | movq 24(%rsi), %r10 | |
396 | movq 32(%rsi), %r11 | |
397 | movq 40(%rsi), %r12 | |
398 | movq 48(%rsi), %r13 | |
399 | movq 56(%rsi), %r14 | |
400 | ||
401 | prefetcht0 896 + 0(%rsi) | |
402 | prefetcht0 896 + 64(%rsi) | |
403 | ||
404 | movq %rax, (%rdi) | |
405 | movq %rbx, 8(%rdi) | |
406 | movq %r9, 16(%rdi) | |
407 | movq %r10, 24(%rdi) | |
408 | movq %r11, 32(%rdi) | |
409 | movq %r12, 40(%rdi) | |
410 | movq %r13, 48(%rdi) | |
411 | movq %r14, 56(%rdi) | |
bfe6f5fa UD |
412 | |
413 | leaq 64 (%rsi), %rsi | |
414 | leaq 64 (%rdi), %rdi | |
415 | ||
416 | jz L(prebail) | |
417 | ||
418 | decq %rcx | |
419 | ||
0435403c UD |
420 | movq (%rsi), %rax |
421 | movq 8(%rsi), %rbx | |
422 | movq 16(%rsi), %r9 | |
423 | movq 24(%rsi), %r10 | |
424 | movq 32(%rsi), %r11 | |
425 | movq 40(%rsi), %r12 | |
426 | movq 48(%rsi), %r13 | |
427 | movq 56(%rsi), %r14 | |
428 | ||
429 | prefetcht0 896 - 64(%rdi) | |
430 | prefetcht0 896 - 0(%rdi) | |
431 | ||
432 | movq %rax, (%rdi) | |
433 | movq %rbx, 8(%rdi) | |
434 | movq %r9, 16(%rdi) | |
435 | movq %r10, 24(%rdi) | |
436 | movq %r11, 32(%rdi) | |
437 | movq %r12, 40(%rdi) | |
438 | movq %r13, 48(%rdi) | |
439 | movq %r14, 56(%rdi) | |
440 | ||
441 | leaq 64(%rsi), %rsi | |
442 | leaq 64(%rdi), %rdi | |
bfe6f5fa UD |
443 | |
444 | jnz L(preloop) | |
445 | ||
446 | L(prebail): | |
0435403c | 447 | movq SAVE3(%rsp), %rbx |
bfe6f5fa | 448 | cfi_restore (%rbx) |
0435403c | 449 | movq SAVE2(%rsp), %r12 |
bfe6f5fa | 450 | cfi_restore (%r12) |
0435403c | 451 | movq SAVE1(%rsp), %r13 |
bfe6f5fa | 452 | cfi_restore (%r13) |
0435403c | 453 | movq SAVE0(%rsp), %r14 |
bfe6f5fa UD |
454 | cfi_restore (%r14) |
455 | ||
456 | /* .p2align 4 */ | |
457 | ||
458 | L(preskip): | |
459 | subq %r8, %rdx /* check for more */ | |
460 | testq $-64, %rdx | |
461 | jnz L(preafter) | |
462 | ||
463 | andl $63, %edx /* check for left overs */ | |
464 | #ifdef USE_AS_MEMPCPY | |
465 | jnz L(1) | |
466 | ||
467 | movq %rdi, %rax | |
468 | #else | |
0435403c | 469 | movq RETVAL(%rsp), %rax |
bfe6f5fa UD |
470 | jnz L(1) |
471 | ||
472 | rep | |
473 | #endif | |
474 | retq /* exit */ | |
475 | ||
476 | .p2align 4 | |
477 | ||
478 | L(preafter): | |
479 | ||
0435403c | 480 | /* Handle huge blocks. */ |
bfe6f5fa UD |
481 | |
482 | L(NTtry): | |
483 | ||
484 | L(NT): /* non-temporal 128-byte */ | |
485 | movq %rdx, %rcx | |
486 | shrq $7, %rcx | |
487 | jz L(NTskip) | |
488 | ||
0435403c | 489 | movq %r14, SAVE0(%rsp) |
bfe6f5fa | 490 | cfi_rel_offset (%r14, SAVE0) |
0435403c | 491 | movq %r13, SAVE1(%rsp) |
bfe6f5fa | 492 | cfi_rel_offset (%r13, SAVE1) |
0435403c | 493 | movq %r12, SAVE2(%rsp) |
bfe6f5fa UD |
494 | cfi_rel_offset (%r12, SAVE2) |
495 | ||
496 | .p2align 4 | |
497 | ||
498 | L(NTloop): | |
0435403c UD |
499 | prefetchnta 768(%rsi) |
500 | prefetchnta 832(%rsi) | |
bfe6f5fa UD |
501 | |
502 | decq %rcx | |
503 | ||
0435403c UD |
504 | movq (%rsi), %rax |
505 | movq 8(%rsi), %r8 | |
506 | movq 16(%rsi), %r9 | |
507 | movq 24(%rsi), %r10 | |
508 | movq 32(%rsi), %r11 | |
509 | movq 40(%rsi), %r12 | |
510 | movq 48(%rsi), %r13 | |
511 | movq 56(%rsi), %r14 | |
512 | ||
513 | movntiq %rax, (%rdi) | |
514 | movntiq %r8, 8(%rdi) | |
515 | movntiq %r9, 16(%rdi) | |
516 | movntiq %r10, 24(%rdi) | |
517 | movntiq %r11, 32(%rdi) | |
518 | movntiq %r12, 40(%rdi) | |
519 | movntiq %r13, 48(%rdi) | |
520 | movntiq %r14, 56(%rdi) | |
521 | ||
522 | movq 64(%rsi), %rax | |
523 | movq 72(%rsi), %r8 | |
524 | movq 80(%rsi), %r9 | |
525 | movq 88(%rsi), %r10 | |
526 | movq 96(%rsi), %r11 | |
527 | movq 104(%rsi), %r12 | |
528 | movq 112(%rsi), %r13 | |
529 | movq 120(%rsi), %r14 | |
530 | ||
531 | movntiq %rax, 64(%rdi) | |
532 | movntiq %r8, 72(%rdi) | |
533 | movntiq %r9, 80(%rdi) | |
534 | movntiq %r10, 88(%rdi) | |
535 | movntiq %r11, 96(%rdi) | |
536 | movntiq %r12, 104(%rdi) | |
537 | movntiq %r13, 112(%rdi) | |
538 | movntiq %r14, 120(%rdi) | |
539 | ||
540 | leaq 128(%rsi), %rsi | |
541 | leaq 128(%rdi), %rdi | |
bfe6f5fa UD |
542 | |
543 | jnz L(NTloop) | |
544 | ||
545 | sfence /* serialize memory stores */ | |
546 | ||
0435403c | 547 | movq SAVE2(%rsp), %r12 |
bfe6f5fa | 548 | cfi_restore (%r12) |
0435403c | 549 | movq SAVE1(%rsp), %r13 |
bfe6f5fa | 550 | cfi_restore (%r13) |
0435403c | 551 | movq SAVE0(%rsp), %r14 |
bfe6f5fa UD |
552 | cfi_restore (%r14) |
553 | ||
554 | L(NTskip): | |
555 | andl $127, %edx /* check for left overs */ | |
556 | #ifdef USE_AS_MEMPCPY | |
557 | jnz L(1) | |
558 | ||
559 | movq %rdi, %rax | |
560 | #else | |
0435403c | 561 | movq RETVAL(%rsp), %rax |
bfe6f5fa UD |
562 | jnz L(1) |
563 | ||
564 | rep | |
78df0fcb | 565 | #endif |
bfe6f5fa UD |
566 | retq /* exit */ |
567 | ||
568 | #endif /* !NOT_IN_libc */ | |
569 | ||
570 | END(memcpy) | |
78df0fcb | 571 | |
bfe6f5fa | 572 | #ifndef USE_AS_MEMPCPY |
85dd1003 | 573 | libc_hidden_builtin_def (memcpy) |
3dbfd811 | 574 | #endif |