/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007-2013 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

#include <sysdep.h>
#include "asm-syntax.h"

/* Stack slots in the red-zone. */

#ifdef USE_AS_MEMPCPY
# define RETVAL (0)
#else
# define RETVAL (-8)
# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  define memcpy __memcpy
#  undef libc_hidden_builtin_def
#  define libc_hidden_builtin_def(name) \
        .globl __GI_memcpy; __GI_memcpy = __memcpy
# endif
#endif
#define SAVE0 (RETVAL - 8)
#define SAVE1 (SAVE0 - 8)
#define SAVE2 (SAVE1 - 8)
#define SAVE3 (SAVE2 - 8)
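
/*
   These slots lie in the 128-byte red zone that the x86-64 SysV ABI
   guarantees below %rsp for leaf functions, so the return value and
   the callee-saved registers used by the larger loops can be spilled
   there without adjusting the stack pointer.  */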

        .text

#if defined PIC && !defined NOT_IN_libc
ENTRY_CHK (__memcpy_chk)

        cmpq    %rdx, %rcx
        jb      HIDDEN_JUMPTARGET (__chk_fail)

END_CHK (__memcpy_chk)
#endif
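
/*
   __memcpy_chk (dst, src, len, dstlen) verifies that the known size of
   the destination object (%rcx) is not smaller than the number of
   bytes to copy (%rdx); on overflow it jumps to __chk_fail, otherwise
   it falls straight through into memcpy below.  */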

ENTRY(memcpy)                           /* (void *, const void *, size_t) */

/* Handle tiny blocks. */

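/*
   Copies of fewer than 32 bytes are dispatched on the individual bits
   of the length: one byte if bit 0 is set, then two bytes for bit 1,
   four for bit 2, eight for bit 3, and finally a 16-byte loop for the
   remaining multiple of 16.  Roughly (a sketch, not the exact code):

       if (len & 1) { copy 1 byte;  src += 1; dst += 1; }
       if (len & 2) { copy 2 bytes; src += 2; dst += 2; }
       if (len & 4) { copy 4 bytes; src += 4; dst += 4; }
       if (len & 8) { copy 8 bytes; src += 8; dst += 8; }
       while (len & 0xf0) { copy 16 bytes; len -= 16; }

   The same code, starting at L(1), also finishes the left-over bytes
   of the larger loops further down.  */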
L(1try):                                /* up to 32B */
        cmpq    $32, %rdx
#ifndef USE_AS_MEMPCPY
        movq    %rdi, %rax              /* save return value */
#endif
        jae     L(1after)

L(1):                                   /* 1-byte once */
        testb   $1, %dl
        jz      L(1a)

        movzbl  (%rsi), %ecx
        movb    %cl, (%rdi)

        incq    %rsi
        incq    %rdi

        .p2align 4,, 4

L(1a):                                  /* 2-byte once */
        testb   $2, %dl
        jz      L(1b)

        movzwl  (%rsi), %ecx
        movw    %cx, (%rdi)

        addq    $2, %rsi
        addq    $2, %rdi

        .p2align 4,, 4

L(1b):                                  /* 4-byte once */
        testb   $4, %dl
        jz      L(1c)

        movl    (%rsi), %ecx
        movl    %ecx, (%rdi)

        addq    $4, %rsi
        addq    $4, %rdi

        .p2align 4,, 4

L(1c):                                  /* 8-byte once */
        testb   $8, %dl
        jz      L(1d)

        movq    (%rsi), %rcx
        movq    %rcx, (%rdi)

        addq    $8, %rsi
        addq    $8, %rdi

        .p2align 4,, 4

L(1d):                                  /* 16-byte loop */
        andl    $0xf0, %edx
        jz      L(exit)

        .p2align 4

L(1loop):
        movq    (%rsi), %rcx
        movq    8(%rsi), %r8
        movq    %rcx, (%rdi)
        movq    %r8, 8(%rdi)

        subl    $16, %edx

        leaq    16(%rsi), %rsi
        leaq    16(%rdi), %rdi

        jnz     L(1loop)

        .p2align 4,, 4

L(exit):                                /* exit */
#ifdef USE_AS_MEMPCPY
        movq    %rdi, %rax              /* return value */
#else
        rep
#endif
        retq
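
/*
   In the memcpy case the return above is "rep; retq".  The REP prefix
   makes the RET two bytes long, an idiom recommended in AMD
   optimization guides of that era because a one-byte RET that is a
   branch target can be mispredicted; the same pattern recurs at the
   other exits below.  */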

        .p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
        movq    %rax, RETVAL(%rsp)      /* save return value */
#endif

/* Align to the natural word size. */

L(aligntry):
        movl    %esi, %ecx              /* align by source */

        andl    $7, %ecx
        jz      L(alignafter)           /* already aligned */

L(align):                               /* align */
        leaq    -8(%rcx, %rdx), %rdx    /* calculate remaining bytes */
        subl    $8, %ecx
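
/*
   Here %ecx holds the source misalignment (1..7).  The LEA sets %rdx
   to len - (8 - misalignment), the bytes still to copy once the source
   is aligned, and the SUB turns %ecx into -(8 - misalignment) so the
   byte loop below can count up to zero.  */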

        .p2align 4

L(alignloop):                           /* 1-byte alignment loop */
        movzbl  (%rsi), %eax
        movb    %al, (%rdi)

        incl    %ecx

        leaq    1(%rsi), %rsi
        leaq    1(%rdi), %rdi

        jnz     L(alignloop)

        .p2align 4

L(alignafter):

/* Handle mid-sized blocks. */

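/*
   Blocks of up to 1 KiB (with the source now 8-byte aligned) are
   copied 32 bytes per iteration through general-purpose registers.
   The loop body is unrolled twice, with an extra exit test in the
   middle so that blocks that are not a multiple of 64 bytes do not pay
   for the whole unrolled body.  */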
L(32try):                               /* up to 1KB */
        cmpq    $1024, %rdx
        ja      L(32after)

L(32):                                  /* 32-byte loop */
        movl    %edx, %ecx
        shrl    $5, %ecx
        jz      L(32skip)

        .p2align 4

L(32loop):
        decl    %ecx

        movq    (%rsi), %rax
        movq    8(%rsi), %r8
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10

        movq    %rax, (%rdi)
        movq    %r8, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)

        leaq    32(%rsi), %rsi
        leaq    32(%rdi), %rdi

        jz      L(32skip)               /* help out smaller blocks */

        decl    %ecx

        movq    (%rsi), %rax
        movq    8(%rsi), %r8
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10

        movq    %rax, (%rdi)
        movq    %r8, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)

        leaq    32(%rsi), %rsi
        leaq    32(%rdi), %rdi

        jnz     L(32loop)

        .p2align 4

L(32skip):
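/*
   Whatever is left after the 32-byte chunks (len mod 32) is finished
   by jumping back to the tiny-block code at L(1); if nothing is left,
   the saved return value is reloaded (or, for mempcpy, taken from
   %rdi) and the function returns.  */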
        andl    $31, %edx               /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL(%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

        .p2align 4

L(32after):

/*
   In order to minimize code-size in RTLD, algorithms specific for
   larger blocks are excluded when building for RTLD.
*/

/* Handle blocks smaller than 1/2 L1. */

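/*
   Up to half the size of the L1 data cache (read at run time from
   __x86_64_data_cache_size_half, which glibc's cache-info startup code
   derives from CPUID) the copy is done with REP MOVSQ, 8 bytes per
   move.  When built for ld.so (NOT_IN_libc) that variable is not
   available, so REP MOVSQ simply handles everything that reaches this
   point.  */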
L(fasttry):                             /* first 1/2 L1 */
#ifndef NOT_IN_libc                     /* only up to this algorithm outside of libc.so */
        mov     __x86_64_data_cache_size_half(%rip), %R11_LP
        cmpq    %rdx, %r11              /* calculate the smaller of */
        cmovaq  %rdx, %r11              /* remaining bytes and 1/2 L1 */
#endif

L(fast):                                /* good ol' MOVS */
#ifndef NOT_IN_libc
        movq    %r11, %rcx
        andq    $-8, %r11
#else
        movq    %rdx, %rcx
#endif
        shrq    $3, %rcx
        jz      L(fastskip)

        rep
        movsq

        .p2align 4,, 4

L(fastskip):
#ifndef NOT_IN_libc
        subq    %r11, %rdx              /* check for more */
        testq   $-8, %rdx
        jnz     L(fastafter)
#endif

        andl    $7, %edx                /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL(%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

#ifndef NOT_IN_libc                     /* none of the algorithms below for RTLD */

        .p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2. */

L(pretry):                              /* first 1/2 L2 */
        mov     __x86_64_shared_cache_size_half(%rip), %R8_LP
        cmpq    %rdx, %r8               /* calculate the lesser of */
        cmovaq  %rdx, %r8               /* remaining bytes and 1/2 L2 */

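/*
   Between 1/2 L1 and 1/2 L2 (from __x86_64_shared_cache_size_half) the
   copy moves 64 bytes per iteration through registers, prefetching the
   source 896 bytes (14 cache lines) ahead.  Because the loop also
   needs %rbx and %r12-%r14, those registers are saved in the red-zone
   slots defined above.  When the CPU supports PREFETCHW
   (__x86_64_prefetchw is non-zero), destination lines are prefetched
   with PREFETCHW so they arrive already writeable, which, as the
   original comment notes, reduces cache-probe traffic on MP systems;
   otherwise plain PREFETCHT0 is used for both streams.  */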
L(pre):                                 /* 64-byte with prefetching */
        movq    %r8, %rcx
        andq    $-64, %r8
        shrq    $6, %rcx
        jz      L(preskip)

        movq    %r14, SAVE0(%rsp)
        cfi_rel_offset (%r14, SAVE0)
        movq    %r13, SAVE1(%rsp)
        cfi_rel_offset (%r13, SAVE1)
        movq    %r12, SAVE2(%rsp)
        cfi_rel_offset (%r12, SAVE2)
        movq    %rbx, SAVE3(%rsp)
        cfi_rel_offset (%rbx, SAVE3)

        cmpl    $0, __x86_64_prefetchw(%rip)
        jz      L(preloop)              /* check if PREFETCHW OK */

        .p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */

L(prewloop):                            /* cache-line in state M */
        decq    %rcx

        movq    (%rsi), %rax
        movq    8(%rsi), %rbx
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    32(%rsi), %r11
        movq    40(%rsi), %r12
        movq    48(%rsi), %r13
        movq    56(%rsi), %r14

        prefetcht0 0 + 896(%rsi)
        prefetcht0 64 + 896(%rsi)

        movq    %rax, (%rdi)
        movq    %rbx, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        movq    %r11, 32(%rdi)
        movq    %r12, 40(%rdi)
        movq    %r13, 48(%rdi)
        movq    %r14, 56(%rdi)

        leaq    64(%rsi), %rsi
        leaq    64(%rdi), %rdi

        jz      L(prebail)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8(%rsi), %rbx
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    32(%rsi), %r11
        movq    40(%rsi), %r12
        movq    48(%rsi), %r13
        movq    56(%rsi), %r14

        movq    %rax, (%rdi)
        movq    %rbx, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        movq    %r11, 32(%rdi)
        movq    %r12, 40(%rdi)
        movq    %r13, 48(%rdi)
        movq    %r14, 56(%rdi)

        prefetchw 896 - 64(%rdi)
        prefetchw 896 - 0(%rdi)

        leaq    64(%rsi), %rsi
        leaq    64(%rdi), %rdi

        jnz     L(prewloop)
        jmp     L(prebail)

        .p2align 4

/* ... when PREFETCHW is not available. */

L(preloop):                             /* cache-line in state E */
        decq    %rcx

        movq    (%rsi), %rax
        movq    8(%rsi), %rbx
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    32(%rsi), %r11
        movq    40(%rsi), %r12
        movq    48(%rsi), %r13
        movq    56(%rsi), %r14

        prefetcht0 896 + 0(%rsi)
        prefetcht0 896 + 64(%rsi)

        movq    %rax, (%rdi)
        movq    %rbx, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        movq    %r11, 32(%rdi)
        movq    %r12, 40(%rdi)
        movq    %r13, 48(%rdi)
        movq    %r14, 56(%rdi)

        leaq    64(%rsi), %rsi
        leaq    64(%rdi), %rdi

        jz      L(prebail)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8(%rsi), %rbx
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    32(%rsi), %r11
        movq    40(%rsi), %r12
        movq    48(%rsi), %r13
        movq    56(%rsi), %r14

        prefetcht0 896 - 64(%rdi)
        prefetcht0 896 - 0(%rdi)

        movq    %rax, (%rdi)
        movq    %rbx, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        movq    %r11, 32(%rdi)
        movq    %r12, 40(%rdi)
        movq    %r13, 48(%rdi)
        movq    %r14, 56(%rdi)

        leaq    64(%rsi), %rsi
        leaq    64(%rdi), %rdi

        jnz     L(preloop)

L(prebail):
        movq    SAVE3(%rsp), %rbx
        cfi_restore (%rbx)
        movq    SAVE2(%rsp), %r12
        cfi_restore (%r12)
        movq    SAVE1(%rsp), %r13
        cfi_restore (%r13)
        movq    SAVE0(%rsp), %r14
        cfi_restore (%r14)

        /* .p2align 4 */

L(preskip):
        subq    %r8, %rdx               /* check for more */
        testq   $-64, %rdx
        jnz     L(preafter)

        andl    $63, %edx               /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL(%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

        .p2align 4

L(preafter):

/* Handle huge blocks. */

L(NTtry):

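/*
   Anything larger than 1/2 L2 is copied 128 bytes per iteration with
   MOVNTI, a non-temporal store that bypasses the cache and so avoids
   evicting useful data with a block too big to be re-read from the
   cache anyway.  The source is still prefetched (PREFETCHNTA, about 12
   cache lines ahead), and an SFENCE after the loop orders the weakly
   ordered non-temporal stores before the function returns.  */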
L(NT):                                  /* non-temporal 128-byte */
        movq    %rdx, %rcx
        shrq    $7, %rcx
        jz      L(NTskip)

        movq    %r14, SAVE0(%rsp)
        cfi_rel_offset (%r14, SAVE0)
        movq    %r13, SAVE1(%rsp)
        cfi_rel_offset (%r13, SAVE1)
        movq    %r12, SAVE2(%rsp)
        cfi_rel_offset (%r12, SAVE2)

        .p2align 4

L(NTloop):
        prefetchnta 768(%rsi)
        prefetchnta 832(%rsi)

        decq    %rcx

        movq    (%rsi), %rax
        movq    8(%rsi), %r8
        movq    16(%rsi), %r9
        movq    24(%rsi), %r10
        movq    32(%rsi), %r11
        movq    40(%rsi), %r12
        movq    48(%rsi), %r13
        movq    56(%rsi), %r14

        movntiq %rax, (%rdi)
        movntiq %r8, 8(%rdi)
        movntiq %r9, 16(%rdi)
        movntiq %r10, 24(%rdi)
        movntiq %r11, 32(%rdi)
        movntiq %r12, 40(%rdi)
        movntiq %r13, 48(%rdi)
        movntiq %r14, 56(%rdi)

        movq    64(%rsi), %rax
        movq    72(%rsi), %r8
        movq    80(%rsi), %r9
        movq    88(%rsi), %r10
        movq    96(%rsi), %r11
        movq    104(%rsi), %r12
        movq    112(%rsi), %r13
        movq    120(%rsi), %r14

        movntiq %rax, 64(%rdi)
        movntiq %r8, 72(%rdi)
        movntiq %r9, 80(%rdi)
        movntiq %r10, 88(%rdi)
        movntiq %r11, 96(%rdi)
        movntiq %r12, 104(%rdi)
        movntiq %r13, 112(%rdi)
        movntiq %r14, 120(%rdi)

        leaq    128(%rsi), %rsi
        leaq    128(%rdi), %rdi

        jnz     L(NTloop)

        sfence                          /* serialize memory stores */
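
/*
   The non-temporal stores above are weakly ordered; the SFENCE makes
   them globally visible before memcpy returns, so callers that follow
   the call with ordinary loads or stores observe the copied data.  */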

        movq    SAVE2(%rsp), %r12
        cfi_restore (%r12)
        movq    SAVE1(%rsp), %r13
        cfi_restore (%r13)
        movq    SAVE0(%rsp), %r14
        cfi_restore (%r14)

L(NTskip):
        andl    $127, %edx              /* check for left overs */
#ifdef USE_AS_MEMPCPY
        jnz     L(1)

        movq    %rdi, %rax
#else
        movq    RETVAL(%rsp), %rax
        jnz     L(1)

        rep
#endif
        retq                            /* exit */

#endif /* !NOT_IN_libc */

END(memcpy)

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  undef memcpy
#  include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# endif
#endif