/* sysdeps/x86_64/memcpy.S — GNU C Library (glibc).  */
/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

#include <sysdep.h>
#include "asm-syntax.h"
/* Stack slots in the red-zone.  RETVAL holds the saved return value
   (the original destination pointer) for memcpy; mempcpy computes its
   return value at the end instead, so no slot is reserved (0).
   SAVE0..SAVE3 are red-zone slots used to spill callee-saved registers
   in the large-block paths.  */

#ifdef USE_AS_MEMPCPY
# define RETVAL	(0)
#else
# define RETVAL	(-8)
# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  define memcpy	__memcpy
#  undef libc_hidden_builtin_def
#  define libc_hidden_builtin_def(name) \
	.globl __GI_memcpy; __GI_memcpy = __memcpy
# endif
#endif
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)

	.text
#if defined PIC && !defined NOT_IN_libc
/* void *__memcpy_chk (void *dst, const void *src, size_t n, size_t dstlen)
   Fortified entry point: abort via __chk_fail when the destination
   buffer (%rcx = dstlen) is smaller than the copy length (%rdx = n);
   otherwise fall through into memcpy below.  */
ENTRY_CHK (__memcpy_chk)

	cmpq	%rdx, %rcx		/* dstlen < n?  */
	jb	HIDDEN_JUMPTARGET (__chk_fail)

END_CHK (__memcpy_chk)
#endif

/* void *memcpy (void *dst, const void *src, size_t n)
   ABI:  SysV AMD64.  In: %rdi = dst, %rsi = src, %rdx = n.
   Out:  %rax = dst (memcpy) or dst + n (USE_AS_MEMPCPY).
   Strategy, by size: tiny (<32B) straight moves; align to 8 by source;
   32B unrolled loop up to 1 KB; rep movsq up to 1/2 L1; 64B loop with
   prefetch up to 1/2 L2; non-temporal 128B loop beyond that.  */
ENTRY(memcpy)				/* (void *, const void*, size_t) */

/* Handle tiny blocks.  */

L(1try):				/* up to 32B */
	cmpq	$32, %rdx
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */
#endif
	jae	L(1after)

/* L(1) copies the low bits of %rdx one power of two at a time; it is
   also the shared tail for the larger algorithms' leftovers.  */
L(1):					/* 1-byte once */
	testb	$1, %dl
	jz	L(1a)

	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)

	incq	%rsi
	incq	%rdi

	.p2align 4,, 4

L(1a):					/* 2-byte once */
	testb	$2, %dl
	jz	L(1b)

	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)

	addq	$2, %rsi
	addq	$2, %rdi

	.p2align 4,, 4

L(1b):					/* 4-byte once */
	testb	$4, %dl
	jz	L(1c)

	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)

	addq	$4, %rsi
	addq	$4, %rdi

	.p2align 4,, 4

L(1c):					/* 8-byte once */
	testb	$8, %dl
	jz	L(1d)

	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)

	addq	$8, %rsi
	addq	$8, %rdi

	.p2align 4,, 4

L(1d):					/* 16-byte loop */
	andl	$0xf0, %edx		/* only 16-byte multiples remain */
	jz	L(exit)

	.p2align 4

L(1loop):
	movq	  (%rsi), %rcx
	movq	 8(%rsi), %r8
	movq	%rcx,  (%rdi)
	movq	 %r8, 8(%rdi)

	subl	$16, %edx

	leaq	16(%rsi), %rsi		/* lea: advance without touching flags */
	leaq	16(%rdi), %rdi

	jnz	L(1loop)

	.p2align 4,, 4

L(exit):				/* exit */
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value: one past last byte */
#else
	rep				/* "rep ret" — AMD branch-predictor idiom */
#endif
	retq

	.p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL(%rsp)	/* save return value in the red zone */
#endif

/* Align to the natural word size.  */

L(aligntry):
	movl	%esi, %ecx		/* align by source */

	andl	$7, %ecx
	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
	subl	$8, %ecx		/* %ecx = -(bytes to copy), counts up to 0 */

	.p2align 4

L(alignloop):				/* 1-byte alignment loop */
	movzbl	(%rsi), %eax
	movb	%al, (%rdi)

	incl	%ecx

	leaq	1(%rsi), %rsi
	leaq	1(%rdi), %rdi

	jnz	L(alignloop)

	.p2align 4

L(alignafter):

/* Handle mid-sized blocks.  */

L(32try):				/* up to 1KB */
	cmpq	$1024, %rdx
	ja	L(32after)

L(32):					/* 32-byte loop */
	movl	%edx, %ecx
	shrl	$5, %ecx		/* %ecx = number of 32-byte chunks */
	jz	L(32skip)

	.p2align 4

L(32loop):				/* unrolled 2x: two 32B copies per pass */
	decl	%ecx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax,   (%rdi)
	movq	 %r8,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jz	L(32skip)		/* help out smaller blocks */

	decl	%ecx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax,   (%rdi)
	movq	 %r8,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jnz	L(32loop)

	.p2align 4

L(32skip):
	andl	$31, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(32after):

/*
	In order to minimize code-size in RTLD, algorithms specific for
	larger blocks are excluded when building for RTLD.
*/

/* Handle blocks smaller than 1/2 L1.  */

L(fasttry):				/* first 1/2 L1 */
#ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
	mov	__x86_data_cache_size_half(%rip), %R11_LP
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
#endif

L(fast):				/* good ol' MOVS */
#ifndef NOT_IN_libc
	movq	%r11, %rcx
	andq	$-8, %r11		/* %r11 = bytes actually copied below */
#else
	movq	%rdx, %rcx
#endif
	shrq	$3, %rcx		/* qword count */
	jz	L(fastskip)

	rep
	movsq				/* copies %rcx qwords, advances rsi/rdi */

	.p2align 4,, 4

L(fastskip):
#ifndef NOT_IN_libc
	subq	%r11, %rdx		/* check for more */
	testq	$-8, %rdx
	jnz	L(fastafter)
#endif

	andl	$7, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */

	.p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2.  */

L(pretry):				/* first 1/2 L2 */
	mov	__x86_shared_cache_size_half (%rip), %R8_LP
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */
	movq	%r8, %rcx
	andq	$-64, %r8		/* %r8 = bytes this stage will copy */
	shrq	$6, %rcx		/* cache-line count */
	jz	L(preskip)

	/* Spill callee-saved registers into the red zone (leaf function,
	   SysV: 128 bytes below %rsp are safe).  */
	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3(%rsp)
	cfi_rel_offset (%rbx, SAVE3)

	cmpl	$0, __x86_prefetchw(%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */

	.p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems).  */

L(prewloop):				/* cache-line in state M */
	decq	%rcx

	movq	   (%rsi), %rax
	movq	  8(%rsi), %rbx
	movq	 16(%rsi), %r9
	movq	 24(%rsi), %r10
	movq	 32(%rsi), %r11
	movq	 40(%rsi), %r12
	movq	 48(%rsi), %r13
	movq	 56(%rsi), %r14

	prefetcht0	 0 + 896(%rsi)	/* stay ~14 lines ahead of the loads */
	prefetcht0	64 + 896(%rsi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	prefetchw	896 - 64(%rdi)	/* gain exclusive ownership early */
	prefetchw	896 -  0(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(prewloop)
	jmp	L(prebail)

	.p2align 4

/* ... when PREFETCHW is not available.  */

L(preloop):				/* cache-line in state E */
	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0	896 +  0(%rsi)
	prefetcht0	896 + 64(%rsi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0	896 - 64(%rdi)
	prefetcht0	896 -  0(%rdi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	 %r9, 16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(preloop)

L(prebail):
	movq	SAVE3(%rsp), %rbx
	cfi_restore (%rbx)
	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

/*	.p2align 4 */

L(preskip):
	subq	%r8, %rdx		/* check for more */
	testq	$-64, %rdx
	jnz	L(preafter)

	andl	$63, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(preafter):

/* Handle huge blocks.  */

L(NTtry):

L(NT):					/* non-temporal 128-byte */
	movq	%rdx, %rcx
	shrq	$7, %rcx		/* 128-byte chunk count */
	jz	L(NTskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)

	.p2align 4

L(NTloop):
	prefetchnta	768(%rsi)	/* bypass the cache hierarchy */
	prefetchnta	832(%rsi)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movntiq	%rax,   (%rdi)		/* non-temporal: avoid polluting caches */
	movntiq	 %r8,  8(%rdi)
	movntiq	 %r9, 16(%rdi)
	movntiq	%r10, 24(%rdi)
	movntiq	%r11, 32(%rdi)
	movntiq	%r12, 40(%rdi)
	movntiq	%r13, 48(%rdi)
	movntiq	%r14, 56(%rdi)

	movq	 64(%rsi), %rax
	movq	 72(%rsi), %r8
	movq	 80(%rsi), %r9
	movq	 88(%rsi), %r10
	movq	 96(%rsi), %r11
	movq	104(%rsi), %r12
	movq	112(%rsi), %r13
	movq	120(%rsi), %r14

	movntiq	%rax,  64(%rdi)
	movntiq	 %r8,  72(%rdi)
	movntiq	 %r9,  80(%rdi)
	movntiq	%r10,  88(%rdi)
	movntiq	%r11,  96(%rdi)
	movntiq	%r12, 104(%rdi)
	movntiq	%r13, 112(%rdi)
	movntiq	%r14, 120(%rdi)

	leaq	128(%rsi), %rsi
	leaq	128(%rdi), %rdi

	jnz	L(NTloop)

	sfence				/* serialize memory stores */

	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

L(NTskip):
	andl	$127, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#endif	/* !NOT_IN_libc */

END(memcpy)

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  undef memcpy
#  include <shlib-compat.h>
/* Export the renamed __memcpy as the public memcpy@GLIBC_2.14 symbol.  */
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# endif
#endif