/*
   Optimized memcpy for x86-64.

   Copyright (C) 2007 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
#include <sysdep.h>
#include "asm-syntax.h"

/* Scratch slots in the SysV AMD64 red-zone (the 128 bytes below %rsp
   that a leaf function may use without adjusting the stack pointer).
   RETVAL holds the saved return value for memcpy; mempcpy computes its
   return value at the end instead, so its slots start at offset 0.
   SAVE0..SAVE3 hold the callee-saved registers (%r14, %r13, %r12, %rbx)
   spilled by the prefetching and non-temporal loops below.  */

#ifdef USE_AS_MEMPCPY
# define RETVAL	(0)
#else
# define RETVAL	(-8)
#endif
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)

	.text
#if defined PIC && !defined NOT_IN_libc
/* void *__memcpy_chk (void *dst, const void *src, size_t n, size_t dstlen)

   Fortified entry point: abort via __chk_fail if the destination
   buffer (%rcx = dstlen) is smaller than the copy length (%rdx = n);
   otherwise fall straight through into memcpy, which must immediately
   follow this stub in the output section.  */
ENTRY (__memcpy_chk)

	cmpq	%rdx, %rcx		/* dstlen < n?  (unsigned) */
	jb	HIDDEN_JUMPTARGET (__chk_fail)

END (__memcpy_chk)
#endif
/* void *memcpy (void *dst, const void *src, size_t n)
   (or void *mempcpy when built with USE_AS_MEMPCPY)

   ABI:   SysV AMD64.
   In:    %rdi = dst, %rsi = src, %rdx = n.
   Out:   %rax = dst (memcpy) or dst + n (mempcpy).
   Clobb: %rcx, %r8-%r11, flags; %rbx/%r12-%r14 are spilled to the
	  red-zone slots around the large-block loops and restored.

   Strategy, by remaining size:
     <= 32 B	 straight-line moves (L(1)..L(1d));
     <= 1 KB	 unrolled 32-byte loop after aligning the source;
     <= 1/2 L1	 rep movsq;
     <= 1/2 L2	 64-byte loop with software prefetch;
     larger	 128-byte non-temporal (movnti) loop + sfence.  */

ENTRY(memcpy)				/* (void *, const void*, size_t) */

/* Handle tiny blocks.  */

L(1try):				/* up to 32B */
	cmpq	$32, %rdx
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */
#endif
	jae	L(1after)

L(1):					/* 1-byte once */
	testb	$1, %dl
	jz	L(1a)

	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)

	incq	%rsi
	incq	%rdi

	.p2align 4,, 4

L(1a):					/* 2-byte once */
	testb	$2, %dl
	jz	L(1b)

	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)

	addq	$2, %rsi
	addq	$2, %rdi

	.p2align 4,, 4

L(1b):					/* 4-byte once */
	testb	$4, %dl
	jz	L(1c)

	movl	(%rsi), %ecx
	movl	%ecx, (%rdi)

	addq	$4, %rsi
	addq	$4, %rdi

	.p2align 4,, 4

L(1c):					/* 8-byte once */
	testb	$8, %dl
	jz	L(1d)

	movq	(%rsi), %rcx
	movq	%rcx, (%rdi)

	addq	$8, %rsi
	addq	$8, %rdi

	.p2align 4,, 4

L(1d):					/* 16-byte loop */
	andl	$0xf0, %edx		/* at most one 16B chunk remains */
	jz	L(exit)

	.p2align 4

L(1loop):
	movq	 (%rsi), %rcx
	movq	8(%rsi), %r8
	movq	%rcx,  (%rdi)
	movq	%r8, 8(%rdi)

	subl	$16, %edx

	leaq	16(%rsi), %rsi		/* lea: advance without touching flags */
	leaq	16(%rdi), %rdi

	jnz	L(1loop)

	.p2align 4,, 4

L(exit):				/* exit */
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value: one past last byte */
#else
	rep				/* AMD: rep ret avoids branch-predictor penalty */
#endif
	retq

	.p2align 4

L(1after):
#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL(%rsp)	/* save return value in the red zone */
#endif

/* Align to the natural word size.  */

L(aligntry):
	movl	%esi, %ecx		/* align by source */

	andl	$7, %ecx
	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
	subl	$8, %ecx		/* loop counter: -(8 - misalignment) */

	.p2align 4

L(alignloop):				/* 1-byte alignment loop */
	movzbl	(%rsi), %eax
	movb	%al, (%rdi)

	incl	%ecx

	leaq	1(%rsi), %rsi
	leaq	1(%rdi), %rdi

	jnz	L(alignloop)

	.p2align 4

L(alignafter):

/* Handle mid-sized blocks.  */

L(32try):				/* up to 1KB */
	cmpq	$1024, %rdx
	ja	L(32after)

L(32):					/* 32-byte loop */
	movl	%edx, %ecx
	shrl	$5, %ecx		/* rcx = n / 32 iterations */
	jz	L(32skip)

	.p2align 4

L(32loop):				/* unrolled x2: two 32B chunks per pass */
	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jz	L(32skip)		/* help out smaller blocks */

	decl	%ecx

	movq	(%rsi), %rax
	movq	8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10

	movq	%rax, (%rdi)
	movq	%r8, 8(%rdi)
	movq	%r9, 16(%rdi)
	movq	%r10, 24(%rdi)

	leaq	32(%rsi), %rsi
	leaq	32(%rdi), %rdi

	jnz	L(32loop)

	.p2align 4

L(32skip):
	andl	$31, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)			/* finish tail via tiny-block code */

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(32after):

/*
	In order to minimize code-size in RTLD, algorithms specific for
	larger blocks are excluded when building for RTLD.
*/

/* Handle blocks smaller than 1/2 L1.  */

L(fasttry):				/* first 1/2 L1 */
#ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
	movq	__x86_64_data_cache_size_half(%rip), %r11
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
#endif

L(fast):				/* good ol' MOVS */
#ifndef NOT_IN_libc
	movq	%r11, %rcx
	andq	$-8, %r11		/* r11 = bytes actually copied here */
#else
	movq	%rdx, %rcx
#endif
	shrq	$3, %rcx		/* rcx = qword count for rep movsq */
	jz	L(fastskip)

	rep
	movsq

	.p2align 4,, 4

L(fastskip):
#ifndef NOT_IN_libc
	subq	%r11, %rdx		/* check for more */
	testq	$-8, %rdx
	jnz	L(fastafter)
#endif

	andl	$7, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */

	.p2align 4

L(fastafter):

/* Handle large blocks smaller than 1/2 L2.  */

L(pretry):				/* first 1/2 L2 */
	movq	__x86_64_shared_cache_size_half (%rip), %r8
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */
	movq	%r8, %rcx
	andq	$-64, %r8		/* r8 = bytes this stage will copy */
	shrq	$6, %rcx		/* rcx = cache-line count */
	jz	L(preskip)

	/* Spill callee-saved registers to the red zone (leaf function,
	   no %rsp adjustment needed).  */
	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3(%rsp)
	cfi_rel_offset (%rbx, SAVE3)

	cmpl	$0, __x86_64_prefetchw(%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */

	.p2align 4

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems).  */

L(prewloop):				/* cache-line in state M */
	decq	%rcx

	movq	   (%rsi), %rax
	movq	 8 (%rsi), %rbx
	movq	16 (%rsi), %r9
	movq	24 (%rsi), %r10
	movq	32 (%rsi), %r11
	movq	40 (%rsi), %r12
	movq	48 (%rsi), %r13
	movq	56 (%rsi), %r14

	prefetcht0  0 + 896 (%rsi)	/* prefetch source ~14 lines ahead */
	prefetcht0 64 + 896 (%rsi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%r9,  16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%r9,  16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	prefetchw 896 - 64(%rdi)	/* destination lines to Modified state */
	prefetchw 896 -  0(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(prewloop)
	jmp	L(prebail)

	.p2align 4

/* ... when PREFETCHW is not available.  */

L(preloop):				/* cache-line in state E */
	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 +  0(%rsi)
	prefetcht0 896 + 64(%rsi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%r9,  16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64 (%rsi), %rsi
	leaq	64 (%rdi), %rdi

	jz	L(prebail)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %rbx
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	prefetcht0 896 - 64(%rdi)	/* plain prefetch for the destination too */
	prefetcht0 896 -  0(%rdi)

	movq	%rax,   (%rdi)
	movq	%rbx,  8(%rdi)
	movq	%r9,  16(%rdi)
	movq	%r10, 24(%rdi)
	movq	%r11, 32(%rdi)
	movq	%r12, 40(%rdi)
	movq	%r13, 48(%rdi)
	movq	%r14, 56(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	L(preloop)

L(prebail):
	movq	SAVE3(%rsp), %rbx
	cfi_restore (%rbx)
	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

/*	.p2align 4 */

L(preskip):
	subq	%r8, %rdx		/* check for more */
	testq	$-64, %rdx
	jnz	L(preafter)

	andl	$63, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

	.p2align 4

L(preafter):

/* Handle huge blocks.  */

L(NTtry):

L(NT):					/* non-temporal 128-byte */
	movq	%rdx, %rcx
	shrq	$7, %rcx		/* rcx = 128B chunk count */
	jz	L(NTskip)

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)

	.p2align 4

L(NTloop):
	prefetchnta 768(%rsi)		/* non-temporal: bypass cache pollution */
	prefetchnta 832(%rsi)

	decq	%rcx

	movq	  (%rsi), %rax
	movq	 8(%rsi), %r8
	movq	16(%rsi), %r9
	movq	24(%rsi), %r10
	movq	32(%rsi), %r11
	movq	40(%rsi), %r12
	movq	48(%rsi), %r13
	movq	56(%rsi), %r14

	movntiq	%rax,   (%rdi)		/* streaming stores, no RFO traffic */
	movntiq	%r8,   8(%rdi)
	movntiq	%r9,  16(%rdi)
	movntiq	%r10, 24(%rdi)
	movntiq	%r11, 32(%rdi)
	movntiq	%r12, 40(%rdi)
	movntiq	%r13, 48(%rdi)
	movntiq	%r14, 56(%rdi)

	movq	 64(%rsi), %rax
	movq	 72(%rsi), %r8
	movq	 80(%rsi), %r9
	movq	 88(%rsi), %r10
	movq	 96(%rsi), %r11
	movq	104(%rsi), %r12
	movq	112(%rsi), %r13
	movq	120(%rsi), %r14

	movntiq	%rax,  64(%rdi)
	movntiq	%r8,   72(%rdi)
	movntiq	%r9,   80(%rdi)
	movntiq	%r10,  88(%rdi)
	movntiq	%r11,  96(%rdi)
	movntiq	%r12, 104(%rdi)
	movntiq	%r13, 112(%rdi)
	movntiq	%r14, 120(%rdi)

	leaq	128(%rsi), %rsi
	leaq	128(%rdi), %rdi

	jnz	L(NTloop)

	sfence				/* serialize memory stores */

	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

L(NTskip):
	andl	$127, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax
#else
	movq	RETVAL(%rsp), %rax
	jnz	L(1)

	rep
#endif
	retq				/* exit */

#endif	/* !NOT_IN_libc */

END(memcpy)

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
#endif