/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
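/* Worked example of the dispatch above (illustrative, not part of the
   original source), assuming VEC_SIZE == 32 (AVX):
     - n = 50:  case 1, two possibly overlapping 32-byte loads/stores.
     - n = 200: case 3, the whole copy fits in eight vector registers.
     - n = 64 KiB with the destination above an overlapping source:
       case 4, backward 4 * VEC loop with aligned stores.
     - n = 64 KiB otherwise: case 5, forward 4 * VEC loop.
     - n >= __x86_shared_non_temporal_threshold without overlap:
       case 6, the same loops but with non-temporal stores.  */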
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
# define VZEROUPPER vzeroupper
/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up a REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
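/* For illustration, the default works out to 2048 bytes for
   VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32 (AVX) and
   8192 bytes for VEC_SIZE == 64 (AVX-512).  */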
# define PREFETCH(addr) prefetcht0 addr
/* Assume 64-byte prefetch size.  */
# define PREFETCH_SIZE 64
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
        PREFETCH ((offset)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
        PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
#  error Unsupported PREFETCHED_LOAD_SIZE!
# error Unsupported PREFETCH_SIZE!
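/* Worked example of PREFETCH_ONE_SET (illustrative): with
   VEC_SIZE == 16, PREFETCHED_LOAD_SIZE is 64 == PREFETCH_SIZE, so one
   prefetcht0 is issued; with VEC_SIZE == 32 it is 128 == 2 * 64, so two
   prefetches 64 bytes apart; with VEC_SIZE == 64 it is 256, so four
   prefetches covering one full 4 * VEC iteration of the copy loop.  */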
# error SECTION is not defined!
        .section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
        /* Clear the upper 32 bits.  */
        cmp $VEC_SIZE, %RDX_LP
        cmp $(VEC_SIZE * 2), %RDX_LP
#if !defined USE_MULTIARCH || !IS_IN (libc)
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
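/* Illustrative sketch, not part of the original source: the same
   overlapping-vector trick written in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32.  Both loads are issued before either store, so an
   overlapping src/dst pair still copies correctly; when n == VEC_SIZE
   the two accesses simply coincide, which is why no extra branch is
   needed.  The function name is made up for the example.

     #include <immintrin.h>
     #include <stddef.h>

     static void
     copy_vec_to_2x_vec (char *dst, const char *src, size_t n)
     {
       // Caller guarantees 32 <= n <= 64.
       __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i tail = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       _mm256_storeu_si256 ((__m256i *) dst, head);
       _mm256_storeu_si256 ((__m256i *) (dst + n - 32), tail);
     }
 */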
#if !defined USE_MULTIARCH || !IS_IN (libc)
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
ENTRY (__mempcpy_chk_erms)
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
        /* Skip zero length.  */
        test %RDX_LP, %RDX_LP
ENTRY (__memmove_chk_erms)
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
        /* Skip zero length.  */
        test %RDX_LP, %RDX_LP
        /* Source == destination is less common.  */
        lea (%rsi,%rcx), %RDX_LP
        leaq -1(%rdi,%rcx), %rdi
        leaq -1(%rsi,%rcx), %rsi
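/* Illustrative sketch, not part of the original source: what the ERMS
   variant boils down to in C with GCC/Clang inline assembly.  Only the
   backward pointer setup (the two leaq above) is visible here; the
   rep movsb and direction-flag handling are assumptions, and the
   function name is made up.

     #include <stddef.h>

     static void
     erms_memmove (char *dst, const char *src, size_t n)
     {
       if (n == 0 || dst == src)
         return;
       if (dst < src || dst >= src + n)
         {
           // Forward copy: one REP MOVSB does the whole job.
           asm volatile ("rep movsb"
                         : "+D" (dst), "+S" (src), "+c" (n) :: "memory");
         }
       else
         {
           // Overlapping with dst above src: point at the last byte and
           // copy downwards with the direction flag set.
           dst += n - 1;
           src += n - 1;
           asm volatile ("std; rep movsb; cld"
                         : "+D" (dst), "+S" (src), "+c" (n) :: "memory");
         }
     }
 */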
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
        jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
        /* Clear the upper 32 bits.  */
        cmp $VEC_SIZE, %RDX_LP
        cmp $(VEC_SIZE * 2), %RDX_LP
        ja L(movsb_more_2x_vec)
        /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
        /* Source == destination is less common.  */
        leaq (%rsi,%rdx), %r9
        /* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
        jb L(more_8x_vec_backward)
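/* Illustrative sketch, not part of the original source: the conditions
   checked above before committing to REP MOVSB, written as a C
   predicate.  The parameter names are made up; the thresholds stand in
   for REP_MOVSB_THRESHOLD and __x86_shared_non_temporal_threshold.

     #include <stdbool.h>
     #include <stddef.h>

     static bool
     want_rep_movsb (const char *dst, const char *src, size_t n,
                     size_t rep_movsb_threshold,
                     size_t non_temporal_threshold)
     {
       if (n < rep_movsb_threshold)      // small: vector code wins
         return false;
       if (n >= non_temporal_threshold)  // huge: non-temporal path
         return false;
       // An overlapping copy with dst above src must run backward, and
       // backward REP MOVSB is slow, so use the vector backward loop.
       if (dst > src && dst < src + n)
         return false;
       return true;
     }
 */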
        /* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
        /* From 32 to 63.  No branch when size == 32.  */
        vmovdqu (%rsi), %ymm0
        vmovdqu -32(%rsi,%rdx), %ymm1
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm1, -32(%rdi,%rdx)
        /* From 16 to 31.  No branch when size == 16.  */
        vmovdqu (%rsi), %xmm0
        vmovdqu -16(%rsi,%rdx), %xmm1
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm1, -16(%rdi,%rdx)
        /* From 8 to 15.  No branch when size == 8.  */
        movq -8(%rsi,%rdx), %rcx
        movq %rcx, -8(%rdi,%rdx)
        /* From 4 to 7.  No branch when size == 4.  */
        movl -4(%rsi,%rdx), %ecx
        movl %ecx, -4(%rdi,%rdx)
        /* From 2 to 3.  No branch when size == 2.  */
        movzwl -2(%rsi,%rdx), %ecx
        movw %cx, -2(%rdi,%rdx)
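/* Illustrative sketch, not part of the original source: the 8-to-15
   byte case above in portable C.  Two possibly overlapping 8-byte
   loads happen before the two stores, so the copy is memmove-safe and
   branch-free for the whole range; the function name is made up.

     #include <stdint.h>
     #include <string.h>
     #include <stddef.h>

     static void
     copy_8_to_15 (char *dst, const char *src, size_t n)
     {
       uint64_t head, tail;            // caller guarantees 8 <= n <= 15
       memcpy (&head, src, 8);
       memcpy (&tail, src + n - 8, 8);
       memcpy (dst, &head, 8);
       memcpy (dst + n - 8, &tail, 8);
     }
 */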
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
        cmpq $REP_MOVSB_THRESHOLD, %rdx
        /* More than 2 * VEC and there may be overlap between
           destination and source.  */
        cmpq $(VEC_SIZE * 8), %rdx
        cmpq $(VEC_SIZE * 4), %rdx
        /* Copy from 4 * VEC to 8 * VEC, inclusively.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
        VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
        VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
        VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
        /* Copy from 2 * VEC to 4 * VEC.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
        VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
        VMOVU %VEC(0), (%rdi)
        VMOVU %VEC(1), VEC_SIZE(%rdi)
        VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
        VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
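/* Worked example for the 2 * VEC to 4 * VEC block above, assuming
   VEC_SIZE == 32 and n == 100: the loads read offsets 0, 32, 68 and 36
   of the source and the stores mirror them on the destination, so the
   two middle accesses overlap by 28 bytes.  Every byte of the 100 is
   covered, and because all four loads complete before any store, the
   overlapping bytes are simply rewritten with identical data even when
   source and destination themselves overlap.  */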
        ja L(more_8x_vec_backward)
        /* Source == destination is less common.  */
        /* Load the first VEC and last 4 * VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
        VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
        VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
        VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
        /* Save start and stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %rcx
        /* Align destination for aligned stores in the loop.  Compute
           how much destination is misaligned.  */
        andq $(VEC_SIZE - 1), %r8
        /* Get the negative of offset for alignment.  */
        /* Adjust destination which should be aligned now.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
L(loop_4x_vec_forward):
        /* Copy 4 * VEC at a time forward.  */
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $(VEC_SIZE * 4), %rsi
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(0), (%rdi)
        VMOVA %VEC(1), VEC_SIZE(%rdi)
        VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $(VEC_SIZE * 4), %rdi
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
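/* Illustrative sketch, not part of the original source: the forward
   8 * VEC-and-up path above in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32 and n > 8 * 32.  It mirrors the structure: save the
   first VEC and the last 4 * VEC, bump the destination to the next
   32-byte boundary so the loop can use aligned stores, then finish the
   ragged head and tail from the saved registers.  The function name is
   made up and the register allocation obviously differs.

     #include <immintrin.h>
     #include <stddef.h>
     #include <stdint.h>

     static void
     forward_4x_vec (char *dst, const char *src, size_t n)
     {
       // Loaded up front so they can be stored after the loop; this
       // finishes the tail and the unaligned head without branches.
       __m256i head  = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i tail0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       __m256i tail1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
       __m256i tail2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
       __m256i tail3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));
       char *start = dst;
       char *stop = dst + n - 32;
       // Advance dst to the next 32-byte boundary; adjust src and n so
       // the three stay in step (the negative-offset trick above).
       size_t skew = 32 - ((uintptr_t) dst & 31);
       dst += skew;
       src += skew;
       n -= skew;
       while (n > 4 * 32)
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_store_si256 ((__m256i *) dst, v0);          // aligned
           _mm256_store_si256 ((__m256i *) (dst + 32), v1);
           _mm256_store_si256 ((__m256i *) (dst + 64), v2);
           _mm256_store_si256 ((__m256i *) (dst + 96), v3);
           src += 128;
           dst += 128;
           n -= 128;
         }
       // At most 4 * VEC remain; the pre-loaded tail covers them.
       _mm256_storeu_si256 ((__m256i *) stop, tail0);
       _mm256_storeu_si256 ((__m256i *) (stop - 32), tail1);
       _mm256_storeu_si256 ((__m256i *) (stop - 64), tail2);
       _mm256_storeu_si256 ((__m256i *) (stop - 96), tail3);
       _mm256_storeu_si256 ((__m256i *) start, head);
     }

   As in the assembly, the forward loop is only used when the
   destination is below the source or the buffers do not overlap, so
   its stores never clobber bytes that are still to be read.  */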
L(more_8x_vec_backward):
        /* Load the first 4 * VEC and last VEC to support overlapping
           addresses.  */
        VMOVU (%rsi), %VEC(4)
        VMOVU VEC_SIZE(%rsi), %VEC(5)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
        VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
        /* Save stop of the destination buffer.  */
        leaq -VEC_SIZE(%rdi, %rdx), %r11
        /* Align destination end for aligned stores in the loop.  Compute
           how much destination end is misaligned.  */
        leaq -VEC_SIZE(%rsi, %rdx), %rcx
        andq $(VEC_SIZE - 1), %r8
        /* Adjust the end of destination which should be aligned now.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Check non-temporal store threshold.  */
        cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
L(loop_4x_vec_backward):
        /* Copy 4 * VEC at a time backward.  */
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $(VEC_SIZE * 4), %rcx
        subq $(VEC_SIZE * 4), %rdx
        VMOVA %VEC(1), -VEC_SIZE(%r9)
        VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $(VEC_SIZE * 4), %r9
        cmpq $(VEC_SIZE * 4), %rdx
        ja L(loop_4x_vec_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
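/* Worked example of the end alignment above, assuming VEC_SIZE == 32:
   if the last destination vector would start at address 0x13e5, it is
   misaligned by 0x13e5 & 31 = 5 bytes.  Backing the destination end,
   the source cursor and the remaining length off by those 5 bytes (the
   exact register moves are elided above) makes every VMOVA in the
   backward loop a 32-byte-aligned store, while the first 4 * VEC and
   last VEC saved before the loop still cover the bytes skipped at
   either end.  */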
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rdi, %rdx), %r10
        jb L(loop_4x_vec_forward)
L(loop_large_forward):
        /* Copy 4 * VEC at a time forward with non-temporal stores.  */
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rsi), %VEC(0)
        VMOVU VEC_SIZE(%rsi), %VEC(1)
        VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
        VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
        addq $PREFETCHED_LOAD_SIZE, %rsi
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%rdi)
        VMOVNT %VEC(1), VEC_SIZE(%rdi)
        VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
        VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
        addq $PREFETCHED_LOAD_SIZE, %rdi
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_forward)
        /* Store the last 4 * VEC.  */
        VMOVU %VEC(5), (%rcx)
        VMOVU %VEC(6), -VEC_SIZE(%rcx)
        VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
        VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
        /* Store the first VEC.  */
        VMOVU %VEC(4), (%r11)
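/* Illustrative sketch, not part of the original source: the core of the
   non-temporal forward loop in C with AVX2 intrinsics, assuming
   VEC_SIZE == 32, a 32-byte-aligned destination and no overlap between
   the buffers.  The function name is made up, the tail is finished with
   ordinary stores from pre-loaded registers exactly as above, and the
   trailing sfence is part of the sketch: streaming stores are weakly
   ordered, so they are normally fenced before later code relies on the
   data.

     #include <immintrin.h>
     #include <stddef.h>

     static void
     stream_4x_vec (char *dst, const char *src, size_t n)
     {
       while (n > 4 * 32)
         {
           // Prefetch two and three iterations ahead, 64 bytes apart,
           // matching PREFETCH_ONE_SET for VEC_SIZE == 32.
           _mm_prefetch (src + 256, _MM_HINT_T0);
           _mm_prefetch (src + 320, _MM_HINT_T0);
           _mm_prefetch (src + 384, _MM_HINT_T0);
           _mm_prefetch (src + 448, _MM_HINT_T0);
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           _mm256_stream_si256 ((__m256i *) dst, v0);       // bypass cache
           _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
           _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
           _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
           src += 128;
           dst += 128;
           n -= 128;
         }
       _mm_sfence ();
       // The remaining (at most 4 * VEC) bytes are written with regular
       // stores from registers loaded before the loop, as in the code
       // above.
     }
 */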
        /* Don't use non-temporal store if there is overlap between
           destination and source since destination may be in cache
           when source is loaded.  */
        leaq (%rcx, %rdx), %r10
        jb L(loop_4x_vec_backward)
L(loop_large_backward):
        /* Copy 4 * VEC at a time backward with non-temporal stores.  */
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
        PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
        VMOVU (%rcx), %VEC(0)
        VMOVU -VEC_SIZE(%rcx), %VEC(1)
        VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
        VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
        subq $PREFETCHED_LOAD_SIZE, %rcx
        subq $PREFETCHED_LOAD_SIZE, %rdx
        VMOVNT %VEC(0), (%r9)
        VMOVNT %VEC(1), -VEC_SIZE(%r9)
        VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
        VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
        subq $PREFETCHED_LOAD_SIZE, %r9
        cmpq $PREFETCHED_LOAD_SIZE, %rdx
        ja L(loop_large_backward)
        /* Store the first 4 * VEC.  */
        VMOVU %VEC(4), (%rdi)
        VMOVU %VEC(5), VEC_SIZE(%rdi)
        VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
        VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
        /* Store the last VEC.  */
        VMOVU %VEC(8), (%r11)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))