/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
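/* Illustration (not part of the original source): strategies 1 and 2
   above as a minimal C sketch of the 8-to-15-byte case.  Both loads
   happen before either store, so the copy stays correct when the
   buffers overlap, and no branch on the exact size is needed; the
   head and tail stores simply overlap each other when size < 16:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static void
	copy_8_to_15 (char *dst, const char *src, size_t size)
	{
	  uint64_t head, tail;
	  memcpy (&head, src, 8);
	  memcpy (&tail, src + size - 8, 8);
	  memcpy (dst, &head, 8);
	  memcpy (dst + size - 8, &tail, 8);
	}

   L(between_8_15) below is the same idea with two movq loads and two
   movq stores; the VEC paths apply it at vector width.  */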
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif
/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
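/* For the default formula above this works out to 2048 bytes for SSE2
   (VEC_SIZE == 16), 4096 bytes for AVX (VEC_SIZE == 32) and 8192 bytes
   for AVX-512 (VEC_SIZE == 64).  */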
/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif
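/* Under the default above this evaluates to 1 for the AVX and AVX-512
   builds (VEC_SIZE > 16) and to 0 for SSE2 (VEC_SIZE == 16), so the
   short distance check after L(movsb) below is only assembled into the
   wider-vector variants.  */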
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif
#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
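/* Illustration (not part of the original source): with VEC_SIZE == 32,
   PREFETCHED_LOAD_SIZE is 128 == 2 * PREFETCH_SIZE, so
   PREFETCH_ONE_SET (1, (%rsi), 128) expands to

	prefetcht0 (128)(%rsi); prefetcht0 (128 + 1 * 64)(%rsi)

   i.e. the two cache lines one loop iteration ahead of the current
   loads; dir == -1 prefetches the same distance backward.  */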
#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
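/* Illustration (not part of the original source): __memmove_erms above
   copies backward (std; rep movsb; cld) only when the destination
   starts inside the source, i.e. src < dst < src + len; for every
   other layout a forward "rep movsb" is already overlap-safe.  */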
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	jmp	2f
# endif
1:
# if AVOID_SHORT_DISTANCE_REP_MOVSB
	movq	%rsi, %rcx
	subq	%rdi, %rcx
2:
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */
	cmpl	$63, %ecx
	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
# endif
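/* Illustration (not part of the original source): with %rsi == %rdi + 32
   the code above reaches label 1 and computes RCX = 32, so the jbe
   falls back to the vector copy at L(more_2x_vec); "rep movsb" performs
   poorly when source and destination are that close together.  Since
   only ECX is tested, a distance of N*4GB + 32 takes the same
   fallback.  */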
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	/* Copy 1 byte.  */
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
L(between_16_31):
	/* From 16 to 31.  No branch when size == 16.  */
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
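/* Illustration (not part of the original source): with VEC_SIZE == 32
   and %rdi == 0x1005, %r8 is first 0x1005 & 31 == 5 and then
   5 - 32 == -27, so the adjustments above advance %rsi and %rdi by 27
   bytes to the 32-byte boundary 0x1020 and shrink %rdx by the same 27
   bytes.  The skipped head was already loaded into %VEC(4) and is
   stored to (%r11) after the loop.  */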
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif

strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))