/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If the address of the destination is greater than the address of
      the source, copy backward 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the first 4 * VEC and the last VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   5. Otherwise, copy forward 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and the first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
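
/* As a rough illustration of steps 1-3 above, the overlapping-copy
   trick looks like this in C (a sketch only; copy_vec_to_2x_vec is a
   hypothetical name and VEC_SIZE stands in for the vector width
   chosen by the including file):

       #include <string.h>

       #define VEC_SIZE 32   // assumed width for this sketch

       // Copy VEC_SIZE <= n <= 2 * VEC_SIZE bytes without branching
       // on n: load both ends of the source before storing both ends
       // of the destination, so an overlap cannot clobber unread data.
       static void
       copy_vec_to_2x_vec (char *dst, const char *src, size_t n)
       {
         unsigned char head[VEC_SIZE], tail[VEC_SIZE];
         memcpy (head, src, VEC_SIZE);                 // VMOVU (%rsi), %VEC(0)
         memcpy (tail, src + n - VEC_SIZE, VEC_SIZE);  // VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
         memcpy (dst, head, VEC_SIZE);                 // VMOVU %VEC(0), (%rdi)
         memcpy (dst + n - VEC_SIZE, tail, VEC_SIZE);  // VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
       }

   The 4 * VEC and 8 * VEC cases below generalize the same idea to
   more registers.  */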
#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif
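
/* For example, an including file may define MEMMOVE_SYMBOL(p,s) as
   p##_avx_##s, in which case MEMMOVE_SYMBOL (__memmove, unaligned_erms)
   expands to __memmove_avx_unaligned_erms.  */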
#ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
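
/* For example, with PREFETCH_SIZE == 64 and VEC_SIZE == 64 (so
   PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE),
   PREFETCH_ONE_SET (1, (%rsi), 256) expands to

       prefetcht0 (256)(%rsi)
       prefetcht0 (256 + 64)(%rsi)
       prefetcht0 (256 + 64 * 2)(%rsi)
       prefetcht0 (256 + 64 * 3)(%rsi)

   one prefetch per 64-byte cache line of the next 4 * VEC block;
   dir == -1 walks the cache lines downward instead.  */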
#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif
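
/* The ERMS variants above are thin wrappers around REP MOVSB, which
   copies RCX bytes from (RSI) to (RDI).  A hedged C sketch of the
   forward case (memmove_erms_forward is a hypothetical name; the real
   entry points also handle the overlapping backward case with
   STD/REP MOVSB/CLD as above):

       #include <stddef.h>

       static void *
       memmove_erms_forward (void *dst, const void *src, size_t n)
       {
         void *ret = dst;
         // REP MOVSB copies forward while the direction flag is
         // clear; the constraints place dst/src/n in RDI/RSI/RCX.
         __asm__ volatile ("rep movsb"
                           : "+D" (dst), "+S" (src), "+c" (n)
                           :: "memory");
         return ret;
       }
  */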
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size ==
	   VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
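
/* Putting the thresholds together, the unaligned_erms dispatch is
   roughly (a simplified C-style sketch; helper names are hypothetical
   and the real code interleaves these checks with the size buckets
   below):

       if (n <= 2 * VEC_SIZE)
         copy_with_two_overlapping_vecs ();    // L(last_2x_vec)
       else if (n <= __x86_rep_movsb_threshold)
         copy_4x_vec_loops ();                 // L(more_2x_vec)
       else if (n >= __x86_shared_non_temporal_threshold)
         copy_8x_vec_maybe_non_temporal ();    // L(more_8x_vec)
       else if (dst < src || dst >= src + n)
         rep_movsb ();                         // forward REP MOVSB
       else
         copy_4x_vec_backward ();              // avoid slow backward REP MOVSB
  */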
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63 bytes.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31 bytes.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15 bytes.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7 bytes.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3 bytes.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
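
/* The small cases above reuse the two-ended trick with scalar
   registers.  A C sketch of L(between_4_7) (copy_4_to_7 is a
   hypothetical name):

       #include <string.h>

       // Copy 4 <= n <= 7 bytes with two possibly overlapping 4-byte
       // moves; no branch on n inside the range.
       static void
       copy_4_to_7 (char *dst, const char *src, size_t n)
       {
         unsigned int head, tail;
         memcpy (&tail, src + n - 4, 4);   // movl -4(%rsi,%rdx), %ecx
         memcpy (&head, src, 4);           // movl (%rsi), %esi
         memcpy (dst + n - 4, &tail, 4);   // movl %ecx, -4(%rdi,%rdx)
         memcpy (dst, &head, 4);           // movl %esi, (%rdi)
       }
  */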
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret
L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
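
/* A C sketch of the forward loop above (forward_loop is a
   hypothetical name; VEC_SIZE is assumed to be 32 here).  The first
   VEC and the last 4 * VEC were loaded before the loop and are stored
   after it, which is what makes the unaligned head and tail safe for
   overlapping buffers.  The backward loop below mirrors this with all
   directions reversed:

       #include <string.h>

       #define VEC_SIZE 32   // assumed width for this sketch

       // dst has been rounded up to a VEC_SIZE boundary by the
       // alignment code above; src keeps whatever alignment it had.
       static void
       forward_loop (char *dst, const char *src, size_t n)
       {
         while (n > 4 * VEC_SIZE)
           {
             unsigned char buf[4 * VEC_SIZE];
             memcpy (buf, src, 4 * VEC_SIZE);   // 4 unaligned VMOVU loads
             memcpy (dst, buf, 4 * VEC_SIZE);   // 4 aligned VMOVA stores
             src += 4 * VEC_SIZE;
             dst += 4 * VEC_SIZE;
             n -= 4 * VEC_SIZE;
           }
       }
  */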
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)

	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
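
/* A hedged intrinsics sketch of the non-temporal loop above, assuming
   VEC_SIZE == 32 (AVX); large_forward_nt is a hypothetical name.
   Streaming stores bypass the cache, which is why they are used only
   above __x86_shared_non_temporal_threshold and only when the buffers
   do not overlap:

       #include <immintrin.h>
       #include <stddef.h>

       // dst must be 32-byte aligned, as arranged by the alignment
       // code above; the saved head/tail VECs are stored separately.
       static void
       large_forward_nt (char *dst, const char *src, size_t n)
       {
         while (n > 128)
           {
             __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
             __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
             __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
             __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
             _mm256_stream_si256 ((__m256i *) dst, v0);
             _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
             _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
             _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
             src += 128;
             dst += 128;
             n -= 128;
           }
         // Streaming stores are weakly ordered; fence before the copy
         // is observed as complete.
         _mm_sfence ();
       }
  */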
L(large_backward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)

	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif

strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))