1 /* memcpy optimized with SSE2 unaligned memory access instructions.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
/* NOTE(review): the two lines above are the tail of a multi-line #if
   whose opening line is outside this extract.  The leading numbers on
   every line are original-file line numbers from the dump this chunk
   was taken from; many intermediate lines are elided throughout.  */
/* i386 SSE2-unaligned memcpy building block; the same body is also
   assembled as memmove/bcopy/mempcpy variants.  Default names: */
25 # include "asm-syntax.h"
28 # define MEMCPY __memcpy_sse2_unaligned
29 # define MEMCPY_CHK __memcpy_chk_sse2_unaligned
/* CFI bookkeeping paired with the 4-byte pushl/popl wrappers below so
   the unwind info tracks the saved register and CFA adjustment.
   NOTE(review): CFI_POP's cfi_restore continuation line is elided in
   this extract.  */
42 # define CFI_PUSH(REG) \
43 cfi_adjust_cfa_offset (4); \
44 cfi_rel_offset (REG, 0)
46 # define CFI_POP(REG) \
47 cfi_adjust_cfa_offset (-4); \
50 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
51 # define POP(REG) popl REG; CFI_POP (REG)
53 # define PARMS 8 /* Preserve EBX. */
54 # define ENTRANCE PUSH (%ebx);
55 # define RETURN_END POP (%ebx); ret
/* RETURN is used at mid-function exits: restore EBX and return, then
   re-assert the CFI push state for the code that follows it.  */
56 # define RETURN RETURN_END; CFI_PUSH (%ebx)
58 .section .text.sse2,"ax",@progbits
/* __memcpy_chk entry fragment: branch to __chk_fail when the
   destination buffer is smaller than the copy length (the cmp that
   feeds this jb is elided in this extract).  */
59 # if !defined USE_AS_BCOPY && defined SHARED
63 jb HIDDEN_JUMPTARGET (__chk_fail)
/* ===== memmove-only code: BACKWARD copy path (overlapping regions
   with dst above src).  Visible register roles: %eax = src,
   %edx = dst, %ecx = len.  NOTE(review): this extract drops many
   original lines — notably the cmp/sub instructions that set flags
   for the conditional jumps — so the size thresholds below are taken
   from the label names and surviving comments; confirm against the
   complete file.  */
74 # ifdef USE_AS_MEMMOVE
77 L(mm_len_0_or_more_backward):
78 /* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
81 jbe L(mm_len_0_16_bytes_backward)
/* NOTE(review): the backward checks use signed jg while the forward
   path below uses unsigned ja — equivalent only while len < 2 GiB.  */
84 jg L(mm_len_32_or_more_backward)
86 /* Copy [0..32] and return. */
/* Tail-16 copy; the matching head-16 load/store lines are elided.
   Head and tail stores may overlap in the middle — harmless.  */
88 movdqu -16(%eax, %ecx), %xmm1
90 movdqu %xmm1, -16(%edx, %ecx)
93 L(mm_len_32_or_more_backward):
95 jg L(mm_len_64_or_more_backward)
97 /* Copy [0..64] and return. */
/* All loads complete before any store, so overlap cannot clobber
   not-yet-read source bytes.  */
99 movdqu 16(%eax), %xmm1
100 movdqu -16(%eax, %ecx), %xmm2
101 movdqu -32(%eax, %ecx), %xmm3
103 movdqu %xmm1, 16(%edx)
104 movdqu %xmm2, -16(%edx, %ecx)
105 movdqu %xmm3, -32(%edx, %ecx)
108 L(mm_len_64_or_more_backward):
110 jg L(mm_len_128_or_more_backward)
112 /* Copy [0..128] and return. */
/* Same loads-then-stores pattern, 4 head + 4 tail 16-byte chunks.  */
114 movdqu 16(%eax), %xmm1
115 movdqu 32(%eax), %xmm2
116 movdqu 48(%eax), %xmm3
117 movdqu -64(%eax, %ecx), %xmm4
118 movdqu -48(%eax, %ecx), %xmm5
119 movdqu -32(%eax, %ecx), %xmm6
120 movdqu -16(%eax, %ecx), %xmm7
122 movdqu %xmm1, 16(%edx)
123 movdqu %xmm2, 32(%edx)
124 movdqu %xmm3, 48(%edx)
125 movdqu %xmm4, -64(%edx, %ecx)
126 movdqu %xmm5, -48(%edx, %ecx)
127 movdqu %xmm6, -32(%edx, %ecx)
128 movdqu %xmm7, -16(%edx, %ecx)
131 L(mm_len_128_or_more_backward):
/* Bulk path: stash the unaligned head chunks (xmm5-7) and the final
   tail chunk (xmm0) up front, then loop over the aligned middle.
   Much of the setup (register pushes, alignment arithmetic) is
   elided in this extract.  */
140 /* Aligning the address of destination. */
142 movdqu 16(%eax), %xmm5
143 movdqu 32(%eax), %xmm6
144 movdqu 48(%eax), %xmm7
145 leal (%edx, %ecx), %esi
146 movdqu -16(%eax, %ecx), %xmm0
154 leal (%eax, %ebx), %eax
/* Choose cached (movaps) vs non-temporal (movntdq) loop by comparing
   against half the shared cache size: a compile-time constant when
   SHARED_CACHE_SIZE_HALF is defined, otherwise the runtime value
   exported as __x86_shared_cache_size_half (via GOT when PIC).  */
157 # ifdef SHARED_CACHE_SIZE_HALF
158 cmp $SHARED_CACHE_SIZE_HALF, %edi
163 add $_GLOBAL_OFFSET_TABLE_, %ebx
164 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
167 cmp __x86_shared_cache_size_half, %edi
170 jae L(mm_large_page_loop_backward)
173 L(mm_main_loop_backward):
/* 64 bytes per iteration walking downward; unaligned loads from
   %eax, 16-byte-aligned movaps stores through %ecx (repurposed here
   as the destination cursor — loop-control lines elided).  */
175 prefetcht0 -128(%eax)
177 movdqu -64(%eax), %xmm0
178 movdqu -48(%eax), %xmm1
179 movdqu -32(%eax), %xmm2
180 movdqu -16(%eax), %xmm3
181 movaps %xmm0, -64(%ecx)
183 movaps %xmm1, -48(%ecx)
184 movaps %xmm2, -32(%ecx)
185 movaps %xmm3, -16(%ecx)
188 jnz L(mm_main_loop_backward)
/* Write back the stashed tail (xmm0) and head (xmm5-7) chunks.  */
191 movdqu %xmm0, -16(%esi)
193 movdqu %xmm5, 16(%edx)
194 movdqu %xmm6, 32(%edx)
195 movdqu %xmm7, 48(%edx)
197 jmp L(mm_return_pop_all)
199 /* Copy [0..16] and return. */
200 L(mm_len_0_16_bytes_backward):
/* Dispatch on small lengths (the test instructions feeding these
   jumps are elided).  Integer loads complete before stores, keeping
   overlap safety.  */
202 jnz L(mm_len_9_16_bytes_backward)
205 jnz L(mm_len_5_8_bytes_backward)
211 jne L(mm_len_3_4_bytes_backward)
212 movzbl -1(%eax,%ecx), %ebx
214 movb %bl, -1(%edx,%ecx)
218 L(mm_len_3_4_bytes_backward):
219 movzwl -2(%eax,%ecx), %ebx
221 movw %bx, -2(%edx,%ecx)
225 L(mm_len_9_16_bytes_backward):
227 movl -4(%eax,%ecx), %ebx
228 movl -8(%eax,%ecx), %esi
229 movl %ebx, -4(%edx,%ecx)
230 movl %esi, -8(%edx,%ecx)
/* Re-dispatch after copying the outer 8 bytes; the length adjustment
   (sub) preceding this jump is elided in this extract — confirm.  */
233 jmp L(mm_len_0_16_bytes_backward)
235 L(mm_len_5_8_bytes_backward):
237 movl -4(%eax,%ecx), %eax
239 movl %eax, -4(%edx,%ecx)
242 /* Big length copy backward part. */
244 L(mm_large_page_loop_backward):
/* Same 64-byte backward loop but with non-temporal stores to avoid
   cache pollution on very large copies; presumably followed by an
   sfence before return (elided here) — confirm in the full file.  */
245 movdqu -64(%eax), %xmm0
246 movdqu -48(%eax), %xmm1
247 movdqu -32(%eax), %xmm2
248 movdqu -16(%eax), %xmm3
249 movntdq %xmm0, -64(%ecx)
251 movntdq %xmm1, -48(%ecx)
252 movntdq %xmm2, -32(%ecx)
253 movntdq %xmm3, -16(%ecx)
256 jnz L(mm_large_page_loop_backward)
260 movdqu %xmm0, -16(%esi)
262 movdqu %xmm5, 16(%edx)
263 movdqu %xmm6, 32(%edx)
264 movdqu %xmm7, 48(%edx)
266 jmp L(mm_return_pop_all)
/* ===== memmove: FORWARD copy path (non-overlapping, or dst below
   src).  Same register roles as above: %eax = src, %edx = dst,
   %ecx = len.  NOTE(review): the cmp/test lines feeding the branches
   are elided in this extract.  */
274 /* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
277 jbe L(mm_len_0_16_bytes_forward)
280 ja L(mm_len_32_or_more_forward)
282 /* Copy [0..32] and return. */
/* Tail-16 copy; the matching head-16 load/store lines are elided.  */
284 movdqu -16(%eax, %ecx), %xmm1
286 movdqu %xmm1, -16(%edx, %ecx)
289 L(mm_len_32_or_more_forward):
291 ja L(mm_len_64_or_more_forward)
293 /* Copy [0..64] and return. */
/* Loads complete before stores (overlap-safe), head + tail chunks.  */
295 movdqu 16(%eax), %xmm1
296 movdqu -16(%eax, %ecx), %xmm2
297 movdqu -32(%eax, %ecx), %xmm3
299 movdqu %xmm1, 16(%edx)
300 movdqu %xmm2, -16(%edx, %ecx)
301 movdqu %xmm3, -32(%edx, %ecx)
304 L(mm_len_64_or_more_forward):
306 ja L(mm_len_128_or_more_forward)
308 /* Copy [0..128] and return. */
310 movdqu 16(%eax), %xmm1
311 movdqu 32(%eax), %xmm2
312 movdqu 48(%eax), %xmm3
313 movdqu -64(%eax, %ecx), %xmm4
314 movdqu -48(%eax, %ecx), %xmm5
315 movdqu -32(%eax, %ecx), %xmm6
316 movdqu -16(%eax, %ecx), %xmm7
318 movdqu %xmm1, 16(%edx)
319 movdqu %xmm2, 32(%edx)
320 movdqu %xmm3, 48(%edx)
321 movdqu %xmm4, -64(%edx, %ecx)
322 movdqu %xmm5, -48(%edx, %ecx)
323 movdqu %xmm6, -32(%edx, %ecx)
324 movdqu %xmm7, -16(%edx, %ecx)
327 L(mm_len_128_or_more_forward):
/* Bulk path: stash the unaligned last 64 bytes in xmm4-7 up front,
   then loop forward over the aligned middle.  Setup arithmetic and
   register pushes are largely elided in this extract.  */
332 /* Aligning the address of destination. */
333 movdqu -16(%eax, %ecx), %xmm4
334 movdqu -32(%eax, %ecx), %xmm5
335 movdqu -48(%eax, %ecx), %xmm6
336 movdqu -64(%eax, %ecx), %xmm7
337 leal (%edx, %ecx), %esi
/* Cached vs non-temporal loop selection, as in the backward path.  */
351 # ifdef SHARED_CACHE_SIZE_HALF
352 cmp $SHARED_CACHE_SIZE_HALF, %edi
357 add $_GLOBAL_OFFSET_TABLE_, %ebx
358 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
361 cmp __x86_shared_cache_size_half, %edi
364 jae L(mm_large_page_loop_forward)
367 L(mm_main_loop_forward):
/* 64 bytes per iteration; unaligned loads from %eax, aligned movaps
   stores through %ecx.  The first 16-byte load/store pair of each
   iteration (xmm0) is elided in this extract.  */
372 movdqu 16(%eax), %xmm1
373 movdqu 32(%eax), %xmm2
374 movdqu 48(%eax), %xmm3
377 movaps %xmm1, 16(%ecx)
378 movaps %xmm2, 32(%ecx)
379 movaps %xmm3, 48(%ecx)
382 jnz L(mm_main_loop_forward)
/* Write back the stashed unaligned tail via %esi = dst + len.  */
386 movdqu %xmm4, -16(%esi)
387 movdqu %xmm5, -32(%esi)
388 movdqu %xmm6, -48(%esi)
389 movdqu %xmm7, -64(%esi)
391 jmp L(mm_return_pop_all)
393 L(mm_len_0_16_bytes_forward):
/* Small-length dispatch (length tests elided).  */
395 jne L(mm_len_9_16_bytes_forward)
398 jne L(mm_len_5_8_bytes_forward)
404 jne L(mm_len_2_4_bytes_forward)
405 movzbl -1(%eax,%ecx), %ebx
407 movb %bl, -1(%edx,%ecx)
411 L(mm_len_2_4_bytes_forward):
412 movzwl -2(%eax,%ecx), %ebx
414 movw %bx, -2(%edx,%ecx)
418 L(mm_len_5_8_bytes_forward):
420 movl -4(%eax,%ecx), %eax
422 movl %eax, -4(%edx,%ecx)
425 L(mm_len_9_16_bytes_forward):
427 movq -8(%eax, %ecx), %xmm1
429 movq %xmm1, -8(%edx, %ecx)
/* Shared bulk-path exit: restores the pushed registers and returns
   (the pop/ret sequence is elided in this extract).  */
432 L(mm_return_pop_all):
438 /* Big length copy forward part. */
440 L(mm_large_page_loop_forward):
/* Non-temporal variant for very large copies.  The first 16-byte
   load of each iteration (into xmm0) is elided, but its movntdq
   store is visible below.  Presumably followed by an sfence before
   return (elided) — confirm in the full file.  */
442 movdqu 16(%eax), %xmm1
443 movdqu 32(%eax), %xmm2
444 movdqu 48(%eax), %xmm3
445 movntdq %xmm0, (%ecx)
447 movntdq %xmm1, 16(%ecx)
448 movntdq %xmm2, 32(%ecx)
449 movntdq %xmm3, 48(%ecx)
452 jnz L(mm_large_page_loop_forward)
457 movdqu %xmm4, -16(%esi)
458 movdqu %xmm5, -32(%esi)
459 movdqu %xmm6, -48(%esi)
460 movdqu %xmm7, -64(%esi)
462 jmp L(mm_return_pop_all)
/* ===== Plain memcpy forward path (no overlap handling; shared by
   the mempcpy/bcopy builds).  From the visible code: %eax = src,
   %edx = dst, %ecx = len.  NOTE(review): the cmp/test lines feeding
   most branches, and several labels, are elided in this extract.  */
467 jbe L(len_0_16_bytes)
/* Copies of at least half the shared cache size take the
   non-temporal path; constant vs runtime tunable as in the memmove
   paths above.  */
469 # ifdef SHARED_CACHE_SIZE_HALF
470 cmp $SHARED_CACHE_SIZE_HALF, %ecx
474 add $_GLOBAL_OFFSET_TABLE_, %ebx
475 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
477 cmp __x86_shared_cache_size_half, %ecx
/* 17..128-byte copies: head chunks at fixed offsets plus tail chunks
   at len-relative offsets; the intervening size-dispatch labels and
   branches are elided.  */
483 movdqu -16(%eax, %ecx), %xmm1
486 movdqu %xmm1, -16(%edx, %ecx)
489 movdqu 16(%eax), %xmm0
490 movdqu -32(%eax, %ecx), %xmm1
492 movdqu %xmm0, 16(%edx)
493 movdqu %xmm1, -32(%edx, %ecx)
496 movdqu 32(%eax), %xmm0
497 movdqu 48(%eax), %xmm1
498 movdqu -48(%eax, %ecx), %xmm2
499 movdqu -64(%eax, %ecx), %xmm3
501 movdqu %xmm0, 32(%edx)
502 movdqu %xmm1, 48(%edx)
503 movdqu %xmm2, -48(%edx, %ecx)
504 movdqu %xmm3, -64(%edx, %ecx)
507 /* Now the main loop: we align the address of the destination. */
/* Loop-control setup is elided; from the visible addressing, %ebx is
   the 16-byte-aligned destination cursor and %eax was rebased to the
   src-dst displacement, so loads use (%ebx,%eax) and aligned stores
   use (%ebx) — confirm against the full file.  */
516 /* We should stop two iterations before the termination
517 (in order not to misprefetch). */
520 je L(main_loop_just_one_iteration)
524 je L(main_loop_last_two_iterations)
/* Cached main loop (its label line is elided): 64 bytes per
   iteration, prefetching two iterations (128 B) ahead.  The xmm0
   movaps store of each iteration is elided in this extract.  */
529 prefetcht0 128(%ebx, %eax)
531 movdqu (%ebx, %eax), %xmm0
532 movdqu 16(%ebx, %eax), %xmm1
533 movdqu 32(%ebx, %eax), %xmm2
534 movdqu 48(%ebx, %eax), %xmm3
536 movaps %xmm1, 16(%ebx)
537 movaps %xmm2, 32(%ebx)
538 movaps %xmm3, 48(%ebx)
541 jne L(main_loop_cache)
543 L(main_loop_last_two_iterations):
/* Final 128 bytes copied without prefetch so we never prefetch past
   the end of the source buffer.  */
544 movdqu (%ebx, %eax), %xmm0
545 movdqu 16(%ebx, %eax), %xmm1
546 movdqu 32(%ebx, %eax), %xmm2
547 movdqu 48(%ebx, %eax), %xmm3
548 movdqu 64(%ebx, %eax), %xmm4
549 movdqu 80(%ebx, %eax), %xmm5
550 movdqu 96(%ebx, %eax), %xmm6
551 movdqu 112(%ebx, %eax), %xmm7
553 movaps %xmm1, 16(%ebx)
554 movaps %xmm2, 32(%ebx)
555 movaps %xmm3, 48(%ebx)
556 movaps %xmm4, 64(%ebx)
557 movaps %xmm5, 80(%ebx)
558 movaps %xmm6, 96(%ebx)
559 movaps %xmm7, 112(%ebx)
562 L(main_loop_just_one_iteration):
/* Exactly one 64-byte iteration remains; xmm0 store elided here
   too.  */
563 movdqu (%ebx, %eax), %xmm0
564 movdqu 16(%ebx, %eax), %xmm1
565 movdqu 32(%ebx, %eax), %xmm2
566 movdqu 48(%ebx, %eax), %xmm3
568 movaps %xmm1, 16(%ebx)
569 movaps %xmm2, 32(%ebx)
570 movaps %xmm3, 48(%ebx)
/* Large-copy path preamble (its label is elided): copy the possibly
   unaligned first and last 128 bytes with plain stores before the
   aligned non-temporal loop below.  */
575 movdqu 16(%eax), %xmm1
576 movdqu 32(%eax), %xmm2
577 movdqu 48(%eax), %xmm3
578 movdqu -64(%eax, %ecx), %xmm4
579 movdqu -48(%eax, %ecx), %xmm5
580 movdqu -32(%eax, %ecx), %xmm6
581 movdqu -16(%eax, %ecx), %xmm7
583 movdqu %xmm1, 16(%edx)
584 movdqu %xmm2, 32(%edx)
585 movdqu %xmm3, 48(%edx)
586 movdqu %xmm4, -64(%edx, %ecx)
587 movdqu %xmm5, -48(%edx, %ecx)
588 movdqu %xmm6, -32(%edx, %ecx)
589 movdqu %xmm7, -16(%edx, %ecx)
591 movdqu 64(%eax), %xmm0
592 movdqu 80(%eax), %xmm1
593 movdqu 96(%eax), %xmm2
594 movdqu 112(%eax), %xmm3
595 movdqu -128(%eax, %ecx), %xmm4
596 movdqu -112(%eax, %ecx), %xmm5
597 movdqu -96(%eax, %ecx), %xmm6
598 movdqu -80(%eax, %ecx), %xmm7
599 movdqu %xmm0, 64(%edx)
600 movdqu %xmm1, 80(%edx)
601 movdqu %xmm2, 96(%edx)
602 movdqu %xmm3, 112(%edx)
603 movdqu %xmm4, -128(%edx, %ecx)
604 movdqu %xmm5, -112(%edx, %ecx)
605 movdqu %xmm6, -96(%edx, %ecx)
606 movdqu %xmm7, -80(%edx, %ecx)
608 /* Now the main loop with non temporal stores. We align
609 the address of the destination. */
619 L(main_loop_large_page):
/* 128 bytes per iteration, movntdq stores bypass the cache;
   presumably followed by an sfence before return (elided here) —
   confirm in the full file.  */
620 movdqu (%ebx, %eax), %xmm0
621 movdqu 16(%ebx, %eax), %xmm1
622 movdqu 32(%ebx, %eax), %xmm2
623 movdqu 48(%ebx, %eax), %xmm3
624 movdqu 64(%ebx, %eax), %xmm4
625 movdqu 80(%ebx, %eax), %xmm5
626 movdqu 96(%ebx, %eax), %xmm6
627 movdqu 112(%ebx, %eax), %xmm7
628 movntdq %xmm0, (%ebx)
629 movntdq %xmm1, 16(%ebx)
630 movntdq %xmm2, 32(%ebx)
631 movntdq %xmm3, 48(%ebx)
632 movntdq %xmm4, 64(%ebx)
633 movntdq %xmm5, 80(%ebx)
634 movntdq %xmm6, 96(%ebx)
635 movntdq %xmm7, 112(%ebx)
638 jne L(main_loop_large_page)
/* <= 16-byte tails; the dispatch labels and length tests are largely
   elided in this extract.  */
644 jne L(len_9_16_bytes)
655 movzwl -2(%eax,%ecx), %ebx
656 movw %bx, -2(%edx,%ecx)
661 movq -8(%eax, %ecx), %xmm1
663 movq %xmm1, -8(%edx, %ecx)
669 movl -4(%eax,%ecx), %ebx
670 movl %ebx, -4(%edx,%ecx)
/* Start of the mempcpy-only epilogue — presumably adjusts the return
   value to dst + len (body elided in this extract; confirm).  */
674 # if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY