2 Copyright (C) 2010-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
25 # include "asm-syntax.h"
/* Symbol names that MEMCPY and MEMCPY_CHK expand to.  */
28 # define MEMCPY __memcpy_ssse3
29 # define MEMCPY_CHK __memcpy_chk_ssse3
/* Unwind annotation for a 4-byte push: the CFA offset grows by 4 and REG
   is recorded as saved at relative offset 0.  */
42 # define CFI_PUSH(REG) \
43 cfi_adjust_cfa_offset (4); \
44 cfi_rel_offset (REG, 0)
/* Unwind annotation for a 4-byte pop: the CFA offset shrinks by 4.  */
46 # define CFI_POP(REG) \
47 cfi_adjust_cfa_offset (-4); \
50 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
/* POP, like PUSH, pairs the stack instruction with its unwind annotation.  */
51 # define POP(REG) popl REG; CFI_POP (REG)
/* Definitions for the variant that keeps EBX saved on the stack:
   ENTRANCE pushes it, RETURN_END pops it and returns.  RETURN additionally
   re-applies CFI_PUSH (%ebx) after the ret, so code placed after a RETURN
   is again annotated as having EBX pushed.  PARMS is presumably the stack
   offset of the first argument (return address plus the saved EBX) --
   confirm against its users.  JMPTBL (I, B) yields I as an offset from B.  */
54 # define PARMS 8 /* Preserve EBX. */
55 # define ENTRANCE PUSH (%ebx);
56 # define RETURN_END POP (%ebx); ret
57 # define RETURN RETURN_END; CFI_PUSH (%ebx)
58 # define JMPTBL(I, B) I - B
60 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
61 jump table with relative offsets. INDEX is a register that contains the
62 index into the jump table. SCALE is the scale of INDEX. EBX is
   clobbered. The expansion adds (TABLE - .) to EBX to form the table
   address, adds the selected entry (an offset relative to the table
   address) to obtain the absolute target, and jumps there through EBX.
   NOTE(review): the step that loads the PC into EBX is not visible in
   this view -- confirm it precedes the first addl. */
64 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
65 /* We first load PC into EBX. */ \
67 /* Get the address of the jump table. */ \
68 addl $(TABLE - .), %ebx; \
69 /* Get the entry and convert the relative offset to the \
70 absolute address. */ \
71 addl (%ebx, INDEX, SCALE), %ebx; \
72 /* We loaded the jump table. Go. */ \
73 _CET_NOTRACK jmp *%ebx
/* Definitions for the variant that does not save EBX: RETURN and
   RETURN_END are a plain ret, and JMPTBL (I, B) yields I itself, so the
   table entries are absolute addresses.  */
78 # define RETURN_END ret
79 # define RETURN RETURN_END
80 # define JMPTBL(I, B) I
82 /* Branch to an entry in a jump table. TABLE is a jump table with
83 absolute addresses. INDEX is a register that contains the index into the
84 jump table. SCALE is the scale of INDEX. The target is read from
   TABLE + INDEX * SCALE and reached with a single indirect jump. */
86 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
87 _CET_NOTRACK jmp *TABLE(, INDEX, SCALE)
/* Everything below is emitted into the allocatable, executable section
   .text.ssse3.  */
90 .section .text.ssse3,"ax",@progbits
91 # if !defined USE_AS_BCOPY && defined SHARED
/* Unsigned "below" on flags set by a comparison that is not visible here:
   divert to __chk_fail.  */
95 jb HIDDEN_JUMPTARGET (__chk_fail)
/* %edx = DEST argument, read from the stack.  */
102 movl DEST(%esp), %edx
104 # ifdef USE_AS_MEMMOVE
/* "Equal" goes straight to the 0-byte tail handler; otherwise control
   continues at bk_write_less32bytes_2 (defined outside this view).  */
107 je L(fwd_write_0bytes)
110 jmp L(bk_write_less32bytes_2)
124 L(fwd_write_less32bytes):
125 # ifndef USE_AS_MEMMOVE
/* Dispatch through the forward tail table; %ecx is the index and each
   table entry is 4 bytes wide.  */
131 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
132 # ifndef USE_AS_MEMMOVE
/* Same dispatch through the backward tail table.  */
135 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
140 # ifndef USE_AS_MEMMOVE
/* Copy the 8 bytes at offset 8 from source (%eax) to destination (%edx)
   through the low half of %xmm1.  */
142 movlpd 8(%eax), %xmm1
144 movlpd %xmm1, 8(%edx)
/* Compare %ecx with half the shared cache size.  The threshold is either
   the build-time constant SHARED_CACHE_SIZE_HALF or the variable
   __x86_shared_cache_size_half, addressed GOT-relative through %ebx or
   directly.  */
156 # ifdef SHARED_CACHE_SIZE_HALF
157 cmp $SHARED_CACHE_SIZE_HALF, %ecx
/* Turn %ebx into the GOT base so the variable can be reached @GOTOFF.  */
161 add $_GLOBAL_OFFSET_TABLE_, %ebx
162 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
164 cmp __x86_shared_cache_size_half, %ecx
/* Dispatch through shl_table, indexed by %edi with 4-byte entries.  */
172 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
/* Aligned copy in 32-byte steps, indexed by %edi.  movdqa faults on
   unaligned memory operands, so both %eax + %edi and %edx + %edi must be
   16-byte aligned on this path.  */
176 # ifdef USE_AS_MEMMOVE
/* %edi = DEST argument read from the stack.  The slot is addressed 4 bytes
   further from %esp than DEST itself, presumably because one more register
   has been pushed by now -- confirm.  */
177 movl DEST+4(%esp), %edi
/* Four identical groups follow: load 32 bytes into %xmm0/%xmm1, store them
   at the same index on the destination side.  The index update and the
   exit tests between the groups are not visible in this view.  */
187 movdqa (%eax, %edi), %xmm0
188 movdqa 16(%eax, %edi), %xmm1
190 movdqa %xmm0, (%edx, %edi)
191 movdqa %xmm1, 16(%edx, %edi)
195 movdqa (%eax, %edi), %xmm0
196 movdqa 16(%eax, %edi), %xmm1
198 movdqa %xmm0, (%edx, %edi)
199 movdqa %xmm1, 16(%edx, %edi)
203 movdqa (%eax, %edi), %xmm0
204 movdqa 16(%eax, %edi), %xmm1
206 movdqa %xmm0, (%edx, %edi)
207 movdqa %xmm1, 16(%edx, %edi)
211 movdqa (%eax, %edi), %xmm0
212 movdqa 16(%eax, %edi), %xmm1
214 movdqa %xmm0, (%edx, %edi)
215 movdqa %xmm1, 16(%edx, %edi)
/* Finish through the aligned forward tail table, indexed by %ecx.  */
224 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
/* Compare %ecx with half the data cache size, taken from the build-time
   constant DATA_CACHE_SIZE_HALF or from __x86_data_cache_size_half
   (GOT-relative through %ebx, or direct).  */
230 # ifdef DATA_CACHE_SIZE_HALF
231 cmp $DATA_CACHE_SIZE_HALF, %ecx
235 add $_GLOBAL_OFFSET_TABLE_, %ebx
236 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
238 cmp __x86_data_cache_size_half, %ecx
/* Unsigned above-or-equal: use the loop that prefetches.  */
243 jae L(shl_0_gobble_mem_loop)
/* Aligned bulk loop without prefetching.  The visible instructions move the
   16-byte blocks at offsets 0x10..0x70 from %eax to the same offsets at
   %edx through %xmm1..%xmm7; all loads are issued before the stores.  The
   block at offset 0, the pointer/count updates and the compare
   instructions are not visible in this view.  */
246 L(shl_0_gobble_cache_loop):
248 movdqa 0x10(%eax), %xmm1
249 movdqa 0x20(%eax), %xmm2
250 movdqa 0x30(%eax), %xmm3
251 movdqa 0x40(%eax), %xmm4
252 movdqa 0x50(%eax), %xmm5
253 movdqa 0x60(%eax), %xmm6
254 movdqa 0x70(%eax), %xmm7
258 movdqa %xmm1, 0x10(%edx)
259 movdqa %xmm2, 0x20(%edx)
260 movdqa %xmm3, 0x30(%edx)
261 movdqa %xmm4, 0x40(%edx)
262 movdqa %xmm5, 0x50(%edx)
263 movdqa %xmm6, 0x60(%edx)
264 movdqa %xmm7, 0x70(%edx)
/* Repeat while the preceding flag-setting instruction reports unsigned
   above-or-equal.  */
267 jae L(shl_0_gobble_cache_loop)
/* Note that this exit test is a signed "less than".  */
270 jl L(shl_0_cache_less_64bytes)
/* Copy the blocks at offsets 0x10, 0x20 and 0x30.  */
274 movdqa 0x10(%eax), %xmm1
276 movdqa %xmm1, 0x10(%edx)
277 movdqa 0x20(%eax), %xmm0
278 movdqa 0x30(%eax), %xmm1
280 movdqa %xmm0, 0x20(%edx)
281 movdqa %xmm1, 0x30(%edx)
284 L(shl_0_cache_less_64bytes):
286 jb L(shl_0_cache_less_32bytes)
/* Copy the block at offset 0x10.  */
289 movdqa 0x10(%eax), %xmm1
292 movdqa %xmm1, 0x10(%edx)
295 L(shl_0_cache_less_32bytes):
297 jb L(shl_0_cache_less_16bytes)
304 L(shl_0_cache_less_16bytes):
/* Remaining bytes are handled by the forward tail table, indexed by %ecx
   with 4-byte entries.  */
307 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Aligned bulk loop with software prefetch.  Each pass prefetches the
   source 0x1c0 and 0x280 bytes ahead and the destination 0x1c0 bytes ahead,
   then (in the visible instructions) moves the 16-byte blocks at offsets
   0x10..0x70 from %eax to %edx through %xmm1..%xmm7.  The block at offset
   0, the pointer/count updates and the compare instructions are not
   visible in this view.  */
310 L(shl_0_gobble_mem_loop):
311 prefetcht0 0x1c0(%eax)
312 prefetcht0 0x280(%eax)
313 prefetcht0 0x1c0(%edx)
316 movdqa 0x10(%eax), %xmm1
317 movdqa 0x20(%eax), %xmm2
318 movdqa 0x30(%eax), %xmm3
319 movdqa 0x40(%eax), %xmm4
320 movdqa 0x50(%eax), %xmm5
321 movdqa 0x60(%eax), %xmm6
322 movdqa 0x70(%eax), %xmm7
326 movdqa %xmm1, 0x10(%edx)
327 movdqa %xmm2, 0x20(%edx)
328 movdqa %xmm3, 0x30(%edx)
329 movdqa %xmm4, 0x40(%edx)
330 movdqa %xmm5, 0x50(%edx)
331 movdqa %xmm6, 0x60(%edx)
332 movdqa %xmm7, 0x70(%edx)
/* Repeat while the preceding flag-setting instruction reports unsigned
   above-or-equal.  */
335 jae L(shl_0_gobble_mem_loop)
/* Note that this exit test is a signed "less than".  */
338 jl L(shl_0_mem_less_64bytes)
/* Copy the blocks at offsets 0x10, 0x20 and 0x30.  */
342 movdqa 0x10(%eax), %xmm1
345 movdqa %xmm1, 0x10(%edx)
347 movdqa 0x20(%eax), %xmm0
348 movdqa 0x30(%eax), %xmm1
351 movdqa %xmm0, 0x20(%edx)
352 movdqa %xmm1, 0x30(%edx)
355 L(shl_0_mem_less_64bytes):
357 jb L(shl_0_mem_less_32bytes)
/* Copy the block at offset 0x10.  */
360 movdqa 0x10(%eax), %xmm1
363 movdqa %xmm1, 0x10(%edx)
366 L(shl_0_mem_less_32bytes):
368 jb L(shl_0_mem_less_16bytes)
375 L(shl_0_mem_less_16bytes):
/* Remaining bytes are handled by the aligned forward tail table, indexed
   by %ecx with 4-byte entries.  */
378 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
/* Forward copy for a source pointer 1 byte past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 1 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $1 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   1 byte into the lower one, i.e. one full destination block.  */
382 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
383 movaps -1(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
385 movl DEST+4(%esp), %edi
386 movaps -1(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
389 # ifdef DATA_CACHE_SIZE_HALF
390 cmp $DATA_CACHE_SIZE_HALF, %ecx
394 add $_GLOBAL_OFFSET_TABLE_, %ebx
395 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
397 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
400 jb L(sh_1_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
406 prefetcht0 0x1c0(%eax)
407 prefetcht0 0x1c0(%edx)
408 movaps 15(%eax), %xmm2
409 movaps 31(%eax), %xmm3
410 movaps 47(%eax), %xmm4
411 movaps 63(%eax), %xmm5
413 palignr $1, %xmm4, %xmm5
414 palignr $1, %xmm3, %xmm4
415 movaps %xmm5, 48(%edx)
416 palignr $1, %xmm2, %xmm3
418 palignr $1, %xmm1, %xmm2
419 movaps %xmm4, 32(%edx)
420 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
431 movaps 15(%eax), %xmm2
432 movaps 31(%eax), %xmm3
433 palignr $1, %xmm2, %xmm3
434 palignr $1, %xmm1, %xmm2
436 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
437 lea 32(%edx, %ecx), %edx
438 lea 32(%eax, %ecx), %eax
440 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
451 L(sh_1_no_prefetch_loop):
452 movdqa 16(%eax, %edi), %xmm2
454 movdqa 32(%eax, %edi), %xmm3
456 palignr $1, %xmm2, %xmm3
457 palignr $1, %xmm1, %xmm2
459 movdqa %xmm2, -32(%edx, %edi)
460 movdqa %xmm3, -16(%edx, %edi)
461 jb L(sh_1_end_no_prefetch_loop)
463 movdqa 16(%eax, %edi), %xmm2
465 movdqa 32(%eax, %edi), %xmm3
467 palignr $1, %xmm2, %xmm3
468 palignr $1, %xmm4, %xmm2
470 movdqa %xmm2, -32(%edx, %edi)
471 movdqa %xmm3, -16(%edx, %edi)
472 jae L(sh_1_no_prefetch_loop)
474 L(sh_1_end_no_prefetch_loop):
/* %eax = %eax + %edi + 1, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
478 lea 1(%edi, %eax), %eax
480 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 2 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 2 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $2 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   2 bytes into the lower one, i.e. one full destination block.  */
486 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
487 movaps -2(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
489 movl DEST+4(%esp), %edi
490 movaps -2(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
493 # ifdef DATA_CACHE_SIZE_HALF
494 cmp $DATA_CACHE_SIZE_HALF, %ecx
498 add $_GLOBAL_OFFSET_TABLE_, %ebx
499 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
501 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
504 jb L(sh_2_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
510 prefetcht0 0x1c0(%eax)
511 prefetcht0 0x1c0(%edx)
512 movaps 14(%eax), %xmm2
513 movaps 30(%eax), %xmm3
514 movaps 46(%eax), %xmm4
515 movaps 62(%eax), %xmm5
517 palignr $2, %xmm4, %xmm5
518 palignr $2, %xmm3, %xmm4
519 movaps %xmm5, 48(%edx)
520 palignr $2, %xmm2, %xmm3
522 palignr $2, %xmm1, %xmm2
523 movaps %xmm4, 32(%edx)
524 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
535 movaps 14(%eax), %xmm2
536 movaps 30(%eax), %xmm3
537 palignr $2, %xmm2, %xmm3
538 palignr $2, %xmm1, %xmm2
540 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
541 lea 32(%edx, %ecx), %edx
542 lea 32(%eax, %ecx), %eax
544 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
555 L(sh_2_no_prefetch_loop):
556 movdqa 16(%eax, %edi), %xmm2
558 movdqa 32(%eax, %edi), %xmm3
560 palignr $2, %xmm2, %xmm3
561 palignr $2, %xmm1, %xmm2
563 movdqa %xmm2, -32(%edx, %edi)
564 movdqa %xmm3, -16(%edx, %edi)
565 jb L(sh_2_end_no_prefetch_loop)
567 movdqa 16(%eax, %edi), %xmm2
569 movdqa 32(%eax, %edi), %xmm3
571 palignr $2, %xmm2, %xmm3
572 palignr $2, %xmm4, %xmm2
574 movdqa %xmm2, -32(%edx, %edi)
575 movdqa %xmm3, -16(%edx, %edi)
576 jae L(sh_2_no_prefetch_loop)
578 L(sh_2_end_no_prefetch_loop):
/* %eax = %eax + %edi + 2, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
582 lea 2(%edi, %eax), %eax
584 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 3 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 3 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $3 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   3 bytes into the lower one, i.e. one full destination block.  */
590 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
591 movaps -3(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
593 movl DEST+4(%esp), %edi
594 movaps -3(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
597 # ifdef DATA_CACHE_SIZE_HALF
598 cmp $DATA_CACHE_SIZE_HALF, %ecx
602 add $_GLOBAL_OFFSET_TABLE_, %ebx
603 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
605 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
608 jb L(sh_3_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
614 prefetcht0 0x1c0(%eax)
615 prefetcht0 0x1c0(%edx)
616 movaps 13(%eax), %xmm2
617 movaps 29(%eax), %xmm3
618 movaps 45(%eax), %xmm4
619 movaps 61(%eax), %xmm5
621 palignr $3, %xmm4, %xmm5
622 palignr $3, %xmm3, %xmm4
623 movaps %xmm5, 48(%edx)
624 palignr $3, %xmm2, %xmm3
626 palignr $3, %xmm1, %xmm2
627 movaps %xmm4, 32(%edx)
628 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
639 movaps 13(%eax), %xmm2
640 movaps 29(%eax), %xmm3
641 palignr $3, %xmm2, %xmm3
642 palignr $3, %xmm1, %xmm2
644 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
645 lea 32(%edx, %ecx), %edx
646 lea 32(%eax, %ecx), %eax
648 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
659 L(sh_3_no_prefetch_loop):
660 movdqa 16(%eax, %edi), %xmm2
662 movdqa 32(%eax, %edi), %xmm3
664 palignr $3, %xmm2, %xmm3
665 palignr $3, %xmm1, %xmm2
667 movdqa %xmm2, -32(%edx, %edi)
668 movdqa %xmm3, -16(%edx, %edi)
670 jb L(sh_3_end_no_prefetch_loop)
672 movdqa 16(%eax, %edi), %xmm2
674 movdqa 32(%eax, %edi), %xmm3
676 palignr $3, %xmm2, %xmm3
677 palignr $3, %xmm4, %xmm2
679 movdqa %xmm2, -32(%edx, %edi)
680 movdqa %xmm3, -16(%edx, %edi)
682 jae L(sh_3_no_prefetch_loop)
684 L(sh_3_end_no_prefetch_loop):
/* %eax = %eax + %edi + 3, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
688 lea 3(%edi, %eax), %eax
690 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 4 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 4 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $4 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   4 bytes into the lower one, i.e. one full destination block.  */
696 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
697 movaps -4(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
699 movl DEST+4(%esp), %edi
700 movaps -4(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
703 # ifdef DATA_CACHE_SIZE_HALF
704 cmp $DATA_CACHE_SIZE_HALF, %ecx
708 add $_GLOBAL_OFFSET_TABLE_, %ebx
709 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
711 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
714 jb L(sh_4_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
720 prefetcht0 0x1c0(%eax)
721 prefetcht0 0x1c0(%edx)
722 movaps 12(%eax), %xmm2
723 movaps 28(%eax), %xmm3
724 movaps 44(%eax), %xmm4
725 movaps 60(%eax), %xmm5
727 palignr $4, %xmm4, %xmm5
728 palignr $4, %xmm3, %xmm4
729 movaps %xmm5, 48(%edx)
730 palignr $4, %xmm2, %xmm3
732 palignr $4, %xmm1, %xmm2
733 movaps %xmm4, 32(%edx)
734 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
745 movaps 12(%eax), %xmm2
746 movaps 28(%eax), %xmm3
747 palignr $4, %xmm2, %xmm3
748 palignr $4, %xmm1, %xmm2
750 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
751 lea 32(%edx, %ecx), %edx
752 lea 32(%eax, %ecx), %eax
754 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
765 L(sh_4_no_prefetch_loop):
766 movdqa 16(%eax, %edi), %xmm2
768 movdqa 32(%eax, %edi), %xmm3
770 palignr $4, %xmm2, %xmm3
771 palignr $4, %xmm1, %xmm2
773 movdqa %xmm2, -32(%edx, %edi)
774 movdqa %xmm3, -16(%edx, %edi)
776 jb L(sh_4_end_no_prefetch_loop)
778 movdqa 16(%eax, %edi), %xmm2
780 movdqa 32(%eax, %edi), %xmm3
782 palignr $4, %xmm2, %xmm3
783 palignr $4, %xmm4, %xmm2
785 movdqa %xmm2, -32(%edx, %edi)
786 movdqa %xmm3, -16(%edx, %edi)
788 jae L(sh_4_no_prefetch_loop)
790 L(sh_4_end_no_prefetch_loop):
/* %eax = %eax + %edi + 4, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
794 lea 4(%edi, %eax), %eax
796 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 5 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 5 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $5 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   5 bytes into the lower one, i.e. one full destination block.  */
802 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
803 movaps -5(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
805 movl DEST+4(%esp), %edi
806 movaps -5(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
809 # ifdef DATA_CACHE_SIZE_HALF
810 cmp $DATA_CACHE_SIZE_HALF, %ecx
814 add $_GLOBAL_OFFSET_TABLE_, %ebx
815 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
817 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
820 jb L(sh_5_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
826 prefetcht0 0x1c0(%eax)
827 prefetcht0 0x1c0(%edx)
828 movaps 11(%eax), %xmm2
829 movaps 27(%eax), %xmm3
830 movaps 43(%eax), %xmm4
831 movaps 59(%eax), %xmm5
833 palignr $5, %xmm4, %xmm5
834 palignr $5, %xmm3, %xmm4
835 movaps %xmm5, 48(%edx)
836 palignr $5, %xmm2, %xmm3
838 palignr $5, %xmm1, %xmm2
839 movaps %xmm4, 32(%edx)
840 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
851 movaps 11(%eax), %xmm2
852 movaps 27(%eax), %xmm3
853 palignr $5, %xmm2, %xmm3
854 palignr $5, %xmm1, %xmm2
856 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
857 lea 32(%edx, %ecx), %edx
858 lea 32(%eax, %ecx), %eax
860 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
871 L(sh_5_no_prefetch_loop):
872 movdqa 16(%eax, %edi), %xmm2
874 movdqa 32(%eax, %edi), %xmm3
876 palignr $5, %xmm2, %xmm3
877 palignr $5, %xmm1, %xmm2
879 movdqa %xmm2, -32(%edx, %edi)
880 movdqa %xmm3, -16(%edx, %edi)
882 jb L(sh_5_end_no_prefetch_loop)
884 movdqa 16(%eax, %edi), %xmm2
886 movdqa 32(%eax, %edi), %xmm3
888 palignr $5, %xmm2, %xmm3
889 palignr $5, %xmm4, %xmm2
891 movdqa %xmm2, -32(%edx, %edi)
892 movdqa %xmm3, -16(%edx, %edi)
894 jae L(sh_5_no_prefetch_loop)
896 L(sh_5_end_no_prefetch_loop):
/* %eax = %eax + %edi + 5, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
900 lea 5(%edi, %eax), %eax
902 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 6 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 6 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $6 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   6 bytes into the lower one, i.e. one full destination block.  */
908 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
909 movaps -6(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
911 movl DEST+4(%esp), %edi
912 movaps -6(%eax), %xmm1
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
915 # ifdef DATA_CACHE_SIZE_HALF
916 cmp $DATA_CACHE_SIZE_HALF, %ecx
920 add $_GLOBAL_OFFSET_TABLE_, %ebx
921 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
923 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
926 jb L(sh_6_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
932 prefetcht0 0x1c0(%eax)
933 prefetcht0 0x1c0(%edx)
934 movaps 10(%eax), %xmm2
935 movaps 26(%eax), %xmm3
936 movaps 42(%eax), %xmm4
937 movaps 58(%eax), %xmm5
939 palignr $6, %xmm4, %xmm5
940 palignr $6, %xmm3, %xmm4
941 movaps %xmm5, 48(%edx)
942 palignr $6, %xmm2, %xmm3
944 palignr $6, %xmm1, %xmm2
945 movaps %xmm4, 32(%edx)
946 movaps %xmm3, 16(%edx)
/* 32-byte step using the same splice on two blocks.  */
957 movaps 10(%eax), %xmm2
958 movaps 26(%eax), %xmm3
959 palignr $6, %xmm2, %xmm3
960 palignr $6, %xmm1, %xmm2
962 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
963 lea 32(%edx, %ecx), %edx
964 lea 32(%eax, %ecx), %eax
966 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
977 L(sh_6_no_prefetch_loop):
978 movdqa 16(%eax, %edi), %xmm2
980 movdqa 32(%eax, %edi), %xmm3
982 palignr $6, %xmm2, %xmm3
983 palignr $6, %xmm1, %xmm2
985 movdqa %xmm2, -32(%edx, %edi)
986 movdqa %xmm3, -16(%edx, %edi)
988 jb L(sh_6_end_no_prefetch_loop)
990 movdqa 16(%eax, %edi), %xmm2
992 movdqa 32(%eax, %edi), %xmm3
994 palignr $6, %xmm2, %xmm3
995 palignr $6, %xmm4, %xmm2
997 movdqa %xmm2, -32(%edx, %edi)
998 movdqa %xmm3, -16(%edx, %edi)
1000 jae L(sh_6_no_prefetch_loop)
1002 L(sh_6_end_no_prefetch_loop):
/* %eax = %eax + %edi + 6, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1006 lea 6(%edi, %eax), %eax
1008 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 7 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 7 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $7 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   7 bytes into the lower one, i.e. one full destination block.  */
1014 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1015 movaps -7(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1017 movl DEST+4(%esp), %edi
1018 movaps -7(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1019 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1021 # ifdef DATA_CACHE_SIZE_HALF
1022 cmp $DATA_CACHE_SIZE_HALF, %ecx
1026 add $_GLOBAL_OFFSET_TABLE_, %ebx
1027 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1029 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1032 jb L(sh_7_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1038 prefetcht0 0x1c0(%eax)
1039 prefetcht0 0x1c0(%edx)
1040 movaps 9(%eax), %xmm2
1041 movaps 25(%eax), %xmm3
1042 movaps 41(%eax), %xmm4
1043 movaps 57(%eax), %xmm5
1045 palignr $7, %xmm4, %xmm5
1046 palignr $7, %xmm3, %xmm4
1047 movaps %xmm5, 48(%edx)
1048 palignr $7, %xmm2, %xmm3
1050 palignr $7, %xmm1, %xmm2
1051 movaps %xmm4, 32(%edx)
1052 movaps %xmm3, 16(%edx)
1054 movaps %xmm2, (%edx)
/* 32-byte step using the same splice on two blocks.  */
1063 movaps 9(%eax), %xmm2
1064 movaps 25(%eax), %xmm3
1065 palignr $7, %xmm2, %xmm3
1066 palignr $7, %xmm1, %xmm2
1067 movaps %xmm2, (%edx)
1068 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1069 lea 32(%edx, %ecx), %edx
1070 lea 32(%eax, %ecx), %eax
1072 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1077 L(sh_7_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1083 L(sh_7_no_prefetch_loop):
1084 movdqa 16(%eax, %edi), %xmm2
1086 movdqa 32(%eax, %edi), %xmm3
1088 palignr $7, %xmm2, %xmm3
1089 palignr $7, %xmm1, %xmm2
1091 movdqa %xmm2, -32(%edx, %edi)
1092 movdqa %xmm3, -16(%edx, %edi)
1093 jb L(sh_7_end_no_prefetch_loop)
1095 movdqa 16(%eax, %edi), %xmm2
1097 movdqa 32(%eax, %edi), %xmm3
1099 palignr $7, %xmm2, %xmm3
1100 palignr $7, %xmm4, %xmm2
1102 movdqa %xmm2, -32(%edx, %edi)
1103 movdqa %xmm3, -16(%edx, %edi)
1104 jae L(sh_7_no_prefetch_loop)
1106 L(sh_7_end_no_prefetch_loop):
/* %eax = %eax + %edi + 7, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1110 lea 7(%edi, %eax), %eax
1112 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 8 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 8 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $8 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   8 bytes into the lower one, i.e. one full destination block.  */
1118 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1119 movaps -8(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1121 movl DEST+4(%esp), %edi
1122 movaps -8(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1123 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1125 # ifdef DATA_CACHE_SIZE_HALF
1126 cmp $DATA_CACHE_SIZE_HALF, %ecx
1130 add $_GLOBAL_OFFSET_TABLE_, %ebx
1131 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1133 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1136 jb L(sh_8_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1142 prefetcht0 0x1c0(%eax)
1143 prefetcht0 0x1c0(%edx)
1144 movaps 8(%eax), %xmm2
1145 movaps 24(%eax), %xmm3
1146 movaps 40(%eax), %xmm4
1147 movaps 56(%eax), %xmm5
1149 palignr $8, %xmm4, %xmm5
1150 palignr $8, %xmm3, %xmm4
1151 movaps %xmm5, 48(%edx)
1152 palignr $8, %xmm2, %xmm3
1154 palignr $8, %xmm1, %xmm2
1155 movaps %xmm4, 32(%edx)
1156 movaps %xmm3, 16(%edx)
1158 movaps %xmm2, (%edx)
/* 32-byte step using the same splice on two blocks.  */
1167 movaps 8(%eax), %xmm2
1168 movaps 24(%eax), %xmm3
1169 palignr $8, %xmm2, %xmm3
1170 palignr $8, %xmm1, %xmm2
1171 movaps %xmm2, (%edx)
1172 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1173 lea 32(%edx, %ecx), %edx
1174 lea 32(%eax, %ecx), %eax
1176 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1181 L(sh_8_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1187 L(sh_8_no_prefetch_loop):
1188 movdqa 16(%eax, %edi), %xmm2
1190 movdqa 32(%eax, %edi), %xmm3
1192 palignr $8, %xmm2, %xmm3
1193 palignr $8, %xmm1, %xmm2
1195 movdqa %xmm2, -32(%edx, %edi)
1196 movdqa %xmm3, -16(%edx, %edi)
1197 jb L(sh_8_end_no_prefetch_loop)
1199 movdqa 16(%eax, %edi), %xmm2
1201 movdqa 32(%eax, %edi), %xmm3
1203 palignr $8, %xmm2, %xmm3
1204 palignr $8, %xmm4, %xmm2
1206 movdqa %xmm2, -32(%edx, %edi)
1207 movdqa %xmm3, -16(%edx, %edi)
1208 jae L(sh_8_no_prefetch_loop)
1210 L(sh_8_end_no_prefetch_loop):
/* %eax = %eax + %edi + 8, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1214 lea 8(%edi, %eax), %eax
1216 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 9 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 9 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $9 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   9 bytes into the lower one, i.e. one full destination block.  */
1222 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1223 movaps -9(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1225 movl DEST+4(%esp), %edi
1226 movaps -9(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1227 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1229 # ifdef DATA_CACHE_SIZE_HALF
1230 cmp $DATA_CACHE_SIZE_HALF, %ecx
1234 add $_GLOBAL_OFFSET_TABLE_, %ebx
1235 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1237 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1240 jb L(sh_9_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1246 prefetcht0 0x1c0(%eax)
1247 prefetcht0 0x1c0(%edx)
1248 movaps 7(%eax), %xmm2
1249 movaps 23(%eax), %xmm3
1250 movaps 39(%eax), %xmm4
1251 movaps 55(%eax), %xmm5
1253 palignr $9, %xmm4, %xmm5
1254 palignr $9, %xmm3, %xmm4
1255 movaps %xmm5, 48(%edx)
1256 palignr $9, %xmm2, %xmm3
1258 palignr $9, %xmm1, %xmm2
1259 movaps %xmm4, 32(%edx)
1260 movaps %xmm3, 16(%edx)
1262 movaps %xmm2, (%edx)
/* 32-byte step using the same splice on two blocks.  */
1271 movaps 7(%eax), %xmm2
1272 movaps 23(%eax), %xmm3
1273 palignr $9, %xmm2, %xmm3
1274 palignr $9, %xmm1, %xmm2
1276 movaps %xmm2, (%edx)
1277 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1278 lea 32(%edx, %ecx), %edx
1279 lea 32(%eax, %ecx), %eax
1281 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1286 L(sh_9_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1292 L(sh_9_no_prefetch_loop):
1293 movdqa 16(%eax, %edi), %xmm2
1295 movdqa 32(%eax, %edi), %xmm3
1297 palignr $9, %xmm2, %xmm3
1298 palignr $9, %xmm1, %xmm2
1300 movdqa %xmm2, -32(%edx, %edi)
1301 movdqa %xmm3, -16(%edx, %edi)
1302 jb L(sh_9_end_no_prefetch_loop)
1304 movdqa 16(%eax, %edi), %xmm2
1306 movdqa 32(%eax, %edi), %xmm3
1308 palignr $9, %xmm2, %xmm3
1309 palignr $9, %xmm4, %xmm2
1311 movdqa %xmm2, -32(%edx, %edi)
1312 movdqa %xmm3, -16(%edx, %edi)
1313 jae L(sh_9_no_prefetch_loop)
1315 L(sh_9_end_no_prefetch_loop):
/* %eax = %eax + %edi + 9, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1319 lea 9(%edi, %eax), %eax
1321 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 10 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 10 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $10 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   10 bytes into the lower one, i.e. one full destination block.  */
1327 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1328 movaps -10(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1330 movl DEST+4(%esp), %edi
1331 movaps -10(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1332 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1334 # ifdef DATA_CACHE_SIZE_HALF
1335 cmp $DATA_CACHE_SIZE_HALF, %ecx
1339 add $_GLOBAL_OFFSET_TABLE_, %ebx
1340 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1342 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1345 jb L(sh_10_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1351 prefetcht0 0x1c0(%eax)
1352 prefetcht0 0x1c0(%edx)
1353 movaps 6(%eax), %xmm2
1354 movaps 22(%eax), %xmm3
1355 movaps 38(%eax), %xmm4
1356 movaps 54(%eax), %xmm5
1358 palignr $10, %xmm4, %xmm5
1359 palignr $10, %xmm3, %xmm4
1360 movaps %xmm5, 48(%edx)
1361 palignr $10, %xmm2, %xmm3
1363 palignr $10, %xmm1, %xmm2
1364 movaps %xmm4, 32(%edx)
1365 movaps %xmm3, 16(%edx)
1367 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1370 ja L(Shl10LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1376 movaps 6(%eax), %xmm2
1377 movaps 22(%eax), %xmm3
1378 palignr $10, %xmm2, %xmm3
1379 palignr $10, %xmm1, %xmm2
1381 movaps %xmm2, (%edx)
1382 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1383 lea 32(%edx, %ecx), %edx
1384 lea 32(%eax, %ecx), %eax
1386 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1391 L(sh_10_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1397 L(sh_10_no_prefetch_loop):
1398 movdqa 16(%eax, %edi), %xmm2
1400 movdqa 32(%eax, %edi), %xmm3
1402 palignr $10, %xmm2, %xmm3
1403 palignr $10, %xmm1, %xmm2
1405 movdqa %xmm2, -32(%edx, %edi)
1406 movdqa %xmm3, -16(%edx, %edi)
1407 jb L(sh_10_end_no_prefetch_loop)
1409 movdqa 16(%eax, %edi), %xmm2
1411 movdqa 32(%eax, %edi), %xmm3
1413 palignr $10, %xmm2, %xmm3
1414 palignr $10, %xmm4, %xmm2
1416 movdqa %xmm2, -32(%edx, %edi)
1417 movdqa %xmm3, -16(%edx, %edi)
1418 jae L(sh_10_no_prefetch_loop)
1420 L(sh_10_end_no_prefetch_loop):
/* %eax = %eax + %edi + 10, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1424 lea 10(%edi, %eax), %eax
1426 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 11 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 11 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $11 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   11 bytes into the lower one, i.e. one full destination block.  */
1432 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1433 movaps -11(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1435 movl DEST+4(%esp), %edi
1436 movaps -11(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1437 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1439 # ifdef DATA_CACHE_SIZE_HALF
1440 cmp $DATA_CACHE_SIZE_HALF, %ecx
1444 add $_GLOBAL_OFFSET_TABLE_, %ebx
1445 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1447 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1450 jb L(sh_11_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1456 prefetcht0 0x1c0(%eax)
1457 prefetcht0 0x1c0(%edx)
1458 movaps 5(%eax), %xmm2
1459 movaps 21(%eax), %xmm3
1460 movaps 37(%eax), %xmm4
1461 movaps 53(%eax), %xmm5
1463 palignr $11, %xmm4, %xmm5
1464 palignr $11, %xmm3, %xmm4
1465 movaps %xmm5, 48(%edx)
1466 palignr $11, %xmm2, %xmm3
1468 palignr $11, %xmm1, %xmm2
1469 movaps %xmm4, 32(%edx)
1470 movaps %xmm3, 16(%edx)
1472 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1475 ja L(Shl11LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1481 movaps 5(%eax), %xmm2
1482 movaps 21(%eax), %xmm3
1483 palignr $11, %xmm2, %xmm3
1484 palignr $11, %xmm1, %xmm2
1486 movaps %xmm2, (%edx)
1487 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1488 lea 32(%edx, %ecx), %edx
1489 lea 32(%eax, %ecx), %eax
1491 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1496 L(sh_11_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1502 L(sh_11_no_prefetch_loop):
1503 movdqa 16(%eax, %edi), %xmm2
1505 movdqa 32(%eax, %edi), %xmm3
1507 palignr $11, %xmm2, %xmm3
1508 palignr $11, %xmm1, %xmm2
1510 movdqa %xmm2, -32(%edx, %edi)
1511 movdqa %xmm3, -16(%edx, %edi)
1512 jb L(sh_11_end_no_prefetch_loop)
1514 movdqa 16(%eax, %edi), %xmm2
1516 movdqa 32(%eax, %edi), %xmm3
1518 palignr $11, %xmm2, %xmm3
1519 palignr $11, %xmm4, %xmm2
1521 movdqa %xmm2, -32(%edx, %edi)
1522 movdqa %xmm3, -16(%edx, %edi)
1523 jae L(sh_11_no_prefetch_loop)
1525 L(sh_11_end_no_prefetch_loop):
/* %eax = %eax + %edi + 11, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1529 lea 11(%edi, %eax), %eax
1531 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 12 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 12 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $12 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   12 bytes into the lower one, i.e. one full destination block.  */
1537 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1538 movaps -12(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1540 movl DEST+4(%esp), %edi
1541 movaps -12(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1542 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1544 # ifdef DATA_CACHE_SIZE_HALF
1545 cmp $DATA_CACHE_SIZE_HALF, %ecx
1549 add $_GLOBAL_OFFSET_TABLE_, %ebx
1550 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1552 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1555 jb L(sh_12_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1561 prefetcht0 0x1c0(%eax)
1562 prefetcht0 0x1c0(%edx)
1563 movaps 4(%eax), %xmm2
1564 movaps 20(%eax), %xmm3
1565 movaps 36(%eax), %xmm4
1566 movaps 52(%eax), %xmm5
1568 palignr $12, %xmm4, %xmm5
1569 palignr $12, %xmm3, %xmm4
1570 movaps %xmm5, 48(%edx)
1571 palignr $12, %xmm2, %xmm3
1573 palignr $12, %xmm1, %xmm2
1574 movaps %xmm4, 32(%edx)
1575 movaps %xmm3, 16(%edx)
1577 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1580 ja L(Shl12LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1586 movaps 4(%eax), %xmm2
1587 movaps 20(%eax), %xmm3
1588 palignr $12, %xmm2, %xmm3
1589 palignr $12, %xmm1, %xmm2
1591 movaps %xmm2, (%edx)
1592 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1593 lea 32(%edx, %ecx), %edx
1594 lea 32(%eax, %ecx), %eax
1596 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1601 L(sh_12_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1607 L(sh_12_no_prefetch_loop):
1608 movdqa 16(%eax, %edi), %xmm2
1610 movdqa 32(%eax, %edi), %xmm3
1612 palignr $12, %xmm2, %xmm3
1613 palignr $12, %xmm1, %xmm2
1615 movdqa %xmm2, -32(%edx, %edi)
1616 movdqa %xmm3, -16(%edx, %edi)
1617 jb L(sh_12_end_no_prefetch_loop)
1619 movdqa 16(%eax, %edi), %xmm2
1621 movdqa 32(%eax, %edi), %xmm3
1623 palignr $12, %xmm2, %xmm3
1624 palignr $12, %xmm4, %xmm2
1626 movdqa %xmm2, -32(%edx, %edi)
1627 movdqa %xmm3, -16(%edx, %edi)
1628 jae L(sh_12_no_prefetch_loop)
1630 L(sh_12_end_no_prefetch_loop):
/* %eax = %eax + %edi + 12, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1634 lea 12(%edi, %eax), %eax
1636 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 13 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 13 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $13 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   13 bytes into the lower one, i.e. one full destination block.  */
1642 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1643 movaps -13(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1645 movl DEST+4(%esp), %edi
1646 movaps -13(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1647 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1649 # ifdef DATA_CACHE_SIZE_HALF
1650 cmp $DATA_CACHE_SIZE_HALF, %ecx
1654 add $_GLOBAL_OFFSET_TABLE_, %ebx
1655 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1657 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1660 jb L(sh_13_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1666 prefetcht0 0x1c0(%eax)
1667 prefetcht0 0x1c0(%edx)
1668 movaps 3(%eax), %xmm2
1669 movaps 19(%eax), %xmm3
1670 movaps 35(%eax), %xmm4
1671 movaps 51(%eax), %xmm5
1673 palignr $13, %xmm4, %xmm5
1674 palignr $13, %xmm3, %xmm4
1675 movaps %xmm5, 48(%edx)
1676 palignr $13, %xmm2, %xmm3
1678 palignr $13, %xmm1, %xmm2
1679 movaps %xmm4, 32(%edx)
1680 movaps %xmm3, 16(%edx)
1682 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1685 ja L(Shl13LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1691 movaps 3(%eax), %xmm2
1692 movaps 19(%eax), %xmm3
1693 palignr $13, %xmm2, %xmm3
1694 palignr $13, %xmm1, %xmm2
1696 movaps %xmm2, (%edx)
1697 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1698 lea 32(%edx, %ecx), %edx
1699 lea 32(%eax, %ecx), %eax
1701 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1706 L(sh_13_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1712 L(sh_13_no_prefetch_loop):
1713 movdqa 16(%eax, %edi), %xmm2
1715 movdqa 32(%eax, %edi), %xmm3
1717 palignr $13, %xmm2, %xmm3
1718 palignr $13, %xmm1, %xmm2
1720 movdqa %xmm2, -32(%edx, %edi)
1721 movdqa %xmm3, -16(%edx, %edi)
1722 jb L(sh_13_end_no_prefetch_loop)
1724 movdqa 16(%eax, %edi), %xmm2
1726 movdqa 32(%eax, %edi), %xmm3
1728 palignr $13, %xmm2, %xmm3
1729 palignr $13, %xmm4, %xmm2
1731 movdqa %xmm2, -32(%edx, %edi)
1732 movdqa %xmm3, -16(%edx, %edi)
1733 jae L(sh_13_no_prefetch_loop)
1735 L(sh_13_end_no_prefetch_loop):
/* %eax = %eax + %edi + 13, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1739 lea 13(%edi, %eax), %eax
1741 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 14 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 14 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $14 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   14 bytes into the lower one, i.e. one full destination block.  */
1747 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1748 movaps -14(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1750 movl DEST+4(%esp), %edi
1751 movaps -14(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1752 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1754 # ifdef DATA_CACHE_SIZE_HALF
1755 cmp $DATA_CACHE_SIZE_HALF, %ecx
1759 add $_GLOBAL_OFFSET_TABLE_, %ebx
1760 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1762 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1765 jb L(sh_14_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1771 prefetcht0 0x1c0(%eax)
1772 prefetcht0 0x1c0(%edx)
1773 movaps 2(%eax), %xmm2
1774 movaps 18(%eax), %xmm3
1775 movaps 34(%eax), %xmm4
1776 movaps 50(%eax), %xmm5
1778 palignr $14, %xmm4, %xmm5
1779 palignr $14, %xmm3, %xmm4
1780 movaps %xmm5, 48(%edx)
1781 palignr $14, %xmm2, %xmm3
1783 palignr $14, %xmm1, %xmm2
1784 movaps %xmm4, 32(%edx)
1785 movaps %xmm3, 16(%edx)
1787 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1790 ja L(Shl14LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1796 movaps 2(%eax), %xmm2
1797 movaps 18(%eax), %xmm3
1798 palignr $14, %xmm2, %xmm3
1799 palignr $14, %xmm1, %xmm2
1801 movaps %xmm2, (%edx)
1802 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1803 lea 32(%edx, %ecx), %edx
1804 lea 32(%eax, %ecx), %eax
1806 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1811 L(sh_14_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1817 L(sh_14_no_prefetch_loop):
1818 movdqa 16(%eax, %edi), %xmm2
1820 movdqa 32(%eax, %edi), %xmm3
1822 palignr $14, %xmm2, %xmm3
1823 palignr $14, %xmm1, %xmm2
1825 movdqa %xmm2, -32(%edx, %edi)
1826 movdqa %xmm3, -16(%edx, %edi)
1827 jb L(sh_14_end_no_prefetch_loop)
1829 movdqa 16(%eax, %edi), %xmm2
1831 movdqa 32(%eax, %edi), %xmm3
1833 palignr $14, %xmm2, %xmm3
1834 palignr $14, %xmm4, %xmm2
1836 movdqa %xmm2, -32(%edx, %edi)
1837 movdqa %xmm3, -16(%edx, %edi)
1838 jae L(sh_14_no_prefetch_loop)
1840 L(sh_14_end_no_prefetch_loop):
/* %eax = %eax + %edi + 14, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1844 lea 14(%edi, %eax), %eax
1846 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Forward copy for a source pointer 15 bytes past a 16-byte boundary.
   movaps/movdqa fault on unaligned memory operands, so the loads rely on
   %eax - 15 (and %eax + %edi in the no-prefetch loop) being 16-byte aligned
   and the stores on the destination being aligned.  palignr $15 joins two
   neighbouring aligned source blocks and keeps the 16 bytes that start
   15 bytes into the lower one, i.e. one full destination block.  */
1852 # ifndef USE_AS_MEMMOVE
/* %xmm1 = aligned block that contains the byte at %eax.  */
1853 movaps -15(%eax), %xmm1
/* %edi = DEST argument reloaded from the stack.  */
1855 movl DEST+4(%esp), %edi
1856 movaps -15(%eax), %xmm1
/* Unaligned store of %xmm0 (set outside the visible lines) to that
   address.  */
1857 movdqu %xmm0, (%edi)
/* Pick the loop by comparing %ecx with half the data cache size, taken
   from a build-time constant or from __x86_data_cache_size_half.  */
1859 # ifdef DATA_CACHE_SIZE_HALF
1860 cmp $DATA_CACHE_SIZE_HALF, %ecx
1864 add $_GLOBAL_OFFSET_TABLE_, %ebx
1865 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1867 cmp __x86_data_cache_size_half, %ecx
/* Unsigned below: take the path without prefetching.  */
1870 jb L(sh_15_no_prefetch)
/* 64 bytes per pass: prefetch 0x1c0 bytes ahead on both streams, load four
   aligned blocks, then realign from the highest register down so each
   lower block is still intact when it serves as the low half.  */
1876 prefetcht0 0x1c0(%eax)
1877 prefetcht0 0x1c0(%edx)
1878 movaps 1(%eax), %xmm2
1879 movaps 17(%eax), %xmm3
1880 movaps 33(%eax), %xmm4
1881 movaps 49(%eax), %xmm5
1883 palignr $15, %xmm4, %xmm5
1884 palignr $15, %xmm3, %xmm4
1885 movaps %xmm5, 48(%edx)
1886 palignr $15, %xmm2, %xmm3
1888 palignr $15, %xmm1, %xmm2
1889 movaps %xmm4, 32(%edx)
1890 movaps %xmm3, 16(%edx)
1892 movaps %xmm2, (%edx)
/* Back to the loop head (label outside this view) while the flag-setting
   instruction before this branch reports unsigned "above".  */
1895 ja L(Shl15LoopStart)
/* 32-byte step using the same splice on two blocks.  */
1901 movaps 1(%eax), %xmm2
1902 movaps 17(%eax), %xmm3
1903 palignr $15, %xmm2, %xmm3
1904 palignr $15, %xmm1, %xmm2
1906 movaps %xmm2, (%edx)
1907 movaps %xmm3, 16(%edx)
/* Advance both pointers by 32 + %ecx and dispatch on %ecx; presumably the
   tail handlers then address the last bytes at negative offsets.  */
1908 lea 32(%edx, %ecx), %edx
1909 lea 32(%eax, %ecx), %eax
1911 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1916 L(sh_15_no_prefetch):
/* Loop without prefetching, two 32-byte halves per pass.  Loads come from
   16/32(%eax, %edi), stores go to -32/-16(%edx, %edi).  The first half
   splices against %xmm1 and the second against %xmm4; each is expected to
   hold the last block loaded by the other half (those register copies are
   not visible in this view).  jb/jae test flags set by an instruction not
   shown here; the moves and palignr in between leave the flags alone.  */
1922 L(sh_15_no_prefetch_loop):
1923 movdqa 16(%eax, %edi), %xmm2
1925 movdqa 32(%eax, %edi), %xmm3
1927 palignr $15, %xmm2, %xmm3
1928 palignr $15, %xmm1, %xmm2
1930 movdqa %xmm2, -32(%edx, %edi)
1931 movdqa %xmm3, -16(%edx, %edi)
1932 jb L(sh_15_end_no_prefetch_loop)
1934 movdqa 16(%eax, %edi), %xmm2
1936 movdqa 32(%eax, %edi), %xmm3
1938 palignr $15, %xmm2, %xmm3
1939 palignr $15, %xmm4, %xmm2
1941 movdqa %xmm2, -32(%edx, %edi)
1942 movdqa %xmm3, -16(%edx, %edi)
1943 jae L(sh_15_no_prefetch_loop)
1945 L(sh_15_end_no_prefetch_loop):
/* %eax = %eax + %edi + 15, presumably turning the aligned cursor back into
   the true source pointer; then dispatch the tail on %ecx.  */
1949 lea 15(%edi, %eax), %eax
1951 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Advance the destination (%edx) and source (%eax) pointers by %ecx, then
   dispatch on %ecx through the forward tail table (4-byte entries).  lea
   leaves the flags untouched.  */
1958 lea (%edx, %ecx), %edx
1959 lea (%eax, %ecx), %eax
1961 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
/* Fall-through ladder for tail lengths 44, 36, 28, 20, 12 and 4.  Each rung
   copies the 8 bytes that start K bytes before the end pointers %eax
   (source) and %edx (destination) through %xmm0, then drops into the rung
   for K - 8.  */
1964 L(fwd_write_44bytes):
1965 movq -44(%eax), %xmm0
1966 movq %xmm0, -44(%edx)
1967 L(fwd_write_36bytes):
1968 movq -36(%eax), %xmm0
1969 movq %xmm0, -36(%edx)
1970 L(fwd_write_28bytes):
1971 movq -28(%eax), %xmm0
1972 movq %xmm0, -28(%edx)
1973 L(fwd_write_20bytes):
1974 movq -20(%eax), %xmm0
1975 movq %xmm0, -20(%edx)
1976 L(fwd_write_12bytes):
1977 movq -12(%eax), %xmm0
1978 movq %xmm0, -12(%edx)
1979 L(fwd_write_4bytes):
1982 # ifndef USE_AS_BCOPY
1983 # ifdef USE_AS_MEMPCPY
/* Reload the DEST argument into %eax, presumably as the value to return.  */
1986 movl DEST(%esp), %eax
/* Fall-through ladder for tail lengths 40, 32, 24, 16, 8 and 0.  Each rung
   copies the 8 bytes that start K bytes before the end pointers %eax
   (source) and %edx (destination) through %xmm0, then drops into the rung
   for K - 8; the 0-byte rung copies nothing.  */
1992 L(fwd_write_40bytes):
1993 movq -40(%eax), %xmm0
1994 movq %xmm0, -40(%edx)
1995 L(fwd_write_32bytes):
1996 movq -32(%eax), %xmm0
1997 movq %xmm0, -32(%edx)
1998 L(fwd_write_24bytes):
1999 movq -24(%eax), %xmm0
2000 movq %xmm0, -24(%edx)
2001 L(fwd_write_16bytes):
2002 movq -16(%eax), %xmm0
2003 movq %xmm0, -16(%edx)
2004 L(fwd_write_8bytes):
2005 movq -8(%eax), %xmm0
2006 movq %xmm0, -8(%edx)
2007 L(fwd_write_0bytes):
2008 # ifndef USE_AS_BCOPY
2009 # ifdef USE_AS_MEMPCPY
/* Reload the DEST argument into %eax, presumably as the value to return.  */
2012 movl DEST(%esp), %eax
/* Forward tail handler for 5 bytes (entry 5 of
   L(table_48bytes_fwd)).  */
2018 L(fwd_write_5bytes):
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2023 # ifndef USE_AS_BCOPY
2024 # ifdef USE_AS_MEMPCPY
2027 movl DEST(%esp), %eax
/* Forward tail handlers for 45, 37, 29, 21 and 13 bytes, reached
   through L(table_48bytes_fwd).  Each label copies one 8-byte chunk
   from a negative offset of %eax (source) to the same offset of %edx
   (destination) via %xmm0 and falls through to the next smaller
   size.  */
2033 L(fwd_write_45bytes):
2034 movq -45(%eax), %xmm0
2035 movq %xmm0, -45(%edx)
2036 L(fwd_write_37bytes):
2037 movq -37(%eax), %xmm0
2038 movq %xmm0, -37(%edx)
2039 L(fwd_write_29bytes):
2040 movq -29(%eax), %xmm0
2041 movq %xmm0, -29(%edx)
2042 L(fwd_write_21bytes):
2043 movq -21(%eax), %xmm0
2044 movq %xmm0, -21(%edx)
2045 L(fwd_write_13bytes):
2046 movq -13(%eax), %xmm0
2047 movq %xmm0, -13(%edx)
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2050 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2052 # ifndef USE_AS_BCOPY
2053 # ifdef USE_AS_MEMPCPY
2056 movl DEST(%esp), %eax
/* Forward tail handlers for 41, 33, 25, 17, 9 and 1 bytes, reached
   through L(table_48bytes_fwd).  Each of the first five labels copies
   one 8-byte chunk from a negative offset of %eax (source) to the same
   offset of %edx (destination) via %xmm0 and falls through.  */
2062 L(fwd_write_41bytes):
2063 movq -41(%eax), %xmm0
2064 movq %xmm0, -41(%edx)
2065 L(fwd_write_33bytes):
2066 movq -33(%eax), %xmm0
2067 movq %xmm0, -33(%edx)
2068 L(fwd_write_25bytes):
2069 movq -25(%eax), %xmm0
2070 movq %xmm0, -25(%edx)
2071 L(fwd_write_17bytes):
2072 movq -17(%eax), %xmm0
2073 movq %xmm0, -17(%edx)
2074 L(fwd_write_9bytes):
2075 movq -9(%eax), %xmm0
2076 movq %xmm0, -9(%edx)
2077 L(fwd_write_1bytes):
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2078 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2080 # ifndef USE_AS_BCOPY
2081 # ifdef USE_AS_MEMPCPY
2084 movl DEST(%esp), %eax
/* Forward tail handlers for 46, 38, 30, 22, 14 and 6 bytes, reached
   through L(table_48bytes_fwd).  Each of the first five labels copies
   one 8-byte chunk from a negative offset of %eax (source) to the same
   offset of %edx (destination) via %xmm0 and falls through.  */
2090 L(fwd_write_46bytes):
2091 movq -46(%eax), %xmm0
2092 movq %xmm0, -46(%edx)
2093 L(fwd_write_38bytes):
2094 movq -38(%eax), %xmm0
2095 movq %xmm0, -38(%edx)
2096 L(fwd_write_30bytes):
2097 movq -30(%eax), %xmm0
2098 movq %xmm0, -30(%edx)
2099 L(fwd_write_22bytes):
2100 movq -22(%eax), %xmm0
2101 movq %xmm0, -22(%edx)
2102 L(fwd_write_14bytes):
2103 movq -14(%eax), %xmm0
2104 movq %xmm0, -14(%edx)
2105 L(fwd_write_6bytes):
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2108 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2110 # ifndef USE_AS_BCOPY
2111 # ifdef USE_AS_MEMPCPY
2114 movl DEST(%esp), %eax
/* Forward tail handlers for 42, 34, 26, 18, 10 and 2 bytes, reached
   through L(table_48bytes_fwd).  Each of the first five labels copies
   one 8-byte chunk from a negative offset of %eax (source) to the same
   offset of %edx (destination) via %xmm0 and falls through.  */
2120 L(fwd_write_42bytes):
2121 movq -42(%eax), %xmm0
2122 movq %xmm0, -42(%edx)
2123 L(fwd_write_34bytes):
2124 movq -34(%eax), %xmm0
2125 movq %xmm0, -34(%edx)
2126 L(fwd_write_26bytes):
2127 movq -26(%eax), %xmm0
2128 movq %xmm0, -26(%edx)
2129 L(fwd_write_18bytes):
2130 movq -18(%eax), %xmm0
2131 movq %xmm0, -18(%edx)
2132 L(fwd_write_10bytes):
2133 movq -10(%eax), %xmm0
2134 movq %xmm0, -10(%edx)
2135 L(fwd_write_2bytes):
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2136 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2138 # ifndef USE_AS_BCOPY
2139 # ifdef USE_AS_MEMPCPY
2142 movl DEST(%esp), %eax
/* Forward tail handlers for 47, 39, 31, 23, 15 and 7 bytes, reached
   through L(table_48bytes_fwd).  Each of the first five labels copies
   one 8-byte chunk from a negative offset of %eax (source) to the same
   offset of %edx (destination) via %xmm0 and falls through.  */
2148 L(fwd_write_47bytes):
2149 movq -47(%eax), %xmm0
2150 movq %xmm0, -47(%edx)
2151 L(fwd_write_39bytes):
2152 movq -39(%eax), %xmm0
2153 movq %xmm0, -39(%edx)
2154 L(fwd_write_31bytes):
2155 movq -31(%eax), %xmm0
2156 movq %xmm0, -31(%edx)
2157 L(fwd_write_23bytes):
2158 movq -23(%eax), %xmm0
2159 movq %xmm0, -23(%edx)
2160 L(fwd_write_15bytes):
2161 movq -15(%eax), %xmm0
2162 movq %xmm0, -15(%edx)
2163 L(fwd_write_7bytes):
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2166 movzwl -3(%eax), %ecx
2167 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2170 # ifndef USE_AS_BCOPY
2171 # ifdef USE_AS_MEMPCPY
2174 movl DEST(%esp), %eax
/* Forward tail handlers for 43, 35, 27, 19, 11 and 3 bytes, reached
   through L(table_48bytes_fwd).  Each of the first five labels copies
   one 8-byte chunk from a negative offset of %eax (source) to the same
   offset of %edx (destination) via %xmm0 and falls through.  */
2180 L(fwd_write_43bytes):
2181 movq -43(%eax), %xmm0
2182 movq %xmm0, -43(%edx)
2183 L(fwd_write_35bytes):
2184 movq -35(%eax), %xmm0
2185 movq %xmm0, -35(%edx)
2186 L(fwd_write_27bytes):
2187 movq -27(%eax), %xmm0
2188 movq %xmm0, -27(%edx)
2189 L(fwd_write_19bytes):
2190 movq -19(%eax), %xmm0
2191 movq %xmm0, -19(%edx)
2192 L(fwd_write_11bytes):
2193 movq -11(%eax), %xmm0
2194 movq %xmm0, -11(%edx)
2195 L(fwd_write_3bytes):
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2196 movzwl -3(%eax), %ecx
2197 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2200 # ifndef USE_AS_BCOPY
2201 # ifdef USE_AS_MEMPCPY
2204 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 40, 24, 8 and 0 bytes, reached
   through L(table_48bytes_fwd_align).  The 40- and 24-byte labels copy
   16 bytes each with movdqa, the 8-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 40 and %edx - 40 must be aligned.
   NOTE(review): that alignment is presumably guaranteed by the code
   that dispatches through the _align table -- confirm.  */
2210 L(fwd_write_40bytes_align):
2211 movdqa -40(%eax), %xmm0
2212 movdqa %xmm0, -40(%edx)
2213 L(fwd_write_24bytes_align):
2214 movdqa -24(%eax), %xmm0
2215 movdqa %xmm0, -24(%edx)
2216 L(fwd_write_8bytes_align):
2217 movq -8(%eax), %xmm0
2218 movq %xmm0, -8(%edx)
2219 L(fwd_write_0bytes_align):
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2220 # ifndef USE_AS_BCOPY
2221 # ifdef USE_AS_MEMPCPY
2224 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 32 and 16 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 32-byte entry falls
   through into the 16-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2230 L(fwd_write_32bytes_align):
2231 movdqa -32(%eax), %xmm0
2232 movdqa %xmm0, -32(%edx)
2233 L(fwd_write_16bytes_align):
2234 movdqa -16(%eax), %xmm0
2235 movdqa %xmm0, -16(%edx)
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2236 # ifndef USE_AS_BCOPY
2237 # ifdef USE_AS_MEMPCPY
2240 movl DEST(%esp), %eax
/* Aligned forward tail handler for 5 bytes (entry 5 of
   L(table_48bytes_fwd_align)).  */
2246 L(fwd_write_5bytes_align):
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2251 # ifndef USE_AS_BCOPY
2252 # ifdef USE_AS_MEMPCPY
2255 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 45, 29 and 13 bytes, reached
   through L(table_48bytes_fwd_align).  The 45- and 29-byte labels copy
   16 bytes each with movdqa, the 13-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 45 and %edx - 45 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2261 L(fwd_write_45bytes_align):
2262 movdqa -45(%eax), %xmm0
2263 movdqa %xmm0, -45(%edx)
2264 L(fwd_write_29bytes_align):
2265 movdqa -29(%eax), %xmm0
2266 movdqa %xmm0, -29(%edx)
2267 L(fwd_write_13bytes_align):
2268 movq -13(%eax), %xmm0
2269 movq %xmm0, -13(%edx)
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2272 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2274 # ifndef USE_AS_BCOPY
2275 # ifdef USE_AS_MEMPCPY
2278 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 37 and 21 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 37-byte entry falls
   through into the 21-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2284 L(fwd_write_37bytes_align):
2285 movdqa -37(%eax), %xmm0
2286 movdqa %xmm0, -37(%edx)
2287 L(fwd_write_21bytes_align):
2288 movdqa -21(%eax), %xmm0
2289 movdqa %xmm0, -21(%edx)
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2292 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2294 # ifndef USE_AS_BCOPY
2295 # ifdef USE_AS_MEMPCPY
2298 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 41, 25, 9 and 1 bytes, reached
   through L(table_48bytes_fwd_align).  The 41- and 25-byte labels copy
   16 bytes each with movdqa, the 9-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 41 and %edx - 41 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2304 L(fwd_write_41bytes_align):
2305 movdqa -41(%eax), %xmm0
2306 movdqa %xmm0, -41(%edx)
2307 L(fwd_write_25bytes_align):
2308 movdqa -25(%eax), %xmm0
2309 movdqa %xmm0, -25(%edx)
2310 L(fwd_write_9bytes_align):
2311 movq -9(%eax), %xmm0
2312 movq %xmm0, -9(%edx)
2313 L(fwd_write_1bytes_align):
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2314 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2316 # ifndef USE_AS_BCOPY
2317 # ifdef USE_AS_MEMPCPY
2320 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 33 and 17 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 33-byte entry falls
   through into the 17-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2326 L(fwd_write_33bytes_align):
2327 movdqa -33(%eax), %xmm0
2328 movdqa %xmm0, -33(%edx)
2329 L(fwd_write_17bytes_align):
2330 movdqa -17(%eax), %xmm0
2331 movdqa %xmm0, -17(%edx)
/* Load the last source byte (offset -1), zero-extended, into %ecx.  */
2332 movzbl -1(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2334 # ifndef USE_AS_BCOPY
2335 # ifdef USE_AS_MEMPCPY
2338 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 46, 30, 14 and 6 bytes, reached
   through L(table_48bytes_fwd_align).  The 46- and 30-byte labels copy
   16 bytes each with movdqa, the 14-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 46 and %edx - 46 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2344 L(fwd_write_46bytes_align):
2345 movdqa -46(%eax), %xmm0
2346 movdqa %xmm0, -46(%edx)
2347 L(fwd_write_30bytes_align):
2348 movdqa -30(%eax), %xmm0
2349 movdqa %xmm0, -30(%edx)
2350 L(fwd_write_14bytes_align):
2351 movq -14(%eax), %xmm0
2352 movq %xmm0, -14(%edx)
2353 L(fwd_write_6bytes_align):
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2356 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2358 # ifndef USE_AS_BCOPY
2359 # ifdef USE_AS_MEMPCPY
2362 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 38 and 22 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 38-byte entry falls
   through into the 22-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2368 L(fwd_write_38bytes_align):
2369 movdqa -38(%eax), %xmm0
2370 movdqa %xmm0, -38(%edx)
2371 L(fwd_write_22bytes_align):
2372 movdqa -22(%eax), %xmm0
2373 movdqa %xmm0, -22(%edx)
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2376 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2378 # ifndef USE_AS_BCOPY
2379 # ifdef USE_AS_MEMPCPY
2382 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 42, 26, 10 and 2 bytes, reached
   through L(table_48bytes_fwd_align).  The 42- and 26-byte labels copy
   16 bytes each with movdqa, the 10-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 42 and %edx - 42 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2388 L(fwd_write_42bytes_align):
2389 movdqa -42(%eax), %xmm0
2390 movdqa %xmm0, -42(%edx)
2391 L(fwd_write_26bytes_align):
2392 movdqa -26(%eax), %xmm0
2393 movdqa %xmm0, -26(%edx)
2394 L(fwd_write_10bytes_align):
2395 movq -10(%eax), %xmm0
2396 movq %xmm0, -10(%edx)
2397 L(fwd_write_2bytes_align):
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2398 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2400 # ifndef USE_AS_BCOPY
2401 # ifdef USE_AS_MEMPCPY
2404 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 34 and 18 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 34-byte entry falls
   through into the 18-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2410 L(fwd_write_34bytes_align):
2411 movdqa -34(%eax), %xmm0
2412 movdqa %xmm0, -34(%edx)
2413 L(fwd_write_18bytes_align):
2414 movdqa -18(%eax), %xmm0
2415 movdqa %xmm0, -18(%edx)
/* Load the last 16-bit source word (offset -2), zero-extended, into
   %ecx.  */
2416 movzwl -2(%eax), %ecx
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2418 # ifndef USE_AS_BCOPY
2419 # ifdef USE_AS_MEMPCPY
2422 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 47, 31, 15 and 7 bytes, reached
   through L(table_48bytes_fwd_align).  The 47- and 31-byte labels copy
   16 bytes each with movdqa, the 15-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 47 and %edx - 47 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2428 L(fwd_write_47bytes_align):
2429 movdqa -47(%eax), %xmm0
2430 movdqa %xmm0, -47(%edx)
2431 L(fwd_write_31bytes_align):
2432 movdqa -31(%eax), %xmm0
2433 movdqa %xmm0, -31(%edx)
2434 L(fwd_write_15bytes_align):
2435 movq -15(%eax), %xmm0
2436 movq %xmm0, -15(%edx)
2437 L(fwd_write_7bytes_align):
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2440 movzwl -3(%eax), %ecx
2441 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2444 # ifndef USE_AS_BCOPY
2445 # ifdef USE_AS_MEMPCPY
2448 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 39 and 23 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 39-byte entry falls
   through into the 23-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2454 L(fwd_write_39bytes_align):
2455 movdqa -39(%eax), %xmm0
2456 movdqa %xmm0, -39(%edx)
2457 L(fwd_write_23bytes_align):
2458 movdqa -23(%eax), %xmm0
2459 movdqa %xmm0, -23(%edx)
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2462 movzwl -3(%eax), %ecx
2463 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2466 # ifndef USE_AS_BCOPY
2467 # ifdef USE_AS_MEMPCPY
2470 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 43, 27, 11 and 3 bytes, reached
   through L(table_48bytes_fwd_align).  The 43- and 27-byte labels copy
   16 bytes each with movdqa, the 11-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 43 and %edx - 43 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2476 L(fwd_write_43bytes_align):
2477 movdqa -43(%eax), %xmm0
2478 movdqa %xmm0, -43(%edx)
2479 L(fwd_write_27bytes_align):
2480 movdqa -27(%eax), %xmm0
2481 movdqa %xmm0, -27(%edx)
2482 L(fwd_write_11bytes_align):
2483 movq -11(%eax), %xmm0
2484 movq %xmm0, -11(%edx)
2485 L(fwd_write_3bytes_align):
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2486 movzwl -3(%eax), %ecx
2487 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2490 # ifndef USE_AS_BCOPY
2491 # ifdef USE_AS_MEMPCPY
2494 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 35 and 19 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 35-byte entry falls
   through into the 19-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2500 L(fwd_write_35bytes_align):
2501 movdqa -35(%eax), %xmm0
2502 movdqa %xmm0, -35(%edx)
2503 L(fwd_write_19bytes_align):
2504 movdqa -19(%eax), %xmm0
2505 movdqa %xmm0, -19(%edx)
/* Load the 16-bit word at offset -3 into %ecx and the byte at offset
   -1 into %eax.  The byte load overwrites the source pointer, so it
   must be the last read through %eax.  */
2506 movzwl -3(%eax), %ecx
2507 movzbl -1(%eax), %eax
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2510 # ifndef USE_AS_BCOPY
2511 # ifdef USE_AS_MEMPCPY
2514 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 44, 28, 12 and 4 bytes, reached
   through L(table_48bytes_fwd_align).  The 44- and 28-byte labels copy
   16 bytes each with movdqa, the 12-byte label copies 8 bytes with
   movq; each falls through to the next.  movdqa requires 16-byte
   aligned addresses, so %eax - 44 and %edx - 44 must be aligned.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2520 L(fwd_write_44bytes_align):
2521 movdqa -44(%eax), %xmm0
2522 movdqa %xmm0, -44(%edx)
2523 L(fwd_write_28bytes_align):
2524 movdqa -28(%eax), %xmm0
2525 movdqa %xmm0, -28(%edx)
2526 L(fwd_write_12bytes_align):
2527 movq -12(%eax), %xmm0
2528 movq %xmm0, -12(%edx)
2529 L(fwd_write_4bytes_align):
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2532 # ifndef USE_AS_BCOPY
2533 # ifdef USE_AS_MEMPCPY
2536 movl DEST(%esp), %eax
/* Aligned forward tail handlers for 36 and 20 bytes, reached through
   L(table_48bytes_fwd_align).  Each label copies 16 bytes with movdqa
   from %eax (source) to %edx (destination); the 36-byte entry falls
   through into the 20-byte one.  movdqa requires 16-byte aligned
   addresses.
   NOTE(review): alignment is presumably guaranteed by the dispatching
   code -- confirm.  */
2542 L(fwd_write_36bytes_align):
2543 movdqa -36(%eax), %xmm0
2544 movdqa %xmm0, -36(%edx)
2545 L(fwd_write_20bytes_align):
2546 movdqa -20(%eax), %xmm0
2547 movdqa %xmm0, -20(%edx)
/* Exit: %eax is set from the DEST stack slot, subject to the
   USE_AS_BCOPY / USE_AS_MEMPCPY conditionals.  */
2550 # ifndef USE_AS_BCOPY
2551 # ifdef USE_AS_MEMPCPY
2554 movl DEST(%esp), %eax
/* Large-copy path using non-temporal stores.  Source data is read
   with unaligned loads (movdqu) from %eax and written with movntdq to
   %edx; movntdq requires a 16-byte aligned destination.  %ecx carries
   the byte count, kept biased as noted below.  */
2563 movdqu (%eax), %xmm1
2564 # ifdef USE_AS_MEMMOVE
/* For memmove, store %xmm0 to the address read from the DEST + 4
   stack slot.
   NOTE(review): %xmm0 is presumably the first 16 source bytes saved
   earlier, and the + 4 presumably accounts for a register pushed
   after entry -- confirm against the code above this view.  */
2565 movl DEST+4(%esp), %edi
2566 movdqu %xmm0, (%edi)
2569 movntdq %xmm1, (%edx)
/* %ecx -= 0x90 (144 = 16 + 128).  */
2571 lea -0x90(%ecx), %ecx
/* Main loop body: load 128 bytes into %xmm0-%xmm7, advance the
   source, store 128 bytes non-temporally, advance the destination.  */
2576 movdqu (%eax), %xmm0
2577 movdqu 0x10(%eax), %xmm1
2578 movdqu 0x20(%eax), %xmm2
2579 movdqu 0x30(%eax), %xmm3
2580 movdqu 0x40(%eax), %xmm4
2581 movdqu 0x50(%eax), %xmm5
2582 movdqu 0x60(%eax), %xmm6
2583 movdqu 0x70(%eax), %xmm7
2584 lea 0x80(%eax), %eax
2587 movntdq %xmm0, (%edx)
2588 movntdq %xmm1, 0x10(%edx)
2589 movntdq %xmm2, 0x20(%edx)
2590 movntdq %xmm3, 0x30(%edx)
2591 movntdq %xmm4, 0x40(%edx)
2592 movntdq %xmm5, 0x50(%edx)
2593 movntdq %xmm6, 0x60(%edx)
2594 movntdq %xmm7, 0x70(%edx)
2595 lea 0x80(%edx), %edx
/* Neither the SSE moves nor lea modify EFLAGS, so this branch tests
   flags set by an earlier instruction.  */
2596 jae L(large_page_loop)
/* Remove the 128-byte bias from %ecx.  lea leaves EFLAGS untouched,
   so the jl below still tests flags set before this lea.  */
2598 lea 0x80(%ecx), %ecx
2599 jl L(large_page_less_64bytes)
/* Copy one 64-byte block.  */
2601 movdqu (%eax), %xmm0
2602 movdqu 0x10(%eax), %xmm1
2603 movdqu 0x20(%eax), %xmm2
2604 movdqu 0x30(%eax), %xmm3
2605 lea 0x40(%eax), %eax
2607 movntdq %xmm0, (%edx)
2608 movntdq %xmm1, 0x10(%edx)
2609 movntdq %xmm2, 0x20(%edx)
2610 movntdq %xmm3, 0x30(%edx)
2611 lea 0x40(%edx), %edx
2613 L(large_page_less_64bytes):
2615 jb L(large_page_less_32bytes)
/* Copy one 32-byte block.  */
2616 movdqu (%eax), %xmm0
2617 movdqu 0x10(%eax), %xmm1
2618 lea 0x20(%eax), %eax
2619 movntdq %xmm0, (%edx)
2620 movntdq %xmm1, 0x10(%edx)
2621 lea 0x20(%edx), %edx
2623 L(large_page_less_32bytes):
/* Finish the remaining bytes through the forward tail table, indexed
   by %ecx.  */
2627 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
/* Backward tail handlers for 44, 36, 28, 20 and 12 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the N-byte label copies the 8-byte
   chunk at offset N - 8 via %xmm0 and falls through to the label for
   the size 8 bytes smaller, so higher addresses are written first.  */
2630 L(bk_write_44bytes):
2631 movq 36(%eax), %xmm0
2632 movq %xmm0, 36(%edx)
2633 L(bk_write_36bytes):
2634 movq 28(%eax), %xmm0
2635 movq %xmm0, 28(%edx)
2636 L(bk_write_28bytes):
2637 movq 20(%eax), %xmm0
2638 movq %xmm0, 20(%edx)
2639 L(bk_write_20bytes):
2640 movq 12(%eax), %xmm0
2641 movq %xmm0, 12(%edx)
2642 L(bk_write_12bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2649 # ifndef USE_AS_BCOPY
2650 movl DEST(%esp), %eax
2651 # ifdef USE_AS_MEMPCPY
2652 movl LEN(%esp), %ecx
/* Backward tail handlers for 40, 32, 24 and 16 bytes, reached through
   L(table_48_bytes_bwd).  Offsets are positive from %eax (source) and
   %edx (destination): the 40-, 32- and 24-byte labels each copy the
   8-byte chunk at offset N - 8 via %xmm0 and fall through to the label
   for the size 8 bytes smaller.  */
2659 L(bk_write_40bytes):
2660 movq 32(%eax), %xmm0
2661 movq %xmm0, 32(%edx)
2662 L(bk_write_32bytes):
2663 movq 24(%eax), %xmm0
2664 movq %xmm0, 24(%edx)
2665 L(bk_write_24bytes):
2666 movq 16(%eax), %xmm0
2667 movq %xmm0, 16(%edx)
2668 L(bk_write_16bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2674 # ifndef USE_AS_BCOPY
2675 movl DEST(%esp), %eax
2676 # ifdef USE_AS_MEMPCPY
2677 movl LEN(%esp), %ecx
/* Backward tail handlers for 45, 37, 29, 21 and 13 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the N-byte label copies the 8-byte
   chunk at offset N - 8 via %xmm0 and falls through to the label for
   the size 8 bytes smaller.  */
2684 L(bk_write_45bytes):
2685 movq 37(%eax), %xmm0
2686 movq %xmm0, 37(%edx)
2687 L(bk_write_37bytes):
2688 movq 29(%eax), %xmm0
2689 movq %xmm0, 29(%edx)
2690 L(bk_write_29bytes):
2691 movq 21(%eax), %xmm0
2692 movq %xmm0, 21(%edx)
2693 L(bk_write_21bytes):
2694 movq 13(%eax), %xmm0
2695 movq %xmm0, 13(%edx)
2696 L(bk_write_13bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2705 # ifndef USE_AS_BCOPY
2706 movl DEST(%esp), %eax
2707 # ifdef USE_AS_MEMPCPY
2708 movl LEN(%esp), %ecx
/* Backward tail handlers for 41, 33, 25 and 17 bytes, reached through
   L(table_48_bytes_bwd).  Offsets are positive from %eax (source) and
   %edx (destination): the 41-, 33- and 25-byte labels each copy the
   8-byte chunk at offset N - 8 via %xmm0 and fall through to the label
   for the size 8 bytes smaller.  */
2715 L(bk_write_41bytes):
2716 movq 33(%eax), %xmm0
2717 movq %xmm0, 33(%edx)
2718 L(bk_write_33bytes):
2719 movq 25(%eax), %xmm0
2720 movq %xmm0, 25(%edx)
2721 L(bk_write_25bytes):
2722 movq 17(%eax), %xmm0
2723 movq %xmm0, 17(%edx)
2724 L(bk_write_17bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2732 # ifndef USE_AS_BCOPY
2733 movl DEST(%esp), %eax
2734 # ifdef USE_AS_MEMPCPY
2735 movl LEN(%esp), %ecx
/* Backward tail handlers for 46, 38, 30, 22 and 14 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the N-byte label copies the 8-byte
   chunk at offset N - 8 via %xmm0 and falls through to the label for
   the size 8 bytes smaller.  */
2742 L(bk_write_46bytes):
2743 movq 38(%eax), %xmm0
2744 movq %xmm0, 38(%edx)
2745 L(bk_write_38bytes):
2746 movq 30(%eax), %xmm0
2747 movq %xmm0, 30(%edx)
2748 L(bk_write_30bytes):
2749 movq 22(%eax), %xmm0
2750 movq %xmm0, 22(%edx)
2751 L(bk_write_22bytes):
2752 movq 14(%eax), %xmm0
2753 movq %xmm0, 14(%edx)
2754 L(bk_write_14bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2762 # ifndef USE_AS_BCOPY
2763 movl DEST(%esp), %eax
2764 # ifdef USE_AS_MEMPCPY
2765 movl LEN(%esp), %ecx
/* Backward tail handlers for 42, 34, 26, 18 and 10 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the 42- to 18-byte labels each copy
   the 8-byte chunk at offset N - 8 via %xmm0 and fall through to the
   label for the size 8 bytes smaller.  */
2772 L(bk_write_42bytes):
2773 movq 34(%eax), %xmm0
2774 movq %xmm0, 34(%edx)
2775 L(bk_write_34bytes):
2776 movq 26(%eax), %xmm0
2777 movq %xmm0, 26(%edx)
2778 L(bk_write_26bytes):
2779 movq 18(%eax), %xmm0
2780 movq %xmm0, 18(%edx)
2781 L(bk_write_18bytes):
2782 movq 10(%eax), %xmm0
2783 movq %xmm0, 10(%edx)
2784 L(bk_write_10bytes):
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2790 # ifndef USE_AS_BCOPY
2791 movl DEST(%esp), %eax
2792 # ifdef USE_AS_MEMPCPY
2793 movl LEN(%esp), %ecx
/* Backward tail handlers for 47, 39, 31, 23 and 15 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the 47- to 23-byte labels each copy
   the 8-byte chunk at offset N - 8 via %xmm0 and fall through to the
   label for the size 8 bytes smaller.  */
2800 L(bk_write_47bytes):
2801 movq 39(%eax), %xmm0
2802 movq %xmm0, 39(%edx)
2803 L(bk_write_39bytes):
2804 movq 31(%eax), %xmm0
2805 movq %xmm0, 31(%edx)
2806 L(bk_write_31bytes):
2807 movq 23(%eax), %xmm0
2808 movq %xmm0, 23(%edx)
2809 L(bk_write_23bytes):
2810 movq 15(%eax), %xmm0
2811 movq %xmm0, 15(%edx)
2812 L(bk_write_15bytes):
/* Load the 16-bit source word at offset 1, zero-extended, into
   %ecx.  */
2818 movzwl 1(%eax), %ecx
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2822 # ifndef USE_AS_BCOPY
2823 movl DEST(%esp), %eax
2824 # ifdef USE_AS_MEMPCPY
2825 movl LEN(%esp), %ecx
/* Backward tail handlers for 43, 35, 27, 19 and 11 bytes, reached
   through L(table_48_bytes_bwd).  Offsets are positive from %eax
   (source) and %edx (destination): the 43- to 19-byte labels each copy
   the 8-byte chunk at offset N - 8 via %xmm0 and fall through to the
   label for the size 8 bytes smaller.  */
2832 L(bk_write_43bytes):
2833 movq 35(%eax), %xmm0
2834 movq %xmm0, 35(%edx)
2835 L(bk_write_35bytes):
2836 movq 27(%eax), %xmm0
2837 movq %xmm0, 27(%edx)
2838 L(bk_write_27bytes):
2839 movq 19(%eax), %xmm0
2840 movq %xmm0, 19(%edx)
2841 L(bk_write_19bytes):
2842 movq 11(%eax), %xmm0
2843 movq %xmm0, 11(%edx)
2844 L(bk_write_11bytes):
/* Load the 16-bit source word at offset 1, zero-extended, into
   %ecx.  */
2848 movzwl 1(%eax), %ecx
/* Exit: unless built as bcopy, load %eax from the DEST stack slot;
   for mempcpy also load the LEN stack slot into %ecx.  */
2852 # ifndef USE_AS_BCOPY
2853 movl DEST(%esp), %eax
2854 # ifdef USE_AS_MEMPCPY
2855 movl LEN(%esp), %ecx
/* The jump tables are emitted into the read-only, allocatable section
   .rodata.ssse3.  */
2862 .pushsection .rodata.ssse3,"a",@progbits
/* Forward tail dispatch table.  Entry N (N = 0..47) is the 32-bit
   offset of L(fwd_write_Nbytes) from the start of the table, as
   produced by JMPTBL; BRANCH_TO_JMPTBL_ENTRY adds the selected entry
   to the table address and jumps there.  */
2864 L(table_48bytes_fwd):
2865 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2866 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2878 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2879 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2880 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2881 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2882 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2883 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2884 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2885 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2886 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2887 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2888 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2889 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2890 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2891 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2892 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2893 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2894 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2895 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2896 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2897 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2898 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2899 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2900 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2901 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2902 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2903 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2904 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
/* Forward tail dispatch table for the aligned handlers.  Entry N
   (N = 0..47) is the 32-bit offset of L(fwd_write_Nbytes_align) from
   the start of this table, as produced by JMPTBL.  */
2915 L(table_48bytes_fwd_align):
2916 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2917 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2918 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2919 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2920 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2921 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2922 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2923 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2924 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2925 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2926 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2927 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2928 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2929 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2930 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2931 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2932 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2933 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2934 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2935 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2936 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2937 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2938 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2939 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2940 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2941 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2942 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2943 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2944 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2945 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2946 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2947 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2948 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2949 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2950 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2951 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2952 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2953 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2954 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2955 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
/* Entries of L(shl_table): entry K (K = 0..15) is the 32-bit offset of
   L(shl_K) from the start of the table, as produced by JMPTBL.
   NOTE(review): K is presumably the source misalignment relative to a
   16-byte boundary -- confirm against the dispatch site.  */
2967 .int JMPTBL (L(shl_0), L(shl_table))
2968 .int JMPTBL (L(shl_1), L(shl_table))
2969 .int JMPTBL (L(shl_2), L(shl_table))
2970 .int JMPTBL (L(shl_3), L(shl_table))
2971 .int JMPTBL (L(shl_4), L(shl_table))
2972 .int JMPTBL (L(shl_5), L(shl_table))
2973 .int JMPTBL (L(shl_6), L(shl_table))
2974 .int JMPTBL (L(shl_7), L(shl_table))
2975 .int JMPTBL (L(shl_8), L(shl_table))
2976 .int JMPTBL (L(shl_9), L(shl_table))
2977 .int JMPTBL (L(shl_10), L(shl_table))
2978 .int JMPTBL (L(shl_11), L(shl_table))
2979 .int JMPTBL (L(shl_12), L(shl_table))
2980 .int JMPTBL (L(shl_13), L(shl_table))
2981 .int JMPTBL (L(shl_14), L(shl_table))
2982 .int JMPTBL (L(shl_15), L(shl_table))
/* Backward tail dispatch table.  Entry N (N = 0..47) is the 32-bit
   offset of L(bk_write_Nbytes) from the start of this table, as
   produced by JMPTBL.  */
2985 L(table_48_bytes_bwd):
2986 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2987 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2988 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2989 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2990 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2991 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2992 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2993 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2994 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2995 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2996 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2997 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2998 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
2999 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3000 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3001 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3002 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3003 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3004 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3005 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3006 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3007 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3008 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3009 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3010 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3011 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3012 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3013 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3014 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3015 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3016 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3017 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3018 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3019 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3020 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3021 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3022 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3023 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3024 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3025 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
3037 # ifdef USE_AS_MEMMOVE
/* Backward copy (memmove only).  Advance %edx (destination) and %edi
   (source) by the count in %ecx, so that the copy below works with
   negative offsets from the end of both buffers.  */
3042 lea (%ecx,%edx,1),%edx
3043 lea (%ecx,%edi,1),%edi
/* lea does not modify EFLAGS, so this branch tests flags set by an
   earlier instruction.  */
3049 jae L(bk_write_more64bytes)
3051 L(bk_write_64bytesless):
3053 jb L(bk_write_less32bytes)
3055 L(bk_write_more32bytes):
/* Copy 32 bytes as four 8-byte chunks, highest address first, from
   below %edi to below %edx.  */
3056 /* Copy 32 bytes at a time. */
3058 movq -8(%edi), %xmm0
3059 movq %xmm0, -8(%edx)
3060 movq -16(%edi), %xmm0
3061 movq %xmm0, -16(%edx)
3062 movq -24(%edi), %xmm0
3063 movq %xmm0, -24(%edx)
3064 movq -32(%edi), %xmm0
3065 movq %xmm0, -32(%edx)
3069 L(bk_write_less32bytes):
3074 L(bk_write_less32bytes_2):
/* Finish the remaining bytes through the backward tail table, indexed
   by %ecx.  */
3075 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3082 jbe L(bk_write_less32bytes)
3084 /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
3085 then (EDX & 2) must be != 0. */
/* Backward copy of more than 64 bytes.  Control reaches
   L(bk_ssse3_cpy_pre) from each of the alignment tests below.  */
3105 L(bk_write_more64bytes):
3106 /* Check alignment of last byte. */
3108 jz L(bk_ssse3_cpy_pre)
3110 /* EDX is aligned 4 bytes, but not 16 bytes. */
3119 jz L(bk_ssse3_cpy_pre)
3128 jz L(bk_ssse3_cpy_pre)
3136 L(bk_ssse3_cpy_pre):
3138 jb L(bk_write_more32bytes)
/* Copy 64 bytes, highest 16-byte block first: unaligned loads from
   the source at %edi, aligned stores (movdqa) to the destination at
   %edx, which therefore must be 16-byte aligned here.  */
3145 movdqu 0x30(%edi), %xmm3
3146 movdqa %xmm3, 0x30(%edx)
3147 movdqu 0x20(%edi), %xmm2
3148 movdqa %xmm2, 0x20(%edx)
3149 movdqu 0x10(%edi), %xmm1
3150 movdqa %xmm1, 0x10(%edx)
3151 movdqu (%edi), %xmm0
3152 movdqa %xmm0, (%edx)
/* Hand over to the code that handles fewer than 64 bytes.  */
3155 jmp L(bk_write_64bytesless)