2 Copyright (C) 2010-2014 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
27 #include "asm-syntax.h"
/* Map the generic names MEMCPY and MEMCPY_CHK onto the SSSE3-specific
   symbols.  */
30 # define MEMCPY __memcpy_ssse3
31 # define MEMCPY_CHK __memcpy_chk_ssse3
/* JMPTBL (I, B) expands to the difference I - B.  Presumably I is a
   target label and B the base of the table that holds the entry
   (confirm at the table definitions, which are not visible here).  */
34 #define JMPTBL(I, B) I - B
36 /* Branch to an entry in a jump table.  TABLE is a jump table of
37 32-bit offsets relative to TABLE itself.  INDEX is a register that
38 contains the index into the jump table.  SCALE is the scale of INDEX.
   The visible steps load the address of TABLE into %r11 (which is
   therefore overwritten), sign-extend the selected 32-bit entry into
   INDEX, and add the table address so that INDEX holds the target
   address.  The remainder of the macro, after the last trailing
   backslash, is not visible in this excerpt.  */
39 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
40 lea TABLE(%rip), %r11; \
41 movslq (%r11, INDEX, SCALE), INDEX; \
42 lea (%r11, INDEX), INDEX; \
46 .section .text.ssse3,"ax",@progbits
47 #if !defined USE_AS_BCOPY
/* Taken on an unsigned "below" result.  The comparison that sets the
   flags is not visible in this excerpt.  */
50 jb HIDDEN_JUMPTARGET (__chk_fail)
/* Put the address of L(table_less_80bytes) in %r11 and the sign-extended
   32-bit entry selected by %rdx (scaled by 4) in %r9.  */
70 lea L(table_less_80bytes)(%rip), %r11
72 movslq (%r11, %rdx, 4), %r9
81 #ifndef USE_AS_MEMMOVE
/* Load %RCX_LP from the build-time constant SHARED_CACHE_SIZE_HALF when
   it is defined.  The second mov presumably belongs to the else arm,
   whose directive lines are not visible here.  */
95 #ifdef SHARED_CACHE_SIZE_HALF
96 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
98 mov __x86_shared_cache_size_half(%rip), %RCX_LP
/* Same selection using the data-cache symbols.  */
105 #ifdef DATA_CACHE_SIZE_HALF
106 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
108 mov __x86_data_cache_size_half(%rip), %RCX_LP
/* Dispatch through L(shl_table) with %r9 as the index.  */
110 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
/* Unaligned load of the 16 bytes that end at %rsi + %rdx, and the
   address of the 16 bytes that end at %rdi + %rdx in %r8.  */
114 movdqu -16(%rsi, %rdx), %xmm0
116 lea -16(%rdi, %rdx), %r8
/* Same cache-size selection as above, repeated before the dispatch
   through the backward table.  */
125 #ifdef SHARED_CACHE_SIZE_HALF
126 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
128 mov __x86_shared_cache_size_half(%rip), %RCX_LP
136 #ifdef DATA_CACHE_SIZE_HALF
137 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
139 mov __x86_data_cache_size_half(%rip), %RCX_LP
/* Dispatch through L(shl_table_bwd) with %r9 as the index.  */
141 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
/* Copy path that uses aligned 16-byte moves on both %rsi and %rdi
   (movaps and movdqa fault on unaligned addresses).  Labels, pointer and
   count updates and some loads and stores are missing from this
   excerpt, so the flag-setting instructions for the branches below are
   not shown.  */
154 jb L(shl_0_less_64bytes)
/* Copy the 16-byte chunks at offsets 16, 32 and 48.  */
156 movaps 16(%rsi), %xmm1
157 movaps 32(%rsi), %xmm2
158 movaps 48(%rsi), %xmm3
160 movaps %xmm1, 16(%rdi)
161 movaps %xmm2, 32(%rdi)
162 movaps %xmm3, 48(%rdi)
166 L(shl_0_less_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes).  */
169 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Compare %RDX_LP with the data-cache threshold (constant or variable,
   depending on DATA_CACHE_SIZE_HALF) and take the mem loop when it is
   not below, assuming no flag-changing instruction sits in the missing
   lines between the compare and the branch.  */
173 #ifdef DATA_CACHE_SIZE_HALF
174 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
176 cmp __x86_data_cache_size_half(%rip), %RDX_LP
179 jae L(shl_0_gobble_mem_loop)
180 L(shl_0_gobble_cache_loop):
/* Visible part of the loop body: chunks at offsets 0x10 to 0x70 are
   loaded and stored in two groups.  */
182 movaps 0x10(%rsi), %xmm1
183 movaps 0x20(%rsi), %xmm2
184 movaps 0x30(%rsi), %xmm3
187 movaps %xmm1, 0x10(%rdi)
188 movaps %xmm2, 0x20(%rdi)
189 movaps %xmm3, 0x30(%rdi)
192 movaps 0x40(%rsi), %xmm4
193 movaps 0x50(%rsi), %xmm5
194 movaps 0x60(%rsi), %xmm6
195 movaps 0x70(%rsi), %xmm7
197 movaps %xmm4, 0x40(%rdi)
198 movaps %xmm5, 0x50(%rdi)
199 movaps %xmm6, 0x60(%rdi)
200 movaps %xmm7, 0x70(%rdi)
/* Loop back on an unsigned "not below" result.  */
203 jae L(shl_0_gobble_cache_loop)
/* Signed "less" skips the extra chunk copies that follow.  */
206 jl L(shl_0_cache_less_64bytes)
210 movdqa 0x10(%rsi), %xmm1
213 movdqa %xmm1, 0x10(%rdi)
215 movdqa 0x20(%rsi), %xmm4
216 movdqa 0x30(%rsi), %xmm1
219 movdqa %xmm4, 0x20(%rdi)
220 movdqa %xmm1, 0x30(%rdi)
222 L(shl_0_cache_less_64bytes):
225 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
228 L(shl_0_gobble_mem_loop):
/* Same loop shape as the cache loop, with two prefetcht0 hints at 0x1c0
   and 0x280 bytes above %rsi and movdqa for every load and store.  */
229 prefetcht0 0x1c0(%rsi)
230 prefetcht0 0x280(%rsi)
233 movdqa 0x10(%rsi), %xmm1
234 movdqa 0x20(%rsi), %xmm2
235 movdqa 0x30(%rsi), %xmm3
236 movdqa 0x40(%rsi), %xmm4
237 movdqa 0x50(%rsi), %xmm5
238 movdqa 0x60(%rsi), %xmm6
239 movdqa 0x70(%rsi), %xmm7
243 movdqa %xmm1, 0x10(%rdi)
244 movdqa %xmm2, 0x20(%rdi)
245 movdqa %xmm3, 0x30(%rdi)
246 movdqa %xmm4, 0x40(%rdi)
247 movdqa %xmm5, 0x50(%rdi)
248 movdqa %xmm6, 0x60(%rdi)
249 movdqa %xmm7, 0x70(%rdi)
252 jae L(shl_0_gobble_mem_loop)
255 jl L(shl_0_mem_less_64bytes)
259 movdqa 0x10(%rsi), %xmm1
262 movdqa %xmm1, 0x10(%rdi)
264 movdqa 0x20(%rsi), %xmm0
265 movdqa 0x30(%rsi), %xmm1
268 movdqa %xmm0, 0x20(%rdi)
269 movdqa %xmm1, 0x30(%rdi)
271 L(shl_0_mem_less_64bytes):
/* One more optional chunk copy, skipped on an unsigned "below"
   result.  */
273 jb L(shl_0_mem_less_32bytes)
276 movdqa 0x10(%rsi), %xmm1
279 movdqa %xmm1, 0x10(%rdi)
281 L(shl_0_mem_less_32bytes):
284 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart of the aligned copy path: every access
   uses negative offsets from %rsi and %rdi with aligned moves.  Labels,
   count updates and the flag-setting instructions for the branches are
   missing from this excerpt.  */
289 movdqa -0x10(%rsi), %xmm1
291 movdqa %xmm1, -0x10(%rdi)
/* Unsigned "above" goes to the large loop, "below" to the short tail.  */
295 ja L(shl_0_gobble_bwd)
297 jb L(shl_0_less_64bytes_bwd)
/* Copy the four 16-byte chunks just below the pointers.  */
298 movaps -0x10(%rsi), %xmm0
299 movaps -0x20(%rsi), %xmm1
300 movaps -0x30(%rsi), %xmm2
301 movaps -0x40(%rsi), %xmm3
302 movaps %xmm0, -0x10(%rdi)
303 movaps %xmm1, -0x20(%rdi)
304 movaps %xmm2, -0x30(%rdi)
305 movaps %xmm3, -0x40(%rdi)
309 L(shl_0_less_64bytes_bwd):
/* Dispatch on %rdx through L(table_less_80bytes).  */
310 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Compare %RDX_LP with the data-cache threshold and take the mem loop
   when it is not below, assuming no flag-changing instruction sits in
   the missing lines between the compare and the branch.  */
314 #ifdef DATA_CACHE_SIZE_HALF
315 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
317 cmp __x86_data_cache_size_half(%rip), %RDX_LP
320 jae L(shl_0_gobble_mem_bwd_loop)
321 L(shl_0_gobble_bwd_loop):
/* 0x80 bytes per iteration: eight chunks below the pointers, then both
   pointers drop by 0x80 (lea leaves the flags untouched).  */
322 movdqa -0x10(%rsi), %xmm0
323 movaps -0x20(%rsi), %xmm1
324 movaps -0x30(%rsi), %xmm2
325 movaps -0x40(%rsi), %xmm3
327 movdqa %xmm0, -0x10(%rdi)
328 movaps %xmm1, -0x20(%rdi)
329 movaps %xmm2, -0x30(%rdi)
330 movaps %xmm3, -0x40(%rdi)
333 movaps -0x50(%rsi), %xmm4
334 movaps -0x60(%rsi), %xmm5
335 movaps -0x70(%rsi), %xmm6
336 movaps -0x80(%rsi), %xmm7
337 lea -0x80(%rsi), %rsi
338 movaps %xmm4, -0x50(%rdi)
339 movaps %xmm5, -0x60(%rdi)
340 movaps %xmm6, -0x70(%rdi)
341 movaps %xmm7, -0x80(%rdi)
342 lea -0x80(%rdi), %rdi
344 jae L(shl_0_gobble_bwd_loop)
/* Signed "less" skips the extra 0x40-byte copy that follows.  */
347 jl L(shl_0_gobble_bwd_less_64bytes)
349 movdqa -0x10(%rsi), %xmm0
351 movdqa -0x20(%rsi), %xmm1
353 movdqa %xmm0, -0x10(%rdi)
354 movdqa %xmm1, -0x20(%rdi)
356 movdqa -0x30(%rsi), %xmm0
357 movdqa -0x40(%rsi), %xmm1
360 movdqa %xmm0, -0x30(%rdi)
361 movdqa %xmm1, -0x40(%rdi)
363 L(shl_0_gobble_bwd_less_64bytes):
364 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
367 L(shl_0_gobble_mem_bwd_loop):
/* Same loop with prefetcht0 hints 0x1c0 and 0x280 bytes below %rsi.  */
368 prefetcht0 -0x1c0(%rsi)
369 prefetcht0 -0x280(%rsi)
370 movdqa -0x10(%rsi), %xmm0
371 movdqa -0x20(%rsi), %xmm1
372 movdqa -0x30(%rsi), %xmm2
373 movdqa -0x40(%rsi), %xmm3
374 movdqa -0x50(%rsi), %xmm4
375 movdqa -0x60(%rsi), %xmm5
376 movdqa -0x70(%rsi), %xmm6
377 movdqa -0x80(%rsi), %xmm7
378 lea -0x80(%rsi), %rsi
380 movdqa %xmm0, -0x10(%rdi)
381 movdqa %xmm1, -0x20(%rdi)
382 movdqa %xmm2, -0x30(%rdi)
383 movdqa %xmm3, -0x40(%rdi)
384 movdqa %xmm4, -0x50(%rdi)
385 movdqa %xmm5, -0x60(%rdi)
386 movdqa %xmm6, -0x70(%rdi)
387 movdqa %xmm7, -0x80(%rdi)
388 lea -0x80(%rdi), %rdi
390 jae L(shl_0_gobble_mem_bwd_loop)
393 jl L(shl_0_mem_bwd_less_64bytes)
395 movdqa -0x10(%rsi), %xmm0
397 movdqa -0x20(%rsi), %xmm1
399 movdqa %xmm0, -0x10(%rdi)
400 movdqa %xmm1, -0x20(%rdi)
402 movdqa -0x30(%rsi), %xmm0
403 movdqa -0x40(%rsi), %xmm1
406 movdqa %xmm0, -0x30(%rdi)
407 movdqa %xmm1, -0x40(%rdi)
409 L(shl_0_mem_bwd_less_64bytes):
/* Optional extra 0x20-byte copy, skipped on an unsigned "below"
   result.  */
411 jb L(shl_0_mem_bwd_less_32bytes)
412 movdqa -0x10(%rsi), %xmm0
414 movdqa -0x20(%rsi), %xmm1
416 movdqa %xmm0, -0x10(%rdi)
417 movdqa %xmm1, -0x20(%rdi)
419 L(shl_0_mem_bwd_less_32bytes):
420 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 1 byte past 16-byte alignment (the movaps
   loads below need %rsi - 1 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
424 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
/* Aligned chunk that starts 1 byte before %rsi.  */
426 movaps -0x01(%rsi), %xmm1
428 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
434 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
437 movaps 0x0f(%rsi), %xmm2
438 movaps 0x1f(%rsi), %xmm3
439 movaps 0x2f(%rsi), %xmm4
440 movaps 0x3f(%rsi), %xmm5
/* palignr $1 keeps the low 16 bytes of (destination:source) shifted
   right by 1 byte, i.e. the chunk realigned to the source position.  */
442 palignr $1, %xmm4, %xmm5
444 palignr $1, %xmm3, %xmm4
445 palignr $1, %xmm2, %xmm3
447 palignr $1, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
449 movdqa %xmm2, -0x40(%rdi)
450 movaps %xmm3, -0x30(%rdi)
452 movaps %xmm4, -0x20(%rdi)
453 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
457 movaps %xmm4, -0x20(%rdi)
459 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
463 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 1 byte past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
467 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
/* Aligned chunk that starts 1 byte before %rsi.  */
469 movaps -0x01(%rsi), %xmm1
471 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
476 L(shl_1_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
477 prefetchnta -0x1c0(%rsi)
478 L(shl_1_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
479 movaps -0x11(%rsi), %xmm2
481 movaps -0x21(%rsi), %xmm3
482 movaps -0x31(%rsi), %xmm4
483 movaps -0x41(%rsi), %xmm5
484 lea -0x40(%rsi), %rsi
/* palignr $1 keeps the low 16 bytes of (destination:source) shifted
   right by 1 byte.  */
485 palignr $1, %xmm2, %xmm1
486 palignr $1, %xmm3, %xmm2
487 palignr $1, %xmm4, %xmm3
488 palignr $1, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
490 movaps %xmm1, -0x10(%rdi)
493 movaps %xmm2, -0x20(%rdi)
494 lea -0x40(%rdi), %rdi
496 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
505 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 2 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 2 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
509 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
/* Aligned chunk that starts 2 bytes before %rsi.  */
511 movaps -0x02(%rsi), %xmm1
513 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
519 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
522 movaps 0x0e(%rsi), %xmm2
523 movaps 0x1e(%rsi), %xmm3
524 movaps 0x2e(%rsi), %xmm4
525 movaps 0x3e(%rsi), %xmm5
/* palignr $2 keeps the low 16 bytes of (destination:source) shifted
   right by 2 bytes, i.e. the chunk realigned to the source position.  */
527 palignr $2, %xmm4, %xmm5
529 palignr $2, %xmm3, %xmm4
530 palignr $2, %xmm2, %xmm3
532 palignr $2, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
534 movdqa %xmm2, -0x40(%rdi)
535 movaps %xmm3, -0x30(%rdi)
537 movaps %xmm4, -0x20(%rdi)
538 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
542 movaps %xmm4, -0x20(%rdi)
544 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
548 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 2 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
552 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
/* Aligned chunk that starts 2 bytes before %rsi.  */
554 movaps -0x02(%rsi), %xmm1
556 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
561 L(shl_2_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
562 prefetchnta -0x1c0(%rsi)
563 L(shl_2_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
564 movaps -0x12(%rsi), %xmm2
566 movaps -0x22(%rsi), %xmm3
567 movaps -0x32(%rsi), %xmm4
568 movaps -0x42(%rsi), %xmm5
569 lea -0x40(%rsi), %rsi
/* palignr $2 keeps the low 16 bytes of (destination:source) shifted
   right by 2 bytes.  */
570 palignr $2, %xmm2, %xmm1
571 palignr $2, %xmm3, %xmm2
572 palignr $2, %xmm4, %xmm3
573 palignr $2, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
575 movaps %xmm1, -0x10(%rdi)
578 movaps %xmm2, -0x20(%rdi)
579 lea -0x40(%rdi), %rdi
581 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
590 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 3 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 3 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
594 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
/* Aligned chunk that starts 3 bytes before %rsi.  */
596 movaps -0x03(%rsi), %xmm1
598 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
604 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
607 movaps 0x0d(%rsi), %xmm2
608 movaps 0x1d(%rsi), %xmm3
609 movaps 0x2d(%rsi), %xmm4
610 movaps 0x3d(%rsi), %xmm5
/* palignr $3 keeps the low 16 bytes of (destination:source) shifted
   right by 3 bytes, i.e. the chunk realigned to the source position.  */
612 palignr $3, %xmm4, %xmm5
614 palignr $3, %xmm3, %xmm4
615 palignr $3, %xmm2, %xmm3
617 palignr $3, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
619 movdqa %xmm2, -0x40(%rdi)
620 movaps %xmm3, -0x30(%rdi)
622 movaps %xmm4, -0x20(%rdi)
623 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
627 movaps %xmm4, -0x20(%rdi)
629 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
633 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 3 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
637 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
/* Aligned chunk that starts 3 bytes before %rsi.  */
639 movaps -0x03(%rsi), %xmm1
641 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
646 L(shl_3_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
647 prefetchnta -0x1c0(%rsi)
648 L(shl_3_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
649 movaps -0x13(%rsi), %xmm2
651 movaps -0x23(%rsi), %xmm3
652 movaps -0x33(%rsi), %xmm4
653 movaps -0x43(%rsi), %xmm5
654 lea -0x40(%rsi), %rsi
/* palignr $3 keeps the low 16 bytes of (destination:source) shifted
   right by 3 bytes.  */
655 palignr $3, %xmm2, %xmm1
656 palignr $3, %xmm3, %xmm2
657 palignr $3, %xmm4, %xmm3
658 palignr $3, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
660 movaps %xmm1, -0x10(%rdi)
663 movaps %xmm2, -0x20(%rdi)
664 lea -0x40(%rdi), %rdi
666 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
675 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 4 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 4 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
679 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
/* Aligned chunk that starts 4 bytes before %rsi.  */
681 movaps -0x04(%rsi), %xmm1
683 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
689 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
692 movaps 0x0c(%rsi), %xmm2
693 movaps 0x1c(%rsi), %xmm3
694 movaps 0x2c(%rsi), %xmm4
695 movaps 0x3c(%rsi), %xmm5
/* palignr $4 keeps the low 16 bytes of (destination:source) shifted
   right by 4 bytes, i.e. the chunk realigned to the source position.  */
697 palignr $4, %xmm4, %xmm5
699 palignr $4, %xmm3, %xmm4
700 palignr $4, %xmm2, %xmm3
702 palignr $4, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
704 movdqa %xmm2, -0x40(%rdi)
705 movaps %xmm3, -0x30(%rdi)
707 movaps %xmm4, -0x20(%rdi)
708 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
712 movaps %xmm4, -0x20(%rdi)
714 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
718 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 4 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
722 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
/* Aligned chunk that starts 4 bytes before %rsi.  */
724 movaps -0x04(%rsi), %xmm1
726 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
731 L(shl_4_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
732 prefetchnta -0x1c0(%rsi)
733 L(shl_4_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
734 movaps -0x14(%rsi), %xmm2
736 movaps -0x24(%rsi), %xmm3
737 movaps -0x34(%rsi), %xmm4
738 movaps -0x44(%rsi), %xmm5
739 lea -0x40(%rsi), %rsi
/* palignr $4 keeps the low 16 bytes of (destination:source) shifted
   right by 4 bytes.  */
740 palignr $4, %xmm2, %xmm1
741 palignr $4, %xmm3, %xmm2
742 palignr $4, %xmm4, %xmm3
743 palignr $4, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
745 movaps %xmm1, -0x10(%rdi)
748 movaps %xmm2, -0x20(%rdi)
749 lea -0x40(%rdi), %rdi
751 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
760 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 5 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 5 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
764 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
/* Aligned chunk that starts 5 bytes before %rsi.  */
766 movaps -0x05(%rsi), %xmm1
768 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
774 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
777 movaps 0x0b(%rsi), %xmm2
778 movaps 0x1b(%rsi), %xmm3
779 movaps 0x2b(%rsi), %xmm4
780 movaps 0x3b(%rsi), %xmm5
/* palignr $5 keeps the low 16 bytes of (destination:source) shifted
   right by 5 bytes, i.e. the chunk realigned to the source position.  */
782 palignr $5, %xmm4, %xmm5
784 palignr $5, %xmm3, %xmm4
785 palignr $5, %xmm2, %xmm3
787 palignr $5, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
789 movdqa %xmm2, -0x40(%rdi)
790 movaps %xmm3, -0x30(%rdi)
792 movaps %xmm4, -0x20(%rdi)
793 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
797 movaps %xmm4, -0x20(%rdi)
799 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
803 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 5 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
807 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
/* Aligned chunk that starts 5 bytes before %rsi.  */
809 movaps -0x05(%rsi), %xmm1
811 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
816 L(shl_5_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
817 prefetchnta -0x1c0(%rsi)
818 L(shl_5_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
819 movaps -0x15(%rsi), %xmm2
821 movaps -0x25(%rsi), %xmm3
822 movaps -0x35(%rsi), %xmm4
823 movaps -0x45(%rsi), %xmm5
824 lea -0x40(%rsi), %rsi
/* palignr $5 keeps the low 16 bytes of (destination:source) shifted
   right by 5 bytes.  */
825 palignr $5, %xmm2, %xmm1
826 palignr $5, %xmm3, %xmm2
827 palignr $5, %xmm4, %xmm3
828 palignr $5, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
830 movaps %xmm1, -0x10(%rdi)
833 movaps %xmm2, -0x20(%rdi)
834 lea -0x40(%rdi), %rdi
836 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
845 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 6 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 6 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
849 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
/* Aligned chunk that starts 6 bytes before %rsi.  */
851 movaps -0x06(%rsi), %xmm1
853 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
859 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
862 movaps 0x0a(%rsi), %xmm2
863 movaps 0x1a(%rsi), %xmm3
864 movaps 0x2a(%rsi), %xmm4
865 movaps 0x3a(%rsi), %xmm5
/* palignr $6 keeps the low 16 bytes of (destination:source) shifted
   right by 6 bytes, i.e. the chunk realigned to the source position.  */
867 palignr $6, %xmm4, %xmm5
869 palignr $6, %xmm3, %xmm4
870 palignr $6, %xmm2, %xmm3
872 palignr $6, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
874 movdqa %xmm2, -0x40(%rdi)
875 movaps %xmm3, -0x30(%rdi)
877 movaps %xmm4, -0x20(%rdi)
878 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
882 movaps %xmm4, -0x20(%rdi)
884 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
888 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 6 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
892 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
/* Aligned chunk that starts 6 bytes before %rsi.  */
894 movaps -0x06(%rsi), %xmm1
896 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
901 L(shl_6_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
902 prefetchnta -0x1c0(%rsi)
903 L(shl_6_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
904 movaps -0x16(%rsi), %xmm2
906 movaps -0x26(%rsi), %xmm3
907 movaps -0x36(%rsi), %xmm4
908 movaps -0x46(%rsi), %xmm5
909 lea -0x40(%rsi), %rsi
/* palignr $6 keeps the low 16 bytes of (destination:source) shifted
   right by 6 bytes.  */
910 palignr $6, %xmm2, %xmm1
911 palignr $6, %xmm3, %xmm2
912 palignr $6, %xmm4, %xmm3
913 palignr $6, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offset 0x10 addresses what
   was -0x30 before the update.  */
915 movaps %xmm1, -0x10(%rdi)
918 movaps %xmm2, -0x20(%rdi)
919 lea -0x40(%rdi), %rdi
921 movaps %xmm3, 0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
930 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 7 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 7 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
934 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
/* Aligned chunk that starts 7 bytes before %rsi.  */
936 movaps -0x07(%rsi), %xmm1
938 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
944 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
947 movaps 0x09(%rsi), %xmm2
948 movaps 0x19(%rsi), %xmm3
949 movaps 0x29(%rsi), %xmm4
950 movaps 0x39(%rsi), %xmm5
/* palignr $7 keeps the low 16 bytes of (destination:source) shifted
   right by 7 bytes, i.e. the chunk realigned to the source position.  */
952 palignr $7, %xmm4, %xmm5
954 palignr $7, %xmm3, %xmm4
955 palignr $7, %xmm2, %xmm3
957 palignr $7, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
959 movdqa %xmm2, -0x40(%rdi)
960 movaps %xmm3, -0x30(%rdi)
962 movaps %xmm4, -0x20(%rdi)
963 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
967 movaps %xmm4, -0x20(%rdi)
969 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
973 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 7 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
977 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
/* Aligned chunk that starts 7 bytes before %rsi.  */
979 movaps -0x07(%rsi), %xmm1
981 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
986 L(shl_7_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
987 prefetchnta -0x1c0(%rsi)
988 L(shl_7_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
989 movaps -0x17(%rsi), %xmm2
991 movaps -0x27(%rsi), %xmm3
992 movaps -0x37(%rsi), %xmm4
993 movaps -0x47(%rsi), %xmm5
994 lea -0x40(%rsi), %rsi
/* palignr $7 keeps the low 16 bytes of (destination:source) shifted
   right by 7 bytes.  */
995 palignr $7, %xmm2, %xmm1
996 palignr $7, %xmm3, %xmm2
997 palignr $7, %xmm4, %xmm3
998 palignr $7, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1000 movaps %xmm1, -0x10(%rdi)
1003 movaps %xmm2, -0x20(%rdi)
1004 lea -0x40(%rdi), %rdi
1006 movaps %xmm3, 0x10(%rdi)
1008 movaps %xmm4, (%rdi)
/* Same store again.  The gap in the embedded line numbers suggests it
   sits on a separate exit path (confirm in full file).  */
1012 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1015 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 8 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 8 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
1019 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
/* Aligned chunk that starts 8 bytes before %rsi.  */
1021 movaps -0x08(%rsi), %xmm1
1023 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1028 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1031 movaps 0x08(%rsi), %xmm2
1032 movaps 0x18(%rsi), %xmm3
1033 movaps 0x28(%rsi), %xmm4
1034 movaps 0x38(%rsi), %xmm5
/* palignr $8 keeps the low 16 bytes of (destination:source) shifted
   right by 8 bytes, i.e. the chunk realigned to the source position.  */
1036 palignr $8, %xmm4, %xmm5
1038 palignr $8, %xmm3, %xmm4
1039 palignr $8, %xmm2, %xmm3
1041 palignr $8, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1043 movdqa %xmm2, -0x40(%rdi)
1044 movaps %xmm3, -0x30(%rdi)
1046 movaps %xmm4, -0x20(%rdi)
1047 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1053 movaps %xmm4, -0x20(%rdi)
1055 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1058 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 8 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1062 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
/* Aligned chunk that starts 8 bytes before %rsi.  */
1064 movaps -0x08(%rsi), %xmm1
1066 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1071 L(shl_8_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1072 prefetchnta -0x1c0(%rsi)
1073 L(shl_8_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1074 movaps -0x18(%rsi), %xmm2
1076 movaps -0x28(%rsi), %xmm3
1077 movaps -0x38(%rsi), %xmm4
1078 movaps -0x48(%rsi), %xmm5
1079 lea -0x40(%rsi), %rsi
/* palignr $8 keeps the low 16 bytes of (destination:source) shifted
   right by 8 bytes.  */
1080 palignr $8, %xmm2, %xmm1
1081 palignr $8, %xmm3, %xmm2
1082 palignr $8, %xmm4, %xmm3
1083 palignr $8, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1085 movaps %xmm1, -0x10(%rdi)
1088 movaps %xmm2, -0x20(%rdi)
1089 lea -0x40(%rdi), %rdi
1091 movaps %xmm3, 0x10(%rdi)
1093 movaps %xmm4, (%rdi)
/* Same store again.  The gap in the embedded line numbers suggests it
   sits on a separate exit path (confirm in full file).  */
1097 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1100 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 9 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 9 aligned).  Labels, pointer and count updates
   and branches are missing from this excerpt.  Both lea instructions add
   a label-to-label distance to %r9.  */
1104 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
/* Aligned chunk that starts 9 bytes before %rsi.  */
1106 movaps -0x09(%rsi), %xmm1
1108 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1114 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1117 movaps 0x07(%rsi), %xmm2
1118 movaps 0x17(%rsi), %xmm3
1119 movaps 0x27(%rsi), %xmm4
1120 movaps 0x37(%rsi), %xmm5
/* palignr $9 keeps the low 16 bytes of (destination:source) shifted
   right by 9 bytes, i.e. the chunk realigned to the source position.  */
1122 palignr $9, %xmm4, %xmm5
1124 palignr $9, %xmm3, %xmm4
1125 palignr $9, %xmm2, %xmm3
1127 palignr $9, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1129 movdqa %xmm2, -0x40(%rdi)
1130 movaps %xmm3, -0x30(%rdi)
1132 movaps %xmm4, -0x20(%rdi)
1133 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1137 movaps %xmm4, -0x20(%rdi)
1139 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1143 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 9 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1147 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
/* Aligned chunk that starts 9 bytes before %rsi.  */
1149 movaps -0x09(%rsi), %xmm1
1151 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1156 L(shl_9_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1157 prefetchnta -0x1c0(%rsi)
1158 L(shl_9_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1159 movaps -0x19(%rsi), %xmm2
1161 movaps -0x29(%rsi), %xmm3
1162 movaps -0x39(%rsi), %xmm4
1163 movaps -0x49(%rsi), %xmm5
1164 lea -0x40(%rsi), %rsi
/* palignr $9 keeps the low 16 bytes of (destination:source) shifted
   right by 9 bytes.  */
1165 palignr $9, %xmm2, %xmm1
1166 palignr $9, %xmm3, %xmm2
1167 palignr $9, %xmm4, %xmm3
1168 palignr $9, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1170 movaps %xmm1, -0x10(%rdi)
1173 movaps %xmm2, -0x20(%rdi)
1174 lea -0x40(%rdi), %rdi
1176 movaps %xmm3, 0x10(%rdi)
1178 movaps %xmm4, (%rdi)
/* Same store again.  The gap in the embedded line numbers suggests it
   sits on a separate exit path (confirm in full file).  */
1182 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1185 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 10 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 10 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1189 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
/* Aligned chunk that starts 10 bytes before %rsi.  */
1191 movaps -0x0a(%rsi), %xmm1
1193 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1199 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1202 movaps 0x06(%rsi), %xmm2
1203 movaps 0x16(%rsi), %xmm3
1204 movaps 0x26(%rsi), %xmm4
1205 movaps 0x36(%rsi), %xmm5
/* palignr $10 keeps the low 16 bytes of (destination:source) shifted
   right by 10 bytes, i.e. the chunk realigned to the source position.  */
1207 palignr $10, %xmm4, %xmm5
1209 palignr $10, %xmm3, %xmm4
1210 palignr $10, %xmm2, %xmm3
1212 palignr $10, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1214 movdqa %xmm2, -0x40(%rdi)
1215 movaps %xmm3, -0x30(%rdi)
1217 movaps %xmm4, -0x20(%rdi)
1218 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1222 movaps %xmm4, -0x20(%rdi)
1224 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1228 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 10 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1232 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
/* Aligned chunk that starts 10 bytes before %rsi.  */
1234 movaps -0x0a(%rsi), %xmm1
1236 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1241 L(shl_10_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1242 prefetchnta -0x1c0(%rsi)
1243 L(shl_10_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1244 movaps -0x1a(%rsi), %xmm2
1246 movaps -0x2a(%rsi), %xmm3
1247 movaps -0x3a(%rsi), %xmm4
1248 movaps -0x4a(%rsi), %xmm5
1249 lea -0x40(%rsi), %rsi
/* palignr $10 keeps the low 16 bytes of (destination:source) shifted
   right by 10 bytes.  */
1250 palignr $10, %xmm2, %xmm1
1251 palignr $10, %xmm3, %xmm2
1252 palignr $10, %xmm4, %xmm3
1253 palignr $10, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1255 movaps %xmm1, -0x10(%rdi)
1258 movaps %xmm2, -0x20(%rdi)
1259 lea -0x40(%rdi), %rdi
1261 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_10_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1262 jb L(shl_10_bwd_end)
1263 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1267 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1270 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 11 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 11 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1274 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
/* Aligned chunk that starts 11 bytes before %rsi.  */
1276 movaps -0x0b(%rsi), %xmm1
1278 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1284 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1287 movaps 0x05(%rsi), %xmm2
1288 movaps 0x15(%rsi), %xmm3
1289 movaps 0x25(%rsi), %xmm4
1290 movaps 0x35(%rsi), %xmm5
/* palignr $11 keeps the low 16 bytes of (destination:source) shifted
   right by 11 bytes, i.e. the chunk realigned to the source position.  */
1292 palignr $11, %xmm4, %xmm5
1294 palignr $11, %xmm3, %xmm4
1295 palignr $11, %xmm2, %xmm3
1297 palignr $11, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1299 movdqa %xmm2, -0x40(%rdi)
1300 movaps %xmm3, -0x30(%rdi)
1302 movaps %xmm4, -0x20(%rdi)
1303 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1307 movaps %xmm4, -0x20(%rdi)
1309 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1313 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 11 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1317 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
/* Aligned chunk that starts 11 bytes before %rsi.  */
1319 movaps -0x0b(%rsi), %xmm1
1321 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
1326 L(shl_11_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1327 prefetchnta -0x1c0(%rsi)
1328 L(shl_11_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1329 movaps -0x1b(%rsi), %xmm2
1331 movaps -0x2b(%rsi), %xmm3
1332 movaps -0x3b(%rsi), %xmm4
1333 movaps -0x4b(%rsi), %xmm5
1334 lea -0x40(%rsi), %rsi
/* palignr $11 keeps the low 16 bytes of (destination:source) shifted
   right by 11 bytes.  */
1335 palignr $11, %xmm2, %xmm1
1336 palignr $11, %xmm3, %xmm2
1337 palignr $11, %xmm4, %xmm3
1338 palignr $11, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1340 movaps %xmm1, -0x10(%rdi)
1343 movaps %xmm2, -0x20(%rdi)
1344 lea -0x40(%rdi), %rdi
1346 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_11_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1347 jb L(shl_11_bwd_end)
1348 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1352 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1355 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 12 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 12 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1359 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
/* Aligned chunk that starts 12 bytes before %rsi.  */
1361 movaps -0x0c(%rsi), %xmm1
1363 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1369 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1372 movaps 0x04(%rsi), %xmm2
1373 movaps 0x14(%rsi), %xmm3
1374 movaps 0x24(%rsi), %xmm4
1375 movaps 0x34(%rsi), %xmm5
/* palignr $12 keeps the low 16 bytes of (destination:source) shifted
   right by 12 bytes, i.e. the chunk realigned to the source position.  */
1377 palignr $12, %xmm4, %xmm5
1379 palignr $12, %xmm3, %xmm4
1380 palignr $12, %xmm2, %xmm3
1382 palignr $12, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1384 movdqa %xmm2, -0x40(%rdi)
1385 movaps %xmm3, -0x30(%rdi)
1387 movaps %xmm4, -0x20(%rdi)
1388 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1392 movaps %xmm4, -0x20(%rdi)
1394 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1398 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 12 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1402 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
/* Aligned chunk that starts 12 bytes before %rsi.  */
1404 movaps -0x0c(%rsi), %xmm1
1406 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
1411 L(shl_12_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1412 prefetchnta -0x1c0(%rsi)
1413 L(shl_12_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1414 movaps -0x1c(%rsi), %xmm2
1416 movaps -0x2c(%rsi), %xmm3
1417 movaps -0x3c(%rsi), %xmm4
1418 movaps -0x4c(%rsi), %xmm5
1419 lea -0x40(%rsi), %rsi
/* palignr $12 keeps the low 16 bytes of (destination:source) shifted
   right by 12 bytes.  */
1420 palignr $12, %xmm2, %xmm1
1421 palignr $12, %xmm3, %xmm2
1422 palignr $12, %xmm4, %xmm3
1423 palignr $12, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1425 movaps %xmm1, -0x10(%rdi)
1428 movaps %xmm2, -0x20(%rdi)
1429 lea -0x40(%rdi), %rdi
1431 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_12_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1432 jb L(shl_12_bwd_end)
1433 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1437 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1440 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 13 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 13 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1444 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
/* Aligned chunk that starts 13 bytes before %rsi.  */
1446 movaps -0x0d(%rsi), %xmm1
1448 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1454 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1457 movaps 0x03(%rsi), %xmm2
1458 movaps 0x13(%rsi), %xmm3
1459 movaps 0x23(%rsi), %xmm4
1460 movaps 0x33(%rsi), %xmm5
/* palignr $13 keeps the low 16 bytes of (destination:source) shifted
   right by 13 bytes, i.e. the chunk realigned to the source position.  */
1462 palignr $13, %xmm4, %xmm5
1464 palignr $13, %xmm3, %xmm4
1465 palignr $13, %xmm2, %xmm3
1467 palignr $13, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1469 movdqa %xmm2, -0x40(%rdi)
1470 movaps %xmm3, -0x30(%rdi)
1472 movaps %xmm4, -0x20(%rdi)
1473 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1477 movaps %xmm4, -0x20(%rdi)
1479 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1483 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 13 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1487 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
/* Aligned chunk that starts 13 bytes before %rsi.  */
1489 movaps -0x0d(%rsi), %xmm1
1491 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
1496 L(shl_13_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1497 prefetchnta -0x1c0(%rsi)
1498 L(shl_13_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1499 movaps -0x1d(%rsi), %xmm2
1501 movaps -0x2d(%rsi), %xmm3
1502 movaps -0x3d(%rsi), %xmm4
1503 movaps -0x4d(%rsi), %xmm5
1504 lea -0x40(%rsi), %rsi
/* palignr $13 keeps the low 16 bytes of (destination:source) shifted
   right by 13 bytes.  */
1505 palignr $13, %xmm2, %xmm1
1506 palignr $13, %xmm3, %xmm2
1507 palignr $13, %xmm4, %xmm3
1508 palignr $13, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1510 movaps %xmm1, -0x10(%rdi)
1513 movaps %xmm2, -0x20(%rdi)
1514 lea -0x40(%rdi), %rdi
1516 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_13_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1517 jb L(shl_13_bwd_end)
1518 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1522 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1525 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 14 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 14 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1529 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
/* Aligned chunk that starts 14 bytes before %rsi.  */
1531 movaps -0x0e(%rsi), %xmm1
1533 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1539 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1542 movaps 0x02(%rsi), %xmm2
1543 movaps 0x12(%rsi), %xmm3
1544 movaps 0x22(%rsi), %xmm4
1545 movaps 0x32(%rsi), %xmm5
/* palignr $14 keeps the low 16 bytes of (destination:source) shifted
   right by 14 bytes, i.e. the chunk realigned to the source position.  */
1547 palignr $14, %xmm4, %xmm5
1549 palignr $14, %xmm3, %xmm4
1550 palignr $14, %xmm2, %xmm3
1552 palignr $14, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1554 movdqa %xmm2, -0x40(%rdi)
1555 movaps %xmm3, -0x30(%rdi)
1557 movaps %xmm4, -0x20(%rdi)
1558 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1562 movaps %xmm4, -0x20(%rdi)
1564 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1568 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 14 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1572 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
/* Aligned chunk that starts 14 bytes before %rsi.  */
1574 movaps -0x0e(%rsi), %xmm1
1576 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
1581 L(shl_14_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1582 prefetchnta -0x1c0(%rsi)
1583 L(shl_14_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1584 movaps -0x1e(%rsi), %xmm2
1586 movaps -0x2e(%rsi), %xmm3
1587 movaps -0x3e(%rsi), %xmm4
1588 movaps -0x4e(%rsi), %xmm5
1589 lea -0x40(%rsi), %rsi
/* palignr $14 keeps the low 16 bytes of (destination:source) shifted
   right by 14 bytes.  */
1590 palignr $14, %xmm2, %xmm1
1591 palignr $14, %xmm3, %xmm2
1592 palignr $14, %xmm4, %xmm3
1593 palignr $14, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1595 movaps %xmm1, -0x10(%rdi)
1598 movaps %xmm2, -0x20(%rdi)
1599 lea -0x40(%rdi), %rdi
1601 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_14_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1602 jb L(shl_14_bwd_end)
1603 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1607 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1610 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward path for a source 15 bytes past 16-byte alignment (the movaps
   loads below need %rsi - 15 aligned).  Labels, pointer and count
   updates and branches are missing from this excerpt.  Both lea
   instructions add a label-to-label distance to %r9.  */
1614 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
/* Aligned chunk that starts 15 bytes before %rsi.  */
1616 movaps -0x0f(%rsi), %xmm1
1618 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
/* Non-temporal prefetch 0x1c0 bytes above %rsi.  */
1624 prefetchnta 0x1c0(%rsi)
/* Next four aligned source chunks.  */
1627 movaps 0x01(%rsi), %xmm2
1628 movaps 0x11(%rsi), %xmm3
1629 movaps 0x21(%rsi), %xmm4
1630 movaps 0x31(%rsi), %xmm5
/* palignr $15 keeps the low 16 bytes of (destination:source) shifted
   right by 15 bytes, i.e. the chunk realigned to the source position.  */
1632 palignr $15, %xmm4, %xmm5
1634 palignr $15, %xmm3, %xmm4
1635 palignr $15, %xmm2, %xmm3
1637 palignr $15, %xmm1, %xmm2
/* Aligned stores of the four realigned chunks below %rdi.  */
1639 movdqa %xmm2, -0x40(%rdi)
1640 movaps %xmm3, -0x30(%rdi)
1642 movaps %xmm4, -0x20(%rdi)
1643 movaps %xmm5, -0x10(%rdi)
/* The last two stores repeat.  The embedded line numbers show a gap, so
   these presumably sit on a separate exit path (confirm in full file).  */
1647 movaps %xmm4, -0x20(%rdi)
1649 movaps %xmm5, -0x10(%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1653 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Descending-address counterpart for a source 15 bytes past 16-byte
   alignment.  Some instructions are missing from this excerpt.  Both lea
   instructions on %r9 add a label-to-label distance to it.  */
1657 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
/* Aligned chunk that starts 15 bytes before %rsi.  */
1659 movaps -0x0f(%rsi), %xmm1
1661 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
1666 L(shl_15_bwd_loop_L2):
/* Non-temporal prefetch 0x1c0 bytes below %rsi.  */
1667 prefetchnta -0x1c0(%rsi)
1668 L(shl_15_bwd_loop_L1):
/* Four aligned chunks below the one held in %xmm1, then step %rsi.  */
1669 movaps -0x1f(%rsi), %xmm2
1671 movaps -0x2f(%rsi), %xmm3
1672 movaps -0x3f(%rsi), %xmm4
1673 movaps -0x4f(%rsi), %xmm5
1674 lea -0x40(%rsi), %rsi
/* palignr $15 keeps the low 16 bytes of (destination:source) shifted
   right by 15 bytes.  */
1675 palignr $15, %xmm2, %xmm1
1676 palignr $15, %xmm3, %xmm2
1677 palignr $15, %xmm4, %xmm3
1678 palignr $15, %xmm5, %xmm4
/* Aligned stores.  After %rdi drops by 0x40, offsets 0x10 and 0 address
   what were -0x30 and -0x40 before the update.  */
1680 movaps %xmm1, -0x10(%rdi)
1683 movaps %xmm2, -0x20(%rdi)
1684 lea -0x40(%rdi), %rdi
1686 movaps %xmm3, 0x10(%rdi)
/* Leave for L(shl_15_bwd_end) on an unsigned "below" result.  None of
   the visible instructions since the loop label modify the flags, so
   they come from an instruction that is not shown.  */
1687 jb L(shl_15_bwd_end)
1688 movaps %xmm4, (%rdi)
/* Same store again, presumably on the exit path (label not visible).  */
1692 movaps %xmm4, (%rdi)
/* Dispatch on %rdx through L(table_less_80bytes).  */
1695 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Fragments of the short-copy handlers, presumably the targets of
   L(table_less_80bytes) (their labels are not visible here).  Each group
   loads 16-byte unaligned chunks at negative offsets from %rsi and stores
   them at the same offsets from %rdi.  The remaining loads and stores of
   each handler are missing from this excerpt.  */
/* Offsets that are multiples of 8.  */
1699 movdqu -72(%rsi), %xmm0
1700 movdqu -56(%rsi), %xmm1
1706 movdqu %xmm0, -72(%rdi)
1707 movdqu %xmm1, -56(%rdi)
1717 movdqu -64(%rsi), %xmm0
1724 movdqu %xmm0, -64(%rdi)
1735 movdqu -56(%rsi), %xmm0
1741 movdqu %xmm0, -56(%rdi)
/* Offsets of the form 8*k + 1.  */
1818 movdqu -73(%rsi), %xmm0
1819 movdqu -57(%rsi), %xmm1
1826 movdqu %xmm0, -73(%rdi)
1827 movdqu %xmm1, -57(%rdi)
1838 movdqu -65(%rsi), %xmm0
1839 movdqu -49(%rsi), %xmm1
1845 movdqu %xmm0, -65(%rdi)
1846 movdqu %xmm1, -49(%rdi)
1856 movdqu -57(%rsi), %xmm0
1863 movdqu %xmm0, -57(%rdi)
1874 movdqu -49(%rsi), %xmm0
1880 movdqu %xmm0, -49(%rdi)
/* Offsets of the form 8*k + 2.  */
1956 movdqu -74(%rsi), %xmm0
1957 movdqu -58(%rsi), %xmm1
1964 movdqu %xmm0, -74(%rdi)
1965 movdqu %xmm1, -58(%rdi)
1976 movdqu -66(%rsi), %xmm0
1977 movdqu -50(%rsi), %xmm1
1984 movdqu %xmm0, -66(%rdi)
1985 movdqu %xmm1, -50(%rdi)
1996 movdqu -58(%rsi), %xmm1
2003 movdqu %xmm1, -58(%rdi)
2014 movdqu -50(%rsi), %xmm0
2020 movdqu %xmm0, -50(%rdi)
/* Offsets of the form 8*k + 3.  */
2096 movdqu -75(%rsi), %xmm0
2097 movdqu -59(%rsi), %xmm1
2104 movdqu %xmm0, -75(%rdi)
2105 movdqu %xmm1, -59(%rdi)
/* NOTE(review): unlike the neighbouring groups (-65/-49, -66/-50, ...),
   the second chunk here is at -59 rather than -51, so it overlaps the
   first by 8 bytes.  That is harmless for a copy provided the lines not
   shown cover the bytes from -43 onward.  Confirm against the full
   handler.  */
2116 movdqu -67(%rsi), %xmm0
2117 movdqu -59(%rsi), %xmm1
2124 movdqu %xmm0, -67(%rdi)
2125 movdqu %xmm1, -59(%rdi)
2136 movdqu -59(%rsi), %xmm0
2143 movdqu %xmm0, -59(%rdi)
2154 movdqu -51(%rsi), %xmm0
2160 movdqu %xmm0, -51(%rdi)
/* Offsets of the form 8*k + 4.  */
2238 movdqu -76(%rsi), %xmm0
2239 movdqu -60(%rsi), %xmm1
2246 movdqu %xmm0, -76(%rdi)
2247 movdqu %xmm1, -60(%rdi)
2258 movdqu -68(%rsi), %xmm0
2259 movdqu -52(%rsi), %xmm1
2265 movdqu %xmm0, -68(%rdi)
2266 movdqu %xmm1, -52(%rdi)
2276 movdqu -60(%rsi), %xmm0
2283 movdqu %xmm0, -60(%rdi)
2294 movdqu -52(%rsi), %xmm0
2300 movdqu %xmm0, -52(%rdi)
/* Offsets of the form 8*k + 5.  */
2376 movdqu -77(%rsi), %xmm0
2377 movdqu -61(%rsi), %xmm1
2384 movdqu %xmm0, -77(%rdi)
2385 movdqu %xmm1, -61(%rdi)
2396 movdqu -69(%rsi), %xmm0
2397 movdqu -53(%rsi), %xmm1
2403 movdqu %xmm0, -69(%rdi)
2404 movdqu %xmm1, -53(%rdi)
2414 movdqu -61(%rsi), %xmm0
2421 movdqu %xmm0, -61(%rdi)
2432 movdqu -53(%rsi), %xmm0
2439 movdqu %xmm0, -53(%rdi)
/* Offsets of the form 8*k + 6.  */
2517 movdqu -78(%rsi), %xmm0
2518 movdqu -62(%rsi), %xmm1
2525 movdqu %xmm0, -78(%rdi)
2526 movdqu %xmm1, -62(%rdi)
2537 movdqu -70(%rsi), %xmm0
2538 movdqu -54(%rsi), %xmm1
2544 movdqu %xmm0, -70(%rdi)
2545 movdqu %xmm1, -54(%rdi)
2555 movdqu -62(%rsi), %xmm0
2562 movdqu %xmm0, -62(%rdi)
2573 movdqu -54(%rsi), %xmm0
2579 movdqu %xmm0, -54(%rdi)
/* Offsets of the form 8*k + 7.  */
2657 movdqu -79(%rsi), %xmm0
2658 movdqu -63(%rsi), %xmm1
2665 movdqu %xmm0, -79(%rdi)
2666 movdqu %xmm1, -63(%rdi)
2677 movdqu -71(%rsi), %xmm0
2678 movdqu -55(%rsi), %xmm1
2684 movdqu %xmm0, -71(%rdi)
2685 movdqu %xmm1, -55(%rdi)
2695 movdqu -63(%rsi), %xmm0
2702 movdqu %xmm0, -63(%rdi)
2713 movdqu -55(%rsi), %xmm0
2719 movdqu %xmm0, -55(%rdi)
/* Ascending copy that loads with movdqu (no source alignment needed) and
   stores with movntdq, a non-temporal store that requires a 16-byte
   aligned destination.  Labels, pointer updates and the flag-setting
   instructions for the branches are missing from this excerpt.  */
2797 movdqu (%rsi), %xmm1
2800 movntdq %xmm1, (%rdi)
/* Lower %rdx by 0x90 without touching the flags.  */
2802 lea -0x90(%rdx), %rdx
/* Only when built as memmove: choose between the non-temporal loop and
   L(ll_cache_copy_fwd_start).  The comparisons are not visible.  */
2803 #ifdef USE_AS_MEMMOVE
2807 jae L(memmove_is_memcpy_fwd)
2810 jb L(ll_cache_copy_fwd_start)
2811 L(memmove_is_memcpy_fwd):
/* Loop body: load 0x80 bytes, advance %rsi, store 0x80 bytes with
   movntdq, advance %rdi.  */
2814 movdqu (%rsi), %xmm0
2815 movdqu 0x10(%rsi), %xmm1
2816 movdqu 0x20(%rsi), %xmm2
2817 movdqu 0x30(%rsi), %xmm3
2818 movdqu 0x40(%rsi), %xmm4
2819 movdqu 0x50(%rsi), %xmm5
2820 movdqu 0x60(%rsi), %xmm6
2821 movdqu 0x70(%rsi), %xmm7
2822 lea 0x80(%rsi), %rsi
2825 movntdq %xmm0, (%rdi)
2826 movntdq %xmm1, 0x10(%rdi)
2827 movntdq %xmm2, 0x20(%rdi)
2828 movntdq %xmm3, 0x30(%rdi)
2829 movntdq %xmm4, 0x40(%rdi)
2830 movntdq %xmm5, 0x50(%rdi)
2831 movntdq %xmm6, 0x60(%rdi)
2832 movntdq %xmm7, 0x70(%rdi)
2833 lea 0x80(%rdi), %rdi
/* Loop back on an unsigned "not below" result.  */
2834 jae L(large_page_loop)
/* lea leaves the flags alone, so the jl below tests flags produced by an
   instruction that is not visible in this excerpt.  */
2836 lea 0x80(%rdx), %rdx
2837 jl L(large_page_less_64bytes)
/* Copy one further 0x40-byte group the same way.  */
2839 movdqu (%rsi), %xmm0
2840 movdqu 0x10(%rsi), %xmm1
2841 movdqu 0x20(%rsi), %xmm2
2842 movdqu 0x30(%rsi), %xmm3
2843 lea 0x40(%rsi), %rsi
2845 movntdq %xmm0, (%rdi)
2846 movntdq %xmm1, 0x10(%rdi)
2847 movntdq %xmm2, 0x20(%rdi)
2848 movntdq %xmm3, 0x30(%rdi)
2849 lea 0x40(%rdi), %rdi
2851 L(large_page_less_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes).  */
2855 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Variant of the large ascending loop that is assembled only when
   USE_AS_MEMMOVE is defined.  It stores with movaps (ordinary aligned
   stores) instead of movntdq and prefetches ahead of %rsi.  The closing
   directive and the flag-setting instructions are not visible here.  */
2857 #ifdef USE_AS_MEMMOVE
2859 L(ll_cache_copy_fwd_start):
2860 prefetcht0 0x1c0(%rsi)
2861 prefetcht0 0x200(%rsi)
/* Load 0x80 bytes (unaligned loads), then advance %rsi.  */
2862 movdqu (%rsi), %xmm0
2863 movdqu 0x10(%rsi), %xmm1
2864 movdqu 0x20(%rsi), %xmm2
2865 movdqu 0x30(%rsi), %xmm3
2866 movdqu 0x40(%rsi), %xmm4
2867 movdqu 0x50(%rsi), %xmm5
2868 movdqu 0x60(%rsi), %xmm6
2869 movdqu 0x70(%rsi), %xmm7
2870 lea 0x80(%rsi), %rsi
/* Store 0x80 bytes to the 16-byte aligned destination, then advance.  */
2873 movaps %xmm0, (%rdi)
2874 movaps %xmm1, 0x10(%rdi)
2875 movaps %xmm2, 0x20(%rdi)
2876 movaps %xmm3, 0x30(%rdi)
2877 movaps %xmm4, 0x40(%rdi)
2878 movaps %xmm5, 0x50(%rdi)
2879 movaps %xmm6, 0x60(%rdi)
2880 movaps %xmm7, 0x70(%rdi)
2881 lea 0x80(%rdi), %rdi
/* Loop back on an unsigned "not below" result.  */
2882 jae L(ll_cache_copy_fwd_start)
/* lea leaves the flags alone, so the jl below tests flags produced by an
   instruction that is not visible in this excerpt.  */
2884 lea 0x80(%rdx), %rdx
2885 jl L(large_page_ll_less_fwd_64bytes)
/* Copy one further 0x40-byte group the same way.  */
2887 movdqu (%rsi), %xmm0
2888 movdqu 0x10(%rsi), %xmm1
2889 movdqu 0x20(%rsi), %xmm2
2890 movdqu 0x30(%rsi), %xmm3
2891 lea 0x40(%rsi), %rsi
2893 movaps %xmm0, (%rdi)
2894 movaps %xmm1, 0x10(%rdi)
2895 movaps %xmm2, 0x20(%rdi)
2896 movaps %xmm3, 0x30(%rdi)
2897 lea 0x40(%rdi), %rdi
2899 L(large_page_ll_less_fwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes).  */
2902 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward bulk copy using non-temporal stores.  Both cursors point
   just past the data still to be copied and move downwards:
   %rsi = source end cursor, %rdi = destination end cursor,
   %rdx = biased remaining-length counter.
   NOTE(review): the entry label and the cmp/sub instructions feeding
   the conditional jumps are not visible in this excerpt.  */
/* Copy the 16 bytes that end at the cursors.  movdqa requires a 16-byte
   aligned memory operand, so %rdi is presumably aligned -- TODO
   confirm.  */
2907 movdqu -0x10(%rsi), %xmm1
2910 movdqa %xmm1, -0x10(%rdi)
/* %rdx -= 0x90, via lea so the flags are preserved.  */
2912 lea -0x90(%rdx), %rdx
2913 #ifdef USE_AS_MEMMOVE
/* memmove build only: select the non-temporal loop below or the cached
   loop at L(ll_cache_copy_bwd_start).  The comparisons that set the
   flags for these branches are not visible here.  */
2917 jae L(memmove_is_memcpy_bwd)
2919 jb L(ll_cache_copy_bwd_start)
2920 L(memmove_is_memcpy_bwd):
2922 L(large_page_bwd_loop):
/* Load the 0x80 bytes below %rsi, highest chunk first, then move %rsi
   down by 0x80.  */
2923 movdqu -0x10(%rsi), %xmm0
2924 movdqu -0x20(%rsi), %xmm1
2925 movdqu -0x30(%rsi), %xmm2
2926 movdqu -0x40(%rsi), %xmm3
2927 movdqu -0x50(%rsi), %xmm4
2928 movdqu -0x60(%rsi), %xmm5
2929 movdqu -0x70(%rsi), %xmm6
2930 movdqu -0x80(%rsi), %xmm7
2931 lea -0x80(%rsi), %rsi
/* Stream them to the matching offsets below %rdi with non-temporal
   stores, then move %rdi down by 0x80.  */
2934 movntdq %xmm0, -0x10(%rdi)
2935 movntdq %xmm1, -0x20(%rdi)
2936 movntdq %xmm2, -0x30(%rdi)
2937 movntdq %xmm3, -0x40(%rdi)
2938 movntdq %xmm4, -0x50(%rdi)
2939 movntdq %xmm5, -0x60(%rdi)
2940 movntdq %xmm6, -0x70(%rdi)
2941 movntdq %xmm7, -0x80(%rdi)
2942 lea -0x80(%rdi), %rdi
/* Loop while CF is clear; the flag-setting instruction is not visible
   in this excerpt (presumably a subtraction from %rdx).  */
2943 jae L(large_page_bwd_loop)
/* %rdx += 0x80 without disturbing the flags tested by the signed jl.  */
2945 lea 0x80(%rdx), %rdx
2946 jl L(large_page_less_bwd_64bytes)
/* Copy one more 0x40-byte chunk downwards and lower both cursors.  */
2948 movdqu -0x10(%rsi), %xmm0
2949 movdqu -0x20(%rsi), %xmm1
2950 movdqu -0x30(%rsi), %xmm2
2951 movdqu -0x40(%rsi), %xmm3
2952 lea -0x40(%rsi), %rsi
2954 movntdq %xmm0, -0x10(%rdi)
2955 movntdq %xmm1, -0x20(%rdi)
2956 movntdq %xmm2, -0x30(%rdi)
2957 movntdq %xmm3, -0x40(%rdi)
2958 lea -0x40(%rdi), %rdi
2960 L(large_page_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (4-byte entries);
   the macro uses %r11 as scratch and overwrites %rdx.  */
2962 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward bulk copy with ordinary (cached) aligned stores, built only
   when USE_AS_MEMMOVE is defined.  It mirrors the non-temporal backward
   loop but uses movaps and software prefetch.
   NOTE(review): the instructions that set the flags for jae/jl below,
   and the matching #endif, are not visible in this excerpt.  */
2964 #ifdef USE_AS_MEMMOVE
2966 L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below %rsi into all cache
   levels.  */
2967 prefetcht0 -0x1c0(%rsi)
2968 prefetcht0 -0x200(%rsi)
/* Load the 0x80 bytes below %rsi (unaligned loads), then move %rsi down
   by 0x80.  */
2969 movdqu -0x10(%rsi), %xmm0
2970 movdqu -0x20(%rsi), %xmm1
2971 movdqu -0x30(%rsi), %xmm2
2972 movdqu -0x40(%rsi), %xmm3
2973 movdqu -0x50(%rsi), %xmm4
2974 movdqu -0x60(%rsi), %xmm5
2975 movdqu -0x70(%rsi), %xmm6
2976 movdqu -0x80(%rsi), %xmm7
2977 lea -0x80(%rsi), %rsi
/* Store them below %rdi with movaps, then move %rdi down by 0x80.
   movaps faults on a memory operand that is not 16-byte aligned, so
   %rdi is presumably aligned here -- TODO confirm.  */
2980 movaps %xmm0, -0x10(%rdi)
2981 movaps %xmm1, -0x20(%rdi)
2982 movaps %xmm2, -0x30(%rdi)
2983 movaps %xmm3, -0x40(%rdi)
2984 movaps %xmm4, -0x50(%rdi)
2985 movaps %xmm5, -0x60(%rdi)
2986 movaps %xmm6, -0x70(%rdi)
2987 movaps %xmm7, -0x80(%rdi)
2988 lea -0x80(%rdi), %rdi
/* Loop while CF is clear; the flag-setting instruction is not visible
   in this excerpt (presumably a subtraction from %rdx).  */
2989 jae L(ll_cache_copy_bwd_start)
/* %rdx += 0x80 without touching the flags tested by the signed jl.  */
2991 lea 0x80(%rdx), %rdx
2992 jl L(large_page_ll_less_bwd_64bytes)
/* Copy one more 0x40-byte chunk downwards and lower both cursors.  */
2994 movdqu -0x10(%rsi), %xmm0
2995 movdqu -0x20(%rsi), %xmm1
2996 movdqu -0x30(%rsi), %xmm2
2997 movdqu -0x40(%rsi), %xmm3
2998 lea -0x40(%rsi), %rsi
3000 movaps %xmm0, -0x10(%rdi)
3001 movaps %xmm1, -0x20(%rdi)
3002 movaps %xmm2, -0x30(%rdi)
3003 movaps %xmm3, -0x40(%rdi)
3004 lea -0x40(%rdi), %rdi
3006 L(large_page_ll_less_bwd_64bytes):
/* Dispatch on %rdx through L(table_less_80bytes) (4-byte entries);
   the macro uses %r11 as scratch and overwrites %rdx.  */
3007 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Jump table for the final 0..79 byte copies.  The table is placed in
   the allocatable, non-writable section .rodata.ssse3.  Entry N is a
   32-bit value JMPTBL (L(write_Nbytes), L(table_less_80bytes)), i.e.
   the address of the handler minus the address of the table, so the
   table holds no absolute addresses.  Entries are 4 bytes each, which
   matches the scale of 4 used by the BRANCH_TO_JMPTBL_ENTRY call sites;
   the index must therefore stay within 0..79.  The order of the entries
   is the contract: do not reorder or remove any of them.  */
3012 .section .rodata.ssse3,"a",@progbits
3014 L(table_less_80bytes):
3015 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3016 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3017 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3018 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3019 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3020 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3021 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3022 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3023 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3024 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3025 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3026 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3027 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3028 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3029 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3030 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3031 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
/* Jump table with 16 entries, L(shl_0) .. L(shl_15).  Each entry is the
   32-bit offset of its handler from L(shl_table), as produced by
   JMPTBL (I, B) = I - B.  The table is consumed earlier in the file by
   BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4), so the index in %r9
   must stay within 0..15 and the entry order must not change.
   NOTE(review): the L(shl_table) label itself is not visible in this
   excerpt; confirm it is defined immediately before the first entry.
   Presumably N in shl_N is a 0..15 byte shift/misalignment amount --
   verify against the handlers.  */
3098 .int JMPTBL (L(shl_0), L(shl_table))
3099 .int JMPTBL (L(shl_1), L(shl_table))
3100 .int JMPTBL (L(shl_2), L(shl_table))
3101 .int JMPTBL (L(shl_3), L(shl_table))
3102 .int JMPTBL (L(shl_4), L(shl_table))
3103 .int JMPTBL (L(shl_5), L(shl_table))
3104 .int JMPTBL (L(shl_6), L(shl_table))
3105 .int JMPTBL (L(shl_7), L(shl_table))
3106 .int JMPTBL (L(shl_8), L(shl_table))
3107 .int JMPTBL (L(shl_9), L(shl_table))
3108 .int JMPTBL (L(shl_10), L(shl_table))
3109 .int JMPTBL (L(shl_11), L(shl_table))
3110 .int JMPTBL (L(shl_12), L(shl_table))
3111 .int JMPTBL (L(shl_13), L(shl_table))
3112 .int JMPTBL (L(shl_14), L(shl_table))
3113 .int JMPTBL (L(shl_15), L(shl_table))
/* Jump table with 16 entries, L(shl_0_bwd) .. L(shl_15_bwd), the
   backward-copy counterpart of the L(shl_N) table.  Each entry is the
   32-bit offset of its handler from L(shl_table_bwd), as produced by
   JMPTBL (I, B) = I - B; entries are 4 bytes each and their order is
   significant.
   NOTE(review): the L(shl_table_bwd) label and the code that indexes
   this table are not visible in this excerpt; confirm the label is
   defined immediately before the first entry.  */
3117 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3118 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3119 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3120 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3121 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3122 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3123 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3124 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3125 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3126 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3127 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3128 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3129 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3130 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3131 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3132 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))