2 Copyright (C) 2010-2018 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 #include "asm-syntax.h"
27 # define MEMCPY __memcpy_ssse3
28 # define MEMCPY_CHK __memcpy_chk_ssse3
29 # define MEMPCPY __mempcpy_ssse3
30 # define MEMPCPY_CHK __mempcpy_chk_ssse3
33 #define JMPTBL(I, B) I - B
35 /* Branch to an entry in a jump table. TABLE is a jump table with
36 relative offsets. INDEX is a register that contains the index into the
37 jump table. SCALE is the scale of INDEX. */
38 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
39 lea TABLE(%rip), %r11; /* %r11 = address of TABLE (overwritten) */ \
40 movslq (%r11, INDEX, SCALE), INDEX; /* INDEX = sign-extended 32-bit table entry */ \
41 lea (%r11, INDEX), INDEX; /* entries are TABLE-relative: add the base back */ \
42 _CET_NOTRACK jmp *INDEX; /* indirect jump to the selected entry */ \
45 .section .text.ssse3,"ax",@progbits
46 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
49 jb HIDDEN_JUMPTARGET (__chk_fail)
59 #if !defined USE_AS_BCOPY
62 jb HIDDEN_JUMPTARGET (__chk_fail)
83 lea L(table_less_80bytes)(%rip), %r11
85 movslq (%r11, %rdx, 4), %r9
94 #ifndef USE_AS_MEMMOVE
108 #ifdef SHARED_CACHE_SIZE_HALF
109 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
111 mov __x86_shared_cache_size_half(%rip), %RCX_LP
118 #ifdef DATA_CACHE_SIZE_HALF
119 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
121 mov __x86_data_cache_size_half(%rip), %RCX_LP
123 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
127 movdqu -16(%rsi, %rdx), %xmm0
129 lea -16(%rdi, %rdx), %r8
138 #ifdef SHARED_CACHE_SIZE_HALF
139 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
141 mov __x86_shared_cache_size_half(%rip), %RCX_LP
149 #ifdef DATA_CACHE_SIZE_HALF
150 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
152 mov __x86_data_cache_size_half(%rip), %RCX_LP
154 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
167 jb L(shl_0_less_64bytes)
169 movaps 16(%rsi), %xmm1
170 movaps 32(%rsi), %xmm2
171 movaps 48(%rsi), %xmm3
173 movaps %xmm1, 16(%rdi)
174 movaps %xmm2, 32(%rdi)
175 movaps %xmm3, 48(%rdi)
/* Branch to the L(table_less_80bytes) entry indexed by %rdx (entries are
   4 bytes wide).  The macro overwrites %rdx and %r11.  */
179 L(shl_0_less_64bytes):
182 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
186 #ifdef DATA_CACHE_SIZE_HALF
187 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
189 cmp __x86_data_cache_size_half(%rip), %RDX_LP
192 jae L(shl_0_gobble_mem_loop)
/* Forward copy loop built from aligned 16-byte moves (movaps): each chunk
   is loaded from an offset of %rsi and stored to the same offset of %rdi.
   NOTE(review): movaps faults on unaligned addresses, so both pointers are
   presumably 16-byte aligned on entry -- confirm against the dispatch code.  */
193 L(shl_0_gobble_cache_loop):
195 movaps 0x10(%rsi), %xmm1
196 movaps 0x20(%rsi), %xmm2
197 movaps 0x30(%rsi), %xmm3
200 movaps %xmm1, 0x10(%rdi)
201 movaps %xmm2, 0x20(%rdi)
202 movaps %xmm3, 0x30(%rdi)
/* Second half of the iteration: offsets 0x40..0x70.  */
205 movaps 0x40(%rsi), %xmm4
206 movaps 0x50(%rsi), %xmm5
207 movaps 0x60(%rsi), %xmm6
208 movaps 0x70(%rsi), %xmm7
210 movaps %xmm4, 0x40(%rdi)
211 movaps %xmm5, 0x50(%rdi)
212 movaps %xmm6, 0x60(%rdi)
213 movaps %xmm7, 0x70(%rdi)
/* Iterate again while the carry flag is clear (unsigned "no borrow").  */
216 jae L(shl_0_gobble_cache_loop)
/* Signed test: skip the 64-byte step below when the result was "less".  */
219 jl L(shl_0_cache_less_64bytes)
/* Copy a further block with aligned moves; %xmm1 is reused for the
   0x10 and 0x30 chunks.  */
223 movdqa 0x10(%rsi), %xmm1
226 movdqa %xmm1, 0x10(%rdi)
228 movdqa 0x20(%rsi), %xmm4
229 movdqa 0x30(%rsi), %xmm1
232 movdqa %xmm4, 0x20(%rdi)
233 movdqa %xmm1, 0x30(%rdi)
/* Hand the remainder to the jump table: branch to the
   L(table_less_80bytes) entry selected by %rdx (scale 4).  */
235 L(shl_0_cache_less_64bytes):
238 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Forward copy loop with software prefetch: aligned 16-byte loads
   (movdqa) from %rsi are stored to the same offsets of %rdi.  */
241 L(shl_0_gobble_mem_loop):
/* Prefetch source data 0x1c0 and 0x280 bytes ahead into all cache levels.  */
242 prefetcht0 0x1c0(%rsi)
243 prefetcht0 0x280(%rsi)
246 movdqa 0x10(%rsi), %xmm1
247 movdqa 0x20(%rsi), %xmm2
248 movdqa 0x30(%rsi), %xmm3
249 movdqa 0x40(%rsi), %xmm4
250 movdqa 0x50(%rsi), %xmm5
251 movdqa 0x60(%rsi), %xmm6
252 movdqa 0x70(%rsi), %xmm7
256 movdqa %xmm1, 0x10(%rdi)
257 movdqa %xmm2, 0x20(%rdi)
258 movdqa %xmm3, 0x30(%rdi)
259 movdqa %xmm4, 0x40(%rdi)
260 movdqa %xmm5, 0x50(%rdi)
261 movdqa %xmm6, 0x60(%rdi)
262 movdqa %xmm7, 0x70(%rdi)
/* Iterate again while the carry flag is clear.  */
265 jae L(shl_0_gobble_mem_loop)
/* Signed test: skip the block below when the result was "less".  */
268 jl L(shl_0_mem_less_64bytes)
/* Copy a further block with aligned moves, reusing %xmm0/%xmm1.  */
272 movdqa 0x10(%rsi), %xmm1
275 movdqa %xmm1, 0x10(%rdi)
277 movdqa 0x20(%rsi), %xmm0
278 movdqa 0x30(%rsi), %xmm1
281 movdqa %xmm0, 0x20(%rdi)
282 movdqa %xmm1, 0x30(%rdi)
/* Branches straight to L(shl_0_mem_less_32bytes) when the carry flag is
   set; otherwise copies the aligned 16-byte chunk at offset 0x10 first.  */
284 L(shl_0_mem_less_64bytes):
286 jb L(shl_0_mem_less_32bytes)
289 movdqa 0x10(%rsi), %xmm1
292 movdqa %xmm1, 0x10(%rdi)
/* Dispatch through L(table_less_80bytes) using %rdx as the index
   (4-byte entries); %rdx and %r11 are overwritten by the macro.  */
294 L(shl_0_mem_less_32bytes):
297 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
302 movdqa -0x10(%rsi), %xmm1
304 movdqa %xmm1, -0x10(%rdi)
308 ja L(shl_0_gobble_bwd)
310 jb L(shl_0_less_64bytes_bwd)
311 movaps -0x10(%rsi), %xmm0
312 movaps -0x20(%rsi), %xmm1
313 movaps -0x30(%rsi), %xmm2
314 movaps -0x40(%rsi), %xmm3
315 movaps %xmm0, -0x10(%rdi)
316 movaps %xmm1, -0x20(%rdi)
317 movaps %xmm2, -0x30(%rdi)
318 movaps %xmm3, -0x40(%rdi)
/* Backward-path counterpart of the tail dispatch: jump to the
   L(table_less_80bytes) entry indexed by %rdx (scale 4).  */
322 L(shl_0_less_64bytes_bwd):
323 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
327 #ifdef DATA_CACHE_SIZE_HALF
328 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
330 cmp __x86_data_cache_size_half(%rip), %RDX_LP
333 jae L(shl_0_gobble_mem_bwd_loop)
/* Backward copy loop: moves 0x80 bytes per iteration from just below
   %rsi to just below %rdi with aligned 16-byte accesses, then lowers
   both pointers by 0x80.  */
334 L(shl_0_gobble_bwd_loop):
335 movdqa -0x10(%rsi), %xmm0
336 movaps -0x20(%rsi), %xmm1
337 movaps -0x30(%rsi), %xmm2
338 movaps -0x40(%rsi), %xmm3
340 movdqa %xmm0, -0x10(%rdi)
341 movaps %xmm1, -0x20(%rdi)
342 movaps %xmm2, -0x30(%rdi)
343 movaps %xmm3, -0x40(%rdi)
346 movaps -0x50(%rsi), %xmm4
347 movaps -0x60(%rsi), %xmm5
348 movaps -0x70(%rsi), %xmm6
349 movaps -0x80(%rsi), %xmm7
/* lea is used for the pointer updates because it leaves the flags intact.  */
350 lea -0x80(%rsi), %rsi
351 movaps %xmm4, -0x50(%rdi)
352 movaps %xmm5, -0x60(%rdi)
353 movaps %xmm6, -0x70(%rdi)
354 movaps %xmm7, -0x80(%rdi)
355 lea -0x80(%rdi), %rdi
/* Iterate again while the carry flag is clear.  */
357 jae L(shl_0_gobble_bwd_loop)
/* Signed test: skip the 64-byte step below when the result was "less".  */
360 jl L(shl_0_gobble_bwd_less_64bytes)
/* Copy the 0x40 bytes below the current pointers, two chunks at a time.  */
362 movdqa -0x10(%rsi), %xmm0
364 movdqa -0x20(%rsi), %xmm1
366 movdqa %xmm0, -0x10(%rdi)
367 movdqa %xmm1, -0x20(%rdi)
369 movdqa -0x30(%rsi), %xmm0
370 movdqa -0x40(%rsi), %xmm1
373 movdqa %xmm0, -0x30(%rdi)
374 movdqa %xmm1, -0x40(%rdi)
/* Finish via the jump table: branch to the L(table_less_80bytes)
   entry indexed by %rdx (4-byte entries).  */
376 L(shl_0_gobble_bwd_less_64bytes):
377 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward copy loop with software prefetch: moves 0x80 bytes per
   iteration from below %rsi to below %rdi using aligned movdqa, then
   lowers both pointers by 0x80.  */
380 L(shl_0_gobble_mem_bwd_loop):
/* Prefetch source data 0x1c0 and 0x280 bytes below %rsi.  */
381 prefetcht0 -0x1c0(%rsi)
382 prefetcht0 -0x280(%rsi)
383 movdqa -0x10(%rsi), %xmm0
384 movdqa -0x20(%rsi), %xmm1
385 movdqa -0x30(%rsi), %xmm2
386 movdqa -0x40(%rsi), %xmm3
387 movdqa -0x50(%rsi), %xmm4
388 movdqa -0x60(%rsi), %xmm5
389 movdqa -0x70(%rsi), %xmm6
390 movdqa -0x80(%rsi), %xmm7
/* lea updates the pointer without touching the flags.  */
391 lea -0x80(%rsi), %rsi
393 movdqa %xmm0, -0x10(%rdi)
394 movdqa %xmm1, -0x20(%rdi)
395 movdqa %xmm2, -0x30(%rdi)
396 movdqa %xmm3, -0x40(%rdi)
397 movdqa %xmm4, -0x50(%rdi)
398 movdqa %xmm5, -0x60(%rdi)
399 movdqa %xmm6, -0x70(%rdi)
400 movdqa %xmm7, -0x80(%rdi)
401 lea -0x80(%rdi), %rdi
/* Iterate again while the carry flag is clear.  */
403 jae L(shl_0_gobble_mem_bwd_loop)
/* Signed test: skip the 64-byte step below when the result was "less".  */
406 jl L(shl_0_mem_bwd_less_64bytes)
/* Copy the 0x40 bytes below the current pointers, two chunks at a time.  */
408 movdqa -0x10(%rsi), %xmm0
410 movdqa -0x20(%rsi), %xmm1
412 movdqa %xmm0, -0x10(%rdi)
413 movdqa %xmm1, -0x20(%rdi)
415 movdqa -0x30(%rsi), %xmm0
416 movdqa -0x40(%rsi), %xmm1
419 movdqa %xmm0, -0x30(%rdi)
420 movdqa %xmm1, -0x40(%rdi)
/* Branches to L(shl_0_mem_bwd_less_32bytes) when the carry flag is set;
   otherwise copies the 0x20 bytes below the pointers with aligned moves.  */
422 L(shl_0_mem_bwd_less_64bytes):
424 jb L(shl_0_mem_bwd_less_32bytes)
425 movdqa -0x10(%rsi), %xmm0
427 movdqa -0x20(%rsi), %xmm1
429 movdqa %xmm0, -0x10(%rdi)
430 movdqa %xmm1, -0x20(%rdi)
/* Dispatch through L(table_less_80bytes) with %rdx as the index
   (scale 4); the macro overwrites %rdx and %r11.  */
432 L(shl_0_mem_bwd_less_32bytes):
433 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
437 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
439 movaps -0x01(%rsi), %xmm1
441 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
444 _CET_NOTRACK jmp *%r9
447 prefetchnta 0x1c0(%rsi)
450 movaps 0x0f(%rsi), %xmm2
451 movaps 0x1f(%rsi), %xmm3
452 movaps 0x2f(%rsi), %xmm4
453 movaps 0x3f(%rsi), %xmm5
455 palignr $1, %xmm4, %xmm5
457 palignr $1, %xmm3, %xmm4
458 palignr $1, %xmm2, %xmm3
460 palignr $1, %xmm1, %xmm2
462 movdqa %xmm2, -0x40(%rdi)
463 movaps %xmm3, -0x30(%rdi)
465 movaps %xmm4, -0x20(%rdi)
466 movaps %xmm5, -0x10(%rdi)
467 _CET_NOTRACK jmp *%r9
470 movaps %xmm4, -0x20(%rdi)
472 movaps %xmm5, -0x10(%rdi)
476 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
480 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
482 movaps -0x01(%rsi), %xmm1
484 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
487 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 1-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $1 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   1 byte above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
489 L(shl_1_bwd_loop_L2):
490 prefetchnta -0x1c0(%rsi)
491 L(shl_1_bwd_loop_L1):
492 movaps -0x11(%rsi), %xmm2
494 movaps -0x21(%rsi), %xmm3
495 movaps -0x31(%rsi), %xmm4
496 movaps -0x41(%rsi), %xmm5
497 lea -0x40(%rsi), %rsi
/* palignr $1, src, dst: dst = low 16 bytes of ((dst:src) >> 1 byte).  */
498 palignr $1, %xmm2, %xmm1
499 palignr $1, %xmm3, %xmm2
500 palignr $1, %xmm4, %xmm3
501 palignr $1, %xmm5, %xmm4
503 movaps %xmm1, -0x10(%rdi)
506 movaps %xmm2, -0x20(%rdi)
507 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
509 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
512 _CET_NOTRACK jmp *%r9
518 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
522 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
524 movaps -0x02(%rsi), %xmm1
526 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
529 _CET_NOTRACK jmp *%r9
532 prefetchnta 0x1c0(%rsi)
535 movaps 0x0e(%rsi), %xmm2
536 movaps 0x1e(%rsi), %xmm3
537 movaps 0x2e(%rsi), %xmm4
538 movaps 0x3e(%rsi), %xmm5
540 palignr $2, %xmm4, %xmm5
542 palignr $2, %xmm3, %xmm4
543 palignr $2, %xmm2, %xmm3
545 palignr $2, %xmm1, %xmm2
547 movdqa %xmm2, -0x40(%rdi)
548 movaps %xmm3, -0x30(%rdi)
550 movaps %xmm4, -0x20(%rdi)
551 movaps %xmm5, -0x10(%rdi)
552 _CET_NOTRACK jmp *%r9
555 movaps %xmm4, -0x20(%rdi)
557 movaps %xmm5, -0x10(%rdi)
561 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
565 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
567 movaps -0x02(%rsi), %xmm1
569 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
572 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 2-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $2 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   2 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
574 L(shl_2_bwd_loop_L2):
575 prefetchnta -0x1c0(%rsi)
576 L(shl_2_bwd_loop_L1):
577 movaps -0x12(%rsi), %xmm2
579 movaps -0x22(%rsi), %xmm3
580 movaps -0x32(%rsi), %xmm4
581 movaps -0x42(%rsi), %xmm5
582 lea -0x40(%rsi), %rsi
/* palignr $2, src, dst: dst = low 16 bytes of ((dst:src) >> 2 bytes).  */
583 palignr $2, %xmm2, %xmm1
584 palignr $2, %xmm3, %xmm2
585 palignr $2, %xmm4, %xmm3
586 palignr $2, %xmm5, %xmm4
588 movaps %xmm1, -0x10(%rdi)
591 movaps %xmm2, -0x20(%rdi)
592 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
594 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
597 _CET_NOTRACK jmp *%r9
603 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
607 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
609 movaps -0x03(%rsi), %xmm1
611 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
614 _CET_NOTRACK jmp *%r9
617 prefetchnta 0x1c0(%rsi)
620 movaps 0x0d(%rsi), %xmm2
621 movaps 0x1d(%rsi), %xmm3
622 movaps 0x2d(%rsi), %xmm4
623 movaps 0x3d(%rsi), %xmm5
625 palignr $3, %xmm4, %xmm5
627 palignr $3, %xmm3, %xmm4
628 palignr $3, %xmm2, %xmm3
630 palignr $3, %xmm1, %xmm2
632 movdqa %xmm2, -0x40(%rdi)
633 movaps %xmm3, -0x30(%rdi)
635 movaps %xmm4, -0x20(%rdi)
636 movaps %xmm5, -0x10(%rdi)
637 _CET_NOTRACK jmp *%r9
640 movaps %xmm4, -0x20(%rdi)
642 movaps %xmm5, -0x10(%rdi)
646 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
650 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
652 movaps -0x03(%rsi), %xmm1
654 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
657 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 3-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $3 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   3 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
659 L(shl_3_bwd_loop_L2):
660 prefetchnta -0x1c0(%rsi)
661 L(shl_3_bwd_loop_L1):
662 movaps -0x13(%rsi), %xmm2
664 movaps -0x23(%rsi), %xmm3
665 movaps -0x33(%rsi), %xmm4
666 movaps -0x43(%rsi), %xmm5
667 lea -0x40(%rsi), %rsi
/* palignr $3, src, dst: dst = low 16 bytes of ((dst:src) >> 3 bytes).  */
668 palignr $3, %xmm2, %xmm1
669 palignr $3, %xmm3, %xmm2
670 palignr $3, %xmm4, %xmm3
671 palignr $3, %xmm5, %xmm4
673 movaps %xmm1, -0x10(%rdi)
676 movaps %xmm2, -0x20(%rdi)
677 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
679 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
682 _CET_NOTRACK jmp *%r9
688 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
692 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
694 movaps -0x04(%rsi), %xmm1
696 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
699 _CET_NOTRACK jmp *%r9
702 prefetchnta 0x1c0(%rsi)
705 movaps 0x0c(%rsi), %xmm2
706 movaps 0x1c(%rsi), %xmm3
707 movaps 0x2c(%rsi), %xmm4
708 movaps 0x3c(%rsi), %xmm5
710 palignr $4, %xmm4, %xmm5
712 palignr $4, %xmm3, %xmm4
713 palignr $4, %xmm2, %xmm3
715 palignr $4, %xmm1, %xmm2
717 movdqa %xmm2, -0x40(%rdi)
718 movaps %xmm3, -0x30(%rdi)
720 movaps %xmm4, -0x20(%rdi)
721 movaps %xmm5, -0x10(%rdi)
722 _CET_NOTRACK jmp *%r9
725 movaps %xmm4, -0x20(%rdi)
727 movaps %xmm5, -0x10(%rdi)
731 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
735 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
737 movaps -0x04(%rsi), %xmm1
739 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
742 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 4-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $4 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   4 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
744 L(shl_4_bwd_loop_L2):
745 prefetchnta -0x1c0(%rsi)
746 L(shl_4_bwd_loop_L1):
747 movaps -0x14(%rsi), %xmm2
749 movaps -0x24(%rsi), %xmm3
750 movaps -0x34(%rsi), %xmm4
751 movaps -0x44(%rsi), %xmm5
752 lea -0x40(%rsi), %rsi
/* palignr $4, src, dst: dst = low 16 bytes of ((dst:src) >> 4 bytes).  */
753 palignr $4, %xmm2, %xmm1
754 palignr $4, %xmm3, %xmm2
755 palignr $4, %xmm4, %xmm3
756 palignr $4, %xmm5, %xmm4
758 movaps %xmm1, -0x10(%rdi)
761 movaps %xmm2, -0x20(%rdi)
762 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
764 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
767 _CET_NOTRACK jmp *%r9
773 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
777 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
779 movaps -0x05(%rsi), %xmm1
781 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
784 _CET_NOTRACK jmp *%r9
787 prefetchnta 0x1c0(%rsi)
790 movaps 0x0b(%rsi), %xmm2
791 movaps 0x1b(%rsi), %xmm3
792 movaps 0x2b(%rsi), %xmm4
793 movaps 0x3b(%rsi), %xmm5
795 palignr $5, %xmm4, %xmm5
797 palignr $5, %xmm3, %xmm4
798 palignr $5, %xmm2, %xmm3
800 palignr $5, %xmm1, %xmm2
802 movdqa %xmm2, -0x40(%rdi)
803 movaps %xmm3, -0x30(%rdi)
805 movaps %xmm4, -0x20(%rdi)
806 movaps %xmm5, -0x10(%rdi)
807 _CET_NOTRACK jmp *%r9
810 movaps %xmm4, -0x20(%rdi)
812 movaps %xmm5, -0x10(%rdi)
816 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
820 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
822 movaps -0x05(%rsi), %xmm1
824 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
827 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 5-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $5 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   5 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
829 L(shl_5_bwd_loop_L2):
830 prefetchnta -0x1c0(%rsi)
831 L(shl_5_bwd_loop_L1):
832 movaps -0x15(%rsi), %xmm2
834 movaps -0x25(%rsi), %xmm3
835 movaps -0x35(%rsi), %xmm4
836 movaps -0x45(%rsi), %xmm5
837 lea -0x40(%rsi), %rsi
/* palignr $5, src, dst: dst = low 16 bytes of ((dst:src) >> 5 bytes).  */
838 palignr $5, %xmm2, %xmm1
839 palignr $5, %xmm3, %xmm2
840 palignr $5, %xmm4, %xmm3
841 palignr $5, %xmm5, %xmm4
843 movaps %xmm1, -0x10(%rdi)
846 movaps %xmm2, -0x20(%rdi)
847 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
849 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
852 _CET_NOTRACK jmp *%r9
858 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
862 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
864 movaps -0x06(%rsi), %xmm1
866 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
869 _CET_NOTRACK jmp *%r9
872 prefetchnta 0x1c0(%rsi)
875 movaps 0x0a(%rsi), %xmm2
876 movaps 0x1a(%rsi), %xmm3
877 movaps 0x2a(%rsi), %xmm4
878 movaps 0x3a(%rsi), %xmm5
880 palignr $6, %xmm4, %xmm5
882 palignr $6, %xmm3, %xmm4
883 palignr $6, %xmm2, %xmm3
885 palignr $6, %xmm1, %xmm2
887 movdqa %xmm2, -0x40(%rdi)
888 movaps %xmm3, -0x30(%rdi)
890 movaps %xmm4, -0x20(%rdi)
891 movaps %xmm5, -0x10(%rdi)
892 _CET_NOTRACK jmp *%r9
895 movaps %xmm4, -0x20(%rdi)
897 movaps %xmm5, -0x10(%rdi)
901 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
905 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
907 movaps -0x06(%rsi), %xmm1
909 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
912 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 6-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $6 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   6 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
914 L(shl_6_bwd_loop_L2):
915 prefetchnta -0x1c0(%rsi)
916 L(shl_6_bwd_loop_L1):
917 movaps -0x16(%rsi), %xmm2
919 movaps -0x26(%rsi), %xmm3
920 movaps -0x36(%rsi), %xmm4
921 movaps -0x46(%rsi), %xmm5
922 lea -0x40(%rsi), %rsi
/* palignr $6, src, dst: dst = low 16 bytes of ((dst:src) >> 6 bytes).  */
923 palignr $6, %xmm2, %xmm1
924 palignr $6, %xmm3, %xmm2
925 palignr $6, %xmm4, %xmm3
926 palignr $6, %xmm5, %xmm4
928 movaps %xmm1, -0x10(%rdi)
931 movaps %xmm2, -0x20(%rdi)
932 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
934 movaps %xmm3, 0x10(%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
937 _CET_NOTRACK jmp *%r9
943 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
947 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
949 movaps -0x07(%rsi), %xmm1
951 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
954 _CET_NOTRACK jmp *%r9
957 prefetchnta 0x1c0(%rsi)
960 movaps 0x09(%rsi), %xmm2
961 movaps 0x19(%rsi), %xmm3
962 movaps 0x29(%rsi), %xmm4
963 movaps 0x39(%rsi), %xmm5
965 palignr $7, %xmm4, %xmm5
967 palignr $7, %xmm3, %xmm4
968 palignr $7, %xmm2, %xmm3
970 palignr $7, %xmm1, %xmm2
972 movdqa %xmm2, -0x40(%rdi)
973 movaps %xmm3, -0x30(%rdi)
975 movaps %xmm4, -0x20(%rdi)
976 movaps %xmm5, -0x10(%rdi)
977 _CET_NOTRACK jmp *%r9
980 movaps %xmm4, -0x20(%rdi)
982 movaps %xmm5, -0x10(%rdi)
986 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
990 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
992 movaps -0x07(%rsi), %xmm1
994 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
997 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 7-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $7 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   7 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
999 L(shl_7_bwd_loop_L2):
1000 prefetchnta -0x1c0(%rsi)
1001 L(shl_7_bwd_loop_L1):
1002 movaps -0x17(%rsi), %xmm2
1004 movaps -0x27(%rsi), %xmm3
1005 movaps -0x37(%rsi), %xmm4
1006 movaps -0x47(%rsi), %xmm5
1007 lea -0x40(%rsi), %rsi
/* palignr $7, src, dst: dst = low 16 bytes of ((dst:src) >> 7 bytes).  */
1008 palignr $7, %xmm2, %xmm1
1009 palignr $7, %xmm3, %xmm2
1010 palignr $7, %xmm4, %xmm3
1011 palignr $7, %xmm5, %xmm4
1013 movaps %xmm1, -0x10(%rdi)
1016 movaps %xmm2, -0x20(%rdi)
1017 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi)
   and (%rdi) is the old -0x40(%rdi).  */
1019 movaps %xmm3, 0x10(%rdi)
1021 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1022 _CET_NOTRACK jmp *%r9
1025 movaps %xmm4, (%rdi)
1028 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1032 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1034 movaps -0x08(%rsi), %xmm1
1036 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1039 _CET_NOTRACK jmp *%r9
1041 prefetchnta 0x1c0(%rsi)
1044 movaps 0x08(%rsi), %xmm2
1045 movaps 0x18(%rsi), %xmm3
1046 movaps 0x28(%rsi), %xmm4
1047 movaps 0x38(%rsi), %xmm5
1049 palignr $8, %xmm4, %xmm5
1051 palignr $8, %xmm3, %xmm4
1052 palignr $8, %xmm2, %xmm3
1054 palignr $8, %xmm1, %xmm2
1056 movdqa %xmm2, -0x40(%rdi)
1057 movaps %xmm3, -0x30(%rdi)
1059 movaps %xmm4, -0x20(%rdi)
1060 movaps %xmm5, -0x10(%rdi)
1061 _CET_NOTRACK jmp *%r9
1066 movaps %xmm4, -0x20(%rdi)
1068 movaps %xmm5, -0x10(%rdi)
1071 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1075 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1077 movaps -0x08(%rsi), %xmm1
1079 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1082 _CET_NOTRACK jmp *%r9
/* Backward copy loop for an 8-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $8 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   8 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1084 L(shl_8_bwd_loop_L2):
1085 prefetchnta -0x1c0(%rsi)
1086 L(shl_8_bwd_loop_L1):
1087 movaps -0x18(%rsi), %xmm2
1089 movaps -0x28(%rsi), %xmm3
1090 movaps -0x38(%rsi), %xmm4
1091 movaps -0x48(%rsi), %xmm5
1092 lea -0x40(%rsi), %rsi
/* palignr $8, src, dst: dst = low 16 bytes of ((dst:src) >> 8 bytes).  */
1093 palignr $8, %xmm2, %xmm1
1094 palignr $8, %xmm3, %xmm2
1095 palignr $8, %xmm4, %xmm3
1096 palignr $8, %xmm5, %xmm4
1098 movaps %xmm1, -0x10(%rdi)
1101 movaps %xmm2, -0x20(%rdi)
1102 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi)
   and (%rdi) is the old -0x40(%rdi).  */
1104 movaps %xmm3, 0x10(%rdi)
1106 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1107 _CET_NOTRACK jmp *%r9
1110 movaps %xmm4, (%rdi)
1113 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1117 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1119 movaps -0x09(%rsi), %xmm1
1121 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1124 _CET_NOTRACK jmp *%r9
1127 prefetchnta 0x1c0(%rsi)
1130 movaps 0x07(%rsi), %xmm2
1131 movaps 0x17(%rsi), %xmm3
1132 movaps 0x27(%rsi), %xmm4
1133 movaps 0x37(%rsi), %xmm5
1135 palignr $9, %xmm4, %xmm5
1137 palignr $9, %xmm3, %xmm4
1138 palignr $9, %xmm2, %xmm3
1140 palignr $9, %xmm1, %xmm2
1142 movdqa %xmm2, -0x40(%rdi)
1143 movaps %xmm3, -0x30(%rdi)
1145 movaps %xmm4, -0x20(%rdi)
1146 movaps %xmm5, -0x10(%rdi)
1147 _CET_NOTRACK jmp *%r9
1150 movaps %xmm4, -0x20(%rdi)
1152 movaps %xmm5, -0x10(%rdi)
1156 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1160 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1162 movaps -0x09(%rsi), %xmm1
1164 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1167 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 9-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $9 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   9 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1169 L(shl_9_bwd_loop_L2):
1170 prefetchnta -0x1c0(%rsi)
1171 L(shl_9_bwd_loop_L1):
1172 movaps -0x19(%rsi), %xmm2
1174 movaps -0x29(%rsi), %xmm3
1175 movaps -0x39(%rsi), %xmm4
1176 movaps -0x49(%rsi), %xmm5
1177 lea -0x40(%rsi), %rsi
/* palignr $9, src, dst: dst = low 16 bytes of ((dst:src) >> 9 bytes).  */
1178 palignr $9, %xmm2, %xmm1
1179 palignr $9, %xmm3, %xmm2
1180 palignr $9, %xmm4, %xmm3
1181 palignr $9, %xmm5, %xmm4
1183 movaps %xmm1, -0x10(%rdi)
1186 movaps %xmm2, -0x20(%rdi)
1187 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi)
   and (%rdi) is the old -0x40(%rdi).  */
1189 movaps %xmm3, 0x10(%rdi)
1191 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1192 _CET_NOTRACK jmp *%r9
1195 movaps %xmm4, (%rdi)
1198 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1202 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1204 movaps -0x0a(%rsi), %xmm1
1206 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1209 _CET_NOTRACK jmp *%r9
1212 prefetchnta 0x1c0(%rsi)
1215 movaps 0x06(%rsi), %xmm2
1216 movaps 0x16(%rsi), %xmm3
1217 movaps 0x26(%rsi), %xmm4
1218 movaps 0x36(%rsi), %xmm5
1220 palignr $10, %xmm4, %xmm5
1222 palignr $10, %xmm3, %xmm4
1223 palignr $10, %xmm2, %xmm3
1225 palignr $10, %xmm1, %xmm2
1227 movdqa %xmm2, -0x40(%rdi)
1228 movaps %xmm3, -0x30(%rdi)
1230 movaps %xmm4, -0x20(%rdi)
1231 movaps %xmm5, -0x10(%rdi)
1232 _CET_NOTRACK jmp *%r9
1235 movaps %xmm4, -0x20(%rdi)
1237 movaps %xmm5, -0x10(%rdi)
1241 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1245 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1247 movaps -0x0a(%rsi), %xmm1
1249 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1252 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 10-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $10 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   10 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1254 L(shl_10_bwd_loop_L2):
1255 prefetchnta -0x1c0(%rsi)
1256 L(shl_10_bwd_loop_L1):
1257 movaps -0x1a(%rsi), %xmm2
1259 movaps -0x2a(%rsi), %xmm3
1260 movaps -0x3a(%rsi), %xmm4
1261 movaps -0x4a(%rsi), %xmm5
1262 lea -0x40(%rsi), %rsi
/* palignr $10, src, dst: dst = low 16 bytes of ((dst:src) >> 10 bytes).  */
1263 palignr $10, %xmm2, %xmm1
1264 palignr $10, %xmm3, %xmm2
1265 palignr $10, %xmm4, %xmm3
1266 palignr $10, %xmm5, %xmm4
1268 movaps %xmm1, -0x10(%rdi)
1271 movaps %xmm2, -0x20(%rdi)
1272 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1274 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1275 jb L(shl_10_bwd_end)
1276 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1277 _CET_NOTRACK jmp *%r9
1280 movaps %xmm4, (%rdi)
1283 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1287 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1289 movaps -0x0b(%rsi), %xmm1
1291 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1294 _CET_NOTRACK jmp *%r9
1297 prefetchnta 0x1c0(%rsi)
1300 movaps 0x05(%rsi), %xmm2
1301 movaps 0x15(%rsi), %xmm3
1302 movaps 0x25(%rsi), %xmm4
1303 movaps 0x35(%rsi), %xmm5
1305 palignr $11, %xmm4, %xmm5
1307 palignr $11, %xmm3, %xmm4
1308 palignr $11, %xmm2, %xmm3
1310 palignr $11, %xmm1, %xmm2
1312 movdqa %xmm2, -0x40(%rdi)
1313 movaps %xmm3, -0x30(%rdi)
1315 movaps %xmm4, -0x20(%rdi)
1316 movaps %xmm5, -0x10(%rdi)
1317 _CET_NOTRACK jmp *%r9
1320 movaps %xmm4, -0x20(%rdi)
1322 movaps %xmm5, -0x10(%rdi)
1326 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1330 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1332 movaps -0x0b(%rsi), %xmm1
1334 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
1337 _CET_NOTRACK jmp *%r9
/* Backward copy loop for an 11-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $11 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   11 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1339 L(shl_11_bwd_loop_L2):
1340 prefetchnta -0x1c0(%rsi)
1341 L(shl_11_bwd_loop_L1):
1342 movaps -0x1b(%rsi), %xmm2
1344 movaps -0x2b(%rsi), %xmm3
1345 movaps -0x3b(%rsi), %xmm4
1346 movaps -0x4b(%rsi), %xmm5
1347 lea -0x40(%rsi), %rsi
/* palignr $11, src, dst: dst = low 16 bytes of ((dst:src) >> 11 bytes).  */
1348 palignr $11, %xmm2, %xmm1
1349 palignr $11, %xmm3, %xmm2
1350 palignr $11, %xmm4, %xmm3
1351 palignr $11, %xmm5, %xmm4
1353 movaps %xmm1, -0x10(%rdi)
1356 movaps %xmm2, -0x20(%rdi)
1357 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1359 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1360 jb L(shl_11_bwd_end)
1361 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1362 _CET_NOTRACK jmp *%r9
1365 movaps %xmm4, (%rdi)
1368 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1372 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1374 movaps -0x0c(%rsi), %xmm1
1376 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1379 _CET_NOTRACK jmp *%r9
1382 prefetchnta 0x1c0(%rsi)
1385 movaps 0x04(%rsi), %xmm2
1386 movaps 0x14(%rsi), %xmm3
1387 movaps 0x24(%rsi), %xmm4
1388 movaps 0x34(%rsi), %xmm5
1390 palignr $12, %xmm4, %xmm5
1392 palignr $12, %xmm3, %xmm4
1393 palignr $12, %xmm2, %xmm3
1395 palignr $12, %xmm1, %xmm2
1397 movdqa %xmm2, -0x40(%rdi)
1398 movaps %xmm3, -0x30(%rdi)
1400 movaps %xmm4, -0x20(%rdi)
1401 movaps %xmm5, -0x10(%rdi)
1402 _CET_NOTRACK jmp *%r9
1405 movaps %xmm4, -0x20(%rdi)
1407 movaps %xmm5, -0x10(%rdi)
1411 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1415 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1417 movaps -0x0c(%rsi), %xmm1
1419 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
1422 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 12-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $12 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   12 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1424 L(shl_12_bwd_loop_L2):
1425 prefetchnta -0x1c0(%rsi)
1426 L(shl_12_bwd_loop_L1):
1427 movaps -0x1c(%rsi), %xmm2
1429 movaps -0x2c(%rsi), %xmm3
1430 movaps -0x3c(%rsi), %xmm4
1431 movaps -0x4c(%rsi), %xmm5
1432 lea -0x40(%rsi), %rsi
/* palignr $12, src, dst: dst = low 16 bytes of ((dst:src) >> 12 bytes).  */
1433 palignr $12, %xmm2, %xmm1
1434 palignr $12, %xmm3, %xmm2
1435 palignr $12, %xmm4, %xmm3
1436 palignr $12, %xmm5, %xmm4
1438 movaps %xmm1, -0x10(%rdi)
1441 movaps %xmm2, -0x20(%rdi)
1442 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1444 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1445 jb L(shl_12_bwd_end)
1446 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1447 _CET_NOTRACK jmp *%r9
1450 movaps %xmm4, (%rdi)
1453 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1457 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1459 movaps -0x0d(%rsi), %xmm1
1461 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1464 _CET_NOTRACK jmp *%r9
1467 prefetchnta 0x1c0(%rsi)
1470 movaps 0x03(%rsi), %xmm2
1471 movaps 0x13(%rsi), %xmm3
1472 movaps 0x23(%rsi), %xmm4
1473 movaps 0x33(%rsi), %xmm5
1475 palignr $13, %xmm4, %xmm5
1477 palignr $13, %xmm3, %xmm4
1478 palignr $13, %xmm2, %xmm3
1480 palignr $13, %xmm1, %xmm2
1482 movdqa %xmm2, -0x40(%rdi)
1483 movaps %xmm3, -0x30(%rdi)
1485 movaps %xmm4, -0x20(%rdi)
1486 movaps %xmm5, -0x10(%rdi)
1487 _CET_NOTRACK jmp *%r9
1490 movaps %xmm4, -0x20(%rdi)
1492 movaps %xmm5, -0x10(%rdi)
1496 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1500 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1502 movaps -0x0d(%rsi), %xmm1
1504 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
1507 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 13-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $13 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   13 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1509 L(shl_13_bwd_loop_L2):
1510 prefetchnta -0x1c0(%rsi)
1511 L(shl_13_bwd_loop_L1):
1512 movaps -0x1d(%rsi), %xmm2
1514 movaps -0x2d(%rsi), %xmm3
1515 movaps -0x3d(%rsi), %xmm4
1516 movaps -0x4d(%rsi), %xmm5
1517 lea -0x40(%rsi), %rsi
/* palignr $13, src, dst: dst = low 16 bytes of ((dst:src) >> 13 bytes).  */
1518 palignr $13, %xmm2, %xmm1
1519 palignr $13, %xmm3, %xmm2
1520 palignr $13, %xmm4, %xmm3
1521 palignr $13, %xmm5, %xmm4
1523 movaps %xmm1, -0x10(%rdi)
1526 movaps %xmm2, -0x20(%rdi)
1527 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1529 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1530 jb L(shl_13_bwd_end)
1531 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1532 _CET_NOTRACK jmp *%r9
1535 movaps %xmm4, (%rdi)
1538 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1542 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1544 movaps -0x0e(%rsi), %xmm1
1546 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1549 _CET_NOTRACK jmp *%r9
1552 prefetchnta 0x1c0(%rsi)
1555 movaps 0x02(%rsi), %xmm2
1556 movaps 0x12(%rsi), %xmm3
1557 movaps 0x22(%rsi), %xmm4
1558 movaps 0x32(%rsi), %xmm5
1560 palignr $14, %xmm4, %xmm5
1562 palignr $14, %xmm3, %xmm4
1563 palignr $14, %xmm2, %xmm3
1565 palignr $14, %xmm1, %xmm2
1567 movdqa %xmm2, -0x40(%rdi)
1568 movaps %xmm3, -0x30(%rdi)
1570 movaps %xmm4, -0x20(%rdi)
1571 movaps %xmm5, -0x10(%rdi)
1572 _CET_NOTRACK jmp *%r9
1575 movaps %xmm4, -0x20(%rdi)
1577 movaps %xmm5, -0x10(%rdi)
1581 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1585 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1587 movaps -0x0e(%rsi), %xmm1
1589 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
1592 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 14-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $14 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   14 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1594 L(shl_14_bwd_loop_L2):
1595 prefetchnta -0x1c0(%rsi)
1596 L(shl_14_bwd_loop_L1):
1597 movaps -0x1e(%rsi), %xmm2
1599 movaps -0x2e(%rsi), %xmm3
1600 movaps -0x3e(%rsi), %xmm4
1601 movaps -0x4e(%rsi), %xmm5
1602 lea -0x40(%rsi), %rsi
/* palignr $14, src, dst: dst = low 16 bytes of ((dst:src) >> 14 bytes).  */
1603 palignr $14, %xmm2, %xmm1
1604 palignr $14, %xmm3, %xmm2
1605 palignr $14, %xmm4, %xmm3
1606 palignr $14, %xmm5, %xmm4
1608 movaps %xmm1, -0x10(%rdi)
1611 movaps %xmm2, -0x20(%rdi)
1612 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1614 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1615 jb L(shl_14_bwd_end)
1616 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1617 _CET_NOTRACK jmp *%r9
1620 movaps %xmm4, (%rdi)
1623 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1627 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1629 movaps -0x0f(%rsi), %xmm1
1631 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1634 _CET_NOTRACK jmp *%r9
1637 prefetchnta 0x1c0(%rsi)
1640 movaps 0x01(%rsi), %xmm2
1641 movaps 0x11(%rsi), %xmm3
1642 movaps 0x21(%rsi), %xmm4
1643 movaps 0x31(%rsi), %xmm5
1645 palignr $15, %xmm4, %xmm5
1647 palignr $15, %xmm3, %xmm4
1648 palignr $15, %xmm2, %xmm3
1650 palignr $15, %xmm1, %xmm2
1652 movdqa %xmm2, -0x40(%rdi)
1653 movaps %xmm3, -0x30(%rdi)
1655 movaps %xmm4, -0x20(%rdi)
1656 movaps %xmm5, -0x10(%rdi)
1657 _CET_NOTRACK jmp *%r9
1660 movaps %xmm4, -0x20(%rdi)
1662 movaps %xmm5, -0x10(%rdi)
1666 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1670 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1672 movaps -0x0f(%rsi), %xmm1
1674 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
1677 _CET_NOTRACK jmp *%r9
/* Backward copy loop for a 15-byte relative shift.  Aligned 16-byte loads
   are recombined with palignr $15 so that aligned stores can be used.
   NOTE(review): movaps needs aligned addresses, so %rsi is presumably
   15 bytes above a 16-byte boundary here -- confirm against the setup code.
   The L2 entry adds a non-temporal prefetch and falls through to L1.  */
1679 L(shl_15_bwd_loop_L2):
1680 prefetchnta -0x1c0(%rsi)
1681 L(shl_15_bwd_loop_L1):
1682 movaps -0x1f(%rsi), %xmm2
1684 movaps -0x2f(%rsi), %xmm3
1685 movaps -0x3f(%rsi), %xmm4
1686 movaps -0x4f(%rsi), %xmm5
1687 lea -0x40(%rsi), %rsi
/* palignr $15, src, dst: dst = low 16 bytes of ((dst:src) >> 15 bytes).  */
1688 palignr $15, %xmm2, %xmm1
1689 palignr $15, %xmm3, %xmm2
1690 palignr $15, %xmm4, %xmm3
1691 palignr $15, %xmm5, %xmm4
1693 movaps %xmm1, -0x10(%rdi)
1696 movaps %xmm2, -0x20(%rdi)
1697 lea -0x40(%rdi), %rdi
/* %rdi was just lowered by 0x40, so 0x10(%rdi) is the old -0x30(%rdi).  */
1699 movaps %xmm3, 0x10(%rdi)
/* Leave the loop when the carry flag is set (the moves and lea above do
   not modify flags).  */
1700 jb L(shl_15_bwd_end)
1701 movaps %xmm4, (%rdi)
/* Indirect jump through %r9 to the selected loop entry.  */
1702 _CET_NOTRACK jmp *%r9
1705 movaps %xmm4, (%rdi)
1708 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1712 movdqu -72(%rsi), %xmm0
1713 movdqu -56(%rsi), %xmm1
1719 movdqu %xmm0, -72(%rdi)
1720 movdqu %xmm1, -56(%rdi)
1730 movdqu -64(%rsi), %xmm0
1737 movdqu %xmm0, -64(%rdi)
1748 movdqu -56(%rsi), %xmm0
1754 movdqu %xmm0, -56(%rdi)
1831 movdqu -73(%rsi), %xmm0
1832 movdqu -57(%rsi), %xmm1
1839 movdqu %xmm0, -73(%rdi)
1840 movdqu %xmm1, -57(%rdi)
1851 movdqu -65(%rsi), %xmm0
1852 movdqu -49(%rsi), %xmm1
1858 movdqu %xmm0, -65(%rdi)
1859 movdqu %xmm1, -49(%rdi)
1869 movdqu -57(%rsi), %xmm0
1876 movdqu %xmm0, -57(%rdi)
1887 movdqu -49(%rsi), %xmm0
1893 movdqu %xmm0, -49(%rdi)
1969 movdqu -74(%rsi), %xmm0
1970 movdqu -58(%rsi), %xmm1
1977 movdqu %xmm0, -74(%rdi)
1978 movdqu %xmm1, -58(%rdi)
1989 movdqu -66(%rsi), %xmm0
1990 movdqu -50(%rsi), %xmm1
1997 movdqu %xmm0, -66(%rdi)
1998 movdqu %xmm1, -50(%rdi)
2009 movdqu -58(%rsi), %xmm1
2016 movdqu %xmm1, -58(%rdi)
2027 movdqu -50(%rsi), %xmm0
2033 movdqu %xmm0, -50(%rdi)
2109 movdqu -75(%rsi), %xmm0
2110 movdqu -59(%rsi), %xmm1
2117 movdqu %xmm0, -75(%rdi)
2118 movdqu %xmm1, -59(%rdi)
2129 movdqu -67(%rsi), %xmm0
2130 movdqu -59(%rsi), %xmm1
2137 movdqu %xmm0, -67(%rdi)
2138 movdqu %xmm1, -59(%rdi)
2149 movdqu -59(%rsi), %xmm0
2156 movdqu %xmm0, -59(%rdi)
2167 movdqu -51(%rsi), %xmm0
2173 movdqu %xmm0, -51(%rdi)
2251 movdqu -76(%rsi), %xmm0
2252 movdqu -60(%rsi), %xmm1
2259 movdqu %xmm0, -76(%rdi)
2260 movdqu %xmm1, -60(%rdi)
2271 movdqu -68(%rsi), %xmm0
2272 movdqu -52(%rsi), %xmm1
2278 movdqu %xmm0, -68(%rdi)
2279 movdqu %xmm1, -52(%rdi)
2289 movdqu -60(%rsi), %xmm0
2296 movdqu %xmm0, -60(%rdi)
2307 movdqu -52(%rsi), %xmm0
2313 movdqu %xmm0, -52(%rdi)
2389 movdqu -77(%rsi), %xmm0
2390 movdqu -61(%rsi), %xmm1
2397 movdqu %xmm0, -77(%rdi)
2398 movdqu %xmm1, -61(%rdi)
2409 movdqu -69(%rsi), %xmm0
2410 movdqu -53(%rsi), %xmm1
2416 movdqu %xmm0, -69(%rdi)
2417 movdqu %xmm1, -53(%rdi)
2427 movdqu -61(%rsi), %xmm0
2434 movdqu %xmm0, -61(%rdi)
2445 movdqu -53(%rsi), %xmm0
2452 movdqu %xmm0, -53(%rdi)
2530 movdqu -78(%rsi), %xmm0
2531 movdqu -62(%rsi), %xmm1
2538 movdqu %xmm0, -78(%rdi)
2539 movdqu %xmm1, -62(%rdi)
2550 movdqu -70(%rsi), %xmm0
2551 movdqu -54(%rsi), %xmm1
2557 movdqu %xmm0, -70(%rdi)
2558 movdqu %xmm1, -54(%rdi)
2568 movdqu -62(%rsi), %xmm0
2575 movdqu %xmm0, -62(%rdi)
2586 movdqu -54(%rsi), %xmm0
2592 movdqu %xmm0, -54(%rdi)
2670 movdqu -79(%rsi), %xmm0
2671 movdqu -63(%rsi), %xmm1
2678 movdqu %xmm0, -79(%rdi)
2679 movdqu %xmm1, -63(%rdi)
2690 movdqu -71(%rsi), %xmm0
2691 movdqu -55(%rsi), %xmm1
2697 movdqu %xmm0, -71(%rdi)
2698 movdqu %xmm1, -55(%rdi)
2708 movdqu -63(%rsi), %xmm0
2715 movdqu %xmm0, -63(%rdi)
2726 movdqu -55(%rsi), %xmm0
2732 movdqu %xmm0, -55(%rdi)
/* Forward bulk copy that writes the destination with non-temporal
   (movntdq) stores: one leading 16-byte chunk, a loop moving 128 bytes
   per iteration, an optional 64-byte step, then a jump through
   L(table_less_80bytes) indexed by %rdx.
   NOTE(review): this excerpt omits lines of the routine (its entry
   label, L(large_page_loop) itself, the instructions that set the flags
   tested by jae/jb/jl, and the #endif closing the #ifdef below are not
   visible), so the comments describe only the visible instructions.  */
/* Leading 16 bytes: unaligned load, non-temporal store.  movntdq faults
   on a destination that is not 16-byte aligned, so %rdi is presumably
   aligned before this point -- TODO confirm.  */
2810 movdqu (%rsi), %xmm1
2813 movntdq %xmm1, (%rdi)
/* rdx -= 0x90; lea leaves the flags untouched.  */
2815 lea -0x90(%rdx), %rdx
2816 #ifdef USE_AS_MEMMOVE
/* memmove build only: continue with the streaming loop below or switch
   to the cached-store loop at L(ll_cache_copy_fwd_start).  The
   comparisons feeding these two branches are not visible here.  */
2820 jae L(memmove_is_memcpy_fwd)
2823 jb L(ll_cache_copy_fwd_start)
2824 L(memmove_is_memcpy_fwd):
/* Load 128 source bytes into %xmm0-%xmm7 (unaligned loads), then
   advance %rsi by 0x80.  */
2827 movdqu (%rsi), %xmm0
2828 movdqu 0x10(%rsi), %xmm1
2829 movdqu 0x20(%rsi), %xmm2
2830 movdqu 0x30(%rsi), %xmm3
2831 movdqu 0x40(%rsi), %xmm4
2832 movdqu 0x50(%rsi), %xmm5
2833 movdqu 0x60(%rsi), %xmm6
2834 movdqu 0x70(%rsi), %xmm7
2835 lea 0x80(%rsi), %rsi
/* Stream the 128 bytes to the destination with non-temporal stores,
   then advance %rdi by 0x80.  */
2838 movntdq %xmm0, (%rdi)
2839 movntdq %xmm1, 0x10(%rdi)
2840 movntdq %xmm2, 0x20(%rdi)
2841 movntdq %xmm3, 0x30(%rdi)
2842 movntdq %xmm4, 0x40(%rdi)
2843 movntdq %xmm5, 0x50(%rdi)
2844 movntdq %xmm6, 0x60(%rdi)
2845 movntdq %xmm7, 0x70(%rdi)
2846 lea 0x80(%rdi), %rdi
/* Loop while the carry flag is clear.  None of the visible instructions
   above modify the flags, so they come from an instruction not shown in
   this excerpt.  */
2847 jae L(large_page_loop)
/* rdx += 0x80 (flags still untouched by lea); jl skips the 64-byte step
   based on a signed comparison that is not visible here.  */
2849 lea 0x80(%rdx), %rdx
2850 jl L(large_page_less_64bytes)
/* Optional 64-byte step: load into %xmm0-%xmm3, advance %rsi by 0x40.  */
2852 movdqu (%rsi), %xmm0
2853 movdqu 0x10(%rsi), %xmm1
2854 movdqu 0x20(%rsi), %xmm2
2855 movdqu 0x30(%rsi), %xmm3
2856 lea 0x40(%rsi), %rsi
/* Non-temporal stores of the 64 bytes, advance %rdi by 0x40.  */
2858 movntdq %xmm0, (%rdi)
2859 movntdq %xmm1, 0x10(%rdi)
2860 movntdq %xmm2, 0x20(%rdi)
2861 movntdq %xmm3, 0x30(%rdi)
2862 lea 0x40(%rdi), %rdi
2864 L(large_page_less_64bytes):
/* Dispatch on %rdx through the relative-offset table; the macro
   clobbers %r11 and overwrites %rdx with the target address.
   NOTE(review): no sfence is visible between the weakly-ordered movntdq
   stores above and this branch -- confirm the full file fences here.  */
2868 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2870 #ifdef USE_AS_MEMMOVE
/* memmove build only: forward bulk copy that uses ordinary aligned
   stores (movaps) instead of non-temporal ones, with software prefetch
   of the source.  Moves 128 bytes per iteration, then an optional
   64-byte step, then jumps through L(table_less_80bytes) indexed by
   %rdx.
   NOTE(review): this excerpt omits lines (the instructions that set the
   flags tested by jae/jl and the matching #endif are not visible), so
   the comments describe only the visible instructions.  */
2872 L(ll_cache_copy_fwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes ahead of %rsi into all
   cache levels.  */
2873 prefetcht0 0x1c0(%rsi)
2874 prefetcht0 0x200(%rsi)
/* Load 128 source bytes into %xmm0-%xmm7 (unaligned loads), then
   advance %rsi by 0x80.  */
2875 movdqu (%rsi), %xmm0
2876 movdqu 0x10(%rsi), %xmm1
2877 movdqu 0x20(%rsi), %xmm2
2878 movdqu 0x30(%rsi), %xmm3
2879 movdqu 0x40(%rsi), %xmm4
2880 movdqu 0x50(%rsi), %xmm5
2881 movdqu 0x60(%rsi), %xmm6
2882 movdqu 0x70(%rsi), %xmm7
2883 lea 0x80(%rsi), %rsi
/* Aligned stores: movaps faults unless %rdi is 16-byte aligned, so the
   destination is presumably aligned before this loop -- TODO confirm.
   Then advance %rdi by 0x80.  */
2886 movaps %xmm0, (%rdi)
2887 movaps %xmm1, 0x10(%rdi)
2888 movaps %xmm2, 0x20(%rdi)
2889 movaps %xmm3, 0x30(%rdi)
2890 movaps %xmm4, 0x40(%rdi)
2891 movaps %xmm5, 0x50(%rdi)
2892 movaps %xmm6, 0x60(%rdi)
2893 movaps %xmm7, 0x70(%rdi)
2894 lea 0x80(%rdi), %rdi
/* Loop while the carry flag is clear; the flag-setting instruction is
   not visible in this excerpt (lea does not modify flags).  */
2895 jae L(ll_cache_copy_fwd_start)
/* rdx += 0x80 without touching flags; jl skips the 64-byte step based
   on a signed comparison that is not visible here.  */
2897 lea 0x80(%rdx), %rdx
2898 jl L(large_page_ll_less_fwd_64bytes)
/* Optional 64-byte step: load into %xmm0-%xmm3, advance %rsi by 0x40.  */
2900 movdqu (%rsi), %xmm0
2901 movdqu 0x10(%rsi), %xmm1
2902 movdqu 0x20(%rsi), %xmm2
2903 movdqu 0x30(%rsi), %xmm3
2904 lea 0x40(%rsi), %rsi
/* Aligned stores of the 64 bytes, advance %rdi by 0x40.  */
2906 movaps %xmm0, (%rdi)
2907 movaps %xmm1, 0x10(%rdi)
2908 movaps %xmm2, 0x20(%rdi)
2909 movaps %xmm3, 0x30(%rdi)
2910 lea 0x40(%rdi), %rdi
2912 L(large_page_ll_less_fwd_64bytes):
/* Dispatch on %rdx through the relative-offset table; the macro
   clobbers %r11 and overwrites %rdx with the target address.  */
2915 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward bulk copy (addresses decrease) that writes the destination
   with non-temporal (movntdq) stores: one leading 16-byte chunk just
   below %rsi/%rdi, a loop moving 128 bytes per iteration, an optional
   64-byte step, then a jump through L(table_less_80bytes) indexed by
   %rdx.
   NOTE(review): this excerpt omits lines of the routine (its entry
   label, the instructions that set the flags tested by jae/jb/jl, and
   the #endif closing the #ifdef below are not visible), so the comments
   describe only the visible instructions.  */
/* Leading 16 bytes at -0x10: unaligned load, aligned ordinary store.
   movdqa faults unless the destination address is 16-byte aligned, so
   %rdi is presumably aligned before this point -- TODO confirm.  Note
   that the forward counterpart uses movntdq for this store.  */
2920 movdqu -0x10(%rsi), %xmm1
2923 movdqa %xmm1, -0x10(%rdi)
/* rdx -= 0x90; lea leaves the flags untouched.  */
2925 lea -0x90(%rdx), %rdx
2926 #ifdef USE_AS_MEMMOVE
/* memmove build only: continue with the streaming loop below or switch
   to the cached-store loop at L(ll_cache_copy_bwd_start).  The
   comparisons feeding these two branches are not visible here.  */
2930 jae L(memmove_is_memcpy_bwd)
2932 jb L(ll_cache_copy_bwd_start)
2933 L(memmove_is_memcpy_bwd):
2935 L(large_page_bwd_loop):
/* Load the 128 bytes below %rsi into %xmm0-%xmm7 (unaligned loads,
   highest chunk first), then move %rsi down by 0x80.  */
2936 movdqu -0x10(%rsi), %xmm0
2937 movdqu -0x20(%rsi), %xmm1
2938 movdqu -0x30(%rsi), %xmm2
2939 movdqu -0x40(%rsi), %xmm3
2940 movdqu -0x50(%rsi), %xmm4
2941 movdqu -0x60(%rsi), %xmm5
2942 movdqu -0x70(%rsi), %xmm6
2943 movdqu -0x80(%rsi), %xmm7
2944 lea -0x80(%rsi), %rsi
/* Stream the 128 bytes to the area below %rdi with non-temporal stores,
   then move %rdi down by 0x80.  */
2947 movntdq %xmm0, -0x10(%rdi)
2948 movntdq %xmm1, -0x20(%rdi)
2949 movntdq %xmm2, -0x30(%rdi)
2950 movntdq %xmm3, -0x40(%rdi)
2951 movntdq %xmm4, -0x50(%rdi)
2952 movntdq %xmm5, -0x60(%rdi)
2953 movntdq %xmm6, -0x70(%rdi)
2954 movntdq %xmm7, -0x80(%rdi)
2955 lea -0x80(%rdi), %rdi
/* Loop while the carry flag is clear; the flag-setting instruction is
   not visible in this excerpt (lea does not modify flags).  */
2956 jae L(large_page_bwd_loop)
/* rdx += 0x80 without touching flags; jl skips the 64-byte step based
   on a signed comparison that is not visible here.  */
2958 lea 0x80(%rdx), %rdx
2959 jl L(large_page_less_bwd_64bytes)
/* Optional 64-byte step: load the 64 bytes below %rsi into
   %xmm0-%xmm3, move %rsi down by 0x40.  */
2961 movdqu -0x10(%rsi), %xmm0
2962 movdqu -0x20(%rsi), %xmm1
2963 movdqu -0x30(%rsi), %xmm2
2964 movdqu -0x40(%rsi), %xmm3
2965 lea -0x40(%rsi), %rsi
/* Non-temporal stores of the 64 bytes, move %rdi down by 0x40.  */
2967 movntdq %xmm0, -0x10(%rdi)
2968 movntdq %xmm1, -0x20(%rdi)
2969 movntdq %xmm2, -0x30(%rdi)
2970 movntdq %xmm3, -0x40(%rdi)
2971 lea -0x40(%rdi), %rdi
2973 L(large_page_less_bwd_64bytes):
/* Dispatch on %rdx through the relative-offset table; the macro
   clobbers %r11 and overwrites %rdx with the target address.
   NOTE(review): no sfence is visible between the weakly-ordered movntdq
   stores above and this branch -- confirm the full file fences here.  */
2975 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2977 #ifdef USE_AS_MEMMOVE
/* memmove build only: backward bulk copy (addresses decrease) that uses
   ordinary aligned stores (movaps) instead of non-temporal ones, with
   software prefetch of the source.  Moves 128 bytes per iteration, then
   an optional 64-byte step, then jumps through L(table_less_80bytes)
   indexed by %rdx.
   NOTE(review): this excerpt omits lines (the instructions that set the
   flags tested by jae/jl and the matching #endif are not visible), so
   the comments describe only the visible instructions.  */
2979 L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below %rsi into all cache
   levels.  */
2980 prefetcht0 -0x1c0(%rsi)
2981 prefetcht0 -0x200(%rsi)
/* Load the 128 bytes below %rsi into %xmm0-%xmm7 (unaligned loads,
   highest chunk first), then move %rsi down by 0x80.  */
2982 movdqu -0x10(%rsi), %xmm0
2983 movdqu -0x20(%rsi), %xmm1
2984 movdqu -0x30(%rsi), %xmm2
2985 movdqu -0x40(%rsi), %xmm3
2986 movdqu -0x50(%rsi), %xmm4
2987 movdqu -0x60(%rsi), %xmm5
2988 movdqu -0x70(%rsi), %xmm6
2989 movdqu -0x80(%rsi), %xmm7
2990 lea -0x80(%rsi), %rsi
/* Aligned stores: movaps faults unless the address is 16-byte aligned,
   so %rdi is presumably aligned before this loop -- TODO confirm.  Then
   move %rdi down by 0x80.  */
2993 movaps %xmm0, -0x10(%rdi)
2994 movaps %xmm1, -0x20(%rdi)
2995 movaps %xmm2, -0x30(%rdi)
2996 movaps %xmm3, -0x40(%rdi)
2997 movaps %xmm4, -0x50(%rdi)
2998 movaps %xmm5, -0x60(%rdi)
2999 movaps %xmm6, -0x70(%rdi)
3000 movaps %xmm7, -0x80(%rdi)
3001 lea -0x80(%rdi), %rdi
/* Loop while the carry flag is clear; the flag-setting instruction is
   not visible in this excerpt (lea does not modify flags).  */
3002 jae L(ll_cache_copy_bwd_start)
/* rdx += 0x80 without touching flags; jl skips the 64-byte step based
   on a signed comparison that is not visible here.  */
3004 lea 0x80(%rdx), %rdx
3005 jl L(large_page_ll_less_bwd_64bytes)
/* Optional 64-byte step: load the 64 bytes below %rsi into
   %xmm0-%xmm3, move %rsi down by 0x40.  */
3007 movdqu -0x10(%rsi), %xmm0
3008 movdqu -0x20(%rsi), %xmm1
3009 movdqu -0x30(%rsi), %xmm2
3010 movdqu -0x40(%rsi), %xmm3
3011 lea -0x40(%rsi), %rsi
/* Aligned stores of the 64 bytes, move %rdi down by 0x40.  */
3013 movaps %xmm0, -0x10(%rdi)
3014 movaps %xmm1, -0x20(%rdi)
3015 movaps %xmm2, -0x30(%rdi)
3016 movaps %xmm3, -0x40(%rdi)
3017 lea -0x40(%rdi), %rdi
3019 L(large_page_ll_less_bwd_64bytes):
/* Dispatch on %rdx through the relative-offset table; the macro
   clobbers %r11 and overwrites %rdx with the target address.  */
3020 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Read-only jump table with 80 entries, one per L(write_Nbytes) target
   for N = 0..79.  Each entry is a 32-bit value (.int) holding
   L(write_Nbytes) - L(table_less_80bytes), as produced by JMPTBL.
   BRANCH_TO_JMPTBL_ENTRY reads the entry selected by its index register
   (scale 4), sign-extends it, adds it back to the table address and
   jumps there.  Entry order matters: the table is indexed, not
   searched, so entry N must stay at position N.  */
3025 .section .rodata.ssse3,"a",@progbits
3027 L(table_less_80bytes):
3028 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3029 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3030 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3031 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
/* Sixteen jump-table entries, one per L(shl_N) target for N = 0..15.
   Each is a 32-bit offset L(shl_N) - L(shl_table), for use with
   BRANCH_TO_JMPTBL_ENTRY (L(shl_table), ...) and a scale of 4.  Entry N
   must stay at position N because the table is indexed.
   NOTE(review): the L(shl_table) label itself is not visible in this
   excerpt; presumably it sits immediately before the first entry --
   confirm against the full file.  */
3111 .int JMPTBL (L(shl_0), L(shl_table))
3112 .int JMPTBL (L(shl_1), L(shl_table))
3113 .int JMPTBL (L(shl_2), L(shl_table))
3114 .int JMPTBL (L(shl_3), L(shl_table))
3115 .int JMPTBL (L(shl_4), L(shl_table))
3116 .int JMPTBL (L(shl_5), L(shl_table))
3117 .int JMPTBL (L(shl_6), L(shl_table))
3118 .int JMPTBL (L(shl_7), L(shl_table))
3119 .int JMPTBL (L(shl_8), L(shl_table))
3120 .int JMPTBL (L(shl_9), L(shl_table))
3121 .int JMPTBL (L(shl_10), L(shl_table))
3122 .int JMPTBL (L(shl_11), L(shl_table))
3123 .int JMPTBL (L(shl_12), L(shl_table))
3124 .int JMPTBL (L(shl_13), L(shl_table))
3125 .int JMPTBL (L(shl_14), L(shl_table))
3126 .int JMPTBL (L(shl_15), L(shl_table))
/* Sixteen jump-table entries, one per L(shl_N_bwd) target for
   N = 0..15.  Each is a 32-bit offset L(shl_N_bwd) - L(shl_table_bwd),
   in the relative-offset format that BRANCH_TO_JMPTBL_ENTRY consumes
   with a scale of 4.  Entry N must stay at position N because the table
   is indexed.
   NOTE(review): the L(shl_table_bwd) label itself is not visible in
   this excerpt; presumably it sits immediately before the first entry
   -- confirm against the full file.  */
3130 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3131 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3132 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3133 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3134 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3135 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))