2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
24 #include "asm-syntax.h"
/* Map the generic entry-point macros to their SSSE3-specific symbol
   names.  The indented "# define" form suggests an enclosing
   conditional that is not visible in this excerpt -- TODO confirm.  */
27 # define MEMCPY __memcpy_ssse3
28 # define MEMCPY_CHK __memcpy_chk_ssse3
29 # define MEMPCPY __mempcpy_ssse3
30 # define MEMPCPY_CHK __mempcpy_chk_ssse3
/* Expands to I - B, i.e. the distance from B to I.  Going by its name
   it builds jump-table entries relative to the table base B; the
   tables themselves are not visible in this excerpt.  */
33 #define JMPTBL(I, B) I - B
35 /* Branch to an entry in a jump table. TABLE is a jump table with
36 relative offsets. INDEX is a register that contains the index into the
37 jump table. SCALE is the scale of INDEX. */
/* Clobbers %r11 (loaded with the address of TABLE) and INDEX (the
   32-bit entry is sign-extended into it and the table address is then
   added, so it ends up holding the absolute branch target).
   NOTE(review): the last visible line ends with a continuation
   backslash, so the macro body continues on a line not shown here.  */
38 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
39 lea TABLE(%rip), %r11; \
40 movslq (%r11, INDEX, SCALE), INDEX; \
41 lea (%r11, INDEX), INDEX; \
42 _CET_NOTRACK jmp *INDEX; \
/* Everything that follows is emitted into the .text.ssse3 section
   ("ax": allocatable and executable).  */
45 .section .text.ssse3,"ax",@progbits
/* Built only when neither USE_AS_MEMPCPY nor USE_AS_MEMMOVE is
   defined: a "below" result branches to __chk_fail.  The entry label
   and the compare that sets the flags are not visible here.  */
46 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
49 jb HIDDEN_JUMPTARGET (__chk_fail)
/* The same failure branch again, this time guarded by !USE_AS_BCOPY.  */
59 #if !defined USE_AS_BCOPY
62 jb HIDDEN_JUMPTARGET (__chk_fail)
73 /* Clear the upper 32 bits. */
/* %r11 = address of L(table_less_80bytes); %r9 = the sign-extended
   32-bit entry at index %rdx of that table (entries are 4 bytes
   apart).  The jump that consumes %r9 is not visible here.  */
88 lea L(table_less_80bytes)(%rip), %r11
90 movslq (%r11, %rdx, 4), %r9
/* NOTE(review): the instructions guarded by this #ifndef are not
   visible in this excerpt.  */
99 #ifndef USE_AS_MEMMOVE
/* %RCX_LP = half the shared cache size: the build-time constant
   SHARED_CACHE_SIZE_HALF when it is defined; the second mov reads the
   variable __x86_shared_cache_size_half and is presumably the #else
   arm (the #else/#endif lines are not visible).  */
113 #ifdef SHARED_CACHE_SIZE_HALF
114 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
116 mov __x86_shared_cache_size_half(%rip), %RCX_LP
/* %RCX_LP is then reloaded with half the data cache size, selected the
   same way.  Whatever consumes the shared-cache value in between is
   not visible here.  */
123 #ifdef DATA_CACHE_SIZE_HALF
124 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
126 mov __x86_data_cache_size_half(%rip), %RCX_LP
/* Dispatch through L(shl_table), indexed by %r9 with 4-byte entries.
   %r9 leaves the macro holding the address branched to.  */
128 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
/* Set-up for the backward dispatch (label not visible in this
   excerpt).  %xmm0 = the 16 bytes at %rsi + %rdx - 16, read with an
   unaligned load; %r8 = %rdi + %rdx - 16.  If %rsi/%rdi/%rdx still
   hold source, destination and length here, these are the last 16
   source bytes and the matching destination address -- TODO confirm
   against the lines not shown.  */
132 movdqu -16(%rsi, %rdx), %xmm0
134 lea -16(%rdi, %rdx), %r8
/* %RCX_LP = half the shared cache size (constant if
   SHARED_CACHE_SIZE_HALF is defined; the second mov is presumably the
   #else arm, whose #else/#endif lines are not visible).  */
143 #ifdef SHARED_CACHE_SIZE_HALF
144 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
146 mov __x86_shared_cache_size_half(%rip), %RCX_LP
/* %RCX_LP is then reloaded with half the data cache size, selected the
   same way.  */
154 #ifdef DATA_CACHE_SIZE_HALF
155 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
157 mov __x86_data_cache_size_half(%rip), %RCX_LP
/* Dispatch through L(shl_table_bwd), indexed by %r9 with 4-byte
   entries.  */
159 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
/* Forward path using aligned moves only (entry label, compares and
   pointer updates are not visible in this excerpt).  On "below" the
   64-byte step is skipped.  */
172 jb L(shl_0_less_64bytes)
/* 64-byte step: the moves for offsets 16, 32 and 48 are shown; the
   offset-0 move is presumably on a line not visible.  movaps requires
   16-byte aligned addresses.  */
174 movaps 16(%rsi), %xmm1
175 movaps 32(%rsi), %xmm2
176 movaps 48(%rsi), %xmm3
178 movaps %xmm1, 16(%rdi)
179 movaps %xmm2, 32(%rdi)
180 movaps %xmm3, 48(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
184 L(shl_0_less_64bytes):
187 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Compare %RDX_LP with half the data cache size (constant if
   DATA_CACHE_SIZE_HALF is defined; the second cmp is presumably the
   #else arm, whose #else/#endif lines are not visible).  The jae below
   presumably consumes this compare -- intervening lines are not shown
   -- and selects L(shl_0_gobble_mem_loop) on "above or equal".  */
191 #ifdef DATA_CACHE_SIZE_HALF
192 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
194 cmp __x86_data_cache_size_half(%rip), %RDX_LP
197 jae L(shl_0_gobble_mem_loop)
/* Aligned copy loop without prefetch.  Moves for offsets 0x10..0x70
   are shown; the offset-0 move, the pointer updates and the counter
   update that feeds the jae are on lines not visible here.  */
198 L(shl_0_gobble_cache_loop):
200 movaps 0x10(%rsi), %xmm1
201 movaps 0x20(%rsi), %xmm2
202 movaps 0x30(%rsi), %xmm3
205 movaps %xmm1, 0x10(%rdi)
206 movaps %xmm2, 0x20(%rdi)
207 movaps %xmm3, 0x30(%rdi)
210 movaps 0x40(%rsi), %xmm4
211 movaps 0x50(%rsi), %xmm5
212 movaps 0x60(%rsi), %xmm6
213 movaps 0x70(%rsi), %xmm7
215 movaps %xmm4, 0x40(%rdi)
216 movaps %xmm5, 0x50(%rdi)
217 movaps %xmm6, 0x60(%rdi)
218 movaps %xmm7, 0x70(%rdi)
221 jae L(shl_0_gobble_cache_loop)
/* After the loop: skip the 64-byte step on a signed "less" result
   (the compare is not visible).  */
224 jl L(shl_0_cache_less_64bytes)
/* 64-byte step with aligned movdqa (offsets 0x10..0x30 shown).  */
228 movdqa 0x10(%rsi), %xmm1
231 movdqa %xmm1, 0x10(%rdi)
233 movdqa 0x20(%rsi), %xmm4
234 movdqa 0x30(%rsi), %xmm1
237 movdqa %xmm4, 0x20(%rdi)
238 movdqa %xmm1, 0x30(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
240 L(shl_0_cache_less_64bytes):
243 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Aligned forward copy loop with software prefetch: each pass
   prefetches 0x1c0 and 0x280 bytes ahead of %rsi into all cache levels
   (prefetcht0) and moves with movdqa.  Offsets 0x10..0x70 are shown;
   the offset-0 move, pointer updates and the counter update feeding
   the jae are on lines not visible here.  */
246 L(shl_0_gobble_mem_loop):
247 prefetcht0 0x1c0(%rsi)
248 prefetcht0 0x280(%rsi)
251 movdqa 0x10(%rsi), %xmm1
252 movdqa 0x20(%rsi), %xmm2
253 movdqa 0x30(%rsi), %xmm3
254 movdqa 0x40(%rsi), %xmm4
255 movdqa 0x50(%rsi), %xmm5
256 movdqa 0x60(%rsi), %xmm6
257 movdqa 0x70(%rsi), %xmm7
261 movdqa %xmm1, 0x10(%rdi)
262 movdqa %xmm2, 0x20(%rdi)
263 movdqa %xmm3, 0x30(%rdi)
264 movdqa %xmm4, 0x40(%rdi)
265 movdqa %xmm5, 0x50(%rdi)
266 movdqa %xmm6, 0x60(%rdi)
267 movdqa %xmm7, 0x70(%rdi)
270 jae L(shl_0_gobble_mem_loop)
/* After the loop: skip the 64-byte step on a signed "less" result
   (the compare is not visible).  */
273 jl L(shl_0_mem_less_64bytes)
/* 64-byte step (offsets 0x10..0x30 shown).  */
277 movdqa 0x10(%rsi), %xmm1
280 movdqa %xmm1, 0x10(%rdi)
282 movdqa 0x20(%rsi), %xmm0
283 movdqa 0x30(%rsi), %xmm1
286 movdqa %xmm0, 0x20(%rdi)
287 movdqa %xmm1, 0x30(%rdi)
/* Optional 32-byte step, skipped on "below" (compare not visible).  */
289 L(shl_0_mem_less_64bytes):
291 jb L(shl_0_mem_less_32bytes)
294 movdqa 0x10(%rsi), %xmm1
297 movdqa %xmm1, 0x10(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
299 L(shl_0_mem_less_32bytes):
302 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward path using aligned moves only (entry label, compares and
   pointer updates are not visible in this excerpt).  Starts by moving
   the 16 bytes just below %rsi to just below %rdi.  */
307 movdqa -0x10(%rsi), %xmm1
309 movdqa %xmm1, -0x10(%rdi)
/* "Above" selects the bulk loop; "below" skips the 64-byte step.  The
   compares feeding both branches are not visible.  */
313 ja L(shl_0_gobble_bwd)
315 jb L(shl_0_less_64bytes_bwd)
/* 64-byte step: four aligned vectors at descending addresses.  */
316 movaps -0x10(%rsi), %xmm0
317 movaps -0x20(%rsi), %xmm1
318 movaps -0x30(%rsi), %xmm2
319 movaps -0x40(%rsi), %xmm3
320 movaps %xmm0, -0x10(%rdi)
321 movaps %xmm1, -0x20(%rdi)
322 movaps %xmm2, -0x30(%rdi)
323 movaps %xmm3, -0x40(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
327 L(shl_0_less_64bytes_bwd):
328 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Compare %RDX_LP with half the data cache size (constant if
   DATA_CACHE_SIZE_HALF is defined; the second cmp is presumably the
   #else arm, whose #else/#endif lines are not visible).  The jae below
   presumably consumes this compare and selects the prefetching loop on
   "above or equal".  */
332 #ifdef DATA_CACHE_SIZE_HALF
333 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
335 cmp __x86_data_cache_size_half(%rip), %RDX_LP
338 jae L(shl_0_gobble_mem_bwd_loop)
/* Aligned backward copy loop without prefetch: 0x80 bytes per pass,
   read from -0x10..-0x80(%rsi) and written to the same offsets from
   %rdi, after which both pointers are lowered by 0x80.  The counter
   update feeding the jae is not visible here.  */
339 L(shl_0_gobble_bwd_loop):
340 movdqa -0x10(%rsi), %xmm0
341 movaps -0x20(%rsi), %xmm1
342 movaps -0x30(%rsi), %xmm2
343 movaps -0x40(%rsi), %xmm3
345 movdqa %xmm0, -0x10(%rdi)
346 movaps %xmm1, -0x20(%rdi)
347 movaps %xmm2, -0x30(%rdi)
348 movaps %xmm3, -0x40(%rdi)
351 movaps -0x50(%rsi), %xmm4
352 movaps -0x60(%rsi), %xmm5
353 movaps -0x70(%rsi), %xmm6
354 movaps -0x80(%rsi), %xmm7
355 lea -0x80(%rsi), %rsi
356 movaps %xmm4, -0x50(%rdi)
357 movaps %xmm5, -0x60(%rdi)
358 movaps %xmm6, -0x70(%rdi)
359 movaps %xmm7, -0x80(%rdi)
360 lea -0x80(%rdi), %rdi
362 jae L(shl_0_gobble_bwd_loop)
/* After the loop: skip the 64-byte step on a signed "less" result
   (the compare is not visible).  */
365 jl L(shl_0_gobble_bwd_less_64bytes)
/* 64-byte step, four aligned vectors at descending addresses.  */
367 movdqa -0x10(%rsi), %xmm0
369 movdqa -0x20(%rsi), %xmm1
371 movdqa %xmm0, -0x10(%rdi)
372 movdqa %xmm1, -0x20(%rdi)
374 movdqa -0x30(%rsi), %xmm0
375 movdqa -0x40(%rsi), %xmm1
378 movdqa %xmm0, -0x30(%rdi)
379 movdqa %xmm1, -0x40(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
381 L(shl_0_gobble_bwd_less_64bytes):
382 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Aligned backward copy loop with software prefetch: each pass
   prefetches 0x1c0 and 0x280 bytes below %rsi (prefetcht0), loads
   eight vectors from -0x10..-0x80(%rsi), lowers %rsi by 0x80, stores
   them at the same offsets from %rdi and lowers %rdi by 0x80.  The
   counter update feeding the jae is not visible here.  */
385 L(shl_0_gobble_mem_bwd_loop):
386 prefetcht0 -0x1c0(%rsi)
387 prefetcht0 -0x280(%rsi)
388 movdqa -0x10(%rsi), %xmm0
389 movdqa -0x20(%rsi), %xmm1
390 movdqa -0x30(%rsi), %xmm2
391 movdqa -0x40(%rsi), %xmm3
392 movdqa -0x50(%rsi), %xmm4
393 movdqa -0x60(%rsi), %xmm5
394 movdqa -0x70(%rsi), %xmm6
395 movdqa -0x80(%rsi), %xmm7
396 lea -0x80(%rsi), %rsi
398 movdqa %xmm0, -0x10(%rdi)
399 movdqa %xmm1, -0x20(%rdi)
400 movdqa %xmm2, -0x30(%rdi)
401 movdqa %xmm3, -0x40(%rdi)
402 movdqa %xmm4, -0x50(%rdi)
403 movdqa %xmm5, -0x60(%rdi)
404 movdqa %xmm6, -0x70(%rdi)
405 movdqa %xmm7, -0x80(%rdi)
406 lea -0x80(%rdi), %rdi
408 jae L(shl_0_gobble_mem_bwd_loop)
/* After the loop: skip the 64-byte step on a signed "less" result
   (the compare is not visible).  */
411 jl L(shl_0_mem_bwd_less_64bytes)
/* 64-byte step, four aligned vectors at descending addresses.  */
413 movdqa -0x10(%rsi), %xmm0
415 movdqa -0x20(%rsi), %xmm1
417 movdqa %xmm0, -0x10(%rdi)
418 movdqa %xmm1, -0x20(%rdi)
420 movdqa -0x30(%rsi), %xmm0
421 movdqa -0x40(%rsi), %xmm1
424 movdqa %xmm0, -0x30(%rdi)
425 movdqa %xmm1, -0x40(%rdi)
/* Optional 32-byte step, skipped on "below" (compare not visible).  */
427 L(shl_0_mem_bwd_less_64bytes):
429 jb L(shl_0_mem_bwd_less_32bytes)
430 movdqa -0x10(%rsi), %xmm0
432 movdqa -0x20(%rsi), %xmm1
434 movdqa %xmm0, -0x10(%rdi)
435 movdqa %xmm1, -0x20(%rdi)
/* Finish through L(table_less_80bytes), indexed by %rdx.  */
437 L(shl_0_mem_bwd_less_32bytes):
438 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 1; the L(shl_1) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_1_loop_L1) - L(shl_1), and a second lea moves it on to
   L(shl_1_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 1 with movaps, which
   requires a 16-byte aligned address.  */
442 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
444 movaps -0x01(%rsi), %xmm1
446 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
449 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $1 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
452 prefetchnta 0x1c0(%rsi)
455 movaps 0x0f(%rsi), %xmm2
456 movaps 0x1f(%rsi), %xmm3
457 movaps 0x2f(%rsi), %xmm4
458 movaps 0x3f(%rsi), %xmm5
460 palignr $1, %xmm4, %xmm5
462 palignr $1, %xmm3, %xmm4
463 palignr $1, %xmm2, %xmm3
465 palignr $1, %xmm1, %xmm2
467 movdqa %xmm2, -0x40(%rdi)
468 movaps %xmm3, -0x30(%rdi)
470 movaps %xmm4, -0x20(%rdi)
471 movaps %xmm5, -0x10(%rdi)
472 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
475 movaps %xmm4, -0x20(%rdi)
477 movaps %xmm5, -0x10(%rdi)
481 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 1; the L(shl_1_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_1_bwd_loop_L1) and then by a second lea to
   L(shl_1_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 1 with movaps, which
   requires a 16-byte aligned address.  */
485 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
487 movaps -0x01(%rsi), %xmm1
489 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
492 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $1.  */
494 L(shl_1_bwd_loop_L2):
495 prefetchnta -0x1c0(%rsi)
496 L(shl_1_bwd_loop_L1):
497 movaps -0x11(%rsi), %xmm2
499 movaps -0x21(%rsi), %xmm3
500 movaps -0x31(%rsi), %xmm4
501 movaps -0x41(%rsi), %xmm5
502 lea -0x40(%rsi), %rsi
503 palignr $1, %xmm2, %xmm1
504 palignr $1, %xmm3, %xmm2
505 palignr $1, %xmm4, %xmm3
506 palignr $1, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
508 movaps %xmm1, -0x10(%rdi)
511 movaps %xmm2, -0x20(%rdi)
512 lea -0x40(%rdi), %rdi
514 movaps %xmm3, 0x10(%rdi)
517 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
523 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 2; the L(shl_2) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_2_loop_L1) - L(shl_2), and a second lea moves it on to
   L(shl_2_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 2 with movaps, which
   requires a 16-byte aligned address.  */
527 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
529 movaps -0x02(%rsi), %xmm1
531 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
534 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $2 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
537 prefetchnta 0x1c0(%rsi)
540 movaps 0x0e(%rsi), %xmm2
541 movaps 0x1e(%rsi), %xmm3
542 movaps 0x2e(%rsi), %xmm4
543 movaps 0x3e(%rsi), %xmm5
545 palignr $2, %xmm4, %xmm5
547 palignr $2, %xmm3, %xmm4
548 palignr $2, %xmm2, %xmm3
550 palignr $2, %xmm1, %xmm2
552 movdqa %xmm2, -0x40(%rdi)
553 movaps %xmm3, -0x30(%rdi)
555 movaps %xmm4, -0x20(%rdi)
556 movaps %xmm5, -0x10(%rdi)
557 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
560 movaps %xmm4, -0x20(%rdi)
562 movaps %xmm5, -0x10(%rdi)
566 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 2; the L(shl_2_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_2_bwd_loop_L1) and then by a second lea to
   L(shl_2_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 2 with movaps, which
   requires a 16-byte aligned address.  */
570 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
572 movaps -0x02(%rsi), %xmm1
574 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
577 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $2.  */
579 L(shl_2_bwd_loop_L2):
580 prefetchnta -0x1c0(%rsi)
581 L(shl_2_bwd_loop_L1):
582 movaps -0x12(%rsi), %xmm2
584 movaps -0x22(%rsi), %xmm3
585 movaps -0x32(%rsi), %xmm4
586 movaps -0x42(%rsi), %xmm5
587 lea -0x40(%rsi), %rsi
588 palignr $2, %xmm2, %xmm1
589 palignr $2, %xmm3, %xmm2
590 palignr $2, %xmm4, %xmm3
591 palignr $2, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
593 movaps %xmm1, -0x10(%rdi)
596 movaps %xmm2, -0x20(%rdi)
597 lea -0x40(%rdi), %rdi
599 movaps %xmm3, 0x10(%rdi)
602 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
608 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 3; the L(shl_3) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_3_loop_L1) - L(shl_3), and a second lea moves it on to
   L(shl_3_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 3 with movaps, which
   requires a 16-byte aligned address.  */
612 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
614 movaps -0x03(%rsi), %xmm1
616 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
619 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $3 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
622 prefetchnta 0x1c0(%rsi)
625 movaps 0x0d(%rsi), %xmm2
626 movaps 0x1d(%rsi), %xmm3
627 movaps 0x2d(%rsi), %xmm4
628 movaps 0x3d(%rsi), %xmm5
630 palignr $3, %xmm4, %xmm5
632 palignr $3, %xmm3, %xmm4
633 palignr $3, %xmm2, %xmm3
635 palignr $3, %xmm1, %xmm2
637 movdqa %xmm2, -0x40(%rdi)
638 movaps %xmm3, -0x30(%rdi)
640 movaps %xmm4, -0x20(%rdi)
641 movaps %xmm5, -0x10(%rdi)
642 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
645 movaps %xmm4, -0x20(%rdi)
647 movaps %xmm5, -0x10(%rdi)
651 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 3; the L(shl_3_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_3_bwd_loop_L1) and then by a second lea to
   L(shl_3_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 3 with movaps, which
   requires a 16-byte aligned address.  */
655 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
657 movaps -0x03(%rsi), %xmm1
659 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
662 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $3.  */
664 L(shl_3_bwd_loop_L2):
665 prefetchnta -0x1c0(%rsi)
666 L(shl_3_bwd_loop_L1):
667 movaps -0x13(%rsi), %xmm2
669 movaps -0x23(%rsi), %xmm3
670 movaps -0x33(%rsi), %xmm4
671 movaps -0x43(%rsi), %xmm5
672 lea -0x40(%rsi), %rsi
673 palignr $3, %xmm2, %xmm1
674 palignr $3, %xmm3, %xmm2
675 palignr $3, %xmm4, %xmm3
676 palignr $3, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
678 movaps %xmm1, -0x10(%rdi)
681 movaps %xmm2, -0x20(%rdi)
682 lea -0x40(%rdi), %rdi
684 movaps %xmm3, 0x10(%rdi)
687 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
693 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 4; the L(shl_4) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_4_loop_L1) - L(shl_4), and a second lea moves it on to
   L(shl_4_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 4 with movaps, which
   requires a 16-byte aligned address.  */
697 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
699 movaps -0x04(%rsi), %xmm1
701 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
704 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $4 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
707 prefetchnta 0x1c0(%rsi)
710 movaps 0x0c(%rsi), %xmm2
711 movaps 0x1c(%rsi), %xmm3
712 movaps 0x2c(%rsi), %xmm4
713 movaps 0x3c(%rsi), %xmm5
715 palignr $4, %xmm4, %xmm5
717 palignr $4, %xmm3, %xmm4
718 palignr $4, %xmm2, %xmm3
720 palignr $4, %xmm1, %xmm2
722 movdqa %xmm2, -0x40(%rdi)
723 movaps %xmm3, -0x30(%rdi)
725 movaps %xmm4, -0x20(%rdi)
726 movaps %xmm5, -0x10(%rdi)
727 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
730 movaps %xmm4, -0x20(%rdi)
732 movaps %xmm5, -0x10(%rdi)
736 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 4; the L(shl_4_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_4_bwd_loop_L1) and then by a second lea to
   L(shl_4_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 4 with movaps, which
   requires a 16-byte aligned address.  */
740 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
742 movaps -0x04(%rsi), %xmm1
744 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
747 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $4.  */
749 L(shl_4_bwd_loop_L2):
750 prefetchnta -0x1c0(%rsi)
751 L(shl_4_bwd_loop_L1):
752 movaps -0x14(%rsi), %xmm2
754 movaps -0x24(%rsi), %xmm3
755 movaps -0x34(%rsi), %xmm4
756 movaps -0x44(%rsi), %xmm5
757 lea -0x40(%rsi), %rsi
758 palignr $4, %xmm2, %xmm1
759 palignr $4, %xmm3, %xmm2
760 palignr $4, %xmm4, %xmm3
761 palignr $4, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
763 movaps %xmm1, -0x10(%rdi)
766 movaps %xmm2, -0x20(%rdi)
767 lea -0x40(%rdi), %rdi
769 movaps %xmm3, 0x10(%rdi)
772 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
778 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 5; the L(shl_5) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_5_loop_L1) - L(shl_5), and a second lea moves it on to
   L(shl_5_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 5 with movaps, which
   requires a 16-byte aligned address.  */
782 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
784 movaps -0x05(%rsi), %xmm1
786 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
789 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $5 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
792 prefetchnta 0x1c0(%rsi)
795 movaps 0x0b(%rsi), %xmm2
796 movaps 0x1b(%rsi), %xmm3
797 movaps 0x2b(%rsi), %xmm4
798 movaps 0x3b(%rsi), %xmm5
800 palignr $5, %xmm4, %xmm5
802 palignr $5, %xmm3, %xmm4
803 palignr $5, %xmm2, %xmm3
805 palignr $5, %xmm1, %xmm2
807 movdqa %xmm2, -0x40(%rdi)
808 movaps %xmm3, -0x30(%rdi)
810 movaps %xmm4, -0x20(%rdi)
811 movaps %xmm5, -0x10(%rdi)
812 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
815 movaps %xmm4, -0x20(%rdi)
817 movaps %xmm5, -0x10(%rdi)
821 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 5; the L(shl_5_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_5_bwd_loop_L1) and then by a second lea to
   L(shl_5_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 5 with movaps, which
   requires a 16-byte aligned address.  */
825 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
827 movaps -0x05(%rsi), %xmm1
829 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
832 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $5.  */
834 L(shl_5_bwd_loop_L2):
835 prefetchnta -0x1c0(%rsi)
836 L(shl_5_bwd_loop_L1):
837 movaps -0x15(%rsi), %xmm2
839 movaps -0x25(%rsi), %xmm3
840 movaps -0x35(%rsi), %xmm4
841 movaps -0x45(%rsi), %xmm5
842 lea -0x40(%rsi), %rsi
843 palignr $5, %xmm2, %xmm1
844 palignr $5, %xmm3, %xmm2
845 palignr $5, %xmm4, %xmm3
846 palignr $5, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
848 movaps %xmm1, -0x10(%rdi)
851 movaps %xmm2, -0x20(%rdi)
852 lea -0x40(%rdi), %rdi
854 movaps %xmm3, 0x10(%rdi)
857 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
863 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 6; the L(shl_6) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_6_loop_L1) - L(shl_6), and a second lea moves it on to
   L(shl_6_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 6 with movaps, which
   requires a 16-byte aligned address.  */
867 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
869 movaps -0x06(%rsi), %xmm1
871 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
874 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $6 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
877 prefetchnta 0x1c0(%rsi)
880 movaps 0x0a(%rsi), %xmm2
881 movaps 0x1a(%rsi), %xmm3
882 movaps 0x2a(%rsi), %xmm4
883 movaps 0x3a(%rsi), %xmm5
885 palignr $6, %xmm4, %xmm5
887 palignr $6, %xmm3, %xmm4
888 palignr $6, %xmm2, %xmm3
890 palignr $6, %xmm1, %xmm2
892 movdqa %xmm2, -0x40(%rdi)
893 movaps %xmm3, -0x30(%rdi)
895 movaps %xmm4, -0x20(%rdi)
896 movaps %xmm5, -0x10(%rdi)
897 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
900 movaps %xmm4, -0x20(%rdi)
902 movaps %xmm5, -0x10(%rdi)
906 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 6; the L(shl_6_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_6_bwd_loop_L1) and then by a second lea to
   L(shl_6_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 6 with movaps, which
   requires a 16-byte aligned address.  */
910 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
912 movaps -0x06(%rsi), %xmm1
914 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
917 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $6.  */
919 L(shl_6_bwd_loop_L2):
920 prefetchnta -0x1c0(%rsi)
921 L(shl_6_bwd_loop_L1):
922 movaps -0x16(%rsi), %xmm2
924 movaps -0x26(%rsi), %xmm3
925 movaps -0x36(%rsi), %xmm4
926 movaps -0x46(%rsi), %xmm5
927 lea -0x40(%rsi), %rsi
928 palignr $6, %xmm2, %xmm1
929 palignr $6, %xmm3, %xmm2
930 palignr $6, %xmm4, %xmm3
931 palignr $6, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) is the slot
   that was -0x30(%rdi) before the lea.  The store of %xmm4 is on a
   line not visible here.  */
933 movaps %xmm1, -0x10(%rdi)
936 movaps %xmm2, -0x20(%rdi)
937 lea -0x40(%rdi), %rdi
939 movaps %xmm3, 0x10(%rdi)
942 _CET_NOTRACK jmp *%r9
/* Hand over to L(table_less_80bytes), indexed by %rdx.  */
948 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 7; the L(shl_7) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_7_loop_L1) - L(shl_7), and a second lea moves it on to
   L(shl_7_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 7 with movaps, which
   requires a 16-byte aligned address.  */
952 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
954 movaps -0x07(%rsi), %xmm1
956 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
959 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $7 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
962 prefetchnta 0x1c0(%rsi)
965 movaps 0x09(%rsi), %xmm2
966 movaps 0x19(%rsi), %xmm3
967 movaps 0x29(%rsi), %xmm4
968 movaps 0x39(%rsi), %xmm5
970 palignr $7, %xmm4, %xmm5
972 palignr $7, %xmm3, %xmm4
973 palignr $7, %xmm2, %xmm3
975 palignr $7, %xmm1, %xmm2
977 movdqa %xmm2, -0x40(%rdi)
978 movaps %xmm3, -0x30(%rdi)
980 movaps %xmm4, -0x20(%rdi)
981 movaps %xmm5, -0x10(%rdi)
982 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
985 movaps %xmm4, -0x20(%rdi)
987 movaps %xmm5, -0x10(%rdi)
991 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 7; the L(shl_7_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_7_bwd_loop_L1) and then by a second lea to
   L(shl_7_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 7 with movaps, which
   requires a 16-byte aligned address.  */
995 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
997 movaps -0x07(%rsi), %xmm1
999 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
1002 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $7.  */
1004 L(shl_7_bwd_loop_L2):
1005 prefetchnta -0x1c0(%rsi)
1006 L(shl_7_bwd_loop_L1):
1007 movaps -0x17(%rsi), %xmm2
1009 movaps -0x27(%rsi), %xmm3
1010 movaps -0x37(%rsi), %xmm4
1011 movaps -0x47(%rsi), %xmm5
1012 lea -0x40(%rsi), %rsi
1013 palignr $7, %xmm2, %xmm1
1014 palignr $7, %xmm3, %xmm2
1015 palignr $7, %xmm4, %xmm3
1016 palignr $7, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.  */
1018 movaps %xmm1, -0x10(%rdi)
1021 movaps %xmm2, -0x20(%rdi)
1022 lea -0x40(%rdi), %rdi
1024 movaps %xmm3, 0x10(%rdi)
1026 movaps %xmm4, (%rdi)
1027 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the final vector, then hand
   over to L(table_less_80bytes), indexed by %rdx.  */
1030 movaps %xmm4, (%rdi)
1033 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 8; the L(shl_8) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_8_loop_L1) - L(shl_8), and a second lea moves it on to
   L(shl_8_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 8 with movaps, which
   requires a 16-byte aligned address.  */
1037 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1039 movaps -0x08(%rsi), %xmm1
1041 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1044 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $8 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1046 prefetchnta 0x1c0(%rsi)
1049 movaps 0x08(%rsi), %xmm2
1050 movaps 0x18(%rsi), %xmm3
1051 movaps 0x28(%rsi), %xmm4
1052 movaps 0x38(%rsi), %xmm5
1054 palignr $8, %xmm4, %xmm5
1056 palignr $8, %xmm3, %xmm4
1057 palignr $8, %xmm2, %xmm3
1059 palignr $8, %xmm1, %xmm2
1061 movdqa %xmm2, -0x40(%rdi)
1062 movaps %xmm3, -0x30(%rdi)
1064 movaps %xmm4, -0x20(%rdi)
1065 movaps %xmm5, -0x10(%rdi)
1066 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1071 movaps %xmm4, -0x20(%rdi)
1073 movaps %xmm5, -0x10(%rdi)
1076 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 8; the L(shl_8_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_8_bwd_loop_L1) and then by a second lea to
   L(shl_8_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 8 with movaps, which
   requires a 16-byte aligned address.  */
1080 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1082 movaps -0x08(%rsi), %xmm1
1084 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1087 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $8.  */
1089 L(shl_8_bwd_loop_L2):
1090 prefetchnta -0x1c0(%rsi)
1091 L(shl_8_bwd_loop_L1):
1092 movaps -0x18(%rsi), %xmm2
1094 movaps -0x28(%rsi), %xmm3
1095 movaps -0x38(%rsi), %xmm4
1096 movaps -0x48(%rsi), %xmm5
1097 lea -0x40(%rsi), %rsi
1098 palignr $8, %xmm2, %xmm1
1099 palignr $8, %xmm3, %xmm2
1100 palignr $8, %xmm4, %xmm3
1101 palignr $8, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.  */
1103 movaps %xmm1, -0x10(%rdi)
1106 movaps %xmm2, -0x20(%rdi)
1107 lea -0x40(%rdi), %rdi
1109 movaps %xmm3, 0x10(%rdi)
1111 movaps %xmm4, (%rdi)
1112 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the final vector, then hand
   over to L(table_less_80bytes), indexed by %rdx.  */
1115 movaps %xmm4, (%rdi)
1118 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 9; the L(shl_9) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_9_loop_L1) - L(shl_9), and a second lea moves it on to
   L(shl_9_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 9 with movaps, which
   requires a 16-byte aligned address.  */
1122 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1124 movaps -0x09(%rsi), %xmm1
1126 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1129 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $9 (%xmm1 feeds the
   first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1132 prefetchnta 0x1c0(%rsi)
1135 movaps 0x07(%rsi), %xmm2
1136 movaps 0x17(%rsi), %xmm3
1137 movaps 0x27(%rsi), %xmm4
1138 movaps 0x37(%rsi), %xmm5
1140 palignr $9, %xmm4, %xmm5
1142 palignr $9, %xmm3, %xmm4
1143 palignr $9, %xmm2, %xmm3
1145 palignr $9, %xmm1, %xmm2
1147 movdqa %xmm2, -0x40(%rdi)
1148 movaps %xmm3, -0x30(%rdi)
1150 movaps %xmm4, -0x20(%rdi)
1151 movaps %xmm5, -0x10(%rdi)
1152 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1155 movaps %xmm4, -0x20(%rdi)
1157 movaps %xmm5, -0x10(%rdi)
1161 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 9; the L(shl_9_bwd) label
   is not visible in this excerpt.  %r9 is advanced to
   L(shl_9_bwd_loop_L1) and then by a second lea to
   L(shl_9_bwd_loop_L2); whether that second step is conditional cannot
   be seen here.  %xmm1 is primed from %rsi - 9 with movaps, which
   requires a 16-byte aligned address.  */
1165 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1167 movaps -0x09(%rsi), %xmm1
1169 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1172 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $9.  */
1174 L(shl_9_bwd_loop_L2):
1175 prefetchnta -0x1c0(%rsi)
1176 L(shl_9_bwd_loop_L1):
1177 movaps -0x19(%rsi), %xmm2
1179 movaps -0x29(%rsi), %xmm3
1180 movaps -0x39(%rsi), %xmm4
1181 movaps -0x49(%rsi), %xmm5
1182 lea -0x40(%rsi), %rsi
1183 palignr $9, %xmm2, %xmm1
1184 palignr $9, %xmm3, %xmm2
1185 palignr $9, %xmm4, %xmm3
1186 palignr $9, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.  */
1188 movaps %xmm1, -0x10(%rdi)
1191 movaps %xmm2, -0x20(%rdi)
1192 lea -0x40(%rdi), %rdi
1194 movaps %xmm3, 0x10(%rdi)
1196 movaps %xmm4, (%rdi)
1197 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the final vector, then hand
   over to L(table_less_80bytes), indexed by %rdx.  */
1200 movaps %xmm4, (%rdi)
1203 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 10; the L(shl_10) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_10_loop_L1) - L(shl_10), and a second lea moves it on to
   L(shl_10_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 10 with movaps, which
   requires a 16-byte aligned address.  */
1207 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1209 movaps -0x0a(%rsi), %xmm1
1211 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1214 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $10 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1217 prefetchnta 0x1c0(%rsi)
1220 movaps 0x06(%rsi), %xmm2
1221 movaps 0x16(%rsi), %xmm3
1222 movaps 0x26(%rsi), %xmm4
1223 movaps 0x36(%rsi), %xmm5
1225 palignr $10, %xmm4, %xmm5
1227 palignr $10, %xmm3, %xmm4
1228 palignr $10, %xmm2, %xmm3
1230 palignr $10, %xmm1, %xmm2
1232 movdqa %xmm2, -0x40(%rdi)
1233 movaps %xmm3, -0x30(%rdi)
1235 movaps %xmm4, -0x20(%rdi)
1236 movaps %xmm5, -0x10(%rdi)
1237 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1240 movaps %xmm4, -0x20(%rdi)
1242 movaps %xmm5, -0x10(%rdi)
1246 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 10; the L(shl_10_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_10_bwd_loop_L1) and then by a second lea to
   L(shl_10_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 10 with movaps,
   which requires a 16-byte aligned address.  */
1250 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1252 movaps -0x0a(%rsi), %xmm1
1254 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1257 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $10.  */
1259 L(shl_10_bwd_loop_L2):
1260 prefetchnta -0x1c0(%rsi)
1261 L(shl_10_bwd_loop_L1):
1262 movaps -0x1a(%rsi), %xmm2
1264 movaps -0x2a(%rsi), %xmm3
1265 movaps -0x3a(%rsi), %xmm4
1266 movaps -0x4a(%rsi), %xmm5
1267 lea -0x40(%rsi), %rsi
1268 palignr $10, %xmm2, %xmm1
1269 palignr $10, %xmm3, %xmm2
1270 palignr $10, %xmm4, %xmm3
1271 palignr $10, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1273 movaps %xmm1, -0x10(%rdi)
1276 movaps %xmm2, -0x20(%rdi)
1277 lea -0x40(%rdi), %rdi
1279 movaps %xmm3, 0x10(%rdi)
1280 jb L(shl_10_bwd_end)
1281 movaps %xmm4, (%rdi)
1282 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_10_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1285 movaps %xmm4, (%rdi)
1288 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 11; the L(shl_11) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_11_loop_L1) - L(shl_11), and a second lea moves it on to
   L(shl_11_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 11 with movaps, which
   requires a 16-byte aligned address.  */
1292 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1294 movaps -0x0b(%rsi), %xmm1
1296 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1299 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $11 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1302 prefetchnta 0x1c0(%rsi)
1305 movaps 0x05(%rsi), %xmm2
1306 movaps 0x15(%rsi), %xmm3
1307 movaps 0x25(%rsi), %xmm4
1308 movaps 0x35(%rsi), %xmm5
1310 palignr $11, %xmm4, %xmm5
1312 palignr $11, %xmm3, %xmm4
1313 palignr $11, %xmm2, %xmm3
1315 palignr $11, %xmm1, %xmm2
1317 movdqa %xmm2, -0x40(%rdi)
1318 movaps %xmm3, -0x30(%rdi)
1320 movaps %xmm4, -0x20(%rdi)
1321 movaps %xmm5, -0x10(%rdi)
1322 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1325 movaps %xmm4, -0x20(%rdi)
1327 movaps %xmm5, -0x10(%rdi)
1331 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 11; the L(shl_11_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_11_bwd_loop_L1) and then by a second lea to
   L(shl_11_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 11 with movaps,
   which requires a 16-byte aligned address.  */
1335 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1337 movaps -0x0b(%rsi), %xmm1
1339 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
1342 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $11.  */
1344 L(shl_11_bwd_loop_L2):
1345 prefetchnta -0x1c0(%rsi)
1346 L(shl_11_bwd_loop_L1):
1347 movaps -0x1b(%rsi), %xmm2
1349 movaps -0x2b(%rsi), %xmm3
1350 movaps -0x3b(%rsi), %xmm4
1351 movaps -0x4b(%rsi), %xmm5
1352 lea -0x40(%rsi), %rsi
1353 palignr $11, %xmm2, %xmm1
1354 palignr $11, %xmm3, %xmm2
1355 palignr $11, %xmm4, %xmm3
1356 palignr $11, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1358 movaps %xmm1, -0x10(%rdi)
1361 movaps %xmm2, -0x20(%rdi)
1362 lea -0x40(%rdi), %rdi
1364 movaps %xmm3, 0x10(%rdi)
1365 jb L(shl_11_bwd_end)
1366 movaps %xmm4, (%rdi)
1367 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_11_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1370 movaps %xmm4, (%rdi)
1373 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 12; the L(shl_12) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_12_loop_L1) - L(shl_12), and a second lea moves it on to
   L(shl_12_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 12 with movaps, which
   requires a 16-byte aligned address.  */
1377 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1379 movaps -0x0c(%rsi), %xmm1
1381 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1384 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $12 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1387 prefetchnta 0x1c0(%rsi)
1390 movaps 0x04(%rsi), %xmm2
1391 movaps 0x14(%rsi), %xmm3
1392 movaps 0x24(%rsi), %xmm4
1393 movaps 0x34(%rsi), %xmm5
1395 palignr $12, %xmm4, %xmm5
1397 palignr $12, %xmm3, %xmm4
1398 palignr $12, %xmm2, %xmm3
1400 palignr $12, %xmm1, %xmm2
1402 movdqa %xmm2, -0x40(%rdi)
1403 movaps %xmm3, -0x30(%rdi)
1405 movaps %xmm4, -0x20(%rdi)
1406 movaps %xmm5, -0x10(%rdi)
1407 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1410 movaps %xmm4, -0x20(%rdi)
1412 movaps %xmm5, -0x10(%rdi)
1416 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 12; the L(shl_12_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_12_bwd_loop_L1) and then by a second lea to
   L(shl_12_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 12 with movaps,
   which requires a 16-byte aligned address.  */
1420 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1422 movaps -0x0c(%rsi), %xmm1
1424 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
1427 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $12.  */
1429 L(shl_12_bwd_loop_L2):
1430 prefetchnta -0x1c0(%rsi)
1431 L(shl_12_bwd_loop_L1):
1432 movaps -0x1c(%rsi), %xmm2
1434 movaps -0x2c(%rsi), %xmm3
1435 movaps -0x3c(%rsi), %xmm4
1436 movaps -0x4c(%rsi), %xmm5
1437 lea -0x40(%rsi), %rsi
1438 palignr $12, %xmm2, %xmm1
1439 palignr $12, %xmm3, %xmm2
1440 palignr $12, %xmm4, %xmm3
1441 palignr $12, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1443 movaps %xmm1, -0x10(%rdi)
1446 movaps %xmm2, -0x20(%rdi)
1447 lea -0x40(%rdi), %rdi
1449 movaps %xmm3, 0x10(%rdi)
1450 jb L(shl_12_bwd_end)
1451 movaps %xmm4, (%rdi)
1452 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_12_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1455 movaps %xmm4, (%rdi)
1458 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 13; the L(shl_13) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_13_loop_L1) - L(shl_13), and a second lea moves it on to
   L(shl_13_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 13 with movaps, which
   requires a 16-byte aligned address.  */
1462 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1464 movaps -0x0d(%rsi), %xmm1
1466 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1469 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $13 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1472 prefetchnta 0x1c0(%rsi)
1475 movaps 0x03(%rsi), %xmm2
1476 movaps 0x13(%rsi), %xmm3
1477 movaps 0x23(%rsi), %xmm4
1478 movaps 0x33(%rsi), %xmm5
1480 palignr $13, %xmm4, %xmm5
1482 palignr $13, %xmm3, %xmm4
1483 palignr $13, %xmm2, %xmm3
1485 palignr $13, %xmm1, %xmm2
1487 movdqa %xmm2, -0x40(%rdi)
1488 movaps %xmm3, -0x30(%rdi)
1490 movaps %xmm4, -0x20(%rdi)
1491 movaps %xmm5, -0x10(%rdi)
1492 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1495 movaps %xmm4, -0x20(%rdi)
1497 movaps %xmm5, -0x10(%rdi)
1501 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 13; the L(shl_13_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_13_bwd_loop_L1) and then by a second lea to
   L(shl_13_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 13 with movaps,
   which requires a 16-byte aligned address.  */
1505 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1507 movaps -0x0d(%rsi), %xmm1
1509 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
1512 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $13.  */
1514 L(shl_13_bwd_loop_L2):
1515 prefetchnta -0x1c0(%rsi)
1516 L(shl_13_bwd_loop_L1):
1517 movaps -0x1d(%rsi), %xmm2
1519 movaps -0x2d(%rsi), %xmm3
1520 movaps -0x3d(%rsi), %xmm4
1521 movaps -0x4d(%rsi), %xmm5
1522 lea -0x40(%rsi), %rsi
1523 palignr $13, %xmm2, %xmm1
1524 palignr $13, %xmm3, %xmm2
1525 palignr $13, %xmm4, %xmm3
1526 palignr $13, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1528 movaps %xmm1, -0x10(%rdi)
1531 movaps %xmm2, -0x20(%rdi)
1532 lea -0x40(%rdi), %rdi
1534 movaps %xmm3, 0x10(%rdi)
1535 jb L(shl_13_bwd_end)
1536 movaps %xmm4, (%rdi)
1537 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_13_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1540 movaps %xmm4, (%rdi)
1543 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 14; the L(shl_14) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_14_loop_L1) - L(shl_14), and a second lea moves it on to
   L(shl_14_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 14 with movaps, which
   requires a 16-byte aligned address.  */
1547 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1549 movaps -0x0e(%rsi), %xmm1
1551 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1554 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $14 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1557 prefetchnta 0x1c0(%rsi)
1560 movaps 0x02(%rsi), %xmm2
1561 movaps 0x12(%rsi), %xmm3
1562 movaps 0x22(%rsi), %xmm4
1563 movaps 0x32(%rsi), %xmm5
1565 palignr $14, %xmm4, %xmm5
1567 palignr $14, %xmm3, %xmm4
1568 palignr $14, %xmm2, %xmm3
1570 palignr $14, %xmm1, %xmm2
1572 movdqa %xmm2, -0x40(%rdi)
1573 movaps %xmm3, -0x30(%rdi)
1575 movaps %xmm4, -0x20(%rdi)
1576 movaps %xmm5, -0x10(%rdi)
1577 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1580 movaps %xmm4, -0x20(%rdi)
1582 movaps %xmm5, -0x10(%rdi)
1586 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 14; the L(shl_14_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_14_bwd_loop_L1) and then by a second lea to
   L(shl_14_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 14 with movaps,
   which requires a 16-byte aligned address.  */
1590 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1592 movaps -0x0e(%rsi), %xmm1
1594 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
1597 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $14.  */
1599 L(shl_14_bwd_loop_L2):
1600 prefetchnta -0x1c0(%rsi)
1601 L(shl_14_bwd_loop_L1):
1602 movaps -0x1e(%rsi), %xmm2
1604 movaps -0x2e(%rsi), %xmm3
1605 movaps -0x3e(%rsi), %xmm4
1606 movaps -0x4e(%rsi), %xmm5
1607 lea -0x40(%rsi), %rsi
1608 palignr $14, %xmm2, %xmm1
1609 palignr $14, %xmm3, %xmm2
1610 palignr $14, %xmm4, %xmm3
1611 palignr $14, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1613 movaps %xmm1, -0x10(%rdi)
1616 movaps %xmm2, -0x20(%rdi)
1617 lea -0x40(%rdi), %rdi
1619 movaps %xmm3, 0x10(%rdi)
1620 jb L(shl_14_bwd_end)
1621 movaps %xmm4, (%rdi)
1622 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_14_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1625 movaps %xmm4, (%rdi)
1628 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Forward copy path for a palignr shift of 15; the L(shl_15) label is
   not visible in this excerpt.  %r9 is first advanced by
   L(shl_15_loop_L1) - L(shl_15), and a second lea moves it on to
   L(shl_15_loop_L2); whether that second step is conditional cannot be
   seen here.  %xmm1 is primed from %rsi - 15 with movaps, which
   requires a 16-byte aligned address.  */
1632 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1634 movaps -0x0f(%rsi), %xmm1
1636 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1639 _CET_NOTRACK jmp *%r9
/* Loop body: prefetch 0x1c0 bytes ahead of %rsi, load four vectors,
   splice each with the one before it using palignr $15 (%xmm1 feeds
   the first), store the results below %rdi and re-enter through %r9.
   The pointer and counter updates are on lines not visible here.  */
1642 prefetchnta 0x1c0(%rsi)
1645 movaps 0x01(%rsi), %xmm2
1646 movaps 0x11(%rsi), %xmm3
1647 movaps 0x21(%rsi), %xmm4
1648 movaps 0x31(%rsi), %xmm5
1650 palignr $15, %xmm4, %xmm5
1652 palignr $15, %xmm3, %xmm4
1653 palignr $15, %xmm2, %xmm3
1655 palignr $15, %xmm1, %xmm2
1657 movdqa %xmm2, -0x40(%rdi)
1658 movaps %xmm3, -0x30(%rdi)
1660 movaps %xmm4, -0x20(%rdi)
1661 movaps %xmm5, -0x10(%rdi)
1662 _CET_NOTRACK jmp *%r9
/* Exit path (label not visible): store the last two vectors, then
   dispatch through L(table_less_80bytes) indexed by %rdx.  */
1665 movaps %xmm4, -0x20(%rdi)
1667 movaps %xmm5, -0x10(%rdi)
1671 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Backward copy path for a palignr shift of 15; the L(shl_15_bwd)
   label is not visible in this excerpt.  %r9 is advanced to
   L(shl_15_bwd_loop_L1) and then by a second lea to
   L(shl_15_bwd_loop_L2); whether that second step is conditional
   cannot be seen here.  %xmm1 is primed from %rsi - 15 with movaps,
   which requires a 16-byte aligned address.  */
1675 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1677 movaps -0x0f(%rsi), %xmm1
1679 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
1682 _CET_NOTRACK jmp *%r9
/* L2 entry: non-temporal prefetch 0x1c0 bytes below %rsi; the L1 entry
   skips it.  Each pass loads four vectors at descending addresses,
   lowers %rsi by 0x40 and splices neighbours with palignr $15.  */
1684 L(shl_15_bwd_loop_L2):
1685 prefetchnta -0x1c0(%rsi)
1686 L(shl_15_bwd_loop_L1):
1687 movaps -0x1f(%rsi), %xmm2
1689 movaps -0x2f(%rsi), %xmm3
1690 movaps -0x3f(%rsi), %xmm4
1691 movaps -0x4f(%rsi), %xmm5
1692 lea -0x40(%rsi), %rsi
1693 palignr $15, %xmm2, %xmm1
1694 palignr $15, %xmm3, %xmm2
1695 palignr $15, %xmm4, %xmm3
1696 palignr $15, %xmm5, %xmm4
/* %rdi drops by 0x40 between the stores, so 0x10(%rdi) and (%rdi) are
   the slots that were -0x30(%rdi) and -0x40(%rdi) before the lea.
   The jb leaves the loop on "below"; none of the instructions shown
   between here and the palignr group alter flags, so the flag-setting
   instruction is on a line not visible.  */
1698 movaps %xmm1, -0x10(%rdi)
1701 movaps %xmm2, -0x20(%rdi)
1702 lea -0x40(%rdi), %rdi
1704 movaps %xmm3, 0x10(%rdi)
1705 jb L(shl_15_bwd_end)
1706 movaps %xmm4, (%rdi)
1707 _CET_NOTRACK jmp *%r9
/* Exit path (presumably L(shl_15_bwd_end); its label line is not
   visible): store the final vector, then hand over to
   L(table_less_80bytes), indexed by %rdx.  */
1710 movaps %xmm4, (%rdi)
1713 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 72, 64 and 56 bytes, judging by the first offset of each
   group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
1717 movdqu -72(%rsi), %xmm0
1718 movdqu -56(%rsi), %xmm1
1724 movdqu %xmm0, -72(%rdi)
1725 movdqu %xmm1, -56(%rdi)
1735 movdqu -64(%rsi), %xmm0
1742 movdqu %xmm0, -64(%rdi)
1753 movdqu -56(%rsi), %xmm0
1759 movdqu %xmm0, -56(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 73, 65, 57 and 49 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
1836 movdqu -73(%rsi), %xmm0
1837 movdqu -57(%rsi), %xmm1
1844 movdqu %xmm0, -73(%rdi)
1845 movdqu %xmm1, -57(%rdi)
1856 movdqu -65(%rsi), %xmm0
1857 movdqu -49(%rsi), %xmm1
1863 movdqu %xmm0, -65(%rdi)
1864 movdqu %xmm1, -49(%rdi)
1874 movdqu -57(%rsi), %xmm0
1881 movdqu %xmm0, -57(%rdi)
1892 movdqu -49(%rsi), %xmm0
1898 movdqu %xmm0, -49(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 74, 66, 58 and 50 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
1974 movdqu -74(%rsi), %xmm0
1975 movdqu -58(%rsi), %xmm1
1982 movdqu %xmm0, -74(%rdi)
1983 movdqu %xmm1, -58(%rdi)
1994 movdqu -66(%rsi), %xmm0
1995 movdqu -50(%rsi), %xmm1
2002 movdqu %xmm0, -66(%rdi)
2003 movdqu %xmm1, -50(%rdi)
/* This single-vector case uses %xmm1 where its siblings use %xmm0;
   load and store agree, so the choice of register is immaterial.  */
2014 movdqu -58(%rsi), %xmm1
2021 movdqu %xmm1, -58(%rdi)
2032 movdqu -50(%rsi), %xmm0
2038 movdqu %xmm0, -50(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 75, 67, 59 and 51 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
2114 movdqu -75(%rsi), %xmm0
2115 movdqu -59(%rsi), %xmm1
2122 movdqu %xmm0, -75(%rdi)
2123 movdqu %xmm1, -59(%rdi)
/* NOTE(review): in this group the two vector offsets differ by 8
   (-67 and -59) rather than by 16 as in the neighbouring groups, so
   the two 16-byte moves overlap by 8 bytes.  Both are loaded before
   either is stored and source and destination offsets match, so the
   overlap only rewrites identical data; the case stays complete only
   if the moves not visible here continue from offset -43 -- verify.  */
2134 movdqu -67(%rsi), %xmm0
2135 movdqu -59(%rsi), %xmm1
2142 movdqu %xmm0, -67(%rdi)
2143 movdqu %xmm1, -59(%rdi)
2154 movdqu -59(%rsi), %xmm0
2161 movdqu %xmm0, -59(%rdi)
2172 movdqu -51(%rsi), %xmm0
2178 movdqu %xmm0, -51(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 76, 68, 60 and 52 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
2256 movdqu -76(%rsi), %xmm0
2257 movdqu -60(%rsi), %xmm1
2264 movdqu %xmm0, -76(%rdi)
2265 movdqu %xmm1, -60(%rdi)
2276 movdqu -68(%rsi), %xmm0
2277 movdqu -52(%rsi), %xmm1
2283 movdqu %xmm0, -68(%rdi)
2284 movdqu %xmm1, -52(%rdi)
2294 movdqu -60(%rsi), %xmm0
2301 movdqu %xmm0, -60(%rdi)
2312 movdqu -52(%rsi), %xmm0
2318 movdqu %xmm0, -52(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 77, 69, 61 and 53 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
2394 movdqu -77(%rsi), %xmm0
2395 movdqu -61(%rsi), %xmm1
2402 movdqu %xmm0, -77(%rdi)
2403 movdqu %xmm1, -61(%rdi)
2414 movdqu -69(%rsi), %xmm0
2415 movdqu -53(%rsi), %xmm1
2421 movdqu %xmm0, -69(%rdi)
2422 movdqu %xmm1, -53(%rdi)
2432 movdqu -61(%rsi), %xmm0
2439 movdqu %xmm0, -61(%rdi)
2450 movdqu -53(%rsi), %xmm0
2457 movdqu %xmm0, -53(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 78, 70, 62 and 54 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
2535 movdqu -78(%rsi), %xmm0
2536 movdqu -62(%rsi), %xmm1
2543 movdqu %xmm0, -78(%rdi)
2544 movdqu %xmm1, -62(%rdi)
2555 movdqu -70(%rsi), %xmm0
2556 movdqu -54(%rsi), %xmm1
2562 movdqu %xmm0, -70(%rdi)
2563 movdqu %xmm1, -54(%rdi)
2573 movdqu -62(%rsi), %xmm0
2580 movdqu %xmm0, -62(%rdi)
2591 movdqu -54(%rsi), %xmm0
2597 movdqu %xmm0, -54(%rdi)
/* Straight-line tail fragments: unaligned 16-byte loads at fixed
   negative offsets from %rsi, stored at the same offsets from %rdi.
   Each load/store group presumably belongs to one jump-table target
   (here for 79, 71, 63 and 55 bytes, judging by the first offset of
   each group); the labels and the remaining moves of each case are not
   visible in this excerpt.  */
2675 movdqu -79(%rsi), %xmm0
2676 movdqu -63(%rsi), %xmm1
2683 movdqu %xmm0, -79(%rdi)
2684 movdqu %xmm1, -63(%rdi)
2695 movdqu -71(%rsi), %xmm0
2696 movdqu -55(%rsi), %xmm1
2702 movdqu %xmm0, -71(%rdi)
2703 movdqu %xmm1, -55(%rdi)
2713 movdqu -63(%rsi), %xmm0
2720 movdqu %xmm0, -63(%rdi)
2731 movdqu -55(%rsi), %xmm0
2737 movdqu %xmm0, -55(%rdi)
/* Forward copy for large sizes using non-temporal (streaming) stores.
   %rsi = source cursor, %rdi = destination cursor, %rdx = byte count
   bookkeeping.  The entry label, the pointer advances after the first
   16-byte chunk, and every flag-setting instruction (sub/cmp) are not
   visible in this excerpt, so the conditions of the jumps below cannot be
   stated from here.  */
/* First 16 bytes: unaligned load, streaming store.  movntdq requires a
   16-byte-aligned destination operand.  */
2815 movdqu (%rsi), %xmm1
2818 movntdq %xmm1, (%rdi)
/* %rdx -= 0x90 via lea (flags untouched); 0x80 of this bias is added back
   after the 128-byte loop.  */
2820 lea -0x90(%rdx), %rdx
/* memmove only: choose between this streaming path and the cached
   L(ll_cache_copy_fwd_start) path.  The comparisons feeding jae/jb and the
   matching #endif are not visible in this excerpt.  */
2821 #ifdef USE_AS_MEMMOVE
2825 jae L(memmove_is_memcpy_fwd)
2828 jb L(ll_cache_copy_fwd_start)
2829 L(memmove_is_memcpy_fwd):
/* Main loop body: load 128 bytes into %xmm0-%xmm7, advance %rsi.  The
   L(large_page_loop) label targeted below is presumably just above this
   load -- not visible here.  */
2832 movdqu (%rsi), %xmm0
2833 movdqu 0x10(%rsi), %xmm1
2834 movdqu 0x20(%rsi), %xmm2
2835 movdqu 0x30(%rsi), %xmm3
2836 movdqu 0x40(%rsi), %xmm4
2837 movdqu 0x50(%rsi), %xmm5
2838 movdqu 0x60(%rsi), %xmm6
2839 movdqu 0x70(%rsi), %xmm7
2840 lea 0x80(%rsi), %rsi
/* Stream the 128 bytes to the destination and advance %rdi.  Neither
   movntdq nor lea modifies flags, so the jae tests flags produced by an
   earlier instruction that is not shown.  */
2843 movntdq %xmm0, (%rdi)
2844 movntdq %xmm1, 0x10(%rdi)
2845 movntdq %xmm2, 0x20(%rdi)
2846 movntdq %xmm3, 0x30(%rdi)
2847 movntdq %xmm4, 0x40(%rdi)
2848 movntdq %xmm5, 0x50(%rdi)
2849 movntdq %xmm6, 0x60(%rdi)
2850 movntdq %xmm7, 0x70(%rdi)
2851 lea 0x80(%rdi), %rdi
2852 jae L(large_page_loop)
/* Undo the 0x80 loop bias (lea keeps the flags for the jl that follows),
   then skip the 64-byte step when the signed condition "less" holds.  */
2854 lea 0x80(%rdx), %rdx
2855 jl L(large_page_less_64bytes)
/* One extra 64-byte step: load four chunks, advance %rsi.  */
2857 movdqu (%rsi), %xmm0
2858 movdqu 0x10(%rsi), %xmm1
2859 movdqu 0x20(%rsi), %xmm2
2860 movdqu 0x30(%rsi), %xmm3
2861 lea 0x40(%rsi), %rsi
/* Stream the 64 bytes out, advance %rdi.  */
2863 movntdq %xmm0, (%rdi)
2864 movntdq %xmm1, 0x10(%rdi)
2865 movntdq %xmm2, 0x20(%rdi)
2866 movntdq %xmm3, 0x30(%rdi)
2867 lea 0x40(%rdi), %rdi
2869 L(large_page_less_64bytes):
/* Dispatch on the remaining count in %rdx through the table of 32-bit
   relative offsets; the macro clobbers %r11 and %rdx and ends in an
   indirect jmp.
   NOTE(review): no sfence after the movntdq stores is visible in this
   excerpt; confirm one precedes this dispatch.  */
2873 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* memmove-only forward copy that uses ordinary aligned stores (movaps)
   instead of streaming stores, with software prefetch of the source.
   Reached via the jb in the forward large-copy entry code.  The
   flag-setting instructions and the matching #endif are not visible in
   this excerpt.  */
2875 #ifdef USE_AS_MEMMOVE
2877 L(ll_cache_copy_fwd_start):
/* Prefetch source lines 0x1c0 and 0x200 bytes ahead of the cursor.  */
2878 prefetcht0 0x1c0(%rsi)
2879 prefetcht0 0x200(%rsi)
/* Load 128 bytes (unaligned) into %xmm0-%xmm7, advance %rsi.  */
2880 movdqu (%rsi), %xmm0
2881 movdqu 0x10(%rsi), %xmm1
2882 movdqu 0x20(%rsi), %xmm2
2883 movdqu 0x30(%rsi), %xmm3
2884 movdqu 0x40(%rsi), %xmm4
2885 movdqu 0x50(%rsi), %xmm5
2886 movdqu 0x60(%rsi), %xmm6
2887 movdqu 0x70(%rsi), %xmm7
2888 lea 0x80(%rsi), %rsi
/* Store 128 bytes; movaps faults on a destination that is not 16-byte
   aligned, so %rdi must be aligned here.  Advance %rdi and loop while the
   (not shown) flag-setter leaves CF clear.  */
2891 movaps %xmm0, (%rdi)
2892 movaps %xmm1, 0x10(%rdi)
2893 movaps %xmm2, 0x20(%rdi)
2894 movaps %xmm3, 0x30(%rdi)
2895 movaps %xmm4, 0x40(%rdi)
2896 movaps %xmm5, 0x50(%rdi)
2897 movaps %xmm6, 0x60(%rdi)
2898 movaps %xmm7, 0x70(%rdi)
2899 lea 0x80(%rdi), %rdi
2900 jae L(ll_cache_copy_fwd_start)
/* Add 0x80 back to %rdx without touching flags, then skip the 64-byte
   step on signed "less" (comparison not visible).  */
2902 lea 0x80(%rdx), %rdx
2903 jl L(large_page_ll_less_fwd_64bytes)
/* One extra 64-byte step: four loads, advance %rsi.  */
2905 movdqu (%rsi), %xmm0
2906 movdqu 0x10(%rsi), %xmm1
2907 movdqu 0x20(%rsi), %xmm2
2908 movdqu 0x30(%rsi), %xmm3
2909 lea 0x40(%rsi), %rsi
/* Four aligned stores, advance %rdi.  */
2911 movaps %xmm0, (%rdi)
2912 movaps %xmm1, 0x10(%rdi)
2913 movaps %xmm2, 0x20(%rdi)
2914 movaps %xmm3, 0x30(%rdi)
2915 lea 0x40(%rdi), %rdi
2917 L(large_page_ll_less_fwd_64bytes):
/* Finish through the small-size jump table, indexed by %rdx (scale 4);
   the macro clobbers %r11 and %rdx.  */
2920 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Backward copy for large sizes using streaming stores: the cursors in
   %rsi/%rdi move downward and every access uses a negative displacement.
   The entry label, the pointer adjustments after the first chunk, and all
   flag-setting instructions are not visible in this excerpt.  */
/* First 16 bytes below the cursors.  movdqa is an aligned store, so
   -0x10(%rdi) must be 16-byte aligned at this point.  */
2925 movdqu -0x10(%rsi), %xmm1
2928 movdqa %xmm1, -0x10(%rdi)
/* %rdx -= 0x90 via lea (flags untouched); 0x80 is added back after the
   loop.  */
2930 lea -0x90(%rdx), %rdx
/* memmove only: pick the streaming path or the cached
   L(ll_cache_copy_bwd_start) path.  The comparisons and the matching
   #endif are not visible in this excerpt.  */
2931 #ifdef USE_AS_MEMMOVE
2935 jae L(memmove_is_memcpy_bwd)
2937 jb L(ll_cache_copy_bwd_start)
2938 L(memmove_is_memcpy_bwd):
2940 L(large_page_bwd_loop):
/* Load the 128 bytes immediately below %rsi, then move %rsi down.  */
2941 movdqu -0x10(%rsi), %xmm0
2942 movdqu -0x20(%rsi), %xmm1
2943 movdqu -0x30(%rsi), %xmm2
2944 movdqu -0x40(%rsi), %xmm3
2945 movdqu -0x50(%rsi), %xmm4
2946 movdqu -0x60(%rsi), %xmm5
2947 movdqu -0x70(%rsi), %xmm6
2948 movdqu -0x80(%rsi), %xmm7
2949 lea -0x80(%rsi), %rsi
/* Stream them to the 128 bytes below %rdi, move %rdi down.  The jae
   consumes flags from an instruction not shown (movntdq and lea do not
   write flags).  */
2952 movntdq %xmm0, -0x10(%rdi)
2953 movntdq %xmm1, -0x20(%rdi)
2954 movntdq %xmm2, -0x30(%rdi)
2955 movntdq %xmm3, -0x40(%rdi)
2956 movntdq %xmm4, -0x50(%rdi)
2957 movntdq %xmm5, -0x60(%rdi)
2958 movntdq %xmm6, -0x70(%rdi)
2959 movntdq %xmm7, -0x80(%rdi)
2960 lea -0x80(%rdi), %rdi
2961 jae L(large_page_bwd_loop)
/* Undo the 0x80 bias without touching flags; skip the 64-byte step on
   signed "less" (comparison not visible).  */
2963 lea 0x80(%rdx), %rdx
2964 jl L(large_page_less_bwd_64bytes)
/* One extra 64-byte step, downward.  */
2966 movdqu -0x10(%rsi), %xmm0
2967 movdqu -0x20(%rsi), %xmm1
2968 movdqu -0x30(%rsi), %xmm2
2969 movdqu -0x40(%rsi), %xmm3
2970 lea -0x40(%rsi), %rsi
2972 movntdq %xmm0, -0x10(%rdi)
2973 movntdq %xmm1, -0x20(%rdi)
2974 movntdq %xmm2, -0x30(%rdi)
2975 movntdq %xmm3, -0x40(%rdi)
2976 lea -0x40(%rdi), %rdi
2978 L(large_page_less_bwd_64bytes):
/* Finish through the small-size jump table, indexed by %rdx (scale 4);
   the macro clobbers %r11 and %rdx.
   NOTE(review): no sfence after the movntdq stores is visible in this
   excerpt; confirm one precedes this dispatch.  */
2980 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* memmove-only backward copy that uses ordinary aligned stores (movaps)
   instead of streaming stores, with software prefetch below the source
   cursor.  Reached via the jb in the backward large-copy entry code.  The
   flag-setting instructions and the matching #endif are not visible in
   this excerpt.  */
2982 #ifdef USE_AS_MEMMOVE
2984 L(ll_cache_copy_bwd_start):
/* Prefetch source lines 0x1c0 and 0x200 bytes below the cursor.  */
2985 prefetcht0 -0x1c0(%rsi)
2986 prefetcht0 -0x200(%rsi)
/* Load the 128 bytes immediately below %rsi, then move %rsi down.  */
2987 movdqu -0x10(%rsi), %xmm0
2988 movdqu -0x20(%rsi), %xmm1
2989 movdqu -0x30(%rsi), %xmm2
2990 movdqu -0x40(%rsi), %xmm3
2991 movdqu -0x50(%rsi), %xmm4
2992 movdqu -0x60(%rsi), %xmm5
2993 movdqu -0x70(%rsi), %xmm6
2994 movdqu -0x80(%rsi), %xmm7
2995 lea -0x80(%rsi), %rsi
/* Aligned stores to the 128 bytes below %rdi (movaps requires 16-byte
   alignment), move %rdi down, and loop while the (not shown) flag-setter
   leaves CF clear.  */
2998 movaps %xmm0, -0x10(%rdi)
2999 movaps %xmm1, -0x20(%rdi)
3000 movaps %xmm2, -0x30(%rdi)
3001 movaps %xmm3, -0x40(%rdi)
3002 movaps %xmm4, -0x50(%rdi)
3003 movaps %xmm5, -0x60(%rdi)
3004 movaps %xmm6, -0x70(%rdi)
3005 movaps %xmm7, -0x80(%rdi)
3006 lea -0x80(%rdi), %rdi
3007 jae L(ll_cache_copy_bwd_start)
/* Add 0x80 back to %rdx without touching flags; skip the 64-byte step on
   signed "less" (comparison not visible).  */
3009 lea 0x80(%rdx), %rdx
3010 jl L(large_page_ll_less_bwd_64bytes)
/* One extra 64-byte step, downward.  */
3012 movdqu -0x10(%rsi), %xmm0
3013 movdqu -0x20(%rsi), %xmm1
3014 movdqu -0x30(%rsi), %xmm2
3015 movdqu -0x40(%rsi), %xmm3
3016 lea -0x40(%rsi), %rsi
3018 movaps %xmm0, -0x10(%rdi)
3019 movaps %xmm1, -0x20(%rdi)
3020 movaps %xmm2, -0x30(%rdi)
3021 movaps %xmm3, -0x40(%rdi)
3022 lea -0x40(%rdi), %rdi
3024 L(large_page_ll_less_bwd_64bytes):
/* Finish through the small-size jump table, indexed by %rdx (scale 4);
   the macro clobbers %r11 and %rdx.  */
3025 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
/* Read-only jump tables live in their own .rodata.ssse3 section.  */
3030 .section .rodata.ssse3,"a",@progbits
/* Dispatch table for the final 0..79 bytes of a copy.  Entry N is the
   32-bit offset of L(write_Nbytes) relative to the table base
   (JMPTBL (I, B) expands to I - B), so the table is position independent.
   BRANCH_TO_JMPTBL_ENTRY sign-extends the selected entry with movslq and
   adds the table address back before the indirect jump.  The entries must
   stay in ascending order of N because the byte count is used directly as
   the index (scale 4).  */
3032 L(table_less_80bytes):
3033 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3108 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3109 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3110 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3111 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3112 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
/* Sixteen entries, one per L(shl_0) .. L(shl_15) routine, each stored as
   a 32-bit offset relative to L(shl_table) (JMPTBL (I, B) is I - B).  The
   L(shl_table) label itself is not visible in this excerpt; presumably it
   sits directly above the first entry.  The table is consumed with
   BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4), so entry order must match
   the index value held in %r9.  */
3116 .int JMPTBL (L(shl_0), L(shl_table))
3117 .int JMPTBL (L(shl_1), L(shl_table))
3118 .int JMPTBL (L(shl_2), L(shl_table))
3119 .int JMPTBL (L(shl_3), L(shl_table))
3120 .int JMPTBL (L(shl_4), L(shl_table))
3121 .int JMPTBL (L(shl_5), L(shl_table))
3122 .int JMPTBL (L(shl_6), L(shl_table))
3123 .int JMPTBL (L(shl_7), L(shl_table))
3124 .int JMPTBL (L(shl_8), L(shl_table))
3125 .int JMPTBL (L(shl_9), L(shl_table))
3126 .int JMPTBL (L(shl_10), L(shl_table))
3127 .int JMPTBL (L(shl_11), L(shl_table))
3128 .int JMPTBL (L(shl_12), L(shl_table))
3129 .int JMPTBL (L(shl_13), L(shl_table))
3130 .int JMPTBL (L(shl_14), L(shl_table))
3131 .int JMPTBL (L(shl_15), L(shl_table))
/* Sixteen entries, one per L(shl_0_bwd) .. L(shl_15_bwd) routine, each
   stored as a 32-bit offset relative to L(shl_table_bwd) (JMPTBL (I, B)
   is I - B).  The L(shl_table_bwd) label and the code that indexes this
   table are not visible in this excerpt; presumably it is the
   backward-copy counterpart of the shl_0 .. shl_15 table -- TODO confirm.
   Entry order is significant because the table is indexed.  */
3135 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3146 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3147 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3148 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3149 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3150 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))