]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/memcpy-ssse3.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / memcpy-ssse3.S
1 /* memcpy with SSSE3
2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 #if IS_IN (libc)
23
24 #include "asm-syntax.h"
25
26 #ifndef MEMCPY
27 # define MEMCPY __memcpy_ssse3
28 # define MEMCPY_CHK __memcpy_chk_ssse3
29 # define MEMPCPY __mempcpy_ssse3
30 # define MEMPCPY_CHK __mempcpy_chk_ssse3
31 #endif
32
/* JMPTBL (I, B) expands to I - B, the distance of label I from base B.
   BRANCH_TO_JMPTBL_ENTRY below adds such a value back to the table
   base to form the branch target.  */
33 #define JMPTBL(I, B) I - B
34
35 /* Branch to an entry in a jump table. TABLE is a jump table with
36 relative offsets. INDEX is a register that contains the index into the
37 jump table. SCALE is the scale of INDEX.
   Clobbers %r11 (loaded with the table address) and overwrites INDEX
   with the absolute address of the selected entry, so INDEX still
   holds the branch target when execution arrives there.  Each entry is
   read as a signed 32-bit offset (movslq).  The ud2 follows an
   unconditional jump.  */
38 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
39 lea TABLE(%rip), %r11; \
40 movslq (%r11, INDEX, SCALE), INDEX; \
41 lea (%r11, INDEX), INDEX; \
42 _CET_NOTRACK jmp *INDEX; \
43 ud2
44
45 .section .text.ssse3,"ax",@progbits
46 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* Checked mempcpy entry: branch to __chk_fail when the fourth argument
   (%rcx, presumably the destination object size -- confirm against the
   C prototype) is below the length in %rdx, compared as unsigned.
   The compare uses the *_LP register names so that on x32, where
   size_t is 32 bits and the upper halves of the registers are
   undefined, only the meaningful bits are compared.  There is no
   return here; on success control presumably continues into the entry
   point that follows (END is defined elsewhere).  */
47 ENTRY (MEMPCPY_CHK)
48 cmp %RDX_LP, %RCX_LP
49 jb HIDDEN_JUMPTARGET (__chk_fail)
50 END (MEMPCPY_CHK)
51
/* mempcpy entry: set the return value %rax = dst + len (%rdi + %rdx),
   then join the common copy code at L(start).  */
52 ENTRY (MEMPCPY)
#ifdef __ILP32__
/* x32: size_t is 32 bits and the upper half of %rdx is undefined on
   entry.  Zero-extend it before it is used as a 64-bit length.  */
mov %edx, %edx
#endif
53 movq %rdi, %rax
54 addq %rdx, %rax
55 jmp L(start)
56 END (MEMPCPY)
57 #endif
58
59 #if !defined USE_AS_BCOPY
/* Checked memcpy entry: branch to __chk_fail when the fourth argument
   (%rcx, presumably the destination object size -- confirm against the
   C prototype) is below the length in %rdx, compared as unsigned.
   The *_LP register names keep the compare correct on x32, where
   size_t is 32 bits and the upper register halves are undefined.
   There is no return here; on success control presumably continues
   into the entry point that follows (END is defined elsewhere).  */
60 ENTRY (MEMCPY_CHK)
61 cmp %RDX_LP, %RCX_LP
62 jb HIDDEN_JUMPTARGET (__chk_fail)
63 END (MEMCPY_CHK)
64 #endif
65
/* Main entry.  SysV AMD64 arguments: %rdi = dst, %rsi = src,
   %rdx = len.  %rax is set to the return value: dst, or dst + len when
   built as mempcpy.  Lengths up to 79 are dispatched straight through
   L(table_less_80bytes); longer copies go to L(80bytesormore).  */
66 ENTRY (MEMCPY)
#ifdef __ILP32__
/* x32: size_t is 32 bits and the upper half of %rdx is undefined on
   entry.  Zero-extend it before any 64-bit use of the length.  */
mov %edx, %edx
#endif
67 mov %rdi, %rax
68 #ifdef USE_AS_MEMPCPY
69 add %rdx, %rax
70 #endif
71
72 #ifdef USE_AS_MEMMOVE
/* memmove direction choice (unsigned pointer compare):
   dst < src -> forward; dst == src -> L(write_0bytes);
   dst > src -> forward only when len <= 79, otherwise backward.  */
73 cmp %rsi, %rdi
74 jb L(copy_forward)
75 je L(write_0bytes)
76 cmp $79, %rdx
77 jbe L(copy_forward)
78 jmp L(copy_backward)
79 L(copy_forward):
80 #endif
81 L(start):
/* lea does not modify flags, so ja still tests the cmp against 79.  */
82 cmp $79, %rdx
83 lea L(table_less_80bytes)(%rip), %r11
84 ja L(80bytesormore)
/* Small copy: fetch the signed 32-bit table entry for this length,
   move both pointers to the end of the region, and jump to table base
   plus entry.  The handlers are outside this view; presumably they
   address the data backwards from these end pointers.  */
85 movslq (%r11, %rdx, 4), %r9
86 add %rdx, %rsi
87 add %rdx, %rdi
88 add %r11, %r9
89 _CET_NOTRACK jmp *%r9
90 ud2
91
/* L(80bytesormore): forward set-up for copies of 80 bytes or more.
   Saves the first 16 source bytes in %xmm0 and the original dst in
   %r8, rounds %rdi up to the next 16-byte boundary, and then picks a
   copy loop from the remaining length and the source alignment.  */
92 .p2align 4
93 L(80bytesormore):
94 #ifndef USE_AS_MEMMOVE
/* Signed compare of the low bytes of src and dst only; when
   src.low <= dst.low the backward path is taken.
   NOTE(review): the rationale for this heuristic is not stated in the
   file -- confirm before relying on it.  */
95 cmp %dil, %sil
96 jle L(copy_backward)
97 #endif
98
/* Head: keep the first 16 bytes (unaligned load) for a later store
   through %r8.  */
99 movdqu (%rsi), %xmm0
100 mov %rdi, %rcx
/* %rdi = first 16-byte boundary strictly above the original dst.  */
101 and $-16, %rdi
102 add $16, %rdi
103 mov %rcx, %r8
/* %rcx = original dst - aligned dst, a value in [-16, -1].  Adding it
   to %rdx and subtracting it from %rsi skips the head bytes.  */
104 sub %rdi, %rcx
105 add %rcx, %rdx
106 sub %rcx, %rsi
107
108 #ifdef SHARED_CACHE_SIZE_HALF
109 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
110 #else
111 mov __x86_shared_cache_size_half(%rip), %RCX_LP
112 #endif
/* Remaining length above half the shared cache size goes to
   L(large_page_fwd).  The mov does not modify flags.  */
113 cmp %rcx, %rdx
114 mov %rsi, %r9
115 ja L(large_page_fwd)
/* %r9 = source offset within its 16-byte block; 0 means source and
   destination are both aligned.  */
116 and $0xf, %r9
117 jz L(shl_0)
/* %rcx = half the data cache size, used as a threshold by the branch
   targets.  */
118 #ifdef DATA_CACHE_SIZE_HALF
119 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
120 #else
121 mov __x86_data_cache_size_half(%rip), %RCX_LP
122 #endif
123 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
124
/* L(copy_backward): backward set-up.  Saves the last 16 source bytes
   in %xmm0 and their destination address (dst + len - 16) in %r8,
   moves both pointers to the end of the region, rounds the destination
   end down to a 16-byte boundary, and then picks a backward loop from
   the remaining length and the source alignment.  */
125 .p2align 4
126 L(copy_backward):
127 movdqu -16(%rsi, %rdx), %xmm0
128 add %rdx, %rsi
129 lea -16(%rdi, %rdx), %r8
130 add %rdx, %rdi
131
/* %rcx = low four bits of the destination end.  The xor clears them in
   %rdi; the same count is removed from the length and the source end.  */
132 mov %rdi, %rcx
133 and $0xf, %rcx
134 xor %rcx, %rdi
135 sub %rcx, %rdx
136 sub %rcx, %rsi
137
138 #ifdef SHARED_CACHE_SIZE_HALF
139 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
140 #else
141 mov __x86_shared_cache_size_half(%rip), %RCX_LP
142 #endif
143
/* Remaining length above half the shared cache size goes to
   L(large_page_bwd).  The mov does not modify flags.  */
144 cmp %rcx, %rdx
145 mov %rsi, %r9
146 ja L(large_page_bwd)
/* %r9 = source-end offset within its 16-byte block; 0 means aligned.  */
147 and $0xf, %r9
148 jz L(shl_0_bwd)
/* %rcx = half the data cache size, used as a threshold by the branch
   targets.  */
149 #ifdef DATA_CACHE_SIZE_HALF
150 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
151 #else
152 mov __x86_data_cache_size_half(%rip), %RCX_LP
153 #endif
154 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
155
/* L(shl_0): forward copy with source and destination both 16-byte
   aligned (reached from L(80bytesormore) when (%rsi & 15) == 0).
   %rdx = remaining length; %xmm0/%r8 = saved head bytes and the
   address they are stored to.  */
156 .p2align 4
157 L(shl_0):
/* Copy one aligned 16-byte block.  */
158 sub $16, %rdx
159 movdqa (%rsi), %xmm1
160 add $16, %rsi
161 movdqa %xmm1, (%rdi)
162 add $16, %rdi
/* Store the saved head.  movdqu does not modify flags, so ja still
   tests the cmp against 128.  */
163 cmp $128, %rdx
164 movdqu %xmm0, (%r8)
165 ja L(shl_0_gobble)
166 cmp $64, %rdx
167 jb L(shl_0_less_64bytes)
/* 64..128 bytes left: copy one 64-byte chunk.  */
168 movaps (%rsi), %xmm4
169 movaps 16(%rsi), %xmm1
170 movaps 32(%rsi), %xmm2
171 movaps 48(%rsi), %xmm3
172 movaps %xmm4, (%rdi)
173 movaps %xmm1, 16(%rdi)
174 movaps %xmm2, 32(%rdi)
175 movaps %xmm3, 48(%rdi)
176 sub $64, %rdx
177 add $64, %rsi
178 add $64, %rdi
179 L(shl_0_less_64bytes):
/* Move both pointers to the end of the region and dispatch on the
   remaining count.  */
180 add %rdx, %rsi
181 add %rdx, %rdi
182 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
183
/* L(shl_0_gobble): aligned forward copy with more than 128 bytes left
   (entered from L(shl_0)).  Lengths at or above half the data cache
   size use L(shl_0_gobble_mem_loop); smaller ones use the loop below,
   which moves 128 bytes per pass.  */
184 .p2align 4
185 L(shl_0_gobble):
186 #ifdef DATA_CACHE_SIZE_HALF
187 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
188 #else
189 cmp __x86_data_cache_size_half(%rip), %RDX_LP
190 #endif
/* Bias the count by -128.  lea does not modify flags, so jae still
   tests the cache-size compare above.  */
191 lea -128(%rdx), %rdx
192 jae L(shl_0_gobble_mem_loop)
193 L(shl_0_gobble_cache_loop):
194 movdqa (%rsi), %xmm4
195 movaps 0x10(%rsi), %xmm1
196 movaps 0x20(%rsi), %xmm2
197 movaps 0x30(%rsi), %xmm3
198
199 movdqa %xmm4, (%rdi)
200 movaps %xmm1, 0x10(%rdi)
201 movaps %xmm2, 0x20(%rdi)
202 movaps %xmm3, 0x30(%rdi)
203
/* The flags from this sub are consumed by the jae at the loop bottom;
   the moves and lea in between do not modify them.  */
204 sub $128, %rdx
205 movaps 0x40(%rsi), %xmm4
206 movaps 0x50(%rsi), %xmm5
207 movaps 0x60(%rsi), %xmm6
208 movaps 0x70(%rsi), %xmm7
209 lea 0x80(%rsi), %rsi
210 movaps %xmm4, 0x40(%rdi)
211 movaps %xmm5, 0x50(%rdi)
212 movaps %xmm6, 0x60(%rdi)
213 movaps %xmm7, 0x70(%rdi)
214 lea 0x80(%rdi), %rdi
215
216 jae L(shl_0_gobble_cache_loop)
/* %rdx is now (bytes left - 128).  Compare against -64 (signed) to
   test whether fewer than 64 bytes are left, then remove the bias.
   lea keeps the flags for jl.  */
217 cmp $-0x40, %rdx
218 lea 0x80(%rdx), %rdx
219 jl L(shl_0_cache_less_64bytes)
220
/* At least 64 bytes left: copy one more 64-byte chunk.  */
221 movdqa (%rsi), %xmm4
222 sub $0x40, %rdx
223 movdqa 0x10(%rsi), %xmm1
224
225 movdqa %xmm4, (%rdi)
226 movdqa %xmm1, 0x10(%rdi)
227
228 movdqa 0x20(%rsi), %xmm4
229 movdqa 0x30(%rsi), %xmm1
230 add $0x40, %rsi
231
232 movdqa %xmm4, 0x20(%rdi)
233 movdqa %xmm1, 0x30(%rdi)
234 add $0x40, %rdi
235 L(shl_0_cache_less_64bytes):
/* Move both pointers to the end of the region and dispatch on the
   remaining count.  */
236 add %rdx, %rsi
237 add %rdx, %rdi
238 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
239
/* L(shl_0_gobble_mem_loop): aligned forward copy, 128 bytes per pass,
   with two prefetcht0 hints ahead of the source pointer.  Entered from
   L(shl_0_gobble) with %rdx already biased by -128.  */
240 .p2align 4
241 L(shl_0_gobble_mem_loop):
242 prefetcht0 0x1c0(%rsi)
243 prefetcht0 0x280(%rsi)
244
245 movdqa (%rsi), %xmm0
246 movdqa 0x10(%rsi), %xmm1
247 movdqa 0x20(%rsi), %xmm2
248 movdqa 0x30(%rsi), %xmm3
249 movdqa 0x40(%rsi), %xmm4
250 movdqa 0x50(%rsi), %xmm5
251 movdqa 0x60(%rsi), %xmm6
252 movdqa 0x70(%rsi), %xmm7
253 lea 0x80(%rsi), %rsi
/* The flags from this sub are consumed by the jae at the loop bottom;
   the stores and lea in between do not modify them.  */
254 sub $0x80, %rdx
255 movdqa %xmm0, (%rdi)
256 movdqa %xmm1, 0x10(%rdi)
257 movdqa %xmm2, 0x20(%rdi)
258 movdqa %xmm3, 0x30(%rdi)
259 movdqa %xmm4, 0x40(%rdi)
260 movdqa %xmm5, 0x50(%rdi)
261 movdqa %xmm6, 0x60(%rdi)
262 movdqa %xmm7, 0x70(%rdi)
263 lea 0x80(%rdi), %rdi
264
265 jae L(shl_0_gobble_mem_loop)
/* %rdx is (bytes left - 128): test for fewer than 64 left, then
   remove the bias.  lea keeps the flags for jl.  */
266 cmp $-0x40, %rdx
267 lea 0x80(%rdx), %rdx
268 jl L(shl_0_mem_less_64bytes)
269
/* At least 64 bytes left: copy one 64-byte chunk.  */
270 movdqa (%rsi), %xmm0
271 sub $0x40, %rdx
272 movdqa 0x10(%rsi), %xmm1
273
274 movdqa %xmm0, (%rdi)
275 movdqa %xmm1, 0x10(%rdi)
276
277 movdqa 0x20(%rsi), %xmm0
278 movdqa 0x30(%rsi), %xmm1
279 add $0x40, %rsi
280
281 movdqa %xmm0, 0x20(%rdi)
282 movdqa %xmm1, 0x30(%rdi)
283 add $0x40, %rdi
284 L(shl_0_mem_less_64bytes):
/* At least 32 bytes left: copy one 32-byte chunk.  */
285 cmp $0x20, %rdx
286 jb L(shl_0_mem_less_32bytes)
287 movdqa (%rsi), %xmm0
288 sub $0x20, %rdx
289 movdqa 0x10(%rsi), %xmm1
290 add $0x20, %rsi
291 movdqa %xmm0, (%rdi)
292 movdqa %xmm1, 0x10(%rdi)
293 add $0x20, %rdi
294 L(shl_0_mem_less_32bytes):
/* Move both pointers to the end of the region and dispatch on the
   remaining count.  */
295 add %rdx, %rdi
296 add %rdx, %rsi
297 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
298
/* L(shl_0_bwd): backward copy with source end and destination end both
   16-byte aligned (reached from L(copy_backward) when
   (%rsi & 15) == 0).  %rsi/%rdi point just past the bytes still to be
   copied and move downwards.  %xmm0/%r8 = saved tail bytes and the
   address they are stored to.  */
299 .p2align 4
300 L(shl_0_bwd):
/* Copy one aligned 16-byte block below the current end.  */
301 sub $16, %rdx
302 movdqa -0x10(%rsi), %xmm1
303 sub $16, %rsi
304 movdqa %xmm1, -0x10(%rdi)
305 sub $16, %rdi
/* Store the saved tail.  movdqu does not modify flags, so ja still
   tests the cmp against 0x80.  */
306 cmp $0x80, %rdx
307 movdqu %xmm0, (%r8)
308 ja L(shl_0_gobble_bwd)
309 cmp $64, %rdx
310 jb L(shl_0_less_64bytes_bwd)
/* 64..128 bytes left: copy one 64-byte chunk.  */
311 movaps -0x10(%rsi), %xmm0
312 movaps -0x20(%rsi), %xmm1
313 movaps -0x30(%rsi), %xmm2
314 movaps -0x40(%rsi), %xmm3
315 movaps %xmm0, -0x10(%rdi)
316 movaps %xmm1, -0x20(%rdi)
317 movaps %xmm2, -0x30(%rdi)
318 movaps %xmm3, -0x40(%rdi)
319 sub $64, %rdx
320 sub $0x40, %rsi
321 sub $0x40, %rdi
322 L(shl_0_less_64bytes_bwd):
/* The pointers already sit at the end of the remaining bytes, so no
   adjustment is made before dispatching on the count.  */
323 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
324
/* L(shl_0_gobble_bwd): aligned backward copy with more than 128 bytes
   left (entered from L(shl_0_bwd)).  Lengths at or above half the data
   cache size use L(shl_0_gobble_mem_bwd_loop); smaller ones use the
   loop below, which moves 128 bytes per pass.  */
325 .p2align 4
326 L(shl_0_gobble_bwd):
327 #ifdef DATA_CACHE_SIZE_HALF
328 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
329 #else
330 cmp __x86_data_cache_size_half(%rip), %RDX_LP
331 #endif
/* Bias the count by -128.  lea does not modify flags, so jae still
   tests the cache-size compare above.  */
332 lea -128(%rdx), %rdx
333 jae L(shl_0_gobble_mem_bwd_loop)
334 L(shl_0_gobble_bwd_loop):
335 movdqa -0x10(%rsi), %xmm0
336 movaps -0x20(%rsi), %xmm1
337 movaps -0x30(%rsi), %xmm2
338 movaps -0x40(%rsi), %xmm3
339
340 movdqa %xmm0, -0x10(%rdi)
341 movaps %xmm1, -0x20(%rdi)
342 movaps %xmm2, -0x30(%rdi)
343 movaps %xmm3, -0x40(%rdi)
344
/* The flags from this sub are consumed by the jae at the loop bottom;
   the moves and lea in between do not modify them.  */
345 sub $0x80, %rdx
346 movaps -0x50(%rsi), %xmm4
347 movaps -0x60(%rsi), %xmm5
348 movaps -0x70(%rsi), %xmm6
349 movaps -0x80(%rsi), %xmm7
350 lea -0x80(%rsi), %rsi
351 movaps %xmm4, -0x50(%rdi)
352 movaps %xmm5, -0x60(%rdi)
353 movaps %xmm6, -0x70(%rdi)
354 movaps %xmm7, -0x80(%rdi)
355 lea -0x80(%rdi), %rdi
356
357 jae L(shl_0_gobble_bwd_loop)
/* %rdx is (bytes left - 128): test for fewer than 64 left, then
   remove the bias.  lea keeps the flags for jl.  */
358 cmp $-0x40, %rdx
359 lea 0x80(%rdx), %rdx
360 jl L(shl_0_gobble_bwd_less_64bytes)
361
/* At least 64 bytes left: copy one more 64-byte chunk.  */
362 movdqa -0x10(%rsi), %xmm0
363 sub $0x40, %rdx
364 movdqa -0x20(%rsi), %xmm1
365
366 movdqa %xmm0, -0x10(%rdi)
367 movdqa %xmm1, -0x20(%rdi)
368
369 movdqa -0x30(%rsi), %xmm0
370 movdqa -0x40(%rsi), %xmm1
371 sub $0x40, %rsi
372
373 movdqa %xmm0, -0x30(%rdi)
374 movdqa %xmm1, -0x40(%rdi)
375 sub $0x40, %rdi
376 L(shl_0_gobble_bwd_less_64bytes):
/* Pointers already sit at the end of the remaining bytes; dispatch on
   the count.  */
377 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
378
/* L(shl_0_gobble_mem_bwd_loop): aligned backward copy, 128 bytes per
   pass, with two prefetcht0 hints below the source pointer.  Entered
   from L(shl_0_gobble_bwd) with %rdx already biased by -128.  */
379 .p2align 4
380 L(shl_0_gobble_mem_bwd_loop):
381 prefetcht0 -0x1c0(%rsi)
382 prefetcht0 -0x280(%rsi)
383 movdqa -0x10(%rsi), %xmm0
384 movdqa -0x20(%rsi), %xmm1
385 movdqa -0x30(%rsi), %xmm2
386 movdqa -0x40(%rsi), %xmm3
387 movdqa -0x50(%rsi), %xmm4
388 movdqa -0x60(%rsi), %xmm5
389 movdqa -0x70(%rsi), %xmm6
390 movdqa -0x80(%rsi), %xmm7
391 lea -0x80(%rsi), %rsi
/* The flags from this sub are consumed by the jae at the loop bottom;
   the stores and lea in between do not modify them.  */
392 sub $0x80, %rdx
393 movdqa %xmm0, -0x10(%rdi)
394 movdqa %xmm1, -0x20(%rdi)
395 movdqa %xmm2, -0x30(%rdi)
396 movdqa %xmm3, -0x40(%rdi)
397 movdqa %xmm4, -0x50(%rdi)
398 movdqa %xmm5, -0x60(%rdi)
399 movdqa %xmm6, -0x70(%rdi)
400 movdqa %xmm7, -0x80(%rdi)
401 lea -0x80(%rdi), %rdi
402
403 jae L(shl_0_gobble_mem_bwd_loop)
/* %rdx is (bytes left - 128): test for fewer than 64 left, then
   remove the bias.  lea keeps the flags for jl.  */
404 cmp $-0x40, %rdx
405 lea 0x80(%rdx), %rdx
406 jl L(shl_0_mem_bwd_less_64bytes)
407
/* At least 64 bytes left: copy one 64-byte chunk.  */
408 movdqa -0x10(%rsi), %xmm0
409 sub $0x40, %rdx
410 movdqa -0x20(%rsi), %xmm1
411
412 movdqa %xmm0, -0x10(%rdi)
413 movdqa %xmm1, -0x20(%rdi)
414
415 movdqa -0x30(%rsi), %xmm0
416 movdqa -0x40(%rsi), %xmm1
417 sub $0x40, %rsi
418
419 movdqa %xmm0, -0x30(%rdi)
420 movdqa %xmm1, -0x40(%rdi)
421 sub $0x40, %rdi
422 L(shl_0_mem_bwd_less_64bytes):
/* At least 32 bytes left: copy one 32-byte chunk.  */
423 cmp $0x20, %rdx
424 jb L(shl_0_mem_bwd_less_32bytes)
425 movdqa -0x10(%rsi), %xmm0
426 sub $0x20, %rdx
427 movdqa -0x20(%rsi), %xmm1
428 sub $0x20, %rsi
429 movdqa %xmm0, -0x10(%rdi)
430 movdqa %xmm1, -0x20(%rdi)
431 sub $0x20, %rdi
432 L(shl_0_mem_bwd_less_32bytes):
/* Pointers already sit at the end of the remaining bytes; dispatch on
   the count.  */
433 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
434
/* L(shl_1): forward copy, 64 bytes per pass, for a source that is
   1 byte past a 16-byte boundary (assumes (%rsi & 15) == 1; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_1), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
435 .p2align 4
436 L(shl_1):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  %xmm1 = aligned block holding the current source byte.  */
437 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
438 cmp %rcx, %rdx
439 movaps -0x01(%rsi), %xmm1
440 jb L(L1_fwd)
441 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
442 L(L1_fwd):
/* Bias the count by -64; lea does not modify flags.  */
443 lea -64(%rdx), %rdx
444 _CET_NOTRACK jmp *%r9
445 ud2
446 L(shl_1_loop_L2):
447 prefetchnta 0x1c0(%rsi)
448 L(shl_1_loop_L1):
449 sub $64, %rdx
450 movaps 0x0f(%rsi), %xmm2
451 movaps 0x1f(%rsi), %xmm3
452 movaps 0x2f(%rsi), %xmm4
453 movaps 0x3f(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 1 byte into the lower one.  */
454 movdqa %xmm5, %xmm6
455 palignr $1, %xmm4, %xmm5
456 lea 64(%rsi), %rsi
457 palignr $1, %xmm3, %xmm4
458 palignr $1, %xmm2, %xmm3
459 lea 64(%rdi), %rdi
460 palignr $1, %xmm1, %xmm2
461 movdqa %xmm6, %xmm1
462 movdqa %xmm2, -0x40(%rdi)
463 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
464 jb L(shl_1_end)
465 movaps %xmm4, -0x20(%rdi)
466 movaps %xmm5, -0x10(%rdi)
467 _CET_NOTRACK jmp *%r9
468 ud2
469 L(shl_1_end):
470 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
471 lea 64(%rdx), %rdx
472 movaps %xmm5, -0x10(%rdi)
473 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
474 movdqu %xmm0, (%r8)
475 add %rdx, %rsi
476 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
477
/* L(shl_1_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 1 byte past a 16-byte
   boundary (assumes (%rsi & 15) == 1; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_1_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
478 .p2align 4
479 L(shl_1_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
480 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
481 cmp %rcx, %rdx
482 movaps -0x01(%rsi), %xmm1
483 jb L(L1_bwd)
484 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
485 L(L1_bwd):
/* Bias the count by -64; lea does not modify flags.  */
486 lea -64(%rdx), %rdx
487 _CET_NOTRACK jmp *%r9
488 ud2
489 L(shl_1_bwd_loop_L2):
490 prefetchnta -0x1c0(%rsi)
491 L(shl_1_bwd_loop_L1):
492 movaps -0x11(%rsi), %xmm2
493 sub $0x40, %rdx
494 movaps -0x21(%rsi), %xmm3
495 movaps -0x31(%rsi), %xmm4
496 movaps -0x41(%rsi), %xmm5
497 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 1 byte into the lower one.  */
498 palignr $1, %xmm2, %xmm1
499 palignr $1, %xmm3, %xmm2
500 palignr $1, %xmm4, %xmm3
501 palignr $1, %xmm5, %xmm4
502
503 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
504 movaps %xmm5, %xmm1
505
506 movaps %xmm2, -0x20(%rdi)
507 lea -0x40(%rdi), %rdi
508
509 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
510 jb L(shl_1_bwd_end)
511 movaps %xmm4, (%rdi)
512 _CET_NOTRACK jmp *%r9
513 ud2
514 L(shl_1_bwd_end):
515 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
516 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
517 movdqu %xmm0, (%r8)
518 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
519
/* L(shl_2): forward copy, 64 bytes per pass, for a source that is
   2 bytes past a 16-byte boundary (assumes (%rsi & 15) == 2; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_2), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
520 .p2align 4
521 L(shl_2):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
522 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
523 cmp %rcx, %rdx
524 movaps -0x02(%rsi), %xmm1
525 jb L(L2_fwd)
526 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
527 L(L2_fwd):
/* Bias the count by -64; lea does not modify flags.  */
528 lea -64(%rdx), %rdx
529 _CET_NOTRACK jmp *%r9
530 ud2
531 L(shl_2_loop_L2):
532 prefetchnta 0x1c0(%rsi)
533 L(shl_2_loop_L1):
534 sub $64, %rdx
535 movaps 0x0e(%rsi), %xmm2
536 movaps 0x1e(%rsi), %xmm3
537 movaps 0x2e(%rsi), %xmm4
538 movaps 0x3e(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 2 bytes into the lower one.  */
539 movdqa %xmm5, %xmm6
540 palignr $2, %xmm4, %xmm5
541 lea 64(%rsi), %rsi
542 palignr $2, %xmm3, %xmm4
543 palignr $2, %xmm2, %xmm3
544 lea 64(%rdi), %rdi
545 palignr $2, %xmm1, %xmm2
546 movdqa %xmm6, %xmm1
547 movdqa %xmm2, -0x40(%rdi)
548 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
549 jb L(shl_2_end)
550 movaps %xmm4, -0x20(%rdi)
551 movaps %xmm5, -0x10(%rdi)
552 _CET_NOTRACK jmp *%r9
553 ud2
554 L(shl_2_end):
555 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
556 lea 64(%rdx), %rdx
557 movaps %xmm5, -0x10(%rdi)
558 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
559 movdqu %xmm0, (%r8)
560 add %rdx, %rsi
561 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
562
/* L(shl_2_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 2 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 2; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_2_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
563 .p2align 4
564 L(shl_2_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
565 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
566 cmp %rcx, %rdx
567 movaps -0x02(%rsi), %xmm1
568 jb L(L2_bwd)
569 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
570 L(L2_bwd):
/* Bias the count by -64; lea does not modify flags.  */
571 lea -64(%rdx), %rdx
572 _CET_NOTRACK jmp *%r9
573 ud2
574 L(shl_2_bwd_loop_L2):
575 prefetchnta -0x1c0(%rsi)
576 L(shl_2_bwd_loop_L1):
577 movaps -0x12(%rsi), %xmm2
578 sub $0x40, %rdx
579 movaps -0x22(%rsi), %xmm3
580 movaps -0x32(%rsi), %xmm4
581 movaps -0x42(%rsi), %xmm5
582 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 2 bytes into the lower one.  */
583 palignr $2, %xmm2, %xmm1
584 palignr $2, %xmm3, %xmm2
585 palignr $2, %xmm4, %xmm3
586 palignr $2, %xmm5, %xmm4
587
588 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
589 movaps %xmm5, %xmm1
590
591 movaps %xmm2, -0x20(%rdi)
592 lea -0x40(%rdi), %rdi
593
594 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
595 jb L(shl_2_bwd_end)
596 movaps %xmm4, (%rdi)
597 _CET_NOTRACK jmp *%r9
598 ud2
599 L(shl_2_bwd_end):
600 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
601 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
602 movdqu %xmm0, (%r8)
603 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
604
/* L(shl_3): forward copy, 64 bytes per pass, for a source that is
   3 bytes past a 16-byte boundary (assumes (%rsi & 15) == 3; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_3), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
605 .p2align 4
606 L(shl_3):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
607 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
608 cmp %rcx, %rdx
609 movaps -0x03(%rsi), %xmm1
610 jb L(L3_fwd)
611 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
612 L(L3_fwd):
/* Bias the count by -64; lea does not modify flags.  */
613 lea -64(%rdx), %rdx
614 _CET_NOTRACK jmp *%r9
615 ud2
616 L(shl_3_loop_L2):
617 prefetchnta 0x1c0(%rsi)
618 L(shl_3_loop_L1):
619 sub $64, %rdx
620 movaps 0x0d(%rsi), %xmm2
621 movaps 0x1d(%rsi), %xmm3
622 movaps 0x2d(%rsi), %xmm4
623 movaps 0x3d(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 3 bytes into the lower one.  */
624 movdqa %xmm5, %xmm6
625 palignr $3, %xmm4, %xmm5
626 lea 64(%rsi), %rsi
627 palignr $3, %xmm3, %xmm4
628 palignr $3, %xmm2, %xmm3
629 lea 64(%rdi), %rdi
630 palignr $3, %xmm1, %xmm2
631 movdqa %xmm6, %xmm1
632 movdqa %xmm2, -0x40(%rdi)
633 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
634 jb L(shl_3_end)
635 movaps %xmm4, -0x20(%rdi)
636 movaps %xmm5, -0x10(%rdi)
637 _CET_NOTRACK jmp *%r9
638 ud2
639 L(shl_3_end):
640 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
641 lea 64(%rdx), %rdx
642 movaps %xmm5, -0x10(%rdi)
643 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
644 movdqu %xmm0, (%r8)
645 add %rdx, %rsi
646 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
647
/* L(shl_3_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 3 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 3; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_3_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
648 .p2align 4
649 L(shl_3_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
650 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
651 cmp %rcx, %rdx
652 movaps -0x03(%rsi), %xmm1
653 jb L(L3_bwd)
654 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
655 L(L3_bwd):
/* Bias the count by -64; lea does not modify flags.  */
656 lea -64(%rdx), %rdx
657 _CET_NOTRACK jmp *%r9
658 ud2
659 L(shl_3_bwd_loop_L2):
660 prefetchnta -0x1c0(%rsi)
661 L(shl_3_bwd_loop_L1):
662 movaps -0x13(%rsi), %xmm2
663 sub $0x40, %rdx
664 movaps -0x23(%rsi), %xmm3
665 movaps -0x33(%rsi), %xmm4
666 movaps -0x43(%rsi), %xmm5
667 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 3 bytes into the lower one.  */
668 palignr $3, %xmm2, %xmm1
669 palignr $3, %xmm3, %xmm2
670 palignr $3, %xmm4, %xmm3
671 palignr $3, %xmm5, %xmm4
672
673 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
674 movaps %xmm5, %xmm1
675
676 movaps %xmm2, -0x20(%rdi)
677 lea -0x40(%rdi), %rdi
678
679 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
680 jb L(shl_3_bwd_end)
681 movaps %xmm4, (%rdi)
682 _CET_NOTRACK jmp *%r9
683 ud2
684 L(shl_3_bwd_end):
685 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
686 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
687 movdqu %xmm0, (%r8)
688 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
689
/* L(shl_4): forward copy, 64 bytes per pass, for a source that is
   4 bytes past a 16-byte boundary (assumes (%rsi & 15) == 4; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_4), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
690 .p2align 4
691 L(shl_4):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
692 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
693 cmp %rcx, %rdx
694 movaps -0x04(%rsi), %xmm1
695 jb L(L4_fwd)
696 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
697 L(L4_fwd):
/* Bias the count by -64; lea does not modify flags.  */
698 lea -64(%rdx), %rdx
699 _CET_NOTRACK jmp *%r9
700 ud2
701 L(shl_4_loop_L2):
702 prefetchnta 0x1c0(%rsi)
703 L(shl_4_loop_L1):
704 sub $64, %rdx
705 movaps 0x0c(%rsi), %xmm2
706 movaps 0x1c(%rsi), %xmm3
707 movaps 0x2c(%rsi), %xmm4
708 movaps 0x3c(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 4 bytes into the lower one.  */
709 movdqa %xmm5, %xmm6
710 palignr $4, %xmm4, %xmm5
711 lea 64(%rsi), %rsi
712 palignr $4, %xmm3, %xmm4
713 palignr $4, %xmm2, %xmm3
714 lea 64(%rdi), %rdi
715 palignr $4, %xmm1, %xmm2
716 movdqa %xmm6, %xmm1
717 movdqa %xmm2, -0x40(%rdi)
718 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
719 jb L(shl_4_end)
720 movaps %xmm4, -0x20(%rdi)
721 movaps %xmm5, -0x10(%rdi)
722 _CET_NOTRACK jmp *%r9
723 ud2
724 L(shl_4_end):
725 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
726 lea 64(%rdx), %rdx
727 movaps %xmm5, -0x10(%rdi)
728 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
729 movdqu %xmm0, (%r8)
730 add %rdx, %rsi
731 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
732
/* L(shl_4_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 4 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 4; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_4_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
733 .p2align 4
734 L(shl_4_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
735 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
736 cmp %rcx, %rdx
737 movaps -0x04(%rsi), %xmm1
738 jb L(L4_bwd)
739 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
740 L(L4_bwd):
/* Bias the count by -64; lea does not modify flags.  */
741 lea -64(%rdx), %rdx
742 _CET_NOTRACK jmp *%r9
743 ud2
744 L(shl_4_bwd_loop_L2):
745 prefetchnta -0x1c0(%rsi)
746 L(shl_4_bwd_loop_L1):
747 movaps -0x14(%rsi), %xmm2
748 sub $0x40, %rdx
749 movaps -0x24(%rsi), %xmm3
750 movaps -0x34(%rsi), %xmm4
751 movaps -0x44(%rsi), %xmm5
752 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 4 bytes into the lower one.  */
753 palignr $4, %xmm2, %xmm1
754 palignr $4, %xmm3, %xmm2
755 palignr $4, %xmm4, %xmm3
756 palignr $4, %xmm5, %xmm4
757
758 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
759 movaps %xmm5, %xmm1
760
761 movaps %xmm2, -0x20(%rdi)
762 lea -0x40(%rdi), %rdi
763
764 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
765 jb L(shl_4_bwd_end)
766 movaps %xmm4, (%rdi)
767 _CET_NOTRACK jmp *%r9
768 ud2
769 L(shl_4_bwd_end):
770 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
771 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
772 movdqu %xmm0, (%r8)
773 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
774
/* L(shl_5): forward copy, 64 bytes per pass, for a source that is
   5 bytes past a 16-byte boundary (assumes (%rsi & 15) == 5; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_5), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
775 .p2align 4
776 L(shl_5):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
777 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
778 cmp %rcx, %rdx
779 movaps -0x05(%rsi), %xmm1
780 jb L(L5_fwd)
781 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
782 L(L5_fwd):
/* Bias the count by -64; lea does not modify flags.  */
783 lea -64(%rdx), %rdx
784 _CET_NOTRACK jmp *%r9
785 ud2
786 L(shl_5_loop_L2):
787 prefetchnta 0x1c0(%rsi)
788 L(shl_5_loop_L1):
789 sub $64, %rdx
790 movaps 0x0b(%rsi), %xmm2
791 movaps 0x1b(%rsi), %xmm3
792 movaps 0x2b(%rsi), %xmm4
793 movaps 0x3b(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 5 bytes into the lower one.  */
794 movdqa %xmm5, %xmm6
795 palignr $5, %xmm4, %xmm5
796 lea 64(%rsi), %rsi
797 palignr $5, %xmm3, %xmm4
798 palignr $5, %xmm2, %xmm3
799 lea 64(%rdi), %rdi
800 palignr $5, %xmm1, %xmm2
801 movdqa %xmm6, %xmm1
802 movdqa %xmm2, -0x40(%rdi)
803 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
804 jb L(shl_5_end)
805 movaps %xmm4, -0x20(%rdi)
806 movaps %xmm5, -0x10(%rdi)
807 _CET_NOTRACK jmp *%r9
808 ud2
809 L(shl_5_end):
810 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
811 lea 64(%rdx), %rdx
812 movaps %xmm5, -0x10(%rdi)
813 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
814 movdqu %xmm0, (%r8)
815 add %rdx, %rsi
816 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
817
/* L(shl_5_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 5 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 5; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_5_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
818 .p2align 4
819 L(shl_5_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
820 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
821 cmp %rcx, %rdx
822 movaps -0x05(%rsi), %xmm1
823 jb L(L5_bwd)
824 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
825 L(L5_bwd):
/* Bias the count by -64; lea does not modify flags.  */
826 lea -64(%rdx), %rdx
827 _CET_NOTRACK jmp *%r9
828 ud2
829 L(shl_5_bwd_loop_L2):
830 prefetchnta -0x1c0(%rsi)
831 L(shl_5_bwd_loop_L1):
832 movaps -0x15(%rsi), %xmm2
833 sub $0x40, %rdx
834 movaps -0x25(%rsi), %xmm3
835 movaps -0x35(%rsi), %xmm4
836 movaps -0x45(%rsi), %xmm5
837 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 5 bytes into the lower one.  */
838 palignr $5, %xmm2, %xmm1
839 palignr $5, %xmm3, %xmm2
840 palignr $5, %xmm4, %xmm3
841 palignr $5, %xmm5, %xmm4
842
843 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
844 movaps %xmm5, %xmm1
845
846 movaps %xmm2, -0x20(%rdi)
847 lea -0x40(%rdi), %rdi
848
849 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
850 jb L(shl_5_bwd_end)
851 movaps %xmm4, (%rdi)
852 _CET_NOTRACK jmp *%r9
853 ud2
854 L(shl_5_bwd_end):
855 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
856 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
857 movdqu %xmm0, (%r8)
858 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
859
/* L(shl_6): forward copy, 64 bytes per pass, for a source that is
   6 bytes past a 16-byte boundary (assumes (%rsi & 15) == 6; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_6), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
860 .p2align 4
861 L(shl_6):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
862 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
863 cmp %rcx, %rdx
864 movaps -0x06(%rsi), %xmm1
865 jb L(L6_fwd)
866 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
867 L(L6_fwd):
/* Bias the count by -64; lea does not modify flags.  */
868 lea -64(%rdx), %rdx
869 _CET_NOTRACK jmp *%r9
870 ud2
871 L(shl_6_loop_L2):
872 prefetchnta 0x1c0(%rsi)
873 L(shl_6_loop_L1):
874 sub $64, %rdx
875 movaps 0x0a(%rsi), %xmm2
876 movaps 0x1a(%rsi), %xmm3
877 movaps 0x2a(%rsi), %xmm4
878 movaps 0x3a(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 6 bytes into the lower one.  */
879 movdqa %xmm5, %xmm6
880 palignr $6, %xmm4, %xmm5
881 lea 64(%rsi), %rsi
882 palignr $6, %xmm3, %xmm4
883 palignr $6, %xmm2, %xmm3
884 lea 64(%rdi), %rdi
885 palignr $6, %xmm1, %xmm2
886 movdqa %xmm6, %xmm1
887 movdqa %xmm2, -0x40(%rdi)
888 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
889 jb L(shl_6_end)
890 movaps %xmm4, -0x20(%rdi)
891 movaps %xmm5, -0x10(%rdi)
892 _CET_NOTRACK jmp *%r9
893 ud2
894 L(shl_6_end):
895 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
896 lea 64(%rdx), %rdx
897 movaps %xmm5, -0x10(%rdi)
898 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
899 movdqu %xmm0, (%r8)
900 add %rdx, %rsi
901 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
902
/* L(shl_6_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 6 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 6; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_6_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
903 .p2align 4
904 L(shl_6_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
905 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
906 cmp %rcx, %rdx
907 movaps -0x06(%rsi), %xmm1
908 jb L(L6_bwd)
909 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
910 L(L6_bwd):
/* Bias the count by -64; lea does not modify flags.  */
911 lea -64(%rdx), %rdx
912 _CET_NOTRACK jmp *%r9
913 ud2
914 L(shl_6_bwd_loop_L2):
915 prefetchnta -0x1c0(%rsi)
916 L(shl_6_bwd_loop_L1):
917 movaps -0x16(%rsi), %xmm2
918 sub $0x40, %rdx
919 movaps -0x26(%rsi), %xmm3
920 movaps -0x36(%rsi), %xmm4
921 movaps -0x46(%rsi), %xmm5
922 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 6 bytes into the lower one.  */
923 palignr $6, %xmm2, %xmm1
924 palignr $6, %xmm3, %xmm2
925 palignr $6, %xmm4, %xmm3
926 palignr $6, %xmm5, %xmm4
927
928 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
929 movaps %xmm5, %xmm1
930
931 movaps %xmm2, -0x20(%rdi)
932 lea -0x40(%rdi), %rdi
933
934 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
935 jb L(shl_6_bwd_end)
936 movaps %xmm4, (%rdi)
937 _CET_NOTRACK jmp *%r9
938 ud2
939 L(shl_6_bwd_end):
940 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
941 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
942 movdqu %xmm0, (%r8)
943 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
944
/* L(shl_7): forward copy, 64 bytes per pass, for a source that is
   7 bytes past a 16-byte boundary (assumes (%rsi & 15) == 7; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_7), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
945 .p2align 4
946 L(shl_7):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
947 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
948 cmp %rcx, %rdx
949 movaps -0x07(%rsi), %xmm1
950 jb L(L7_fwd)
951 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
952 L(L7_fwd):
/* Bias the count by -64; lea does not modify flags.  */
953 lea -64(%rdx), %rdx
954 _CET_NOTRACK jmp *%r9
955 ud2
956 L(shl_7_loop_L2):
957 prefetchnta 0x1c0(%rsi)
958 L(shl_7_loop_L1):
959 sub $64, %rdx
960 movaps 0x09(%rsi), %xmm2
961 movaps 0x19(%rsi), %xmm3
962 movaps 0x29(%rsi), %xmm4
963 movaps 0x39(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 7 bytes into the lower one.  */
964 movdqa %xmm5, %xmm6
965 palignr $7, %xmm4, %xmm5
966 lea 64(%rsi), %rsi
967 palignr $7, %xmm3, %xmm4
968 palignr $7, %xmm2, %xmm3
969 lea 64(%rdi), %rdi
970 palignr $7, %xmm1, %xmm2
971 movdqa %xmm6, %xmm1
972 movdqa %xmm2, -0x40(%rdi)
973 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
974 jb L(shl_7_end)
975 movaps %xmm4, -0x20(%rdi)
976 movaps %xmm5, -0x10(%rdi)
977 _CET_NOTRACK jmp *%r9
978 ud2
979 L(shl_7_end):
980 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
981 lea 64(%rdx), %rdx
982 movaps %xmm5, -0x10(%rdi)
983 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
984 movdqu %xmm0, (%r8)
985 add %rdx, %rsi
986 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
987
/* L(shl_7_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 7 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 7; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_7_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
988 .p2align 4
989 L(shl_7_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
990 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
991 cmp %rcx, %rdx
992 movaps -0x07(%rsi), %xmm1
993 jb L(L7_bwd)
994 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
995 L(L7_bwd):
/* Bias the count by -64; lea does not modify flags.  */
996 lea -64(%rdx), %rdx
997 _CET_NOTRACK jmp *%r9
998 ud2
999 L(shl_7_bwd_loop_L2):
1000 prefetchnta -0x1c0(%rsi)
1001 L(shl_7_bwd_loop_L1):
1002 movaps -0x17(%rsi), %xmm2
1003 sub $0x40, %rdx
1004 movaps -0x27(%rsi), %xmm3
1005 movaps -0x37(%rsi), %xmm4
1006 movaps -0x47(%rsi), %xmm5
1007 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 7 bytes into the lower one.  */
1008 palignr $7, %xmm2, %xmm1
1009 palignr $7, %xmm3, %xmm2
1010 palignr $7, %xmm4, %xmm3
1011 palignr $7, %xmm5, %xmm4
1012
1013 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
1014 movaps %xmm5, %xmm1
1015
1016 movaps %xmm2, -0x20(%rdi)
1017 lea -0x40(%rdi), %rdi
1018
1019 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
1020 jb L(shl_7_bwd_end)
1021 movaps %xmm4, (%rdi)
1022 _CET_NOTRACK jmp *%r9
1023 ud2
1024 L(shl_7_bwd_end):
1025 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1026 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
1027 movdqu %xmm0, (%r8)
1028 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1029
/* L(shl_8): forward copy, 64 bytes per pass, for a source that is
   8 bytes past a 16-byte boundary (assumes (%rsi & 15) == 8; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_8), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
1030 .p2align 4
1031 L(shl_8):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1032 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1033 cmp %rcx, %rdx
1034 movaps -0x08(%rsi), %xmm1
1035 jb L(L8_fwd)
1036 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1037 L(L8_fwd):
/* Bias the count by -64; lea does not modify flags.  */
1038 lea -64(%rdx), %rdx
1039 _CET_NOTRACK jmp *%r9
/* Follow the indirect jump with ud2, as every other indirect jump in
   this file does.  The loop labels are reached through
   assembler-computed label differences, so the two extra bytes do not
   affect the targets.  */
ud2
1040 L(shl_8_loop_L2):
1041 prefetchnta 0x1c0(%rsi)
1042 L(shl_8_loop_L1):
1043 sub $64, %rdx
1044 movaps 0x08(%rsi), %xmm2
1045 movaps 0x18(%rsi), %xmm3
1046 movaps 0x28(%rsi), %xmm4
1047 movaps 0x38(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 8 bytes into the lower one.  */
1048 movdqa %xmm5, %xmm6
1049 palignr $8, %xmm4, %xmm5
1050 lea 64(%rsi), %rsi
1051 palignr $8, %xmm3, %xmm4
1052 palignr $8, %xmm2, %xmm3
1053 lea 64(%rdi), %rdi
1054 palignr $8, %xmm1, %xmm2
1055 movdqa %xmm6, %xmm1
1056 movdqa %xmm2, -0x40(%rdi)
1057 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
1058 jb L(shl_8_end)
1059 movaps %xmm4, -0x20(%rdi)
1060 movaps %xmm5, -0x10(%rdi)
1061 _CET_NOTRACK jmp *%r9
1062 ud2
1063 .p2align 4
1064 L(shl_8_end):
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  The
   two pending stores use %rdi before it is advanced.  */
1065 lea 64(%rdx), %rdx
1066 movaps %xmm4, -0x20(%rdi)
1067 add %rdx, %rsi
1068 movaps %xmm5, -0x10(%rdi)
1069 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
1070 movdqu %xmm0, (%r8)
1071 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1072
/* L(shl_8_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 8 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 8; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_8_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
1073 .p2align 4
1074 L(shl_8_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1075 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1076 cmp %rcx, %rdx
1077 movaps -0x08(%rsi), %xmm1
1078 jb L(L8_bwd)
1079 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1080 L(L8_bwd):
/* Bias the count by -64; lea does not modify flags.  */
1081 lea -64(%rdx), %rdx
1082 _CET_NOTRACK jmp *%r9
1083 ud2
1084 L(shl_8_bwd_loop_L2):
1085 prefetchnta -0x1c0(%rsi)
1086 L(shl_8_bwd_loop_L1):
1087 movaps -0x18(%rsi), %xmm2
1088 sub $0x40, %rdx
1089 movaps -0x28(%rsi), %xmm3
1090 movaps -0x38(%rsi), %xmm4
1091 movaps -0x48(%rsi), %xmm5
1092 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 8 bytes into the lower one.  */
1093 palignr $8, %xmm2, %xmm1
1094 palignr $8, %xmm3, %xmm2
1095 palignr $8, %xmm4, %xmm3
1096 palignr $8, %xmm5, %xmm4
1097
1098 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
1099 movaps %xmm5, %xmm1
1100
1101 movaps %xmm2, -0x20(%rdi)
1102 lea -0x40(%rdi), %rdi
1103
1104 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
1105 jb L(shl_8_bwd_end)
1106 movaps %xmm4, (%rdi)
1107 _CET_NOTRACK jmp *%r9
1108 ud2
1109 L(shl_8_bwd_end):
1110 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1111 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
1112 movdqu %xmm0, (%r8)
1113 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1114
/* L(shl_9): forward copy, 64 bytes per pass, for a source that is
   9 bytes past a 16-byte boundary (assumes (%rsi & 15) == 9; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_9), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
1115 .p2align 4
1116 L(shl_9):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1117 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1118 cmp %rcx, %rdx
1119 movaps -0x09(%rsi), %xmm1
1120 jb L(L9_fwd)
1121 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1122 L(L9_fwd):
/* Bias the count by -64; lea does not modify flags.  */
1123 lea -64(%rdx), %rdx
1124 _CET_NOTRACK jmp *%r9
1125 ud2
1126 L(shl_9_loop_L2):
1127 prefetchnta 0x1c0(%rsi)
1128 L(shl_9_loop_L1):
1129 sub $64, %rdx
1130 movaps 0x07(%rsi), %xmm2
1131 movaps 0x17(%rsi), %xmm3
1132 movaps 0x27(%rsi), %xmm4
1133 movaps 0x37(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 9 bytes into the lower one.  */
1134 movdqa %xmm5, %xmm6
1135 palignr $9, %xmm4, %xmm5
1136 lea 64(%rsi), %rsi
1137 palignr $9, %xmm3, %xmm4
1138 palignr $9, %xmm2, %xmm3
1139 lea 64(%rdi), %rdi
1140 palignr $9, %xmm1, %xmm2
1141 movdqa %xmm6, %xmm1
1142 movdqa %xmm2, -0x40(%rdi)
1143 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
1144 jb L(shl_9_end)
1145 movaps %xmm4, -0x20(%rdi)
1146 movaps %xmm5, -0x10(%rdi)
1147 _CET_NOTRACK jmp *%r9
1148 ud2
1149 L(shl_9_end):
1150 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1151 lea 64(%rdx), %rdx
1152 movaps %xmm5, -0x10(%rdi)
1153 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
1154 movdqu %xmm0, (%r8)
1155 add %rdx, %rsi
1156 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1157
/* L(shl_9_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 9 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 9; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_9_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
1158 .p2align 4
1159 L(shl_9_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1160 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1161 cmp %rcx, %rdx
1162 movaps -0x09(%rsi), %xmm1
1163 jb L(L9_bwd)
1164 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1165 L(L9_bwd):
/* Bias the count by -64; lea does not modify flags.  */
1166 lea -64(%rdx), %rdx
1167 _CET_NOTRACK jmp *%r9
1168 ud2
1169 L(shl_9_bwd_loop_L2):
1170 prefetchnta -0x1c0(%rsi)
1171 L(shl_9_bwd_loop_L1):
1172 movaps -0x19(%rsi), %xmm2
1173 sub $0x40, %rdx
1174 movaps -0x29(%rsi), %xmm3
1175 movaps -0x39(%rsi), %xmm4
1176 movaps -0x49(%rsi), %xmm5
1177 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 9 bytes into the lower one.  */
1178 palignr $9, %xmm2, %xmm1
1179 palignr $9, %xmm3, %xmm2
1180 palignr $9, %xmm4, %xmm3
1181 palignr $9, %xmm5, %xmm4
1182
1183 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
1184 movaps %xmm5, %xmm1
1185
1186 movaps %xmm2, -0x20(%rdi)
1187 lea -0x40(%rdi), %rdi
1188
1189 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
1190 jb L(shl_9_bwd_end)
1191 movaps %xmm4, (%rdi)
1192 _CET_NOTRACK jmp *%r9
1193 ud2
1194 L(shl_9_bwd_end):
1195 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1196 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
1197 movdqu %xmm0, (%r8)
1198 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1199
/* L(shl_10): forward copy, 64 bytes per pass, for a source that is
   10 bytes past a 16-byte boundary (assumes (%rsi & 15) == 10; the
   dispatch table is outside this view -- confirm).  Expects
   %r9 = address of L(shl_10), as left by BRANCH_TO_JMPTBL_ENTRY,
   %rcx = size threshold for the prefetching loop, and %xmm0/%r8 = 16
   saved bytes and the address they are stored to.  */
1200 .p2align 4
1201 L(shl_10):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1202 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1203 cmp %rcx, %rdx
1204 movaps -0x0a(%rsi), %xmm1
1205 jb L(L10_fwd)
1206 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1207 L(L10_fwd):
/* Bias the count by -64; lea does not modify flags.  */
1208 lea -64(%rdx), %rdx
1209 _CET_NOTRACK jmp *%r9
1210 ud2
1211 L(shl_10_loop_L2):
1212 prefetchnta 0x1c0(%rsi)
1213 L(shl_10_loop_L1):
1214 sub $64, %rdx
1215 movaps 0x06(%rsi), %xmm2
1216 movaps 0x16(%rsi), %xmm3
1217 movaps 0x26(%rsi), %xmm4
1218 movaps 0x36(%rsi), %xmm5
/* Keep the newest aligned block; it becomes %xmm1 for the next pass.
   Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 10 bytes into the lower one.  */
1219 movdqa %xmm5, %xmm6
1220 palignr $10, %xmm4, %xmm5
1221 lea 64(%rsi), %rsi
1222 palignr $10, %xmm3, %xmm4
1223 palignr $10, %xmm2, %xmm3
1224 lea 64(%rdi), %rdi
1225 palignr $10, %xmm1, %xmm2
1226 movdqa %xmm6, %xmm1
1227 movdqa %xmm2, -0x40(%rdi)
1228 movaps %xmm3, -0x30(%rdi)
/* CF still comes from the sub at the top of the loop.  */
1229 jb L(shl_10_end)
1230 movaps %xmm4, -0x20(%rdi)
1231 movaps %xmm5, -0x10(%rdi)
1232 _CET_NOTRACK jmp *%r9
1233 ud2
1234 L(shl_10_end):
1235 movaps %xmm4, -0x20(%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1236 lea 64(%rdx), %rdx
1237 movaps %xmm5, -0x10(%rdi)
1238 add %rdx, %rdi
/* Write the 16 bytes saved in %xmm0.  */
1239 movdqu %xmm0, (%r8)
1240 add %rdx, %rsi
1241 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1242
/* L(shl_10_bwd): backward copy, 64 bytes per pass working downwards
   from %rsi/%rdi, for a source end that is 10 bytes past a 16-byte
   boundary (assumes (%rsi & 15) == 10; dispatch table not visible here
   -- confirm).  Expects %r9 = address of L(shl_10_bwd), %rcx = size
   threshold for the prefetching loop, and %xmm0/%r8 = 16 saved bytes
   and the address they are stored to.  */
1243 .p2align 4
1244 L(shl_10_bwd):
/* Pick the loop entry: L1 when %rdx < %rcx, otherwise L2, which adds
   a prefetch.  */
1245 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1246 cmp %rcx, %rdx
1247 movaps -0x0a(%rsi), %xmm1
1248 jb L(L10_bwd)
1249 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1250 L(L10_bwd):
/* Bias the count by -64; lea does not modify flags.  */
1251 lea -64(%rdx), %rdx
1252 _CET_NOTRACK jmp *%r9
1253 ud2
1254 L(shl_10_bwd_loop_L2):
1255 prefetchnta -0x1c0(%rsi)
1256 L(shl_10_bwd_loop_L1):
1257 movaps -0x1a(%rsi), %xmm2
1258 sub $0x40, %rdx
1259 movaps -0x2a(%rsi), %xmm3
1260 movaps -0x3a(%rsi), %xmm4
1261 movaps -0x4a(%rsi), %xmm5
1262 lea -0x40(%rsi), %rsi
/* Each palignr joins two adjacent aligned blocks and extracts the 16
   bytes that start 10 bytes into the lower one.  */
1263 palignr $10, %xmm2, %xmm1
1264 palignr $10, %xmm3, %xmm2
1265 palignr $10, %xmm4, %xmm3
1266 palignr $10, %xmm5, %xmm4
1267
1268 movaps %xmm1, -0x10(%rdi)
/* The lowest block loaded becomes %xmm1 for the next pass.  */
1269 movaps %xmm5, %xmm1
1270
1271 movaps %xmm2, -0x20(%rdi)
1272 lea -0x40(%rdi), %rdi
1273
1274 movaps %xmm3, 0x10(%rdi)
/* CF still comes from the sub near the top of the loop.  */
1275 jb L(shl_10_bwd_end)
1276 movaps %xmm4, (%rdi)
1277 _CET_NOTRACK jmp *%r9
1278 ud2
1279 L(shl_10_bwd_end):
1280 movaps %xmm4, (%rdi)
/* Remove the -64 bias: %rdx = bytes left for the tail dispatch.  */
1281 lea 64(%rdx), %rdx
/* Write the 16 bytes saved in %xmm0.  */
1282 movdqu %xmm0, (%r8)
1283 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1284
/* Forward copy, 64 bytes per iteration, for a source pointer that lies 11
   bytes past a 16-byte boundary: every movaps load below uses an offset
   congruent to -11 mod 16, and each palignr $11 splices two neighbouring
   aligned blocks into 16 consecutive source bytes.  Stores through %rdi are
   aligned (movdqa/movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_11) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block), %rsi/%rdi are advanced by the remaining count and control
   leaves through L(table_less_80bytes) indexed by that count.  */
1285 .p2align 4
1286 L(shl_11):
/* Add the distance from L(shl_11) to the plain loop entry to %r9.  */
1287 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1288 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-11 (its top 5 bytes are the bytes at
   %rsi); movaps leaves the cmp flags intact for the jb.  */
1289 movaps -0x0b(%rsi), %xmm1
1290 jb L(L11_fwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1291 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1292 L(L11_fwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1293 lea -64(%rdx), %rdx
1294 _CET_NOTRACK jmp *%r9
1295 ud2
1296 L(shl_11_loop_L2):
1297 prefetchnta 0x1c0(%rsi)
1298 L(shl_11_loop_L1):
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1299 sub $64, %rdx
1300 movaps 0x05(%rsi), %xmm2
1301 movaps 0x15(%rsi), %xmm3
1302 movaps 0x25(%rsi), %xmm4
1303 movaps 0x35(%rsi), %xmm5
/* Keep the newest raw block; it becomes the low half for the next iteration.  */
1304 movdqa %xmm5, %xmm6
1305 palignr $11, %xmm4, %xmm5
1306 lea 64(%rsi), %rsi
1307 palignr $11, %xmm3, %xmm4
1308 palignr $11, %xmm2, %xmm3
1309 lea 64(%rdi), %rdi
1310 palignr $11, %xmm1, %xmm2
1311 movdqa %xmm6, %xmm1
1312 movdqa %xmm2, -0x40(%rdi)
1313 movaps %xmm3, -0x30(%rdi)
1314 jb L(shl_11_end)
1315 movaps %xmm4, -0x20(%rdi)
1316 movaps %xmm5, -0x10(%rdi)
1317 _CET_NOTRACK jmp *%r9
1318 ud2
1319 L(shl_11_end):
1320 movaps %xmm4, -0x20(%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1321 lea 64(%rdx), %rdx
1322 movaps %xmm5, -0x10(%rdi)
1323 add %rdx, %rdi
1324 movdqu %xmm0, (%r8)
1325 add %rdx, %rsi
1326 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1327
/* Backward (descending-address) copy, 64 bytes per iteration, for a source
   pointer that lies 11 bytes past a 16-byte boundary: every movaps load uses
   an offset congruent to -11 mod 16 and each palignr $11 splices two
   neighbouring aligned blocks into 16 consecutive source bytes.  All data are
   read and written below %rsi/%rdi, which step down by 0x40 per iteration;
   stores through %rdi are aligned (movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_11_bwd) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block) and control leaves through L(table_less_80bytes) indexed by the
   remaining count; %rsi/%rdi are not adjusted further.  */
1328 .p2align 4
1329 L(shl_11_bwd):
/* Add the distance from L(shl_11_bwd) to the plain loop entry to %r9.  */
1330 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1331 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-11; movaps leaves the cmp flags
   intact for the jb.  */
1332 movaps -0x0b(%rsi), %xmm1
1333 jb L(L11_bwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1334 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
1335 L(L11_bwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1336 lea -64(%rdx), %rdx
1337 _CET_NOTRACK jmp *%r9
1338 ud2
1339 L(shl_11_bwd_loop_L2):
1340 prefetchnta -0x1c0(%rsi)
1341 L(shl_11_bwd_loop_L1):
1342 movaps -0x1b(%rsi), %xmm2
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1343 sub $0x40, %rdx
1344 movaps -0x2b(%rsi), %xmm3
1345 movaps -0x3b(%rsi), %xmm4
1346 movaps -0x4b(%rsi), %xmm5
1347 lea -0x40(%rsi), %rsi
1348 palignr $11, %xmm2, %xmm1
1349 palignr $11, %xmm3, %xmm2
1350 palignr $11, %xmm4, %xmm3
1351 palignr $11, %xmm5, %xmm4
1352
1353 movaps %xmm1, -0x10(%rdi)
/* The lowest raw block becomes the high half for the next iteration.  */
1354 movaps %xmm5, %xmm1
1355
1356 movaps %xmm2, -0x20(%rdi)
/* %rdi steps down here; the remaining two stores use the new value.  */
1357 lea -0x40(%rdi), %rdi
1358
1359 movaps %xmm3, 0x10(%rdi)
1360 jb L(shl_11_bwd_end)
1361 movaps %xmm4, (%rdi)
1362 _CET_NOTRACK jmp *%r9
1363 ud2
1364 L(shl_11_bwd_end):
1365 movaps %xmm4, (%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1366 lea 64(%rdx), %rdx
1367 movdqu %xmm0, (%r8)
1368 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1369
/* Forward copy, 64 bytes per iteration, for a source pointer that lies 12
   bytes past a 16-byte boundary: every movaps load below uses an offset
   congruent to -12 mod 16, and each palignr $12 splices two neighbouring
   aligned blocks into 16 consecutive source bytes.  Stores through %rdi are
   aligned (movdqa/movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_12) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block), %rsi/%rdi are advanced by the remaining count and control
   leaves through L(table_less_80bytes) indexed by that count.  */
1370 .p2align 4
1371 L(shl_12):
/* Add the distance from L(shl_12) to the plain loop entry to %r9.  */
1372 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1373 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-12 (its top 4 bytes are the bytes at
   %rsi); movaps leaves the cmp flags intact for the jb.  */
1374 movaps -0x0c(%rsi), %xmm1
1375 jb L(L12_fwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1376 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1377 L(L12_fwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1378 lea -64(%rdx), %rdx
1379 _CET_NOTRACK jmp *%r9
1380 ud2
1381 L(shl_12_loop_L2):
1382 prefetchnta 0x1c0(%rsi)
1383 L(shl_12_loop_L1):
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1384 sub $64, %rdx
1385 movaps 0x04(%rsi), %xmm2
1386 movaps 0x14(%rsi), %xmm3
1387 movaps 0x24(%rsi), %xmm4
1388 movaps 0x34(%rsi), %xmm5
/* Keep the newest raw block; it becomes the low half for the next iteration.  */
1389 movdqa %xmm5, %xmm6
1390 palignr $12, %xmm4, %xmm5
1391 lea 64(%rsi), %rsi
1392 palignr $12, %xmm3, %xmm4
1393 palignr $12, %xmm2, %xmm3
1394 lea 64(%rdi), %rdi
1395 palignr $12, %xmm1, %xmm2
1396 movdqa %xmm6, %xmm1
1397 movdqa %xmm2, -0x40(%rdi)
1398 movaps %xmm3, -0x30(%rdi)
1399 jb L(shl_12_end)
1400 movaps %xmm4, -0x20(%rdi)
1401 movaps %xmm5, -0x10(%rdi)
1402 _CET_NOTRACK jmp *%r9
1403 ud2
1404 L(shl_12_end):
1405 movaps %xmm4, -0x20(%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1406 lea 64(%rdx), %rdx
1407 movaps %xmm5, -0x10(%rdi)
1408 add %rdx, %rdi
1409 movdqu %xmm0, (%r8)
1410 add %rdx, %rsi
1411 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1412
/* Backward (descending-address) copy, 64 bytes per iteration, for a source
   pointer that lies 12 bytes past a 16-byte boundary: every movaps load uses
   an offset congruent to -12 mod 16 and each palignr $12 splices two
   neighbouring aligned blocks into 16 consecutive source bytes.  All data are
   read and written below %rsi/%rdi, which step down by 0x40 per iteration;
   stores through %rdi are aligned (movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_12_bwd) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block) and control leaves through L(table_less_80bytes) indexed by the
   remaining count; %rsi/%rdi are not adjusted further.  */
1413 .p2align 4
1414 L(shl_12_bwd):
/* Add the distance from L(shl_12_bwd) to the plain loop entry to %r9.  */
1415 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1416 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-12; movaps leaves the cmp flags
   intact for the jb.  */
1417 movaps -0x0c(%rsi), %xmm1
1418 jb L(L12_bwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1419 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
1420 L(L12_bwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1421 lea -64(%rdx), %rdx
1422 _CET_NOTRACK jmp *%r9
1423 ud2
1424 L(shl_12_bwd_loop_L2):
1425 prefetchnta -0x1c0(%rsi)
1426 L(shl_12_bwd_loop_L1):
1427 movaps -0x1c(%rsi), %xmm2
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1428 sub $0x40, %rdx
1429 movaps -0x2c(%rsi), %xmm3
1430 movaps -0x3c(%rsi), %xmm4
1431 movaps -0x4c(%rsi), %xmm5
1432 lea -0x40(%rsi), %rsi
1433 palignr $12, %xmm2, %xmm1
1434 palignr $12, %xmm3, %xmm2
1435 palignr $12, %xmm4, %xmm3
1436 palignr $12, %xmm5, %xmm4
1437
1438 movaps %xmm1, -0x10(%rdi)
/* The lowest raw block becomes the high half for the next iteration.  */
1439 movaps %xmm5, %xmm1
1440
1441 movaps %xmm2, -0x20(%rdi)
/* %rdi steps down here; the remaining two stores use the new value.  */
1442 lea -0x40(%rdi), %rdi
1443
1444 movaps %xmm3, 0x10(%rdi)
1445 jb L(shl_12_bwd_end)
1446 movaps %xmm4, (%rdi)
1447 _CET_NOTRACK jmp *%r9
1448 ud2
1449 L(shl_12_bwd_end):
1450 movaps %xmm4, (%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1451 lea 64(%rdx), %rdx
1452 movdqu %xmm0, (%r8)
1453 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1454
/* Forward copy, 64 bytes per iteration, for a source pointer that lies 13
   bytes past a 16-byte boundary: every movaps load below uses an offset
   congruent to -13 mod 16, and each palignr $13 splices two neighbouring
   aligned blocks into 16 consecutive source bytes.  Stores through %rdi are
   aligned (movdqa/movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_13) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block), %rsi/%rdi are advanced by the remaining count and control
   leaves through L(table_less_80bytes) indexed by that count.  */
1455 .p2align 4
1456 L(shl_13):
/* Add the distance from L(shl_13) to the plain loop entry to %r9.  */
1457 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1458 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-13 (its top 3 bytes are the bytes at
   %rsi); movaps leaves the cmp flags intact for the jb.  */
1459 movaps -0x0d(%rsi), %xmm1
1460 jb L(L13_fwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1461 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1462 L(L13_fwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1463 lea -64(%rdx), %rdx
1464 _CET_NOTRACK jmp *%r9
1465 ud2
1466 L(shl_13_loop_L2):
1467 prefetchnta 0x1c0(%rsi)
1468 L(shl_13_loop_L1):
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1469 sub $64, %rdx
1470 movaps 0x03(%rsi), %xmm2
1471 movaps 0x13(%rsi), %xmm3
1472 movaps 0x23(%rsi), %xmm4
1473 movaps 0x33(%rsi), %xmm5
/* Keep the newest raw block; it becomes the low half for the next iteration.  */
1474 movdqa %xmm5, %xmm6
1475 palignr $13, %xmm4, %xmm5
1476 lea 64(%rsi), %rsi
1477 palignr $13, %xmm3, %xmm4
1478 palignr $13, %xmm2, %xmm3
1479 lea 64(%rdi), %rdi
1480 palignr $13, %xmm1, %xmm2
1481 movdqa %xmm6, %xmm1
1482 movdqa %xmm2, -0x40(%rdi)
1483 movaps %xmm3, -0x30(%rdi)
1484 jb L(shl_13_end)
1485 movaps %xmm4, -0x20(%rdi)
1486 movaps %xmm5, -0x10(%rdi)
1487 _CET_NOTRACK jmp *%r9
1488 ud2
1489 L(shl_13_end):
1490 movaps %xmm4, -0x20(%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1491 lea 64(%rdx), %rdx
1492 movaps %xmm5, -0x10(%rdi)
1493 add %rdx, %rdi
1494 movdqu %xmm0, (%r8)
1495 add %rdx, %rsi
1496 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1497
/* Backward (descending-address) copy, 64 bytes per iteration, for a source
   pointer that lies 13 bytes past a 16-byte boundary: every movaps load uses
   an offset congruent to -13 mod 16 and each palignr $13 splices two
   neighbouring aligned blocks into 16 consecutive source bytes.  All data are
   read and written below %rsi/%rdi, which step down by 0x40 per iteration;
   stores through %rdi are aligned (movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_13_bwd) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block) and control leaves through L(table_less_80bytes) indexed by the
   remaining count; %rsi/%rdi are not adjusted further.  */
1498 .p2align 4
1499 L(shl_13_bwd):
/* Add the distance from L(shl_13_bwd) to the plain loop entry to %r9.  */
1500 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1501 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-13; movaps leaves the cmp flags
   intact for the jb.  */
1502 movaps -0x0d(%rsi), %xmm1
1503 jb L(L13_bwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1504 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
1505 L(L13_bwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1506 lea -64(%rdx), %rdx
1507 _CET_NOTRACK jmp *%r9
1508 ud2
1509 L(shl_13_bwd_loop_L2):
1510 prefetchnta -0x1c0(%rsi)
1511 L(shl_13_bwd_loop_L1):
1512 movaps -0x1d(%rsi), %xmm2
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1513 sub $0x40, %rdx
1514 movaps -0x2d(%rsi), %xmm3
1515 movaps -0x3d(%rsi), %xmm4
1516 movaps -0x4d(%rsi), %xmm5
1517 lea -0x40(%rsi), %rsi
1518 palignr $13, %xmm2, %xmm1
1519 palignr $13, %xmm3, %xmm2
1520 palignr $13, %xmm4, %xmm3
1521 palignr $13, %xmm5, %xmm4
1522
1523 movaps %xmm1, -0x10(%rdi)
/* The lowest raw block becomes the high half for the next iteration.  */
1524 movaps %xmm5, %xmm1
1525
1526 movaps %xmm2, -0x20(%rdi)
/* %rdi steps down here; the remaining two stores use the new value.  */
1527 lea -0x40(%rdi), %rdi
1528
1529 movaps %xmm3, 0x10(%rdi)
1530 jb L(shl_13_bwd_end)
1531 movaps %xmm4, (%rdi)
1532 _CET_NOTRACK jmp *%r9
1533 ud2
1534 L(shl_13_bwd_end):
1535 movaps %xmm4, (%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1536 lea 64(%rdx), %rdx
1537 movdqu %xmm0, (%r8)
1538 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1539
/* Forward copy, 64 bytes per iteration, for a source pointer that lies 14
   bytes past a 16-byte boundary: every movaps load below uses an offset
   congruent to -14 mod 16, and each palignr $14 splices two neighbouring
   aligned blocks into 16 consecutive source bytes.  Stores through %rdi are
   aligned (movdqa/movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_14) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block), %rsi/%rdi are advanced by the remaining count and control
   leaves through L(table_less_80bytes) indexed by that count.  */
1540 .p2align 4
1541 L(shl_14):
/* Add the distance from L(shl_14) to the plain loop entry to %r9.  */
1542 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1543 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-14 (its top 2 bytes are the bytes at
   %rsi); movaps leaves the cmp flags intact for the jb.  */
1544 movaps -0x0e(%rsi), %xmm1
1545 jb L(L14_fwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1546 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1547 L(L14_fwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1548 lea -64(%rdx), %rdx
1549 _CET_NOTRACK jmp *%r9
1550 ud2
1551 L(shl_14_loop_L2):
1552 prefetchnta 0x1c0(%rsi)
1553 L(shl_14_loop_L1):
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1554 sub $64, %rdx
1555 movaps 0x02(%rsi), %xmm2
1556 movaps 0x12(%rsi), %xmm3
1557 movaps 0x22(%rsi), %xmm4
1558 movaps 0x32(%rsi), %xmm5
/* Keep the newest raw block; it becomes the low half for the next iteration.  */
1559 movdqa %xmm5, %xmm6
1560 palignr $14, %xmm4, %xmm5
1561 lea 64(%rsi), %rsi
1562 palignr $14, %xmm3, %xmm4
1563 palignr $14, %xmm2, %xmm3
1564 lea 64(%rdi), %rdi
1565 palignr $14, %xmm1, %xmm2
1566 movdqa %xmm6, %xmm1
1567 movdqa %xmm2, -0x40(%rdi)
1568 movaps %xmm3, -0x30(%rdi)
1569 jb L(shl_14_end)
1570 movaps %xmm4, -0x20(%rdi)
1571 movaps %xmm5, -0x10(%rdi)
1572 _CET_NOTRACK jmp *%r9
1573 ud2
1574 L(shl_14_end):
1575 movaps %xmm4, -0x20(%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1576 lea 64(%rdx), %rdx
1577 movaps %xmm5, -0x10(%rdi)
1578 add %rdx, %rdi
1579 movdqu %xmm0, (%r8)
1580 add %rdx, %rsi
1581 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1582
/* Backward (descending-address) copy, 64 bytes per iteration, for a source
   pointer that lies 14 bytes past a 16-byte boundary: every movaps load uses
   an offset congruent to -14 mod 16 and each palignr $14 splices two
   neighbouring aligned blocks into 16 consecutive source bytes.  All data are
   read and written below %rsi/%rdi, which step down by 0x40 per iteration;
   stores through %rdi are aligned (movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_14_bwd) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block) and control leaves through L(table_less_80bytes) indexed by the
   remaining count; %rsi/%rdi are not adjusted further.  */
1583 .p2align 4
1584 L(shl_14_bwd):
/* Add the distance from L(shl_14_bwd) to the plain loop entry to %r9.  */
1585 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1586 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-14; movaps leaves the cmp flags
   intact for the jb.  */
1587 movaps -0x0e(%rsi), %xmm1
1588 jb L(L14_bwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1589 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
1590 L(L14_bwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1591 lea -64(%rdx), %rdx
1592 _CET_NOTRACK jmp *%r9
1593 ud2
1594 L(shl_14_bwd_loop_L2):
1595 prefetchnta -0x1c0(%rsi)
1596 L(shl_14_bwd_loop_L1):
1597 movaps -0x1e(%rsi), %xmm2
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1598 sub $0x40, %rdx
1599 movaps -0x2e(%rsi), %xmm3
1600 movaps -0x3e(%rsi), %xmm4
1601 movaps -0x4e(%rsi), %xmm5
1602 lea -0x40(%rsi), %rsi
1603 palignr $14, %xmm2, %xmm1
1604 palignr $14, %xmm3, %xmm2
1605 palignr $14, %xmm4, %xmm3
1606 palignr $14, %xmm5, %xmm4
1607
1608 movaps %xmm1, -0x10(%rdi)
/* The lowest raw block becomes the high half for the next iteration.  */
1609 movaps %xmm5, %xmm1
1610
1611 movaps %xmm2, -0x20(%rdi)
/* %rdi steps down here; the remaining two stores use the new value.  */
1612 lea -0x40(%rdi), %rdi
1613
1614 movaps %xmm3, 0x10(%rdi)
1615 jb L(shl_14_bwd_end)
1616 movaps %xmm4, (%rdi)
1617 _CET_NOTRACK jmp *%r9
1618 ud2
1619 L(shl_14_bwd_end):
1620 movaps %xmm4, (%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1621 lea 64(%rdx), %rdx
1622 movdqu %xmm0, (%r8)
1623 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1624
/* Forward copy, 64 bytes per iteration, for a source pointer that lies 15
   bytes past a 16-byte boundary: every movaps load below uses an offset
   congruent to -15 mod 16, and each palignr $15 splices two neighbouring
   aligned blocks into 16 consecutive source bytes.  Stores through %rdi are
   aligned (movdqa/movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_15) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block), %rsi/%rdi are advanced by the remaining count and control
   leaves through L(table_less_80bytes) indexed by that count.  */
1625 .p2align 4
1626 L(shl_15):
/* Add the distance from L(shl_15) to the plain loop entry to %r9.  */
1627 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1628 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-15 (its top byte is the byte at
   %rsi); movaps leaves the cmp flags intact for the jb.  */
1629 movaps -0x0f(%rsi), %xmm1
1630 jb L(L15_fwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1631 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1632 L(L15_fwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1633 lea -64(%rdx), %rdx
1634 _CET_NOTRACK jmp *%r9
1635 ud2
1636 L(shl_15_loop_L2):
1637 prefetchnta 0x1c0(%rsi)
1638 L(shl_15_loop_L1):
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1639 sub $64, %rdx
1640 movaps 0x01(%rsi), %xmm2
1641 movaps 0x11(%rsi), %xmm3
1642 movaps 0x21(%rsi), %xmm4
1643 movaps 0x31(%rsi), %xmm5
/* Keep the newest raw block; it becomes the low half for the next iteration.  */
1644 movdqa %xmm5, %xmm6
1645 palignr $15, %xmm4, %xmm5
1646 lea 64(%rsi), %rsi
1647 palignr $15, %xmm3, %xmm4
1648 palignr $15, %xmm2, %xmm3
1649 lea 64(%rdi), %rdi
1650 palignr $15, %xmm1, %xmm2
1651 movdqa %xmm6, %xmm1
1652 movdqa %xmm2, -0x40(%rdi)
1653 movaps %xmm3, -0x30(%rdi)
1654 jb L(shl_15_end)
1655 movaps %xmm4, -0x20(%rdi)
1656 movaps %xmm5, -0x10(%rdi)
1657 _CET_NOTRACK jmp *%r9
1658 ud2
1659 L(shl_15_end):
1660 movaps %xmm4, -0x20(%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1661 lea 64(%rdx), %rdx
1662 movaps %xmm5, -0x10(%rdi)
1663 add %rdx, %rdi
1664 movdqu %xmm0, (%r8)
1665 add %rdx, %rsi
1666 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1667
/* Backward (descending-address) copy, 64 bytes per iteration, for a source
   pointer that lies 15 bytes past a 16-byte boundary: every movaps load uses
   an offset congruent to -15 mod 16 and each palignr $15 splices two
   neighbouring aligned blocks into 16 consecutive source bytes.  All data are
   read and written below %rsi/%rdi, which step down by 0x40 per iteration;
   stores through %rdi are aligned (movaps).
   On entry %rdx is the byte count, %rcx the value it is compared against to
   choose between the plain (L1) and the prefetching (L2) loop entry, and %r9
   a code address that is moved onto the chosen entry -- presumably the
   address of L(shl_15_bwd) itself; confirm at the dispatch site.
   On exit the 16 bytes in %xmm0 are stored at (%r8) (both are set up outside
   this block) and control leaves through L(table_less_80bytes) indexed by the
   remaining count; %rsi/%rdi are not adjusted further.  */
1668 .p2align 4
1669 L(shl_15_bwd):
/* Add the distance from L(shl_15_bwd) to the plain loop entry to %r9.  */
1670 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1671 cmp %rcx, %rdx
/* Prime %xmm1 with the 16 bytes at %rsi-15; movaps leaves the cmp flags
   intact for the jb.  */
1672 movaps -0x0f(%rsi), %xmm1
1673 jb L(L15_bwd)
/* %rdx >= %rcx: move %r9 on to the entry that also issues a prefetchnta.  */
1674 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
1675 L(L15_bwd):
/* Pre-bias the count by 64 so that the loop's sub borrows once fewer than 64
   bytes will be left after the current iteration.  */
1676 lea -64(%rdx), %rdx
1677 _CET_NOTRACK jmp *%r9
1678 ud2
1679 L(shl_15_bwd_loop_L2):
1680 prefetchnta -0x1c0(%rsi)
1681 L(shl_15_bwd_loop_L1):
1682 movaps -0x1f(%rsi), %xmm2
/* CF from this sub is tested by the jb below; the moves, palignr and lea in
   between do not modify flags.  */
1683 sub $0x40, %rdx
1684 movaps -0x2f(%rsi), %xmm3
1685 movaps -0x3f(%rsi), %xmm4
1686 movaps -0x4f(%rsi), %xmm5
1687 lea -0x40(%rsi), %rsi
1688 palignr $15, %xmm2, %xmm1
1689 palignr $15, %xmm3, %xmm2
1690 palignr $15, %xmm4, %xmm3
1691 palignr $15, %xmm5, %xmm4
1692
1693 movaps %xmm1, -0x10(%rdi)
/* The lowest raw block becomes the high half for the next iteration.  */
1694 movaps %xmm5, %xmm1
1695
1696 movaps %xmm2, -0x20(%rdi)
/* %rdi steps down here; the remaining two stores use the new value.  */
1697 lea -0x40(%rdi), %rdi
1698
1699 movaps %xmm3, 0x10(%rdi)
1700 jb L(shl_15_bwd_end)
1701 movaps %xmm4, (%rdi)
1702 _CET_NOTRACK jmp *%r9
1703 ud2
1704 L(shl_15_bwd_end):
1705 movaps %xmm4, (%rdi)
/* Undo the bias: %rdx = bytes still to copy.  */
1706 lea 64(%rdx), %rdx
1707 movdqu %xmm0, (%r8)
1708 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1709
/* Copy the 72 bytes ending at %rsi to the 72 bytes ending at %rdi:
   two 16-byte vectors followed by five quadwords, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_72bytes):
	movdqu	-72(%rsi), %xmm0
	movdqu	-56(%rsi), %xmm1
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rcx
	movdqu	%xmm0, -72(%rdi)
	movdqu	%xmm1, -56(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rcx, -8(%rdi)
	ret
1727
/* Copy the 64 bytes ending at %rsi to the 64 bytes ending at %rdi:
   one 16-byte vector followed by six quadwords, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_64bytes):
	movdqu	-64(%rsi), %xmm0
	movq	-48(%rsi), %rcx
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -64(%rdi)
	movq	%rcx, -48(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1745
/* Copy the 56 bytes ending at %rsi to the 56 bytes ending at %rdi:
   one 16-byte vector followed by five quadwords, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_56bytes):
	movdqu	-56(%rsi), %xmm0
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rcx
	movdqu	%xmm0, -56(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rcx, -8(%rdi)
	ret
1761
/* Copy the 48 bytes ending at %rsi to the 48 bytes ending at %rdi
   as six quadwords.  Every load is issued before the first store.  */
	.p2align 4
L(write_48bytes):
	movq	-48(%rsi), %rcx
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%rcx, -48(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1777
/* Copy the 40 bytes ending at %rsi to the 40 bytes ending at %rdi
   as five quadwords.  Every load is issued before the first store.  */
	.p2align 4
L(write_40bytes):
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1791
/* Copy the 32 bytes ending at %rsi to the 32 bytes ending at %rdi
   as four quadwords.  Every load is issued before the first store.  */
	.p2align 4
L(write_32bytes):
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1803
/* Copy the 24 bytes ending at %rsi to the 24 bytes ending at %rdi
   as three quadwords.  Every load is issued before the first store.  */
	.p2align 4
L(write_24bytes):
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1813
/* Copy the 16 bytes ending at %rsi to the 16 bytes ending at %rdi
   as two quadwords.  Both loads are issued before the first store.  */
	.p2align 4
L(write_16bytes):
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1821
/* Copy the quadword ending at %rsi to the quadword ending at %rdi, then fall
   through into L(write_0bytes), which copies nothing and just returns.  */
	.p2align 4
L(write_8bytes):
	movq	-8(%rsi), %rdx
	movq	%rdx, -8(%rdi)
L(write_0bytes):
	ret
1828
/* Copy the 73 bytes ending at %rsi to the 73 bytes ending at %rdi:
   two 16-byte vectors, five quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_73bytes):
	movdqu	-73(%rsi), %xmm0
	movdqu	-57(%rsi), %xmm1
	movq	-41(%rsi), %rcx
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %r8
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -73(%rdi)
	movdqu	%xmm1, -57(%rdi)
	movq	%rcx, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%r8, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1848
/* Copy the 65 bytes ending at %rsi to the 65 bytes ending at %rdi:
   two 16-byte vectors, four quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_65bytes):
	movdqu	-65(%rsi), %xmm0
	movdqu	-49(%rsi), %xmm1
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -65(%rdi)
	movdqu	%xmm1, -49(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1866
/* Copy the 57 bytes ending at %rsi to the 57 bytes ending at %rdi:
   one 16-byte vector, five quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_57bytes):
	movdqu	-57(%rsi), %xmm0
	movq	-41(%rsi), %r8
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -57(%rdi)
	movq	%r8, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1884
/* Copy the 49 bytes ending at %rsi to the 49 bytes ending at %rdi:
   one 16-byte vector, four quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_49bytes):
	movdqu	-49(%rsi), %xmm0
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -49(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1900
/* Copy the 41 bytes ending at %rsi to the 41 bytes ending at %rdi:
   five quadwords plus the final single byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_41bytes):
	movq	-41(%rsi), %r8
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r8, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1916
/* Copy the 33 bytes ending at %rsi to the 33 bytes ending at %rdi:
   four quadwords plus the final single byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_33bytes):
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1930
/* Copy the 25 bytes ending at %rsi to the 25 bytes ending at %rdi:
   three quadwords plus the final single byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_25bytes):
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1942
/* Copy the 17 bytes ending at %rsi to the 17 bytes ending at %rdi:
   two quadwords and a trailing 4-byte move that overlaps the last quadword
   by 3 bytes.  Every load is issued before the first store.  */
	.p2align 4
L(write_17bytes):
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1952
/* Copy the 9 bytes ending at %rsi to the 9 bytes ending at %rdi:
   one quadword and a trailing 4-byte move that overlaps it by 3 bytes.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_9bytes):
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1960
/* Copy the single byte just below %rsi to the byte just below %rdi.  */
	.p2align 4
L(write_1bytes):
	movb	-1(%rsi), %dl
	movb	%dl, -1(%rdi)
	ret
1966
/* Copy the 74 bytes ending at %rsi to the 74 bytes ending at %rdi:
   two 16-byte vectors, five quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_74bytes):
	movdqu	-74(%rsi), %xmm0
	movdqu	-58(%rsi), %xmm1
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -74(%rdi)
	movdqu	%xmm1, -58(%rdi)
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
1986
/* Copy the 66 bytes ending at %rsi to the 66 bytes ending at %rdi.
   Layout (offsets relative to the end pointers):
     %xmm0 [-66,-50)  %xmm1 [-50,-34)
     %r9   [-34,-26)  %r10  [-26,-18)  %r11 [-18,-10)  %rcx [-10,-2)
     %edx  [-4,0)     (overlaps %rcx by 2 bytes)
   No separate quadword is needed for [-42,-34): %xmm1 already covers it.
   Every load is issued before the first store.  */
	.p2align 4
L(write_66bytes):
	movdqu	-66(%rsi), %xmm0
	movdqu	-50(%rsi), %xmm1
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -66(%rdi)
	movdqu	%xmm1, -50(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2006
/* Copy the 58 bytes ending at %rsi to the 58 bytes ending at %rdi:
   one 16-byte vector (held in %xmm1), five quadwords, and a trailing 4-byte
   move that overlaps the last quadword by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_58bytes):
	movdqu	-58(%rsi), %xmm1
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm1, -58(%rdi)
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2024
/* Copy the 50 bytes ending at %rsi to the 50 bytes ending at %rdi:
   one 16-byte vector, four quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_50bytes):
	movdqu	-50(%rsi), %xmm0
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -50(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2040
/* Copy the 42 bytes ending at %rsi to the 42 bytes ending at %rdi:
   five quadwords and a trailing 4-byte move that overlaps the last quadword
   by 2 bytes.  Every load is issued before the first store.  */
	.p2align 4
L(write_42bytes):
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2056
/* Copy the 34 bytes ending at %rsi to the 34 bytes ending at %rdi:
   four quadwords and a trailing 4-byte move that overlaps the last quadword
   by 2 bytes.  Every load is issued before the first store.  */
	.p2align 4
L(write_34bytes):
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2070
/* Copy the 26 bytes ending at %rsi to the 26 bytes ending at %rdi:
   three quadwords and a trailing 4-byte move that overlaps the last quadword
   by 2 bytes.  Every load is issued before the first store.  */
	.p2align 4
L(write_26bytes):
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2082
/* Copy the 18 bytes ending at %rsi to the 18 bytes ending at %rdi:
   two quadwords and a trailing 4-byte move that overlaps the last quadword
   by 2 bytes.  Every load is issued before the first store.  */
	.p2align 4
L(write_18bytes):
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2092
/* Copy the 10 bytes ending at %rsi to the 10 bytes ending at %rdi:
   one quadword and a trailing 4-byte move that overlaps it by 2 bytes.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_10bytes):
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2100
/* Copy the 2 bytes ending at %rsi to the 2 bytes ending at %rdi with a
   single 16-bit move.  */
	.p2align 4
L(write_2bytes):
	movw	-2(%rsi), %dx
	movw	%dx, -2(%rdi)
	ret
2106
/* Copy the 75 bytes ending at %rsi to the 75 bytes ending at %rdi:
   two 16-byte vectors, five quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 1 byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_75bytes):
	movdqu	-75(%rsi), %xmm0
	movdqu	-59(%rsi), %xmm1
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -75(%rdi)
	movdqu	%xmm1, -59(%rdi)
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2126
/* Copy the 67 bytes ending at %rsi to the 67 bytes ending at %rdi.
   Layout (offsets relative to the end pointers):
     %xmm0 [-67,-51)  %xmm1 [-51,-35)
     %r9   [-35,-27)  %r10  [-27,-19)  %r11 [-19,-11)  %rcx [-11,-3)
     %edx  [-4,0)     (overlaps %rcx by 1 byte)
   The second vector starts right where the first ends, so the two vectors
   do not overlap and no quadword is needed for [-43,-35).
   Every load is issued before the first store.  */
	.p2align 4
L(write_67bytes):
	movdqu	-67(%rsi), %xmm0
	movdqu	-51(%rsi), %xmm1
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -67(%rdi)
	movdqu	%xmm1, -51(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2146
/* Copy the 59 bytes ending at %rsi to the 59 bytes ending at %rdi:
   one 16-byte vector, five quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 1 byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_59bytes):
	movdqu	-59(%rsi), %xmm0
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -59(%rdi)
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2164
/* Copy the 51 bytes ending at %rsi to the 51 bytes ending at %rdi:
   one 16-byte vector, four quadwords, and a trailing 4-byte move that
   overlaps the last quadword by 1 byte.
   Every load is issued before the first store.  */
	.p2align 4
L(write_51bytes):
	movdqu	-51(%rsi), %xmm0
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -51(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2180
/* Copy the 43 bytes ending at %rsi to the 43 bytes ending at %rdi:
   five quadwords and a trailing 4-byte move that overlaps the last quadword
   by 1 byte.  Every load is issued before the first store.  */
	.p2align 4
L(write_43bytes):
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2196
/* Copy the 35 bytes ending at %rsi to the 35 bytes ending at %rdi:
   four quadwords and a trailing 4-byte move that overlaps the last quadword
   by 1 byte.  Every load is issued before the first store.  */
	.p2align 4
L(write_35bytes):
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2210
/* Copy the 27 bytes ending at %rsi to the 27 bytes ending at %rdi:
   three quadwords and a trailing 4-byte move that overlaps the last quadword
   by 1 byte.  Every load is issued before the first store.  */
	.p2align 4
L(write_27bytes):
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2222
/* Copy the 19 bytes ending at %rsi to the 19 bytes ending at %rdi:
   two quadwords and a trailing 4-byte move that overlaps the last quadword
   by 1 byte.  Every load is issued before the first store.  */
	.p2align 4
L(write_19bytes):
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2232
/* Copy the 11 bytes ending at %rsi to the 11 bytes ending at %rdi:
   one quadword and a trailing 4-byte move that overlaps it by 1 byte.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_11bytes):
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2240
/* Copy the 3 bytes ending at %rsi to the 3 bytes ending at %rdi using two
   16-bit moves that share the middle byte.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_3bytes):
	movw	-3(%rsi), %dx
	movw	-2(%rsi), %cx
	movw	%dx, -3(%rdi)
	movw	%cx, -2(%rdi)
	ret
2248
/* Copy the 76 bytes ending at %rsi to the 76 bytes ending at %rdi:
   two 16-byte vectors, five quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_76bytes):
	movdqu	-76(%rsi), %xmm0
	movdqu	-60(%rsi), %xmm1
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -76(%rdi)
	movdqu	%xmm1, -60(%rdi)
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2268
/* Copy the 68 bytes ending at %rsi to the 68 bytes ending at %rdi:
   two 16-byte vectors, four quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_68bytes):
	movdqu	-68(%rsi), %xmm0
	movdqu	-52(%rsi), %xmm1
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -68(%rdi)
	movdqu	%xmm1, -52(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2286
/* Copy the 60 bytes ending at %rsi to the 60 bytes ending at %rdi:
   one 16-byte vector, five quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_60bytes):
	movdqu	-60(%rsi), %xmm0
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -60(%rdi)
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2304
/* Copy the 52 bytes ending at %rsi to the 52 bytes ending at %rdi:
   one 16-byte vector, four quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_52bytes):
	movdqu	-52(%rsi), %xmm0
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -52(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2320
/* Copy the 44 bytes ending at %rsi to the 44 bytes ending at %rdi:
   five quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_44bytes):
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2336
/* Copy the 36 bytes ending at %rsi to the 36 bytes ending at %rdi:
   four quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_36bytes):
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2350
/* Copy the 28 bytes ending at %rsi to the 28 bytes ending at %rdi:
   three quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_28bytes):
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2362
/* Copy the 20 bytes ending at %rsi to the 20 bytes ending at %rdi:
   two quadwords and a final doubleword, no overlap.
   Every load is issued before the first store.  */
	.p2align 4
L(write_20bytes):
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2372
/* Copy the 12 bytes ending at %rsi to the 12 bytes ending at %rdi:
   one quadword and a final doubleword, no overlap.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_12bytes):
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2380
/* Copy the 4 bytes ending at %rsi to the 4 bytes ending at %rdi with a
   single 32-bit move.  */
	.p2align 4
L(write_4bytes):
	movl	-4(%rsi), %edx
	movl	%edx, -4(%rdi)
	ret
2386
/* Copy the 77 bytes ending at %rsi to the 77 bytes ending at %rdi:
   two 16-byte vectors and six quadwords, the last of which overlaps the
   one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_77bytes):
	movdqu	-77(%rsi), %xmm0
	movdqu	-61(%rsi), %xmm1
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -77(%rdi)
	movdqu	%xmm1, -61(%rdi)
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2406
/* Copy the 69 bytes ending at %rsi to the 69 bytes ending at %rdi:
   two 16-byte vectors and five quadwords, the last of which overlaps the
   one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_69bytes):
	movdqu	-69(%rsi), %xmm0
	movdqu	-53(%rsi), %xmm1
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -69(%rdi)
	movdqu	%xmm1, -53(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2424
/* Copy the 61 bytes ending at %rsi to the 61 bytes ending at %rdi:
   one 16-byte vector and six quadwords, the last of which overlaps the
   one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_61bytes):
	movdqu	-61(%rsi), %xmm0
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -61(%rdi)
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2442
/* Copy the 53 bytes ending at %rsi to the 53 bytes ending at %rdi.
   Layout (offsets relative to the end pointers):
     %xmm0 [-53,-37)
     %r9   [-37,-29)  %r10 [-29,-21)  %r11 [-21,-13)  %rcx [-13,-5)
     %rdx  [-8,0)     (overlaps %rcx by 3 bytes)
   No quadword is loaded for [-45,-37): %xmm0 already covers it, and a value
   loaded there would never be stored.
   Every load is issued before the first store.  */
	.p2align 4
L(write_53bytes):
	movdqu	-53(%rsi), %xmm0
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -53(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2459
/* Copy the 45 bytes ending at %rsi to the 45 bytes ending at %rdi as six
   quadwords, the last of which overlaps the one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_45bytes):
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2475
/* Copy the 37 bytes ending at %rsi to the 37 bytes ending at %rdi as five
   quadwords, the last of which overlaps the one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_37bytes):
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2489
/* Copy the 29 bytes ending at %rsi to the 29 bytes ending at %rdi as four
   quadwords, the last of which overlaps the one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_29bytes):
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2501
/* Copy the 21 bytes ending at %rsi to the 21 bytes ending at %rdi as three
   quadwords, the last of which overlaps the one before it by 3 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_21bytes):
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2511
/* Copy the 13 bytes ending at %rsi to the 13 bytes ending at %rdi as two
   quadwords that overlap by 3 bytes.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_13bytes):
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2519
/* Copy the 5 bytes ending at %rsi to the 5 bytes ending at %rdi as two
   doublewords that overlap by 3 bytes.
   Both loads are issued before the first store.  */
	.p2align 4
L(write_5bytes):
	movl	-5(%rsi), %edx
	movl	-4(%rsi), %ecx
	movl	%edx, -5(%rdi)
	movl	%ecx, -4(%rdi)
	ret
2527
/* Copy the 78 bytes ending at %rsi to the 78 bytes ending at %rdi:
   two 16-byte vectors and six quadwords, the last of which overlaps the
   one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_78bytes):
	movdqu	-78(%rsi), %xmm0
	movdqu	-62(%rsi), %xmm1
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -78(%rdi)
	movdqu	%xmm1, -62(%rdi)
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2547
/* Copy the 70 bytes ending at %rsi to the 70 bytes ending at %rdi:
   two 16-byte vectors and five quadwords, the last of which overlaps the
   one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_70bytes):
	movdqu	-70(%rsi), %xmm0
	movdqu	-54(%rsi), %xmm1
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -70(%rdi)
	movdqu	%xmm1, -54(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2565
/* Copy the 62 bytes ending at %rsi to the 62 bytes ending at %rdi:
   one 16-byte vector and six quadwords, the last of which overlaps the
   one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_62bytes):
	movdqu	-62(%rsi), %xmm0
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -62(%rdi)
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2583
/* Copy the 54 bytes ending at %rsi to the 54 bytes ending at %rdi:
   one 16-byte vector and five quadwords, the last of which overlaps the
   one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_54bytes):
	movdqu	-54(%rsi), %xmm0
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -54(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2599
/* Copy the 46 bytes ending at %rsi to the 46 bytes ending at %rdi as six
   quadwords, the last of which overlaps the one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_46bytes):
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2615
/* Copy the 38 bytes ending at %rsi to the 38 bytes ending at %rdi as five
   quadwords, the last of which overlaps the one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_38bytes):
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2629
/* Copy the 30 bytes ending at %rsi to the 30 bytes ending at %rdi as four
   quadwords, the last of which overlaps the one before it by 2 bytes.
   Every load is issued before the first store.  */
	.p2align 4
L(write_30bytes):
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2641
/* Tail copy of exactly 22 bytes: moves the 22 bytes that end at %rsi to
   the 22 bytes that end at %rdi, then returns.  Three 8-byte pieces are
   used; the pieces at -14 and -8 overlap by two bytes.  Every load is
   issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %r11, %rcx and %rdx; %rax is not touched.  */
2642 .p2align 4
2643 L(write_22bytes):
2644 mov -22(%rsi), %r11
2645 mov -14(%rsi), %rcx
2646 mov -8(%rsi), %rdx
2647 mov %r11, -22(%rdi)
2648 mov %rcx, -14(%rdi)
2649 mov %rdx, -8(%rdi)
2650 ret
2651
/* Tail copy of exactly 14 bytes: moves the 14 bytes that end at %rsi to
   the 14 bytes that end at %rdi, then returns.  Two 8-byte pieces at -14
   and -8 are used; they overlap by two bytes.  Both loads are issued
   before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %rcx and %rdx; %rax is not touched.  */
2652 .p2align 4
2653 L(write_14bytes):
2654 mov -14(%rsi), %rcx
2655 mov -8(%rsi), %rdx
2656 mov %rcx, -14(%rdi)
2657 mov %rdx, -8(%rdi)
2658 ret
2659
/* Tail copy of exactly 6 bytes: moves the 6 bytes that end at %rsi to the
   6 bytes that end at %rdi, then returns.  Two 4-byte pieces at -6 and -4
   are used; they overlap by two bytes.  Both loads are issued before the
   first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %rdx and %rcx (32-bit writes clear their upper halves);
   %rax is not touched.  */
2660 .p2align 4
2661 L(write_6bytes):
2662 mov -6(%rsi), %edx
2663 mov -4(%rsi), %ecx
2664 mov %edx, -6(%rdi)
2665 mov %ecx, -4(%rdi)
2666 ret
2667
/* Tail copy of exactly 79 bytes: moves the 79 bytes that end at %rsi to
   the 79 bytes that end at %rdi, then returns.  Two 16-byte pieces and six
   8-byte pieces are used; the pieces at -15 and -8 overlap by one byte.
   Every load is issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %xmm0, %xmm1, %r8-%r11, %rcx and %rdx; %rax is not touched.  */
2668 .p2align 4
2669 L(write_79bytes):
2670 movdqu -79(%rsi), %xmm0
2671 movdqu -63(%rsi), %xmm1
2672 mov -47(%rsi), %r8
2673 mov -39(%rsi), %r9
2674 mov -31(%rsi), %r10
2675 mov -23(%rsi), %r11
2676 mov -15(%rsi), %rcx
2677 mov -8(%rsi), %rdx
2678 movdqu %xmm0, -79(%rdi)
2679 movdqu %xmm1, -63(%rdi)
2680 mov %r8, -47(%rdi)
2681 mov %r9, -39(%rdi)
2682 mov %r10, -31(%rdi)
2683 mov %r11, -23(%rdi)
2684 mov %rcx, -15(%rdi)
2685 mov %rdx, -8(%rdi)
2686 ret
2687
/* Tail copy of exactly 71 bytes: moves the 71 bytes that end at %rsi to
   the 71 bytes that end at %rdi, then returns.  Two 16-byte pieces and
   five 8-byte pieces are used; the pieces at -15 and -8 overlap by one
   byte.  Every load is issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %xmm0, %xmm1, %r9-%r11, %rcx and %rdx; %rax is not touched.  */
2688 .p2align 4
2689 L(write_71bytes):
2690 movdqu -71(%rsi), %xmm0
2691 movdqu -55(%rsi), %xmm1
2692 mov -39(%rsi), %r9
2693 mov -31(%rsi), %r10
2694 mov -23(%rsi), %r11
2695 mov -15(%rsi), %rcx
2696 mov -8(%rsi), %rdx
2697 movdqu %xmm0, -71(%rdi)
2698 movdqu %xmm1, -55(%rdi)
2699 mov %r9, -39(%rdi)
2700 mov %r10, -31(%rdi)
2701 mov %r11, -23(%rdi)
2702 mov %rcx, -15(%rdi)
2703 mov %rdx, -8(%rdi)
2704 ret
2705
/* Tail copy of exactly 63 bytes: moves the 63 bytes that end at %rsi to
   the 63 bytes that end at %rdi, then returns.  One 16-byte piece and six
   8-byte pieces are used; the pieces at -15 and -8 overlap by one byte.
   Every load is issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %xmm0, %r8-%r11, %rcx and %rdx; %rax is not touched.  */
2706 .p2align 4
2707 L(write_63bytes):
2708 movdqu -63(%rsi), %xmm0
2709 mov -47(%rsi), %r8
2710 mov -39(%rsi), %r9
2711 mov -31(%rsi), %r10
2712 mov -23(%rsi), %r11
2713 mov -15(%rsi), %rcx
2714 mov -8(%rsi), %rdx
2715 movdqu %xmm0, -63(%rdi)
2716 mov %r8, -47(%rdi)
2717 mov %r9, -39(%rdi)
2718 mov %r10, -31(%rdi)
2719 mov %r11, -23(%rdi)
2720 mov %rcx, -15(%rdi)
2721 mov %rdx, -8(%rdi)
2722 ret
2723
/* Tail copy of exactly 55 bytes: moves the 55 bytes that end at %rsi to
   the 55 bytes that end at %rdi, then returns.  One 16-byte piece and five
   8-byte pieces are used; the pieces at -15 and -8 overlap by one byte.
   Every load is issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %xmm0, %r9-%r11, %rcx and %rdx; %rax is not touched.  */
2724 .p2align 4
2725 L(write_55bytes):
2726 movdqu -55(%rsi), %xmm0
2727 mov -39(%rsi), %r9
2728 mov -31(%rsi), %r10
2729 mov -23(%rsi), %r11
2730 mov -15(%rsi), %rcx
2731 mov -8(%rsi), %rdx
2732 movdqu %xmm0, -55(%rdi)
2733 mov %r9, -39(%rdi)
2734 mov %r10, -31(%rdi)
2735 mov %r11, -23(%rdi)
2736 mov %rcx, -15(%rdi)
2737 mov %rdx, -8(%rdi)
2738 ret
2739
/* Tail copy of exactly 47 bytes: moves the 47 bytes that end at %rsi to
   the 47 bytes that end at %rdi, then returns.  Six 8-byte pieces are
   used; the pieces at -15 and -8 overlap by one byte.  Every load is
   issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %r8-%r11, %rcx and %rdx; %rax is not touched.  */
2740 .p2align 4
2741 L(write_47bytes):
2742 mov -47(%rsi), %r8
2743 mov -39(%rsi), %r9
2744 mov -31(%rsi), %r10
2745 mov -23(%rsi), %r11
2746 mov -15(%rsi), %rcx
2747 mov -8(%rsi), %rdx
2748 mov %r8, -47(%rdi)
2749 mov %r9, -39(%rdi)
2750 mov %r10, -31(%rdi)
2751 mov %r11, -23(%rdi)
2752 mov %rcx, -15(%rdi)
2753 mov %rdx, -8(%rdi)
2754 ret
2755
/* Tail copy of exactly 39 bytes: moves the 39 bytes that end at %rsi to
   the 39 bytes that end at %rdi, then returns.  Five 8-byte pieces are
   used; the pieces at -15 and -8 overlap by one byte.  Every load is
   issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %r9-%r11, %rcx and %rdx; %rax is not touched.  */
2756 .p2align 4
2757 L(write_39bytes):
2758 mov -39(%rsi), %r9
2759 mov -31(%rsi), %r10
2760 mov -23(%rsi), %r11
2761 mov -15(%rsi), %rcx
2762 mov -8(%rsi), %rdx
2763 mov %r9, -39(%rdi)
2764 mov %r10, -31(%rdi)
2765 mov %r11, -23(%rdi)
2766 mov %rcx, -15(%rdi)
2767 mov %rdx, -8(%rdi)
2768 ret
2769
/* Tail copy of exactly 31 bytes: moves the 31 bytes that end at %rsi to
   the 31 bytes that end at %rdi, then returns.  Four 8-byte pieces are
   used; the pieces at -15 and -8 overlap by one byte.  Every load is
   issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %r10, %r11, %rcx and %rdx; %rax is not touched.  */
2770 .p2align 4
2771 L(write_31bytes):
2772 mov -31(%rsi), %r10
2773 mov -23(%rsi), %r11
2774 mov -15(%rsi), %rcx
2775 mov -8(%rsi), %rdx
2776 mov %r10, -31(%rdi)
2777 mov %r11, -23(%rdi)
2778 mov %rcx, -15(%rdi)
2779 mov %rdx, -8(%rdi)
2780 ret
2781
/* Tail copy of exactly 23 bytes: moves the 23 bytes that end at %rsi to
   the 23 bytes that end at %rdi, then returns.  Three 8-byte pieces are
   used; the pieces at -15 and -8 overlap by one byte.  Every load is
   issued before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %r11, %rcx and %rdx; %rax is not touched.  */
2782 .p2align 4
2783 L(write_23bytes):
2784 mov -23(%rsi), %r11
2785 mov -15(%rsi), %rcx
2786 mov -8(%rsi), %rdx
2787 mov %r11, -23(%rdi)
2788 mov %rcx, -15(%rdi)
2789 mov %rdx, -8(%rdi)
2790 ret
2791
/* Tail copy of exactly 15 bytes: moves the 15 bytes that end at %rsi to
   the 15 bytes that end at %rdi, then returns.  Two 8-byte pieces at -15
   and -8 are used; they overlap by one byte.  Both loads are issued
   before the first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %rcx and %rdx; %rax is not touched.  */
2792 .p2align 4
2793 L(write_15bytes):
2794 mov -15(%rsi), %rcx
2795 mov -8(%rsi), %rdx
2796 mov %rcx, -15(%rdi)
2797 mov %rdx, -8(%rdi)
2798 ret
2799
/* Tail copy of exactly 7 bytes: moves the 7 bytes that end at %rsi to the
   7 bytes that end at %rdi, then returns.  Two 4-byte pieces at -7 and -4
   are used; they overlap by one byte.  Both loads are issued before the
   first store.
   NOTE(review): presumably so the routine stays correct for overlapping
   buffers in the memmove build -- confirm.
   Overwrites %rdx and %rcx (32-bit writes clear their upper halves);
   %rax is not touched.  */
2800 .p2align 4
2801 L(write_7bytes):
2802 mov -7(%rsi), %edx
2803 mov -4(%rsi), %ecx
2804 mov %edx, -7(%rdi)
2805 mov %ecx, -4(%rdi)
2806 ret
2807
/* Forward copy path that writes the destination with non-temporal
   (movntdq) stores, 128 bytes per loop iteration.
   As read from this block: %rsi / %rdi are the current source /
   destination, %rdx is the number of bytes still to copy, and %xmm0
   holds 16 bytes that are stored at (%r8).
   NOTE(review): %xmm0 / %r8 look like the saved leading 16 source bytes
   and the original destination, and %rdi is presumably 16-byte aligned
   here (movntdq requires an aligned address) -- both are set up before
   this label; confirm against the code that jumps here.  */
2808 .p2align 4
2809 L(large_page_fwd):
2810 movdqu (%rsi), %xmm1
2811 lea 16(%rsi), %rsi
2812 movdqu %xmm0, (%r8)
2813 movntdq %xmm1, (%rdi)
2814 lea 16(%rdi), %rdi
/* 0x90 = the 16 bytes just copied + a 0x80 bias.  From here on %rdx is
   kept 0x80 below the true remaining count, so the subtraction in the
   loop borrows as soon as fewer than 128 bytes are left.  */
2815 lea -0x90(%rdx), %rdx
2816 #ifdef USE_AS_MEMMOVE
/* %r9 = source - destination.  If that distance (unsigned) is at least
   the biased count, use the non-temporal loop below.  Otherwise %rcx is
   multiplied by 4 and, when the biased count is below it, the copy is
   done by L(ll_cache_copy_fwd_start) with ordinary stores.
   NOTE(review): %rcx is loaded before this label, presumably with a
   cache-size threshold -- confirm.  The backward path compares %r9 with
   the unshifted %rcx instead; whether that asymmetry is intended cannot
   be told from here.  */
2817 mov %rsi, %r9
2818 sub %rdi, %r9
2819 cmp %rdx, %r9
2820 jae L(memmove_is_memcpy_fwd)
2821 shl $2, %rcx
2822 cmp %rcx, %rdx
2823 jb L(ll_cache_copy_fwd_start)
2824 L(memmove_is_memcpy_fwd):
2825 #endif
/* Main loop: eight unaligned 16-byte loads, then eight non-temporal
   stores.  */
2826 L(large_page_loop):
2827 movdqu (%rsi), %xmm0
2828 movdqu 0x10(%rsi), %xmm1
2829 movdqu 0x20(%rsi), %xmm2
2830 movdqu 0x30(%rsi), %xmm3
2831 movdqu 0x40(%rsi), %xmm4
2832 movdqu 0x50(%rsi), %xmm5
2833 movdqu 0x60(%rsi), %xmm6
2834 movdqu 0x70(%rsi), %xmm7
2835 lea 0x80(%rsi), %rsi
2836
/* The flags set by this sub are consumed by the jae at the bottom of the
   loop; the movntdq and lea instructions in between do not modify
   flags.  */
2837 sub $0x80, %rdx
2838 movntdq %xmm0, (%rdi)
2839 movntdq %xmm1, 0x10(%rdi)
2840 movntdq %xmm2, 0x20(%rdi)
2841 movntdq %xmm3, 0x30(%rdi)
2842 movntdq %xmm4, 0x40(%rdi)
2843 movntdq %xmm5, 0x50(%rdi)
2844 movntdq %xmm6, 0x60(%rdi)
2845 movntdq %xmm7, 0x70(%rdi)
2846 lea 0x80(%rdi), %rdi
2847 jae L(large_page_loop)
/* Loop exit: %rdx is in [-0x80, -1].  The cmp tests whether fewer than
   64 bytes remain; the lea removes the bias (%rdx = true remaining count)
   without disturbing the flags the jl reads.  */
2848 cmp $-0x40, %rdx
2849 lea 0x80(%rdx), %rdx
2850 jl L(large_page_less_64bytes)
2851
/* At least 64 bytes remain: copy one more 64-byte block.  */
2852 movdqu (%rsi), %xmm0
2853 movdqu 0x10(%rsi), %xmm1
2854 movdqu 0x20(%rsi), %xmm2
2855 movdqu 0x30(%rsi), %xmm3
2856 lea 0x40(%rsi), %rsi
2857
2858 movntdq %xmm0, (%rdi)
2859 movntdq %xmm1, 0x10(%rdi)
2860 movntdq %xmm2, 0x20(%rdi)
2861 movntdq %xmm3, 0x30(%rdi)
2862 lea 0x40(%rdi), %rdi
2863 sub $0x40, %rdx
/* Fewer than 64 bytes remain.  Advance both pointers to the end of the
   buffers (the tail routines address backwards from %rsi / %rdi), fence
   the non-temporal stores, then dispatch on the residual count.  The
   macro overwrites %r11 and %rdx.  */
2864 L(large_page_less_64bytes):
2865 add %rdx, %rsi
2866 add %rdx, %rdi
2867 sfence
2868 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2869
2870 #ifdef USE_AS_MEMMOVE
/* memmove-only forward loop using ordinary aligned stores (movaps)
   instead of non-temporal ones, with software prefetch of the source
   0x1c0 and 0x200 bytes ahead.  Reached from L(large_page_fwd); %rdx
   carries the same 0x80 bias as there (remaining count minus 0x80).
   All eight loads of an iteration are issued before its first store.
   NOTE(review): movaps requires %rdi to be 16-byte aligned; that is
   established before this chunk -- confirm.  */
2871 .p2align 4
2872 L(ll_cache_copy_fwd_start):
2873 prefetcht0 0x1c0(%rsi)
2874 prefetcht0 0x200(%rsi)
2875 movdqu (%rsi), %xmm0
2876 movdqu 0x10(%rsi), %xmm1
2877 movdqu 0x20(%rsi), %xmm2
2878 movdqu 0x30(%rsi), %xmm3
2879 movdqu 0x40(%rsi), %xmm4
2880 movdqu 0x50(%rsi), %xmm5
2881 movdqu 0x60(%rsi), %xmm6
2882 movdqu 0x70(%rsi), %xmm7
2883 lea 0x80(%rsi), %rsi
2884
/* Flags from this sub are consumed by the jae below; movaps and lea do
   not modify flags.  */
2885 sub $0x80, %rdx
2886 movaps %xmm0, (%rdi)
2887 movaps %xmm1, 0x10(%rdi)
2888 movaps %xmm2, 0x20(%rdi)
2889 movaps %xmm3, 0x30(%rdi)
2890 movaps %xmm4, 0x40(%rdi)
2891 movaps %xmm5, 0x50(%rdi)
2892 movaps %xmm6, 0x60(%rdi)
2893 movaps %xmm7, 0x70(%rdi)
2894 lea 0x80(%rdi), %rdi
2895 jae L(ll_cache_copy_fwd_start)
/* Loop exit: test for fewer than 64 bytes left, then remove the bias
   with a flag-preserving lea.  */
2896 cmp $-0x40, %rdx
2897 lea 0x80(%rdx), %rdx
2898 jl L(large_page_ll_less_fwd_64bytes)
2899
/* At least 64 bytes remain: copy one more 64-byte block.  */
2900 movdqu (%rsi), %xmm0
2901 movdqu 0x10(%rsi), %xmm1
2902 movdqu 0x20(%rsi), %xmm2
2903 movdqu 0x30(%rsi), %xmm3
2904 lea 0x40(%rsi), %rsi
2905
2906 movaps %xmm0, (%rdi)
2907 movaps %xmm1, 0x10(%rdi)
2908 movaps %xmm2, 0x20(%rdi)
2909 movaps %xmm3, 0x30(%rdi)
2910 lea 0x40(%rdi), %rdi
2911 sub $0x40, %rdx
/* Fewer than 64 bytes remain: point %rsi / %rdi at the end of the
   buffers and dispatch on the residual count.  No sfence is issued on
   this path, which uses no non-temporal stores.  */
2912 L(large_page_ll_less_fwd_64bytes):
2913 add %rdx, %rsi
2914 add %rdx, %rdi
2915 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2916
2917 #endif
/* Backward (high-to-low address) counterpart of L(large_page_fwd): copies
   128 bytes per iteration below %rsi to below %rdi with non-temporal
   stores.  As read from this block: %rsi / %rdi point just past the
   bytes still to copy, %rdx is the number of bytes remaining, and %xmm0
   holds 16 bytes that are stored at (%r8).
   NOTE(review): %xmm0 / %r8 are set up before this label (they look like
   a saved 16-byte edge of the buffer and its destination address), and
   %rdi is presumably 16-byte aligned (movdqa / movntdq require it) --
   confirm against the code that jumps here.  */
2918 .p2align 4
2919 L(large_page_bwd):
2920 movdqu -0x10(%rsi), %xmm1
2921 lea -16(%rsi), %rsi
2922 movdqu %xmm0, (%r8)
2923 movdqa %xmm1, -0x10(%rdi)
2924 lea -16(%rdi), %rdi
/* 0x90 = the 16 bytes just copied + a 0x80 bias, so that the sub in the
   loop borrows as soon as fewer than 128 bytes are left.  */
2925 lea -0x90(%rdx), %rdx
2926 #ifdef USE_AS_MEMMOVE
/* %r9 = destination - source.  If that distance (unsigned) is at least
   the biased count, use the non-temporal loop below; otherwise, when it
   is below %rcx, use L(ll_cache_copy_bwd_start) with ordinary stores.
   NOTE(review): %rcx is loaded before this label, presumably with a
   cache-size threshold -- confirm.  The forward path shifts %rcx left by
   2 and compares it with %rdx rather than %r9; whether that asymmetry is
   intended cannot be told from here.  */
2927 mov %rdi, %r9
2928 sub %rsi, %r9
2929 cmp %rdx, %r9
2930 jae L(memmove_is_memcpy_bwd)
2931 cmp %rcx, %r9
2932 jb L(ll_cache_copy_bwd_start)
2933 L(memmove_is_memcpy_bwd):
2934 #endif
/* Main loop: eight unaligned 16-byte loads from below %rsi, then eight
   non-temporal stores below %rdi.  */
2935 L(large_page_bwd_loop):
2936 movdqu -0x10(%rsi), %xmm0
2937 movdqu -0x20(%rsi), %xmm1
2938 movdqu -0x30(%rsi), %xmm2
2939 movdqu -0x40(%rsi), %xmm3
2940 movdqu -0x50(%rsi), %xmm4
2941 movdqu -0x60(%rsi), %xmm5
2942 movdqu -0x70(%rsi), %xmm6
2943 movdqu -0x80(%rsi), %xmm7
2944 lea -0x80(%rsi), %rsi
2945
/* Flags from this sub are consumed by the jae below; movntdq and lea do
   not modify flags.  */
2946 sub $0x80, %rdx
2947 movntdq %xmm0, -0x10(%rdi)
2948 movntdq %xmm1, -0x20(%rdi)
2949 movntdq %xmm2, -0x30(%rdi)
2950 movntdq %xmm3, -0x40(%rdi)
2951 movntdq %xmm4, -0x50(%rdi)
2952 movntdq %xmm5, -0x60(%rdi)
2953 movntdq %xmm6, -0x70(%rdi)
2954 movntdq %xmm7, -0x80(%rdi)
2955 lea -0x80(%rdi), %rdi
2956 jae L(large_page_bwd_loop)
/* Loop exit: test for fewer than 64 bytes left, then remove the bias
   with a flag-preserving lea.  */
2957 cmp $-0x40, %rdx
2958 lea 0x80(%rdx), %rdx
2959 jl L(large_page_less_bwd_64bytes)
2960
/* At least 64 bytes remain: copy one more 64-byte block.  */
2961 movdqu -0x10(%rsi), %xmm0
2962 movdqu -0x20(%rsi), %xmm1
2963 movdqu -0x30(%rsi), %xmm2
2964 movdqu -0x40(%rsi), %xmm3
2965 lea -0x40(%rsi), %rsi
2966
2967 movntdq %xmm0, -0x10(%rdi)
2968 movntdq %xmm1, -0x20(%rdi)
2969 movntdq %xmm2, -0x30(%rdi)
2970 movntdq %xmm3, -0x40(%rdi)
2971 lea -0x40(%rdi), %rdi
2972 sub $0x40, %rdx
/* Fewer than 64 bytes remain.  %rsi / %rdi already point just past the
   uncopied bytes, which is what the tail routines expect, so no pointer
   adjustment is made.  Fence the non-temporal stores, then dispatch on
   the residual count; the macro overwrites %r11 and %rdx.  */
2973 L(large_page_less_bwd_64bytes):
2974 sfence
2975 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2976
2977 #ifdef USE_AS_MEMMOVE
/* memmove-only backward loop using ordinary aligned stores (movaps)
   instead of non-temporal ones, with software prefetch of the source
   0x1c0 and 0x200 bytes below %rsi.  Reached from L(large_page_bwd);
   %rdx carries the same 0x80 bias as there (remaining count minus 0x80).
   All eight loads of an iteration are issued before its first store.
   NOTE(review): movaps requires %rdi to be 16-byte aligned; that is
   established before this chunk -- confirm.  */
2978 .p2align 4
2979 L(ll_cache_copy_bwd_start):
2980 prefetcht0 -0x1c0(%rsi)
2981 prefetcht0 -0x200(%rsi)
2982 movdqu -0x10(%rsi), %xmm0
2983 movdqu -0x20(%rsi), %xmm1
2984 movdqu -0x30(%rsi), %xmm2
2985 movdqu -0x40(%rsi), %xmm3
2986 movdqu -0x50(%rsi), %xmm4
2987 movdqu -0x60(%rsi), %xmm5
2988 movdqu -0x70(%rsi), %xmm6
2989 movdqu -0x80(%rsi), %xmm7
2990 lea -0x80(%rsi), %rsi
2991
/* Flags from this sub are consumed by the jae below; movaps and lea do
   not modify flags.  */
2992 sub $0x80, %rdx
2993 movaps %xmm0, -0x10(%rdi)
2994 movaps %xmm1, -0x20(%rdi)
2995 movaps %xmm2, -0x30(%rdi)
2996 movaps %xmm3, -0x40(%rdi)
2997 movaps %xmm4, -0x50(%rdi)
2998 movaps %xmm5, -0x60(%rdi)
2999 movaps %xmm6, -0x70(%rdi)
3000 movaps %xmm7, -0x80(%rdi)
3001 lea -0x80(%rdi), %rdi
3002 jae L(ll_cache_copy_bwd_start)
/* Loop exit: test for fewer than 64 bytes left, then remove the bias
   with a flag-preserving lea.  */
3003 cmp $-0x40, %rdx
3004 lea 0x80(%rdx), %rdx
3005 jl L(large_page_ll_less_bwd_64bytes)
3006
/* At least 64 bytes remain: copy one more 64-byte block.  */
3007 movdqu -0x10(%rsi), %xmm0
3008 movdqu -0x20(%rsi), %xmm1
3009 movdqu -0x30(%rsi), %xmm2
3010 movdqu -0x40(%rsi), %xmm3
3011 lea -0x40(%rsi), %rsi
3012
3013 movaps %xmm0, -0x10(%rdi)
3014 movaps %xmm1, -0x20(%rdi)
3015 movaps %xmm2, -0x30(%rdi)
3016 movaps %xmm3, -0x40(%rdi)
3017 lea -0x40(%rdi), %rdi
3018 sub $0x40, %rdx
/* Fewer than 64 bytes remain: dispatch on the residual count.  %rsi and
   %rdi are used as they are, and no sfence is issued on this path, which
   uses no non-temporal stores.  */
3019 L(large_page_ll_less_bwd_64bytes):
3020 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
3021 #endif
3022
3023 END (MEMCPY)
3024
/* Jump table for the tail copies of 0..79 bytes, placed in read-only data
   and aligned to 8 bytes.  Entry N is a 32-bit value: the address of
   L(write_Nbytes) minus the address of the table itself (JMPTBL (I, B)
   expands to I - B).  BRANCH_TO_JMPTBL_ENTRY reads an entry with scale 4,
   sign-extends it and adds the table address back to obtain the jump
   target, so the entries must stay in ascending byte-count order.  */
3025 .section .rodata.ssse3,"a",@progbits
3026 .p2align 3
3027 L(table_less_80bytes):
3028 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3029 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3030 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3031 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
3108
/* Jump table with sixteen 32-bit entries, L(shl_0) .. L(shl_15), each
   stored as an offset from the start of the table (JMPTBL (I, B) expands
   to I - B).  Aligned to 8 bytes.
   NOTE(review): the dispatch site is outside this chunk; the index is
   presumably a 0..15 misalignment between source and destination for
   the forward copy -- confirm.  */
3109 .p2align 3
3110 L(shl_table):
3111 .int JMPTBL (L(shl_0), L(shl_table))
3112 .int JMPTBL (L(shl_1), L(shl_table))
3113 .int JMPTBL (L(shl_2), L(shl_table))
3114 .int JMPTBL (L(shl_3), L(shl_table))
3115 .int JMPTBL (L(shl_4), L(shl_table))
3116 .int JMPTBL (L(shl_5), L(shl_table))
3117 .int JMPTBL (L(shl_6), L(shl_table))
3118 .int JMPTBL (L(shl_7), L(shl_table))
3119 .int JMPTBL (L(shl_8), L(shl_table))
3120 .int JMPTBL (L(shl_9), L(shl_table))
3121 .int JMPTBL (L(shl_10), L(shl_table))
3122 .int JMPTBL (L(shl_11), L(shl_table))
3123 .int JMPTBL (L(shl_12), L(shl_table))
3124 .int JMPTBL (L(shl_13), L(shl_table))
3125 .int JMPTBL (L(shl_14), L(shl_table))
3126 .int JMPTBL (L(shl_15), L(shl_table))
3127
/* Jump table with sixteen 32-bit entries, L(shl_0_bwd) .. L(shl_15_bwd),
   each stored as an offset from the start of the table (JMPTBL (I, B)
   expands to I - B).  Aligned to 8 bytes.
   NOTE(review): the dispatch site is outside this chunk; the index is
   presumably a 0..15 misalignment between source and destination for
   the backward copy -- confirm.  */
3128 .p2align 3
3129 L(shl_table_bwd):
3130 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3131 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3132 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3133 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3134 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3135 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3146
3147 #endif