]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/multiarch/memcpy-ssse3.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / memcpy-ssse3.S
CommitLineData
6fb8cbcb 1/* memcpy with SSSE3
bfff8b1b 2 Copyright (C) 2010-2017 Free Software Foundation, Inc.
6fb8cbcb
L
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
6fb8cbcb
L
19
20#include <sysdep.h>
21
4f41c682 22#if IS_IN (libc) \
6fb8cbcb
L
23 && (defined SHARED \
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
26
27#include "asm-syntax.h"
28
29#ifndef MEMCPY
30# define MEMCPY __memcpy_ssse3
31# define MEMCPY_CHK __memcpy_chk_ssse3
c365e615
L
32# define MEMPCPY __mempcpy_ssse3
33# define MEMPCPY_CHK __mempcpy_chk_ssse3
6fb8cbcb
L
34#endif
35
6fb8cbcb
L
/* Jump-table entry: the offset of label I relative to base label B.  */
36#define JMPTBL(I, B) I - B
37
38/* Branch to an entry in a jump table. TABLE is a jump table with
39 relative offsets. INDEX is a register that contains the index into the
40 jump table. SCALE is the scale of INDEX. */
/* The selected entry is loaded as a sign-extended 32-bit offset and added
   to the address of TABLE.  Clobbers %r11 (address of TABLE) and
   overwrites INDEX with the absolute address that is jumped to.
   The ud2 after the jump is never reached by normal control flow;
   NOTE(review): presumably it keeps the CPU from decoding/speculating
   into whatever follows the indirect jump -- confirm.  */
41#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
42 lea TABLE(%rip), %r11; \
43 movslq (%r11, INDEX, SCALE), INDEX; \
44 lea (%r11, INDEX), INDEX; \
45 jmp *INDEX; \
46 ud2
47
48 .section .text.ssse3,"ax",@progbits
c365e615
L
49#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* Checked mempcpy entry.  Jumps to __chk_fail when %rcx is (unsigned)
   below the length in %rdx; otherwise there is no ret/jmp here, so
   execution continues into MEMPCPY, which follows immediately.
   NOTE(review): %rcx is presumably the destination buffer size passed as
   the 4th argument -- confirm against the caller.  */
50ENTRY (MEMPCPY_CHK)
51 cmpq %rdx, %rcx
52 jb HIDDEN_JUMPTARGET (__chk_fail)
53END (MEMPCPY_CHK)
54
/* mempcpy entry: sets the return value %rax = %rdi + %rdx (destination
   plus length) and then shares the memcpy body starting at L(start).  */
55ENTRY (MEMPCPY)
56 movq %rdi, %rax
57 addq %rdx, %rax
58 jmp L(start)
59END (MEMPCPY)
60#endif
61
4c559bcd 62#if !defined USE_AS_BCOPY
6fb8cbcb
L
/* Checked entry.  Jumps to __chk_fail when %rcx is (unsigned) below the
   length in %rdx; otherwise there is no ret/jmp here, so execution
   continues into MEMCPY, which follows.
   NOTE(review): %rcx is presumably the destination buffer size passed as
   the 4th argument -- confirm against the caller.  */
63ENTRY (MEMCPY_CHK)
64 cmpq %rdx, %rcx
65 jb HIDDEN_JUMPTARGET (__chk_fail)
66END (MEMCPY_CHK)
67#endif
68
/* Main entry.  In: %rdi = destination, %rsi = source, %rdx = length.
   Out: %rax = destination (destination + length when built with
   USE_AS_MEMPCPY).  Lengths up to 79 are dispatched directly through
   L(table_less_80bytes); longer copies continue at L(80bytesormore).  */
69ENTRY (MEMCPY)
70 mov %rdi, %rax
71#ifdef USE_AS_MEMPCPY
72 add %rdx, %rax
73#endif
74
75#ifdef USE_AS_MEMMOVE
/* memmove build: destination below source -> forward copy; equal
   pointers -> L(write_0bytes) (defined outside this excerpt);
   destination above source -> lengths up to 79 still use the forward
   small-size path, longer ones are copied backward.  */
76 cmp %rsi, %rdi
77 jb L(copy_forward)
78 je L(write_0bytes)
79 cmp $79, %rdx
80 jbe L(copy_forward)
81 jmp L(copy_backward)
82L(copy_forward):
83#endif
c365e615 84L(start):
6fb8cbcb
L
/* The lea does not modify flags, so the ja below still tests the
   cmp $79 result.  */
85 cmp $79, %rdx
86 lea L(table_less_80bytes)(%rip), %r11
87 ja L(80bytesormore)
/* Small copy: fetch the table entry indexed by the length, move both
   pointers to the end of their buffers, and jump to table base + entry.  */
88 movslq (%r11, %rdx, 4), %r9
89 add %rdx, %rsi
90 add %rdx, %rdi
91 add %r11, %r9
92 jmp *%r9
93 ud2
94
e7044ea7 95 .p2align 4
6fb8cbcb
L
/* Forward set-up for copies of 80 bytes or more.  Saves the first 16
   source bytes in %xmm0 and the original destination in %r8, rounds the
   destination up to the next 16-byte boundary, and adjusts the source
   pointer and the length by the same number of skipped bytes.  */
96L(80bytesormore):
97#ifndef USE_AS_MEMMOVE
/* Non-memmove builds pick the direction from a signed compare of the low
   address bytes only: source low byte <= destination low byte goes
   backward.  NOTE(review): the rationale for this heuristic is not
   visible in this excerpt.  */
98 cmp %dil, %sil
99 jle L(copy_backward)
100#endif
101
102 movdqu (%rsi), %xmm0
103 mov %rdi, %rcx
104 and $-16, %rdi
105 add $16, %rdi
106 mov %rcx, %r8
/* %rcx = original destination - aligned destination (-16 .. -1).  */
107 sub %rdi, %rcx
108 add %rcx, %rdx
109 sub %rcx, %rsi
110
111#ifdef SHARED_CACHE_SIZE_HALF
9bc0b730 112 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
6fb8cbcb 113#else
afec409a 114 mov __x86_shared_cache_size_half(%rip), %RCX_LP
6fb8cbcb
L
115#endif
/* Remaining length above half the shared cache size -> L(large_page_fwd)
   (outside this excerpt).  The mov leaves the cmp flags intact.  */
116 cmp %rcx, %rdx
117 mov %rsi, %r9
118 ja L(large_page_fwd)
/* %r9 = source offset within a 16-byte line; zero means source and
   destination are now both aligned.  */
119 and $0xf, %r9
120 jz L(shl_0)
/* The jump targets receive half the data cache size in %rcx.  */
121#ifdef DATA_CACHE_SIZE_HALF
9bc0b730 122 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
6fb8cbcb 123#else
afec409a 124 mov __x86_data_cache_size_half(%rip), %RCX_LP
6fb8cbcb
L
125#endif
126 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
127
e7044ea7 128 .p2align 4
6fb8cbcb
L
/* Backward set-up.  Saves the last 16 source bytes in %xmm0 and the
   address of the last 16 destination bytes in %r8, moves both pointers
   to the end of their buffers, then rounds the destination end down to a
   16-byte boundary and adjusts the source pointer and length to match.  */
129L(copy_backward):
130 movdqu -16(%rsi, %rdx), %xmm0
131 add %rdx, %rsi
132 lea -16(%rdi, %rdx), %r8
133 add %rdx, %rdi
134
/* %rcx = low four bits of the destination end; the xor clears exactly
   those bits, i.e. aligns %rdi down.  */
135 mov %rdi, %rcx
136 and $0xf, %rcx
137 xor %rcx, %rdi
138 sub %rcx, %rdx
139 sub %rcx, %rsi
140
141#ifdef SHARED_CACHE_SIZE_HALF
9bc0b730 142 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
6fb8cbcb 143#else
afec409a 144 mov __x86_shared_cache_size_half(%rip), %RCX_LP
6fb8cbcb
L
145#endif
146
/* Remaining length above half the shared cache size -> L(large_page_bwd)
   (outside this excerpt).  The mov leaves the cmp flags intact.  */
147 cmp %rcx, %rdx
148 mov %rsi, %r9
149 ja L(large_page_bwd)
/* %r9 = source offset within a 16-byte line; zero means both ends are
   aligned.  */
150 and $0xf, %r9
151 jz L(shl_0_bwd)
/* The jump targets receive half the data cache size in %rcx.  */
152#ifdef DATA_CACHE_SIZE_HALF
9bc0b730 153 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
6fb8cbcb 154#else
afec409a 155 mov __x86_data_cache_size_half(%rip), %RCX_LP
6fb8cbcb
L
156#endif
157 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
158
e7044ea7 159 .p2align 4
6fb8cbcb
L
/* Forward copy with source and destination both 16-byte aligned.
   Copies one aligned 16-byte chunk, writes the saved unaligned head
   (%xmm0) to the original destination (%r8), and then either hands more
   than 128 remaining bytes to L(shl_0_gobble) or copies at most one
   64-byte group before dispatching the tail by length.  */
160L(shl_0):
161 sub $16, %rdx
162 movdqa (%rsi), %xmm1
163 add $16, %rsi
164 movdqa %xmm1, (%rdi)
165 add $16, %rdi
/* The movdqu does not modify flags, so ja tests the cmp $128 result.  */
166 cmp $128, %rdx
167 movdqu %xmm0, (%r8)
168 ja L(shl_0_gobble)
169 cmp $64, %rdx
170 jb L(shl_0_less_64bytes)
171 movaps (%rsi), %xmm4
172 movaps 16(%rsi), %xmm1
173 movaps 32(%rsi), %xmm2
174 movaps 48(%rsi), %xmm3
175 movaps %xmm4, (%rdi)
176 movaps %xmm1, 16(%rdi)
177 movaps %xmm2, 32(%rdi)
178 movaps %xmm3, 48(%rdi)
179 sub $64, %rdx
180 add $64, %rsi
181 add $64, %rdi
182L(shl_0_less_64bytes):
/* Move both pointers to the end of the buffers and dispatch on the
   number of bytes left, the same way L(start) does for small copies.  */
183 add %rdx, %rsi
184 add %rdx, %rdi
185 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
186
e7044ea7 187 .p2align 4
6fb8cbcb
L
/* Aligned forward bulk copy, 128 bytes per iteration.  Lengths at or
   above half the data cache size are handed to
   L(shl_0_gobble_mem_loop); smaller ones use the loop below.  */
188L(shl_0_gobble):
189#ifdef DATA_CACHE_SIZE_HALF
9bc0b730 190 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
6fb8cbcb 191#else
afec409a 192 cmp __x86_data_cache_size_half(%rip), %RDX_LP
6fb8cbcb
L
193#endif
/* lea biases the count by -128 without touching the cmp flags that the
   jae consumes.  */
194 lea -128(%rdx), %rdx
195 jae L(shl_0_gobble_mem_loop)
196L(shl_0_gobble_cache_loop):
197 movdqa (%rsi), %xmm4
198 movaps 0x10(%rsi), %xmm1
199 movaps 0x20(%rsi), %xmm2
200 movaps 0x30(%rsi), %xmm3
201
202 movdqa %xmm4, (%rdi)
203 movaps %xmm1, 0x10(%rdi)
204 movaps %xmm2, 0x20(%rdi)
205 movaps %xmm3, 0x30(%rdi)
206
/* Flags from this sub are consumed by the jae at the bottom of the loop;
   the moves and lea in between leave them unchanged.  */
207 sub $128, %rdx
208 movaps 0x40(%rsi), %xmm4
209 movaps 0x50(%rsi), %xmm5
210 movaps 0x60(%rsi), %xmm6
211 movaps 0x70(%rsi), %xmm7
212 lea 0x80(%rsi), %rsi
213 movaps %xmm4, 0x40(%rdi)
214 movaps %xmm5, 0x50(%rdi)
215 movaps %xmm6, 0x60(%rdi)
216 movaps %xmm7, 0x70(%rdi)
217 lea 0x80(%rdi), %rdi
218
219 jae L(shl_0_gobble_cache_loop)
/* Here %rdx is negative and %rdx + 128 is the byte count left.  The
   signed compare against -0x40 asks whether fewer than 64 bytes remain;
   the lea undoes the bias without disturbing those flags.  */
220 cmp $-0x40, %rdx
221 lea 0x80(%rdx), %rdx
222 jl L(shl_0_cache_less_64bytes)
223
224 movdqa (%rsi), %xmm4
225 sub $0x40, %rdx
226 movdqa 0x10(%rsi), %xmm1
227
228 movdqa %xmm4, (%rdi)
229 movdqa %xmm1, 0x10(%rdi)
230
231 movdqa 0x20(%rsi), %xmm4
232 movdqa 0x30(%rsi), %xmm1
233 add $0x40, %rsi
234
235 movdqa %xmm4, 0x20(%rdi)
236 movdqa %xmm1, 0x30(%rdi)
237 add $0x40, %rdi
238L(shl_0_cache_less_64bytes):
/* Tail: move both pointers to the end and dispatch on the bytes left.  */
239 add %rdx, %rsi
240 add %rdx, %rdi
241 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
242
e7044ea7 243 .p2align 4
6fb8cbcb
L
/* Aligned forward bulk copy for lengths at or above half the data cache
   size: 128 bytes per iteration with two prefetcht0 hints ahead of the
   source pointer.  %rdx arrives biased by -128 (see L(shl_0_gobble)).
   %xmm0 is reused as scratch here; the saved head it held was already
   written to (%r8) in L(shl_0).  */
244L(shl_0_gobble_mem_loop):
245 prefetcht0 0x1c0(%rsi)
246 prefetcht0 0x280(%rsi)
247
248 movdqa (%rsi), %xmm0
249 movdqa 0x10(%rsi), %xmm1
250 movdqa 0x20(%rsi), %xmm2
251 movdqa 0x30(%rsi), %xmm3
252 movdqa 0x40(%rsi), %xmm4
253 movdqa 0x50(%rsi), %xmm5
254 movdqa 0x60(%rsi), %xmm6
255 movdqa 0x70(%rsi), %xmm7
256 lea 0x80(%rsi), %rsi
/* Flags from this sub are consumed by the jae after the stores; the
   moves and lea in between leave them unchanged.  */
257 sub $0x80, %rdx
258 movdqa %xmm0, (%rdi)
259 movdqa %xmm1, 0x10(%rdi)
260 movdqa %xmm2, 0x20(%rdi)
261 movdqa %xmm3, 0x30(%rdi)
262 movdqa %xmm4, 0x40(%rdi)
263 movdqa %xmm5, 0x50(%rdi)
264 movdqa %xmm6, 0x60(%rdi)
265 movdqa %xmm7, 0x70(%rdi)
266 lea 0x80(%rdi), %rdi
267
268 jae L(shl_0_gobble_mem_loop)
/* %rdx + 128 is the byte count left; the signed compare against -0x40
   asks whether fewer than 64 remain, and the lea undoes the bias without
   disturbing those flags.  */
269 cmp $-0x40, %rdx
270 lea 0x80(%rdx), %rdx
271 jl L(shl_0_mem_less_64bytes)
272
273 movdqa (%rsi), %xmm0
274 sub $0x40, %rdx
275 movdqa 0x10(%rsi), %xmm1
276
277 movdqa %xmm0, (%rdi)
278 movdqa %xmm1, 0x10(%rdi)
279
280 movdqa 0x20(%rsi), %xmm0
281 movdqa 0x30(%rsi), %xmm1
282 add $0x40, %rsi
283
284 movdqa %xmm0, 0x20(%rdi)
285 movdqa %xmm1, 0x30(%rdi)
286 add $0x40, %rdi
287L(shl_0_mem_less_64bytes):
/* Copy one more 32-byte group if at least that much is left.  */
288 cmp $0x20, %rdx
289 jb L(shl_0_mem_less_32bytes)
290 movdqa (%rsi), %xmm0
291 sub $0x20, %rdx
292 movdqa 0x10(%rsi), %xmm1
293 add $0x20, %rsi
294 movdqa %xmm0, (%rdi)
295 movdqa %xmm1, 0x10(%rdi)
296 add $0x20, %rdi
297L(shl_0_mem_less_32bytes):
/* Tail: move both pointers to the end and dispatch on the bytes left.  */
298 add %rdx, %rdi
299 add %rdx, %rsi
300 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
301
e7044ea7 302 .p2align 4
6fb8cbcb
L
/* Backward copy with source and destination ends both 16-byte aligned.
   %rsi/%rdi point just past the bytes still to be copied and move down.
   Copies one aligned 16-byte chunk, writes the saved unaligned tail
   (%xmm0) to (%r8), and then either hands more than 128 remaining bytes
   to L(shl_0_gobble_bwd) or copies at most one 64-byte group before
   dispatching the rest by length.  */
303L(shl_0_bwd):
304 sub $16, %rdx
305 movdqa -0x10(%rsi), %xmm1
306 sub $16, %rsi
307 movdqa %xmm1, -0x10(%rdi)
308 sub $16, %rdi
/* The movdqu does not modify flags, so ja tests the cmp $0x80 result.  */
309 cmp $0x80, %rdx
310 movdqu %xmm0, (%r8)
311 ja L(shl_0_gobble_bwd)
312 cmp $64, %rdx
313 jb L(shl_0_less_64bytes_bwd)
314 movaps -0x10(%rsi), %xmm0
315 movaps -0x20(%rsi), %xmm1
316 movaps -0x30(%rsi), %xmm2
317 movaps -0x40(%rsi), %xmm3
318 movaps %xmm0, -0x10(%rdi)
319 movaps %xmm1, -0x20(%rdi)
320 movaps %xmm2, -0x30(%rdi)
321 movaps %xmm3, -0x40(%rdi)
322 sub $64, %rdx
323 sub $0x40, %rsi
324 sub $0x40, %rdi
325L(shl_0_less_64bytes_bwd):
/* No pointer adjustment is made here: %rsi/%rdi already sit at the end
   of the %rdx bytes that remain.  */
326 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
327
e7044ea7 328 .p2align 4
6fb8cbcb
L
/* Aligned backward bulk copy, 128 bytes per iteration.  Lengths at or
   above half the data cache size are handed to
   L(shl_0_gobble_mem_bwd_loop); smaller ones use the loop below.  */
329L(shl_0_gobble_bwd):
330#ifdef DATA_CACHE_SIZE_HALF
9bc0b730 331 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
6fb8cbcb 332#else
afec409a 333 cmp __x86_data_cache_size_half(%rip), %RDX_LP
6fb8cbcb
L
334#endif
/* lea biases the count by -128 without touching the cmp flags that the
   jae consumes.  */
335 lea -128(%rdx), %rdx
336 jae L(shl_0_gobble_mem_bwd_loop)
337L(shl_0_gobble_bwd_loop):
338 movdqa -0x10(%rsi), %xmm0
339 movaps -0x20(%rsi), %xmm1
340 movaps -0x30(%rsi), %xmm2
341 movaps -0x40(%rsi), %xmm3
342
343 movdqa %xmm0, -0x10(%rdi)
344 movaps %xmm1, -0x20(%rdi)
345 movaps %xmm2, -0x30(%rdi)
346 movaps %xmm3, -0x40(%rdi)
347
/* Flags from this sub are consumed by the jae at the bottom of the loop;
   the moves and lea in between leave them unchanged.  */
348 sub $0x80, %rdx
349 movaps -0x50(%rsi), %xmm4
350 movaps -0x60(%rsi), %xmm5
351 movaps -0x70(%rsi), %xmm6
352 movaps -0x80(%rsi), %xmm7
353 lea -0x80(%rsi), %rsi
354 movaps %xmm4, -0x50(%rdi)
355 movaps %xmm5, -0x60(%rdi)
356 movaps %xmm6, -0x70(%rdi)
357 movaps %xmm7, -0x80(%rdi)
358 lea -0x80(%rdi), %rdi
359
360 jae L(shl_0_gobble_bwd_loop)
/* %rdx + 128 is the byte count left; the signed compare against -0x40
   asks whether fewer than 64 remain, and the lea undoes the bias without
   disturbing those flags.  */
361 cmp $-0x40, %rdx
362 lea 0x80(%rdx), %rdx
363 jl L(shl_0_gobble_bwd_less_64bytes)
364
365 movdqa -0x10(%rsi), %xmm0
366 sub $0x40, %rdx
367 movdqa -0x20(%rsi), %xmm1
368
369 movdqa %xmm0, -0x10(%rdi)
370 movdqa %xmm1, -0x20(%rdi)
371
372 movdqa -0x30(%rsi), %xmm0
373 movdqa -0x40(%rsi), %xmm1
374 sub $0x40, %rsi
375
376 movdqa %xmm0, -0x30(%rdi)
377 movdqa %xmm1, -0x40(%rdi)
378 sub $0x40, %rdi
379L(shl_0_gobble_bwd_less_64bytes):
/* %rsi/%rdi already sit at the end of the %rdx bytes that remain.  */
380 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
381
e7044ea7 382 .p2align 4
6fb8cbcb
L
/* Aligned backward bulk copy for lengths at or above half the data cache
   size: 128 bytes per iteration with two prefetcht0 hints below the
   source pointer.  %rdx arrives biased by -128 (see
   L(shl_0_gobble_bwd)).  */
383L(shl_0_gobble_mem_bwd_loop):
384 prefetcht0 -0x1c0(%rsi)
385 prefetcht0 -0x280(%rsi)
386 movdqa -0x10(%rsi), %xmm0
387 movdqa -0x20(%rsi), %xmm1
388 movdqa -0x30(%rsi), %xmm2
389 movdqa -0x40(%rsi), %xmm3
390 movdqa -0x50(%rsi), %xmm4
391 movdqa -0x60(%rsi), %xmm5
392 movdqa -0x70(%rsi), %xmm6
393 movdqa -0x80(%rsi), %xmm7
394 lea -0x80(%rsi), %rsi
/* Flags from this sub are consumed by the jae after the stores; the
   moves and lea in between leave them unchanged.  */
395 sub $0x80, %rdx
396 movdqa %xmm0, -0x10(%rdi)
397 movdqa %xmm1, -0x20(%rdi)
398 movdqa %xmm2, -0x30(%rdi)
399 movdqa %xmm3, -0x40(%rdi)
400 movdqa %xmm4, -0x50(%rdi)
401 movdqa %xmm5, -0x60(%rdi)
402 movdqa %xmm6, -0x70(%rdi)
403 movdqa %xmm7, -0x80(%rdi)
404 lea -0x80(%rdi), %rdi
405
406 jae L(shl_0_gobble_mem_bwd_loop)
/* %rdx + 128 is the byte count left; the signed compare against -0x40
   asks whether fewer than 64 remain, and the lea undoes the bias without
   disturbing those flags.  */
407 cmp $-0x40, %rdx
408 lea 0x80(%rdx), %rdx
409 jl L(shl_0_mem_bwd_less_64bytes)
410
411 movdqa -0x10(%rsi), %xmm0
412 sub $0x40, %rdx
413 movdqa -0x20(%rsi), %xmm1
414
415 movdqa %xmm0, -0x10(%rdi)
416 movdqa %xmm1, -0x20(%rdi)
417
418 movdqa -0x30(%rsi), %xmm0
419 movdqa -0x40(%rsi), %xmm1
420 sub $0x40, %rsi
421
422 movdqa %xmm0, -0x30(%rdi)
423 movdqa %xmm1, -0x40(%rdi)
424 sub $0x40, %rdi
425L(shl_0_mem_bwd_less_64bytes):
/* Copy one more 32-byte group if at least that much is left.  */
426 cmp $0x20, %rdx
427 jb L(shl_0_mem_bwd_less_32bytes)
428 movdqa -0x10(%rsi), %xmm0
429 sub $0x20, %rdx
430 movdqa -0x20(%rsi), %xmm1
431 sub $0x20, %rsi
432 movdqa %xmm0, -0x10(%rdi)
433 movdqa %xmm1, -0x20(%rdi)
434 sub $0x20, %rdi
435L(shl_0_mem_bwd_less_32bytes):
/* %rsi/%rdi already sit at the end of the %rdx bytes that remain.  */
436 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
437
e7044ea7 438 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 1 byte past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_1), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
439L(shl_1):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
440 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
441 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the byte before %rsi; movaps
   leaves the cmp flags intact for the jb.  */
442 movaps -0x01(%rsi), %xmm1
443 jb L(L1_fwd)
444 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
445L(L1_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
446 lea -64(%rdx), %rdx
447 jmp *%r9
448 ud2
449L(shl_1_loop_L2):
450 prefetchnta 0x1c0(%rsi)
451L(shl_1_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
452 sub $64, %rdx
453 movaps 0x0f(%rsi), %xmm2
454 movaps 0x1f(%rsi), %xmm3
455 movaps 0x2f(%rsi), %xmm4
456 movaps 0x3f(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
457 movdqa %xmm5, %xmm6
/* Each palignr $1 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
458 palignr $1, %xmm4, %xmm5
459 lea 64(%rsi), %rsi
460 palignr $1, %xmm3, %xmm4
461 palignr $1, %xmm2, %xmm3
462 lea 64(%rdi), %rdi
463 palignr $1, %xmm1, %xmm2
464 movdqa %xmm6, %xmm1
465 movdqa %xmm2, -0x40(%rdi)
466 movaps %xmm3, -0x30(%rdi)
467 jb L(shl_1_end)
468 movaps %xmm4, -0x20(%rdi)
469 movaps %xmm5, -0x10(%rdi)
470 jmp *%r9
471 ud2
472L(shl_1_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
473 movaps %xmm4, -0x20(%rdi)
474 lea 64(%rdx), %rdx
475 movaps %xmm5, -0x10(%rdi)
476 add %rdx, %rdi
477 movdqu %xmm0, (%r8)
478 add %rdx, %rsi
479 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
480
e7044ea7 481 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 1 byte past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_1_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
482L(shl_1_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
483 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
484 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
485 movaps -0x01(%rsi), %xmm1
486 jb L(L1_bwd)
487 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
488L(L1_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
489 lea -64(%rdx), %rdx
490 jmp *%r9
491 ud2
492L(shl_1_bwd_loop_L2):
493 prefetchnta -0x1c0(%rsi)
494L(shl_1_bwd_loop_L1):
495 movaps -0x11(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
496 sub $0x40, %rdx
497 movaps -0x21(%rsi), %xmm3
498 movaps -0x31(%rsi), %xmm4
499 movaps -0x41(%rsi), %xmm5
500 lea -0x40(%rsi), %rsi
/* Each palignr $1 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
501 palignr $1, %xmm2, %xmm1
502 palignr $1, %xmm3, %xmm2
503 palignr $1, %xmm4, %xmm3
504 palignr $1, %xmm5, %xmm4
505
506 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
507 movaps %xmm5, %xmm1
508
509 movaps %xmm2, -0x20(%rdi)
510 lea -0x40(%rdi), %rdi
511
512 movaps %xmm3, 0x10(%rdi)
513 jb L(shl_1_bwd_end)
514 movaps %xmm4, (%rdi)
515 jmp *%r9
516 ud2
517L(shl_1_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
518 movaps %xmm4, (%rdi)
519 lea 64(%rdx), %rdx
520 movdqu %xmm0, (%r8)
521 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
522
e7044ea7 523 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 2 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_2), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
524L(shl_2):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
525 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
526 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 2 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
527 movaps -0x02(%rsi), %xmm1
528 jb L(L2_fwd)
529 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
530L(L2_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
531 lea -64(%rdx), %rdx
532 jmp *%r9
533 ud2
534L(shl_2_loop_L2):
535 prefetchnta 0x1c0(%rsi)
536L(shl_2_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
537 sub $64, %rdx
538 movaps 0x0e(%rsi), %xmm2
539 movaps 0x1e(%rsi), %xmm3
540 movaps 0x2e(%rsi), %xmm4
541 movaps 0x3e(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
542 movdqa %xmm5, %xmm6
/* Each palignr $2 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
543 palignr $2, %xmm4, %xmm5
544 lea 64(%rsi), %rsi
545 palignr $2, %xmm3, %xmm4
546 palignr $2, %xmm2, %xmm3
547 lea 64(%rdi), %rdi
548 palignr $2, %xmm1, %xmm2
549 movdqa %xmm6, %xmm1
550 movdqa %xmm2, -0x40(%rdi)
551 movaps %xmm3, -0x30(%rdi)
552 jb L(shl_2_end)
553 movaps %xmm4, -0x20(%rdi)
554 movaps %xmm5, -0x10(%rdi)
555 jmp *%r9
556 ud2
557L(shl_2_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
558 movaps %xmm4, -0x20(%rdi)
559 lea 64(%rdx), %rdx
560 movaps %xmm5, -0x10(%rdi)
561 add %rdx, %rdi
562 movdqu %xmm0, (%r8)
563 add %rdx, %rsi
564 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
565
e7044ea7 566 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 2 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_2_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
567L(shl_2_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
568 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
569 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
570 movaps -0x02(%rsi), %xmm1
571 jb L(L2_bwd)
572 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
573L(L2_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
574 lea -64(%rdx), %rdx
575 jmp *%r9
576 ud2
577L(shl_2_bwd_loop_L2):
578 prefetchnta -0x1c0(%rsi)
579L(shl_2_bwd_loop_L1):
580 movaps -0x12(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
581 sub $0x40, %rdx
582 movaps -0x22(%rsi), %xmm3
583 movaps -0x32(%rsi), %xmm4
584 movaps -0x42(%rsi), %xmm5
585 lea -0x40(%rsi), %rsi
/* Each palignr $2 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
586 palignr $2, %xmm2, %xmm1
587 palignr $2, %xmm3, %xmm2
588 palignr $2, %xmm4, %xmm3
589 palignr $2, %xmm5, %xmm4
590
591 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
592 movaps %xmm5, %xmm1
593
594 movaps %xmm2, -0x20(%rdi)
595 lea -0x40(%rdi), %rdi
596
597 movaps %xmm3, 0x10(%rdi)
598 jb L(shl_2_bwd_end)
599 movaps %xmm4, (%rdi)
600 jmp *%r9
601 ud2
602L(shl_2_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
603 movaps %xmm4, (%rdi)
604 lea 64(%rdx), %rdx
605 movdqu %xmm0, (%r8)
606 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
607
e7044ea7 608 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 3 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_3), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
609L(shl_3):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
610 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
611 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 3 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
612 movaps -0x03(%rsi), %xmm1
613 jb L(L3_fwd)
614 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
615L(L3_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
616 lea -64(%rdx), %rdx
617 jmp *%r9
618 ud2
619L(shl_3_loop_L2):
620 prefetchnta 0x1c0(%rsi)
621L(shl_3_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
622 sub $64, %rdx
623 movaps 0x0d(%rsi), %xmm2
624 movaps 0x1d(%rsi), %xmm3
625 movaps 0x2d(%rsi), %xmm4
626 movaps 0x3d(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
627 movdqa %xmm5, %xmm6
/* Each palignr $3 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
628 palignr $3, %xmm4, %xmm5
629 lea 64(%rsi), %rsi
630 palignr $3, %xmm3, %xmm4
631 palignr $3, %xmm2, %xmm3
632 lea 64(%rdi), %rdi
633 palignr $3, %xmm1, %xmm2
634 movdqa %xmm6, %xmm1
635 movdqa %xmm2, -0x40(%rdi)
636 movaps %xmm3, -0x30(%rdi)
637 jb L(shl_3_end)
638 movaps %xmm4, -0x20(%rdi)
639 movaps %xmm5, -0x10(%rdi)
640 jmp *%r9
641 ud2
642L(shl_3_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
643 movaps %xmm4, -0x20(%rdi)
644 lea 64(%rdx), %rdx
645 movaps %xmm5, -0x10(%rdi)
646 add %rdx, %rdi
647 movdqu %xmm0, (%r8)
648 add %rdx, %rsi
649 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
650
e7044ea7 651 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 3 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_3_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
652L(shl_3_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
653 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
654 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
655 movaps -0x03(%rsi), %xmm1
656 jb L(L3_bwd)
657 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
658L(L3_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
659 lea -64(%rdx), %rdx
660 jmp *%r9
661 ud2
662L(shl_3_bwd_loop_L2):
663 prefetchnta -0x1c0(%rsi)
664L(shl_3_bwd_loop_L1):
665 movaps -0x13(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
666 sub $0x40, %rdx
667 movaps -0x23(%rsi), %xmm3
668 movaps -0x33(%rsi), %xmm4
669 movaps -0x43(%rsi), %xmm5
670 lea -0x40(%rsi), %rsi
/* Each palignr $3 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
671 palignr $3, %xmm2, %xmm1
672 palignr $3, %xmm3, %xmm2
673 palignr $3, %xmm4, %xmm3
674 palignr $3, %xmm5, %xmm4
675
676 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
677 movaps %xmm5, %xmm1
678
679 movaps %xmm2, -0x20(%rdi)
680 lea -0x40(%rdi), %rdi
681
682 movaps %xmm3, 0x10(%rdi)
683 jb L(shl_3_bwd_end)
684 movaps %xmm4, (%rdi)
685 jmp *%r9
686 ud2
687L(shl_3_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
688 movaps %xmm4, (%rdi)
689 lea 64(%rdx), %rdx
690 movdqu %xmm0, (%r8)
691 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
692
e7044ea7 693 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 4 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_4), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
694L(shl_4):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
695 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
696 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 4 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
697 movaps -0x04(%rsi), %xmm1
698 jb L(L4_fwd)
699 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
700L(L4_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
701 lea -64(%rdx), %rdx
702 jmp *%r9
703 ud2
704L(shl_4_loop_L2):
705 prefetchnta 0x1c0(%rsi)
706L(shl_4_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
707 sub $64, %rdx
708 movaps 0x0c(%rsi), %xmm2
709 movaps 0x1c(%rsi), %xmm3
710 movaps 0x2c(%rsi), %xmm4
711 movaps 0x3c(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
712 movdqa %xmm5, %xmm6
/* Each palignr $4 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
713 palignr $4, %xmm4, %xmm5
714 lea 64(%rsi), %rsi
715 palignr $4, %xmm3, %xmm4
716 palignr $4, %xmm2, %xmm3
717 lea 64(%rdi), %rdi
718 palignr $4, %xmm1, %xmm2
719 movdqa %xmm6, %xmm1
720 movdqa %xmm2, -0x40(%rdi)
721 movaps %xmm3, -0x30(%rdi)
722 jb L(shl_4_end)
723 movaps %xmm4, -0x20(%rdi)
724 movaps %xmm5, -0x10(%rdi)
725 jmp *%r9
726 ud2
727L(shl_4_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
728 movaps %xmm4, -0x20(%rdi)
729 lea 64(%rdx), %rdx
730 movaps %xmm5, -0x10(%rdi)
731 add %rdx, %rdi
732 movdqu %xmm0, (%r8)
733 add %rdx, %rsi
734 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
735
e7044ea7 736 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 4 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_4_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
737L(shl_4_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
738 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
739 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
740 movaps -0x04(%rsi), %xmm1
741 jb L(L4_bwd)
742 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
743L(L4_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
744 lea -64(%rdx), %rdx
745 jmp *%r9
746 ud2
747L(shl_4_bwd_loop_L2):
748 prefetchnta -0x1c0(%rsi)
749L(shl_4_bwd_loop_L1):
750 movaps -0x14(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
751 sub $0x40, %rdx
752 movaps -0x24(%rsi), %xmm3
753 movaps -0x34(%rsi), %xmm4
754 movaps -0x44(%rsi), %xmm5
755 lea -0x40(%rsi), %rsi
/* Each palignr $4 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
756 palignr $4, %xmm2, %xmm1
757 palignr $4, %xmm3, %xmm2
758 palignr $4, %xmm4, %xmm3
759 palignr $4, %xmm5, %xmm4
760
761 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
762 movaps %xmm5, %xmm1
763
764 movaps %xmm2, -0x20(%rdi)
765 lea -0x40(%rdi), %rdi
766
767 movaps %xmm3, 0x10(%rdi)
768 jb L(shl_4_bwd_end)
769 movaps %xmm4, (%rdi)
770 jmp *%r9
771 ud2
772L(shl_4_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
773 movaps %xmm4, (%rdi)
774 lea 64(%rdx), %rdx
775 movdqu %xmm0, (%r8)
776 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
777
e7044ea7 778 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 5 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_5), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
779L(shl_5):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
780 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
781 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 5 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
782 movaps -0x05(%rsi), %xmm1
783 jb L(L5_fwd)
784 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
785L(L5_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
786 lea -64(%rdx), %rdx
787 jmp *%r9
788 ud2
789L(shl_5_loop_L2):
790 prefetchnta 0x1c0(%rsi)
791L(shl_5_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
792 sub $64, %rdx
793 movaps 0x0b(%rsi), %xmm2
794 movaps 0x1b(%rsi), %xmm3
795 movaps 0x2b(%rsi), %xmm4
796 movaps 0x3b(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
797 movdqa %xmm5, %xmm6
/* Each palignr $5 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
798 palignr $5, %xmm4, %xmm5
799 lea 64(%rsi), %rsi
800 palignr $5, %xmm3, %xmm4
801 palignr $5, %xmm2, %xmm3
802 lea 64(%rdi), %rdi
803 palignr $5, %xmm1, %xmm2
804 movdqa %xmm6, %xmm1
805 movdqa %xmm2, -0x40(%rdi)
806 movaps %xmm3, -0x30(%rdi)
807 jb L(shl_5_end)
808 movaps %xmm4, -0x20(%rdi)
809 movaps %xmm5, -0x10(%rdi)
810 jmp *%r9
811 ud2
812L(shl_5_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
813 movaps %xmm4, -0x20(%rdi)
814 lea 64(%rdx), %rdx
815 movaps %xmm5, -0x10(%rdi)
816 add %rdx, %rdi
817 movdqu %xmm0, (%r8)
818 add %rdx, %rsi
819 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
820
e7044ea7 821 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 5 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_5_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
822L(shl_5_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
823 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
824 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
825 movaps -0x05(%rsi), %xmm1
826 jb L(L5_bwd)
827 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
828L(L5_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
829 lea -64(%rdx), %rdx
830 jmp *%r9
831 ud2
832L(shl_5_bwd_loop_L2):
833 prefetchnta -0x1c0(%rsi)
834L(shl_5_bwd_loop_L1):
835 movaps -0x15(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
836 sub $0x40, %rdx
837 movaps -0x25(%rsi), %xmm3
838 movaps -0x35(%rsi), %xmm4
839 movaps -0x45(%rsi), %xmm5
840 lea -0x40(%rsi), %rsi
/* Each palignr $5 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
841 palignr $5, %xmm2, %xmm1
842 palignr $5, %xmm3, %xmm2
843 palignr $5, %xmm4, %xmm3
844 palignr $5, %xmm5, %xmm4
845
846 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
847 movaps %xmm5, %xmm1
848
849 movaps %xmm2, -0x20(%rdi)
850 lea -0x40(%rdi), %rdi
851
852 movaps %xmm3, 0x10(%rdi)
853 jb L(shl_5_bwd_end)
854 movaps %xmm4, (%rdi)
855 jmp *%r9
856 ud2
857L(shl_5_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
858 movaps %xmm4, (%rdi)
859 lea 64(%rdx), %rdx
860 movdqu %xmm0, (%r8)
861 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
862
e7044ea7 863 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 6 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_6), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
864L(shl_6):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
865 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
866 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 6 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
867 movaps -0x06(%rsi), %xmm1
868 jb L(L6_fwd)
869 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
870L(L6_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
871 lea -64(%rdx), %rdx
872 jmp *%r9
873 ud2
874L(shl_6_loop_L2):
875 prefetchnta 0x1c0(%rsi)
876L(shl_6_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
877 sub $64, %rdx
878 movaps 0x0a(%rsi), %xmm2
879 movaps 0x1a(%rsi), %xmm3
880 movaps 0x2a(%rsi), %xmm4
881 movaps 0x3a(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
882 movdqa %xmm5, %xmm6
/* Each palignr $6 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
883 palignr $6, %xmm4, %xmm5
884 lea 64(%rsi), %rsi
885 palignr $6, %xmm3, %xmm4
886 palignr $6, %xmm2, %xmm3
887 lea 64(%rdi), %rdi
888 palignr $6, %xmm1, %xmm2
889 movdqa %xmm6, %xmm1
890 movdqa %xmm2, -0x40(%rdi)
891 movaps %xmm3, -0x30(%rdi)
892 jb L(shl_6_end)
893 movaps %xmm4, -0x20(%rdi)
894 movaps %xmm5, -0x10(%rdi)
895 jmp *%r9
896 ud2
897L(shl_6_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
898 movaps %xmm4, -0x20(%rdi)
899 lea 64(%rdx), %rdx
900 movaps %xmm5, -0x10(%rdi)
901 add %rdx, %rdi
902 movdqu %xmm0, (%r8)
903 add %rdx, %rsi
904 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
905
e7044ea7 906 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 6 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_6_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
907L(shl_6_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
908 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
909 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
910 movaps -0x06(%rsi), %xmm1
911 jb L(L6_bwd)
912 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
913L(L6_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
914 lea -64(%rdx), %rdx
915 jmp *%r9
916 ud2
917L(shl_6_bwd_loop_L2):
918 prefetchnta -0x1c0(%rsi)
919L(shl_6_bwd_loop_L1):
920 movaps -0x16(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
921 sub $0x40, %rdx
922 movaps -0x26(%rsi), %xmm3
923 movaps -0x36(%rsi), %xmm4
924 movaps -0x46(%rsi), %xmm5
925 lea -0x40(%rsi), %rsi
/* Each palignr $6 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
926 palignr $6, %xmm2, %xmm1
927 palignr $6, %xmm3, %xmm2
928 palignr $6, %xmm4, %xmm3
929 palignr $6, %xmm5, %xmm4
930
931 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
932 movaps %xmm5, %xmm1
933
934 movaps %xmm2, -0x20(%rdi)
935 lea -0x40(%rdi), %rdi
936
937 movaps %xmm3, 0x10(%rdi)
938 jb L(shl_6_bwd_end)
939 movaps %xmm4, (%rdi)
940 jmp *%r9
941 ud2
942L(shl_6_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
943 movaps %xmm4, (%rdi)
944 lea 64(%rdx), %rdx
945 movdqu %xmm0, (%r8)
946 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
947
e7044ea7 948 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 7 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_7), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
949L(shl_7):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
950 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
951 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 7 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
952 movaps -0x07(%rsi), %xmm1
953 jb L(L7_fwd)
954 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
955L(L7_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
956 lea -64(%rdx), %rdx
957 jmp *%r9
958 ud2
959L(shl_7_loop_L2):
960 prefetchnta 0x1c0(%rsi)
961L(shl_7_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
962 sub $64, %rdx
963 movaps 0x09(%rsi), %xmm2
964 movaps 0x19(%rsi), %xmm3
965 movaps 0x29(%rsi), %xmm4
966 movaps 0x39(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
967 movdqa %xmm5, %xmm6
/* Each palignr $7 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
968 palignr $7, %xmm4, %xmm5
969 lea 64(%rsi), %rsi
970 palignr $7, %xmm3, %xmm4
971 palignr $7, %xmm2, %xmm3
972 lea 64(%rdi), %rdi
973 palignr $7, %xmm1, %xmm2
974 movdqa %xmm6, %xmm1
975 movdqa %xmm2, -0x40(%rdi)
976 movaps %xmm3, -0x30(%rdi)
977 jb L(shl_7_end)
978 movaps %xmm4, -0x20(%rdi)
979 movaps %xmm5, -0x10(%rdi)
980 jmp *%r9
981 ud2
982L(shl_7_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
983 movaps %xmm4, -0x20(%rdi)
984 lea 64(%rdx), %rdx
985 movaps %xmm5, -0x10(%rdi)
986 add %rdx, %rdi
987 movdqu %xmm0, (%r8)
988 add %rdx, %rsi
989 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
990
e7044ea7 991 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 7 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_7_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
992L(shl_7_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
993 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
994 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
995 movaps -0x07(%rsi), %xmm1
996 jb L(L7_bwd)
997 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
998L(L7_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
999 lea -64(%rdx), %rdx
1000 jmp *%r9
1001 ud2
1002L(shl_7_bwd_loop_L2):
1003 prefetchnta -0x1c0(%rsi)
1004L(shl_7_bwd_loop_L1):
1005 movaps -0x17(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
1006 sub $0x40, %rdx
1007 movaps -0x27(%rsi), %xmm3
1008 movaps -0x37(%rsi), %xmm4
1009 movaps -0x47(%rsi), %xmm5
1010 lea -0x40(%rsi), %rsi
/* Each palignr $7 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1011 palignr $7, %xmm2, %xmm1
1012 palignr $7, %xmm3, %xmm2
1013 palignr $7, %xmm4, %xmm3
1014 palignr $7, %xmm5, %xmm4
1015
1016 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
1017 movaps %xmm5, %xmm1
1018
1019 movaps %xmm2, -0x20(%rdi)
1020 lea -0x40(%rdi), %rdi
1021
1022 movaps %xmm3, 0x10(%rdi)
1023 jb L(shl_7_bwd_end)
1024 movaps %xmm4, (%rdi)
1025 jmp *%r9
1026 ud2
1027L(shl_7_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
1028 movaps %xmm4, (%rdi)
1029 lea 64(%rdx), %rdx
1030 movdqu %xmm0, (%r8)
1031 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1032
e7044ea7 1033 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 8 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_8), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
1034L(shl_8):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1035 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
1036 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 8 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
1037 movaps -0x08(%rsi), %xmm1
1038 jb L(L8_fwd)
1039 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
1040L(L8_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1041 lea -64(%rdx), %rdx
1042 jmp *%r9
/* Added for consistency: every other indirect jump in this file is
   followed by ud2; this one was the only exception.  It is never reached
   by normal control flow.  */
 ud2
1043L(shl_8_loop_L2):
1044 prefetchnta 0x1c0(%rsi)
1045L(shl_8_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
1046 sub $64, %rdx
1047 movaps 0x08(%rsi), %xmm2
1048 movaps 0x18(%rsi), %xmm3
1049 movaps 0x28(%rsi), %xmm4
1050 movaps 0x38(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
1051 movdqa %xmm5, %xmm6
/* Each palignr $8 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1052 palignr $8, %xmm4, %xmm5
1053 lea 64(%rsi), %rsi
1054 palignr $8, %xmm3, %xmm4
1055 palignr $8, %xmm2, %xmm3
1056 lea 64(%rdi), %rdi
1057 palignr $8, %xmm1, %xmm2
1058 movdqa %xmm6, %xmm1
1059 movdqa %xmm2, -0x40(%rdi)
1060 movaps %xmm3, -0x30(%rdi)
1061 jb L(shl_8_end)
1062 movaps %xmm4, -0x20(%rdi)
1063 movaps %xmm5, -0x10(%rdi)
1064 jmp *%r9
1065 ud2
e7044ea7 1066 .p2align 4
6fb8cbcb
L
1067L(shl_8_end):
/* Last group: undo the bias (%rdx = tail length), finish the stores,
   move both pointers to the end, write the saved head to (%r8) and
   dispatch the tail by length.  */
1068 lea 64(%rdx), %rdx
1069 movaps %xmm4, -0x20(%rdi)
1070 add %rdx, %rsi
1071 movaps %xmm5, -0x10(%rdi)
1072 add %rdx, %rdi
1073 movdqu %xmm0, (%r8)
1074 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1075
e7044ea7 1076 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 8 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_8_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
1077L(shl_8_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1078 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
1079 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
1080 movaps -0x08(%rsi), %xmm1
1081 jb L(L8_bwd)
1082 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
1083L(L8_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1084 lea -64(%rdx), %rdx
1085 jmp *%r9
1086 ud2
1087L(shl_8_bwd_loop_L2):
1088 prefetchnta -0x1c0(%rsi)
1089L(shl_8_bwd_loop_L1):
1090 movaps -0x18(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
1091 sub $0x40, %rdx
1092 movaps -0x28(%rsi), %xmm3
1093 movaps -0x38(%rsi), %xmm4
1094 movaps -0x48(%rsi), %xmm5
1095 lea -0x40(%rsi), %rsi
/* Each palignr $8 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1096 palignr $8, %xmm2, %xmm1
1097 palignr $8, %xmm3, %xmm2
1098 palignr $8, %xmm4, %xmm3
1099 palignr $8, %xmm5, %xmm4
1100
1101 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
1102 movaps %xmm5, %xmm1
1103
1104 movaps %xmm2, -0x20(%rdi)
1105 lea -0x40(%rdi), %rdi
1106
1107 movaps %xmm3, 0x10(%rdi)
1108 jb L(shl_8_bwd_end)
1109 movaps %xmm4, (%rdi)
1110 jmp *%r9
1111 ud2
1112L(shl_8_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
1113 movaps %xmm4, (%rdi)
1114 lea 64(%rdx), %rdx
1115 movdqu %xmm0, (%r8)
1116 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1117
e7044ea7 1118 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 9 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_9), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
1119L(shl_9):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1120 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
1121 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 9 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
1122 movaps -0x09(%rsi), %xmm1
1123 jb L(L9_fwd)
1124 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
1125L(L9_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1126 lea -64(%rdx), %rdx
1127 jmp *%r9
1128 ud2
1129L(shl_9_loop_L2):
1130 prefetchnta 0x1c0(%rsi)
1131L(shl_9_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
1132 sub $64, %rdx
1133 movaps 0x07(%rsi), %xmm2
1134 movaps 0x17(%rsi), %xmm3
1135 movaps 0x27(%rsi), %xmm4
1136 movaps 0x37(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
1137 movdqa %xmm5, %xmm6
/* Each palignr $9 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1138 palignr $9, %xmm4, %xmm5
1139 lea 64(%rsi), %rsi
1140 palignr $9, %xmm3, %xmm4
1141 palignr $9, %xmm2, %xmm3
1142 lea 64(%rdi), %rdi
1143 palignr $9, %xmm1, %xmm2
1144 movdqa %xmm6, %xmm1
1145 movdqa %xmm2, -0x40(%rdi)
1146 movaps %xmm3, -0x30(%rdi)
1147 jb L(shl_9_end)
1148 movaps %xmm4, -0x20(%rdi)
1149 movaps %xmm5, -0x10(%rdi)
1150 jmp *%r9
1151 ud2
1152L(shl_9_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
1153 movaps %xmm4, -0x20(%rdi)
1154 lea 64(%rdx), %rdx
1155 movaps %xmm5, -0x10(%rdi)
1156 add %rdx, %rdi
1157 movdqu %xmm0, (%r8)
1158 add %rdx, %rsi
1159 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1160
e7044ea7 1161 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 9 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_9_bwd), as
   left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
1162L(shl_9_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1163 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
1164 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
1165 movaps -0x09(%rsi), %xmm1
1166 jb L(L9_bwd)
1167 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
1168L(L9_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1169 lea -64(%rdx), %rdx
1170 jmp *%r9
1171 ud2
1172L(shl_9_bwd_loop_L2):
1173 prefetchnta -0x1c0(%rsi)
1174L(shl_9_bwd_loop_L1):
1175 movaps -0x19(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
1176 sub $0x40, %rdx
1177 movaps -0x29(%rsi), %xmm3
1178 movaps -0x39(%rsi), %xmm4
1179 movaps -0x49(%rsi), %xmm5
1180 lea -0x40(%rsi), %rsi
/* Each palignr $9 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1181 palignr $9, %xmm2, %xmm1
1182 palignr $9, %xmm3, %xmm2
1183 palignr $9, %xmm4, %xmm3
1184 palignr $9, %xmm5, %xmm4
1185
1186 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
1187 movaps %xmm5, %xmm1
1188
1189 movaps %xmm2, -0x20(%rdi)
1190 lea -0x40(%rdi), %rdi
1191
1192 movaps %xmm3, 0x10(%rdi)
1193 jb L(shl_9_bwd_end)
1194 movaps %xmm4, (%rdi)
1195 jmp *%r9
1196 ud2
1197L(shl_9_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
1198 movaps %xmm4, (%rdi)
1199 lea 64(%rdx), %rdx
1200 movdqu %xmm0, (%r8)
1201 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1202
e7044ea7 1203 .p2align 4
6fb8cbcb
L
/* Forward copy for a source that is 10 bytes past 16-byte alignment
   (destination aligned).  Expects %r9 = address of L(shl_10), as left by
   BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the prefetching
   loop; %rdx = bytes left; %xmm0/%r8 = saved head bytes and where to
   store them.  */
1204L(shl_10):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1205 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
1206 cmp %rcx, %rdx
/* Aligned load of the 16-byte line holding the 10 bytes before %rsi;
   movaps leaves the cmp flags intact for the jb.  */
1207 movaps -0x0a(%rsi), %xmm1
1208 jb L(L10_fwd)
1209 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
1210L(L10_fwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1211 lea -64(%rdx), %rdx
1212 jmp *%r9
1213 ud2
1214L(shl_10_loop_L2):
1215 prefetchnta 0x1c0(%rsi)
1216L(shl_10_loop_L1):
/* Flags from this sub feed the jb below; the loads, palignr and lea in
   between do not modify them.  */
1217 sub $64, %rdx
1218 movaps 0x06(%rsi), %xmm2
1219 movaps 0x16(%rsi), %xmm3
1220 movaps 0x26(%rsi), %xmm4
1221 movaps 0x36(%rsi), %xmm5
/* Keep the newest aligned line; it becomes %xmm1 for the next round.  */
1222 movdqa %xmm5, %xmm6
/* Each palignr $10 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1223 palignr $10, %xmm4, %xmm5
1224 lea 64(%rsi), %rsi
1225 palignr $10, %xmm3, %xmm4
1226 palignr $10, %xmm2, %xmm3
1227 lea 64(%rdi), %rdi
1228 palignr $10, %xmm1, %xmm2
1229 movdqa %xmm6, %xmm1
1230 movdqa %xmm2, -0x40(%rdi)
1231 movaps %xmm3, -0x30(%rdi)
1232 jb L(shl_10_end)
1233 movaps %xmm4, -0x20(%rdi)
1234 movaps %xmm5, -0x10(%rdi)
1235 jmp *%r9
1236 ud2
1237L(shl_10_end):
/* Last group: finish its stores, undo the bias (%rdx = tail length),
   write the saved head to (%r8), move both pointers to the end and
   dispatch the tail by length.  */
1238 movaps %xmm4, -0x20(%rdi)
1239 lea 64(%rdx), %rdx
1240 movaps %xmm5, -0x10(%rdi)
1241 add %rdx, %rdi
1242 movdqu %xmm0, (%r8)
1243 add %rdx, %rsi
1244 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1245
e7044ea7 1246 .p2align 4
6fb8cbcb
L
/* Backward copy for a source that is 10 bytes past 16-byte alignment
   (destination end aligned).  %rsi/%rdi point just past the bytes still
   to be copied and move down.  Expects %r9 = address of L(shl_10_bwd),
   as left by BRANCH_TO_JMPTBL_ENTRY; %rcx = threshold selecting the
   prefetching loop; %rdx = bytes left; %xmm0/%r8 = saved tail bytes and
   where to store them.  */
1247L(shl_10_bwd):
/* Turn %r9 into the loop entry address; it is reused every iteration.  */
1248 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
1249 cmp %rcx, %rdx
/* movaps leaves the cmp flags intact for the jb.  */
1250 movaps -0x0a(%rsi), %xmm1
1251 jb L(L10_bwd)
1252 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
1253L(L10_bwd):
/* Bias the count by -64 so the sub/jb pair in the loop detects the last
   64-byte group.  */
1254 lea -64(%rdx), %rdx
1255 jmp *%r9
1256 ud2
1257L(shl_10_bwd_loop_L2):
1258 prefetchnta -0x1c0(%rsi)
1259L(shl_10_bwd_loop_L1):
1260 movaps -0x1a(%rsi), %xmm2
/* Flags from this sub feed the jb below; the loads, palignr, stores and
   lea in between do not modify them.  */
1261 sub $0x40, %rdx
1262 movaps -0x2a(%rsi), %xmm3
1263 movaps -0x3a(%rsi), %xmm4
1264 movaps -0x4a(%rsi), %xmm5
1265 lea -0x40(%rsi), %rsi
/* Each palignr $10 merges two neighbouring aligned lines into 16 bytes
   that line up with the destination.  */
1266 palignr $10, %xmm2, %xmm1
1267 palignr $10, %xmm3, %xmm2
1268 palignr $10, %xmm4, %xmm3
1269 palignr $10, %xmm5, %xmm4
1270
1271 movaps %xmm1, -0x10(%rdi)
/* The lowest line loaded becomes %xmm1 for the next round.  */
1272 movaps %xmm5, %xmm1
1273
1274 movaps %xmm2, -0x20(%rdi)
1275 lea -0x40(%rdi), %rdi
1276
1277 movaps %xmm3, 0x10(%rdi)
1278 jb L(shl_10_bwd_end)
1279 movaps %xmm4, (%rdi)
1280 jmp *%r9
1281 ud2
1282L(shl_10_bwd_end):
/* Last group: finish its store, undo the bias (%rdx = bytes left), write
   the saved tail to (%r8) and dispatch by length; the pointers already
   sit at the end of what remains.  */
1283 movaps %xmm4, (%rdi)
1284 lea 64(%rdx), %rdx
1285 movdqu %xmm0, (%r8)
1286 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1287
e7044ea7 1288 .p2align 4
6fb8cbcb
L
/* Forward 64-bytes-per-pass copy for a source that sits 11 bytes past a
   16-byte boundary: aligned 16-byte loads are taken relative to %rsi-11
   and re-joined with palignr $11 so the stores to %rdi can be aligned.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_11), %rdx = byte count, %rcx = a size threshold,
   and %xmm0/%r8 = a saved 16-byte chunk and its destination -- confirm.  */
1289L(shl_11):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1290 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
1291 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1292 movaps -0x0b(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1293 jb L(L11_fwd)
1294 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
1295L(L11_fwd):
1296 lea -64(%rdx), %rdx
1297 jmp *%r9
1298 ud2
1299L(shl_11_loop_L2):
1300 prefetchnta 0x1c0(%rsi)
1301L(shl_11_loop_L1):
/* The borrow from this sub is what the jb below tests; the movaps, movdqa,
   palignr and lea instructions in between do not modify flags.  */
1302 sub $64, %rdx
1303 movaps 0x05(%rsi), %xmm2
1304 movaps 0x15(%rsi), %xmm3
1305 movaps 0x25(%rsi), %xmm4
1306 movaps 0x35(%rsi), %xmm5
/* Keep an unshifted copy of the last chunk; it becomes the next carry-in.  */
1307 movdqa %xmm5, %xmm6
1308 palignr $11, %xmm4, %xmm5
1309 lea 64(%rsi), %rsi
1310 palignr $11, %xmm3, %xmm4
1311 palignr $11, %xmm2, %xmm3
1312 lea 64(%rdi), %rdi
1313 palignr $11, %xmm1, %xmm2
1314 movdqa %xmm6, %xmm1
1315 movdqa %xmm2, -0x40(%rdi)
1316 movaps %xmm3, -0x30(%rdi)
1317 jb L(shl_11_end)
1318 movaps %xmm4, -0x20(%rdi)
1319 movaps %xmm5, -0x10(%rdi)
1320 jmp *%r9
1321 ud2
1322L(shl_11_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, advance
   both pointers by the remaining count, store the saved %xmm0 at (%r8), and
   dispatch through the jump table indexed by that count.  */
1323 movaps %xmm4, -0x20(%rdi)
1324 lea 64(%rdx), %rdx
1325 movaps %xmm5, -0x10(%rdi)
1326 add %rdx, %rdi
1327 movdqu %xmm0, (%r8)
1328 add %rdx, %rsi
1329 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1330
e7044ea7 1331 .p2align 4
6fb8cbcb
L
/* Backward (descending address) 64-bytes-per-pass copy for a source that
   sits 11 bytes past a 16-byte boundary: aligned loads are taken below
   %rsi-11 and re-joined with palignr $11 before aligned stores below %rdi.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_11_bwd), %rdx = byte count, %rcx = a size
   threshold, and %xmm0/%r8 = a saved 16-byte chunk and its destination
   -- confirm.  */
1332L(shl_11_bwd):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1333 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
1334 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1335 movaps -0x0b(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1336 jb L(L11_bwd)
1337 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
1338L(L11_bwd):
1339 lea -64(%rdx), %rdx
1340 jmp *%r9
1341 ud2
1342L(shl_11_bwd_loop_L2):
1343 prefetchnta -0x1c0(%rsi)
1344L(shl_11_bwd_loop_L1):
1345 movaps -0x1b(%rsi), %xmm2
/* The borrow from this sub is what the jb below tests; the movaps, palignr
   and lea instructions in between do not modify flags.  */
1346 sub $0x40, %rdx
1347 movaps -0x2b(%rsi), %xmm3
1348 movaps -0x3b(%rsi), %xmm4
1349 movaps -0x4b(%rsi), %xmm5
1350 lea -0x40(%rsi), %rsi
1351 palignr $11, %xmm2, %xmm1
1352 palignr $11, %xmm3, %xmm2
1353 palignr $11, %xmm4, %xmm3
1354 palignr $11, %xmm5, %xmm4
1355
1356 movaps %xmm1, -0x10(%rdi)
/* The lowest chunk loaded this pass is the carry-in for the next pass.  */
1357 movaps %xmm5, %xmm1
1358
1359 movaps %xmm2, -0x20(%rdi)
1360 lea -0x40(%rdi), %rdi
1361
1362 movaps %xmm3, 0x10(%rdi)
1363 jb L(shl_11_bwd_end)
1364 movaps %xmm4, (%rdi)
1365 jmp *%r9
1366 ud2
1367L(shl_11_bwd_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, store
   the saved %xmm0 at (%r8), and dispatch through the jump table indexed by
   the remaining count.  */
1368 movaps %xmm4, (%rdi)
1369 lea 64(%rdx), %rdx
1370 movdqu %xmm0, (%r8)
1371 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1372
e7044ea7 1373 .p2align 4
6fb8cbcb
L
/* Forward 64-bytes-per-pass copy for a source that sits 12 bytes past a
   16-byte boundary: aligned 16-byte loads are taken relative to %rsi-12
   and re-joined with palignr $12 so the stores to %rdi can be aligned.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_12), %rdx = byte count, %rcx = a size threshold,
   and %xmm0/%r8 = a saved 16-byte chunk and its destination -- confirm.  */
1374L(shl_12):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1375 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
1376 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1377 movaps -0x0c(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1378 jb L(L12_fwd)
1379 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
1380L(L12_fwd):
1381 lea -64(%rdx), %rdx
1382 jmp *%r9
1383 ud2
1384L(shl_12_loop_L2):
1385 prefetchnta 0x1c0(%rsi)
1386L(shl_12_loop_L1):
/* The borrow from this sub is what the jb below tests; the movaps, movdqa,
   palignr and lea instructions in between do not modify flags.  */
1387 sub $64, %rdx
1388 movaps 0x04(%rsi), %xmm2
1389 movaps 0x14(%rsi), %xmm3
1390 movaps 0x24(%rsi), %xmm4
1391 movaps 0x34(%rsi), %xmm5
/* Keep an unshifted copy of the last chunk; it becomes the next carry-in.  */
1392 movdqa %xmm5, %xmm6
1393 palignr $12, %xmm4, %xmm5
1394 lea 64(%rsi), %rsi
1395 palignr $12, %xmm3, %xmm4
1396 palignr $12, %xmm2, %xmm3
1397 lea 64(%rdi), %rdi
1398 palignr $12, %xmm1, %xmm2
1399 movdqa %xmm6, %xmm1
1400 movdqa %xmm2, -0x40(%rdi)
1401 movaps %xmm3, -0x30(%rdi)
1402 jb L(shl_12_end)
1403 movaps %xmm4, -0x20(%rdi)
1404 movaps %xmm5, -0x10(%rdi)
1405 jmp *%r9
1406 ud2
1407L(shl_12_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, advance
   both pointers by the remaining count, store the saved %xmm0 at (%r8), and
   dispatch through the jump table indexed by that count.  */
1408 movaps %xmm4, -0x20(%rdi)
1409 lea 64(%rdx), %rdx
1410 movaps %xmm5, -0x10(%rdi)
1411 add %rdx, %rdi
1412 movdqu %xmm0, (%r8)
1413 add %rdx, %rsi
1414 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1415
e7044ea7 1416 .p2align 4
6fb8cbcb
L
/* Backward (descending address) 64-bytes-per-pass copy for a source that
   sits 12 bytes past a 16-byte boundary: aligned loads are taken below
   %rsi-12 and re-joined with palignr $12 before aligned stores below %rdi.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_12_bwd), %rdx = byte count, %rcx = a size
   threshold, and %xmm0/%r8 = a saved 16-byte chunk and its destination
   -- confirm.  */
1417L(shl_12_bwd):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1418 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
1419 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1420 movaps -0x0c(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1421 jb L(L12_bwd)
1422 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
1423L(L12_bwd):
1424 lea -64(%rdx), %rdx
1425 jmp *%r9
1426 ud2
1427L(shl_12_bwd_loop_L2):
1428 prefetchnta -0x1c0(%rsi)
1429L(shl_12_bwd_loop_L1):
1430 movaps -0x1c(%rsi), %xmm2
/* The borrow from this sub is what the jb below tests; the movaps, palignr
   and lea instructions in between do not modify flags.  */
1431 sub $0x40, %rdx
1432 movaps -0x2c(%rsi), %xmm3
1433 movaps -0x3c(%rsi), %xmm4
1434 movaps -0x4c(%rsi), %xmm5
1435 lea -0x40(%rsi), %rsi
1436 palignr $12, %xmm2, %xmm1
1437 palignr $12, %xmm3, %xmm2
1438 palignr $12, %xmm4, %xmm3
1439 palignr $12, %xmm5, %xmm4
1440
1441 movaps %xmm1, -0x10(%rdi)
/* The lowest chunk loaded this pass is the carry-in for the next pass.  */
1442 movaps %xmm5, %xmm1
1443
1444 movaps %xmm2, -0x20(%rdi)
1445 lea -0x40(%rdi), %rdi
1446
1447 movaps %xmm3, 0x10(%rdi)
1448 jb L(shl_12_bwd_end)
1449 movaps %xmm4, (%rdi)
1450 jmp *%r9
1451 ud2
1452L(shl_12_bwd_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, store
   the saved %xmm0 at (%r8), and dispatch through the jump table indexed by
   the remaining count.  */
1453 movaps %xmm4, (%rdi)
1454 lea 64(%rdx), %rdx
1455 movdqu %xmm0, (%r8)
1456 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1457
e7044ea7 1458 .p2align 4
6fb8cbcb
L
/* Forward 64-bytes-per-pass copy for a source that sits 13 bytes past a
   16-byte boundary: aligned 16-byte loads are taken relative to %rsi-13
   and re-joined with palignr $13 so the stores to %rdi can be aligned.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_13), %rdx = byte count, %rcx = a size threshold,
   and %xmm0/%r8 = a saved 16-byte chunk and its destination -- confirm.  */
1459L(shl_13):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1460 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
1461 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1462 movaps -0x0d(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1463 jb L(L13_fwd)
1464 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
1465L(L13_fwd):
1466 lea -64(%rdx), %rdx
1467 jmp *%r9
1468 ud2
1469L(shl_13_loop_L2):
1470 prefetchnta 0x1c0(%rsi)
1471L(shl_13_loop_L1):
/* The borrow from this sub is what the jb below tests; the movaps, movdqa,
   palignr and lea instructions in between do not modify flags.  */
1472 sub $64, %rdx
1473 movaps 0x03(%rsi), %xmm2
1474 movaps 0x13(%rsi), %xmm3
1475 movaps 0x23(%rsi), %xmm4
1476 movaps 0x33(%rsi), %xmm5
/* Keep an unshifted copy of the last chunk; it becomes the next carry-in.  */
1477 movdqa %xmm5, %xmm6
1478 palignr $13, %xmm4, %xmm5
1479 lea 64(%rsi), %rsi
1480 palignr $13, %xmm3, %xmm4
1481 palignr $13, %xmm2, %xmm3
1482 lea 64(%rdi), %rdi
1483 palignr $13, %xmm1, %xmm2
1484 movdqa %xmm6, %xmm1
1485 movdqa %xmm2, -0x40(%rdi)
1486 movaps %xmm3, -0x30(%rdi)
1487 jb L(shl_13_end)
1488 movaps %xmm4, -0x20(%rdi)
1489 movaps %xmm5, -0x10(%rdi)
1490 jmp *%r9
1491 ud2
1492L(shl_13_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, advance
   both pointers by the remaining count, store the saved %xmm0 at (%r8), and
   dispatch through the jump table indexed by that count.  */
1493 movaps %xmm4, -0x20(%rdi)
1494 lea 64(%rdx), %rdx
1495 movaps %xmm5, -0x10(%rdi)
1496 add %rdx, %rdi
1497 movdqu %xmm0, (%r8)
1498 add %rdx, %rsi
1499 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1500
e7044ea7 1501 .p2align 4
6fb8cbcb
L
/* Backward (descending address) 64-bytes-per-pass copy for a source that
   sits 13 bytes past a 16-byte boundary: aligned loads are taken below
   %rsi-13 and re-joined with palignr $13 before aligned stores below %rdi.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_13_bwd), %rdx = byte count, %rcx = a size
   threshold, and %xmm0/%r8 = a saved 16-byte chunk and its destination
   -- confirm.  */
1502L(shl_13_bwd):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1503 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
1504 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1505 movaps -0x0d(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1506 jb L(L13_bwd)
1507 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
1508L(L13_bwd):
1509 lea -64(%rdx), %rdx
1510 jmp *%r9
1511 ud2
1512L(shl_13_bwd_loop_L2):
1513 prefetchnta -0x1c0(%rsi)
1514L(shl_13_bwd_loop_L1):
1515 movaps -0x1d(%rsi), %xmm2
/* The borrow from this sub is what the jb below tests; the movaps, palignr
   and lea instructions in between do not modify flags.  */
1516 sub $0x40, %rdx
1517 movaps -0x2d(%rsi), %xmm3
1518 movaps -0x3d(%rsi), %xmm4
1519 movaps -0x4d(%rsi), %xmm5
1520 lea -0x40(%rsi), %rsi
1521 palignr $13, %xmm2, %xmm1
1522 palignr $13, %xmm3, %xmm2
1523 palignr $13, %xmm4, %xmm3
1524 palignr $13, %xmm5, %xmm4
1525
1526 movaps %xmm1, -0x10(%rdi)
/* The lowest chunk loaded this pass is the carry-in for the next pass.  */
1527 movaps %xmm5, %xmm1
1528
1529 movaps %xmm2, -0x20(%rdi)
1530 lea -0x40(%rdi), %rdi
1531
1532 movaps %xmm3, 0x10(%rdi)
1533 jb L(shl_13_bwd_end)
1534 movaps %xmm4, (%rdi)
1535 jmp *%r9
1536 ud2
1537L(shl_13_bwd_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, store
   the saved %xmm0 at (%r8), and dispatch through the jump table indexed by
   the remaining count.  */
1538 movaps %xmm4, (%rdi)
1539 lea 64(%rdx), %rdx
1540 movdqu %xmm0, (%r8)
1541 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1542
e7044ea7 1543 .p2align 4
6fb8cbcb
L
/* Forward 64-bytes-per-pass copy for a source that sits 14 bytes past a
   16-byte boundary: aligned 16-byte loads are taken relative to %rsi-14
   and re-joined with palignr $14 so the stores to %rdi can be aligned.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_14), %rdx = byte count, %rcx = a size threshold,
   and %xmm0/%r8 = a saved 16-byte chunk and its destination -- confirm.  */
1544L(shl_14):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1545 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
1546 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1547 movaps -0x0e(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1548 jb L(L14_fwd)
1549 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
1550L(L14_fwd):
1551 lea -64(%rdx), %rdx
1552 jmp *%r9
1553 ud2
1554L(shl_14_loop_L2):
1555 prefetchnta 0x1c0(%rsi)
1556L(shl_14_loop_L1):
/* The borrow from this sub is what the jb below tests; the movaps, movdqa,
   palignr and lea instructions in between do not modify flags.  */
1557 sub $64, %rdx
1558 movaps 0x02(%rsi), %xmm2
1559 movaps 0x12(%rsi), %xmm3
1560 movaps 0x22(%rsi), %xmm4
1561 movaps 0x32(%rsi), %xmm5
/* Keep an unshifted copy of the last chunk; it becomes the next carry-in.  */
1562 movdqa %xmm5, %xmm6
1563 palignr $14, %xmm4, %xmm5
1564 lea 64(%rsi), %rsi
1565 palignr $14, %xmm3, %xmm4
1566 palignr $14, %xmm2, %xmm3
1567 lea 64(%rdi), %rdi
1568 palignr $14, %xmm1, %xmm2
1569 movdqa %xmm6, %xmm1
1570 movdqa %xmm2, -0x40(%rdi)
1571 movaps %xmm3, -0x30(%rdi)
1572 jb L(shl_14_end)
1573 movaps %xmm4, -0x20(%rdi)
1574 movaps %xmm5, -0x10(%rdi)
1575 jmp *%r9
1576 ud2
1577L(shl_14_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, advance
   both pointers by the remaining count, store the saved %xmm0 at (%r8), and
   dispatch through the jump table indexed by that count.  */
1578 movaps %xmm4, -0x20(%rdi)
1579 lea 64(%rdx), %rdx
1580 movaps %xmm5, -0x10(%rdi)
1581 add %rdx, %rdi
1582 movdqu %xmm0, (%r8)
1583 add %rdx, %rsi
1584 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1585
e7044ea7 1586 .p2align 4
6fb8cbcb
L
/* Backward (descending address) 64-bytes-per-pass copy for a source that
   sits 14 bytes past a 16-byte boundary: aligned loads are taken below
   %rsi-14 and re-joined with palignr $14 before aligned stores below %rdi.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_14_bwd), %rdx = byte count, %rcx = a size
   threshold, and %xmm0/%r8 = a saved 16-byte chunk and its destination
   -- confirm.  */
1587L(shl_14_bwd):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1588 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
1589 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1590 movaps -0x0e(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1591 jb L(L14_bwd)
1592 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
1593L(L14_bwd):
1594 lea -64(%rdx), %rdx
1595 jmp *%r9
1596 ud2
1597L(shl_14_bwd_loop_L2):
1598 prefetchnta -0x1c0(%rsi)
1599L(shl_14_bwd_loop_L1):
1600 movaps -0x1e(%rsi), %xmm2
/* The borrow from this sub is what the jb below tests; the movaps, palignr
   and lea instructions in between do not modify flags.  */
1601 sub $0x40, %rdx
1602 movaps -0x2e(%rsi), %xmm3
1603 movaps -0x3e(%rsi), %xmm4
1604 movaps -0x4e(%rsi), %xmm5
1605 lea -0x40(%rsi), %rsi
1606 palignr $14, %xmm2, %xmm1
1607 palignr $14, %xmm3, %xmm2
1608 palignr $14, %xmm4, %xmm3
1609 palignr $14, %xmm5, %xmm4
1610
1611 movaps %xmm1, -0x10(%rdi)
/* The lowest chunk loaded this pass is the carry-in for the next pass.  */
1612 movaps %xmm5, %xmm1
1613
1614 movaps %xmm2, -0x20(%rdi)
1615 lea -0x40(%rdi), %rdi
1616
1617 movaps %xmm3, 0x10(%rdi)
1618 jb L(shl_14_bwd_end)
1619 movaps %xmm4, (%rdi)
1620 jmp *%r9
1621 ud2
1622L(shl_14_bwd_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, store
   the saved %xmm0 at (%r8), and dispatch through the jump table indexed by
   the remaining count.  */
1623 movaps %xmm4, (%rdi)
1624 lea 64(%rdx), %rdx
1625 movdqu %xmm0, (%r8)
1626 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1627
e7044ea7 1628 .p2align 4
6fb8cbcb
L
/* Forward 64-bytes-per-pass copy for a source that sits 15 bytes past a
   16-byte boundary: aligned 16-byte loads are taken relative to %rsi-15
   and re-joined with palignr $15 so the stores to %rdi can be aligned.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_15), %rdx = byte count, %rcx = a size threshold,
   and %xmm0/%r8 = a saved 16-byte chunk and its destination -- confirm.  */
1629L(shl_15):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1630 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
1631 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1632 movaps -0x0f(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1633 jb L(L15_fwd)
1634 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
1635L(L15_fwd):
1636 lea -64(%rdx), %rdx
1637 jmp *%r9
1638 ud2
1639L(shl_15_loop_L2):
1640 prefetchnta 0x1c0(%rsi)
1641L(shl_15_loop_L1):
/* The borrow from this sub is what the jb below tests; the movaps, movdqa,
   palignr and lea instructions in between do not modify flags.  */
1642 sub $64, %rdx
1643 movaps 0x01(%rsi), %xmm2
1644 movaps 0x11(%rsi), %xmm3
1645 movaps 0x21(%rsi), %xmm4
1646 movaps 0x31(%rsi), %xmm5
/* Keep an unshifted copy of the last chunk; it becomes the next carry-in.  */
1647 movdqa %xmm5, %xmm6
1648 palignr $15, %xmm4, %xmm5
1649 lea 64(%rsi), %rsi
1650 palignr $15, %xmm3, %xmm4
1651 palignr $15, %xmm2, %xmm3
1652 lea 64(%rdi), %rdi
1653 palignr $15, %xmm1, %xmm2
1654 movdqa %xmm6, %xmm1
1655 movdqa %xmm2, -0x40(%rdi)
1656 movaps %xmm3, -0x30(%rdi)
1657 jb L(shl_15_end)
1658 movaps %xmm4, -0x20(%rdi)
1659 movaps %xmm5, -0x10(%rdi)
1660 jmp *%r9
1661 ud2
1662L(shl_15_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, advance
   both pointers by the remaining count, store the saved %xmm0 at (%r8), and
   dispatch through the jump table indexed by that count.  */
1663 movaps %xmm4, -0x20(%rdi)
1664 lea 64(%rdx), %rdx
1665 movaps %xmm5, -0x10(%rdi)
1666 add %rdx, %rdi
1667 movdqu %xmm0, (%r8)
1668 add %rdx, %rsi
1669 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1670
e7044ea7 1671 .p2align 4
6fb8cbcb
L
/* Backward (descending address) 64-bytes-per-pass copy for a source that
   sits 15 bytes past a 16-byte boundary: aligned loads are taken below
   %rsi-15 and re-joined with palignr $15 before aligned stores below %rdi.
   NOTE(review): assumes the dispatcher (outside this chunk) enters with
   %r9 = address of L(shl_15_bwd), %rdx = byte count, %rcx = a size
   threshold, and %xmm0/%r8 = a saved 16-byte chunk and its destination
   -- confirm.  */
1672L(shl_15_bwd):
/* Rebase %r9 by the label distance; it is the target of "jmp *%r9".  */
1673 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
1674 cmp %rcx, %rdx
/* Carry-in chunk for the first palignr; movaps leaves the cmp flags intact.  */
1675 movaps -0x0f(%rsi), %xmm1
/* %rdx below %rcx (unsigned): keep the L1 entry, which skips the prefetch.  */
1676 jb L(L15_bwd)
1677 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
1678L(L15_bwd):
1679 lea -64(%rdx), %rdx
1680 jmp *%r9
1681 ud2
1682L(shl_15_bwd_loop_L2):
1683 prefetchnta -0x1c0(%rsi)
1684L(shl_15_bwd_loop_L1):
1685 movaps -0x1f(%rsi), %xmm2
/* The borrow from this sub is what the jb below tests; the movaps, palignr
   and lea instructions in between do not modify flags.  */
1686 sub $0x40, %rdx
1687 movaps -0x2f(%rsi), %xmm3
1688 movaps -0x3f(%rsi), %xmm4
1689 movaps -0x4f(%rsi), %xmm5
1690 lea -0x40(%rsi), %rsi
1691 palignr $15, %xmm2, %xmm1
1692 palignr $15, %xmm3, %xmm2
1693 palignr $15, %xmm4, %xmm3
1694 palignr $15, %xmm5, %xmm4
1695
1696 movaps %xmm1, -0x10(%rdi)
/* The lowest chunk loaded this pass is the carry-in for the next pass.  */
1697 movaps %xmm5, %xmm1
1698
1699 movaps %xmm2, -0x20(%rdi)
1700 lea -0x40(%rdi), %rdi
1701
1702 movaps %xmm3, 0x10(%rdi)
1703 jb L(shl_15_bwd_end)
1704 movaps %xmm4, (%rdi)
1705 jmp *%r9
1706 ud2
1707L(shl_15_bwd_end):
/* Finish the current 64-byte group, undo the 64-byte bias on %rdx, store
   the saved %xmm0 at (%r8), and dispatch through the jump table indexed by
   the remaining count.  */
1708 movaps %xmm4, (%rdi)
1709 lea 64(%rdx), %rdx
1710 movdqu %xmm0, (%r8)
1711 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1712
e7044ea7 1713 .p2align 4
6fb8cbcb
L
/* Copy the 72 bytes ending at %rsi to the 72 bytes ending at %rdi and
   return.  All loads are issued before the first store.  */
1714L(write_72bytes):
1715 movdqu -72(%rsi), %xmm0
1716 movdqu -56(%rsi), %xmm1
1717 mov -40(%rsi), %r8
1718 mov -32(%rsi), %r9
1719 mov -24(%rsi), %r10
1720 mov -16(%rsi), %r11
1721 mov -8(%rsi), %rcx
1722 movdqu %xmm0, -72(%rdi)
1723 movdqu %xmm1, -56(%rdi)
1724 mov %r8, -40(%rdi)
1725 mov %r9, -32(%rdi)
1726 mov %r10, -24(%rdi)
1727 mov %r11, -16(%rdi)
1728 mov %rcx, -8(%rdi)
1729 ret
1730
e7044ea7 1731 .p2align 4
6fb8cbcb
L
/* Copy the 64 bytes ending at %rsi to the 64 bytes ending at %rdi and
   return.  All loads are issued before the first store.  */
1732L(write_64bytes):
1733 movdqu -64(%rsi), %xmm0
1734 mov -48(%rsi), %rcx
1735 mov -40(%rsi), %r8
1736 mov -32(%rsi), %r9
1737 mov -24(%rsi), %r10
1738 mov -16(%rsi), %r11
1739 mov -8(%rsi), %rdx
1740 movdqu %xmm0, -64(%rdi)
1741 mov %rcx, -48(%rdi)
1742 mov %r8, -40(%rdi)
1743 mov %r9, -32(%rdi)
1744 mov %r10, -24(%rdi)
1745 mov %r11, -16(%rdi)
1746 mov %rdx, -8(%rdi)
1747 ret
1748
e7044ea7 1749 .p2align 4
6fb8cbcb
L
/* Copy the 56 bytes ending at %rsi to the 56 bytes ending at %rdi and
   return.  All loads are issued before the first store.  */
1750L(write_56bytes):
1751 movdqu -56(%rsi), %xmm0
1752 mov -40(%rsi), %r8
1753 mov -32(%rsi), %r9
1754 mov -24(%rsi), %r10
1755 mov -16(%rsi), %r11
1756 mov -8(%rsi), %rcx
1757 movdqu %xmm0, -56(%rdi)
1758 mov %r8, -40(%rdi)
1759 mov %r9, -32(%rdi)
1760 mov %r10, -24(%rdi)
1761 mov %r11, -16(%rdi)
1762 mov %rcx, -8(%rdi)
1763 ret
1764
e7044ea7 1765 .p2align 4
6fb8cbcb
L
/* Copy the 48 bytes ending at %rsi to the 48 bytes ending at %rdi with six
   8-byte moves and return.  All loads are issued before the first store.  */
1766L(write_48bytes):
1767 mov -48(%rsi), %rcx
1768 mov -40(%rsi), %r8
1769 mov -32(%rsi), %r9
1770 mov -24(%rsi), %r10
1771 mov -16(%rsi), %r11
1772 mov -8(%rsi), %rdx
1773 mov %rcx, -48(%rdi)
1774 mov %r8, -40(%rdi)
1775 mov %r9, -32(%rdi)
1776 mov %r10, -24(%rdi)
1777 mov %r11, -16(%rdi)
1778 mov %rdx, -8(%rdi)
1779 ret
1780
e7044ea7 1781 .p2align 4
6fb8cbcb
L
/* Copy the 40 bytes ending at %rsi to the 40 bytes ending at %rdi with five
   8-byte moves and return.  All loads are issued before the first store.  */
1782L(write_40bytes):
1783 mov -40(%rsi), %r8
1784 mov -32(%rsi), %r9
1785 mov -24(%rsi), %r10
1786 mov -16(%rsi), %r11
1787 mov -8(%rsi), %rdx
1788 mov %r8, -40(%rdi)
1789 mov %r9, -32(%rdi)
1790 mov %r10, -24(%rdi)
1791 mov %r11, -16(%rdi)
1792 mov %rdx, -8(%rdi)
1793 ret
1794
e7044ea7 1795 .p2align 4
6fb8cbcb
L
/* Copy the 32 bytes ending at %rsi to the 32 bytes ending at %rdi with four
   8-byte moves and return.  All loads are issued before the first store.  */
1796L(write_32bytes):
1797 mov -32(%rsi), %r9
1798 mov -24(%rsi), %r10
1799 mov -16(%rsi), %r11
1800 mov -8(%rsi), %rdx
1801 mov %r9, -32(%rdi)
1802 mov %r10, -24(%rdi)
1803 mov %r11, -16(%rdi)
1804 mov %rdx, -8(%rdi)
1805 ret
1806
e7044ea7 1807 .p2align 4
6fb8cbcb
L
/* Copy the 24 bytes ending at %rsi to the 24 bytes ending at %rdi with
   three 8-byte moves and return.  All loads precede the first store.  */
1808L(write_24bytes):
1809 mov -24(%rsi), %r10
1810 mov -16(%rsi), %r11
1811 mov -8(%rsi), %rdx
1812 mov %r10, -24(%rdi)
1813 mov %r11, -16(%rdi)
1814 mov %rdx, -8(%rdi)
1815 ret
1816
e7044ea7 1817 .p2align 4
6fb8cbcb
L
/* Copy the 16 bytes ending at %rsi to the 16 bytes ending at %rdi with two
   8-byte moves and return.  Both loads precede the first store.  */
1818L(write_16bytes):
1819 mov -16(%rsi), %r11
1820 mov -8(%rsi), %rdx
1821 mov %r11, -16(%rdi)
1822 mov %rdx, -8(%rdi)
1823 ret
1824
e7044ea7 1825 .p2align 4
6fb8cbcb
L
/* Copy the 8 bytes ending at %rsi to the 8 bytes ending at %rdi, then fall
   through into L(write_0bytes), which is a bare ret (nothing to copy).  */
1826L(write_8bytes):
1827 mov -8(%rsi), %rdx
1828 mov %rdx, -8(%rdi)
1829L(write_0bytes):
1830 ret
1831
e7044ea7 1832 .p2align 4
6fb8cbcb
L
/* Copy the 73 bytes ending at %rsi to the 73 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -9 by 3 bytes; both carry the same
   source bytes.  */
1833L(write_73bytes):
1834 movdqu -73(%rsi), %xmm0
1835 movdqu -57(%rsi), %xmm1
1836 mov -41(%rsi), %rcx
1837 mov -33(%rsi), %r9
1838 mov -25(%rsi), %r10
1839 mov -17(%rsi), %r11
1840 mov -9(%rsi), %r8
1841 mov -4(%rsi), %edx
1842 movdqu %xmm0, -73(%rdi)
1843 movdqu %xmm1, -57(%rdi)
1844 mov %rcx, -41(%rdi)
1845 mov %r9, -33(%rdi)
1846 mov %r10, -25(%rdi)
1847 mov %r11, -17(%rdi)
1848 mov %r8, -9(%rdi)
1849 mov %edx, -4(%rdi)
1850 ret
1851
e7044ea7 1852 .p2align 4
6fb8cbcb
L
/* Copy the 65 bytes ending at %rsi to the 65 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -9 by 3 bytes; both carry the same
   source bytes.  */
1853L(write_65bytes):
1854 movdqu -65(%rsi), %xmm0
1855 movdqu -49(%rsi), %xmm1
1856 mov -33(%rsi), %r9
1857 mov -25(%rsi), %r10
1858 mov -17(%rsi), %r11
1859 mov -9(%rsi), %rcx
1860 mov -4(%rsi), %edx
1861 movdqu %xmm0, -65(%rdi)
1862 movdqu %xmm1, -49(%rdi)
1863 mov %r9, -33(%rdi)
1864 mov %r10, -25(%rdi)
1865 mov %r11, -17(%rdi)
1866 mov %rcx, -9(%rdi)
1867 mov %edx, -4(%rdi)
1868 ret
1869
e7044ea7 1870 .p2align 4
6fb8cbcb
L
/* Copy the 57 bytes ending at %rsi to the 57 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -9 by 3 bytes; both carry the same
   source bytes.  */
1871L(write_57bytes):
1872 movdqu -57(%rsi), %xmm0
1873 mov -41(%rsi), %r8
1874 mov -33(%rsi), %r9
1875 mov -25(%rsi), %r10
1876 mov -17(%rsi), %r11
1877 mov -9(%rsi), %rcx
1878 mov -4(%rsi), %edx
1879 movdqu %xmm0, -57(%rdi)
1880 mov %r8, -41(%rdi)
1881 mov %r9, -33(%rdi)
1882 mov %r10, -25(%rdi)
1883 mov %r11, -17(%rdi)
1884 mov %rcx, -9(%rdi)
1885 mov %edx, -4(%rdi)
1886 ret
1887
e7044ea7 1888 .p2align 4
6fb8cbcb
L
/* Copy the 49 bytes ending at %rsi to the 49 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -9 by 3 bytes; both carry the same
   source bytes.  */
1889L(write_49bytes):
1890 movdqu -49(%rsi), %xmm0
1891 mov -33(%rsi), %r9
1892 mov -25(%rsi), %r10
1893 mov -17(%rsi), %r11
1894 mov -9(%rsi), %rcx
1895 mov -4(%rsi), %edx
1896 movdqu %xmm0, -49(%rdi)
1897 mov %r9, -33(%rdi)
1898 mov %r10, -25(%rdi)
1899 mov %r11, -17(%rdi)
1900 mov %rcx, -9(%rdi)
1901 mov %edx, -4(%rdi)
1902 ret
1903
e7044ea7 1904 .p2align 4
6fb8cbcb
L
/* Copy the 41 bytes ending at %rsi to the 41 bytes ending at %rdi (five
   8-byte moves plus one byte at -1) and return.  All loads are issued
   before the first store.  */
1905L(write_41bytes):
1906 mov -41(%rsi), %r8
1907 mov -33(%rsi), %r9
1908 mov -25(%rsi), %r10
1909 mov -17(%rsi), %r11
1910 mov -9(%rsi), %rcx
1911 mov -1(%rsi), %dl
1912 mov %r8, -41(%rdi)
1913 mov %r9, -33(%rdi)
1914 mov %r10, -25(%rdi)
1915 mov %r11, -17(%rdi)
1916 mov %rcx, -9(%rdi)
1917 mov %dl, -1(%rdi)
1918 ret
1919
e7044ea7 1920 .p2align 4
6fb8cbcb
L
/* Copy the 33 bytes ending at %rsi to the 33 bytes ending at %rdi (four
   8-byte moves plus one byte at -1) and return.  All loads are issued
   before the first store.  */
1921L(write_33bytes):
1922 mov -33(%rsi), %r9
1923 mov -25(%rsi), %r10
1924 mov -17(%rsi), %r11
1925 mov -9(%rsi), %rcx
1926 mov -1(%rsi), %dl
1927 mov %r9, -33(%rdi)
1928 mov %r10, -25(%rdi)
1929 mov %r11, -17(%rdi)
1930 mov %rcx, -9(%rdi)
1931 mov %dl, -1(%rdi)
1932 ret
1933
e7044ea7 1934 .p2align 4
6fb8cbcb
L
/* Copy the 25 bytes ending at %rsi to the 25 bytes ending at %rdi (three
   8-byte moves plus one byte at -1) and return.  All loads are issued
   before the first store.  */
1935L(write_25bytes):
1936 mov -25(%rsi), %r10
1937 mov -17(%rsi), %r11
1938 mov -9(%rsi), %rcx
1939 mov -1(%rsi), %dl
1940 mov %r10, -25(%rdi)
1941 mov %r11, -17(%rdi)
1942 mov %rcx, -9(%rdi)
1943 mov %dl, -1(%rdi)
1944 ret
1945
e7044ea7 1946 .p2align 4
6fb8cbcb
L
/* Copy the 17 bytes ending at %rsi to the 17 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -9 by 3 bytes; both carry the same
   source bytes.  */
1947L(write_17bytes):
1948 mov -17(%rsi), %r11
1949 mov -9(%rsi), %rcx
1950 mov -4(%rsi), %edx
1951 mov %r11, -17(%rdi)
1952 mov %rcx, -9(%rdi)
1953 mov %edx, -4(%rdi)
1954 ret
1955
e7044ea7 1956 .p2align 4
6fb8cbcb
L
/* Copy the 9 bytes ending at %rsi to the 9 bytes ending at %rdi and return.
   Both loads precede the first store.  The 4-byte move at -4 overlaps the
   8-byte move at -9 by 3 bytes; both carry the same source bytes.  */
1957L(write_9bytes):
1958 mov -9(%rsi), %rcx
1959 mov -4(%rsi), %edx
1960 mov %rcx, -9(%rdi)
1961 mov %edx, -4(%rdi)
1962 ret
1963
e7044ea7 1964 .p2align 4
6fb8cbcb
L
/* Copy the single byte at -1(%rsi) to -1(%rdi) and return.  */
1965L(write_1bytes):
1966 mov -1(%rsi), %dl
1967 mov %dl, -1(%rdi)
1968 ret
1969
e7044ea7 1970 .p2align 4
6fb8cbcb
L
/* Copy the 74 bytes ending at %rsi to the 74 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
1971L(write_74bytes):
1972 movdqu -74(%rsi), %xmm0
1973 movdqu -58(%rsi), %xmm1
1974 mov -42(%rsi), %r8
1975 mov -34(%rsi), %r9
1976 mov -26(%rsi), %r10
1977 mov -18(%rsi), %r11
1978 mov -10(%rsi), %rcx
1979 mov -4(%rsi), %edx
1980 movdqu %xmm0, -74(%rdi)
1981 movdqu %xmm1, -58(%rdi)
1982 mov %r8, -42(%rdi)
1983 mov %r9, -34(%rdi)
1984 mov %r10, -26(%rdi)
1985 mov %r11, -18(%rdi)
1986 mov %rcx, -10(%rdi)
1987 mov %edx, -4(%rdi)
1988 ret
1989
e7044ea7 1990 .p2align 4
6fb8cbcb
L
/* Copy the 66 bytes ending at %rsi to the 66 bytes ending at %rdi and
   return.  All loads are issued before the first store.
   %xmm0 carries bytes -66..-51 and %xmm1 bytes -50..-35, so the 8-byte
   moves resume at -34; no separate move at -42 is needed.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
1991L(write_66bytes):
1992 movdqu -66(%rsi), %xmm0
1993 movdqu -50(%rsi), %xmm1
1995 mov -34(%rsi), %r9
1996 mov -26(%rsi), %r10
1997 mov -18(%rsi), %r11
1998 mov -10(%rsi), %rcx
1999 mov -4(%rsi), %edx
2000 movdqu %xmm0, -66(%rdi)
2001 movdqu %xmm1, -50(%rdi)
2003 mov %r9, -34(%rdi)
2004 mov %r10, -26(%rdi)
2005 mov %r11, -18(%rdi)
2006 mov %rcx, -10(%rdi)
2007 mov %edx, -4(%rdi)
2008 ret
2009
e7044ea7 2010 .p2align 4
6fb8cbcb
L
/* Copy the 58 bytes ending at %rsi to the 58 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2011L(write_58bytes):
2012 movdqu -58(%rsi), %xmm1
2013 mov -42(%rsi), %r8
2014 mov -34(%rsi), %r9
2015 mov -26(%rsi), %r10
2016 mov -18(%rsi), %r11
2017 mov -10(%rsi), %rcx
2018 mov -4(%rsi), %edx
2019 movdqu %xmm1, -58(%rdi)
2020 mov %r8, -42(%rdi)
2021 mov %r9, -34(%rdi)
2022 mov %r10, -26(%rdi)
2023 mov %r11, -18(%rdi)
2024 mov %rcx, -10(%rdi)
2025 mov %edx, -4(%rdi)
2026 ret
2027
e7044ea7 2028 .p2align 4
6fb8cbcb
L
/* Copy the 50 bytes ending at %rsi to the 50 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2029L(write_50bytes):
2030 movdqu -50(%rsi), %xmm0
2031 mov -34(%rsi), %r9
2032 mov -26(%rsi), %r10
2033 mov -18(%rsi), %r11
2034 mov -10(%rsi), %rcx
2035 mov -4(%rsi), %edx
2036 movdqu %xmm0, -50(%rdi)
2037 mov %r9, -34(%rdi)
2038 mov %r10, -26(%rdi)
2039 mov %r11, -18(%rdi)
2040 mov %rcx, -10(%rdi)
2041 mov %edx, -4(%rdi)
2042 ret
2043
e7044ea7 2044 .p2align 4
6fb8cbcb
L
/* Copy the 42 bytes ending at %rsi to the 42 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2045L(write_42bytes):
2046 mov -42(%rsi), %r8
2047 mov -34(%rsi), %r9
2048 mov -26(%rsi), %r10
2049 mov -18(%rsi), %r11
2050 mov -10(%rsi), %rcx
2051 mov -4(%rsi), %edx
2052 mov %r8, -42(%rdi)
2053 mov %r9, -34(%rdi)
2054 mov %r10, -26(%rdi)
2055 mov %r11, -18(%rdi)
2056 mov %rcx, -10(%rdi)
2057 mov %edx, -4(%rdi)
2058 ret
2059
e7044ea7 2060 .p2align 4
6fb8cbcb
L
/* Copy the 34 bytes ending at %rsi to the 34 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2061L(write_34bytes):
2062 mov -34(%rsi), %r9
2063 mov -26(%rsi), %r10
2064 mov -18(%rsi), %r11
2065 mov -10(%rsi), %rcx
2066 mov -4(%rsi), %edx
2067 mov %r9, -34(%rdi)
2068 mov %r10, -26(%rdi)
2069 mov %r11, -18(%rdi)
2070 mov %rcx, -10(%rdi)
2071 mov %edx, -4(%rdi)
2072 ret
2073
e7044ea7 2074 .p2align 4
6fb8cbcb
L
/* Copy the 26 bytes ending at %rsi to the 26 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2075L(write_26bytes):
2076 mov -26(%rsi), %r10
2077 mov -18(%rsi), %r11
2078 mov -10(%rsi), %rcx
2079 mov -4(%rsi), %edx
2080 mov %r10, -26(%rdi)
2081 mov %r11, -18(%rdi)
2082 mov %rcx, -10(%rdi)
2083 mov %edx, -4(%rdi)
2084 ret
2085
e7044ea7 2086 .p2align 4
6fb8cbcb
L
/* Copy the 18 bytes ending at %rsi to the 18 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -10 by 2 bytes; both carry the same
   source bytes.  */
2087L(write_18bytes):
2088 mov -18(%rsi), %r11
2089 mov -10(%rsi), %rcx
2090 mov -4(%rsi), %edx
2091 mov %r11, -18(%rdi)
2092 mov %rcx, -10(%rdi)
2093 mov %edx, -4(%rdi)
2094 ret
2095
e7044ea7 2096 .p2align 4
6fb8cbcb
L
/* Copy the 10 bytes ending at %rsi to the 10 bytes ending at %rdi and
   return.  Both loads precede the first store.  The 4-byte move at -4
   overlaps the 8-byte move at -10 by 2 bytes; both carry the same source
   bytes.  */
2097L(write_10bytes):
2098 mov -10(%rsi), %rcx
2099 mov -4(%rsi), %edx
2100 mov %rcx, -10(%rdi)
2101 mov %edx, -4(%rdi)
2102 ret
2103
e7044ea7 2104 .p2align 4
6fb8cbcb
L
/* Copy the 2 bytes ending at %rsi to the 2 bytes ending at %rdi with one
   16-bit move and return.  */
2105L(write_2bytes):
2106 mov -2(%rsi), %dx
2107 mov %dx, -2(%rdi)
2108 ret
2109
e7044ea7 2110 .p2align 4
6fb8cbcb
L
/* Copy the 75 bytes ending at %rsi to the 75 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2111L(write_75bytes):
2112 movdqu -75(%rsi), %xmm0
2113 movdqu -59(%rsi), %xmm1
2114 mov -43(%rsi), %r8
2115 mov -35(%rsi), %r9
2116 mov -27(%rsi), %r10
2117 mov -19(%rsi), %r11
2118 mov -11(%rsi), %rcx
2119 mov -4(%rsi), %edx
2120 movdqu %xmm0, -75(%rdi)
2121 movdqu %xmm1, -59(%rdi)
2122 mov %r8, -43(%rdi)
2123 mov %r9, -35(%rdi)
2124 mov %r10, -27(%rdi)
2125 mov %r11, -19(%rdi)
2126 mov %rcx, -11(%rdi)
2127 mov %edx, -4(%rdi)
2128 ret
2129
e7044ea7 2130 .p2align 4
6fb8cbcb
L
/* Copy the 67 bytes ending at %rsi to the 67 bytes ending at %rdi and
   return.  All loads are issued before the first store.
   The two 16-byte moves tile bytes -67..-52 and -51..-36, so the 8-byte
   moves resume at -35; no 8-byte overlap between the vector moves and no
   separate move at -43 is needed.  The 4-byte move at -4 overlaps the
   8-byte move at -11 by 1 byte; both carry the same source byte.  */
2131L(write_67bytes):
2132 movdqu -67(%rsi), %xmm0
2133 movdqu -51(%rsi), %xmm1
2135 mov -35(%rsi), %r9
2136 mov -27(%rsi), %r10
2137 mov -19(%rsi), %r11
2138 mov -11(%rsi), %rcx
2139 mov -4(%rsi), %edx
2140 movdqu %xmm0, -67(%rdi)
2141 movdqu %xmm1, -51(%rdi)
2143 mov %r9, -35(%rdi)
2144 mov %r10, -27(%rdi)
2145 mov %r11, -19(%rdi)
2146 mov %rcx, -11(%rdi)
2147 mov %edx, -4(%rdi)
2148 ret
2149
e7044ea7 2150 .p2align 4
6fb8cbcb
L
/* Copy the 59 bytes ending at %rsi to the 59 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2151L(write_59bytes):
2152 movdqu -59(%rsi), %xmm0
2153 mov -43(%rsi), %r8
2154 mov -35(%rsi), %r9
2155 mov -27(%rsi), %r10
2156 mov -19(%rsi), %r11
2157 mov -11(%rsi), %rcx
2158 mov -4(%rsi), %edx
2159 movdqu %xmm0, -59(%rdi)
2160 mov %r8, -43(%rdi)
2161 mov %r9, -35(%rdi)
2162 mov %r10, -27(%rdi)
2163 mov %r11, -19(%rdi)
2164 mov %rcx, -11(%rdi)
2165 mov %edx, -4(%rdi)
2166 ret
2167
e7044ea7 2168 .p2align 4
6fb8cbcb
L
/* Copy the 51 bytes ending at %rsi to the 51 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2169L(write_51bytes):
2170 movdqu -51(%rsi), %xmm0
2171 mov -35(%rsi), %r9
2172 mov -27(%rsi), %r10
2173 mov -19(%rsi), %r11
2174 mov -11(%rsi), %rcx
2175 mov -4(%rsi), %edx
2176 movdqu %xmm0, -51(%rdi)
2177 mov %r9, -35(%rdi)
2178 mov %r10, -27(%rdi)
2179 mov %r11, -19(%rdi)
2180 mov %rcx, -11(%rdi)
2181 mov %edx, -4(%rdi)
2182 ret
2183
e7044ea7 2184 .p2align 4
6fb8cbcb
L
/* Copy the 43 bytes ending at %rsi to the 43 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2185L(write_43bytes):
2186 mov -43(%rsi), %r8
2187 mov -35(%rsi), %r9
2188 mov -27(%rsi), %r10
2189 mov -19(%rsi), %r11
2190 mov -11(%rsi), %rcx
2191 mov -4(%rsi), %edx
2192 mov %r8, -43(%rdi)
2193 mov %r9, -35(%rdi)
2194 mov %r10, -27(%rdi)
2195 mov %r11, -19(%rdi)
2196 mov %rcx, -11(%rdi)
2197 mov %edx, -4(%rdi)
2198 ret
2199
e7044ea7 2200 .p2align 4
6fb8cbcb
L
/* Copy the 35 bytes ending at %rsi to the 35 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2201L(write_35bytes):
2202 mov -35(%rsi), %r9
2203 mov -27(%rsi), %r10
2204 mov -19(%rsi), %r11
2205 mov -11(%rsi), %rcx
2206 mov -4(%rsi), %edx
2207 mov %r9, -35(%rdi)
2208 mov %r10, -27(%rdi)
2209 mov %r11, -19(%rdi)
2210 mov %rcx, -11(%rdi)
2211 mov %edx, -4(%rdi)
2212 ret
2213
e7044ea7 2214 .p2align 4
6fb8cbcb
L
/* Copy the 27 bytes ending at %rsi to the 27 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2215L(write_27bytes):
2216 mov -27(%rsi), %r10
2217 mov -19(%rsi), %r11
2218 mov -11(%rsi), %rcx
2219 mov -4(%rsi), %edx
2220 mov %r10, -27(%rdi)
2221 mov %r11, -19(%rdi)
2222 mov %rcx, -11(%rdi)
2223 mov %edx, -4(%rdi)
2224 ret
2225
e7044ea7 2226 .p2align 4
6fb8cbcb
L
/* Copy the 19 bytes ending at %rsi to the 19 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 4-byte move
   at -4 overlaps the 8-byte move at -11 by 1 byte; both carry the same
   source byte.  */
2227L(write_19bytes):
2228 mov -19(%rsi), %r11
2229 mov -11(%rsi), %rcx
2230 mov -4(%rsi), %edx
2231 mov %r11, -19(%rdi)
2232 mov %rcx, -11(%rdi)
2233 mov %edx, -4(%rdi)
2234 ret
2235
e7044ea7 2236 .p2align 4
6fb8cbcb
L
/* Copy the 11 bytes ending at %rsi to the 11 bytes ending at %rdi and
   return.  Both loads precede the first store.  The 4-byte move at -4
   overlaps the 8-byte move at -11 by 1 byte; both carry the same source
   byte.  */
2237L(write_11bytes):
2238 mov -11(%rsi), %rcx
2239 mov -4(%rsi), %edx
2240 mov %rcx, -11(%rdi)
2241 mov %edx, -4(%rdi)
2242 ret
2243
e7044ea7 2244 .p2align 4
6fb8cbcb
L
/* Copy the 3 bytes ending at %rsi to the 3 bytes ending at %rdi and return.
   Two 16-bit moves at -3 and -2 overlap by 1 byte; both loads precede the
   first store, so the shared byte is written with the same source value.  */
2245L(write_3bytes):
2246 mov -3(%rsi), %dx
2247 mov -2(%rsi), %cx
2248 mov %dx, -3(%rdi)
2249 mov %cx, -2(%rdi)
2250 ret
2251
e7044ea7 2252 .p2align 4
6fb8cbcb
L
/* Copy the 76 bytes ending at %rsi to the 76 bytes ending at %rdi (two
   16-byte, five 8-byte and one 4-byte move) and return.  All loads are
   issued before the first store.  */
2253L(write_76bytes):
2254 movdqu -76(%rsi), %xmm0
2255 movdqu -60(%rsi), %xmm1
2256 mov -44(%rsi), %r8
2257 mov -36(%rsi), %r9
2258 mov -28(%rsi), %r10
2259 mov -20(%rsi), %r11
2260 mov -12(%rsi), %rcx
2261 mov -4(%rsi), %edx
2262 movdqu %xmm0, -76(%rdi)
2263 movdqu %xmm1, -60(%rdi)
2264 mov %r8, -44(%rdi)
2265 mov %r9, -36(%rdi)
2266 mov %r10, -28(%rdi)
2267 mov %r11, -20(%rdi)
2268 mov %rcx, -12(%rdi)
2269 mov %edx, -4(%rdi)
2270 ret
2271
e7044ea7 2272 .p2align 4
6fb8cbcb
L
/* Copy the 68 bytes ending at %rsi to the 68 bytes ending at %rdi (two
   16-byte, four 8-byte and one 4-byte move) and return.  All loads are
   issued before the first store.  */
2273L(write_68bytes):
2274 movdqu -68(%rsi), %xmm0
2275 movdqu -52(%rsi), %xmm1
2276 mov -36(%rsi), %r9
2277 mov -28(%rsi), %r10
2278 mov -20(%rsi), %r11
2279 mov -12(%rsi), %rcx
2280 mov -4(%rsi), %edx
2281 movdqu %xmm0, -68(%rdi)
2282 movdqu %xmm1, -52(%rdi)
2283 mov %r9, -36(%rdi)
2284 mov %r10, -28(%rdi)
2285 mov %r11, -20(%rdi)
2286 mov %rcx, -12(%rdi)
2287 mov %edx, -4(%rdi)
2288 ret
2289
e7044ea7 2290 .p2align 4
6fb8cbcb
L
/* Copy the 60 bytes ending at %rsi to the 60 bytes ending at %rdi (one
   16-byte, five 8-byte and one 4-byte move) and return.  All loads are
   issued before the first store.  */
2291L(write_60bytes):
2292 movdqu -60(%rsi), %xmm0
2293 mov -44(%rsi), %r8
2294 mov -36(%rsi), %r9
2295 mov -28(%rsi), %r10
2296 mov -20(%rsi), %r11
2297 mov -12(%rsi), %rcx
2298 mov -4(%rsi), %edx
2299 movdqu %xmm0, -60(%rdi)
2300 mov %r8, -44(%rdi)
2301 mov %r9, -36(%rdi)
2302 mov %r10, -28(%rdi)
2303 mov %r11, -20(%rdi)
2304 mov %rcx, -12(%rdi)
2305 mov %edx, -4(%rdi)
2306 ret
2307
e7044ea7 2308 .p2align 4
6fb8cbcb
L
/* Copy the 52 bytes ending at %rsi to the 52 bytes ending at %rdi (one
   16-byte, four 8-byte and one 4-byte move) and return.  All loads are
   issued before the first store.  */
2309L(write_52bytes):
2310 movdqu -52(%rsi), %xmm0
2311 mov -36(%rsi), %r9
2312 mov -28(%rsi), %r10
2313 mov -20(%rsi), %r11
2314 mov -12(%rsi), %rcx
2315 mov -4(%rsi), %edx
2316 movdqu %xmm0, -52(%rdi)
2317 mov %r9, -36(%rdi)
2318 mov %r10, -28(%rdi)
2319 mov %r11, -20(%rdi)
2320 mov %rcx, -12(%rdi)
2321 mov %edx, -4(%rdi)
2322 ret
2323
e7044ea7 2324 .p2align 4
6fb8cbcb
L
/* Copy the 44 bytes ending at %rsi to the 44 bytes ending at %rdi (five
   8-byte moves and one 4-byte move) and return.  All loads are issued
   before the first store.  */
2325L(write_44bytes):
2326 mov -44(%rsi), %r8
2327 mov -36(%rsi), %r9
2328 mov -28(%rsi), %r10
2329 mov -20(%rsi), %r11
2330 mov -12(%rsi), %rcx
2331 mov -4(%rsi), %edx
2332 mov %r8, -44(%rdi)
2333 mov %r9, -36(%rdi)
2334 mov %r10, -28(%rdi)
2335 mov %r11, -20(%rdi)
2336 mov %rcx, -12(%rdi)
2337 mov %edx, -4(%rdi)
2338 ret
2339
e7044ea7 2340 .p2align 4
6fb8cbcb
L
/* Copy the 36 bytes ending at %rsi to the 36 bytes ending at %rdi (four
   8-byte moves and one 4-byte move) and return.  All loads are issued
   before the first store.  */
2341L(write_36bytes):
2342 mov -36(%rsi), %r9
2343 mov -28(%rsi), %r10
2344 mov -20(%rsi), %r11
2345 mov -12(%rsi), %rcx
2346 mov -4(%rsi), %edx
2347 mov %r9, -36(%rdi)
2348 mov %r10, -28(%rdi)
2349 mov %r11, -20(%rdi)
2350 mov %rcx, -12(%rdi)
2351 mov %edx, -4(%rdi)
2352 ret
2353
e7044ea7 2354 .p2align 4
6fb8cbcb
L
/* Copy the 28 bytes ending at %rsi to the 28 bytes ending at %rdi (three
   8-byte moves and one 4-byte move) and return.  All loads are issued
   before the first store.  */
2355L(write_28bytes):
2356 mov -28(%rsi), %r10
2357 mov -20(%rsi), %r11
2358 mov -12(%rsi), %rcx
2359 mov -4(%rsi), %edx
2360 mov %r10, -28(%rdi)
2361 mov %r11, -20(%rdi)
2362 mov %rcx, -12(%rdi)
2363 mov %edx, -4(%rdi)
2364 ret
2365
e7044ea7 2366 .p2align 4
6fb8cbcb
L
/* Copy the 20 bytes ending at %rsi to the 20 bytes ending at %rdi (two
   8-byte moves and one 4-byte move) and return.  All loads are issued
   before the first store.  */
2367L(write_20bytes):
2368 mov -20(%rsi), %r11
2369 mov -12(%rsi), %rcx
2370 mov -4(%rsi), %edx
2371 mov %r11, -20(%rdi)
2372 mov %rcx, -12(%rdi)
2373 mov %edx, -4(%rdi)
2374 ret
2375
e7044ea7 2376 .p2align 4
6fb8cbcb
L
/* Copy the 12 bytes ending at %rsi to the 12 bytes ending at %rdi (one
   8-byte move and one 4-byte move) and return.  Both loads precede the
   first store.  */
2377L(write_12bytes):
2378 mov -12(%rsi), %rcx
2379 mov -4(%rsi), %edx
2380 mov %rcx, -12(%rdi)
2381 mov %edx, -4(%rdi)
2382 ret
2383
e7044ea7 2384 .p2align 4
6fb8cbcb
L
/* Copy the 4 bytes ending at %rsi to the 4 bytes ending at %rdi with one
   32-bit move and return.  */
2385L(write_4bytes):
2386 mov -4(%rsi), %edx
2387 mov %edx, -4(%rdi)
2388 ret
2389
e7044ea7 2390 .p2align 4
6fb8cbcb
L
/* Copy the 77 bytes ending at %rsi to the 77 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2391L(write_77bytes):
2392 movdqu -77(%rsi), %xmm0
2393 movdqu -61(%rsi), %xmm1
2394 mov -45(%rsi), %r8
2395 mov -37(%rsi), %r9
2396 mov -29(%rsi), %r10
2397 mov -21(%rsi), %r11
2398 mov -13(%rsi), %rcx
2399 mov -8(%rsi), %rdx
2400 movdqu %xmm0, -77(%rdi)
2401 movdqu %xmm1, -61(%rdi)
2402 mov %r8, -45(%rdi)
2403 mov %r9, -37(%rdi)
2404 mov %r10, -29(%rdi)
2405 mov %r11, -21(%rdi)
2406 mov %rcx, -13(%rdi)
2407 mov %rdx, -8(%rdi)
2408 ret
2409
e7044ea7 2410 .p2align 4
6fb8cbcb
L
/* Copy the 69 bytes ending at %rsi to the 69 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2411L(write_69bytes):
2412 movdqu -69(%rsi), %xmm0
2413 movdqu -53(%rsi), %xmm1
2414 mov -37(%rsi), %r9
2415 mov -29(%rsi), %r10
2416 mov -21(%rsi), %r11
2417 mov -13(%rsi), %rcx
2418 mov -8(%rsi), %rdx
2419 movdqu %xmm0, -69(%rdi)
2420 movdqu %xmm1, -53(%rdi)
2421 mov %r9, -37(%rdi)
2422 mov %r10, -29(%rdi)
2423 mov %r11, -21(%rdi)
2424 mov %rcx, -13(%rdi)
2425 mov %rdx, -8(%rdi)
2426 ret
2427
e7044ea7 2428 .p2align 4
6fb8cbcb
L
/* Copy the 61 bytes ending at %rsi to the 61 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2429L(write_61bytes):
2430 movdqu -61(%rsi), %xmm0
2431 mov -45(%rsi), %r8
2432 mov -37(%rsi), %r9
2433 mov -29(%rsi), %r10
2434 mov -21(%rsi), %r11
2435 mov -13(%rsi), %rcx
2436 mov -8(%rsi), %rdx
2437 movdqu %xmm0, -61(%rdi)
2438 mov %r8, -45(%rdi)
2439 mov %r9, -37(%rdi)
2440 mov %r10, -29(%rdi)
2441 mov %r11, -21(%rdi)
2442 mov %rcx, -13(%rdi)
2443 mov %rdx, -8(%rdi)
2444 ret
2445
e7044ea7 2446 .p2align 4
6fb8cbcb
L
/* Copy the 53 bytes ending at %rsi to the 53 bytes ending at %rdi and
   return.  All loads are issued before the first store.
   %xmm0 carries bytes -53..-38 and the 8-byte moves resume at -37, so no
   load at -45 is needed (its result was never stored).  The 8-byte move at
   -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2447L(write_53bytes):
2448 movdqu -53(%rsi), %xmm0
2450 mov -37(%rsi), %r9
2451 mov -29(%rsi), %r10
2452 mov -21(%rsi), %r11
2453 mov -13(%rsi), %rcx
2454 mov -8(%rsi), %rdx
2455 movdqu %xmm0, -53(%rdi)
2456 mov %r9, -37(%rdi)
2457 mov %r10, -29(%rdi)
2458 mov %r11, -21(%rdi)
2459 mov %rcx, -13(%rdi)
2460 mov %rdx, -8(%rdi)
2461 ret
2462
e7044ea7 2463 .p2align 4
6fb8cbcb
L
/* Copy the 45 bytes ending at %rsi to the 45 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2464L(write_45bytes):
2465 mov -45(%rsi), %r8
2466 mov -37(%rsi), %r9
2467 mov -29(%rsi), %r10
2468 mov -21(%rsi), %r11
2469 mov -13(%rsi), %rcx
2470 mov -8(%rsi), %rdx
2471 mov %r8, -45(%rdi)
2472 mov %r9, -37(%rdi)
2473 mov %r10, -29(%rdi)
2474 mov %r11, -21(%rdi)
2475 mov %rcx, -13(%rdi)
2476 mov %rdx, -8(%rdi)
2477 ret
2478
e7044ea7 2479 .p2align 4
6fb8cbcb
L
/* Copy the 37 bytes ending at %rsi to the 37 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2480L(write_37bytes):
2481 mov -37(%rsi), %r9
2482 mov -29(%rsi), %r10
2483 mov -21(%rsi), %r11
2484 mov -13(%rsi), %rcx
2485 mov -8(%rsi), %rdx
2486 mov %r9, -37(%rdi)
2487 mov %r10, -29(%rdi)
2488 mov %r11, -21(%rdi)
2489 mov %rcx, -13(%rdi)
2490 mov %rdx, -8(%rdi)
2491 ret
2492
e7044ea7 2493 .p2align 4
6fb8cbcb
L
/* Copy the 29 bytes ending at %rsi to the 29 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2494L(write_29bytes):
2495 mov -29(%rsi), %r10
2496 mov -21(%rsi), %r11
2497 mov -13(%rsi), %rcx
2498 mov -8(%rsi), %rdx
2499 mov %r10, -29(%rdi)
2500 mov %r11, -21(%rdi)
2501 mov %rcx, -13(%rdi)
2502 mov %rdx, -8(%rdi)
2503 ret
2504
e7044ea7 2505 .p2align 4
6fb8cbcb
L
/* Copy the 21 bytes ending at %rsi to the 21 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -13 by 3 bytes; both carry the same
   source bytes.  */
2506L(write_21bytes):
2507 mov -21(%rsi), %r11
2508 mov -13(%rsi), %rcx
2509 mov -8(%rsi), %rdx
2510 mov %r11, -21(%rdi)
2511 mov %rcx, -13(%rdi)
2512 mov %rdx, -8(%rdi)
2513 ret
2514
e7044ea7 2515 .p2align 4
6fb8cbcb
L
/* Copy the 13 bytes ending at %rsi to the 13 bytes ending at %rdi and
   return.  Both loads precede the first store.  The 8-byte move at -8
   overlaps the 8-byte move at -13 by 3 bytes; both carry the same source
   bytes.  */
2516L(write_13bytes):
2517 mov -13(%rsi), %rcx
2518 mov -8(%rsi), %rdx
2519 mov %rcx, -13(%rdi)
2520 mov %rdx, -8(%rdi)
2521 ret
2522
e7044ea7 2523 .p2align 4
6fb8cbcb
L
/* Copy the 5 bytes ending at %rsi to the 5 bytes ending at %rdi and return.
   Two 32-bit moves at -5 and -4 overlap by 3 bytes; both loads precede the
   first store, so the shared bytes are written with the same source
   values.  */
2524L(write_5bytes):
2525 mov -5(%rsi), %edx
2526 mov -4(%rsi), %ecx
2527 mov %edx, -5(%rdi)
2528 mov %ecx, -4(%rdi)
2529 ret
2530
e7044ea7 2531 .p2align 4
6fb8cbcb
L
/* Copy the 78 bytes ending at %rsi to the 78 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -14 by 2 bytes; both carry the same
   source bytes.  */
2532L(write_78bytes):
2533 movdqu -78(%rsi), %xmm0
2534 movdqu -62(%rsi), %xmm1
2535 mov -46(%rsi), %r8
2536 mov -38(%rsi), %r9
2537 mov -30(%rsi), %r10
2538 mov -22(%rsi), %r11
2539 mov -14(%rsi), %rcx
2540 mov -8(%rsi), %rdx
2541 movdqu %xmm0, -78(%rdi)
2542 movdqu %xmm1, -62(%rdi)
2543 mov %r8, -46(%rdi)
2544 mov %r9, -38(%rdi)
2545 mov %r10, -30(%rdi)
2546 mov %r11, -22(%rdi)
2547 mov %rcx, -14(%rdi)
2548 mov %rdx, -8(%rdi)
2549 ret
2550
e7044ea7 2551 .p2align 4
6fb8cbcb
L
/* Copy the 70 bytes ending at %rsi to the 70 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -14 by 2 bytes; both carry the same
   source bytes.  */
2552L(write_70bytes):
2553 movdqu -70(%rsi), %xmm0
2554 movdqu -54(%rsi), %xmm1
2555 mov -38(%rsi), %r9
2556 mov -30(%rsi), %r10
2557 mov -22(%rsi), %r11
2558 mov -14(%rsi), %rcx
2559 mov -8(%rsi), %rdx
2560 movdqu %xmm0, -70(%rdi)
2561 movdqu %xmm1, -54(%rdi)
2562 mov %r9, -38(%rdi)
2563 mov %r10, -30(%rdi)
2564 mov %r11, -22(%rdi)
2565 mov %rcx, -14(%rdi)
2566 mov %rdx, -8(%rdi)
2567 ret
2568
e7044ea7 2569 .p2align 4
6fb8cbcb
L
/* Copy the 62 bytes ending at %rsi to the 62 bytes ending at %rdi and
   return.  All loads are issued before the first store.  The 8-byte move
   at -8 overlaps the 8-byte move at -14 by 2 bytes; both carry the same
   source bytes.  */
2570L(write_62bytes):
2571 movdqu -62(%rsi), %xmm0
2572 mov -46(%rsi), %r8
2573 mov -38(%rsi), %r9
2574 mov -30(%rsi), %r10
2575 mov -22(%rsi), %r11
2576 mov -14(%rsi), %rcx
2577 mov -8(%rsi), %rdx
2578 movdqu %xmm0, -62(%rdi)
2579 mov %r8, -46(%rdi)
2580 mov %r9, -38(%rdi)
2581 mov %r10, -30(%rdi)
2582 mov %r11, -22(%rdi)
2583 mov %rcx, -14(%rdi)
2584 mov %rdx, -8(%rdi)
2585 ret
2586
e7044ea7 2587 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 54 bytes ending just below %rsi to the 54 bytes
   ending just below %rdi, then return.  Listed as entry 54 of
   L(table_less_80bytes).  One unaligned 16-byte move followed by five
   8-byte moves; the last two (offsets -14 and -8) overlap by 2 bytes.
   All loads precede the first store.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
2588L(write_54bytes):
2589 movdqu -54(%rsi), %xmm0
2590 mov -38(%rsi), %r9
2591 mov -30(%rsi), %r10
2592 mov -22(%rsi), %r11
2593 mov -14(%rsi), %rcx
2594 mov -8(%rsi), %rdx
2595 movdqu %xmm0, -54(%rdi)
2596 mov %r9, -38(%rdi)
2597 mov %r10, -30(%rdi)
2598 mov %r11, -22(%rdi)
2599 mov %rcx, -14(%rdi)
2600 mov %rdx, -8(%rdi)
2601 ret
2602
e7044ea7 2603 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 46 bytes ending just below %rsi to the 46 bytes
   ending just below %rdi, then return.  Listed as entry 46 of
   L(table_less_80bytes).  Six 8-byte moves; the last two (offsets -14
   and -8) overlap by 2 bytes.  All loads precede the first store.
   Clobbers %r8-%r11, %rcx, %rdx.  */
2604L(write_46bytes):
2605 mov -46(%rsi), %r8
2606 mov -38(%rsi), %r9
2607 mov -30(%rsi), %r10
2608 mov -22(%rsi), %r11
2609 mov -14(%rsi), %rcx
2610 mov -8(%rsi), %rdx
2611 mov %r8, -46(%rdi)
2612 mov %r9, -38(%rdi)
2613 mov %r10, -30(%rdi)
2614 mov %r11, -22(%rdi)
2615 mov %rcx, -14(%rdi)
2616 mov %rdx, -8(%rdi)
2617 ret
2618
e7044ea7 2619 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 38 bytes ending just below %rsi to the 38 bytes
   ending just below %rdi, then return.  Listed as entry 38 of
   L(table_less_80bytes).  Five 8-byte moves; the last two (offsets -14
   and -8) overlap by 2 bytes.  All loads precede the first store.
   Clobbers %r9-%r11, %rcx, %rdx.  */
2620L(write_38bytes):
2621 mov -38(%rsi), %r9
2622 mov -30(%rsi), %r10
2623 mov -22(%rsi), %r11
2624 mov -14(%rsi), %rcx
2625 mov -8(%rsi), %rdx
2626 mov %r9, -38(%rdi)
2627 mov %r10, -30(%rdi)
2628 mov %r11, -22(%rdi)
2629 mov %rcx, -14(%rdi)
2630 mov %rdx, -8(%rdi)
2631 ret
2632
e7044ea7 2633 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 30 bytes ending just below %rsi to the 30 bytes
   ending just below %rdi, then return.  Listed as entry 30 of
   L(table_less_80bytes).  Four 8-byte moves; the last two (offsets -14
   and -8) overlap by 2 bytes.  All loads precede the first store.
   Clobbers %r10, %r11, %rcx, %rdx.  */
2634L(write_30bytes):
2635 mov -30(%rsi), %r10
2636 mov -22(%rsi), %r11
2637 mov -14(%rsi), %rcx
2638 mov -8(%rsi), %rdx
2639 mov %r10, -30(%rdi)
2640 mov %r11, -22(%rdi)
2641 mov %rcx, -14(%rdi)
2642 mov %rdx, -8(%rdi)
2643 ret
2644
e7044ea7 2645 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 22 bytes ending just below %rsi to the 22 bytes
   ending just below %rdi, then return.  Listed as entry 22 of
   L(table_less_80bytes).  Three 8-byte moves; the last two (offsets -14
   and -8) overlap by 2 bytes.  All loads precede the first store.
   Clobbers %r11, %rcx, %rdx.  */
2646L(write_22bytes):
2647 mov -22(%rsi), %r11
2648 mov -14(%rsi), %rcx
2649 mov -8(%rsi), %rdx
2650 mov %r11, -22(%rdi)
2651 mov %rcx, -14(%rdi)
2652 mov %rdx, -8(%rdi)
2653 ret
2654
e7044ea7 2655 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 14 bytes ending just below %rsi to the 14 bytes
   ending just below %rdi, then return.  Listed as entry 14 of
   L(table_less_80bytes).  Two 8-byte moves (offsets -14 and -8) that
   overlap by 2 bytes.  Both loads precede the first store.
   Clobbers %rcx, %rdx.  */
2656L(write_14bytes):
2657 mov -14(%rsi), %rcx
2658 mov -8(%rsi), %rdx
2659 mov %rcx, -14(%rdi)
2660 mov %rdx, -8(%rdi)
2661 ret
2662
e7044ea7 2663 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 6 bytes ending just below %rsi to the 6 bytes
   ending just below %rdi, then return.  Listed as entry 6 of
   L(table_less_80bytes).  Two 4-byte moves (offsets -6 and -4) that
   overlap by 2 bytes.  Both loads precede the first store.
   Clobbers %edx, %ecx.  */
2664L(write_6bytes):
2665 mov -6(%rsi), %edx
2666 mov -4(%rsi), %ecx
2667 mov %edx, -6(%rdi)
2668 mov %ecx, -4(%rdi)
2669 ret
2670
e7044ea7 2671 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 79 bytes ending just below %rsi to the 79 bytes
   ending just below %rdi, then return.  Listed as entry 79 (the last
   entry) of L(table_less_80bytes).  Two unaligned 16-byte moves followed
   by six 8-byte moves; the last two (offsets -15 and -8) overlap by
   1 byte.  All loads precede the first store.
   Clobbers %xmm0, %xmm1, %r8-%r11, %rcx, %rdx.  */
2672L(write_79bytes):
2673 movdqu -79(%rsi), %xmm0
2674 movdqu -63(%rsi), %xmm1
2675 mov -47(%rsi), %r8
2676 mov -39(%rsi), %r9
2677 mov -31(%rsi), %r10
2678 mov -23(%rsi), %r11
2679 mov -15(%rsi), %rcx
2680 mov -8(%rsi), %rdx
2681 movdqu %xmm0, -79(%rdi)
2682 movdqu %xmm1, -63(%rdi)
2683 mov %r8, -47(%rdi)
2684 mov %r9, -39(%rdi)
2685 mov %r10, -31(%rdi)
2686 mov %r11, -23(%rdi)
2687 mov %rcx, -15(%rdi)
2688 mov %rdx, -8(%rdi)
2689 ret
2690
e7044ea7 2691 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 71 bytes ending just below %rsi to the 71 bytes
   ending just below %rdi, then return.  Listed as entry 71 of
   L(table_less_80bytes).  Two unaligned 16-byte moves followed by five
   8-byte moves; the last two (offsets -15 and -8) overlap by 1 byte.
   All loads precede the first store.
   Clobbers %xmm0, %xmm1, %r9-%r11, %rcx, %rdx.  */
2692L(write_71bytes):
2693 movdqu -71(%rsi), %xmm0
2694 movdqu -55(%rsi), %xmm1
2695 mov -39(%rsi), %r9
2696 mov -31(%rsi), %r10
2697 mov -23(%rsi), %r11
2698 mov -15(%rsi), %rcx
2699 mov -8(%rsi), %rdx
2700 movdqu %xmm0, -71(%rdi)
2701 movdqu %xmm1, -55(%rdi)
2702 mov %r9, -39(%rdi)
2703 mov %r10, -31(%rdi)
2704 mov %r11, -23(%rdi)
2705 mov %rcx, -15(%rdi)
2706 mov %rdx, -8(%rdi)
2707 ret
2708
e7044ea7 2709 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 63 bytes ending just below %rsi to the 63 bytes
   ending just below %rdi, then return.  Listed as entry 63 of
   L(table_less_80bytes).  One unaligned 16-byte move followed by six
   8-byte moves; the last two (offsets -15 and -8) overlap by 1 byte.
   All loads precede the first store.
   Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
2710L(write_63bytes):
2711 movdqu -63(%rsi), %xmm0
2712 mov -47(%rsi), %r8
2713 mov -39(%rsi), %r9
2714 mov -31(%rsi), %r10
2715 mov -23(%rsi), %r11
2716 mov -15(%rsi), %rcx
2717 mov -8(%rsi), %rdx
2718 movdqu %xmm0, -63(%rdi)
2719 mov %r8, -47(%rdi)
2720 mov %r9, -39(%rdi)
2721 mov %r10, -31(%rdi)
2722 mov %r11, -23(%rdi)
2723 mov %rcx, -15(%rdi)
2724 mov %rdx, -8(%rdi)
2725 ret
2726
e7044ea7 2727 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 55 bytes ending just below %rsi to the 55 bytes
   ending just below %rdi, then return.  Listed as entry 55 of
   L(table_less_80bytes).  One unaligned 16-byte move followed by five
   8-byte moves; the last two (offsets -15 and -8) overlap by 1 byte.
   All loads precede the first store.
   Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
2728L(write_55bytes):
2729 movdqu -55(%rsi), %xmm0
2730 mov -39(%rsi), %r9
2731 mov -31(%rsi), %r10
2732 mov -23(%rsi), %r11
2733 mov -15(%rsi), %rcx
2734 mov -8(%rsi), %rdx
2735 movdqu %xmm0, -55(%rdi)
2736 mov %r9, -39(%rdi)
2737 mov %r10, -31(%rdi)
2738 mov %r11, -23(%rdi)
2739 mov %rcx, -15(%rdi)
2740 mov %rdx, -8(%rdi)
2741 ret
2742
e7044ea7 2743 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 47 bytes ending just below %rsi to the 47 bytes
   ending just below %rdi, then return.  Listed as entry 47 of
   L(table_less_80bytes).  Six 8-byte moves; the last two (offsets -15
   and -8) overlap by 1 byte.  All loads precede the first store.
   Clobbers %r8-%r11, %rcx, %rdx.  */
2744L(write_47bytes):
2745 mov -47(%rsi), %r8
2746 mov -39(%rsi), %r9
2747 mov -31(%rsi), %r10
2748 mov -23(%rsi), %r11
2749 mov -15(%rsi), %rcx
2750 mov -8(%rsi), %rdx
2751 mov %r8, -47(%rdi)
2752 mov %r9, -39(%rdi)
2753 mov %r10, -31(%rdi)
2754 mov %r11, -23(%rdi)
2755 mov %rcx, -15(%rdi)
2756 mov %rdx, -8(%rdi)
2757 ret
2758
e7044ea7 2759 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 39 bytes ending just below %rsi to the 39 bytes
   ending just below %rdi, then return.  Listed as entry 39 of
   L(table_less_80bytes).  Five 8-byte moves; the last two (offsets -15
   and -8) overlap by 1 byte.  All loads precede the first store.
   Clobbers %r9-%r11, %rcx, %rdx.  */
2760L(write_39bytes):
2761 mov -39(%rsi), %r9
2762 mov -31(%rsi), %r10
2763 mov -23(%rsi), %r11
2764 mov -15(%rsi), %rcx
2765 mov -8(%rsi), %rdx
2766 mov %r9, -39(%rdi)
2767 mov %r10, -31(%rdi)
2768 mov %r11, -23(%rdi)
2769 mov %rcx, -15(%rdi)
2770 mov %rdx, -8(%rdi)
2771 ret
2772
e7044ea7 2773 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 31 bytes ending just below %rsi to the 31 bytes
   ending just below %rdi, then return.  Listed as entry 31 of
   L(table_less_80bytes).  Four 8-byte moves; the last two (offsets -15
   and -8) overlap by 1 byte.  All loads precede the first store.
   Clobbers %r10, %r11, %rcx, %rdx.  */
2774L(write_31bytes):
2775 mov -31(%rsi), %r10
2776 mov -23(%rsi), %r11
2777 mov -15(%rsi), %rcx
2778 mov -8(%rsi), %rdx
2779 mov %r10, -31(%rdi)
2780 mov %r11, -23(%rdi)
2781 mov %rcx, -15(%rdi)
2782 mov %rdx, -8(%rdi)
2783 ret
2784
e7044ea7 2785 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 23 bytes ending just below %rsi to the 23 bytes
   ending just below %rdi, then return.  Listed as entry 23 of
   L(table_less_80bytes).  Three 8-byte moves; the last two (offsets -15
   and -8) overlap by 1 byte.  All loads precede the first store.
   Clobbers %r11, %rcx, %rdx.  */
2786L(write_23bytes):
2787 mov -23(%rsi), %r11
2788 mov -15(%rsi), %rcx
2789 mov -8(%rsi), %rdx
2790 mov %r11, -23(%rdi)
2791 mov %rcx, -15(%rdi)
2792 mov %rdx, -8(%rdi)
2793 ret
2794
e7044ea7 2795 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 15 bytes ending just below %rsi to the 15 bytes
   ending just below %rdi, then return.  Listed as entry 15 of
   L(table_less_80bytes).  Two 8-byte moves (offsets -15 and -8) that
   overlap by 1 byte.  Both loads precede the first store.
   Clobbers %rcx, %rdx.  */
2796L(write_15bytes):
2797 mov -15(%rsi), %rcx
2798 mov -8(%rsi), %rdx
2799 mov %rcx, -15(%rdi)
2800 mov %rdx, -8(%rdi)
2801 ret
2802
e7044ea7 2803 .p2align 4
6fb8cbcb
L
/* Tail handler: copy the 7 bytes ending just below %rsi to the 7 bytes
   ending just below %rdi, then return.  Listed as entry 7 of
   L(table_less_80bytes).  Two 4-byte moves (offsets -7 and -4) that
   overlap by 1 byte.  Both loads precede the first store.
   Clobbers %edx, %ecx.  */
2804L(write_7bytes):
2805 mov -7(%rsi), %edx
2806 mov -4(%rsi), %ecx
2807 mov %edx, -7(%rdi)
2808 mov %ecx, -4(%rdi)
2809 ret
2810
e7044ea7 2811 .p2align 4
6fb8cbcb
L
/* Forward copy using non-temporal stores, walking %rsi/%rdi upward.
   %xmm0, %r8, %rdx and (for memmove) %rcx are set by code outside this
   view; presumably %xmm0/%r8 are the saved leading 16 source bytes and
   the original destination, and %rdx is the remaining byte count --
   TODO confirm against the caller.  */
2812L(large_page_fwd):
/* Copy one 16-byte block and flush the pending %xmm0 to (%r8).  movntdq
   requires a 16-byte aligned destination.  */
2813 movdqu (%rsi), %xmm1
2814 lea 16(%rsi), %rsi
2815 movdqu %xmm0, (%r8)
2816 movntdq %xmm1, (%rdi)
2817 lea 16(%rdi), %rdi
/* %rdx -= 0x90: 0x10 for the block just stored plus a 0x80 bias, so the
   loop's `sub $0x80' borrows once fewer than 0x80 bytes would remain
   (assuming %rdx is the byte count, see above).  */
2818 lea -0x90(%rdx), %rdx
2819#ifdef USE_AS_MEMMOVE
/* %r9 = %rsi - %rdi.  If that distance is at least %rdx (unsigned),
   continue with the non-temporal loop.  Otherwise %rcx is multiplied by 4
   and, when %rdx is below the result, the cached-store loop is used.
   %rcx is set outside this view -- presumably a cache-size threshold;
   TODO confirm.
   NOTE(review): if the branch to L(ll_cache_copy_fwd_start) is taken, the
   movntdq above has already been issued but that path ends without an
   sfence (L(large_page_bwd) uses movdqa for the equivalent store) --
   confirm this is acceptable.  */
2820 mov %rsi, %r9
2821 sub %rdi, %r9
2822 cmp %rdx, %r9
2823 jae L(memmove_is_memcpy_fwd)
2824 shl $2, %rcx
2825 cmp %rcx, %rdx
2826 jb L(ll_cache_copy_fwd_start)
2827L(memmove_is_memcpy_fwd):
2828#endif
/* Main loop: 0x80 bytes per iteration, eight unaligned loads followed by
   eight non-temporal stores.  */
2829L(large_page_loop):
2830 movdqu (%rsi), %xmm0
2831 movdqu 0x10(%rsi), %xmm1
2832 movdqu 0x20(%rsi), %xmm2
2833 movdqu 0x30(%rsi), %xmm3
2834 movdqu 0x40(%rsi), %xmm4
2835 movdqu 0x50(%rsi), %xmm5
2836 movdqu 0x60(%rsi), %xmm6
2837 movdqu 0x70(%rsi), %xmm7
2838 lea 0x80(%rsi), %rsi
2839
/* The carry flag set by this sub is consumed by the jae at the bottom of
   the loop; the movntdq and lea instructions in between leave the flags
   untouched.  */
2840 sub $0x80, %rdx
2841 movntdq %xmm0, (%rdi)
2842 movntdq %xmm1, 0x10(%rdi)
2843 movntdq %xmm2, 0x20(%rdi)
2844 movntdq %xmm3, 0x30(%rdi)
2845 movntdq %xmm4, 0x40(%rdi)
2846 movntdq %xmm5, 0x50(%rdi)
2847 movntdq %xmm6, 0x60(%rdi)
2848 movntdq %xmm7, 0x70(%rdi)
2849 lea 0x80(%rdi), %rdi
2850 jae L(large_page_loop)
/* cmp tests the still-biased count; lea then adds 0x80 back to %rdx
   without changing the flags, and jl skips the 0x40-byte step when the
   biased count was below -0x40 (signed).  */
2851 cmp $-0x40, %rdx
2852 lea 0x80(%rdx), %rdx
2853 jl L(large_page_less_64bytes)
2854
/* One more 0x40-byte block with non-temporal stores.  */
2855 movdqu (%rsi), %xmm0
2856 movdqu 0x10(%rsi), %xmm1
2857 movdqu 0x20(%rsi), %xmm2
2858 movdqu 0x30(%rsi), %xmm3
2859 lea 0x40(%rsi), %rsi
2860
2861 movntdq %xmm0, (%rdi)
2862 movntdq %xmm1, 0x10(%rdi)
2863 movntdq %xmm2, 0x20(%rdi)
2864 movntdq %xmm3, 0x30(%rdi)
2865 lea 0x40(%rdi), %rdi
2866 sub $0x40, %rdx
/* Advance both pointers by %rdx so that the L(write_Nbytes) handlers,
   which address their data with negative offsets, see the remaining
   bytes just below %rsi/%rdi.  sfence orders the non-temporal stores
   ahead of later stores.  The macro then jumps through
   L(table_less_80bytes) indexed by %rdx, clobbering %r11 and %rdx.  */
2867L(large_page_less_64bytes):
2868 add %rdx, %rsi
2869 add %rdx, %rdi
2870 sfence
2871 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2872
2873#ifdef USE_AS_MEMMOVE
e7044ea7 2874 .p2align 4
6fb8cbcb
L
/* Forward copy variant built only under USE_AS_MEMMOVE and entered from
   L(large_page_fwd).  Same loop shape as L(large_page_loop), but each
   iteration issues prefetcht0 hints 0x1c0 and 0x200 bytes ahead of %rsi
   and writes with ordinary aligned stores (movaps) instead of
   non-temporal ones.  movaps requires a 16-byte aligned destination.
   No sfence is issued on this path.  */
2875L(ll_cache_copy_fwd_start):
2876 prefetcht0 0x1c0(%rsi)
2877 prefetcht0 0x200(%rsi)
2878 movdqu (%rsi), %xmm0
2879 movdqu 0x10(%rsi), %xmm1
2880 movdqu 0x20(%rsi), %xmm2
2881 movdqu 0x30(%rsi), %xmm3
2882 movdqu 0x40(%rsi), %xmm4
2883 movdqu 0x50(%rsi), %xmm5
2884 movdqu 0x60(%rsi), %xmm6
2885 movdqu 0x70(%rsi), %xmm7
2886 lea 0x80(%rsi), %rsi
2887
/* The carry flag from this sub drives the jae below; movaps and lea do
   not modify the flags.  */
2888 sub $0x80, %rdx
2889 movaps %xmm0, (%rdi)
2890 movaps %xmm1, 0x10(%rdi)
2891 movaps %xmm2, 0x20(%rdi)
2892 movaps %xmm3, 0x30(%rdi)
2893 movaps %xmm4, 0x40(%rdi)
2894 movaps %xmm5, 0x50(%rdi)
2895 movaps %xmm6, 0x60(%rdi)
2896 movaps %xmm7, 0x70(%rdi)
2897 lea 0x80(%rdi), %rdi
2898 jae L(ll_cache_copy_fwd_start)
/* Test the biased count, add 0x80 back to %rdx with a flag-preserving
   lea, and skip the 0x40-byte step when the biased count was below
   -0x40 (signed).  */
2899 cmp $-0x40, %rdx
2900 lea 0x80(%rdx), %rdx
2901 jl L(large_page_ll_less_fwd_64bytes)
2902
/* One more 0x40-byte block.  */
2903 movdqu (%rsi), %xmm0
2904 movdqu 0x10(%rsi), %xmm1
2905 movdqu 0x20(%rsi), %xmm2
2906 movdqu 0x30(%rsi), %xmm3
2907 lea 0x40(%rsi), %rsi
2908
2909 movaps %xmm0, (%rdi)
2910 movaps %xmm1, 0x10(%rdi)
2911 movaps %xmm2, 0x20(%rdi)
2912 movaps %xmm3, 0x30(%rdi)
2913 lea 0x40(%rdi), %rdi
2914 sub $0x40, %rdx
/* Advance both pointers by %rdx so the remaining bytes lie just below
   them, then dispatch through L(table_less_80bytes) indexed by %rdx
   (the macro clobbers %r11 and %rdx).  */
2915L(large_page_ll_less_fwd_64bytes):
2916 add %rdx, %rsi
2917 add %rdx, %rdi
2918 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2919
2920#endif
e7044ea7 2921 .p2align 4
6fb8cbcb
L
/* Backward copy using non-temporal stores, walking %rsi/%rdi downward.
   %xmm0, %r8, %rdx and (for memmove) %rcx are set by code outside this
   view; presumably %xmm0/%r8 hold 16 saved bytes and the address they
   belong at, and %rdx is the remaining byte count -- TODO confirm
   against the caller.  */
2922L(large_page_bwd):
/* Copy the 16 bytes just below the pointers and flush the pending %xmm0
   to (%r8).  This first store is an ordinary aligned movdqa (it requires
   a 16-byte aligned address), not a non-temporal store.  */
2923 movdqu -0x10(%rsi), %xmm1
2924 lea -16(%rsi), %rsi
2925 movdqu %xmm0, (%r8)
2926 movdqa %xmm1, -0x10(%rdi)
2927 lea -16(%rdi), %rdi
/* %rdx -= 0x90: 0x10 for the block just stored plus a 0x80 bias for the
   loop's `sub $0x80' / `jae' test (assuming %rdx is the byte count).  */
2928 lea -0x90(%rdx), %rdx
2929#ifdef USE_AS_MEMMOVE
/* %r9 = %rdi - %rsi.  If that distance is at least %rdx (unsigned),
   continue with the non-temporal loop; otherwise, when the distance is
   below %rcx, use the cached-store loop.  %rcx is set outside this view
   -- presumably a cache-size threshold; TODO confirm.
   NOTE(review): L(large_page_fwd) compares the length against %rcx * 4,
   whereas this path compares the distance against %rcx -- confirm the
   asymmetry is intentional.  */
2930 mov %rdi, %r9
2931 sub %rsi, %r9
2932 cmp %rdx, %r9
2933 jae L(memmove_is_memcpy_bwd)
2934 cmp %rcx, %r9
2935 jb L(ll_cache_copy_bwd_start)
2936L(memmove_is_memcpy_bwd):
2937#endif
/* Main loop: 0x80 bytes per iteration taken from just below %rsi, eight
   unaligned loads followed by eight non-temporal stores.  */
2938L(large_page_bwd_loop):
2939 movdqu -0x10(%rsi), %xmm0
2940 movdqu -0x20(%rsi), %xmm1
2941 movdqu -0x30(%rsi), %xmm2
2942 movdqu -0x40(%rsi), %xmm3
2943 movdqu -0x50(%rsi), %xmm4
2944 movdqu -0x60(%rsi), %xmm5
2945 movdqu -0x70(%rsi), %xmm6
2946 movdqu -0x80(%rsi), %xmm7
2947 lea -0x80(%rsi), %rsi
2948
/* The carry flag from this sub drives the jae below; movntdq and lea do
   not modify the flags.  */
2949 sub $0x80, %rdx
2950 movntdq %xmm0, -0x10(%rdi)
2951 movntdq %xmm1, -0x20(%rdi)
2952 movntdq %xmm2, -0x30(%rdi)
2953 movntdq %xmm3, -0x40(%rdi)
2954 movntdq %xmm4, -0x50(%rdi)
2955 movntdq %xmm5, -0x60(%rdi)
2956 movntdq %xmm6, -0x70(%rdi)
2957 movntdq %xmm7, -0x80(%rdi)
2958 lea -0x80(%rdi), %rdi
2959 jae L(large_page_bwd_loop)
/* Test the biased count, add 0x80 back to %rdx with a flag-preserving
   lea, and skip the 0x40-byte step when the biased count was below
   -0x40 (signed).  */
2960 cmp $-0x40, %rdx
2961 lea 0x80(%rdx), %rdx
2962 jl L(large_page_less_bwd_64bytes)
2963
/* One more 0x40-byte block with non-temporal stores.  */
2964 movdqu -0x10(%rsi), %xmm0
2965 movdqu -0x20(%rsi), %xmm1
2966 movdqu -0x30(%rsi), %xmm2
2967 movdqu -0x40(%rsi), %xmm3
2968 lea -0x40(%rsi), %rsi
2969
2970 movntdq %xmm0, -0x10(%rdi)
2971 movntdq %xmm1, -0x20(%rdi)
2972 movntdq %xmm2, -0x30(%rdi)
2973 movntdq %xmm3, -0x40(%rdi)
2974 lea -0x40(%rdi), %rdi
2975 sub $0x40, %rdx
/* Unlike the forward path, the pointers are not adjusted here: they were
   decremented by the loop and the L(write_Nbytes) handlers address their
   data with negative offsets from %rsi/%rdi.  sfence orders the
   non-temporal stores ahead of later stores; the macro then jumps
   through L(table_less_80bytes) indexed by %rdx, clobbering %r11 and
   %rdx.  */
2976L(large_page_less_bwd_64bytes):
2977 sfence
2978 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2979
2980#ifdef USE_AS_MEMMOVE
e7044ea7 2981 .p2align 4
6fb8cbcb
L
/* Backward copy variant built only under USE_AS_MEMMOVE and entered from
   L(large_page_bwd).  Same loop shape as L(large_page_bwd_loop), but each
   iteration issues prefetcht0 hints 0x1c0 and 0x200 bytes below %rsi and
   writes with ordinary aligned stores (movaps) instead of non-temporal
   ones.  movaps requires a 16-byte aligned destination.  No sfence is
   issued on this path.  */
2982L(ll_cache_copy_bwd_start):
2983 prefetcht0 -0x1c0(%rsi)
2984 prefetcht0 -0x200(%rsi)
2985 movdqu -0x10(%rsi), %xmm0
2986 movdqu -0x20(%rsi), %xmm1
2987 movdqu -0x30(%rsi), %xmm2
2988 movdqu -0x40(%rsi), %xmm3
2989 movdqu -0x50(%rsi), %xmm4
2990 movdqu -0x60(%rsi), %xmm5
2991 movdqu -0x70(%rsi), %xmm6
2992 movdqu -0x80(%rsi), %xmm7
2993 lea -0x80(%rsi), %rsi
2994
/* The carry flag from this sub drives the jae below; movaps and lea do
   not modify the flags.  */
2995 sub $0x80, %rdx
2996 movaps %xmm0, -0x10(%rdi)
2997 movaps %xmm1, -0x20(%rdi)
2998 movaps %xmm2, -0x30(%rdi)
2999 movaps %xmm3, -0x40(%rdi)
3000 movaps %xmm4, -0x50(%rdi)
3001 movaps %xmm5, -0x60(%rdi)
3002 movaps %xmm6, -0x70(%rdi)
3003 movaps %xmm7, -0x80(%rdi)
3004 lea -0x80(%rdi), %rdi
3005 jae L(ll_cache_copy_bwd_start)
/* Test the biased count, add 0x80 back to %rdx with a flag-preserving
   lea, and skip the 0x40-byte step when the biased count was below
   -0x40 (signed).  */
3006 cmp $-0x40, %rdx
3007 lea 0x80(%rdx), %rdx
3008 jl L(large_page_ll_less_bwd_64bytes)
3009
/* One more 0x40-byte block.  */
3010 movdqu -0x10(%rsi), %xmm0
3011 movdqu -0x20(%rsi), %xmm1
3012 movdqu -0x30(%rsi), %xmm2
3013 movdqu -0x40(%rsi), %xmm3
3014 lea -0x40(%rsi), %rsi
3015
3016 movaps %xmm0, -0x10(%rdi)
3017 movaps %xmm1, -0x20(%rdi)
3018 movaps %xmm2, -0x30(%rdi)
3019 movaps %xmm3, -0x40(%rdi)
3020 lea -0x40(%rdi), %rdi
3021 sub $0x40, %rdx
/* The pointers need no adjustment on the backward path; dispatch through
   L(table_less_80bytes) indexed by %rdx (the macro clobbers %r11 and
   %rdx).  */
3022L(large_page_ll_less_bwd_64bytes):
3023 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
3024#endif
3025
3026END (MEMCPY)
3027
3028 .section .rodata.ssse3,"a",@progbits
e7044ea7 3029 .p2align 3
6fb8cbcb
L
/* Jump table for the tail handlers.  Entry N (N = 0..79) is the 32-bit
   offset of L(write_Nbytes) from the start of this table, as produced by
   JMPTBL (I, B) = I - B.  BRANCH_TO_JMPTBL_ENTRY sign-extends the selected
   entry, adds the table address and jumps to the result.  Entries must
   stay in ascending order because the remaining byte count is used
   directly as the index.  */
3030L(table_less_80bytes):
3031 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3108 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3109 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3110 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
3111
e7044ea7 3112 .p2align 3
6fb8cbcb
L
/* Jump table with 16 entries: entry K (K = 0..15) is the 32-bit offset of
   L(shl_K) from the start of this table (JMPTBL (I, B) = I - B).  The
   code that indexes it lies outside this view -- presumably K is a
   16-byte misalignment between source and destination for the forward
   copy; TODO confirm.  */
3113L(shl_table):
3114 .int JMPTBL (L(shl_0), L(shl_table))
3115 .int JMPTBL (L(shl_1), L(shl_table))
3116 .int JMPTBL (L(shl_2), L(shl_table))
3117 .int JMPTBL (L(shl_3), L(shl_table))
3118 .int JMPTBL (L(shl_4), L(shl_table))
3119 .int JMPTBL (L(shl_5), L(shl_table))
3120 .int JMPTBL (L(shl_6), L(shl_table))
3121 .int JMPTBL (L(shl_7), L(shl_table))
3122 .int JMPTBL (L(shl_8), L(shl_table))
3123 .int JMPTBL (L(shl_9), L(shl_table))
3124 .int JMPTBL (L(shl_10), L(shl_table))
3125 .int JMPTBL (L(shl_11), L(shl_table))
3126 .int JMPTBL (L(shl_12), L(shl_table))
3127 .int JMPTBL (L(shl_13), L(shl_table))
3128 .int JMPTBL (L(shl_14), L(shl_table))
3129 .int JMPTBL (L(shl_15), L(shl_table))
3130
e7044ea7 3131 .p2align 3
6fb8cbcb
L
/* Jump table with 16 entries: entry K (K = 0..15) is the 32-bit offset of
   L(shl_K_bwd) from the start of this table (JMPTBL (I, B) = I - B).
   The code that indexes it lies outside this view -- presumably the
   backward-copy counterpart of L(shl_table); TODO confirm.  */
3132L(shl_table_bwd):
3133 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3134 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3135 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3146 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3147 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3148 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3149
3150#endif