]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/memcpy-ssse3.S
7d0755e258b4873b67d3f190da3460ec088cc7a3
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / memcpy-ssse3.S
1 /* memcpy with SSSE3
2 Copyright (C) 2010-2014 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 #if IS_IN (libc) \
23 && (defined SHARED \
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
26
27 #include "asm-syntax.h"
28
29 #ifndef MEMCPY
30 # define MEMCPY __memcpy_ssse3
31 # define MEMCPY_CHK __memcpy_chk_ssse3
32 #endif
33
/* Expands to the difference I - B, i.e. the offset of I relative to B.
   BRANCH_TO_JMPTBL_ENTRY below adds a table's address back onto such a
   relative offset (presumably I is a code label and B the table
   base -- confirm at the table definitions).  */
34 #define JMPTBL(I, B) I - B
35
36 /* Branch to an entry in a jump table. TABLE is a jump table with
37 relative offsets. INDEX is a register that contains the index into the
38 jump table. SCALE is the scale of INDEX.
   The macro loads the address of TABLE into %r11, replaces INDEX with
   the sign-extended 32-bit entry found at TABLE + INDEX * SCALE, adds
   the table address to it and jumps there.  It clobbers %r11, and
   INDEX holds the branch target address once the jump is taken.  The
   trailing ud2 follows an unconditional jump, so it is not executed on
   the normal path.  */
39 #define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
40 lea TABLE(%rip), %r11; \
41 movslq (%r11, INDEX, SCALE), INDEX; \
42 lea (%r11, INDEX), INDEX; \
43 jmp *INDEX; \
44 ud2
45
46 .section .text.ssse3,"ax",@progbits
/* Checked entry point.  Compares %rcx with the byte count in %rdx and
   branches to __chk_fail when %rcx is below %rdx (unsigned).  There is
   no return or jump after the check, so the passing case continues
   into whatever follows END (MEMCPY_CHK), i.e. ENTRY (MEMCPY).
   NOTE(review): %rcx is presumably the destination object size passed
   as the fourth argument -- confirm against the callers.  */
47 #if !defined USE_AS_BCOPY
48 ENTRY (MEMCPY_CHK)
49 cmpq %rdx, %rcx
50 jb HIDDEN_JUMPTARGET (__chk_fail)
51 END (MEMCPY_CHK)
52 #endif
53
/* Main entry.  As used below: %rdi = destination, %rsi = source,
   %rdx = byte count.  %rax is set to the destination (plus the count
   when USE_AS_MEMPCPY is defined).  Counts of at most 79 bytes are
   dispatched through L(table_less_80bytes); larger counts continue at
   L(80bytesormore) or, for the memmove variant with the destination
   above the source, at L(copy_backward).  */
54 ENTRY (MEMCPY)
55 mov %rdi, %rax /* Return value: the original destination.  */
56 #ifdef USE_AS_MEMPCPY
57 add %rdx, %rax /* mempcpy variant: destination + count.  */
58 #endif
59
60 #ifdef USE_AS_MEMMOVE
61 cmp %rsi, %rdi
62 jb L(copy_forward) /* dst below src (unsigned): copy forward.  */
63 je L(write_0bytes) /* dst == src.  */
64 cmp $79, %rdx
65 jbe L(copy_forward) /* At most 79 bytes: small-size table.  */
66 jmp L(copy_backward) /* dst above src, 80 bytes or more.  */
67 L(copy_forward):
68 #endif
69 cmp $79, %rdx
70 lea L(table_less_80bytes)(%rip), %r11 /* lea keeps the cmp flags.  */
71 ja L(80bytesormore)
72 movslq (%r11, %rdx, 4), %r9 /* Signed 32-bit entry for this count.  */
73 add %rdx, %rsi /* The table code is entered with both */
74 add %rdx, %rdi /* pointers advanced by the count.  */
75 add %r11, %r9 /* The entry is relative to the table address.  */
76 jmp *%r9
77 ud2
78
/* Forward copy of 80 bytes or more.  Saves the first 16 source bytes in
   %xmm0 and the original destination in %r8, rounds %rdi up to the next
   16-byte boundary and adjusts %rsi and %rdx by the same distance.  The
   saved head is not stored here; L(shl_0), for example, stores %xmm0 to
   (%r8) later.  A strategy is then selected: L(large_page_fwd) when the
   remaining count exceeds half the shared cache size, L(shl_0) when the
   adjusted source is 16-byte aligned too, otherwise the L(shl_table)
   entry for the source misalignment (1..15), with half the data cache
   size left in %rcx.  */
79 .p2align 4
80 L(80bytesormore):
81 #ifndef USE_AS_MEMMOVE
82 cmp %dil, %sil /* Compares only the low bytes of src and dst.  */
83 jle L(copy_backward) /* Signed: low byte of src <= low byte of dst.  */
84 #endif
85
86 movdqu (%rsi), %xmm0 /* First 16 source bytes, unaligned load.  */
87 mov %rdi, %rcx
88 and $-16, %rdi
89 add $16, %rdi /* %rdi = next 16-byte boundary above dst.  */
90 mov %rcx, %r8 /* %r8 = original dst.  */
91 sub %rdi, %rcx /* %rcx = original dst - aligned dst (-16..-1).  */
92 add %rcx, %rdx /* Count shrinks by the skipped head bytes.  */
93 sub %rcx, %rsi /* Source advances by the same distance.  */
94
95 #ifdef SHARED_CACHE_SIZE_HALF
96 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
97 #else
98 mov __x86_shared_cache_size_half(%rip), %RCX_LP
99 #endif
100 cmp %rcx, %rdx
101 mov %rsi, %r9 /* mov keeps the cmp flags for the ja below.  */
102 ja L(large_page_fwd)
103 and $0xf, %r9 /* %r9 = source misalignment within 16 bytes.  */
104 jz L(shl_0)
105 #ifdef DATA_CACHE_SIZE_HALF
106 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
107 #else
108 mov __x86_data_cache_size_half(%rip), %RCX_LP
109 #endif
110 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
111
/* Backward copy.  Saves the last 16 source bytes in %xmm0 and the
   address of the last 16 destination bytes in %r8, moves %rsi and %rdi
   to the ends of their buffers, then rounds %rdi down to a 16-byte
   boundary and reduces %rsi and %rdx by the same distance.  A strategy
   is then selected: L(large_page_bwd) when the remaining count exceeds
   half the shared cache size, L(shl_0_bwd) when the adjusted %rsi is
   16-byte aligned, otherwise the L(shl_table_bwd) entry for the source
   misalignment (1..15), with half the data cache size left in %rcx.  */
112 .p2align 4
113 L(copy_backward):
114 movdqu -16(%rsi, %rdx), %xmm0 /* Last 16 source bytes, unaligned.  */
115 add %rdx, %rsi /* %rsi = end of source.  */
116 lea -16(%rdi, %rdx), %r8 /* %r8 = address of the last 16 dst bytes.  */
117 add %rdx, %rdi /* %rdi = end of destination.  */
118
119 mov %rdi, %rcx
120 and $0xf, %rcx /* %rcx = misalignment of the dst end.  */
121 xor %rcx, %rdi /* Clear those low bits: round %rdi down.  */
122 sub %rcx, %rdx
123 sub %rcx, %rsi
124
125 #ifdef SHARED_CACHE_SIZE_HALF
126 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
127 #else
128 mov __x86_shared_cache_size_half(%rip), %RCX_LP
129 #endif
130
131 cmp %rcx, %rdx
132 mov %rsi, %r9 /* mov keeps the cmp flags for the ja below.  */
133 ja L(large_page_bwd)
134 and $0xf, %r9 /* %r9 = source misalignment within 16 bytes.  */
135 jz L(shl_0_bwd)
136 #ifdef DATA_CACHE_SIZE_HALF
137 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
138 #else
139 mov __x86_data_cache_size_half(%rip), %RCX_LP
140 #endif
141 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
142
/* Forward copy with %rsi and %rdi both 16-byte aligned (aligned loads
   and stores are used).  Copies one 16-byte block, stores the 16 bytes
   saved in %xmm0 to (%r8), then continues at L(shl_0_gobble) when more
   than 128 bytes remain.  Otherwise one 64-byte block is copied if at
   least 64 bytes remain, and the rest (at most 64 bytes) is dispatched
   through L(table_less_80bytes) with both pointers advanced past it.  */
143 .p2align 4
144 L(shl_0):
145 sub $16, %rdx
146 movdqa (%rsi), %xmm1
147 add $16, %rsi
148 movdqa %xmm1, (%rdi)
149 add $16, %rdi
150 cmp $128, %rdx
151 movdqu %xmm0, (%r8) /* Unaligned store; keeps the cmp flags.  */
152 ja L(shl_0_gobble)
153 cmp $64, %rdx
154 jb L(shl_0_less_64bytes)
155 movaps (%rsi), %xmm4
156 movaps 16(%rsi), %xmm1
157 movaps 32(%rsi), %xmm2
158 movaps 48(%rsi), %xmm3
159 movaps %xmm4, (%rdi)
160 movaps %xmm1, 16(%rdi)
161 movaps %xmm2, 32(%rdi)
162 movaps %xmm3, 48(%rdi)
163 sub $64, %rdx
164 add $64, %rsi
165 add $64, %rdi
166 L(shl_0_less_64bytes):
167 add %rdx, %rsi /* Point both registers past the bytes */
168 add %rdx, %rdi /* that are still to be copied.  */
169 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
170
/* Aligned forward copy, entered from L(shl_0) with more than 128 bytes
   left.  Switches to L(shl_0_gobble_mem_loop) when the count is at
   least half the data cache size; otherwise copies 128 bytes per
   iteration in L(shl_0_gobble_cache_loop).  %rdx is kept biased by
   -128 while looping.  After the loop one more 64-byte block is copied
   if at least 64 bytes remain, and the rest (fewer than 64 bytes) is
   dispatched through L(table_less_80bytes).  */
171 .p2align 4
172 L(shl_0_gobble):
173 #ifdef DATA_CACHE_SIZE_HALF
174 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
175 #else
176 cmp __x86_data_cache_size_half(%rip), %RDX_LP
177 #endif
178 lea -128(%rdx), %rdx /* Bias the count; lea keeps the cmp flags.  */
179 jae L(shl_0_gobble_mem_loop)
180 L(shl_0_gobble_cache_loop):
181 movdqa (%rsi), %xmm4
182 movaps 0x10(%rsi), %xmm1
183 movaps 0x20(%rsi), %xmm2
184 movaps 0x30(%rsi), %xmm3
185
186 movdqa %xmm4, (%rdi)
187 movaps %xmm1, 0x10(%rdi)
188 movaps %xmm2, 0x20(%rdi)
189 movaps %xmm3, 0x30(%rdi)
190
191 sub $128, %rdx /* Sets the flags tested by the jae below.  */
192 movaps 0x40(%rsi), %xmm4
193 movaps 0x50(%rsi), %xmm5
194 movaps 0x60(%rsi), %xmm6
195 movaps 0x70(%rsi), %xmm7
196 lea 0x80(%rsi), %rsi
197 movaps %xmm4, 0x40(%rdi)
198 movaps %xmm5, 0x50(%rdi)
199 movaps %xmm6, 0x60(%rdi)
200 movaps %xmm7, 0x70(%rdi)
201 lea 0x80(%rdi), %rdi
202
203 jae L(shl_0_gobble_cache_loop) /* No borrow: 128 or more left.  */
204 cmp $-0x40, %rdx /* Biased count below -64: fewer than 64 left.  */
205 lea 0x80(%rdx), %rdx /* Undo the bias; lea keeps the cmp flags.  */
206 jl L(shl_0_cache_less_64bytes)
207
208 movdqa (%rsi), %xmm4
209 sub $0x40, %rdx
210 movdqa 0x10(%rsi), %xmm1
211
212 movdqa %xmm4, (%rdi)
213 movdqa %xmm1, 0x10(%rdi)
214
215 movdqa 0x20(%rsi), %xmm4
216 movdqa 0x30(%rsi), %xmm1
217 add $0x40, %rsi
218
219 movdqa %xmm4, 0x20(%rdi)
220 movdqa %xmm1, 0x30(%rdi)
221 add $0x40, %rdi
222 L(shl_0_cache_less_64bytes):
223 add %rdx, %rsi /* Point both registers past the bytes */
224 add %rdx, %rdi /* that are still to be copied.  */
225 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
226
/* Aligned forward copy, entered from L(shl_0_gobble) when the count is
   at least half the data cache size; %rdx arrives already biased by
   -128.  Each iteration issues prefetcht0 for 0x1c0 and 0x280 bytes
   ahead of %rsi and copies 128 bytes.  %xmm0 is used as a plain
   scratch register here; L(shl_0) has already stored the 16 bytes it
   held.  After the loop, 64 and then 32 more bytes are copied when that
   many remain, and the rest (fewer than 32 bytes) is dispatched through
   L(table_less_80bytes).  */
227 .p2align 4
228 L(shl_0_gobble_mem_loop):
229 prefetcht0 0x1c0(%rsi)
230 prefetcht0 0x280(%rsi)
231
232 movdqa (%rsi), %xmm0
233 movdqa 0x10(%rsi), %xmm1
234 movdqa 0x20(%rsi), %xmm2
235 movdqa 0x30(%rsi), %xmm3
236 movdqa 0x40(%rsi), %xmm4
237 movdqa 0x50(%rsi), %xmm5
238 movdqa 0x60(%rsi), %xmm6
239 movdqa 0x70(%rsi), %xmm7
240 lea 0x80(%rsi), %rsi
241 sub $0x80, %rdx /* Sets the flags tested by the jae below.  */
242 movdqa %xmm0, (%rdi)
243 movdqa %xmm1, 0x10(%rdi)
244 movdqa %xmm2, 0x20(%rdi)
245 movdqa %xmm3, 0x30(%rdi)
246 movdqa %xmm4, 0x40(%rdi)
247 movdqa %xmm5, 0x50(%rdi)
248 movdqa %xmm6, 0x60(%rdi)
249 movdqa %xmm7, 0x70(%rdi)
250 lea 0x80(%rdi), %rdi
251
252 jae L(shl_0_gobble_mem_loop) /* No borrow: 128 or more left.  */
253 cmp $-0x40, %rdx /* Biased count below -64: fewer than 64 left.  */
254 lea 0x80(%rdx), %rdx /* Undo the bias; lea keeps the cmp flags.  */
255 jl L(shl_0_mem_less_64bytes)
256
257 movdqa (%rsi), %xmm0
258 sub $0x40, %rdx
259 movdqa 0x10(%rsi), %xmm1
260
261 movdqa %xmm0, (%rdi)
262 movdqa %xmm1, 0x10(%rdi)
263
264 movdqa 0x20(%rsi), %xmm0
265 movdqa 0x30(%rsi), %xmm1
266 add $0x40, %rsi
267
268 movdqa %xmm0, 0x20(%rdi)
269 movdqa %xmm1, 0x30(%rdi)
270 add $0x40, %rdi
271 L(shl_0_mem_less_64bytes):
272 cmp $0x20, %rdx
273 jb L(shl_0_mem_less_32bytes)
274 movdqa (%rsi), %xmm0
275 sub $0x20, %rdx
276 movdqa 0x10(%rsi), %xmm1
277 add $0x20, %rsi
278 movdqa %xmm0, (%rdi)
279 movdqa %xmm1, 0x10(%rdi)
280 add $0x20, %rdi
281 L(shl_0_mem_less_32bytes):
282 add %rdx, %rdi /* Point both registers past the bytes */
283 add %rdx, %rsi /* that are still to be copied.  */
284 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
285
/* Backward copy with %rsi and %rdi both 16-byte aligned (aligned loads
   and stores are used); both point just past the bytes still to be
   copied and move downwards.  Copies one 16-byte block, stores the 16
   bytes saved in %xmm0 to (%r8), then continues at L(shl_0_gobble_bwd)
   when more than 128 bytes remain.  Otherwise one 64-byte block is
   copied if at least 64 bytes remain, and the rest (at most 64 bytes)
   is dispatched through L(table_less_80bytes) without any further
   pointer adjustment.  */
286 .p2align 4
287 L(shl_0_bwd):
288 sub $16, %rdx
289 movdqa -0x10(%rsi), %xmm1
290 sub $16, %rsi
291 movdqa %xmm1, -0x10(%rdi)
292 sub $16, %rdi
293 cmp $0x80, %rdx
294 movdqu %xmm0, (%r8) /* Unaligned store; keeps the cmp flags.  */
295 ja L(shl_0_gobble_bwd)
296 cmp $64, %rdx
297 jb L(shl_0_less_64bytes_bwd)
298 movaps -0x10(%rsi), %xmm0 /* %xmm0 is free for reuse from here on.  */
299 movaps -0x20(%rsi), %xmm1
300 movaps -0x30(%rsi), %xmm2
301 movaps -0x40(%rsi), %xmm3
302 movaps %xmm0, -0x10(%rdi)
303 movaps %xmm1, -0x20(%rdi)
304 movaps %xmm2, -0x30(%rdi)
305 movaps %xmm3, -0x40(%rdi)
306 sub $64, %rdx
307 sub $0x40, %rsi
308 sub $0x40, %rdi
309 L(shl_0_less_64bytes_bwd):
310 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
311
/* Aligned backward copy, entered from L(shl_0_bwd) with more than 128
   bytes left.  Switches to L(shl_0_gobble_mem_bwd_loop) when the count
   is at least half the data cache size; otherwise copies 128 bytes per
   iteration in L(shl_0_gobble_bwd_loop), working downwards from %rsi
   and %rdi.  %rdx is kept biased by -128 while looping.  %xmm0 is used
   as a plain scratch register here; L(shl_0_bwd) has already stored the
   16 bytes it held.  After the loop one more 64-byte block is copied if
   at least 64 bytes remain, and the rest (fewer than 64 bytes) is
   dispatched through L(table_less_80bytes).  */
312 .p2align 4
313 L(shl_0_gobble_bwd):
314 #ifdef DATA_CACHE_SIZE_HALF
315 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
316 #else
317 cmp __x86_data_cache_size_half(%rip), %RDX_LP
318 #endif
319 lea -128(%rdx), %rdx /* Bias the count; lea keeps the cmp flags.  */
320 jae L(shl_0_gobble_mem_bwd_loop)
321 L(shl_0_gobble_bwd_loop):
322 movdqa -0x10(%rsi), %xmm0
323 movaps -0x20(%rsi), %xmm1
324 movaps -0x30(%rsi), %xmm2
325 movaps -0x40(%rsi), %xmm3
326
327 movdqa %xmm0, -0x10(%rdi)
328 movaps %xmm1, -0x20(%rdi)
329 movaps %xmm2, -0x30(%rdi)
330 movaps %xmm3, -0x40(%rdi)
331
332 sub $0x80, %rdx /* Sets the flags tested by the jae below.  */
333 movaps -0x50(%rsi), %xmm4
334 movaps -0x60(%rsi), %xmm5
335 movaps -0x70(%rsi), %xmm6
336 movaps -0x80(%rsi), %xmm7
337 lea -0x80(%rsi), %rsi
338 movaps %xmm4, -0x50(%rdi)
339 movaps %xmm5, -0x60(%rdi)
340 movaps %xmm6, -0x70(%rdi)
341 movaps %xmm7, -0x80(%rdi)
342 lea -0x80(%rdi), %rdi
343
344 jae L(shl_0_gobble_bwd_loop) /* No borrow: 128 or more left.  */
345 cmp $-0x40, %rdx /* Biased count below -64: fewer than 64 left.  */
346 lea 0x80(%rdx), %rdx /* Undo the bias; lea keeps the cmp flags.  */
347 jl L(shl_0_gobble_bwd_less_64bytes)
348
349 movdqa -0x10(%rsi), %xmm0
350 sub $0x40, %rdx
351 movdqa -0x20(%rsi), %xmm1
352
353 movdqa %xmm0, -0x10(%rdi)
354 movdqa %xmm1, -0x20(%rdi)
355
356 movdqa -0x30(%rsi), %xmm0
357 movdqa -0x40(%rsi), %xmm1
358 sub $0x40, %rsi
359
360 movdqa %xmm0, -0x30(%rdi)
361 movdqa %xmm1, -0x40(%rdi)
362 sub $0x40, %rdi
363 L(shl_0_gobble_bwd_less_64bytes):
364 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
365
/* Aligned backward copy, entered from L(shl_0_gobble_bwd) when the
   count is at least half the data cache size; %rdx arrives already
   biased by -128.  Each iteration issues prefetcht0 for 0x1c0 and 0x280
   bytes below %rsi and copies 128 bytes downwards.  After the loop, 64
   and then 32 more bytes are copied when that many remain, and the rest
   (fewer than 32 bytes) is dispatched through L(table_less_80bytes).  */
366 .p2align 4
367 L(shl_0_gobble_mem_bwd_loop):
368 prefetcht0 -0x1c0(%rsi)
369 prefetcht0 -0x280(%rsi)
370 movdqa -0x10(%rsi), %xmm0
371 movdqa -0x20(%rsi), %xmm1
372 movdqa -0x30(%rsi), %xmm2
373 movdqa -0x40(%rsi), %xmm3
374 movdqa -0x50(%rsi), %xmm4
375 movdqa -0x60(%rsi), %xmm5
376 movdqa -0x70(%rsi), %xmm6
377 movdqa -0x80(%rsi), %xmm7
378 lea -0x80(%rsi), %rsi
379 sub $0x80, %rdx /* Sets the flags tested by the jae below.  */
380 movdqa %xmm0, -0x10(%rdi)
381 movdqa %xmm1, -0x20(%rdi)
382 movdqa %xmm2, -0x30(%rdi)
383 movdqa %xmm3, -0x40(%rdi)
384 movdqa %xmm4, -0x50(%rdi)
385 movdqa %xmm5, -0x60(%rdi)
386 movdqa %xmm6, -0x70(%rdi)
387 movdqa %xmm7, -0x80(%rdi)
388 lea -0x80(%rdi), %rdi
389
390 jae L(shl_0_gobble_mem_bwd_loop) /* No borrow: 128 or more left.  */
391 cmp $-0x40, %rdx /* Biased count below -64: fewer than 64 left.  */
392 lea 0x80(%rdx), %rdx /* Undo the bias; lea keeps the cmp flags.  */
393 jl L(shl_0_mem_bwd_less_64bytes)
394
395 movdqa -0x10(%rsi), %xmm0
396 sub $0x40, %rdx
397 movdqa -0x20(%rsi), %xmm1
398
399 movdqa %xmm0, -0x10(%rdi)
400 movdqa %xmm1, -0x20(%rdi)
401
402 movdqa -0x30(%rsi), %xmm0
403 movdqa -0x40(%rsi), %xmm1
404 sub $0x40, %rsi
405
406 movdqa %xmm0, -0x30(%rdi)
407 movdqa %xmm1, -0x40(%rdi)
408 sub $0x40, %rdi
409 L(shl_0_mem_bwd_less_64bytes):
410 cmp $0x20, %rdx
411 jb L(shl_0_mem_bwd_less_32bytes)
412 movdqa -0x10(%rsi), %xmm0
413 sub $0x20, %rdx
414 movdqa -0x20(%rsi), %xmm1
415 sub $0x20, %rsi
416 movdqa %xmm0, -0x10(%rdi)
417 movdqa %xmm1, -0x20(%rdi)
418 sub $0x20, %rdi
419 L(shl_0_mem_bwd_less_32bytes):
420 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
421
/* Forward copy loop for a source that is 1 byte past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_1), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $1 and stores 64 bytes.
   The loop is entered at L(shl_1_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_1_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
422 .p2align 4
423 L(shl_1):
424 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 /* %r9 = L(shl_1_loop_L1).  */
425 cmp %rcx, %rdx
426 movaps -0x01(%rsi), %xmm1 /* Aligned block at %rsi - 1; keeps flags.  */
427 jb L(L1_fwd)
428 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 /* %r9 = L(shl_1_loop_L2).  */
429 L(L1_fwd):
430 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
431 jmp *%r9
432 ud2
433 L(shl_1_loop_L2):
434 prefetchnta 0x1c0(%rsi)
435 L(shl_1_loop_L1):
436 sub $64, %rdx /* Sets the flags tested by the jb below.  */
437 movaps 0x0f(%rsi), %xmm2
438 movaps 0x1f(%rsi), %xmm3
439 movaps 0x2f(%rsi), %xmm4
440 movaps 0x3f(%rsi), %xmm5
441 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
442 palignr $1, %xmm4, %xmm5
443 lea 64(%rsi), %rsi
444 palignr $1, %xmm3, %xmm4
445 palignr $1, %xmm2, %xmm3
446 lea 64(%rdi), %rdi
447 palignr $1, %xmm1, %xmm2
448 movdqa %xmm6, %xmm1
449 movdqa %xmm2, -0x40(%rdi)
450 movaps %xmm3, -0x30(%rdi)
451 jb L(shl_1_end) /* Borrow from the sub: fewer than 64 left.  */
452 movaps %xmm4, -0x20(%rdi)
453 movaps %xmm5, -0x10(%rdi)
454 jmp *%r9
455 ud2
456 L(shl_1_end):
457 movaps %xmm4, -0x20(%rdi)
458 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
459 movaps %xmm5, -0x10(%rdi)
460 add %rdx, %rdi
461 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
462 add %rdx, %rsi
463 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
464
/* Backward copy loop for a source end pointer that is 1 byte past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_1_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $1 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_1_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_1_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
465 .p2align 4
466 L(shl_1_bwd):
467 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 /* %r9 = L(shl_1_bwd_loop_L1).  */
468 cmp %rcx, %rdx
469 movaps -0x01(%rsi), %xmm1 /* Aligned block at %rsi - 1; keeps flags.  */
470 jb L(L1_bwd)
471 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_1_bwd_loop_L2).  */
472 L(L1_bwd):
473 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
474 jmp *%r9
475 ud2
476 L(shl_1_bwd_loop_L2):
477 prefetchnta -0x1c0(%rsi)
478 L(shl_1_bwd_loop_L1):
479 movaps -0x11(%rsi), %xmm2
480 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
481 movaps -0x21(%rsi), %xmm3
482 movaps -0x31(%rsi), %xmm4
483 movaps -0x41(%rsi), %xmm5
484 lea -0x40(%rsi), %rsi
485 palignr $1, %xmm2, %xmm1
486 palignr $1, %xmm3, %xmm2
487 palignr $1, %xmm4, %xmm3
488 palignr $1, %xmm5, %xmm4
489
490 movaps %xmm1, -0x10(%rdi)
491 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
492
493 movaps %xmm2, -0x20(%rdi)
494 lea -0x40(%rdi), %rdi
495
496 movaps %xmm3, 0x10(%rdi)
497 jb L(shl_1_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
498 movaps %xmm4, (%rdi)
499 jmp *%r9
500 ud2
501 L(shl_1_bwd_end):
502 movaps %xmm4, (%rdi)
503 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
504 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
505 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
506
/* Forward copy loop for a source that is 2 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_2), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $2 and stores 64 bytes.
   The loop is entered at L(shl_2_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_2_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
507 .p2align 4
508 L(shl_2):
509 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 /* %r9 = L(shl_2_loop_L1).  */
510 cmp %rcx, %rdx
511 movaps -0x02(%rsi), %xmm1 /* Aligned block at %rsi - 2; keeps flags.  */
512 jb L(L2_fwd)
513 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 /* %r9 = L(shl_2_loop_L2).  */
514 L(L2_fwd):
515 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
516 jmp *%r9
517 ud2
518 L(shl_2_loop_L2):
519 prefetchnta 0x1c0(%rsi)
520 L(shl_2_loop_L1):
521 sub $64, %rdx /* Sets the flags tested by the jb below.  */
522 movaps 0x0e(%rsi), %xmm2
523 movaps 0x1e(%rsi), %xmm3
524 movaps 0x2e(%rsi), %xmm4
525 movaps 0x3e(%rsi), %xmm5
526 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
527 palignr $2, %xmm4, %xmm5
528 lea 64(%rsi), %rsi
529 palignr $2, %xmm3, %xmm4
530 palignr $2, %xmm2, %xmm3
531 lea 64(%rdi), %rdi
532 palignr $2, %xmm1, %xmm2
533 movdqa %xmm6, %xmm1
534 movdqa %xmm2, -0x40(%rdi)
535 movaps %xmm3, -0x30(%rdi)
536 jb L(shl_2_end) /* Borrow from the sub: fewer than 64 left.  */
537 movaps %xmm4, -0x20(%rdi)
538 movaps %xmm5, -0x10(%rdi)
539 jmp *%r9
540 ud2
541 L(shl_2_end):
542 movaps %xmm4, -0x20(%rdi)
543 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
544 movaps %xmm5, -0x10(%rdi)
545 add %rdx, %rdi
546 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
547 add %rdx, %rsi
548 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
549
/* Backward copy loop for a source end pointer that is 2 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_2_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $2 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_2_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_2_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
550 .p2align 4
551 L(shl_2_bwd):
552 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 /* %r9 = L(shl_2_bwd_loop_L1).  */
553 cmp %rcx, %rdx
554 movaps -0x02(%rsi), %xmm1 /* Aligned block at %rsi - 2; keeps flags.  */
555 jb L(L2_bwd)
556 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_2_bwd_loop_L2).  */
557 L(L2_bwd):
558 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
559 jmp *%r9
560 ud2
561 L(shl_2_bwd_loop_L2):
562 prefetchnta -0x1c0(%rsi)
563 L(shl_2_bwd_loop_L1):
564 movaps -0x12(%rsi), %xmm2
565 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
566 movaps -0x22(%rsi), %xmm3
567 movaps -0x32(%rsi), %xmm4
568 movaps -0x42(%rsi), %xmm5
569 lea -0x40(%rsi), %rsi
570 palignr $2, %xmm2, %xmm1
571 palignr $2, %xmm3, %xmm2
572 palignr $2, %xmm4, %xmm3
573 palignr $2, %xmm5, %xmm4
574
575 movaps %xmm1, -0x10(%rdi)
576 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
577
578 movaps %xmm2, -0x20(%rdi)
579 lea -0x40(%rdi), %rdi
580
581 movaps %xmm3, 0x10(%rdi)
582 jb L(shl_2_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
583 movaps %xmm4, (%rdi)
584 jmp *%r9
585 ud2
586 L(shl_2_bwd_end):
587 movaps %xmm4, (%rdi)
588 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
589 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
590 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
591
/* Forward copy loop for a source that is 3 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_3), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $3 and stores 64 bytes.
   The loop is entered at L(shl_3_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_3_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
592 .p2align 4
593 L(shl_3):
594 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 /* %r9 = L(shl_3_loop_L1).  */
595 cmp %rcx, %rdx
596 movaps -0x03(%rsi), %xmm1 /* Aligned block at %rsi - 3; keeps flags.  */
597 jb L(L3_fwd)
598 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 /* %r9 = L(shl_3_loop_L2).  */
599 L(L3_fwd):
600 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
601 jmp *%r9
602 ud2
603 L(shl_3_loop_L2):
604 prefetchnta 0x1c0(%rsi)
605 L(shl_3_loop_L1):
606 sub $64, %rdx /* Sets the flags tested by the jb below.  */
607 movaps 0x0d(%rsi), %xmm2
608 movaps 0x1d(%rsi), %xmm3
609 movaps 0x2d(%rsi), %xmm4
610 movaps 0x3d(%rsi), %xmm5
611 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
612 palignr $3, %xmm4, %xmm5
613 lea 64(%rsi), %rsi
614 palignr $3, %xmm3, %xmm4
615 palignr $3, %xmm2, %xmm3
616 lea 64(%rdi), %rdi
617 palignr $3, %xmm1, %xmm2
618 movdqa %xmm6, %xmm1
619 movdqa %xmm2, -0x40(%rdi)
620 movaps %xmm3, -0x30(%rdi)
621 jb L(shl_3_end) /* Borrow from the sub: fewer than 64 left.  */
622 movaps %xmm4, -0x20(%rdi)
623 movaps %xmm5, -0x10(%rdi)
624 jmp *%r9
625 ud2
626 L(shl_3_end):
627 movaps %xmm4, -0x20(%rdi)
628 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
629 movaps %xmm5, -0x10(%rdi)
630 add %rdx, %rdi
631 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
632 add %rdx, %rsi
633 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
634
/* Backward copy loop for a source end pointer that is 3 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_3_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $3 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_3_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_3_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
635 .p2align 4
636 L(shl_3_bwd):
637 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 /* %r9 = L(shl_3_bwd_loop_L1).  */
638 cmp %rcx, %rdx
639 movaps -0x03(%rsi), %xmm1 /* Aligned block at %rsi - 3; keeps flags.  */
640 jb L(L3_bwd)
641 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_3_bwd_loop_L2).  */
642 L(L3_bwd):
643 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
644 jmp *%r9
645 ud2
646 L(shl_3_bwd_loop_L2):
647 prefetchnta -0x1c0(%rsi)
648 L(shl_3_bwd_loop_L1):
649 movaps -0x13(%rsi), %xmm2
650 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
651 movaps -0x23(%rsi), %xmm3
652 movaps -0x33(%rsi), %xmm4
653 movaps -0x43(%rsi), %xmm5
654 lea -0x40(%rsi), %rsi
655 palignr $3, %xmm2, %xmm1
656 palignr $3, %xmm3, %xmm2
657 palignr $3, %xmm4, %xmm3
658 palignr $3, %xmm5, %xmm4
659
660 movaps %xmm1, -0x10(%rdi)
661 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
662
663 movaps %xmm2, -0x20(%rdi)
664 lea -0x40(%rdi), %rdi
665
666 movaps %xmm3, 0x10(%rdi)
667 jb L(shl_3_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
668 movaps %xmm4, (%rdi)
669 jmp *%r9
670 ud2
671 L(shl_3_bwd_end):
672 movaps %xmm4, (%rdi)
673 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
674 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
675 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
676
/* Forward copy loop for a source that is 4 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_4), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $4 and stores 64 bytes.
   The loop is entered at L(shl_4_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_4_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
677 .p2align 4
678 L(shl_4):
679 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 /* %r9 = L(shl_4_loop_L1).  */
680 cmp %rcx, %rdx
681 movaps -0x04(%rsi), %xmm1 /* Aligned block at %rsi - 4; keeps flags.  */
682 jb L(L4_fwd)
683 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 /* %r9 = L(shl_4_loop_L2).  */
684 L(L4_fwd):
685 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
686 jmp *%r9
687 ud2
688 L(shl_4_loop_L2):
689 prefetchnta 0x1c0(%rsi)
690 L(shl_4_loop_L1):
691 sub $64, %rdx /* Sets the flags tested by the jb below.  */
692 movaps 0x0c(%rsi), %xmm2
693 movaps 0x1c(%rsi), %xmm3
694 movaps 0x2c(%rsi), %xmm4
695 movaps 0x3c(%rsi), %xmm5
696 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
697 palignr $4, %xmm4, %xmm5
698 lea 64(%rsi), %rsi
699 palignr $4, %xmm3, %xmm4
700 palignr $4, %xmm2, %xmm3
701 lea 64(%rdi), %rdi
702 palignr $4, %xmm1, %xmm2
703 movdqa %xmm6, %xmm1
704 movdqa %xmm2, -0x40(%rdi)
705 movaps %xmm3, -0x30(%rdi)
706 jb L(shl_4_end) /* Borrow from the sub: fewer than 64 left.  */
707 movaps %xmm4, -0x20(%rdi)
708 movaps %xmm5, -0x10(%rdi)
709 jmp *%r9
710 ud2
711 L(shl_4_end):
712 movaps %xmm4, -0x20(%rdi)
713 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
714 movaps %xmm5, -0x10(%rdi)
715 add %rdx, %rdi
716 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
717 add %rdx, %rsi
718 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
719
/* Backward copy loop for a source end pointer that is 4 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_4_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $4 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_4_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_4_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
720 .p2align 4
721 L(shl_4_bwd):
722 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 /* %r9 = L(shl_4_bwd_loop_L1).  */
723 cmp %rcx, %rdx
724 movaps -0x04(%rsi), %xmm1 /* Aligned block at %rsi - 4; keeps flags.  */
725 jb L(L4_bwd)
726 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_4_bwd_loop_L2).  */
727 L(L4_bwd):
728 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
729 jmp *%r9
730 ud2
731 L(shl_4_bwd_loop_L2):
732 prefetchnta -0x1c0(%rsi)
733 L(shl_4_bwd_loop_L1):
734 movaps -0x14(%rsi), %xmm2
735 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
736 movaps -0x24(%rsi), %xmm3
737 movaps -0x34(%rsi), %xmm4
738 movaps -0x44(%rsi), %xmm5
739 lea -0x40(%rsi), %rsi
740 palignr $4, %xmm2, %xmm1
741 palignr $4, %xmm3, %xmm2
742 palignr $4, %xmm4, %xmm3
743 palignr $4, %xmm5, %xmm4
744
745 movaps %xmm1, -0x10(%rdi)
746 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
747
748 movaps %xmm2, -0x20(%rdi)
749 lea -0x40(%rdi), %rdi
750
751 movaps %xmm3, 0x10(%rdi)
752 jb L(shl_4_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
753 movaps %xmm4, (%rdi)
754 jmp *%r9
755 ud2
756 L(shl_4_bwd_end):
757 movaps %xmm4, (%rdi)
758 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
759 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
760 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
761
/* Forward copy loop for a source that is 5 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_5), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $5 and stores 64 bytes.
   The loop is entered at L(shl_5_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_5_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
762 .p2align 4
763 L(shl_5):
764 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 /* %r9 = L(shl_5_loop_L1).  */
765 cmp %rcx, %rdx
766 movaps -0x05(%rsi), %xmm1 /* Aligned block at %rsi - 5; keeps flags.  */
767 jb L(L5_fwd)
768 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 /* %r9 = L(shl_5_loop_L2).  */
769 L(L5_fwd):
770 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
771 jmp *%r9
772 ud2
773 L(shl_5_loop_L2):
774 prefetchnta 0x1c0(%rsi)
775 L(shl_5_loop_L1):
776 sub $64, %rdx /* Sets the flags tested by the jb below.  */
777 movaps 0x0b(%rsi), %xmm2
778 movaps 0x1b(%rsi), %xmm3
779 movaps 0x2b(%rsi), %xmm4
780 movaps 0x3b(%rsi), %xmm5
781 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
782 palignr $5, %xmm4, %xmm5
783 lea 64(%rsi), %rsi
784 palignr $5, %xmm3, %xmm4
785 palignr $5, %xmm2, %xmm3
786 lea 64(%rdi), %rdi
787 palignr $5, %xmm1, %xmm2
788 movdqa %xmm6, %xmm1
789 movdqa %xmm2, -0x40(%rdi)
790 movaps %xmm3, -0x30(%rdi)
791 jb L(shl_5_end) /* Borrow from the sub: fewer than 64 left.  */
792 movaps %xmm4, -0x20(%rdi)
793 movaps %xmm5, -0x10(%rdi)
794 jmp *%r9
795 ud2
796 L(shl_5_end):
797 movaps %xmm4, -0x20(%rdi)
798 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
799 movaps %xmm5, -0x10(%rdi)
800 add %rdx, %rdi
801 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
802 add %rdx, %rsi
803 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
804
/* Backward copy loop for a source end pointer that is 5 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_5_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $5 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_5_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_5_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
805 .p2align 4
806 L(shl_5_bwd):
807 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 /* %r9 = L(shl_5_bwd_loop_L1).  */
808 cmp %rcx, %rdx
809 movaps -0x05(%rsi), %xmm1 /* Aligned block at %rsi - 5; keeps flags.  */
810 jb L(L5_bwd)
811 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_5_bwd_loop_L2).  */
812 L(L5_bwd):
813 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
814 jmp *%r9
815 ud2
816 L(shl_5_bwd_loop_L2):
817 prefetchnta -0x1c0(%rsi)
818 L(shl_5_bwd_loop_L1):
819 movaps -0x15(%rsi), %xmm2
820 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
821 movaps -0x25(%rsi), %xmm3
822 movaps -0x35(%rsi), %xmm4
823 movaps -0x45(%rsi), %xmm5
824 lea -0x40(%rsi), %rsi
825 palignr $5, %xmm2, %xmm1
826 palignr $5, %xmm3, %xmm2
827 palignr $5, %xmm4, %xmm3
828 palignr $5, %xmm5, %xmm4
829
830 movaps %xmm1, -0x10(%rdi)
831 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
832
833 movaps %xmm2, -0x20(%rdi)
834 lea -0x40(%rdi), %rdi
835
836 movaps %xmm3, 0x10(%rdi)
837 jb L(shl_5_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
838 movaps %xmm4, (%rdi)
839 jmp *%r9
840 ud2
841 L(shl_5_bwd_end):
842 movaps %xmm4, (%rdi)
843 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
844 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
845 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
846
/* Forward copy loop for a source that is 6 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_6), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $6 and stores 64 bytes.
   The loop is entered at L(shl_6_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_6_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
847 .p2align 4
848 L(shl_6):
849 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 /* %r9 = L(shl_6_loop_L1).  */
850 cmp %rcx, %rdx
851 movaps -0x06(%rsi), %xmm1 /* Aligned block at %rsi - 6; keeps flags.  */
852 jb L(L6_fwd)
853 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 /* %r9 = L(shl_6_loop_L2).  */
854 L(L6_fwd):
855 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
856 jmp *%r9
857 ud2
858 L(shl_6_loop_L2):
859 prefetchnta 0x1c0(%rsi)
860 L(shl_6_loop_L1):
861 sub $64, %rdx /* Sets the flags tested by the jb below.  */
862 movaps 0x0a(%rsi), %xmm2
863 movaps 0x1a(%rsi), %xmm3
864 movaps 0x2a(%rsi), %xmm4
865 movaps 0x3a(%rsi), %xmm5
866 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
867 palignr $6, %xmm4, %xmm5
868 lea 64(%rsi), %rsi
869 palignr $6, %xmm3, %xmm4
870 palignr $6, %xmm2, %xmm3
871 lea 64(%rdi), %rdi
872 palignr $6, %xmm1, %xmm2
873 movdqa %xmm6, %xmm1
874 movdqa %xmm2, -0x40(%rdi)
875 movaps %xmm3, -0x30(%rdi)
876 jb L(shl_6_end) /* Borrow from the sub: fewer than 64 left.  */
877 movaps %xmm4, -0x20(%rdi)
878 movaps %xmm5, -0x10(%rdi)
879 jmp *%r9
880 ud2
881 L(shl_6_end):
882 movaps %xmm4, -0x20(%rdi)
883 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
884 movaps %xmm5, -0x10(%rdi)
885 add %rdx, %rdi
886 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
887 add %rdx, %rsi
888 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
889
/* Backward copy loop for a source end pointer that is 6 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_6_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $6 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_6_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_6_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
890 .p2align 4
891 L(shl_6_bwd):
892 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 /* %r9 = L(shl_6_bwd_loop_L1).  */
893 cmp %rcx, %rdx
894 movaps -0x06(%rsi), %xmm1 /* Aligned block at %rsi - 6; keeps flags.  */
895 jb L(L6_bwd)
896 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_6_bwd_loop_L2).  */
897 L(L6_bwd):
898 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
899 jmp *%r9
900 ud2
901 L(shl_6_bwd_loop_L2):
902 prefetchnta -0x1c0(%rsi)
903 L(shl_6_bwd_loop_L1):
904 movaps -0x16(%rsi), %xmm2
905 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
906 movaps -0x26(%rsi), %xmm3
907 movaps -0x36(%rsi), %xmm4
908 movaps -0x46(%rsi), %xmm5
909 lea -0x40(%rsi), %rsi
910 palignr $6, %xmm2, %xmm1
911 palignr $6, %xmm3, %xmm2
912 palignr $6, %xmm4, %xmm3
913 palignr $6, %xmm5, %xmm4
914
915 movaps %xmm1, -0x10(%rdi)
916 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
917
918 movaps %xmm2, -0x20(%rdi)
919 lea -0x40(%rdi), %rdi
920
921 movaps %xmm3, 0x10(%rdi)
922 jb L(shl_6_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
923 movaps %xmm4, (%rdi)
924 jmp *%r9
925 ud2
926 L(shl_6_bwd_end):
927 movaps %xmm4, (%rdi)
928 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
929 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
930 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
931
/* Forward copy loop for a source that is 7 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_7), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $7 and stores 64 bytes.
   The loop is entered at L(shl_7_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_7_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
932 .p2align 4
933 L(shl_7):
934 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 /* %r9 = L(shl_7_loop_L1).  */
935 cmp %rcx, %rdx
936 movaps -0x07(%rsi), %xmm1 /* Aligned block at %rsi - 7; keeps flags.  */
937 jb L(L7_fwd)
938 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 /* %r9 = L(shl_7_loop_L2).  */
939 L(L7_fwd):
940 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
941 jmp *%r9
942 ud2
943 L(shl_7_loop_L2):
944 prefetchnta 0x1c0(%rsi)
945 L(shl_7_loop_L1):
946 sub $64, %rdx /* Sets the flags tested by the jb below.  */
947 movaps 0x09(%rsi), %xmm2
948 movaps 0x19(%rsi), %xmm3
949 movaps 0x29(%rsi), %xmm4
950 movaps 0x39(%rsi), %xmm5
951 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
952 palignr $7, %xmm4, %xmm5
953 lea 64(%rsi), %rsi
954 palignr $7, %xmm3, %xmm4
955 palignr $7, %xmm2, %xmm3
956 lea 64(%rdi), %rdi
957 palignr $7, %xmm1, %xmm2
958 movdqa %xmm6, %xmm1
959 movdqa %xmm2, -0x40(%rdi)
960 movaps %xmm3, -0x30(%rdi)
961 jb L(shl_7_end) /* Borrow from the sub: fewer than 64 left.  */
962 movaps %xmm4, -0x20(%rdi)
963 movaps %xmm5, -0x10(%rdi)
964 jmp *%r9
965 ud2
966 L(shl_7_end):
967 movaps %xmm4, -0x20(%rdi)
968 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
969 movaps %xmm5, -0x10(%rdi)
970 add %rdx, %rdi
971 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
972 add %rdx, %rsi
973 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
974
/* Backward copy loop for a source end pointer that is 7 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_7_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $7 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_7_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_7_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
975 .p2align 4
976 L(shl_7_bwd):
977 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 /* %r9 = L(shl_7_bwd_loop_L1).  */
978 cmp %rcx, %rdx
979 movaps -0x07(%rsi), %xmm1 /* Aligned block at %rsi - 7; keeps flags.  */
980 jb L(L7_bwd)
981 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_7_bwd_loop_L2).  */
982 L(L7_bwd):
983 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
984 jmp *%r9
985 ud2
986 L(shl_7_bwd_loop_L2):
987 prefetchnta -0x1c0(%rsi)
988 L(shl_7_bwd_loop_L1):
989 movaps -0x17(%rsi), %xmm2
990 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
991 movaps -0x27(%rsi), %xmm3
992 movaps -0x37(%rsi), %xmm4
993 movaps -0x47(%rsi), %xmm5
994 lea -0x40(%rsi), %rsi
995 palignr $7, %xmm2, %xmm1
996 palignr $7, %xmm3, %xmm2
997 palignr $7, %xmm4, %xmm3
998 palignr $7, %xmm5, %xmm4
999
1000 movaps %xmm1, -0x10(%rdi)
1001 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
1002
1003 movaps %xmm2, -0x20(%rdi)
1004 lea -0x40(%rdi), %rdi
1005
1006 movaps %xmm3, 0x10(%rdi)
1007 jb L(shl_7_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
1008 movaps %xmm4, (%rdi)
1009 jmp *%r9
1010 ud2
1011 L(shl_7_bwd_end):
1012 movaps %xmm4, (%rdi)
1013 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1014 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1015 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1016
/* Forward copy loop for a source that is 8 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_8), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $8 and stores 64 bytes.
   The loop is entered at L(shl_8_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_8_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
1017 .p2align 4
1018 L(shl_8):
1019 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 /* %r9 = L(shl_8_loop_L1).  */
1020 cmp %rcx, %rdx
1021 movaps -0x08(%rsi), %xmm1 /* Aligned block at %rsi - 8; keeps flags.  */
1022 jb L(L8_fwd)
1023 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 /* %r9 = L(shl_8_loop_L2).  */
1024 L(L8_fwd):
1025 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1026 jmp *%r9
/* Every other indirect jump in the shl_N blocks is followed by ud2;
   this one was the only exception.  The ud2 is unreachable, and the
   label differences used above are computed by the assembler.  */
ud2
1027 L(shl_8_loop_L2):
1028 prefetchnta 0x1c0(%rsi)
1029 L(shl_8_loop_L1):
1030 sub $64, %rdx /* Sets the flags tested by the jb below.  */
1031 movaps 0x08(%rsi), %xmm2
1032 movaps 0x18(%rsi), %xmm3
1033 movaps 0x28(%rsi), %xmm4
1034 movaps 0x38(%rsi), %xmm5
1035 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
1036 palignr $8, %xmm4, %xmm5
1037 lea 64(%rsi), %rsi
1038 palignr $8, %xmm3, %xmm4
1039 palignr $8, %xmm2, %xmm3
1040 lea 64(%rdi), %rdi
1041 palignr $8, %xmm1, %xmm2
1042 movdqa %xmm6, %xmm1
1043 movdqa %xmm2, -0x40(%rdi)
1044 movaps %xmm3, -0x30(%rdi)
1045 jb L(shl_8_end) /* Borrow from the sub: fewer than 64 left.  */
1046 movaps %xmm4, -0x20(%rdi)
1047 movaps %xmm5, -0x10(%rdi)
1048 jmp *%r9
1049 ud2
1050 .p2align 4
1051 L(shl_8_end):
1052 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1053 movaps %xmm4, -0x20(%rdi)
1054 add %rdx, %rsi
1055 movaps %xmm5, -0x10(%rdi)
1056 add %rdx, %rdi
1057 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1058 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1059
/* Backward copy loop for a source end pointer that is 8 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_8_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $8 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_8_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_8_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
1060 .p2align 4
1061 L(shl_8_bwd):
1062 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 /* %r9 = L(shl_8_bwd_loop_L1).  */
1063 cmp %rcx, %rdx
1064 movaps -0x08(%rsi), %xmm1 /* Aligned block at %rsi - 8; keeps flags.  */
1065 jb L(L8_bwd)
1066 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_8_bwd_loop_L2).  */
1067 L(L8_bwd):
1068 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1069 jmp *%r9
1070 ud2
1071 L(shl_8_bwd_loop_L2):
1072 prefetchnta -0x1c0(%rsi)
1073 L(shl_8_bwd_loop_L1):
1074 movaps -0x18(%rsi), %xmm2
1075 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
1076 movaps -0x28(%rsi), %xmm3
1077 movaps -0x38(%rsi), %xmm4
1078 movaps -0x48(%rsi), %xmm5
1079 lea -0x40(%rsi), %rsi
1080 palignr $8, %xmm2, %xmm1
1081 palignr $8, %xmm3, %xmm2
1082 palignr $8, %xmm4, %xmm3
1083 palignr $8, %xmm5, %xmm4
1084
1085 movaps %xmm1, -0x10(%rdi)
1086 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
1087
1088 movaps %xmm2, -0x20(%rdi)
1089 lea -0x40(%rdi), %rdi
1090
1091 movaps %xmm3, 0x10(%rdi)
1092 jb L(shl_8_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
1093 movaps %xmm4, (%rdi)
1094 jmp *%r9
1095 ud2
1096 L(shl_8_bwd_end):
1097 movaps %xmm4, (%rdi)
1098 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1099 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1100 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1101
/* Forward copy loop for a source that is 9 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_9), as left
   by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $9 and stores 64 bytes.
   The loop is entered at L(shl_9_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_9_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
1102 .p2align 4
1103 L(shl_9):
1104 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 /* %r9 = L(shl_9_loop_L1).  */
1105 cmp %rcx, %rdx
1106 movaps -0x09(%rsi), %xmm1 /* Aligned block at %rsi - 9; keeps flags.  */
1107 jb L(L9_fwd)
1108 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 /* %r9 = L(shl_9_loop_L2).  */
1109 L(L9_fwd):
1110 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1111 jmp *%r9
1112 ud2
1113 L(shl_9_loop_L2):
1114 prefetchnta 0x1c0(%rsi)
1115 L(shl_9_loop_L1):
1116 sub $64, %rdx /* Sets the flags tested by the jb below.  */
1117 movaps 0x07(%rsi), %xmm2
1118 movaps 0x17(%rsi), %xmm3
1119 movaps 0x27(%rsi), %xmm4
1120 movaps 0x37(%rsi), %xmm5
1121 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
1122 palignr $9, %xmm4, %xmm5
1123 lea 64(%rsi), %rsi
1124 palignr $9, %xmm3, %xmm4
1125 palignr $9, %xmm2, %xmm3
1126 lea 64(%rdi), %rdi
1127 palignr $9, %xmm1, %xmm2
1128 movdqa %xmm6, %xmm1
1129 movdqa %xmm2, -0x40(%rdi)
1130 movaps %xmm3, -0x30(%rdi)
1131 jb L(shl_9_end) /* Borrow from the sub: fewer than 64 left.  */
1132 movaps %xmm4, -0x20(%rdi)
1133 movaps %xmm5, -0x10(%rdi)
1134 jmp *%r9
1135 ud2
1136 L(shl_9_end):
1137 movaps %xmm4, -0x20(%rdi)
1138 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1139 movaps %xmm5, -0x10(%rdi)
1140 add %rdx, %rdi
1141 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1142 add %rdx, %rsi
1143 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1144
/* Backward copy loop for a source end pointer that is 9 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_9_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $9 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_9_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_9_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
1145 .p2align 4
1146 L(shl_9_bwd):
1147 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 /* %r9 = L(shl_9_bwd_loop_L1).  */
1148 cmp %rcx, %rdx
1149 movaps -0x09(%rsi), %xmm1 /* Aligned block at %rsi - 9; keeps flags.  */
1150 jb L(L9_bwd)
1151 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_9_bwd_loop_L2).  */
1152 L(L9_bwd):
1153 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1154 jmp *%r9
1155 ud2
1156 L(shl_9_bwd_loop_L2):
1157 prefetchnta -0x1c0(%rsi)
1158 L(shl_9_bwd_loop_L1):
1159 movaps -0x19(%rsi), %xmm2
1160 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
1161 movaps -0x29(%rsi), %xmm3
1162 movaps -0x39(%rsi), %xmm4
1163 movaps -0x49(%rsi), %xmm5
1164 lea -0x40(%rsi), %rsi
1165 palignr $9, %xmm2, %xmm1
1166 palignr $9, %xmm3, %xmm2
1167 palignr $9, %xmm4, %xmm3
1168 palignr $9, %xmm5, %xmm4
1169
1170 movaps %xmm1, -0x10(%rdi)
1171 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
1172
1173 movaps %xmm2, -0x20(%rdi)
1174 lea -0x40(%rdi), %rdi
1175
1176 movaps %xmm3, 0x10(%rdi)
1177 jb L(shl_9_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
1178 movaps %xmm4, (%rdi)
1179 jmp *%r9
1180 ud2
1181 L(shl_9_bwd_end):
1182 movaps %xmm4, (%rdi)
1183 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1184 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1185 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1186
/* Forward copy loop for a source that is 10 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_10), as
   left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $10 and stores 64 bytes.
   The loop is entered at L(shl_10_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_10_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
1187 .p2align 4
1188 L(shl_10):
1189 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 /* %r9 = L(shl_10_loop_L1).  */
1190 cmp %rcx, %rdx
1191 movaps -0x0a(%rsi), %xmm1 /* Aligned block at %rsi - 10; keeps flags.  */
1192 jb L(L10_fwd)
1193 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 /* %r9 = L(shl_10_loop_L2).  */
1194 L(L10_fwd):
1195 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1196 jmp *%r9
1197 ud2
1198 L(shl_10_loop_L2):
1199 prefetchnta 0x1c0(%rsi)
1200 L(shl_10_loop_L1):
1201 sub $64, %rdx /* Sets the flags tested by the jb below.  */
1202 movaps 0x06(%rsi), %xmm2
1203 movaps 0x16(%rsi), %xmm3
1204 movaps 0x26(%rsi), %xmm4
1205 movaps 0x36(%rsi), %xmm5
1206 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
1207 palignr $10, %xmm4, %xmm5
1208 lea 64(%rsi), %rsi
1209 palignr $10, %xmm3, %xmm4
1210 palignr $10, %xmm2, %xmm3
1211 lea 64(%rdi), %rdi
1212 palignr $10, %xmm1, %xmm2
1213 movdqa %xmm6, %xmm1
1214 movdqa %xmm2, -0x40(%rdi)
1215 movaps %xmm3, -0x30(%rdi)
1216 jb L(shl_10_end) /* Borrow from the sub: fewer than 64 left.  */
1217 movaps %xmm4, -0x20(%rdi)
1218 movaps %xmm5, -0x10(%rdi)
1219 jmp *%r9
1220 ud2
1221 L(shl_10_end):
1222 movaps %xmm4, -0x20(%rdi)
1223 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1224 movaps %xmm5, -0x10(%rdi)
1225 add %rdx, %rdi
1226 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1227 add %rdx, %rsi
1228 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1229
/* Backward copy loop for a source end pointer that is 10 bytes past a
   16-byte boundary; the destination must be 16-byte aligned because
   aligned stores are used.  On entry %r9 holds the address of
   L(shl_10_bwd), as left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration
   performs four aligned 16-byte loads below %rsi, realigns them with
   palignr $10 and stores 64 bytes below %rdi.  The loop is entered at
   L(shl_10_bwd_loop_L2), which adds a prefetchnta, unless %rdx is below
   %rcx.  Once fewer than 64 bytes remain, L(shl_10_bwd_end) stores the
   16 bytes saved in %xmm0 to (%r8) and dispatches the remainder
   through L(table_less_80bytes).  */
1230 .p2align 4
1231 L(shl_10_bwd):
1232 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 /* %r9 = L(shl_10_bwd_loop_L1).  */
1233 cmp %rcx, %rdx
1234 movaps -0x0a(%rsi), %xmm1 /* Aligned block at %rsi - 10; keeps flags.  */
1235 jb L(L10_bwd)
1236 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 /* %r9 = L(shl_10_bwd_loop_L2).  */
1237 L(L10_bwd):
1238 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1239 jmp *%r9
1240 ud2
1241 L(shl_10_bwd_loop_L2):
1242 prefetchnta -0x1c0(%rsi)
1243 L(shl_10_bwd_loop_L1):
1244 movaps -0x1a(%rsi), %xmm2
1245 sub $0x40, %rdx /* Sets the flags tested by the jb below.  */
1246 movaps -0x2a(%rsi), %xmm3
1247 movaps -0x3a(%rsi), %xmm4
1248 movaps -0x4a(%rsi), %xmm5
1249 lea -0x40(%rsi), %rsi
1250 palignr $10, %xmm2, %xmm1
1251 palignr $10, %xmm3, %xmm2
1252 palignr $10, %xmm4, %xmm3
1253 palignr $10, %xmm5, %xmm4
1254
1255 movaps %xmm1, -0x10(%rdi)
1256 movaps %xmm5, %xmm1 /* Carry the lowest block to the next iteration.  */
1257
1258 movaps %xmm2, -0x20(%rdi)
1259 lea -0x40(%rdi), %rdi
1260
1261 movaps %xmm3, 0x10(%rdi)
1262 jb L(shl_10_bwd_end) /* Borrow from the sub: fewer than 64 left.  */
1263 movaps %xmm4, (%rdi)
1264 jmp *%r9
1265 ud2
1266 L(shl_10_bwd_end):
1267 movaps %xmm4, (%rdi)
1268 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1269 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1270 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1271
/* Forward copy loop for a source that is 11 bytes past a 16-byte
   boundary; the destination must be 16-byte aligned because aligned
   stores are used.  On entry %r9 holds the address of L(shl_11), as
   left by BRANCH_TO_JMPTBL_ENTRY.  Each iteration performs four aligned
   16-byte loads, realigns them with palignr $11 and stores 64 bytes.
   The loop is entered at L(shl_11_loop_L2), which adds a prefetchnta,
   unless %rdx is below %rcx.  Once fewer than 64 bytes remain,
   L(shl_11_end) stores the 16 bytes saved in %xmm0 to (%r8) and
   dispatches the remainder through L(table_less_80bytes).  */
1272 .p2align 4
1273 L(shl_11):
1274 lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 /* %r9 = L(shl_11_loop_L1).  */
1275 cmp %rcx, %rdx
1276 movaps -0x0b(%rsi), %xmm1 /* Aligned block at %rsi - 11; keeps flags.  */
1277 jb L(L11_fwd)
1278 lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 /* %r9 = L(shl_11_loop_L2).  */
1279 L(L11_fwd):
1280 lea -64(%rdx), %rdx /* Bias the count by -64 for the loop test.  */
1281 jmp *%r9
1282 ud2
1283 L(shl_11_loop_L2):
1284 prefetchnta 0x1c0(%rsi)
1285 L(shl_11_loop_L1):
1286 sub $64, %rdx /* Sets the flags tested by the jb below.  */
1287 movaps 0x05(%rsi), %xmm2
1288 movaps 0x15(%rsi), %xmm3
1289 movaps 0x25(%rsi), %xmm4
1290 movaps 0x35(%rsi), %xmm5
1291 movdqa %xmm5, %xmm6 /* Keep the last block; it becomes %xmm1 below.  */
1292 palignr $11, %xmm4, %xmm5
1293 lea 64(%rsi), %rsi
1294 palignr $11, %xmm3, %xmm4
1295 palignr $11, %xmm2, %xmm3
1296 lea 64(%rdi), %rdi
1297 palignr $11, %xmm1, %xmm2
1298 movdqa %xmm6, %xmm1
1299 movdqa %xmm2, -0x40(%rdi)
1300 movaps %xmm3, -0x30(%rdi)
1301 jb L(shl_11_end) /* Borrow from the sub: fewer than 64 left.  */
1302 movaps %xmm4, -0x20(%rdi)
1303 movaps %xmm5, -0x10(%rdi)
1304 jmp *%r9
1305 ud2
1306 L(shl_11_end):
1307 movaps %xmm4, -0x20(%rdi)
1308 lea 64(%rdx), %rdx /* Undo the bias: %rdx = bytes left.  */
1309 movaps %xmm5, -0x10(%rdi)
1310 add %rdx, %rdi
1311 movdqu %xmm0, (%r8) /* Store the 16 bytes saved in %xmm0.  */
1312 add %rdx, %rsi
1313 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1314
1315 .p2align 4
/* Backward copy loop for a source that sits 11 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 11 - 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $11 into four vectors and stores the 64 bytes
   just below the (aligned) rdi.
   NOTE(review): r9 is assumed to hold the address of L(shl_11_bwd) on
   entry, rcx a size threshold that selects the prefetching loop entry,
   and xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1316 L(shl_11_bwd):
1317 lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1318 cmp %rcx, %rdx
1319 movaps -0x0b(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1320 jb L(L11_bwd)  /* rdx < rcx: keep the entry without prefetch */
1321 lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1322 L(L11_bwd):
1323 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1324 jmp *%r9
1325 ud2
1326 L(shl_11_bwd_loop_L2):
1327 prefetchnta -0x1c0(%rsi)
1328 L(shl_11_bwd_loop_L1):
1329 movaps -0x1b(%rsi), %xmm2
1330 sub $0x40, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1331 movaps -0x2b(%rsi), %xmm3
1332 movaps -0x3b(%rsi), %xmm4
1333 movaps -0x4b(%rsi), %xmm5
1334 lea -0x40(%rsi), %rsi
1335 palignr $11, %xmm2, %xmm1  /* xmm1 = the 16 source bytes ending at the pre-lea rsi */
1336 palignr $11, %xmm3, %xmm2
1337 palignr $11, %xmm4, %xmm3
1338 palignr $11, %xmm5, %xmm4
1339
1340 movaps %xmm1, -0x10(%rdi)
1341 movaps %xmm5, %xmm1  /* lowest aligned vector becomes the high half of the next pass */
1342
1343 movaps %xmm2, -0x20(%rdi)
1344 lea -0x40(%rdi), %rdi
1345
1346 movaps %xmm3, 0x10(%rdi)
1347 jb L(shl_11_bwd_end)  /* borrow: fewer than 64 bytes remain after this pass */
1348 movaps %xmm4, (%rdi)
1349 jmp *%r9
1350 ud2
1351 L(shl_11_bwd_end):
1352 movaps %xmm4, (%rdi)
1353 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1354 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1355 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1356
1357 .p2align 4
/* Forward copy loop for a source that sits 12 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 12 + 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $12 into four vectors and stores 64 bytes at the
   (aligned) destination.
   NOTE(review): r9 is assumed to hold the address of L(shl_12) on entry,
   rcx a size threshold that selects the prefetching loop entry, and
   xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1358 L(shl_12):
1359 lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1360 cmp %rcx, %rdx
1361 movaps -0x0c(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1362 jb L(L12_fwd)  /* rdx < rcx: keep the entry without prefetch */
1363 lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1364 L(L12_fwd):
1365 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1366 jmp *%r9
1367 ud2
1368 L(shl_12_loop_L2):
1369 prefetchnta 0x1c0(%rsi)
1370 L(shl_12_loop_L1):
1371 sub $64, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1372 movaps 0x04(%rsi), %xmm2
1373 movaps 0x14(%rsi), %xmm3
1374 movaps 0x24(%rsi), %xmm4
1375 movaps 0x34(%rsi), %xmm5
1376 movdqa %xmm5, %xmm6  /* keep the unshifted top vector; palignr overwrites xmm5 */
1377 palignr $12, %xmm4, %xmm5
1378 lea 64(%rsi), %rsi
1379 palignr $12, %xmm3, %xmm4
1380 palignr $12, %xmm2, %xmm3
1381 lea 64(%rdi), %rdi
1382 palignr $12, %xmm1, %xmm2  /* xmm2 = the 16 source bytes starting at the pre-lea rsi */
1383 movdqa %xmm6, %xmm1  /* top vector becomes the low half of the next pass */
1384 movdqa %xmm2, -0x40(%rdi)
1385 movaps %xmm3, -0x30(%rdi)
1386 jb L(shl_12_end)  /* borrow: fewer than 64 bytes remain after this pass */
1387 movaps %xmm4, -0x20(%rdi)
1388 movaps %xmm5, -0x10(%rdi)
1389 jmp *%r9
1390 ud2
1391 L(shl_12_end):
1392 movaps %xmm4, -0x20(%rdi)
1393 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1394 movaps %xmm5, -0x10(%rdi)
1395 add %rdx, %rdi  /* move rdi past the remaining bytes; presumably because the tail handlers index backwards */
1396 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1397 add %rdx, %rsi
1398 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1399
1400 .p2align 4
/* Backward copy loop for a source that sits 12 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 12 - 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $12 into four vectors and stores the 64 bytes
   just below the (aligned) rdi.
   NOTE(review): r9 is assumed to hold the address of L(shl_12_bwd) on
   entry, rcx a size threshold that selects the prefetching loop entry,
   and xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1401 L(shl_12_bwd):
1402 lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1403 cmp %rcx, %rdx
1404 movaps -0x0c(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1405 jb L(L12_bwd)  /* rdx < rcx: keep the entry without prefetch */
1406 lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1407 L(L12_bwd):
1408 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1409 jmp *%r9
1410 ud2
1411 L(shl_12_bwd_loop_L2):
1412 prefetchnta -0x1c0(%rsi)
1413 L(shl_12_bwd_loop_L1):
1414 movaps -0x1c(%rsi), %xmm2
1415 sub $0x40, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1416 movaps -0x2c(%rsi), %xmm3
1417 movaps -0x3c(%rsi), %xmm4
1418 movaps -0x4c(%rsi), %xmm5
1419 lea -0x40(%rsi), %rsi
1420 palignr $12, %xmm2, %xmm1  /* xmm1 = the 16 source bytes ending at the pre-lea rsi */
1421 palignr $12, %xmm3, %xmm2
1422 palignr $12, %xmm4, %xmm3
1423 palignr $12, %xmm5, %xmm4
1424
1425 movaps %xmm1, -0x10(%rdi)
1426 movaps %xmm5, %xmm1  /* lowest aligned vector becomes the high half of the next pass */
1427
1428 movaps %xmm2, -0x20(%rdi)
1429 lea -0x40(%rdi), %rdi
1430
1431 movaps %xmm3, 0x10(%rdi)
1432 jb L(shl_12_bwd_end)  /* borrow: fewer than 64 bytes remain after this pass */
1433 movaps %xmm4, (%rdi)
1434 jmp *%r9
1435 ud2
1436 L(shl_12_bwd_end):
1437 movaps %xmm4, (%rdi)
1438 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1439 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1440 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1441
1442 .p2align 4
/* Forward copy loop for a source that sits 13 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 13 + 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $13 into four vectors and stores 64 bytes at the
   (aligned) destination.
   NOTE(review): r9 is assumed to hold the address of L(shl_13) on entry,
   rcx a size threshold that selects the prefetching loop entry, and
   xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1443 L(shl_13):
1444 lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1445 cmp %rcx, %rdx
1446 movaps -0x0d(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1447 jb L(L13_fwd)  /* rdx < rcx: keep the entry without prefetch */
1448 lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1449 L(L13_fwd):
1450 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1451 jmp *%r9
1452 ud2
1453 L(shl_13_loop_L2):
1454 prefetchnta 0x1c0(%rsi)
1455 L(shl_13_loop_L1):
1456 sub $64, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1457 movaps 0x03(%rsi), %xmm2
1458 movaps 0x13(%rsi), %xmm3
1459 movaps 0x23(%rsi), %xmm4
1460 movaps 0x33(%rsi), %xmm5
1461 movdqa %xmm5, %xmm6  /* keep the unshifted top vector; palignr overwrites xmm5 */
1462 palignr $13, %xmm4, %xmm5
1463 lea 64(%rsi), %rsi
1464 palignr $13, %xmm3, %xmm4
1465 palignr $13, %xmm2, %xmm3
1466 lea 64(%rdi), %rdi
1467 palignr $13, %xmm1, %xmm2  /* xmm2 = the 16 source bytes starting at the pre-lea rsi */
1468 movdqa %xmm6, %xmm1  /* top vector becomes the low half of the next pass */
1469 movdqa %xmm2, -0x40(%rdi)
1470 movaps %xmm3, -0x30(%rdi)
1471 jb L(shl_13_end)  /* borrow: fewer than 64 bytes remain after this pass */
1472 movaps %xmm4, -0x20(%rdi)
1473 movaps %xmm5, -0x10(%rdi)
1474 jmp *%r9
1475 ud2
1476 L(shl_13_end):
1477 movaps %xmm4, -0x20(%rdi)
1478 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1479 movaps %xmm5, -0x10(%rdi)
1480 add %rdx, %rdi  /* move rdi past the remaining bytes; presumably because the tail handlers index backwards */
1481 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1482 add %rdx, %rsi
1483 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1484
1485 .p2align 4
/* Backward copy loop for a source that sits 13 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 13 - 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $13 into four vectors and stores the 64 bytes
   just below the (aligned) rdi.
   NOTE(review): r9 is assumed to hold the address of L(shl_13_bwd) on
   entry, rcx a size threshold that selects the prefetching loop entry,
   and xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1486 L(shl_13_bwd):
1487 lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1488 cmp %rcx, %rdx
1489 movaps -0x0d(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1490 jb L(L13_bwd)  /* rdx < rcx: keep the entry without prefetch */
1491 lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1492 L(L13_bwd):
1493 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1494 jmp *%r9
1495 ud2
1496 L(shl_13_bwd_loop_L2):
1497 prefetchnta -0x1c0(%rsi)
1498 L(shl_13_bwd_loop_L1):
1499 movaps -0x1d(%rsi), %xmm2
1500 sub $0x40, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1501 movaps -0x2d(%rsi), %xmm3
1502 movaps -0x3d(%rsi), %xmm4
1503 movaps -0x4d(%rsi), %xmm5
1504 lea -0x40(%rsi), %rsi
1505 palignr $13, %xmm2, %xmm1  /* xmm1 = the 16 source bytes ending at the pre-lea rsi */
1506 palignr $13, %xmm3, %xmm2
1507 palignr $13, %xmm4, %xmm3
1508 palignr $13, %xmm5, %xmm4
1509
1510 movaps %xmm1, -0x10(%rdi)
1511 movaps %xmm5, %xmm1  /* lowest aligned vector becomes the high half of the next pass */
1512
1513 movaps %xmm2, -0x20(%rdi)
1514 lea -0x40(%rdi), %rdi
1515
1516 movaps %xmm3, 0x10(%rdi)
1517 jb L(shl_13_bwd_end)  /* borrow: fewer than 64 bytes remain after this pass */
1518 movaps %xmm4, (%rdi)
1519 jmp *%r9
1520 ud2
1521 L(shl_13_bwd_end):
1522 movaps %xmm4, (%rdi)
1523 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1524 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1525 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1526
1527 .p2align 4
/* Forward copy loop for a source that sits 14 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 14 + 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $14 into four vectors and stores 64 bytes at the
   (aligned) destination.
   NOTE(review): r9 is assumed to hold the address of L(shl_14) on entry,
   rcx a size threshold that selects the prefetching loop entry, and
   xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1528 L(shl_14):
1529 lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1530 cmp %rcx, %rdx
1531 movaps -0x0e(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1532 jb L(L14_fwd)  /* rdx < rcx: keep the entry without prefetch */
1533 lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1534 L(L14_fwd):
1535 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1536 jmp *%r9
1537 ud2
1538 L(shl_14_loop_L2):
1539 prefetchnta 0x1c0(%rsi)
1540 L(shl_14_loop_L1):
1541 sub $64, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1542 movaps 0x02(%rsi), %xmm2
1543 movaps 0x12(%rsi), %xmm3
1544 movaps 0x22(%rsi), %xmm4
1545 movaps 0x32(%rsi), %xmm5
1546 movdqa %xmm5, %xmm6  /* keep the unshifted top vector; palignr overwrites xmm5 */
1547 palignr $14, %xmm4, %xmm5
1548 lea 64(%rsi), %rsi
1549 palignr $14, %xmm3, %xmm4
1550 palignr $14, %xmm2, %xmm3
1551 lea 64(%rdi), %rdi
1552 palignr $14, %xmm1, %xmm2  /* xmm2 = the 16 source bytes starting at the pre-lea rsi */
1553 movdqa %xmm6, %xmm1  /* top vector becomes the low half of the next pass */
1554 movdqa %xmm2, -0x40(%rdi)
1555 movaps %xmm3, -0x30(%rdi)
1556 jb L(shl_14_end)  /* borrow: fewer than 64 bytes remain after this pass */
1557 movaps %xmm4, -0x20(%rdi)
1558 movaps %xmm5, -0x10(%rdi)
1559 jmp *%r9
1560 ud2
1561 L(shl_14_end):
1562 movaps %xmm4, -0x20(%rdi)
1563 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1564 movaps %xmm5, -0x10(%rdi)
1565 add %rdx, %rdi  /* move rdi past the remaining bytes; presumably because the tail handlers index backwards */
1566 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1567 add %rdx, %rsi
1568 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1569
1570 .p2align 4
/* Backward copy loop for a source that sits 14 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 14 - 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $14 into four vectors and stores the 64 bytes
   just below the (aligned) rdi.
   NOTE(review): r9 is assumed to hold the address of L(shl_14_bwd) on
   entry, rcx a size threshold that selects the prefetching loop entry,
   and xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1571 L(shl_14_bwd):
1572 lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1573 cmp %rcx, %rdx
1574 movaps -0x0e(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1575 jb L(L14_bwd)  /* rdx < rcx: keep the entry without prefetch */
1576 lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1577 L(L14_bwd):
1578 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1579 jmp *%r9
1580 ud2
1581 L(shl_14_bwd_loop_L2):
1582 prefetchnta -0x1c0(%rsi)
1583 L(shl_14_bwd_loop_L1):
1584 movaps -0x1e(%rsi), %xmm2
1585 sub $0x40, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1586 movaps -0x2e(%rsi), %xmm3
1587 movaps -0x3e(%rsi), %xmm4
1588 movaps -0x4e(%rsi), %xmm5
1589 lea -0x40(%rsi), %rsi
1590 palignr $14, %xmm2, %xmm1  /* xmm1 = the 16 source bytes ending at the pre-lea rsi */
1591 palignr $14, %xmm3, %xmm2
1592 palignr $14, %xmm4, %xmm3
1593 palignr $14, %xmm5, %xmm4
1594
1595 movaps %xmm1, -0x10(%rdi)
1596 movaps %xmm5, %xmm1  /* lowest aligned vector becomes the high half of the next pass */
1597
1598 movaps %xmm2, -0x20(%rdi)
1599 lea -0x40(%rdi), %rdi
1600
1601 movaps %xmm3, 0x10(%rdi)
1602 jb L(shl_14_bwd_end)  /* borrow: fewer than 64 bytes remain after this pass */
1603 movaps %xmm4, (%rdi)
1604 jmp *%r9
1605 ud2
1606 L(shl_14_bwd_end):
1607 movaps %xmm4, (%rdi)
1608 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1609 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1610 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1611
1612 .p2align 4
/* Forward copy loop for a source that sits 15 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 15 + 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $15 into four vectors and stores 64 bytes at the
   (aligned) destination.
   NOTE(review): r9 is assumed to hold the address of L(shl_15) on entry,
   rcx a size threshold that selects the prefetching loop entry, and
   xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1613 L(shl_15):
1614 lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1615 cmp %rcx, %rdx
1616 movaps -0x0f(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1617 jb L(L15_fwd)  /* rdx < rcx: keep the entry without prefetch */
1618 lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1619 L(L15_fwd):
1620 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1621 jmp *%r9
1622 ud2
1623 L(shl_15_loop_L2):
1624 prefetchnta 0x1c0(%rsi)
1625 L(shl_15_loop_L1):
1626 sub $64, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1627 movaps 0x01(%rsi), %xmm2
1628 movaps 0x11(%rsi), %xmm3
1629 movaps 0x21(%rsi), %xmm4
1630 movaps 0x31(%rsi), %xmm5
1631 movdqa %xmm5, %xmm6  /* keep the unshifted top vector; palignr overwrites xmm5 */
1632 palignr $15, %xmm4, %xmm5
1633 lea 64(%rsi), %rsi
1634 palignr $15, %xmm3, %xmm4
1635 palignr $15, %xmm2, %xmm3
1636 lea 64(%rdi), %rdi
1637 palignr $15, %xmm1, %xmm2  /* xmm2 = the 16 source bytes starting at the pre-lea rsi */
1638 movdqa %xmm6, %xmm1  /* top vector becomes the low half of the next pass */
1639 movdqa %xmm2, -0x40(%rdi)
1640 movaps %xmm3, -0x30(%rdi)
1641 jb L(shl_15_end)  /* borrow: fewer than 64 bytes remain after this pass */
1642 movaps %xmm4, -0x20(%rdi)
1643 movaps %xmm5, -0x10(%rdi)
1644 jmp *%r9
1645 ud2
1646 L(shl_15_end):
1647 movaps %xmm4, -0x20(%rdi)
1648 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1649 movaps %xmm5, -0x10(%rdi)
1650 add %rdx, %rdi  /* move rdi past the remaining bytes; presumably because the tail handlers index backwards */
1651 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1652 add %rdx, %rsi
1653 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1654
1655 .p2align 4
/* Backward copy loop for a source that sits 15 bytes past a 16-byte
   boundary: every movaps load below is from rsi - 15 - 16*k, and movaps
   needs an aligned address.  Each pass splices five aligned source
   vectors with palignr $15 into four vectors and stores the 64 bytes
   just below the (aligned) rdi.
   NOTE(review): r9 is assumed to hold the address of L(shl_15_bwd) on
   entry, rcx a size threshold that selects the prefetching loop entry,
   and xmm0/r8 a 16-byte block saved earlier -- all are set outside this
   block; confirm against the dispatcher.  */
1656 L(shl_15_bwd):
1657 lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9  /* advance r9 by the distance to the no-prefetch loop entry */
1658 cmp %rcx, %rdx
1659 movaps -0x0f(%rsi), %xmm1  /* aligned vector straddling rsi; mov does not touch the flags */
1660 jb L(L15_bwd)  /* rdx < rcx: keep the entry without prefetch */
1661 lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9  /* otherwise move r9 on to the prefetching entry */
1662 L(L15_bwd):
1663 lea -64(%rdx), %rdx  /* bias the count by one 64-byte block */
1664 jmp *%r9
1665 ud2
1666 L(shl_15_bwd_loop_L2):
1667 prefetchnta -0x1c0(%rsi)
1668 L(shl_15_bwd_loop_L1):
1669 movaps -0x1f(%rsi), %xmm2
1670 sub $0x40, %rdx  /* the borrow (CF) from this sub is what the jb below tests */
1671 movaps -0x2f(%rsi), %xmm3
1672 movaps -0x3f(%rsi), %xmm4
1673 movaps -0x4f(%rsi), %xmm5
1674 lea -0x40(%rsi), %rsi
1675 palignr $15, %xmm2, %xmm1  /* xmm1 = the 16 source bytes ending at the pre-lea rsi */
1676 palignr $15, %xmm3, %xmm2
1677 palignr $15, %xmm4, %xmm3
1678 palignr $15, %xmm5, %xmm4
1679
1680 movaps %xmm1, -0x10(%rdi)
1681 movaps %xmm5, %xmm1  /* lowest aligned vector becomes the high half of the next pass */
1682
1683 movaps %xmm2, -0x20(%rdi)
1684 lea -0x40(%rdi), %rdi
1685
1686 movaps %xmm3, 0x10(%rdi)
1687 jb L(shl_15_bwd_end)  /* borrow: fewer than 64 bytes remain after this pass */
1688 movaps %xmm4, (%rdi)
1689 jmp *%r9
1690 ud2
1691 L(shl_15_bwd_end):
1692 movaps %xmm4, (%rdi)
1693 lea 64(%rdx), %rdx  /* undo the bias: rdx = bytes still to copy */
1694 movdqu %xmm0, (%r8)  /* unaligned store of the 16 bytes held in xmm0 */
1695 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)  /* dispatch on the remaining count; clobbers r11 and rdx */
1696
1697 .p2align 4
/* Copy the 72 bytes ending at rsi to the 72 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1698 L(write_72bytes):
1699 movdqu -72(%rsi), %xmm0
1700 movdqu -56(%rsi), %xmm1
1701 mov -40(%rsi), %r8
1702 mov -32(%rsi), %r9
1703 mov -24(%rsi), %r10
1704 mov -16(%rsi), %r11
1705 mov -8(%rsi), %rcx
1706 movdqu %xmm0, -72(%rdi)
1707 movdqu %xmm1, -56(%rdi)
1708 mov %r8, -40(%rdi)
1709 mov %r9, -32(%rdi)
1710 mov %r10, -24(%rdi)
1711 mov %r11, -16(%rdi)
1712 mov %rcx, -8(%rdi)
1713 ret
1714
1715 .p2align 4
/* Copy the 64 bytes ending at rsi to the 64 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1716 L(write_64bytes):
1717 movdqu -64(%rsi), %xmm0
1718 mov -48(%rsi), %rcx
1719 mov -40(%rsi), %r8
1720 mov -32(%rsi), %r9
1721 mov -24(%rsi), %r10
1722 mov -16(%rsi), %r11
1723 mov -8(%rsi), %rdx
1724 movdqu %xmm0, -64(%rdi)
1725 mov %rcx, -48(%rdi)
1726 mov %r8, -40(%rdi)
1727 mov %r9, -32(%rdi)
1728 mov %r10, -24(%rdi)
1729 mov %r11, -16(%rdi)
1730 mov %rdx, -8(%rdi)
1731 ret
1732
1733 .p2align 4
/* Copy the 56 bytes ending at rsi to the 56 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1734 L(write_56bytes):
1735 movdqu -56(%rsi), %xmm0
1736 mov -40(%rsi), %r8
1737 mov -32(%rsi), %r9
1738 mov -24(%rsi), %r10
1739 mov -16(%rsi), %r11
1740 mov -8(%rsi), %rcx
1741 movdqu %xmm0, -56(%rdi)
1742 mov %r8, -40(%rdi)
1743 mov %r9, -32(%rdi)
1744 mov %r10, -24(%rdi)
1745 mov %r11, -16(%rdi)
1746 mov %rcx, -8(%rdi)
1747 ret
1748
1749 .p2align 4
/* Copy the 48 bytes ending at rsi to the 48 bytes ending at rdi as six
   qwords; all loads precede the first store.  NOTE(review): presumably a
   target of L(table_less_80bytes), outside this block -- confirm.  */
1750 L(write_48bytes):
1751 mov -48(%rsi), %rcx
1752 mov -40(%rsi), %r8
1753 mov -32(%rsi), %r9
1754 mov -24(%rsi), %r10
1755 mov -16(%rsi), %r11
1756 mov -8(%rsi), %rdx
1757 mov %rcx, -48(%rdi)
1758 mov %r8, -40(%rdi)
1759 mov %r9, -32(%rdi)
1760 mov %r10, -24(%rdi)
1761 mov %r11, -16(%rdi)
1762 mov %rdx, -8(%rdi)
1763 ret
1764
1765 .p2align 4
/* Copy the 40 bytes ending at rsi to the 40 bytes ending at rdi as five
   qwords; all loads precede the first store.  NOTE(review): presumably a
   target of L(table_less_80bytes), outside this block -- confirm.  */
1766 L(write_40bytes):
1767 mov -40(%rsi), %r8
1768 mov -32(%rsi), %r9
1769 mov -24(%rsi), %r10
1770 mov -16(%rsi), %r11
1771 mov -8(%rsi), %rdx
1772 mov %r8, -40(%rdi)
1773 mov %r9, -32(%rdi)
1774 mov %r10, -24(%rdi)
1775 mov %r11, -16(%rdi)
1776 mov %rdx, -8(%rdi)
1777 ret
1778
1779 .p2align 4
/* Copy the 32 bytes ending at rsi to the 32 bytes ending at rdi as four
   qwords; all loads precede the first store.  NOTE(review): presumably a
   target of L(table_less_80bytes), outside this block -- confirm.  */
1780 L(write_32bytes):
1781 mov -32(%rsi), %r9
1782 mov -24(%rsi), %r10
1783 mov -16(%rsi), %r11
1784 mov -8(%rsi), %rdx
1785 mov %r9, -32(%rdi)
1786 mov %r10, -24(%rdi)
1787 mov %r11, -16(%rdi)
1788 mov %rdx, -8(%rdi)
1789 ret
1790
1791 .p2align 4
/* Copy the 24 bytes ending at rsi to the 24 bytes ending at rdi as three
   qwords; all loads precede the first store.  NOTE(review): presumably a
   target of L(table_less_80bytes), outside this block -- confirm.  */
1792 L(write_24bytes):
1793 mov -24(%rsi), %r10
1794 mov -16(%rsi), %r11
1795 mov -8(%rsi), %rdx
1796 mov %r10, -24(%rdi)
1797 mov %r11, -16(%rdi)
1798 mov %rdx, -8(%rdi)
1799 ret
1800
1801 .p2align 4
/* Copy the 16 bytes ending at rsi to the 16 bytes ending at rdi as two
   qwords; both loads precede the first store.  NOTE(review): presumably
   a target of L(table_less_80bytes), outside this block -- confirm.  */
1802 L(write_16bytes):
1803 mov -16(%rsi), %r11
1804 mov -8(%rsi), %rdx
1805 mov %r11, -16(%rdi)
1806 mov %rdx, -8(%rdi)
1807 ret
1808
1809 .p2align 4
/* Copy the 8 bytes ending at rsi to the 8 bytes ending at rdi, then fall
   through into L(write_0bytes), which only returns.  NOTE(review): both
   labels are presumably targets of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1810 L(write_8bytes):
1811 mov -8(%rsi), %rdx
1812 mov %rdx, -8(%rdi)
1813 L(write_0bytes):
1814 ret
1815
1816 .p2align 4
/* Copy the 73 bytes ending at rsi to the 73 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1817 L(write_73bytes):
1818 movdqu -73(%rsi), %xmm0
1819 movdqu -57(%rsi), %xmm1
1820 mov -41(%rsi), %rcx
1821 mov -33(%rsi), %r9
1822 mov -25(%rsi), %r10
1823 mov -17(%rsi), %r11
1824 mov -9(%rsi), %r8
1825 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1826 movdqu %xmm0, -73(%rdi)
1827 movdqu %xmm1, -57(%rdi)
1828 mov %rcx, -41(%rdi)
1829 mov %r9, -33(%rdi)
1830 mov %r10, -25(%rdi)
1831 mov %r11, -17(%rdi)
1832 mov %r8, -9(%rdi)
1833 mov %edx, -4(%rdi)
1834 ret
1835
1836 .p2align 4
/* Copy the 65 bytes ending at rsi to the 65 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1837 L(write_65bytes):
1838 movdqu -65(%rsi), %xmm0
1839 movdqu -49(%rsi), %xmm1
1840 mov -33(%rsi), %r9
1841 mov -25(%rsi), %r10
1842 mov -17(%rsi), %r11
1843 mov -9(%rsi), %rcx
1844 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1845 movdqu %xmm0, -65(%rdi)
1846 movdqu %xmm1, -49(%rdi)
1847 mov %r9, -33(%rdi)
1848 mov %r10, -25(%rdi)
1849 mov %r11, -17(%rdi)
1850 mov %rcx, -9(%rdi)
1851 mov %edx, -4(%rdi)
1852 ret
1853
1854 .p2align 4
/* Copy the 57 bytes ending at rsi to the 57 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1855 L(write_57bytes):
1856 movdqu -57(%rsi), %xmm0
1857 mov -41(%rsi), %r8
1858 mov -33(%rsi), %r9
1859 mov -25(%rsi), %r10
1860 mov -17(%rsi), %r11
1861 mov -9(%rsi), %rcx
1862 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1863 movdqu %xmm0, -57(%rdi)
1864 mov %r8, -41(%rdi)
1865 mov %r9, -33(%rdi)
1866 mov %r10, -25(%rdi)
1867 mov %r11, -17(%rdi)
1868 mov %rcx, -9(%rdi)
1869 mov %edx, -4(%rdi)
1870 ret
1871
1872 .p2align 4
/* Copy the 49 bytes ending at rsi to the 49 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1873 L(write_49bytes):
1874 movdqu -49(%rsi), %xmm0
1875 mov -33(%rsi), %r9
1876 mov -25(%rsi), %r10
1877 mov -17(%rsi), %r11
1878 mov -9(%rsi), %rcx
1879 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1880 movdqu %xmm0, -49(%rdi)
1881 mov %r9, -33(%rdi)
1882 mov %r10, -25(%rdi)
1883 mov %r11, -17(%rdi)
1884 mov %rcx, -9(%rdi)
1885 mov %edx, -4(%rdi)
1886 ret
1887
1888 .p2align 4
/* Copy the 41 bytes ending at rsi to the 41 bytes ending at rdi as five
   qwords plus one byte; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1889 L(write_41bytes):
1890 mov -41(%rsi), %r8
1891 mov -33(%rsi), %r9
1892 mov -25(%rsi), %r10
1893 mov -17(%rsi), %r11
1894 mov -9(%rsi), %rcx
1895 mov -1(%rsi), %dl  /* final single byte */
1896 mov %r8, -41(%rdi)
1897 mov %r9, -33(%rdi)
1898 mov %r10, -25(%rdi)
1899 mov %r11, -17(%rdi)
1900 mov %rcx, -9(%rdi)
1901 mov %dl, -1(%rdi)
1902 ret
1903
1904 .p2align 4
/* Copy the 33 bytes ending at rsi to the 33 bytes ending at rdi as four
   qwords plus one byte; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1905 L(write_33bytes):
1906 mov -33(%rsi), %r9
1907 mov -25(%rsi), %r10
1908 mov -17(%rsi), %r11
1909 mov -9(%rsi), %rcx
1910 mov -1(%rsi), %dl  /* final single byte */
1911 mov %r9, -33(%rdi)
1912 mov %r10, -25(%rdi)
1913 mov %r11, -17(%rdi)
1914 mov %rcx, -9(%rdi)
1915 mov %dl, -1(%rdi)
1916 ret
1917
1918 .p2align 4
/* Copy the 25 bytes ending at rsi to the 25 bytes ending at rdi as three
   qwords plus one byte; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1919 L(write_25bytes):
1920 mov -25(%rsi), %r10
1921 mov -17(%rsi), %r11
1922 mov -9(%rsi), %rcx
1923 mov -1(%rsi), %dl  /* final single byte */
1924 mov %r10, -25(%rdi)
1925 mov %r11, -17(%rdi)
1926 mov %rcx, -9(%rdi)
1927 mov %dl, -1(%rdi)
1928 ret
1929
1930 .p2align 4
/* Copy the 17 bytes ending at rsi to the 17 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1931 L(write_17bytes):
1932 mov -17(%rsi), %r11
1933 mov -9(%rsi), %rcx
1934 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1935 mov %r11, -17(%rdi)
1936 mov %rcx, -9(%rdi)
1937 mov %edx, -4(%rdi)
1938 ret
1939
1940 .p2align 4
/* Copy the 9 bytes ending at rsi to the 9 bytes ending at rdi; both
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1941 L(write_9bytes):
1942 mov -9(%rsi), %rcx
1943 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 3 */
1944 mov %rcx, -9(%rdi)
1945 mov %edx, -4(%rdi)
1946 ret
1947
1948 .p2align 4
/* Copy the single byte just below rsi to the byte just below rdi.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1949 L(write_1bytes):
1950 mov -1(%rsi), %dl
1951 mov %dl, -1(%rdi)
1952 ret
1953
1954 .p2align 4
/* Copy the 74 bytes ending at rsi to the 74 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1955 L(write_74bytes):
1956 movdqu -74(%rsi), %xmm0
1957 movdqu -58(%rsi), %xmm1
1958 mov -42(%rsi), %r8
1959 mov -34(%rsi), %r9
1960 mov -26(%rsi), %r10
1961 mov -18(%rsi), %r11
1962 mov -10(%rsi), %rcx
1963 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
1964 movdqu %xmm0, -74(%rdi)
1965 movdqu %xmm1, -58(%rdi)
1966 mov %r8, -42(%rdi)
1967 mov %r9, -34(%rdi)
1968 mov %r10, -26(%rdi)
1969 mov %r11, -18(%rdi)
1970 mov %rcx, -10(%rdi)
1971 mov %edx, -4(%rdi)
1972 ret
1973
1974 .p2align 4
/* Copy the 66 bytes ending at rsi to the 66 bytes ending at rdi; all
   loads precede the first store.  Coverage, as offsets from the end:
   xmm0 -66..-51, xmm1 -50..-35, r9 -34..-27, r10 -26..-19, r11 -18..-11,
   rcx -10..-3, edx -4..-1.  No separate qword at -42 is needed because
   xmm1 already carries those bytes.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
1975 L(write_66bytes):
1976 movdqu -66(%rsi), %xmm0
1977 movdqu -50(%rsi), %xmm1
1979 mov -34(%rsi), %r9
1980 mov -26(%rsi), %r10
1981 mov -18(%rsi), %r11
1982 mov -10(%rsi), %rcx
1983 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
1984 movdqu %xmm0, -66(%rdi)
1985 movdqu %xmm1, -50(%rdi)
1987 mov %r9, -34(%rdi)
1988 mov %r10, -26(%rdi)
1989 mov %r11, -18(%rdi)
1990 mov %rcx, -10(%rdi)
1991 mov %edx, -4(%rdi)
1992 ret
1993
1994 .p2align 4
/* Copy the 58 bytes ending at rsi to the 58 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
1995 L(write_58bytes):
1996 movdqu -58(%rsi), %xmm1
1997 mov -42(%rsi), %r8
1998 mov -34(%rsi), %r9
1999 mov -26(%rsi), %r10
2000 mov -18(%rsi), %r11
2001 mov -10(%rsi), %rcx
2002 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2003 movdqu %xmm1, -58(%rdi)
2004 mov %r8, -42(%rdi)
2005 mov %r9, -34(%rdi)
2006 mov %r10, -26(%rdi)
2007 mov %r11, -18(%rdi)
2008 mov %rcx, -10(%rdi)
2009 mov %edx, -4(%rdi)
2010 ret
2011
2012 .p2align 4
/* Copy the 50 bytes ending at rsi to the 50 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2013 L(write_50bytes):
2014 movdqu -50(%rsi), %xmm0
2015 mov -34(%rsi), %r9
2016 mov -26(%rsi), %r10
2017 mov -18(%rsi), %r11
2018 mov -10(%rsi), %rcx
2019 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2020 movdqu %xmm0, -50(%rdi)
2021 mov %r9, -34(%rdi)
2022 mov %r10, -26(%rdi)
2023 mov %r11, -18(%rdi)
2024 mov %rcx, -10(%rdi)
2025 mov %edx, -4(%rdi)
2026 ret
2027
2028 .p2align 4
/* Copy the 42 bytes ending at rsi to the 42 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2029 L(write_42bytes):
2030 mov -42(%rsi), %r8
2031 mov -34(%rsi), %r9
2032 mov -26(%rsi), %r10
2033 mov -18(%rsi), %r11
2034 mov -10(%rsi), %rcx
2035 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2036 mov %r8, -42(%rdi)
2037 mov %r9, -34(%rdi)
2038 mov %r10, -26(%rdi)
2039 mov %r11, -18(%rdi)
2040 mov %rcx, -10(%rdi)
2041 mov %edx, -4(%rdi)
2042 ret
2043
2044 .p2align 4
/* Copy the 34 bytes ending at rsi to the 34 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2045 L(write_34bytes):
2046 mov -34(%rsi), %r9
2047 mov -26(%rsi), %r10
2048 mov -18(%rsi), %r11
2049 mov -10(%rsi), %rcx
2050 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2051 mov %r9, -34(%rdi)
2052 mov %r10, -26(%rdi)
2053 mov %r11, -18(%rdi)
2054 mov %rcx, -10(%rdi)
2055 mov %edx, -4(%rdi)
2056 ret
2057
2058 .p2align 4
/* Copy the 26 bytes ending at rsi to the 26 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2059 L(write_26bytes):
2060 mov -26(%rsi), %r10
2061 mov -18(%rsi), %r11
2062 mov -10(%rsi), %rcx
2063 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2064 mov %r10, -26(%rdi)
2065 mov %r11, -18(%rdi)
2066 mov %rcx, -10(%rdi)
2067 mov %edx, -4(%rdi)
2068 ret
2069
2070 .p2align 4
/* Copy the 18 bytes ending at rsi to the 18 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2071 L(write_18bytes):
2072 mov -18(%rsi), %r11
2073 mov -10(%rsi), %rcx
2074 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2075 mov %r11, -18(%rdi)
2076 mov %rcx, -10(%rdi)
2077 mov %edx, -4(%rdi)
2078 ret
2079
2080 .p2align 4
/* Copy the 10 bytes ending at rsi to the 10 bytes ending at rdi; both
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2081 L(write_10bytes):
2082 mov -10(%rsi), %rcx
2083 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 2 */
2084 mov %rcx, -10(%rdi)
2085 mov %edx, -4(%rdi)
2086 ret
2087
2088 .p2align 4
/* Copy the 2 bytes ending at rsi to the 2 bytes ending at rdi with one
   16-bit move.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2089 L(write_2bytes):
2090 mov -2(%rsi), %dx
2091 mov %dx, -2(%rdi)
2092 ret
2093
2094 .p2align 4
/* Copy the 75 bytes ending at rsi to the 75 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2095 L(write_75bytes):
2096 movdqu -75(%rsi), %xmm0
2097 movdqu -59(%rsi), %xmm1
2098 mov -43(%rsi), %r8
2099 mov -35(%rsi), %r9
2100 mov -27(%rsi), %r10
2101 mov -19(%rsi), %r11
2102 mov -11(%rsi), %rcx
2103 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2104 movdqu %xmm0, -75(%rdi)
2105 movdqu %xmm1, -59(%rdi)
2106 mov %r8, -43(%rdi)
2107 mov %r9, -35(%rdi)
2108 mov %r10, -27(%rdi)
2109 mov %r11, -19(%rdi)
2110 mov %rcx, -11(%rdi)
2111 mov %edx, -4(%rdi)
2112 ret
2113
2114 .p2align 4
/* Copy the 67 bytes ending at rsi to the 67 bytes ending at rdi; all
   loads precede the first store.  Coverage, as offsets from the end:
   xmm0 -67..-52, xmm1 -51..-36, r9 -35..-28, r10 -27..-20, r11 -19..-12,
   rcx -11..-4, edx -4..-1.  The two vectors are adjacent, so no qword at
   -43 is needed.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2115 L(write_67bytes):
2116 movdqu -67(%rsi), %xmm0
2117 movdqu -51(%rsi), %xmm1
2119 mov -35(%rsi), %r9
2120 mov -27(%rsi), %r10
2121 mov -19(%rsi), %r11
2122 mov -11(%rsi), %rcx
2123 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2124 movdqu %xmm0, -67(%rdi)
2125 movdqu %xmm1, -51(%rdi)
2127 mov %r9, -35(%rdi)
2128 mov %r10, -27(%rdi)
2129 mov %r11, -19(%rdi)
2130 mov %rcx, -11(%rdi)
2131 mov %edx, -4(%rdi)
2132 ret
2133
2134 .p2align 4
/* Copy the 59 bytes ending at rsi to the 59 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2135 L(write_59bytes):
2136 movdqu -59(%rsi), %xmm0
2137 mov -43(%rsi), %r8
2138 mov -35(%rsi), %r9
2139 mov -27(%rsi), %r10
2140 mov -19(%rsi), %r11
2141 mov -11(%rsi), %rcx
2142 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2143 movdqu %xmm0, -59(%rdi)
2144 mov %r8, -43(%rdi)
2145 mov %r9, -35(%rdi)
2146 mov %r10, -27(%rdi)
2147 mov %r11, -19(%rdi)
2148 mov %rcx, -11(%rdi)
2149 mov %edx, -4(%rdi)
2150 ret
2151
2152 .p2align 4
/* Copy the 51 bytes ending at rsi to the 51 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2153 L(write_51bytes):
2154 movdqu -51(%rsi), %xmm0
2155 mov -35(%rsi), %r9
2156 mov -27(%rsi), %r10
2157 mov -19(%rsi), %r11
2158 mov -11(%rsi), %rcx
2159 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2160 movdqu %xmm0, -51(%rdi)
2161 mov %r9, -35(%rdi)
2162 mov %r10, -27(%rdi)
2163 mov %r11, -19(%rdi)
2164 mov %rcx, -11(%rdi)
2165 mov %edx, -4(%rdi)
2166 ret
2167
2168 .p2align 4
/* Copy the 43 bytes ending at rsi to the 43 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2169 L(write_43bytes):
2170 mov -43(%rsi), %r8
2171 mov -35(%rsi), %r9
2172 mov -27(%rsi), %r10
2173 mov -19(%rsi), %r11
2174 mov -11(%rsi), %rcx
2175 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2176 mov %r8, -43(%rdi)
2177 mov %r9, -35(%rdi)
2178 mov %r10, -27(%rdi)
2179 mov %r11, -19(%rdi)
2180 mov %rcx, -11(%rdi)
2181 mov %edx, -4(%rdi)
2182 ret
2183
2184 .p2align 4
/* Copy the 35 bytes ending at rsi to the 35 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2185 L(write_35bytes):
2186 mov -35(%rsi), %r9
2187 mov -27(%rsi), %r10
2188 mov -19(%rsi), %r11
2189 mov -11(%rsi), %rcx
2190 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2191 mov %r9, -35(%rdi)
2192 mov %r10, -27(%rdi)
2193 mov %r11, -19(%rdi)
2194 mov %rcx, -11(%rdi)
2195 mov %edx, -4(%rdi)
2196 ret
2197
2198 .p2align 4
/* Copy the 27 bytes ending at rsi to the 27 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2199 L(write_27bytes):
2200 mov -27(%rsi), %r10
2201 mov -19(%rsi), %r11
2202 mov -11(%rsi), %rcx
2203 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2204 mov %r10, -27(%rdi)
2205 mov %r11, -19(%rdi)
2206 mov %rcx, -11(%rdi)
2207 mov %edx, -4(%rdi)
2208 ret
2209
2210 .p2align 4
/* Copy the 19 bytes ending at rsi to the 19 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2211 L(write_19bytes):
2212 mov -19(%rsi), %r11
2213 mov -11(%rsi), %rcx
2214 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2215 mov %r11, -19(%rdi)
2216 mov %rcx, -11(%rdi)
2217 mov %edx, -4(%rdi)
2218 ret
2219
2220 .p2align 4
/* Copy the 11 bytes ending at rsi to the 11 bytes ending at rdi; both
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2221 L(write_11bytes):
2222 mov -11(%rsi), %rcx
2223 mov -4(%rsi), %edx  /* last 4 bytes; overlaps the qword above by 1 */
2224 mov %rcx, -11(%rdi)
2225 mov %edx, -4(%rdi)
2226 ret
2227
2228 .p2align 4
/* Copy the 3 bytes ending at rsi to the 3 bytes ending at rdi using two
   16-bit moves that share the middle byte; both loads precede the first
   store.  NOTE(review): presumably a target of L(table_less_80bytes),
   which is outside this block -- confirm.  */
2229 L(write_3bytes):
2230 mov -3(%rsi), %dx
2231 mov -2(%rsi), %cx  /* overlaps the word above by 1 byte */
2232 mov %dx, -3(%rdi)
2233 mov %cx, -2(%rdi)
2234 ret
2235
2236 .p2align 4
/* Copy the 76 bytes ending at rsi to the 76 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2237 L(write_76bytes):
2238 movdqu -76(%rsi), %xmm0
2239 movdqu -60(%rsi), %xmm1
2240 mov -44(%rsi), %r8
2241 mov -36(%rsi), %r9
2242 mov -28(%rsi), %r10
2243 mov -20(%rsi), %r11
2244 mov -12(%rsi), %rcx
2245 mov -4(%rsi), %edx  /* final dword */
2246 movdqu %xmm0, -76(%rdi)
2247 movdqu %xmm1, -60(%rdi)
2248 mov %r8, -44(%rdi)
2249 mov %r9, -36(%rdi)
2250 mov %r10, -28(%rdi)
2251 mov %r11, -20(%rdi)
2252 mov %rcx, -12(%rdi)
2253 mov %edx, -4(%rdi)
2254 ret
2255
2256 .p2align 4
/* Copy the 68 bytes ending at rsi to the 68 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2257 L(write_68bytes):
2258 movdqu -68(%rsi), %xmm0
2259 movdqu -52(%rsi), %xmm1
2260 mov -36(%rsi), %r9
2261 mov -28(%rsi), %r10
2262 mov -20(%rsi), %r11
2263 mov -12(%rsi), %rcx
2264 mov -4(%rsi), %edx  /* final dword */
2265 movdqu %xmm0, -68(%rdi)
2266 movdqu %xmm1, -52(%rdi)
2267 mov %r9, -36(%rdi)
2268 mov %r10, -28(%rdi)
2269 mov %r11, -20(%rdi)
2270 mov %rcx, -12(%rdi)
2271 mov %edx, -4(%rdi)
2272 ret
2273
2274 .p2align 4
/* Copy the 60 bytes ending at rsi to the 60 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2275 L(write_60bytes):
2276 movdqu -60(%rsi), %xmm0
2277 mov -44(%rsi), %r8
2278 mov -36(%rsi), %r9
2279 mov -28(%rsi), %r10
2280 mov -20(%rsi), %r11
2281 mov -12(%rsi), %rcx
2282 mov -4(%rsi), %edx  /* final dword */
2283 movdqu %xmm0, -60(%rdi)
2284 mov %r8, -44(%rdi)
2285 mov %r9, -36(%rdi)
2286 mov %r10, -28(%rdi)
2287 mov %r11, -20(%rdi)
2288 mov %rcx, -12(%rdi)
2289 mov %edx, -4(%rdi)
2290 ret
2291
2292 .p2align 4
/* Copy the 52 bytes ending at rsi to the 52 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2293 L(write_52bytes):
2294 movdqu -52(%rsi), %xmm0
2295 mov -36(%rsi), %r9
2296 mov -28(%rsi), %r10
2297 mov -20(%rsi), %r11
2298 mov -12(%rsi), %rcx
2299 mov -4(%rsi), %edx  /* final dword */
2300 movdqu %xmm0, -52(%rdi)
2301 mov %r9, -36(%rdi)
2302 mov %r10, -28(%rdi)
2303 mov %r11, -20(%rdi)
2304 mov %rcx, -12(%rdi)
2305 mov %edx, -4(%rdi)
2306 ret
2307
2308 .p2align 4
/* Copy the 44 bytes ending at rsi to the 44 bytes ending at rdi as five
   qwords plus a dword; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2309 L(write_44bytes):
2310 mov -44(%rsi), %r8
2311 mov -36(%rsi), %r9
2312 mov -28(%rsi), %r10
2313 mov -20(%rsi), %r11
2314 mov -12(%rsi), %rcx
2315 mov -4(%rsi), %edx  /* final dword */
2316 mov %r8, -44(%rdi)
2317 mov %r9, -36(%rdi)
2318 mov %r10, -28(%rdi)
2319 mov %r11, -20(%rdi)
2320 mov %rcx, -12(%rdi)
2321 mov %edx, -4(%rdi)
2322 ret
2323
2324 .p2align 4
/* Copy the 36 bytes ending at rsi to the 36 bytes ending at rdi as four
   qwords plus a dword; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2325 L(write_36bytes):
2326 mov -36(%rsi), %r9
2327 mov -28(%rsi), %r10
2328 mov -20(%rsi), %r11
2329 mov -12(%rsi), %rcx
2330 mov -4(%rsi), %edx  /* final dword */
2331 mov %r9, -36(%rdi)
2332 mov %r10, -28(%rdi)
2333 mov %r11, -20(%rdi)
2334 mov %rcx, -12(%rdi)
2335 mov %edx, -4(%rdi)
2336 ret
2337
2338 .p2align 4
/* Copy the 28 bytes ending at rsi to the 28 bytes ending at rdi as three
   qwords plus a dword; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2339 L(write_28bytes):
2340 mov -28(%rsi), %r10
2341 mov -20(%rsi), %r11
2342 mov -12(%rsi), %rcx
2343 mov -4(%rsi), %edx  /* final dword */
2344 mov %r10, -28(%rdi)
2345 mov %r11, -20(%rdi)
2346 mov %rcx, -12(%rdi)
2347 mov %edx, -4(%rdi)
2348 ret
2349
2350 .p2align 4
/* Copy the 20 bytes ending at rsi to the 20 bytes ending at rdi as two
   qwords plus a dword; all loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2351 L(write_20bytes):
2352 mov -20(%rsi), %r11
2353 mov -12(%rsi), %rcx
2354 mov -4(%rsi), %edx  /* final dword */
2355 mov %r11, -20(%rdi)
2356 mov %rcx, -12(%rdi)
2357 mov %edx, -4(%rdi)
2358 ret
2359
2360 .p2align 4
/* Copy the 12 bytes ending at rsi to the 12 bytes ending at rdi as one
   qword plus a dword; both loads precede the first store.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2361 L(write_12bytes):
2362 mov -12(%rsi), %rcx
2363 mov -4(%rsi), %edx  /* final dword */
2364 mov %rcx, -12(%rdi)
2365 mov %edx, -4(%rdi)
2366 ret
2367
2368 .p2align 4
/* Copy the 4 bytes ending at rsi to the 4 bytes ending at rdi with one
   32-bit move.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2369 L(write_4bytes):
2370 mov -4(%rsi), %edx
2371 mov %edx, -4(%rdi)
2372 ret
2373
2374 .p2align 4
/* Copy the 77 bytes ending at rsi to the 77 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2375 L(write_77bytes):
2376 movdqu -77(%rsi), %xmm0
2377 movdqu -61(%rsi), %xmm1
2378 mov -45(%rsi), %r8
2379 mov -37(%rsi), %r9
2380 mov -29(%rsi), %r10
2381 mov -21(%rsi), %r11
2382 mov -13(%rsi), %rcx
2383 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2384 movdqu %xmm0, -77(%rdi)
2385 movdqu %xmm1, -61(%rdi)
2386 mov %r8, -45(%rdi)
2387 mov %r9, -37(%rdi)
2388 mov %r10, -29(%rdi)
2389 mov %r11, -21(%rdi)
2390 mov %rcx, -13(%rdi)
2391 mov %rdx, -8(%rdi)
2392 ret
2393
2394 .p2align 4
/* Copy the 69 bytes ending at rsi to the 69 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2395 L(write_69bytes):
2396 movdqu -69(%rsi), %xmm0
2397 movdqu -53(%rsi), %xmm1
2398 mov -37(%rsi), %r9
2399 mov -29(%rsi), %r10
2400 mov -21(%rsi), %r11
2401 mov -13(%rsi), %rcx
2402 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2403 movdqu %xmm0, -69(%rdi)
2404 movdqu %xmm1, -53(%rdi)
2405 mov %r9, -37(%rdi)
2406 mov %r10, -29(%rdi)
2407 mov %r11, -21(%rdi)
2408 mov %rcx, -13(%rdi)
2409 mov %rdx, -8(%rdi)
2410 ret
2411
2412 .p2align 4
/* Copy the 61 bytes ending at rsi to the 61 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2413 L(write_61bytes):
2414 movdqu -61(%rsi), %xmm0
2415 mov -45(%rsi), %r8
2416 mov -37(%rsi), %r9
2417 mov -29(%rsi), %r10
2418 mov -21(%rsi), %r11
2419 mov -13(%rsi), %rcx
2420 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2421 movdqu %xmm0, -61(%rdi)
2422 mov %r8, -45(%rdi)
2423 mov %r9, -37(%rdi)
2424 mov %r10, -29(%rdi)
2425 mov %r11, -21(%rdi)
2426 mov %rcx, -13(%rdi)
2427 mov %rdx, -8(%rdi)
2428 ret
2429
2430 .p2align 4
/* Copy the 53 bytes ending at rsi to the 53 bytes ending at rdi; all
   loads precede the first store.  Coverage, as offsets from the end:
   xmm0 -53..-38, r9 -37..-30, r10 -29..-22, r11 -21..-14, rcx -13..-6,
   rdx -8..-1.  No qword at -45 is loaded: those bytes are already inside
   xmm0 and nothing would store it.
   NOTE(review): presumably a target of L(table_less_80bytes), which is
   outside this block -- confirm.  */
2431 L(write_53bytes):
2432 movdqu -53(%rsi), %xmm0
2434 mov -37(%rsi), %r9
2435 mov -29(%rsi), %r10
2436 mov -21(%rsi), %r11
2437 mov -13(%rsi), %rcx
2438 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2439 movdqu %xmm0, -53(%rdi)
2440 mov %r9, -37(%rdi)
2441 mov %r10, -29(%rdi)
2442 mov %r11, -21(%rdi)
2443 mov %rcx, -13(%rdi)
2444 mov %rdx, -8(%rdi)
2445 ret
2446
2447 .p2align 4
/* Copy the 45 bytes ending at rsi to the 45 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2448 L(write_45bytes):
2449 mov -45(%rsi), %r8
2450 mov -37(%rsi), %r9
2451 mov -29(%rsi), %r10
2452 mov -21(%rsi), %r11
2453 mov -13(%rsi), %rcx
2454 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2455 mov %r8, -45(%rdi)
2456 mov %r9, -37(%rdi)
2457 mov %r10, -29(%rdi)
2458 mov %r11, -21(%rdi)
2459 mov %rcx, -13(%rdi)
2460 mov %rdx, -8(%rdi)
2461 ret
2462
2463 .p2align 4
/* Copy the 37 bytes ending at rsi to the 37 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2464 L(write_37bytes):
2465 mov -37(%rsi), %r9
2466 mov -29(%rsi), %r10
2467 mov -21(%rsi), %r11
2468 mov -13(%rsi), %rcx
2469 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2470 mov %r9, -37(%rdi)
2471 mov %r10, -29(%rdi)
2472 mov %r11, -21(%rdi)
2473 mov %rcx, -13(%rdi)
2474 mov %rdx, -8(%rdi)
2475 ret
2476
2477 .p2align 4
/* Copy the 29 bytes ending at rsi to the 29 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2478 L(write_29bytes):
2479 mov -29(%rsi), %r10
2480 mov -21(%rsi), %r11
2481 mov -13(%rsi), %rcx
2482 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2483 mov %r10, -29(%rdi)
2484 mov %r11, -21(%rdi)
2485 mov %rcx, -13(%rdi)
2486 mov %rdx, -8(%rdi)
2487 ret
2488
2489 .p2align 4
/* Copy the 21 bytes ending at rsi to the 21 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2490 L(write_21bytes):
2491 mov -21(%rsi), %r11
2492 mov -13(%rsi), %rcx
2493 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2494 mov %r11, -21(%rdi)
2495 mov %rcx, -13(%rdi)
2496 mov %rdx, -8(%rdi)
2497 ret
2498
2499 .p2align 4
/* Copy the 13 bytes ending at rsi to the 13 bytes ending at rdi; both
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2500 L(write_13bytes):
2501 mov -13(%rsi), %rcx
2502 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 3 */
2503 mov %rcx, -13(%rdi)
2504 mov %rdx, -8(%rdi)
2505 ret
2506
2507 .p2align 4
/* Copy the 5 bytes ending at rsi to the 5 bytes ending at rdi using two
   32-bit moves; both loads precede the first store.  NOTE(review):
   presumably a target of L(table_less_80bytes), which is outside this
   block -- confirm.  */
2508 L(write_5bytes):
2509 mov -5(%rsi), %edx
2510 mov -4(%rsi), %ecx  /* overlaps the dword above by 3 bytes */
2511 mov %edx, -5(%rdi)
2512 mov %ecx, -4(%rdi)
2513 ret
2514
2515 .p2align 4
/* Copy the 78 bytes ending at rsi to the 78 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2516 L(write_78bytes):
2517 movdqu -78(%rsi), %xmm0
2518 movdqu -62(%rsi), %xmm1
2519 mov -46(%rsi), %r8
2520 mov -38(%rsi), %r9
2521 mov -30(%rsi), %r10
2522 mov -22(%rsi), %r11
2523 mov -14(%rsi), %rcx
2524 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2525 movdqu %xmm0, -78(%rdi)
2526 movdqu %xmm1, -62(%rdi)
2527 mov %r8, -46(%rdi)
2528 mov %r9, -38(%rdi)
2529 mov %r10, -30(%rdi)
2530 mov %r11, -22(%rdi)
2531 mov %rcx, -14(%rdi)
2532 mov %rdx, -8(%rdi)
2533 ret
2534
2535 .p2align 4
/* Copy the 70 bytes ending at rsi to the 70 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2536 L(write_70bytes):
2537 movdqu -70(%rsi), %xmm0
2538 movdqu -54(%rsi), %xmm1
2539 mov -38(%rsi), %r9
2540 mov -30(%rsi), %r10
2541 mov -22(%rsi), %r11
2542 mov -14(%rsi), %rcx
2543 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2544 movdqu %xmm0, -70(%rdi)
2545 movdqu %xmm1, -54(%rdi)
2546 mov %r9, -38(%rdi)
2547 mov %r10, -30(%rdi)
2548 mov %r11, -22(%rdi)
2549 mov %rcx, -14(%rdi)
2550 mov %rdx, -8(%rdi)
2551 ret
2552
2553 .p2align 4
/* Copy the 62 bytes ending at rsi to the 62 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2554 L(write_62bytes):
2555 movdqu -62(%rsi), %xmm0
2556 mov -46(%rsi), %r8
2557 mov -38(%rsi), %r9
2558 mov -30(%rsi), %r10
2559 mov -22(%rsi), %r11
2560 mov -14(%rsi), %rcx
2561 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2562 movdqu %xmm0, -62(%rdi)
2563 mov %r8, -46(%rdi)
2564 mov %r9, -38(%rdi)
2565 mov %r10, -30(%rdi)
2566 mov %r11, -22(%rdi)
2567 mov %rcx, -14(%rdi)
2568 mov %rdx, -8(%rdi)
2569 ret
2570
2571 .p2align 4
/* Copy the 54 bytes ending at rsi to the 54 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2572 L(write_54bytes):
2573 movdqu -54(%rsi), %xmm0
2574 mov -38(%rsi), %r9
2575 mov -30(%rsi), %r10
2576 mov -22(%rsi), %r11
2577 mov -14(%rsi), %rcx
2578 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2579 movdqu %xmm0, -54(%rdi)
2580 mov %r9, -38(%rdi)
2581 mov %r10, -30(%rdi)
2582 mov %r11, -22(%rdi)
2583 mov %rcx, -14(%rdi)
2584 mov %rdx, -8(%rdi)
2585 ret
2586
2587 .p2align 4
/* Copy the 46 bytes ending at rsi to the 46 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2588 L(write_46bytes):
2589 mov -46(%rsi), %r8
2590 mov -38(%rsi), %r9
2591 mov -30(%rsi), %r10
2592 mov -22(%rsi), %r11
2593 mov -14(%rsi), %rcx
2594 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2595 mov %r8, -46(%rdi)
2596 mov %r9, -38(%rdi)
2597 mov %r10, -30(%rdi)
2598 mov %r11, -22(%rdi)
2599 mov %rcx, -14(%rdi)
2600 mov %rdx, -8(%rdi)
2601 ret
2602
2603 .p2align 4
/* Copy the 38 bytes ending at rsi to the 38 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2604 L(write_38bytes):
2605 mov -38(%rsi), %r9
2606 mov -30(%rsi), %r10
2607 mov -22(%rsi), %r11
2608 mov -14(%rsi), %rcx
2609 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2610 mov %r9, -38(%rdi)
2611 mov %r10, -30(%rdi)
2612 mov %r11, -22(%rdi)
2613 mov %rcx, -14(%rdi)
2614 mov %rdx, -8(%rdi)
2615 ret
2616
2617 .p2align 4
/* Copy the 30 bytes ending at rsi to the 30 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2618 L(write_30bytes):
2619 mov -30(%rsi), %r10
2620 mov -22(%rsi), %r11
2621 mov -14(%rsi), %rcx
2622 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2623 mov %r10, -30(%rdi)
2624 mov %r11, -22(%rdi)
2625 mov %rcx, -14(%rdi)
2626 mov %rdx, -8(%rdi)
2627 ret
2628
2629 .p2align 4
/* Copy the 22 bytes ending at rsi to the 22 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2630 L(write_22bytes):
2631 mov -22(%rsi), %r11
2632 mov -14(%rsi), %rcx
2633 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2634 mov %r11, -22(%rdi)
2635 mov %rcx, -14(%rdi)
2636 mov %rdx, -8(%rdi)
2637 ret
2638
2639 .p2align 4
/* Copy the 14 bytes ending at rsi to the 14 bytes ending at rdi; both
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2640 L(write_14bytes):
2641 mov -14(%rsi), %rcx
2642 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 2 */
2643 mov %rcx, -14(%rdi)
2644 mov %rdx, -8(%rdi)
2645 ret
2646
2647 .p2align 4
/* Copy the 6 bytes ending at rsi to the 6 bytes ending at rdi using two
   32-bit moves; both loads precede the first store.  NOTE(review):
   presumably a target of L(table_less_80bytes), which is outside this
   block -- confirm.  */
2648 L(write_6bytes):
2649 mov -6(%rsi), %edx
2650 mov -4(%rsi), %ecx  /* overlaps the dword above by 2 bytes */
2651 mov %edx, -6(%rdi)
2652 mov %ecx, -4(%rdi)
2653 ret
2654
2655 .p2align 4
/* Copy the 79 bytes ending at rsi to the 79 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2656 L(write_79bytes):
2657 movdqu -79(%rsi), %xmm0
2658 movdqu -63(%rsi), %xmm1
2659 mov -47(%rsi), %r8
2660 mov -39(%rsi), %r9
2661 mov -31(%rsi), %r10
2662 mov -23(%rsi), %r11
2663 mov -15(%rsi), %rcx
2664 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 1 */
2665 movdqu %xmm0, -79(%rdi)
2666 movdqu %xmm1, -63(%rdi)
2667 mov %r8, -47(%rdi)
2668 mov %r9, -39(%rdi)
2669 mov %r10, -31(%rdi)
2670 mov %r11, -23(%rdi)
2671 mov %rcx, -15(%rdi)
2672 mov %rdx, -8(%rdi)
2673 ret
2674
2675 .p2align 4
/* Copy the 71 bytes ending at rsi to the 71 bytes ending at rdi; all
   loads precede the first store.  NOTE(review): presumably a target of
   L(table_less_80bytes), which is outside this block -- confirm.  */
2676 L(write_71bytes):
2677 movdqu -71(%rsi), %xmm0
2678 movdqu -55(%rsi), %xmm1
2679 mov -39(%rsi), %r9
2680 mov -31(%rsi), %r10
2681 mov -23(%rsi), %r11
2682 mov -15(%rsi), %rcx
2683 mov -8(%rsi), %rdx  /* last 8 bytes; overlaps the qword above by 1 */
2684 movdqu %xmm0, -71(%rdi)
2685 movdqu %xmm1, -55(%rdi)
2686 mov %r9, -39(%rdi)
2687 mov %r10, -31(%rdi)
2688 mov %r11, -23(%rdi)
2689 mov %rcx, -15(%rdi)
2690 mov %rdx, -8(%rdi)
2691 ret
2692
/* Tail handler for a 63-byte remainder (entry 63 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  One unaligned 16-byte SSE chunk covers offsets
   -63..-48, five 8-byte chunks cover -47..-8, and a final 8-byte
   chunk at -8 completes the copy.  All loads precede all stores.  */
2693 .p2align 4
2694 L(write_63bytes):
2695 movdqu -63(%rsi), %xmm0
2696 mov -47(%rsi), %r8
2697 mov -39(%rsi), %r9
2698 mov -31(%rsi), %r10
2699 mov -23(%rsi), %r11
2700 mov -15(%rsi), %rcx
2701 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2702 movdqu %xmm0, -63(%rdi)
2703 mov %r8, -47(%rdi)
2704 mov %r9, -39(%rdi)
2705 mov %r10, -31(%rdi)
2706 mov %r11, -23(%rdi)
2707 mov %rcx, -15(%rdi)
2708 mov %rdx, -8(%rdi)
2709 ret
2710
/* Tail handler for a 55-byte remainder (entry 55 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  One unaligned 16-byte SSE chunk covers offsets
   -55..-40, four 8-byte chunks cover -39..-8, and a final 8-byte
   chunk at -8 completes the copy.  All loads precede all stores.  */
2711 .p2align 4
2712 L(write_55bytes):
2713 movdqu -55(%rsi), %xmm0
2714 mov -39(%rsi), %r9
2715 mov -31(%rsi), %r10
2716 mov -23(%rsi), %r11
2717 mov -15(%rsi), %rcx
2718 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2719 movdqu %xmm0, -55(%rdi)
2720 mov %r9, -39(%rdi)
2721 mov %r10, -31(%rdi)
2722 mov %r11, -23(%rdi)
2723 mov %rcx, -15(%rdi)
2724 mov %rdx, -8(%rdi)
2725 ret
2726
/* Tail handler for a 47-byte remainder (entry 47 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Five 8-byte chunks cover offsets -47..-8 and a sixth
   chunk at -8 completes the copy.  All loads precede all stores.  */
2727 .p2align 4
2728 L(write_47bytes):
2729 mov -47(%rsi), %r8
2730 mov -39(%rsi), %r9
2731 mov -31(%rsi), %r10
2732 mov -23(%rsi), %r11
2733 mov -15(%rsi), %rcx
2734 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2735 mov %r8, -47(%rdi)
2736 mov %r9, -39(%rdi)
2737 mov %r10, -31(%rdi)
2738 mov %r11, -23(%rdi)
2739 mov %rcx, -15(%rdi)
2740 mov %rdx, -8(%rdi)
2741 ret
2742
/* Tail handler for a 39-byte remainder (entry 39 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Four 8-byte chunks cover offsets -39..-8 and a fifth
   chunk at -8 completes the copy.  All loads precede all stores.  */
2743 .p2align 4
2744 L(write_39bytes):
2745 mov -39(%rsi), %r9
2746 mov -31(%rsi), %r10
2747 mov -23(%rsi), %r11
2748 mov -15(%rsi), %rcx
2749 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2750 mov %r9, -39(%rdi)
2751 mov %r10, -31(%rdi)
2752 mov %r11, -23(%rdi)
2753 mov %rcx, -15(%rdi)
2754 mov %rdx, -8(%rdi)
2755 ret
2756
/* Tail handler for a 31-byte remainder (entry 31 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Three 8-byte chunks cover offsets -31..-8 and a
   fourth chunk at -8 completes the copy.  All loads precede all
   stores.  */
2757 .p2align 4
2758 L(write_31bytes):
2759 mov -31(%rsi), %r10
2760 mov -23(%rsi), %r11
2761 mov -15(%rsi), %rcx
2762 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2763 mov %r10, -31(%rdi)
2764 mov %r11, -23(%rdi)
2765 mov %rcx, -15(%rdi)
2766 mov %rdx, -8(%rdi)
2767 ret
2768
/* Tail handler for a 23-byte remainder (entry 23 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Two 8-byte chunks cover offsets -23..-8 and a third
   chunk at -8 completes the copy.  All loads precede all stores.  */
2769 .p2align 4
2770 L(write_23bytes):
2771 mov -23(%rsi), %r11
2772 mov -15(%rsi), %rcx
2773 mov -8(%rsi), %rdx /* overlaps the -15 chunk by one byte */
2774 mov %r11, -23(%rdi)
2775 mov %rcx, -15(%rdi)
2776 mov %rdx, -8(%rdi)
2777 ret
2778
/* Tail handler for a 15-byte remainder (entry 15 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Two 8-byte moves at -15 and -8 overlap by one byte
   and together cover all 15 bytes.  Both loads precede both
   stores.  */
2779 .p2align 4
2780 L(write_15bytes):
2781 mov -15(%rsi), %rcx
2782 mov -8(%rsi), %rdx
2783 mov %rcx, -15(%rdi)
2784 mov %rdx, -8(%rdi)
2785 ret
2786
/* Tail handler for a 7-byte remainder (entry 7 of
   L(table_less_80bytes)).  %rsi and %rdi point one byte past the end
   of the data.  Uses 32-bit registers: two 4-byte moves at -7 and -4
   overlap by one byte and together cover all 7 bytes.  Both loads
   precede both stores.  */
2787 .p2align 4
2788 L(write_7bytes):
2789 mov -7(%rsi), %edx
2790 mov -4(%rsi), %ecx
2791 mov %edx, -7(%rdi)
2792 mov %ecx, -4(%rdi)
2793 ret
2794
/* Forward (ascending address) copy using non-temporal stores.

   State on entry, as used by the code below:
     %rsi / %rdi  current source / destination pointers.  movntdq
                  faults on a destination that is not 16-byte
                  aligned, so %rdi must already be aligned here
                  (the alignment code is outside this part of the
                  file).
     %xmm0, %r8   a pending 16-byte value and the address it is
                  stored to.  NOTE(review): presumably the unaligned
                  head of the destination saved by the dispatch
                  code - confirm.
     %rdx         assumed to be the number of bytes still to copy
                  - TODO confirm against the dispatch code.
     %rcx         only read in the memmove build.  NOTE(review):
                  looks like a cache-size threshold - confirm.

   The main loop moves 0x80 bytes per iteration, an optional step
   moves 0x40 more, and whatever is left is handed to the write_N
   tail handlers through L(table_less_80bytes).  */
2795 .p2align 4
2796 L(large_page_fwd):
2797 movdqu (%rsi), %xmm1
2798 lea 16(%rsi), %rsi
2799 movdqu %xmm0, (%r8)
2800 movntdq %xmm1, (%rdi)
2801 lea 16(%rdi), %rdi
/* 0x90 = 0x10 for the 16 bytes moved above + 0x80 of bias, so the
   borrow from "sub $0x80" in the loop tells whether another full
   0x80-byte iteration is available.  */
2802 lea -0x90(%rdx), %rdx
2803 #ifdef USE_AS_MEMMOVE
/* %r9 = source - destination.  When that unsigned distance is at
   least the biased count in %rdx, go straight to the non-temporal
   loop.  Otherwise scale %rcx by 4 and take the cached-store loop
   when %rdx is below it.  */
2804 mov %rsi, %r9
2805 sub %rdi, %r9
2806 cmp %rdx, %r9
2807 jae L(memmove_is_memcpy_fwd)
2808 shl $2, %rcx
2809 cmp %rcx, %rdx
2810 jb L(ll_cache_copy_fwd_start)
2811 L(memmove_is_memcpy_fwd):
2812 #endif
2813 L(large_page_loop):
2814 movdqu (%rsi), %xmm0
2815 movdqu 0x10(%rsi), %xmm1
2816 movdqu 0x20(%rsi), %xmm2
2817 movdqu 0x30(%rsi), %xmm3
2818 movdqu 0x40(%rsi), %xmm4
2819 movdqu 0x50(%rsi), %xmm5
2820 movdqu 0x60(%rsi), %xmm6
2821 movdqu 0x70(%rsi), %xmm7
2822 lea 0x80(%rsi), %rsi
2823
/* The carry flag produced here is consumed by the jae below; the
   movntdq and lea instructions in between leave flags untouched.  */
2824 sub $0x80, %rdx
2825 movntdq %xmm0, (%rdi)
2826 movntdq %xmm1, 0x10(%rdi)
2827 movntdq %xmm2, 0x20(%rdi)
2828 movntdq %xmm3, 0x30(%rdi)
2829 movntdq %xmm4, 0x40(%rdi)
2830 movntdq %xmm5, 0x50(%rdi)
2831 movntdq %xmm6, 0x60(%rdi)
2832 movntdq %xmm7, 0x70(%rdi)
2833 lea 0x80(%rdi), %rdi
2834 jae L(large_page_loop)
/* %rdx is still biased by -0x80 here, so the signed test against
   -0x40 asks whether fewer than 0x40 bytes are left.  The lea
   removes the bias without disturbing the flags read by jl.  */
2835 cmp $-0x40, %rdx
2836 lea 0x80(%rdx), %rdx
2837 jl L(large_page_less_64bytes)
2838
2839 movdqu (%rsi), %xmm0
2840 movdqu 0x10(%rsi), %xmm1
2841 movdqu 0x20(%rsi), %xmm2
2842 movdqu 0x30(%rsi), %xmm3
2843 lea 0x40(%rsi), %rsi
2844
2845 movntdq %xmm0, (%rdi)
2846 movntdq %xmm1, 0x10(%rdi)
2847 movntdq %xmm2, 0x20(%rdi)
2848 movntdq %xmm3, 0x30(%rdi)
2849 lea 0x40(%rdi), %rdi
2850 sub $0x40, %rdx
2851 L(large_page_less_64bytes):
/* Advance both pointers past the residual bytes: the write_N
   handlers address the data with negative displacements.  */
2852 add %rdx, %rsi
2853 add %rdx, %rdi
2854 sfence /* order the non-temporal stores issued above */
/* Dispatch on the residual count; the macro overwrites %r11 and
   %rdx.  */
2855 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2856
/* memmove-only forward loop, entered from L(large_page_fwd).  It has
   the same shape as L(large_page_loop) but stores with movaps
   (ordinary aligned stores) instead of movntdq, issues two
   prefetcht0 hints per iteration, and needs no sfence.  %rdx
   arrives biased by -0x80, exactly as in L(large_page_loop).
   %rdi must be 16-byte aligned because movaps faults otherwise.  */
2857 #ifdef USE_AS_MEMMOVE
2858 .p2align 4
2859 L(ll_cache_copy_fwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes ahead of %rsi.  */
2860 prefetcht0 0x1c0(%rsi)
2861 prefetcht0 0x200(%rsi)
2862 movdqu (%rsi), %xmm0
2863 movdqu 0x10(%rsi), %xmm1
2864 movdqu 0x20(%rsi), %xmm2
2865 movdqu 0x30(%rsi), %xmm3
2866 movdqu 0x40(%rsi), %xmm4
2867 movdqu 0x50(%rsi), %xmm5
2868 movdqu 0x60(%rsi), %xmm6
2869 movdqu 0x70(%rsi), %xmm7
2870 lea 0x80(%rsi), %rsi
2871
/* All eight loads are complete before the first store below.  The
   carry flag from this sub is consumed by the jae after the stores;
   movaps and lea do not modify flags.  */
2872 sub $0x80, %rdx
2873 movaps %xmm0, (%rdi)
2874 movaps %xmm1, 0x10(%rdi)
2875 movaps %xmm2, 0x20(%rdi)
2876 movaps %xmm3, 0x30(%rdi)
2877 movaps %xmm4, 0x40(%rdi)
2878 movaps %xmm5, 0x50(%rdi)
2879 movaps %xmm6, 0x60(%rdi)
2880 movaps %xmm7, 0x70(%rdi)
2881 lea 0x80(%rdi), %rdi
2882 jae L(ll_cache_copy_fwd_start)
/* Fewer than 0x40 bytes left?  The lea removes the -0x80 bias and
   keeps the flags from cmp intact for jl.  */
2883 cmp $-0x40, %rdx
2884 lea 0x80(%rdx), %rdx
2885 jl L(large_page_ll_less_fwd_64bytes)
2886
2887 movdqu (%rsi), %xmm0
2888 movdqu 0x10(%rsi), %xmm1
2889 movdqu 0x20(%rsi), %xmm2
2890 movdqu 0x30(%rsi), %xmm3
2891 lea 0x40(%rsi), %rsi
2892
2893 movaps %xmm0, (%rdi)
2894 movaps %xmm1, 0x10(%rdi)
2895 movaps %xmm2, 0x20(%rdi)
2896 movaps %xmm3, 0x30(%rdi)
2897 lea 0x40(%rdi), %rdi
2898 sub $0x40, %rdx
2899 L(large_page_ll_less_fwd_64bytes):
/* Point %rsi and %rdi past the residual bytes, then dispatch to the
   matching write_N handler.  The macro overwrites %r11 and %rdx.  */
2900 add %rdx, %rsi
2901 add %rdx, %rdi
2902 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2903
2904 #endif
/* Backward (descending address) copy using non-temporal stores.

   State on entry, as used by the code below:
     %rsi / %rdi  point one byte past the end of the source and
                  destination data still to be copied; every access
                  uses a negative displacement.  movdqa and movntdq
                  fault on unaligned memory operands, so %rdi must
                  already be 16-byte aligned (established outside
                  this part of the file).
     %xmm0, %r8   a pending 16-byte value and the address it is
                  stored to.  NOTE(review): presumably the unaligned
                  tail of the destination saved by the dispatch
                  code - confirm.
     %rdx         assumed to be the number of bytes still to copy
                  - TODO confirm against the dispatch code.
     %rcx         only read in the memmove build.  NOTE(review):
                  looks like a cache-size threshold - confirm.

   The first 16-byte store uses movdqa, whereas L(large_page_fwd)
   uses movntdq for its first store.  */
2905 .p2align 4
2906 L(large_page_bwd):
2907 movdqu -0x10(%rsi), %xmm1
2908 lea -16(%rsi), %rsi
2909 movdqu %xmm0, (%r8)
2910 movdqa %xmm1, -0x10(%rdi)
2911 lea -16(%rdi), %rdi
/* 0x90 = 0x10 for the 16 bytes moved above + 0x80 of bias, so the
   borrow from "sub $0x80" in the loop tells whether another full
   0x80-byte iteration is available.  */
2912 lea -0x90(%rdx), %rdx
2913 #ifdef USE_AS_MEMMOVE
/* %r9 = destination - source.  When that unsigned distance is at
   least the biased count in %rdx, use the non-temporal loop.
   Otherwise take the cached-store loop when the distance is below
   %rcx.
   NOTE(review): the forward path tests %rdx against %rcx * 4 at
   this point, while this path tests the pointer distance against
   an unscaled %rcx - confirm that the asymmetry is intended.  */
2914 mov %rdi, %r9
2915 sub %rsi, %r9
2916 cmp %rdx, %r9
2917 jae L(memmove_is_memcpy_bwd)
2918 cmp %rcx, %r9
2919 jb L(ll_cache_copy_bwd_start)
2920 L(memmove_is_memcpy_bwd):
2921 #endif
2922 L(large_page_bwd_loop):
2923 movdqu -0x10(%rsi), %xmm0
2924 movdqu -0x20(%rsi), %xmm1
2925 movdqu -0x30(%rsi), %xmm2
2926 movdqu -0x40(%rsi), %xmm3
2927 movdqu -0x50(%rsi), %xmm4
2928 movdqu -0x60(%rsi), %xmm5
2929 movdqu -0x70(%rsi), %xmm6
2930 movdqu -0x80(%rsi), %xmm7
2931 lea -0x80(%rsi), %rsi
2932
/* The carry flag produced here is consumed by the jae below; the
   movntdq and lea instructions in between leave flags untouched.  */
2933 sub $0x80, %rdx
2934 movntdq %xmm0, -0x10(%rdi)
2935 movntdq %xmm1, -0x20(%rdi)
2936 movntdq %xmm2, -0x30(%rdi)
2937 movntdq %xmm3, -0x40(%rdi)
2938 movntdq %xmm4, -0x50(%rdi)
2939 movntdq %xmm5, -0x60(%rdi)
2940 movntdq %xmm6, -0x70(%rdi)
2941 movntdq %xmm7, -0x80(%rdi)
2942 lea -0x80(%rdi), %rdi
2943 jae L(large_page_bwd_loop)
/* %rdx is still biased by -0x80 here, so the signed test against
   -0x40 asks whether fewer than 0x40 bytes are left.  The lea
   removes the bias without disturbing the flags read by jl.  */
2944 cmp $-0x40, %rdx
2945 lea 0x80(%rdx), %rdx
2946 jl L(large_page_less_bwd_64bytes)
2947
2948 movdqu -0x10(%rsi), %xmm0
2949 movdqu -0x20(%rsi), %xmm1
2950 movdqu -0x30(%rsi), %xmm2
2951 movdqu -0x40(%rsi), %xmm3
2952 lea -0x40(%rsi), %rsi
2953
2954 movntdq %xmm0, -0x10(%rdi)
2955 movntdq %xmm1, -0x20(%rdi)
2956 movntdq %xmm2, -0x30(%rdi)
2957 movntdq %xmm3, -0x40(%rdi)
2958 lea -0x40(%rdi), %rdi
2959 sub $0x40, %rdx
2960 L(large_page_less_bwd_64bytes):
/* No pointer adjustment is needed: %rsi and %rdi already sit one
   byte past the residual data, which is what the write_N handlers
   expect.  */
2961 sfence /* order the non-temporal stores issued above */
/* Dispatch on the residual count; the macro overwrites %r11 and
   %rdx.  */
2962 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2963
/* memmove-only backward loop, entered from L(large_page_bwd).  It
   has the same shape as L(large_page_bwd_loop) but stores with
   movaps (ordinary aligned stores) instead of movntdq, issues two
   prefetcht0 hints per iteration, and needs no sfence.  %rdx
   arrives biased by -0x80, and %rsi / %rdi point one byte past the
   end of the data still to be copied.  %rdi must be 16-byte aligned
   because movaps faults otherwise.  */
2964 #ifdef USE_AS_MEMMOVE
2965 .p2align 4
2966 L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below %rsi.  */
2967 prefetcht0 -0x1c0(%rsi)
2968 prefetcht0 -0x200(%rsi)
2969 movdqu -0x10(%rsi), %xmm0
2970 movdqu -0x20(%rsi), %xmm1
2971 movdqu -0x30(%rsi), %xmm2
2972 movdqu -0x40(%rsi), %xmm3
2973 movdqu -0x50(%rsi), %xmm4
2974 movdqu -0x60(%rsi), %xmm5
2975 movdqu -0x70(%rsi), %xmm6
2976 movdqu -0x80(%rsi), %xmm7
2977 lea -0x80(%rsi), %rsi
2978
/* All eight loads are complete before the first store below.  The
   carry flag from this sub is consumed by the jae after the stores;
   movaps and lea do not modify flags.  */
2979 sub $0x80, %rdx
2980 movaps %xmm0, -0x10(%rdi)
2981 movaps %xmm1, -0x20(%rdi)
2982 movaps %xmm2, -0x30(%rdi)
2983 movaps %xmm3, -0x40(%rdi)
2984 movaps %xmm4, -0x50(%rdi)
2985 movaps %xmm5, -0x60(%rdi)
2986 movaps %xmm6, -0x70(%rdi)
2987 movaps %xmm7, -0x80(%rdi)
2988 lea -0x80(%rdi), %rdi
2989 jae L(ll_cache_copy_bwd_start)
/* Fewer than 0x40 bytes left?  The lea removes the -0x80 bias and
   keeps the flags from cmp intact for jl.  */
2990 cmp $-0x40, %rdx
2991 lea 0x80(%rdx), %rdx
2992 jl L(large_page_ll_less_bwd_64bytes)
2993
2994 movdqu -0x10(%rsi), %xmm0
2995 movdqu -0x20(%rsi), %xmm1
2996 movdqu -0x30(%rsi), %xmm2
2997 movdqu -0x40(%rsi), %xmm3
2998 lea -0x40(%rsi), %rsi
2999
3000 movaps %xmm0, -0x10(%rdi)
3001 movaps %xmm1, -0x20(%rdi)
3002 movaps %xmm2, -0x30(%rdi)
3003 movaps %xmm3, -0x40(%rdi)
3004 lea -0x40(%rdi), %rdi
3005 sub $0x40, %rdx
3006 L(large_page_ll_less_bwd_64bytes):
/* %rsi and %rdi already sit one byte past the residual data, so the
   code dispatches directly.  The macro overwrites %r11 and %rdx.  */
3007 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
3008 #endif
3009
3010 END (MEMCPY)
3011
3012 .section .rodata.ssse3,"a",@progbits
/* Jump table for the write_N tail handlers, indexed by the number of
   bytes still to copy (0 to 79).  JMPTBL (I, B) expands to I - B, so
   each entry is a 32-bit offset of the handler relative to the start
   of this table.  BRANCH_TO_JMPTBL_ENTRY sign-extends the selected
   entry, adds the table address and jumps there.  Entry N must stay
   at index N: the dispatch sites pass the raw byte count as the
   index with a scale of 4.  */
3013 .p2align 3
3014 L(table_less_80bytes):
3015 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3016 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3017 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3018 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3019 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3020 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3021 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3022 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3023 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3024 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3025 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3026 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3027 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3028 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3029 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3030 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3031 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3032 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3033 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
3095
/* Jump table with 16 entries, L(shl_0) to L(shl_15).  Each entry is
   a 32-bit offset of the handler relative to the start of this
   table (JMPTBL (I, B) expands to I - B).
   NOTE(review): the dispatch site is outside this part of the file;
   presumably the index is a 0-15 byte misalignment selecting the
   forward copy variant - confirm.  */
3096 .p2align 3
3097 L(shl_table):
3098 .int JMPTBL (L(shl_0), L(shl_table))
3099 .int JMPTBL (L(shl_1), L(shl_table))
3100 .int JMPTBL (L(shl_2), L(shl_table))
3101 .int JMPTBL (L(shl_3), L(shl_table))
3102 .int JMPTBL (L(shl_4), L(shl_table))
3103 .int JMPTBL (L(shl_5), L(shl_table))
3104 .int JMPTBL (L(shl_6), L(shl_table))
3105 .int JMPTBL (L(shl_7), L(shl_table))
3106 .int JMPTBL (L(shl_8), L(shl_table))
3107 .int JMPTBL (L(shl_9), L(shl_table))
3108 .int JMPTBL (L(shl_10), L(shl_table))
3109 .int JMPTBL (L(shl_11), L(shl_table))
3110 .int JMPTBL (L(shl_12), L(shl_table))
3111 .int JMPTBL (L(shl_13), L(shl_table))
3112 .int JMPTBL (L(shl_14), L(shl_table))
3113 .int JMPTBL (L(shl_15), L(shl_table))
3114
/* Jump table with 16 entries, L(shl_0_bwd) to L(shl_15_bwd).  Each
   entry is a 32-bit offset of the handler relative to the start of
   this table (JMPTBL (I, B) expands to I - B).
   NOTE(review): the dispatch site is outside this part of the file;
   presumably the index is a 0-15 byte misalignment selecting the
   backward copy variant - confirm.  */
3115 .p2align 3
3116 L(shl_table_bwd):
3117 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3118 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3119 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3120 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3121 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3122 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3123 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3124 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3125 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3126 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3127 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3128 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3129 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3130 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3131 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3132 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3133
3134 #endif