1 /* memcpy with SSSE3
2 Copyright (C) 2010-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #if IS_IN (libc) \
20 && (defined SHARED \
21 || defined USE_AS_MEMMOVE \
22 || !defined USE_MULTIARCH)
23
24 # include <sysdep.h>
25 # include "asm-syntax.h"
26
27 # ifndef MEMCPY
28 # define MEMCPY __memcpy_ssse3
29 # define MEMCPY_CHK __memcpy_chk_ssse3
30 # endif
31
32 # ifdef USE_AS_BCOPY
33 # define SRC PARMS
34 # define DEST SRC+4
35 # define LEN DEST+4
36 # else
37 # define DEST PARMS
38 # define SRC DEST+4
39 # define LEN SRC+4
40 # endif
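/* Note on the offsets above: after ENTRANCE the first argument sits at
   PARMS(%esp), i.e. past the return address and, in PIC builds, the
   saved EBX.  memcpy takes (dest, src, len) while bcopy takes
   (src, dest, len); that argument order is the only difference between
   the two sets of offsets.  */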
41
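/* PUSH and POP wrap pushl/popl with the matching call-frame-information
   directives so the unwinder still finds the saved register and the
   correct CFA while it is parked on the stack.  */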
42 # define CFI_PUSH(REG) \
43 cfi_adjust_cfa_offset (4); \
44 cfi_rel_offset (REG, 0)
45
46 # define CFI_POP(REG) \
47 cfi_adjust_cfa_offset (-4); \
48 cfi_restore (REG)
49
50 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
51 # define POP(REG) popl REG; CFI_POP (REG)
52
53 # ifdef PIC
54 # define PARMS 8 /* Preserve EBX. */
55 # define ENTRANCE PUSH (%ebx);
56 # define RETURN_END POP (%ebx); ret
57 # define RETURN RETURN_END; CFI_PUSH (%ebx)
58 # define JMPTBL(I, B) I - B
59
60 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
61    jump table with relative offsets.  INDEX is a register containing the
62 index into the jump table. SCALE is the scale of INDEX. */
63
64 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
65     /* First load the PC into EBX.  */				\
66 SETUP_PIC_REG(bx); \
67 /* Get the address of the jump table. */ \
68 addl $(TABLE - .), %ebx; \
69 /* Get the entry and convert the relative offset to the \
70 absolute address. */ \
71 addl (%ebx, INDEX, SCALE), %ebx; \
72     /* We loaded the jump table entry.  Go.  */			\
73 _CET_NOTRACK jmp *%ebx
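/* Sketch of the dispatch above, assuming SETUP_PIC_REG(bx) leaves the
   current PC in EBX:

	ebx  = PC
	ebx += TABLE - PC	now ebx = &TABLE
	ebx += TABLE[INDEX]	each entry holds L(target) - TABLE
	jmp *ebx		lands on L(target)

   so the table only stores position-independent differences.  The
   non-PIC variant below stores absolute addresses and needs just one
   indirect jump.  */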
74 # else
75
76 # define PARMS 4
77 # define ENTRANCE
78 # define RETURN_END ret
79 # define RETURN RETURN_END
80 # define JMPTBL(I, B) I
81
82 /* Branch to an entry in a jump table. TABLE is a jump table with
83    absolute addresses.  INDEX is a register containing the index into the
84 jump table. SCALE is the scale of INDEX. */
85
86 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
87 _CET_NOTRACK jmp *TABLE(, INDEX, SCALE)
88 # endif
89
90 .section .text.ssse3,"ax",@progbits
91 # if !defined USE_AS_BCOPY && defined SHARED
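/* __memcpy_chk (dst, src, len, dstlen): the check only has to make sure
   that the object size the compiler reported (dstlen, at 16(%esp)) is
   not smaller than len (12(%esp)); if it is, __chk_fail aborts,
   otherwise control falls straight through into MEMCPY below.  */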
92 ENTRY (MEMCPY_CHK)
93 movl 12(%esp), %eax
94 cmpl %eax, 16(%esp)
95 jb HIDDEN_JUMPTARGET (__chk_fail)
96 END (MEMCPY_CHK)
97 # endif
98 ENTRY (MEMCPY)
99 ENTRANCE
100 movl LEN(%esp), %ecx
101 movl SRC(%esp), %eax
102 movl DEST(%esp), %edx
103
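/* memmove picks a direction first: dst < src is always safe to copy
   forward, dst == src is a no-op, and for dst > src the copy runs
   backward whenever the ranges overlap (dst < src + len).  Moves of
   fewer than 32 bytes with dst > src go straight to the small
   backward-write table.  */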
104 # ifdef USE_AS_MEMMOVE
105 cmp %eax, %edx
106 jb L(copy_forward)
107 je L(fwd_write_0bytes)
108 cmp $32, %ecx
109 jae L(memmove_bwd)
110 jmp L(bk_write_less32bytes_2)
111
112 .p2align 4
113 L(memmove_bwd):
114 add %ecx, %eax
115 cmp %eax, %edx
116 movl SRC(%esp), %eax
117 jb L(copy_backward)
118
119 L(copy_forward):
120 # endif
121 cmp $48, %ecx
122 jae L(48bytesormore)
123
124 L(fwd_write_less32bytes):
125 # ifndef USE_AS_MEMMOVE
126 cmp %dl, %al
127 jb L(bk_write)
128 # endif
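/* For short copies both pointers are advanced to the end first: the
   L(fwd_write_N) table entries address everything with negative
   offsets from the end, so a single table covers lengths 0-47.  */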
129 add %ecx, %edx
130 add %ecx, %eax
131 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
132 # ifndef USE_AS_MEMMOVE
133 .p2align 4
134 L(bk_write):
135 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
136 # endif
137
138 .p2align 4
139 L(48bytesormore):
140 # ifndef USE_AS_MEMMOVE
141 movlpd (%eax), %xmm0
142 movlpd 8(%eax), %xmm1
143 movlpd %xmm0, (%edx)
144 movlpd %xmm1, 8(%edx)
145 # else
146 movdqu (%eax), %xmm0
147 # endif
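/* Align the destination: %edx is rounded up to the next 16-byte
   boundary, %edi ends up holding minus the number of bytes that
   rounding skipped, and the length and source pointer are adjusted by
   the same amount.  The first 16 destination bytes are already
   covered: stored above for memcpy, kept in %xmm0 for memmove and
   stored at the start of the selected copy loop.  */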
148 PUSH (%edi)
149 movl %edx, %edi
150 and $-16, %edx
151 add $16, %edx
152 sub %edx, %edi
153 add %edi, %ecx
154 sub %edi, %eax
155
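/* Pick a strategy by size: copies of at least half the shared cache
   size take the L(large_page) path further down, tuned for data too
   big to stay in cache; everything else stays on the movdqa / palignr
   paths below.  */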
156 # ifdef SHARED_CACHE_SIZE_HALF
157 cmp $SHARED_CACHE_SIZE_HALF, %ecx
158 # else
159 # ifdef PIC
160 SETUP_PIC_REG(bx)
161 add $_GLOBAL_OFFSET_TABLE_, %ebx
162 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
163 # else
164 cmp __x86_shared_cache_size_half, %ecx
165 # endif
166 # endif
167
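/* The source's offset within its 16-byte chunk (%eax & 15) then picks
   the copy loop: L(shl_0) when the source is already aligned,
   otherwise L(shl_N), which re-joins aligned loads with palignr $N.  */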
168 mov %eax, %edi
169 jae L(large_page)
170 and $0xf, %edi
171 jz L(shl_0)
172 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
173
174 .p2align 4
175 L(shl_0):
176 # ifdef USE_AS_MEMMOVE
177 movl DEST+4(%esp), %edi
178 movdqu %xmm0, (%edi)
179 # endif
180 xor %edi, %edi
181 cmp $127, %ecx
182 ja L(shl_0_gobble)
183 lea -32(%ecx), %ecx
184
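/* At most 127 bytes remain here (biased by -32 above), so the copy is
   simply unrolled four times at 32 bytes each; lea does not touch the
   flags, so each jb still tests the preceding sub.  */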
185 .p2align 4
186 L(shl_0_loop):
187 movdqa (%eax, %edi), %xmm0
188 movdqa 16(%eax, %edi), %xmm1
189 sub $32, %ecx
190 movdqa %xmm0, (%edx, %edi)
191 movdqa %xmm1, 16(%edx, %edi)
192 lea 32(%edi), %edi
193 jb L(shl_0_end)
194
195 movdqa (%eax, %edi), %xmm0
196 movdqa 16(%eax, %edi), %xmm1
197 sub $32, %ecx
198 movdqa %xmm0, (%edx, %edi)
199 movdqa %xmm1, 16(%edx, %edi)
200 lea 32(%edi), %edi
201 jb L(shl_0_end)
202
203 movdqa (%eax, %edi), %xmm0
204 movdqa 16(%eax, %edi), %xmm1
205 sub $32, %ecx
206 movdqa %xmm0, (%edx, %edi)
207 movdqa %xmm1, 16(%edx, %edi)
208 lea 32(%edi), %edi
209 jb L(shl_0_end)
210
211 movdqa (%eax, %edi), %xmm0
212 movdqa 16(%eax, %edi), %xmm1
213 sub $32, %ecx
214 movdqa %xmm0, (%edx, %edi)
215 movdqa %xmm1, 16(%edx, %edi)
216 lea 32(%edi), %edi
217
218 L(shl_0_end):
219 lea 32(%ecx), %ecx
220 add %ecx, %edi
221 add %edi, %edx
222 add %edi, %eax
223 POP (%edi)
224 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
225
226 CFI_PUSH (%edi)
227
228 .p2align 4
229 L(shl_0_gobble):
230 # ifdef DATA_CACHE_SIZE_HALF
231 cmp $DATA_CACHE_SIZE_HALF, %ecx
232 # else
233 # ifdef PIC
234 SETUP_PIC_REG(bx)
235 add $_GLOBAL_OFFSET_TABLE_, %ebx
236 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
237 # else
238 cmp __x86_data_cache_size_half, %ecx
239 # endif
240 # endif
241 POP (%edi)
242 lea -128(%ecx), %ecx
243 jae L(shl_0_gobble_mem_loop)
244
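/* Main cached loop: 128 bytes per iteration while at least 128 bytes
   remain (the count was biased by -128 above), then the leftovers are
   finished in 64/32/16-byte steps and a final jump through the
   small-copy table.  */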
245 .p2align 4
246 L(shl_0_gobble_cache_loop):
247 movdqa (%eax), %xmm0
248 movdqa 0x10(%eax), %xmm1
249 movdqa 0x20(%eax), %xmm2
250 movdqa 0x30(%eax), %xmm3
251 movdqa 0x40(%eax), %xmm4
252 movdqa 0x50(%eax), %xmm5
253 movdqa 0x60(%eax), %xmm6
254 movdqa 0x70(%eax), %xmm7
255 lea 0x80(%eax), %eax
256 sub $128, %ecx
257 movdqa %xmm0, (%edx)
258 movdqa %xmm1, 0x10(%edx)
259 movdqa %xmm2, 0x20(%edx)
260 movdqa %xmm3, 0x30(%edx)
261 movdqa %xmm4, 0x40(%edx)
262 movdqa %xmm5, 0x50(%edx)
263 movdqa %xmm6, 0x60(%edx)
264 movdqa %xmm7, 0x70(%edx)
265 lea 0x80(%edx), %edx
266
267 jae L(shl_0_gobble_cache_loop)
268 cmp $-0x40, %ecx
269 lea 0x80(%ecx), %ecx
270 jl L(shl_0_cache_less_64bytes)
271
272 movdqa (%eax), %xmm0
273 sub $0x40, %ecx
274 movdqa 0x10(%eax), %xmm1
275 movdqa %xmm0, (%edx)
276 movdqa %xmm1, 0x10(%edx)
277 movdqa 0x20(%eax), %xmm0
278 movdqa 0x30(%eax), %xmm1
279 add $0x40, %eax
280 movdqa %xmm0, 0x20(%edx)
281 movdqa %xmm1, 0x30(%edx)
282 add $0x40, %edx
283
284 L(shl_0_cache_less_64bytes):
285 cmp $0x20, %ecx
286 jb L(shl_0_cache_less_32bytes)
287 movdqa (%eax), %xmm0
288 sub $0x20, %ecx
289 movdqa 0x10(%eax), %xmm1
290 add $0x20, %eax
291 movdqa %xmm0, (%edx)
292 movdqa %xmm1, 0x10(%edx)
293 add $0x20, %edx
294
295 L(shl_0_cache_less_32bytes):
296 cmp $0x10, %ecx
297 jb L(shl_0_cache_less_16bytes)
298 sub $0x10, %ecx
299 movdqa (%eax), %xmm0
300 add $0x10, %eax
301 movdqa %xmm0, (%edx)
302 add $0x10, %edx
303
304 L(shl_0_cache_less_16bytes):
305 add %ecx, %edx
306 add %ecx, %eax
307 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
308
309 .p2align 4
310 L(shl_0_gobble_mem_loop):
311 prefetcht0 0x1c0(%eax)
312 prefetcht0 0x280(%eax)
313 prefetcht0 0x1c0(%edx)
314
315 movdqa (%eax), %xmm0
316 movdqa 0x10(%eax), %xmm1
317 movdqa 0x20(%eax), %xmm2
318 movdqa 0x30(%eax), %xmm3
319 movdqa 0x40(%eax), %xmm4
320 movdqa 0x50(%eax), %xmm5
321 movdqa 0x60(%eax), %xmm6
322 movdqa 0x70(%eax), %xmm7
323 lea 0x80(%eax), %eax
324 sub $0x80, %ecx
325 movdqa %xmm0, (%edx)
326 movdqa %xmm1, 0x10(%edx)
327 movdqa %xmm2, 0x20(%edx)
328 movdqa %xmm3, 0x30(%edx)
329 movdqa %xmm4, 0x40(%edx)
330 movdqa %xmm5, 0x50(%edx)
331 movdqa %xmm6, 0x60(%edx)
332 movdqa %xmm7, 0x70(%edx)
333 lea 0x80(%edx), %edx
334
335 jae L(shl_0_gobble_mem_loop)
336 cmp $-0x40, %ecx
337 lea 0x80(%ecx), %ecx
338 jl L(shl_0_mem_less_64bytes)
339
340 movdqa (%eax), %xmm0
341 sub $0x40, %ecx
342 movdqa 0x10(%eax), %xmm1
343
344 movdqa %xmm0, (%edx)
345 movdqa %xmm1, 0x10(%edx)
346
347 movdqa 0x20(%eax), %xmm0
348 movdqa 0x30(%eax), %xmm1
349 add $0x40, %eax
350
351 movdqa %xmm0, 0x20(%edx)
352 movdqa %xmm1, 0x30(%edx)
353 add $0x40, %edx
354
355 L(shl_0_mem_less_64bytes):
356 cmp $0x20, %ecx
357 jb L(shl_0_mem_less_32bytes)
358 movdqa (%eax), %xmm0
359 sub $0x20, %ecx
360 movdqa 0x10(%eax), %xmm1
361 add $0x20, %eax
362 movdqa %xmm0, (%edx)
363 movdqa %xmm1, 0x10(%edx)
364 add $0x20, %edx
365
366 L(shl_0_mem_less_32bytes):
367 cmp $0x10, %ecx
368 jb L(shl_0_mem_less_16bytes)
369 sub $0x10, %ecx
370 movdqa (%eax), %xmm0
371 add $0x10, %eax
372 movdqa %xmm0, (%edx)
373 add $0x10, %edx
374
375 L(shl_0_mem_less_16bytes):
376 add %ecx, %edx
377 add %ecx, %eax
378 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
379
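/* Each L(shl_N) block handles a source that sits N bytes past a
   16-byte boundary.  Only aligned 16-byte loads are issued, and
   neighbouring chunks are re-joined with palignr: palignr $N, %xmmA,
   %xmmB concatenates %xmmB:%xmmA (B high, A low), shifts the 32-byte
   value right by N bytes and keeps the low 16 bytes in %xmmB.
   Illustration for N = 1, in byte indices of the source stream:

	xmm1 = src[-1 .. 14]	aligned load at src - 1
	xmm2 = src[15 .. 30]	aligned load at src + 15
	palignr $1, %xmm1, %xmm2   ->   xmm2 = src[0 .. 15]

   so every 16-byte store receives correctly shifted source bytes while
   both the loads and the stores stay aligned.  */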
380 .p2align 4
381 L(shl_1):
382 # ifndef USE_AS_MEMMOVE
383 movaps -1(%eax), %xmm1
384 # else
385 movl DEST+4(%esp), %edi
386 movaps -1(%eax), %xmm1
387 movdqu %xmm0, (%edi)
388 # endif
389 # ifdef DATA_CACHE_SIZE_HALF
390 cmp $DATA_CACHE_SIZE_HALF, %ecx
391 # else
392 # ifdef PIC
393 SETUP_PIC_REG(bx)
394 add $_GLOBAL_OFFSET_TABLE_, %ebx
395 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
396 # else
397 cmp __x86_data_cache_size_half, %ecx
398 # endif
399 # endif
400 jb L(sh_1_no_prefetch)
401
402 lea -64(%ecx), %ecx
403
404 .p2align 4
405 L(Shl1LoopStart):
406 prefetcht0 0x1c0(%eax)
407 prefetcht0 0x1c0(%edx)
408 movaps 15(%eax), %xmm2
409 movaps 31(%eax), %xmm3
410 movaps 47(%eax), %xmm4
411 movaps 63(%eax), %xmm5
412 movaps %xmm5, %xmm7
413 palignr $1, %xmm4, %xmm5
414 palignr $1, %xmm3, %xmm4
415 movaps %xmm5, 48(%edx)
416 palignr $1, %xmm2, %xmm3
417 lea 64(%eax), %eax
418 palignr $1, %xmm1, %xmm2
419 movaps %xmm4, 32(%edx)
420 movaps %xmm3, 16(%edx)
421 movaps %xmm7, %xmm1
422 movaps %xmm2, (%edx)
423 lea 64(%edx), %edx
424 sub $64, %ecx
425 ja L(Shl1LoopStart)
426
427 L(Shl1LoopLeave):
428 add $32, %ecx
429 jle L(shl_end_0)
430
431 movaps 15(%eax), %xmm2
432 movaps 31(%eax), %xmm3
433 palignr $1, %xmm2, %xmm3
434 palignr $1, %xmm1, %xmm2
435 movaps %xmm2, (%edx)
436 movaps %xmm3, 16(%edx)
437 lea 32(%edx, %ecx), %edx
438 lea 32(%eax, %ecx), %eax
439 POP (%edi)
440 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
441
442 CFI_PUSH (%edi)
443
444 .p2align 4
445 L(sh_1_no_prefetch):
446 lea -32(%ecx), %ecx
447 lea -1(%eax), %eax
448 xor %edi, %edi
449
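/* The no-prefetch loop is unrolled twice; %xmm4 and %xmm1 take turns
   carrying the last 16 loaded bytes into the next half, so the carried
   value never has to be shuffled back at the loop boundary.  */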
450 .p2align 4
451 L(sh_1_no_prefetch_loop):
452 movdqa 16(%eax, %edi), %xmm2
453 sub $32, %ecx
454 movdqa 32(%eax, %edi), %xmm3
455 movdqa %xmm3, %xmm4
456 palignr $1, %xmm2, %xmm3
457 palignr $1, %xmm1, %xmm2
458 lea 32(%edi), %edi
459 movdqa %xmm2, -32(%edx, %edi)
460 movdqa %xmm3, -16(%edx, %edi)
461 jb L(sh_1_end_no_prefetch_loop)
462
463 movdqa 16(%eax, %edi), %xmm2
464 sub $32, %ecx
465 movdqa 32(%eax, %edi), %xmm3
466 movdqa %xmm3, %xmm1
467 palignr $1, %xmm2, %xmm3
468 palignr $1, %xmm4, %xmm2
469 lea 32(%edi), %edi
470 movdqa %xmm2, -32(%edx, %edi)
471 movdqa %xmm3, -16(%edx, %edi)
472 jae L(sh_1_no_prefetch_loop)
473
474 L(sh_1_end_no_prefetch_loop):
475 lea 32(%ecx), %ecx
476 add %ecx, %edi
477 add %edi, %edx
478 lea 1(%edi, %eax), %eax
479 POP (%edi)
480 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
481
482 CFI_PUSH (%edi)
483
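/* L(shl_2) .. L(shl_15) below repeat the L(shl_1) scheme with the load
   offsets and palignr immediates adjusted to the given misalignment;
   only the constants differ.  */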
484 .p2align 4
485 L(shl_2):
486 # ifndef USE_AS_MEMMOVE
487 movaps -2(%eax), %xmm1
488 # else
489 movl DEST+4(%esp), %edi
490 movaps -2(%eax), %xmm1
491 movdqu %xmm0, (%edi)
492 # endif
493 # ifdef DATA_CACHE_SIZE_HALF
494 cmp $DATA_CACHE_SIZE_HALF, %ecx
495 # else
496 # ifdef PIC
497 SETUP_PIC_REG(bx)
498 add $_GLOBAL_OFFSET_TABLE_, %ebx
499 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
500 # else
501 cmp __x86_data_cache_size_half, %ecx
502 # endif
503 # endif
504 jb L(sh_2_no_prefetch)
505
506 lea -64(%ecx), %ecx
507
508 .p2align 4
509 L(Shl2LoopStart):
510 prefetcht0 0x1c0(%eax)
511 prefetcht0 0x1c0(%edx)
512 movaps 14(%eax), %xmm2
513 movaps 30(%eax), %xmm3
514 movaps 46(%eax), %xmm4
515 movaps 62(%eax), %xmm5
516 movaps %xmm5, %xmm7
517 palignr $2, %xmm4, %xmm5
518 palignr $2, %xmm3, %xmm4
519 movaps %xmm5, 48(%edx)
520 palignr $2, %xmm2, %xmm3
521 lea 64(%eax), %eax
522 palignr $2, %xmm1, %xmm2
523 movaps %xmm4, 32(%edx)
524 movaps %xmm3, 16(%edx)
525 movaps %xmm7, %xmm1
526 movaps %xmm2, (%edx)
527 lea 64(%edx), %edx
528 sub $64, %ecx
529 ja L(Shl2LoopStart)
530
531 L(Shl2LoopLeave):
532 add $32, %ecx
533 jle L(shl_end_0)
534
535 movaps 14(%eax), %xmm2
536 movaps 30(%eax), %xmm3
537 palignr $2, %xmm2, %xmm3
538 palignr $2, %xmm1, %xmm2
539 movaps %xmm2, (%edx)
540 movaps %xmm3, 16(%edx)
541 lea 32(%edx, %ecx), %edx
542 lea 32(%eax, %ecx), %eax
543 POP (%edi)
544 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
545
546 CFI_PUSH (%edi)
547
548 .p2align 4
549 L(sh_2_no_prefetch):
550 lea -32(%ecx), %ecx
551 lea -2(%eax), %eax
552 xor %edi, %edi
553
554 .p2align 4
555 L(sh_2_no_prefetch_loop):
556 movdqa 16(%eax, %edi), %xmm2
557 sub $32, %ecx
558 movdqa 32(%eax, %edi), %xmm3
559 movdqa %xmm3, %xmm4
560 palignr $2, %xmm2, %xmm3
561 palignr $2, %xmm1, %xmm2
562 lea 32(%edi), %edi
563 movdqa %xmm2, -32(%edx, %edi)
564 movdqa %xmm3, -16(%edx, %edi)
565 jb L(sh_2_end_no_prefetch_loop)
566
567 movdqa 16(%eax, %edi), %xmm2
568 sub $32, %ecx
569 movdqa 32(%eax, %edi), %xmm3
570 movdqa %xmm3, %xmm1
571 palignr $2, %xmm2, %xmm3
572 palignr $2, %xmm4, %xmm2
573 lea 32(%edi), %edi
574 movdqa %xmm2, -32(%edx, %edi)
575 movdqa %xmm3, -16(%edx, %edi)
576 jae L(sh_2_no_prefetch_loop)
577
578 L(sh_2_end_no_prefetch_loop):
579 lea 32(%ecx), %ecx
580 add %ecx, %edi
581 add %edi, %edx
582 lea 2(%edi, %eax), %eax
583 POP (%edi)
584 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
585
586 CFI_PUSH (%edi)
587
588 .p2align 4
589 L(shl_3):
590 # ifndef USE_AS_MEMMOVE
591 movaps -3(%eax), %xmm1
592 # else
593 movl DEST+4(%esp), %edi
594 movaps -3(%eax), %xmm1
595 movdqu %xmm0, (%edi)
596 # endif
597 # ifdef DATA_CACHE_SIZE_HALF
598 cmp $DATA_CACHE_SIZE_HALF, %ecx
599 # else
600 # ifdef PIC
601 SETUP_PIC_REG(bx)
602 add $_GLOBAL_OFFSET_TABLE_, %ebx
603 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
604 # else
605 cmp __x86_data_cache_size_half, %ecx
606 # endif
607 # endif
608 jb L(sh_3_no_prefetch)
609
610 lea -64(%ecx), %ecx
611
612 .p2align 4
613 L(Shl3LoopStart):
614 prefetcht0 0x1c0(%eax)
615 prefetcht0 0x1c0(%edx)
616 movaps 13(%eax), %xmm2
617 movaps 29(%eax), %xmm3
618 movaps 45(%eax), %xmm4
619 movaps 61(%eax), %xmm5
620 movaps %xmm5, %xmm7
621 palignr $3, %xmm4, %xmm5
622 palignr $3, %xmm3, %xmm4
623 movaps %xmm5, 48(%edx)
624 palignr $3, %xmm2, %xmm3
625 lea 64(%eax), %eax
626 palignr $3, %xmm1, %xmm2
627 movaps %xmm4, 32(%edx)
628 movaps %xmm3, 16(%edx)
629 movaps %xmm7, %xmm1
630 movaps %xmm2, (%edx)
631 lea 64(%edx), %edx
632 sub $64, %ecx
633 ja L(Shl3LoopStart)
634
635 L(Shl3LoopLeave):
636 add $32, %ecx
637 jle L(shl_end_0)
638
639 movaps 13(%eax), %xmm2
640 movaps 29(%eax), %xmm3
641 palignr $3, %xmm2, %xmm3
642 palignr $3, %xmm1, %xmm2
643 movaps %xmm2, (%edx)
644 movaps %xmm3, 16(%edx)
645 lea 32(%edx, %ecx), %edx
646 lea 32(%eax, %ecx), %eax
647 POP (%edi)
648 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
649
650 CFI_PUSH (%edi)
651
652 .p2align 4
653 L(sh_3_no_prefetch):
654 lea -32(%ecx), %ecx
655 lea -3(%eax), %eax
656 xor %edi, %edi
657
658 .p2align 4
659 L(sh_3_no_prefetch_loop):
660 movdqa 16(%eax, %edi), %xmm2
661 sub $32, %ecx
662 movdqa 32(%eax, %edi), %xmm3
663 movdqa %xmm3, %xmm4
664 palignr $3, %xmm2, %xmm3
665 palignr $3, %xmm1, %xmm2
666 lea 32(%edi), %edi
667 movdqa %xmm2, -32(%edx, %edi)
668 movdqa %xmm3, -16(%edx, %edi)
669
670 jb L(sh_3_end_no_prefetch_loop)
671
672 movdqa 16(%eax, %edi), %xmm2
673 sub $32, %ecx
674 movdqa 32(%eax, %edi), %xmm3
675 movdqa %xmm3, %xmm1
676 palignr $3, %xmm2, %xmm3
677 palignr $3, %xmm4, %xmm2
678 lea 32(%edi), %edi
679 movdqa %xmm2, -32(%edx, %edi)
680 movdqa %xmm3, -16(%edx, %edi)
681
682 jae L(sh_3_no_prefetch_loop)
683
684 L(sh_3_end_no_prefetch_loop):
685 lea 32(%ecx), %ecx
686 add %ecx, %edi
687 add %edi, %edx
688 lea 3(%edi, %eax), %eax
689 POP (%edi)
690 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
691
692 CFI_PUSH (%edi)
693
694 .p2align 4
695 L(shl_4):
696 # ifndef USE_AS_MEMMOVE
697 movaps -4(%eax), %xmm1
698 # else
699 movl DEST+4(%esp), %edi
700 movaps -4(%eax), %xmm1
701 movdqu %xmm0, (%edi)
702 # endif
703 # ifdef DATA_CACHE_SIZE_HALF
704 cmp $DATA_CACHE_SIZE_HALF, %ecx
705 # else
706 # ifdef PIC
707 SETUP_PIC_REG(bx)
708 add $_GLOBAL_OFFSET_TABLE_, %ebx
709 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
710 # else
711 cmp __x86_data_cache_size_half, %ecx
712 # endif
713 # endif
714 jb L(sh_4_no_prefetch)
715
716 lea -64(%ecx), %ecx
717
718 .p2align 4
719 L(Shl4LoopStart):
720 prefetcht0 0x1c0(%eax)
721 prefetcht0 0x1c0(%edx)
722 movaps 12(%eax), %xmm2
723 movaps 28(%eax), %xmm3
724 movaps 44(%eax), %xmm4
725 movaps 60(%eax), %xmm5
726 movaps %xmm5, %xmm7
727 palignr $4, %xmm4, %xmm5
728 palignr $4, %xmm3, %xmm4
729 movaps %xmm5, 48(%edx)
730 palignr $4, %xmm2, %xmm3
731 lea 64(%eax), %eax
732 palignr $4, %xmm1, %xmm2
733 movaps %xmm4, 32(%edx)
734 movaps %xmm3, 16(%edx)
735 movaps %xmm7, %xmm1
736 movaps %xmm2, (%edx)
737 lea 64(%edx), %edx
738 sub $64, %ecx
739 ja L(Shl4LoopStart)
740
741 L(Shl4LoopLeave):
742 add $32, %ecx
743 jle L(shl_end_0)
744
745 movaps 12(%eax), %xmm2
746 movaps 28(%eax), %xmm3
747 palignr $4, %xmm2, %xmm3
748 palignr $4, %xmm1, %xmm2
749 movaps %xmm2, (%edx)
750 movaps %xmm3, 16(%edx)
751 lea 32(%edx, %ecx), %edx
752 lea 32(%eax, %ecx), %eax
753 POP (%edi)
754 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
755
756 CFI_PUSH (%edi)
757
758 .p2align 4
759 L(sh_4_no_prefetch):
760 lea -32(%ecx), %ecx
761 lea -4(%eax), %eax
762 xor %edi, %edi
763
764 .p2align 4
765 L(sh_4_no_prefetch_loop):
766 movdqa 16(%eax, %edi), %xmm2
767 sub $32, %ecx
768 movdqa 32(%eax, %edi), %xmm3
769 movdqa %xmm3, %xmm4
770 palignr $4, %xmm2, %xmm3
771 palignr $4, %xmm1, %xmm2
772 lea 32(%edi), %edi
773 movdqa %xmm2, -32(%edx, %edi)
774 movdqa %xmm3, -16(%edx, %edi)
775
776 jb L(sh_4_end_no_prefetch_loop)
777
778 movdqa 16(%eax, %edi), %xmm2
779 sub $32, %ecx
780 movdqa 32(%eax, %edi), %xmm3
781 movdqa %xmm3, %xmm1
782 palignr $4, %xmm2, %xmm3
783 palignr $4, %xmm4, %xmm2
784 lea 32(%edi), %edi
785 movdqa %xmm2, -32(%edx, %edi)
786 movdqa %xmm3, -16(%edx, %edi)
787
788 jae L(sh_4_no_prefetch_loop)
789
790 L(sh_4_end_no_prefetch_loop):
791 lea 32(%ecx), %ecx
792 add %ecx, %edi
793 add %edi, %edx
794 lea 4(%edi, %eax), %eax
795 POP (%edi)
796 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
797
798 CFI_PUSH (%edi)
799
800 .p2align 4
801 L(shl_5):
802 # ifndef USE_AS_MEMMOVE
803 movaps -5(%eax), %xmm1
804 # else
805 movl DEST+4(%esp), %edi
806 movaps -5(%eax), %xmm1
807 movdqu %xmm0, (%edi)
808 # endif
809 # ifdef DATA_CACHE_SIZE_HALF
810 cmp $DATA_CACHE_SIZE_HALF, %ecx
811 # else
812 # ifdef PIC
813 SETUP_PIC_REG(bx)
814 add $_GLOBAL_OFFSET_TABLE_, %ebx
815 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
816 # else
817 cmp __x86_data_cache_size_half, %ecx
818 # endif
819 # endif
820 jb L(sh_5_no_prefetch)
821
822 lea -64(%ecx), %ecx
823
824 .p2align 4
825 L(Shl5LoopStart):
826 prefetcht0 0x1c0(%eax)
827 prefetcht0 0x1c0(%edx)
828 movaps 11(%eax), %xmm2
829 movaps 27(%eax), %xmm3
830 movaps 43(%eax), %xmm4
831 movaps 59(%eax), %xmm5
832 movaps %xmm5, %xmm7
833 palignr $5, %xmm4, %xmm5
834 palignr $5, %xmm3, %xmm4
835 movaps %xmm5, 48(%edx)
836 palignr $5, %xmm2, %xmm3
837 lea 64(%eax), %eax
838 palignr $5, %xmm1, %xmm2
839 movaps %xmm4, 32(%edx)
840 movaps %xmm3, 16(%edx)
841 movaps %xmm7, %xmm1
842 movaps %xmm2, (%edx)
843 lea 64(%edx), %edx
844 sub $64, %ecx
845 ja L(Shl5LoopStart)
846
847 L(Shl5LoopLeave):
848 add $32, %ecx
849 jle L(shl_end_0)
850
851 movaps 11(%eax), %xmm2
852 movaps 27(%eax), %xmm3
853 palignr $5, %xmm2, %xmm3
854 palignr $5, %xmm1, %xmm2
855 movaps %xmm2, (%edx)
856 movaps %xmm3, 16(%edx)
857 lea 32(%edx, %ecx), %edx
858 lea 32(%eax, %ecx), %eax
859 POP (%edi)
860 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
861
862 CFI_PUSH (%edi)
863
864 .p2align 4
865 L(sh_5_no_prefetch):
866 lea -32(%ecx), %ecx
867 lea -5(%eax), %eax
868 xor %edi, %edi
869
870 .p2align 4
871 L(sh_5_no_prefetch_loop):
872 movdqa 16(%eax, %edi), %xmm2
873 sub $32, %ecx
874 movdqa 32(%eax, %edi), %xmm3
875 movdqa %xmm3, %xmm4
876 palignr $5, %xmm2, %xmm3
877 palignr $5, %xmm1, %xmm2
878 lea 32(%edi), %edi
879 movdqa %xmm2, -32(%edx, %edi)
880 movdqa %xmm3, -16(%edx, %edi)
881
882 jb L(sh_5_end_no_prefetch_loop)
883
884 movdqa 16(%eax, %edi), %xmm2
885 sub $32, %ecx
886 movdqa 32(%eax, %edi), %xmm3
887 movdqa %xmm3, %xmm1
888 palignr $5, %xmm2, %xmm3
889 palignr $5, %xmm4, %xmm2
890 lea 32(%edi), %edi
891 movdqa %xmm2, -32(%edx, %edi)
892 movdqa %xmm3, -16(%edx, %edi)
893
894 jae L(sh_5_no_prefetch_loop)
895
896 L(sh_5_end_no_prefetch_loop):
897 lea 32(%ecx), %ecx
898 add %ecx, %edi
899 add %edi, %edx
900 lea 5(%edi, %eax), %eax
901 POP (%edi)
902 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
903
904 CFI_PUSH (%edi)
905
906 .p2align 4
907 L(shl_6):
908 # ifndef USE_AS_MEMMOVE
909 movaps -6(%eax), %xmm1
910 # else
911 movl DEST+4(%esp), %edi
912 movaps -6(%eax), %xmm1
913 movdqu %xmm0, (%edi)
914 # endif
915 # ifdef DATA_CACHE_SIZE_HALF
916 cmp $DATA_CACHE_SIZE_HALF, %ecx
917 # else
918 # ifdef PIC
919 SETUP_PIC_REG(bx)
920 add $_GLOBAL_OFFSET_TABLE_, %ebx
921 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
922 # else
923 cmp __x86_data_cache_size_half, %ecx
924 # endif
925 # endif
926 jb L(sh_6_no_prefetch)
927
928 lea -64(%ecx), %ecx
929
930 .p2align 4
931 L(Shl6LoopStart):
932 prefetcht0 0x1c0(%eax)
933 prefetcht0 0x1c0(%edx)
934 movaps 10(%eax), %xmm2
935 movaps 26(%eax), %xmm3
936 movaps 42(%eax), %xmm4
937 movaps 58(%eax), %xmm5
938 movaps %xmm5, %xmm7
939 palignr $6, %xmm4, %xmm5
940 palignr $6, %xmm3, %xmm4
941 movaps %xmm5, 48(%edx)
942 palignr $6, %xmm2, %xmm3
943 lea 64(%eax), %eax
944 palignr $6, %xmm1, %xmm2
945 movaps %xmm4, 32(%edx)
946 movaps %xmm3, 16(%edx)
947 movaps %xmm7, %xmm1
948 movaps %xmm2, (%edx)
949 lea 64(%edx), %edx
950 sub $64, %ecx
951 ja L(Shl6LoopStart)
952
953 L(Shl6LoopLeave):
954 add $32, %ecx
955 jle L(shl_end_0)
956
957 movaps 10(%eax), %xmm2
958 movaps 26(%eax), %xmm3
959 palignr $6, %xmm2, %xmm3
960 palignr $6, %xmm1, %xmm2
961 movaps %xmm2, (%edx)
962 movaps %xmm3, 16(%edx)
963 lea 32(%edx, %ecx), %edx
964 lea 32(%eax, %ecx), %eax
965 POP (%edi)
966 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
967
968 CFI_PUSH (%edi)
969
970 .p2align 4
971 L(sh_6_no_prefetch):
972 lea -32(%ecx), %ecx
973 lea -6(%eax), %eax
974 xor %edi, %edi
975
976 .p2align 4
977 L(sh_6_no_prefetch_loop):
978 movdqa 16(%eax, %edi), %xmm2
979 sub $32, %ecx
980 movdqa 32(%eax, %edi), %xmm3
981 movdqa %xmm3, %xmm4
982 palignr $6, %xmm2, %xmm3
983 palignr $6, %xmm1, %xmm2
984 lea 32(%edi), %edi
985 movdqa %xmm2, -32(%edx, %edi)
986 movdqa %xmm3, -16(%edx, %edi)
987
988 jb L(sh_6_end_no_prefetch_loop)
989
990 movdqa 16(%eax, %edi), %xmm2
991 sub $32, %ecx
992 movdqa 32(%eax, %edi), %xmm3
993 movdqa %xmm3, %xmm1
994 palignr $6, %xmm2, %xmm3
995 palignr $6, %xmm4, %xmm2
996 lea 32(%edi), %edi
997 movdqa %xmm2, -32(%edx, %edi)
998 movdqa %xmm3, -16(%edx, %edi)
999
1000 jae L(sh_6_no_prefetch_loop)
1001
1002 L(sh_6_end_no_prefetch_loop):
1003 lea 32(%ecx), %ecx
1004 add %ecx, %edi
1005 add %edi, %edx
1006 lea 6(%edi, %eax), %eax
1007 POP (%edi)
1008 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1009
1010 CFI_PUSH (%edi)
1011
1012 .p2align 4
1013 L(shl_7):
1014 # ifndef USE_AS_MEMMOVE
1015 movaps -7(%eax), %xmm1
1016 # else
1017 movl DEST+4(%esp), %edi
1018 movaps -7(%eax), %xmm1
1019 movdqu %xmm0, (%edi)
1020 # endif
1021 # ifdef DATA_CACHE_SIZE_HALF
1022 cmp $DATA_CACHE_SIZE_HALF, %ecx
1023 # else
1024 # ifdef PIC
1025 SETUP_PIC_REG(bx)
1026 add $_GLOBAL_OFFSET_TABLE_, %ebx
1027 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1028 # else
1029 cmp __x86_data_cache_size_half, %ecx
1030 # endif
1031 # endif
1032 jb L(sh_7_no_prefetch)
1033
1034 lea -64(%ecx), %ecx
1035
1036 .p2align 4
1037 L(Shl7LoopStart):
1038 prefetcht0 0x1c0(%eax)
1039 prefetcht0 0x1c0(%edx)
1040 movaps 9(%eax), %xmm2
1041 movaps 25(%eax), %xmm3
1042 movaps 41(%eax), %xmm4
1043 movaps 57(%eax), %xmm5
1044 movaps %xmm5, %xmm7
1045 palignr $7, %xmm4, %xmm5
1046 palignr $7, %xmm3, %xmm4
1047 movaps %xmm5, 48(%edx)
1048 palignr $7, %xmm2, %xmm3
1049 lea 64(%eax), %eax
1050 palignr $7, %xmm1, %xmm2
1051 movaps %xmm4, 32(%edx)
1052 movaps %xmm3, 16(%edx)
1053 movaps %xmm7, %xmm1
1054 movaps %xmm2, (%edx)
1055 lea 64(%edx), %edx
1056 sub $64, %ecx
1057 ja L(Shl7LoopStart)
1058
1059 L(Shl7LoopLeave):
1060 add $32, %ecx
1061 jle L(shl_end_0)
1062
1063 movaps 9(%eax), %xmm2
1064 movaps 25(%eax), %xmm3
1065 palignr $7, %xmm2, %xmm3
1066 palignr $7, %xmm1, %xmm2
1067 movaps %xmm2, (%edx)
1068 movaps %xmm3, 16(%edx)
1069 lea 32(%edx, %ecx), %edx
1070 lea 32(%eax, %ecx), %eax
1071 POP (%edi)
1072 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1073
1074 CFI_PUSH (%edi)
1075
1076 .p2align 4
1077 L(sh_7_no_prefetch):
1078 lea -32(%ecx), %ecx
1079 lea -7(%eax), %eax
1080 xor %edi, %edi
1081
1082 .p2align 4
1083 L(sh_7_no_prefetch_loop):
1084 movdqa 16(%eax, %edi), %xmm2
1085 sub $32, %ecx
1086 movdqa 32(%eax, %edi), %xmm3
1087 movdqa %xmm3, %xmm4
1088 palignr $7, %xmm2, %xmm3
1089 palignr $7, %xmm1, %xmm2
1090 lea 32(%edi), %edi
1091 movdqa %xmm2, -32(%edx, %edi)
1092 movdqa %xmm3, -16(%edx, %edi)
1093 jb L(sh_7_end_no_prefetch_loop)
1094
1095 movdqa 16(%eax, %edi), %xmm2
1096 sub $32, %ecx
1097 movdqa 32(%eax, %edi), %xmm3
1098 movdqa %xmm3, %xmm1
1099 palignr $7, %xmm2, %xmm3
1100 palignr $7, %xmm4, %xmm2
1101 lea 32(%edi), %edi
1102 movdqa %xmm2, -32(%edx, %edi)
1103 movdqa %xmm3, -16(%edx, %edi)
1104 jae L(sh_7_no_prefetch_loop)
1105
1106 L(sh_7_end_no_prefetch_loop):
1107 lea 32(%ecx), %ecx
1108 add %ecx, %edi
1109 add %edi, %edx
1110 lea 7(%edi, %eax), %eax
1111 POP (%edi)
1112 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1113
1114 CFI_PUSH (%edi)
1115
1116 .p2align 4
1117 L(shl_8):
1118 # ifndef USE_AS_MEMMOVE
1119 movaps -8(%eax), %xmm1
1120 # else
1121 movl DEST+4(%esp), %edi
1122 movaps -8(%eax), %xmm1
1123 movdqu %xmm0, (%edi)
1124 # endif
1125 # ifdef DATA_CACHE_SIZE_HALF
1126 cmp $DATA_CACHE_SIZE_HALF, %ecx
1127 # else
1128 # ifdef PIC
1129 SETUP_PIC_REG(bx)
1130 add $_GLOBAL_OFFSET_TABLE_, %ebx
1131 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1132 # else
1133 cmp __x86_data_cache_size_half, %ecx
1134 # endif
1135 # endif
1136 jb L(sh_8_no_prefetch)
1137
1138 lea -64(%ecx), %ecx
1139
1140 .p2align 4
1141 L(Shl8LoopStart):
1142 prefetcht0 0x1c0(%eax)
1143 prefetcht0 0x1c0(%edx)
1144 movaps 8(%eax), %xmm2
1145 movaps 24(%eax), %xmm3
1146 movaps 40(%eax), %xmm4
1147 movaps 56(%eax), %xmm5
1148 movaps %xmm5, %xmm7
1149 palignr $8, %xmm4, %xmm5
1150 palignr $8, %xmm3, %xmm4
1151 movaps %xmm5, 48(%edx)
1152 palignr $8, %xmm2, %xmm3
1153 lea 64(%eax), %eax
1154 palignr $8, %xmm1, %xmm2
1155 movaps %xmm4, 32(%edx)
1156 movaps %xmm3, 16(%edx)
1157 movaps %xmm7, %xmm1
1158 movaps %xmm2, (%edx)
1159 lea 64(%edx), %edx
1160 sub $64, %ecx
1161 ja L(Shl8LoopStart)
1162
1163 L(LoopLeave8):
1164 add $32, %ecx
1165 jle L(shl_end_0)
1166
1167 movaps 8(%eax), %xmm2
1168 movaps 24(%eax), %xmm3
1169 palignr $8, %xmm2, %xmm3
1170 palignr $8, %xmm1, %xmm2
1171 movaps %xmm2, (%edx)
1172 movaps %xmm3, 16(%edx)
1173 lea 32(%edx, %ecx), %edx
1174 lea 32(%eax, %ecx), %eax
1175 POP (%edi)
1176 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1177
1178 CFI_PUSH (%edi)
1179
1180 .p2align 4
1181 L(sh_8_no_prefetch):
1182 lea -32(%ecx), %ecx
1183 lea -8(%eax), %eax
1184 xor %edi, %edi
1185
1186 .p2align 4
1187 L(sh_8_no_prefetch_loop):
1188 movdqa 16(%eax, %edi), %xmm2
1189 sub $32, %ecx
1190 movdqa 32(%eax, %edi), %xmm3
1191 movdqa %xmm3, %xmm4
1192 palignr $8, %xmm2, %xmm3
1193 palignr $8, %xmm1, %xmm2
1194 lea 32(%edi), %edi
1195 movdqa %xmm2, -32(%edx, %edi)
1196 movdqa %xmm3, -16(%edx, %edi)
1197 jb L(sh_8_end_no_prefetch_loop)
1198
1199 movdqa 16(%eax, %edi), %xmm2
1200 sub $32, %ecx
1201 movdqa 32(%eax, %edi), %xmm3
1202 movdqa %xmm3, %xmm1
1203 palignr $8, %xmm2, %xmm3
1204 palignr $8, %xmm4, %xmm2
1205 lea 32(%edi), %edi
1206 movdqa %xmm2, -32(%edx, %edi)
1207 movdqa %xmm3, -16(%edx, %edi)
1208 jae L(sh_8_no_prefetch_loop)
1209
1210 L(sh_8_end_no_prefetch_loop):
1211 lea 32(%ecx), %ecx
1212 add %ecx, %edi
1213 add %edi, %edx
1214 lea 8(%edi, %eax), %eax
1215 POP (%edi)
1216 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1217
1218 CFI_PUSH (%edi)
1219
1220 .p2align 4
1221 L(shl_9):
1222 # ifndef USE_AS_MEMMOVE
1223 movaps -9(%eax), %xmm1
1224 # else
1225 movl DEST+4(%esp), %edi
1226 movaps -9(%eax), %xmm1
1227 movdqu %xmm0, (%edi)
1228 # endif
1229 # ifdef DATA_CACHE_SIZE_HALF
1230 cmp $DATA_CACHE_SIZE_HALF, %ecx
1231 # else
1232 # ifdef PIC
1233 SETUP_PIC_REG(bx)
1234 add $_GLOBAL_OFFSET_TABLE_, %ebx
1235 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1236 # else
1237 cmp __x86_data_cache_size_half, %ecx
1238 # endif
1239 # endif
1240 jb L(sh_9_no_prefetch)
1241
1242 lea -64(%ecx), %ecx
1243
1244 .p2align 4
1245 L(Shl9LoopStart):
1246 prefetcht0 0x1c0(%eax)
1247 prefetcht0 0x1c0(%edx)
1248 movaps 7(%eax), %xmm2
1249 movaps 23(%eax), %xmm3
1250 movaps 39(%eax), %xmm4
1251 movaps 55(%eax), %xmm5
1252 movaps %xmm5, %xmm7
1253 palignr $9, %xmm4, %xmm5
1254 palignr $9, %xmm3, %xmm4
1255 movaps %xmm5, 48(%edx)
1256 palignr $9, %xmm2, %xmm3
1257 lea 64(%eax), %eax
1258 palignr $9, %xmm1, %xmm2
1259 movaps %xmm4, 32(%edx)
1260 movaps %xmm3, 16(%edx)
1261 movaps %xmm7, %xmm1
1262 movaps %xmm2, (%edx)
1263 lea 64(%edx), %edx
1264 sub $64, %ecx
1265 ja L(Shl9LoopStart)
1266
1267 L(Shl9LoopLeave):
1268 add $32, %ecx
1269 jle L(shl_end_0)
1270
1271 movaps 7(%eax), %xmm2
1272 movaps 23(%eax), %xmm3
1273 palignr $9, %xmm2, %xmm3
1274 palignr $9, %xmm1, %xmm2
1275
1276 movaps %xmm2, (%edx)
1277 movaps %xmm3, 16(%edx)
1278 lea 32(%edx, %ecx), %edx
1279 lea 32(%eax, %ecx), %eax
1280 POP (%edi)
1281 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1282
1283 CFI_PUSH (%edi)
1284
1285 .p2align 4
1286 L(sh_9_no_prefetch):
1287 lea -32(%ecx), %ecx
1288 lea -9(%eax), %eax
1289 xor %edi, %edi
1290
1291 .p2align 4
1292 L(sh_9_no_prefetch_loop):
1293 movdqa 16(%eax, %edi), %xmm2
1294 sub $32, %ecx
1295 movdqa 32(%eax, %edi), %xmm3
1296 movdqa %xmm3, %xmm4
1297 palignr $9, %xmm2, %xmm3
1298 palignr $9, %xmm1, %xmm2
1299 lea 32(%edi), %edi
1300 movdqa %xmm2, -32(%edx, %edi)
1301 movdqa %xmm3, -16(%edx, %edi)
1302 jb L(sh_9_end_no_prefetch_loop)
1303
1304 movdqa 16(%eax, %edi), %xmm2
1305 sub $32, %ecx
1306 movdqa 32(%eax, %edi), %xmm3
1307 movdqa %xmm3, %xmm1
1308 palignr $9, %xmm2, %xmm3
1309 palignr $9, %xmm4, %xmm2
1310 lea 32(%edi), %edi
1311 movdqa %xmm2, -32(%edx, %edi)
1312 movdqa %xmm3, -16(%edx, %edi)
1313 jae L(sh_9_no_prefetch_loop)
1314
1315 L(sh_9_end_no_prefetch_loop):
1316 lea 32(%ecx), %ecx
1317 add %ecx, %edi
1318 add %edi, %edx
1319 lea 9(%edi, %eax), %eax
1320 POP (%edi)
1321 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1322
1323 CFI_PUSH (%edi)
1324
1325 .p2align 4
1326 L(shl_10):
1327 # ifndef USE_AS_MEMMOVE
1328 movaps -10(%eax), %xmm1
1329 # else
1330 movl DEST+4(%esp), %edi
1331 movaps -10(%eax), %xmm1
1332 movdqu %xmm0, (%edi)
1333 # endif
1334 # ifdef DATA_CACHE_SIZE_HALF
1335 cmp $DATA_CACHE_SIZE_HALF, %ecx
1336 # else
1337 # ifdef PIC
1338 SETUP_PIC_REG(bx)
1339 add $_GLOBAL_OFFSET_TABLE_, %ebx
1340 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1341 # else
1342 cmp __x86_data_cache_size_half, %ecx
1343 # endif
1344 # endif
1345 jb L(sh_10_no_prefetch)
1346
1347 lea -64(%ecx), %ecx
1348
1349 .p2align 4
1350 L(Shl10LoopStart):
1351 prefetcht0 0x1c0(%eax)
1352 prefetcht0 0x1c0(%edx)
1353 movaps 6(%eax), %xmm2
1354 movaps 22(%eax), %xmm3
1355 movaps 38(%eax), %xmm4
1356 movaps 54(%eax), %xmm5
1357 movaps %xmm5, %xmm7
1358 palignr $10, %xmm4, %xmm5
1359 palignr $10, %xmm3, %xmm4
1360 movaps %xmm5, 48(%edx)
1361 palignr $10, %xmm2, %xmm3
1362 lea 64(%eax), %eax
1363 palignr $10, %xmm1, %xmm2
1364 movaps %xmm4, 32(%edx)
1365 movaps %xmm3, 16(%edx)
1366 movaps %xmm7, %xmm1
1367 movaps %xmm2, (%edx)
1368 lea 64(%edx), %edx
1369 sub $64, %ecx
1370 ja L(Shl10LoopStart)
1371
1372 L(Shl10LoopLeave):
1373 add $32, %ecx
1374 jle L(shl_end_0)
1375
1376 movaps 6(%eax), %xmm2
1377 movaps 22(%eax), %xmm3
1378 palignr $10, %xmm2, %xmm3
1379 palignr $10, %xmm1, %xmm2
1380
1381 movaps %xmm2, (%edx)
1382 movaps %xmm3, 16(%edx)
1383 lea 32(%edx, %ecx), %edx
1384 lea 32(%eax, %ecx), %eax
1385 POP (%edi)
1386 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1387
1388 CFI_PUSH (%edi)
1389
1390 .p2align 4
1391 L(sh_10_no_prefetch):
1392 lea -32(%ecx), %ecx
1393 lea -10(%eax), %eax
1394 xor %edi, %edi
1395
1396 .p2align 4
1397 L(sh_10_no_prefetch_loop):
1398 movdqa 16(%eax, %edi), %xmm2
1399 sub $32, %ecx
1400 movdqa 32(%eax, %edi), %xmm3
1401 movdqa %xmm3, %xmm4
1402 palignr $10, %xmm2, %xmm3
1403 palignr $10, %xmm1, %xmm2
1404 lea 32(%edi), %edi
1405 movdqa %xmm2, -32(%edx, %edi)
1406 movdqa %xmm3, -16(%edx, %edi)
1407 jb L(sh_10_end_no_prefetch_loop)
1408
1409 movdqa 16(%eax, %edi), %xmm2
1410 sub $32, %ecx
1411 movdqa 32(%eax, %edi), %xmm3
1412 movdqa %xmm3, %xmm1
1413 palignr $10, %xmm2, %xmm3
1414 palignr $10, %xmm4, %xmm2
1415 lea 32(%edi), %edi
1416 movdqa %xmm2, -32(%edx, %edi)
1417 movdqa %xmm3, -16(%edx, %edi)
1418 jae L(sh_10_no_prefetch_loop)
1419
1420 L(sh_10_end_no_prefetch_loop):
1421 lea 32(%ecx), %ecx
1422 add %ecx, %edi
1423 add %edi, %edx
1424 lea 10(%edi, %eax), %eax
1425 POP (%edi)
1426 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1427
1428 CFI_PUSH (%edi)
1429
1430 .p2align 4
1431 L(shl_11):
1432 # ifndef USE_AS_MEMMOVE
1433 movaps -11(%eax), %xmm1
1434 # else
1435 movl DEST+4(%esp), %edi
1436 movaps -11(%eax), %xmm1
1437 movdqu %xmm0, (%edi)
1438 # endif
1439 # ifdef DATA_CACHE_SIZE_HALF
1440 cmp $DATA_CACHE_SIZE_HALF, %ecx
1441 # else
1442 # ifdef PIC
1443 SETUP_PIC_REG(bx)
1444 add $_GLOBAL_OFFSET_TABLE_, %ebx
1445 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1446 # else
1447 cmp __x86_data_cache_size_half, %ecx
1448 # endif
1449 # endif
1450 jb L(sh_11_no_prefetch)
1451
1452 lea -64(%ecx), %ecx
1453
1454 .p2align 4
1455 L(Shl11LoopStart):
1456 prefetcht0 0x1c0(%eax)
1457 prefetcht0 0x1c0(%edx)
1458 movaps 5(%eax), %xmm2
1459 movaps 21(%eax), %xmm3
1460 movaps 37(%eax), %xmm4
1461 movaps 53(%eax), %xmm5
1462 movaps %xmm5, %xmm7
1463 palignr $11, %xmm4, %xmm5
1464 palignr $11, %xmm3, %xmm4
1465 movaps %xmm5, 48(%edx)
1466 palignr $11, %xmm2, %xmm3
1467 lea 64(%eax), %eax
1468 palignr $11, %xmm1, %xmm2
1469 movaps %xmm4, 32(%edx)
1470 movaps %xmm3, 16(%edx)
1471 movaps %xmm7, %xmm1
1472 movaps %xmm2, (%edx)
1473 lea 64(%edx), %edx
1474 sub $64, %ecx
1475 ja L(Shl11LoopStart)
1476
1477 L(Shl11LoopLeave):
1478 add $32, %ecx
1479 jle L(shl_end_0)
1480
1481 movaps 5(%eax), %xmm2
1482 movaps 21(%eax), %xmm3
1483 palignr $11, %xmm2, %xmm3
1484 palignr $11, %xmm1, %xmm2
1485
1486 movaps %xmm2, (%edx)
1487 movaps %xmm3, 16(%edx)
1488 lea 32(%edx, %ecx), %edx
1489 lea 32(%eax, %ecx), %eax
1490 POP (%edi)
1491 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1492
1493 CFI_PUSH (%edi)
1494
1495 .p2align 4
1496 L(sh_11_no_prefetch):
1497 lea -32(%ecx), %ecx
1498 lea -11(%eax), %eax
1499 xor %edi, %edi
1500
1501 .p2align 4
1502 L(sh_11_no_prefetch_loop):
1503 movdqa 16(%eax, %edi), %xmm2
1504 sub $32, %ecx
1505 movdqa 32(%eax, %edi), %xmm3
1506 movdqa %xmm3, %xmm4
1507 palignr $11, %xmm2, %xmm3
1508 palignr $11, %xmm1, %xmm2
1509 lea 32(%edi), %edi
1510 movdqa %xmm2, -32(%edx, %edi)
1511 movdqa %xmm3, -16(%edx, %edi)
1512 jb L(sh_11_end_no_prefetch_loop)
1513
1514 movdqa 16(%eax, %edi), %xmm2
1515 sub $32, %ecx
1516 movdqa 32(%eax, %edi), %xmm3
1517 movdqa %xmm3, %xmm1
1518 palignr $11, %xmm2, %xmm3
1519 palignr $11, %xmm4, %xmm2
1520 lea 32(%edi), %edi
1521 movdqa %xmm2, -32(%edx, %edi)
1522 movdqa %xmm3, -16(%edx, %edi)
1523 jae L(sh_11_no_prefetch_loop)
1524
1525 L(sh_11_end_no_prefetch_loop):
1526 lea 32(%ecx), %ecx
1527 add %ecx, %edi
1528 add %edi, %edx
1529 lea 11(%edi, %eax), %eax
1530 POP (%edi)
1531 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1532
1533 CFI_PUSH (%edi)
1534
1535 .p2align 4
1536 L(shl_12):
1537 # ifndef USE_AS_MEMMOVE
1538 movaps -12(%eax), %xmm1
1539 # else
1540 movl DEST+4(%esp), %edi
1541 movaps -12(%eax), %xmm1
1542 movdqu %xmm0, (%edi)
1543 # endif
1544 # ifdef DATA_CACHE_SIZE_HALF
1545 cmp $DATA_CACHE_SIZE_HALF, %ecx
1546 # else
1547 # ifdef PIC
1548 SETUP_PIC_REG(bx)
1549 add $_GLOBAL_OFFSET_TABLE_, %ebx
1550 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1551 # else
1552 cmp __x86_data_cache_size_half, %ecx
1553 # endif
1554 # endif
1555 jb L(sh_12_no_prefetch)
1556
1557 lea -64(%ecx), %ecx
1558
1559 .p2align 4
1560 L(Shl12LoopStart):
1561 prefetcht0 0x1c0(%eax)
1562 prefetcht0 0x1c0(%edx)
1563 movaps 4(%eax), %xmm2
1564 movaps 20(%eax), %xmm3
1565 movaps 36(%eax), %xmm4
1566 movaps 52(%eax), %xmm5
1567 movaps %xmm5, %xmm7
1568 palignr $12, %xmm4, %xmm5
1569 palignr $12, %xmm3, %xmm4
1570 movaps %xmm5, 48(%edx)
1571 palignr $12, %xmm2, %xmm3
1572 lea 64(%eax), %eax
1573 palignr $12, %xmm1, %xmm2
1574 movaps %xmm4, 32(%edx)
1575 movaps %xmm3, 16(%edx)
1576 movaps %xmm7, %xmm1
1577 movaps %xmm2, (%edx)
1578 lea 64(%edx), %edx
1579 sub $64, %ecx
1580 ja L(Shl12LoopStart)
1581
1582 L(Shl12LoopLeave):
1583 add $32, %ecx
1584 jle L(shl_end_0)
1585
1586 movaps 4(%eax), %xmm2
1587 movaps 20(%eax), %xmm3
1588 palignr $12, %xmm2, %xmm3
1589 palignr $12, %xmm1, %xmm2
1590
1591 movaps %xmm2, (%edx)
1592 movaps %xmm3, 16(%edx)
1593 lea 32(%edx, %ecx), %edx
1594 lea 32(%eax, %ecx), %eax
1595 POP (%edi)
1596 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1597
1598 CFI_PUSH (%edi)
1599
1600 .p2align 4
1601 L(sh_12_no_prefetch):
1602 lea -32(%ecx), %ecx
1603 lea -12(%eax), %eax
1604 xor %edi, %edi
1605
1606 .p2align 4
1607 L(sh_12_no_prefetch_loop):
1608 movdqa 16(%eax, %edi), %xmm2
1609 sub $32, %ecx
1610 movdqa 32(%eax, %edi), %xmm3
1611 movdqa %xmm3, %xmm4
1612 palignr $12, %xmm2, %xmm3
1613 palignr $12, %xmm1, %xmm2
1614 lea 32(%edi), %edi
1615 movdqa %xmm2, -32(%edx, %edi)
1616 movdqa %xmm3, -16(%edx, %edi)
1617 jb L(sh_12_end_no_prefetch_loop)
1618
1619 movdqa 16(%eax, %edi), %xmm2
1620 sub $32, %ecx
1621 movdqa 32(%eax, %edi), %xmm3
1622 movdqa %xmm3, %xmm1
1623 palignr $12, %xmm2, %xmm3
1624 palignr $12, %xmm4, %xmm2
1625 lea 32(%edi), %edi
1626 movdqa %xmm2, -32(%edx, %edi)
1627 movdqa %xmm3, -16(%edx, %edi)
1628 jae L(sh_12_no_prefetch_loop)
1629
1630 L(sh_12_end_no_prefetch_loop):
1631 lea 32(%ecx), %ecx
1632 add %ecx, %edi
1633 add %edi, %edx
1634 lea 12(%edi, %eax), %eax
1635 POP (%edi)
1636 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1637
1638 CFI_PUSH (%edi)
1639
1640 .p2align 4
1641 L(shl_13):
1642 # ifndef USE_AS_MEMMOVE
1643 movaps -13(%eax), %xmm1
1644 # else
1645 movl DEST+4(%esp), %edi
1646 movaps -13(%eax), %xmm1
1647 movdqu %xmm0, (%edi)
1648 # endif
1649 # ifdef DATA_CACHE_SIZE_HALF
1650 cmp $DATA_CACHE_SIZE_HALF, %ecx
1651 # else
1652 # ifdef PIC
1653 SETUP_PIC_REG(bx)
1654 add $_GLOBAL_OFFSET_TABLE_, %ebx
1655 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1656 # else
1657 cmp __x86_data_cache_size_half, %ecx
1658 # endif
1659 # endif
1660 jb L(sh_13_no_prefetch)
1661
1662 lea -64(%ecx), %ecx
1663
1664 .p2align 4
1665 L(Shl13LoopStart):
1666 prefetcht0 0x1c0(%eax)
1667 prefetcht0 0x1c0(%edx)
1668 movaps 3(%eax), %xmm2
1669 movaps 19(%eax), %xmm3
1670 movaps 35(%eax), %xmm4
1671 movaps 51(%eax), %xmm5
1672 movaps %xmm5, %xmm7
1673 palignr $13, %xmm4, %xmm5
1674 palignr $13, %xmm3, %xmm4
1675 movaps %xmm5, 48(%edx)
1676 palignr $13, %xmm2, %xmm3
1677 lea 64(%eax), %eax
1678 palignr $13, %xmm1, %xmm2
1679 movaps %xmm4, 32(%edx)
1680 movaps %xmm3, 16(%edx)
1681 movaps %xmm7, %xmm1
1682 movaps %xmm2, (%edx)
1683 lea 64(%edx), %edx
1684 sub $64, %ecx
1685 ja L(Shl13LoopStart)
1686
1687 L(Shl13LoopLeave):
1688 add $32, %ecx
1689 jle L(shl_end_0)
1690
1691 movaps 3(%eax), %xmm2
1692 movaps 19(%eax), %xmm3
1693 palignr $13, %xmm2, %xmm3
1694 palignr $13, %xmm1, %xmm2
1695
1696 movaps %xmm2, (%edx)
1697 movaps %xmm3, 16(%edx)
1698 lea 32(%edx, %ecx), %edx
1699 lea 32(%eax, %ecx), %eax
1700 POP (%edi)
1701 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1702
1703 CFI_PUSH (%edi)
1704
1705 .p2align 4
1706 L(sh_13_no_prefetch):
1707 lea -32(%ecx), %ecx
1708 lea -13(%eax), %eax
1709 xor %edi, %edi
1710
1711 .p2align 4
1712 L(sh_13_no_prefetch_loop):
1713 movdqa 16(%eax, %edi), %xmm2
1714 sub $32, %ecx
1715 movdqa 32(%eax, %edi), %xmm3
1716 movdqa %xmm3, %xmm4
1717 palignr $13, %xmm2, %xmm3
1718 palignr $13, %xmm1, %xmm2
1719 lea 32(%edi), %edi
1720 movdqa %xmm2, -32(%edx, %edi)
1721 movdqa %xmm3, -16(%edx, %edi)
1722 jb L(sh_13_end_no_prefetch_loop)
1723
1724 movdqa 16(%eax, %edi), %xmm2
1725 sub $32, %ecx
1726 movdqa 32(%eax, %edi), %xmm3
1727 movdqa %xmm3, %xmm1
1728 palignr $13, %xmm2, %xmm3
1729 palignr $13, %xmm4, %xmm2
1730 lea 32(%edi), %edi
1731 movdqa %xmm2, -32(%edx, %edi)
1732 movdqa %xmm3, -16(%edx, %edi)
1733 jae L(sh_13_no_prefetch_loop)
1734
1735 L(sh_13_end_no_prefetch_loop):
1736 lea 32(%ecx), %ecx
1737 add %ecx, %edi
1738 add %edi, %edx
1739 lea 13(%edi, %eax), %eax
1740 POP (%edi)
1741 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1742
1743 CFI_PUSH (%edi)
1744
1745 .p2align 4
1746 L(shl_14):
1747 # ifndef USE_AS_MEMMOVE
1748 movaps -14(%eax), %xmm1
1749 # else
1750 movl DEST+4(%esp), %edi
1751 movaps -14(%eax), %xmm1
1752 movdqu %xmm0, (%edi)
1753 # endif
1754 # ifdef DATA_CACHE_SIZE_HALF
1755 cmp $DATA_CACHE_SIZE_HALF, %ecx
1756 # else
1757 # ifdef PIC
1758 SETUP_PIC_REG(bx)
1759 add $_GLOBAL_OFFSET_TABLE_, %ebx
1760 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1761 # else
1762 cmp __x86_data_cache_size_half, %ecx
1763 # endif
1764 # endif
1765 jb L(sh_14_no_prefetch)
1766
1767 lea -64(%ecx), %ecx
1768
1769 .p2align 4
1770 L(Shl14LoopStart):
1771 prefetcht0 0x1c0(%eax)
1772 prefetcht0 0x1c0(%edx)
1773 movaps 2(%eax), %xmm2
1774 movaps 18(%eax), %xmm3
1775 movaps 34(%eax), %xmm4
1776 movaps 50(%eax), %xmm5
1777 movaps %xmm5, %xmm7
1778 palignr $14, %xmm4, %xmm5
1779 palignr $14, %xmm3, %xmm4
1780 movaps %xmm5, 48(%edx)
1781 palignr $14, %xmm2, %xmm3
1782 lea 64(%eax), %eax
1783 palignr $14, %xmm1, %xmm2
1784 movaps %xmm4, 32(%edx)
1785 movaps %xmm3, 16(%edx)
1786 movaps %xmm7, %xmm1
1787 movaps %xmm2, (%edx)
1788 lea 64(%edx), %edx
1789 sub $64, %ecx
1790 ja L(Shl14LoopStart)
1791
1792 L(Shl14LoopLeave):
1793 add $32, %ecx
1794 jle L(shl_end_0)
1795
1796 movaps 2(%eax), %xmm2
1797 movaps 18(%eax), %xmm3
1798 palignr $14, %xmm2, %xmm3
1799 palignr $14, %xmm1, %xmm2
1800
1801 movaps %xmm2, (%edx)
1802 movaps %xmm3, 16(%edx)
1803 lea 32(%edx, %ecx), %edx
1804 lea 32(%eax, %ecx), %eax
1805 POP (%edi)
1806 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1807
1808 CFI_PUSH (%edi)
1809
1810 .p2align 4
1811 L(sh_14_no_prefetch):
1812 lea -32(%ecx), %ecx
1813 lea -14(%eax), %eax
1814 xor %edi, %edi
1815
1816 .p2align 4
1817 L(sh_14_no_prefetch_loop):
1818 movdqa 16(%eax, %edi), %xmm2
1819 sub $32, %ecx
1820 movdqa 32(%eax, %edi), %xmm3
1821 movdqa %xmm3, %xmm4
1822 palignr $14, %xmm2, %xmm3
1823 palignr $14, %xmm1, %xmm2
1824 lea 32(%edi), %edi
1825 movdqa %xmm2, -32(%edx, %edi)
1826 movdqa %xmm3, -16(%edx, %edi)
1827 jb L(sh_14_end_no_prefetch_loop)
1828
1829 movdqa 16(%eax, %edi), %xmm2
1830 sub $32, %ecx
1831 movdqa 32(%eax, %edi), %xmm3
1832 movdqa %xmm3, %xmm1
1833 palignr $14, %xmm2, %xmm3
1834 palignr $14, %xmm4, %xmm2
1835 lea 32(%edi), %edi
1836 movdqa %xmm2, -32(%edx, %edi)
1837 movdqa %xmm3, -16(%edx, %edi)
1838 jae L(sh_14_no_prefetch_loop)
1839
1840 L(sh_14_end_no_prefetch_loop):
1841 lea 32(%ecx), %ecx
1842 add %ecx, %edi
1843 add %edi, %edx
1844 lea 14(%edi, %eax), %eax
1845 POP (%edi)
1846 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1847
1848 CFI_PUSH (%edi)
1849
1850 .p2align 4
1851 L(shl_15):
1852 # ifndef USE_AS_MEMMOVE
1853 movaps -15(%eax), %xmm1
1854 # else
1855 movl DEST+4(%esp), %edi
1856 movaps -15(%eax), %xmm1
1857 movdqu %xmm0, (%edi)
1858 # endif
1859 # ifdef DATA_CACHE_SIZE_HALF
1860 cmp $DATA_CACHE_SIZE_HALF, %ecx
1861 # else
1862 # ifdef PIC
1863 SETUP_PIC_REG(bx)
1864 add $_GLOBAL_OFFSET_TABLE_, %ebx
1865 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1866 # else
1867 cmp __x86_data_cache_size_half, %ecx
1868 # endif
1869 # endif
1870 jb L(sh_15_no_prefetch)
1871
1872 lea -64(%ecx), %ecx
1873
1874 .p2align 4
1875 L(Shl15LoopStart):
1876 prefetcht0 0x1c0(%eax)
1877 prefetcht0 0x1c0(%edx)
1878 movaps 1(%eax), %xmm2
1879 movaps 17(%eax), %xmm3
1880 movaps 33(%eax), %xmm4
1881 movaps 49(%eax), %xmm5
1882 movaps %xmm5, %xmm7
1883 palignr $15, %xmm4, %xmm5
1884 palignr $15, %xmm3, %xmm4
1885 movaps %xmm5, 48(%edx)
1886 palignr $15, %xmm2, %xmm3
1887 lea 64(%eax), %eax
1888 palignr $15, %xmm1, %xmm2
1889 movaps %xmm4, 32(%edx)
1890 movaps %xmm3, 16(%edx)
1891 movaps %xmm7, %xmm1
1892 movaps %xmm2, (%edx)
1893 lea 64(%edx), %edx
1894 sub $64, %ecx
1895 ja L(Shl15LoopStart)
1896
1897 L(Shl15LoopLeave):
1898 add $32, %ecx
1899 jle L(shl_end_0)
1900
1901 movaps 1(%eax), %xmm2
1902 movaps 17(%eax), %xmm3
1903 palignr $15, %xmm2, %xmm3
1904 palignr $15, %xmm1, %xmm2
1905
1906 movaps %xmm2, (%edx)
1907 movaps %xmm3, 16(%edx)
1908 lea 32(%edx, %ecx), %edx
1909 lea 32(%eax, %ecx), %eax
1910 POP (%edi)
1911 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1912
1913 CFI_PUSH (%edi)
1914
1915 .p2align 4
1916 L(sh_15_no_prefetch):
1917 lea -32(%ecx), %ecx
1918 lea -15(%eax), %eax
1919 xor %edi, %edi
1920
1921 .p2align 4
1922 L(sh_15_no_prefetch_loop):
1923 movdqa 16(%eax, %edi), %xmm2
1924 sub $32, %ecx
1925 movdqa 32(%eax, %edi), %xmm3
1926 movdqa %xmm3, %xmm4
1927 palignr $15, %xmm2, %xmm3
1928 palignr $15, %xmm1, %xmm2
1929 lea 32(%edi), %edi
1930 movdqa %xmm2, -32(%edx, %edi)
1931 movdqa %xmm3, -16(%edx, %edi)
1932 jb L(sh_15_end_no_prefetch_loop)
1933
1934 movdqa 16(%eax, %edi), %xmm2
1935 sub $32, %ecx
1936 movdqa 32(%eax, %edi), %xmm3
1937 movdqa %xmm3, %xmm1
1938 palignr $15, %xmm2, %xmm3
1939 palignr $15, %xmm4, %xmm2
1940 lea 32(%edi), %edi
1941 movdqa %xmm2, -32(%edx, %edi)
1942 movdqa %xmm3, -16(%edx, %edi)
1943 jae L(sh_15_no_prefetch_loop)
1944
1945 L(sh_15_end_no_prefetch_loop):
1946 lea 32(%ecx), %ecx
1947 add %ecx, %edi
1948 add %edi, %edx
1949 lea 15(%edi, %eax), %eax
1950 POP (%edi)
1951 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1952
1953 CFI_PUSH (%edi)
1954
1955 .p2align 4
1956 L(shl_end_0):
1957 lea 32(%ecx), %ecx
1958 lea (%edx, %ecx), %edx
1959 lea (%eax, %ecx), %eax
1960 POP (%edi)
1961 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1962
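/* Jump-table targets for the final 0-47 bytes of a forward copy.  Each
   entry falls through to the shorter ones after it, and all offsets
   are negative because %eax and %edx already point at the end of the
   region.  */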
1963 .p2align 4
1964 L(fwd_write_44bytes):
1965 movq -44(%eax), %xmm0
1966 movq %xmm0, -44(%edx)
1967 L(fwd_write_36bytes):
1968 movq -36(%eax), %xmm0
1969 movq %xmm0, -36(%edx)
1970 L(fwd_write_28bytes):
1971 movq -28(%eax), %xmm0
1972 movq %xmm0, -28(%edx)
1973 L(fwd_write_20bytes):
1974 movq -20(%eax), %xmm0
1975 movq %xmm0, -20(%edx)
1976 L(fwd_write_12bytes):
1977 movq -12(%eax), %xmm0
1978 movq %xmm0, -12(%edx)
1979 L(fwd_write_4bytes):
1980 movl -4(%eax), %ecx
1981 movl %ecx, -4(%edx)
1982 # ifndef USE_AS_BCOPY
1983 # ifdef USE_AS_MEMPCPY
1984 movl %edx, %eax
1985 # else
1986 movl DEST(%esp), %eax
1987 # endif
1988 # endif
1989 RETURN
1990
1991 .p2align 4
1992 L(fwd_write_40bytes):
1993 movq -40(%eax), %xmm0
1994 movq %xmm0, -40(%edx)
1995 L(fwd_write_32bytes):
1996 movq -32(%eax), %xmm0
1997 movq %xmm0, -32(%edx)
1998 L(fwd_write_24bytes):
1999 movq -24(%eax), %xmm0
2000 movq %xmm0, -24(%edx)
2001 L(fwd_write_16bytes):
2002 movq -16(%eax), %xmm0
2003 movq %xmm0, -16(%edx)
2004 L(fwd_write_8bytes):
2005 movq -8(%eax), %xmm0
2006 movq %xmm0, -8(%edx)
2007 L(fwd_write_0bytes):
2008 # ifndef USE_AS_BCOPY
2009 # ifdef USE_AS_MEMPCPY
2010 movl %edx, %eax
2011 # else
2012 movl DEST(%esp), %eax
2013 # endif
2014 # endif
2015 RETURN
2016
2017 .p2align 4
2018 L(fwd_write_5bytes):
2019 movl -5(%eax), %ecx
2020 movl -4(%eax), %eax
2021 movl %ecx, -5(%edx)
2022 movl %eax, -4(%edx)
2023 # ifndef USE_AS_BCOPY
2024 # ifdef USE_AS_MEMPCPY
2025 movl %edx, %eax
2026 # else
2027 movl DEST(%esp), %eax
2028 # endif
2029 # endif
2030 RETURN
2031
2032 .p2align 4
2033 L(fwd_write_45bytes):
2034 movq -45(%eax), %xmm0
2035 movq %xmm0, -45(%edx)
2036 L(fwd_write_37bytes):
2037 movq -37(%eax), %xmm0
2038 movq %xmm0, -37(%edx)
2039 L(fwd_write_29bytes):
2040 movq -29(%eax), %xmm0
2041 movq %xmm0, -29(%edx)
2042 L(fwd_write_21bytes):
2043 movq -21(%eax), %xmm0
2044 movq %xmm0, -21(%edx)
2045 L(fwd_write_13bytes):
2046 movq -13(%eax), %xmm0
2047 movq %xmm0, -13(%edx)
2048 movl -5(%eax), %ecx
2049 movl %ecx, -5(%edx)
2050 movzbl -1(%eax), %ecx
2051 movb %cl, -1(%edx)
2052 # ifndef USE_AS_BCOPY
2053 # ifdef USE_AS_MEMPCPY
2054 movl %edx, %eax
2055 # else
2056 movl DEST(%esp), %eax
2057 # endif
2058 # endif
2059 RETURN
2060
2061 .p2align 4
2062 L(fwd_write_41bytes):
2063 movq -41(%eax), %xmm0
2064 movq %xmm0, -41(%edx)
2065 L(fwd_write_33bytes):
2066 movq -33(%eax), %xmm0
2067 movq %xmm0, -33(%edx)
2068 L(fwd_write_25bytes):
2069 movq -25(%eax), %xmm0
2070 movq %xmm0, -25(%edx)
2071 L(fwd_write_17bytes):
2072 movq -17(%eax), %xmm0
2073 movq %xmm0, -17(%edx)
2074 L(fwd_write_9bytes):
2075 movq -9(%eax), %xmm0
2076 movq %xmm0, -9(%edx)
2077 L(fwd_write_1bytes):
2078 movzbl -1(%eax), %ecx
2079 movb %cl, -1(%edx)
2080 # ifndef USE_AS_BCOPY
2081 # ifdef USE_AS_MEMPCPY
2082 movl %edx, %eax
2083 # else
2084 movl DEST(%esp), %eax
2085 # endif
2086 # endif
2087 RETURN
2088
2089 .p2align 4
2090 L(fwd_write_46bytes):
2091 movq -46(%eax), %xmm0
2092 movq %xmm0, -46(%edx)
2093 L(fwd_write_38bytes):
2094 movq -38(%eax), %xmm0
2095 movq %xmm0, -38(%edx)
2096 L(fwd_write_30bytes):
2097 movq -30(%eax), %xmm0
2098 movq %xmm0, -30(%edx)
2099 L(fwd_write_22bytes):
2100 movq -22(%eax), %xmm0
2101 movq %xmm0, -22(%edx)
2102 L(fwd_write_14bytes):
2103 movq -14(%eax), %xmm0
2104 movq %xmm0, -14(%edx)
2105 L(fwd_write_6bytes):
2106 movl -6(%eax), %ecx
2107 movl %ecx, -6(%edx)
2108 movzwl -2(%eax), %ecx
2109 movw %cx, -2(%edx)
2110 # ifndef USE_AS_BCOPY
2111 # ifdef USE_AS_MEMPCPY
2112 movl %edx, %eax
2113 # else
2114 movl DEST(%esp), %eax
2115 # endif
2116 # endif
2117 RETURN
2118
2119 .p2align 4
2120 L(fwd_write_42bytes):
2121 movq -42(%eax), %xmm0
2122 movq %xmm0, -42(%edx)
2123 L(fwd_write_34bytes):
2124 movq -34(%eax), %xmm0
2125 movq %xmm0, -34(%edx)
2126 L(fwd_write_26bytes):
2127 movq -26(%eax), %xmm0
2128 movq %xmm0, -26(%edx)
2129 L(fwd_write_18bytes):
2130 movq -18(%eax), %xmm0
2131 movq %xmm0, -18(%edx)
2132 L(fwd_write_10bytes):
2133 movq -10(%eax), %xmm0
2134 movq %xmm0, -10(%edx)
2135 L(fwd_write_2bytes):
2136 movzwl -2(%eax), %ecx
2137 movw %cx, -2(%edx)
2138 # ifndef USE_AS_BCOPY
2139 # ifdef USE_AS_MEMPCPY
2140 movl %edx, %eax
2141 # else
2142 movl DEST(%esp), %eax
2143 # endif
2144 # endif
2145 RETURN
2146
2147 .p2align 4
2148 L(fwd_write_47bytes):
2149 movq -47(%eax), %xmm0
2150 movq %xmm0, -47(%edx)
2151 L(fwd_write_39bytes):
2152 movq -39(%eax), %xmm0
2153 movq %xmm0, -39(%edx)
2154 L(fwd_write_31bytes):
2155 movq -31(%eax), %xmm0
2156 movq %xmm0, -31(%edx)
2157 L(fwd_write_23bytes):
2158 movq -23(%eax), %xmm0
2159 movq %xmm0, -23(%edx)
2160 L(fwd_write_15bytes):
2161 movq -15(%eax), %xmm0
2162 movq %xmm0, -15(%edx)
2163 L(fwd_write_7bytes):
2164 movl -7(%eax), %ecx
2165 movl %ecx, -7(%edx)
2166 movzwl -3(%eax), %ecx
2167 movzbl -1(%eax), %eax
2168 movw %cx, -3(%edx)
2169 movb %al, -1(%edx)
2170 # ifndef USE_AS_BCOPY
2171 # ifdef USE_AS_MEMPCPY
2172 movl %edx, %eax
2173 # else
2174 movl DEST(%esp), %eax
2175 # endif
2176 # endif
2177 RETURN
2178
2179 .p2align 4
2180 L(fwd_write_43bytes):
2181 movq -43(%eax), %xmm0
2182 movq %xmm0, -43(%edx)
2183 L(fwd_write_35bytes):
2184 movq -35(%eax), %xmm0
2185 movq %xmm0, -35(%edx)
2186 L(fwd_write_27bytes):
2187 movq -27(%eax), %xmm0
2188 movq %xmm0, -27(%edx)
2189 L(fwd_write_19bytes):
2190 movq -19(%eax), %xmm0
2191 movq %xmm0, -19(%edx)
2192 L(fwd_write_11bytes):
2193 movq -11(%eax), %xmm0
2194 movq %xmm0, -11(%edx)
2195 L(fwd_write_3bytes):
2196 movzwl -3(%eax), %ecx
2197 movzbl -1(%eax), %eax
2198 movw %cx, -3(%edx)
2199 movb %al, -1(%edx)
2200 # ifndef USE_AS_BCOPY
2201 # ifdef USE_AS_MEMPCPY
2202 movl %edx, %eax
2203 # else
2204 movl DEST(%esp), %eax
2205 # endif
2206 # endif
2207 RETURN
2208
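/* Variants of the tail tables used on the aligned (shl_0) paths: there
   the remaining tail starts 16-byte aligned in both source and
   destination, so whole chunks can use movdqa instead of two movq
   halves.  */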
2209 .p2align 4
2210 L(fwd_write_40bytes_align):
2211 movdqa -40(%eax), %xmm0
2212 movdqa %xmm0, -40(%edx)
2213 L(fwd_write_24bytes_align):
2214 movdqa -24(%eax), %xmm0
2215 movdqa %xmm0, -24(%edx)
2216 L(fwd_write_8bytes_align):
2217 movq -8(%eax), %xmm0
2218 movq %xmm0, -8(%edx)
2219 L(fwd_write_0bytes_align):
2220 # ifndef USE_AS_BCOPY
2221 # ifdef USE_AS_MEMPCPY
2222 movl %edx, %eax
2223 # else
2224 movl DEST(%esp), %eax
2225 # endif
2226 # endif
2227 RETURN
2228
2229 .p2align 4
2230 L(fwd_write_32bytes_align):
2231 movdqa -32(%eax), %xmm0
2232 movdqa %xmm0, -32(%edx)
2233 L(fwd_write_16bytes_align):
2234 movdqa -16(%eax), %xmm0
2235 movdqa %xmm0, -16(%edx)
2236 # ifndef USE_AS_BCOPY
2237 # ifdef USE_AS_MEMPCPY
2238 movl %edx, %eax
2239 # else
2240 movl DEST(%esp), %eax
2241 # endif
2242 # endif
2243 RETURN
2244
2245 .p2align 4
2246 L(fwd_write_5bytes_align):
2247 movl -5(%eax), %ecx
2248 movl -4(%eax), %eax
2249 movl %ecx, -5(%edx)
2250 movl %eax, -4(%edx)
2251 # ifndef USE_AS_BCOPY
2252 # ifdef USE_AS_MEMPCPY
2253 movl %edx, %eax
2254 # else
2255 movl DEST(%esp), %eax
2256 # endif
2257 # endif
2258 RETURN
2259
2260 .p2align 4
2261 L(fwd_write_45bytes_align):
2262 movdqa -45(%eax), %xmm0
2263 movdqa %xmm0, -45(%edx)
2264 L(fwd_write_29bytes_align):
2265 movdqa -29(%eax), %xmm0
2266 movdqa %xmm0, -29(%edx)
2267 L(fwd_write_13bytes_align):
2268 movq -13(%eax), %xmm0
2269 movq %xmm0, -13(%edx)
2270 movl -5(%eax), %ecx
2271 movl %ecx, -5(%edx)
2272 movzbl -1(%eax), %ecx
2273 movb %cl, -1(%edx)
2274 # ifndef USE_AS_BCOPY
2275 # ifdef USE_AS_MEMPCPY
2276 movl %edx, %eax
2277 # else
2278 movl DEST(%esp), %eax
2279 # endif
2280 # endif
2281 RETURN
2282
2283 .p2align 4
2284 L(fwd_write_37bytes_align):
2285 movdqa -37(%eax), %xmm0
2286 movdqa %xmm0, -37(%edx)
2287 L(fwd_write_21bytes_align):
2288 movdqa -21(%eax), %xmm0
2289 movdqa %xmm0, -21(%edx)
2290 movl -5(%eax), %ecx
2291 movl %ecx, -5(%edx)
2292 movzbl -1(%eax), %ecx
2293 movb %cl, -1(%edx)
2294 # ifndef USE_AS_BCOPY
2295 # ifdef USE_AS_MEMPCPY
2296 movl %edx, %eax
2297 # else
2298 movl DEST(%esp), %eax
2299 # endif
2300 # endif
2301 RETURN
2302
2303 .p2align 4
2304 L(fwd_write_41bytes_align):
2305 movdqa -41(%eax), %xmm0
2306 movdqa %xmm0, -41(%edx)
2307 L(fwd_write_25bytes_align):
2308 movdqa -25(%eax), %xmm0
2309 movdqa %xmm0, -25(%edx)
2310 L(fwd_write_9bytes_align):
2311 movq -9(%eax), %xmm0
2312 movq %xmm0, -9(%edx)
2313 L(fwd_write_1bytes_align):
2314 movzbl -1(%eax), %ecx
2315 movb %cl, -1(%edx)
2316 # ifndef USE_AS_BCOPY
2317 # ifdef USE_AS_MEMPCPY
2318 movl %edx, %eax
2319 # else
2320 movl DEST(%esp), %eax
2321 # endif
2322 # endif
2323 RETURN
2324
2325 .p2align 4
2326 L(fwd_write_33bytes_align):
2327 movdqa -33(%eax), %xmm0
2328 movdqa %xmm0, -33(%edx)
2329 L(fwd_write_17bytes_align):
2330 movdqa -17(%eax), %xmm0
2331 movdqa %xmm0, -17(%edx)
2332 movzbl -1(%eax), %ecx
2333 movb %cl, -1(%edx)
2334 # ifndef USE_AS_BCOPY
2335 # ifdef USE_AS_MEMPCPY
2336 movl %edx, %eax
2337 # else
2338 movl DEST(%esp), %eax
2339 # endif
2340 # endif
2341 RETURN
2342
2343 .p2align 4
2344 L(fwd_write_46bytes_align):
2345 movdqa -46(%eax), %xmm0
2346 movdqa %xmm0, -46(%edx)
2347 L(fwd_write_30bytes_align):
2348 movdqa -30(%eax), %xmm0
2349 movdqa %xmm0, -30(%edx)
2350 L(fwd_write_14bytes_align):
2351 movq -14(%eax), %xmm0
2352 movq %xmm0, -14(%edx)
2353 L(fwd_write_6bytes_align):
2354 movl -6(%eax), %ecx
2355 movl %ecx, -6(%edx)
2356 movzwl -2(%eax), %ecx
2357 movw %cx, -2(%edx)
2358 # ifndef USE_AS_BCOPY
2359 # ifdef USE_AS_MEMPCPY
2360 movl %edx, %eax
2361 # else
2362 movl DEST(%esp), %eax
2363 # endif
2364 # endif
2365 RETURN
2366
2367 .p2align 4
2368 L(fwd_write_38bytes_align):
2369 movdqa -38(%eax), %xmm0
2370 movdqa %xmm0, -38(%edx)
2371 L(fwd_write_22bytes_align):
2372 movdqa -22(%eax), %xmm0
2373 movdqa %xmm0, -22(%edx)
2374 movl -6(%eax), %ecx
2375 movl %ecx, -6(%edx)
2376 movzwl -2(%eax), %ecx
2377 movw %cx, -2(%edx)
2378 # ifndef USE_AS_BCOPY
2379 # ifdef USE_AS_MEMPCPY
2380 movl %edx, %eax
2381 # else
2382 movl DEST(%esp), %eax
2383 # endif
2384 # endif
2385 RETURN
2386
2387 .p2align 4
2388 L(fwd_write_42bytes_align):
2389 movdqa -42(%eax), %xmm0
2390 movdqa %xmm0, -42(%edx)
2391 L(fwd_write_26bytes_align):
2392 movdqa -26(%eax), %xmm0
2393 movdqa %xmm0, -26(%edx)
2394 L(fwd_write_10bytes_align):
2395 movq -10(%eax), %xmm0
2396 movq %xmm0, -10(%edx)
2397 L(fwd_write_2bytes_align):
2398 movzwl -2(%eax), %ecx
2399 movw %cx, -2(%edx)
2400 # ifndef USE_AS_BCOPY
2401 # ifdef USE_AS_MEMPCPY
2402 movl %edx, %eax
2403 # else
2404 movl DEST(%esp), %eax
2405 # endif
2406 # endif
2407 RETURN
2408
2409 .p2align 4
2410 L(fwd_write_34bytes_align):
2411 movdqa -34(%eax), %xmm0
2412 movdqa %xmm0, -34(%edx)
2413 L(fwd_write_18bytes_align):
2414 movdqa -18(%eax), %xmm0
2415 movdqa %xmm0, -18(%edx)
2416 movzwl -2(%eax), %ecx
2417 movw %cx, -2(%edx)
2418 # ifndef USE_AS_BCOPY
2419 # ifdef USE_AS_MEMPCPY
2420 movl %edx, %eax
2421 # else
2422 movl DEST(%esp), %eax
2423 # endif
2424 # endif
2425 RETURN
2426
2427 .p2align 4
2428 L(fwd_write_47bytes_align):
2429 movdqa -47(%eax), %xmm0
2430 movdqa %xmm0, -47(%edx)
2431 L(fwd_write_31bytes_align):
2432 movdqa -31(%eax), %xmm0
2433 movdqa %xmm0, -31(%edx)
2434 L(fwd_write_15bytes_align):
2435 movq -15(%eax), %xmm0
2436 movq %xmm0, -15(%edx)
2437 L(fwd_write_7bytes_align):
2438 movl -7(%eax), %ecx
2439 movl %ecx, -7(%edx)
2440 movzwl -3(%eax), %ecx
2441 movzbl -1(%eax), %eax
2442 movw %cx, -3(%edx)
2443 movb %al, -1(%edx)
2444 # ifndef USE_AS_BCOPY
2445 # ifdef USE_AS_MEMPCPY
2446 movl %edx, %eax
2447 # else
2448 movl DEST(%esp), %eax
2449 # endif
2450 # endif
2451 RETURN
2452
2453 .p2align 4
2454 L(fwd_write_39bytes_align):
2455 movdqa -39(%eax), %xmm0
2456 movdqa %xmm0, -39(%edx)
2457 L(fwd_write_23bytes_align):
2458 movdqa -23(%eax), %xmm0
2459 movdqa %xmm0, -23(%edx)
2460 movl -7(%eax), %ecx
2461 movl %ecx, -7(%edx)
2462 movzwl -3(%eax), %ecx
2463 movzbl -1(%eax), %eax
2464 movw %cx, -3(%edx)
2465 movb %al, -1(%edx)
2466 # ifndef USE_AS_BCOPY
2467 # ifdef USE_AS_MEMPCPY
2468 movl %edx, %eax
2469 # else
2470 movl DEST(%esp), %eax
2471 # endif
2472 # endif
2473 RETURN
2474
2475 .p2align 4
2476 L(fwd_write_43bytes_align):
2477 movdqa -43(%eax), %xmm0
2478 movdqa %xmm0, -43(%edx)
2479 L(fwd_write_27bytes_align):
2480 movdqa -27(%eax), %xmm0
2481 movdqa %xmm0, -27(%edx)
2482 L(fwd_write_11bytes_align):
2483 movq -11(%eax), %xmm0
2484 movq %xmm0, -11(%edx)
2485 L(fwd_write_3bytes_align):
2486 movzwl -3(%eax), %ecx
2487 movzbl -1(%eax), %eax
2488 movw %cx, -3(%edx)
2489 movb %al, -1(%edx)
2490 # ifndef USE_AS_BCOPY
2491 # ifdef USE_AS_MEMPCPY
2492 movl %edx, %eax
2493 # else
2494 movl DEST(%esp), %eax
2495 # endif
2496 # endif
2497 RETURN
2498
2499 .p2align 4
2500 L(fwd_write_35bytes_align):
2501 movdqa -35(%eax), %xmm0
2502 movdqa %xmm0, -35(%edx)
2503 L(fwd_write_19bytes_align):
2504 movdqa -19(%eax), %xmm0
2505 movdqa %xmm0, -19(%edx)
2506 movzwl -3(%eax), %ecx
2507 movzbl -1(%eax), %eax
2508 movw %cx, -3(%edx)
2509 movb %al, -1(%edx)
2510 # ifndef USE_AS_BCOPY
2511 # ifdef USE_AS_MEMPCPY
2512 movl %edx, %eax
2513 # else
2514 movl DEST(%esp), %eax
2515 # endif
2516 # endif
2517 RETURN
2518
2519 .p2align 4
2520 L(fwd_write_44bytes_align):
2521 movdqa -44(%eax), %xmm0
2522 movdqa %xmm0, -44(%edx)
2523 L(fwd_write_28bytes_align):
2524 movdqa -28(%eax), %xmm0
2525 movdqa %xmm0, -28(%edx)
2526 L(fwd_write_12bytes_align):
2527 movq -12(%eax), %xmm0
2528 movq %xmm0, -12(%edx)
2529 L(fwd_write_4bytes_align):
2530 movl -4(%eax), %ecx
2531 movl %ecx, -4(%edx)
2532 # ifndef USE_AS_BCOPY
2533 # ifdef USE_AS_MEMPCPY
2534 movl %edx, %eax
2535 # else
2536 movl DEST(%esp), %eax
2537 # endif
2538 # endif
2539 RETURN
2540
2541 .p2align 4
2542 L(fwd_write_36bytes_align):
2543 movdqa -36(%eax), %xmm0
2544 movdqa %xmm0, -36(%edx)
2545 L(fwd_write_20bytes_align):
2546 movdqa -20(%eax), %xmm0
2547 movdqa %xmm0, -20(%edx)
2548 movl -4(%eax), %ecx
2549 movl %ecx, -4(%edx)
2550 # ifndef USE_AS_BCOPY
2551 # ifdef USE_AS_MEMPCPY
2552 movl %edx, %eax
2553 # else
2554 movl DEST(%esp), %eax
2555 # endif
2556 # endif
2557 RETURN_END
2558
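	/* L(large_page) is reached by jumps made while EDI was pushed;
	   re-establish the corresponding unwind information here.  */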
2559 CFI_PUSH (%edi)
2560
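	/* Copies too large to be worth keeping in the cache: use unaligned
	   loads and non-temporal MOVNTDQ stores.  Under memmove the first
	   16 bytes of the destination are stored from XMM0, which was
	   filled earlier in the forward path.  */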
2561 .p2align 4
2562 L(large_page):
2563 movdqu (%eax), %xmm1
2564 # ifdef USE_AS_MEMMOVE
2565 movl DEST+4(%esp), %edi
2566 movdqu %xmm0, (%edi)
2567 # endif
2568 lea 16(%eax), %eax
2569 movntdq %xmm1, (%edx)
2570 lea 16(%edx), %edx
2571 lea -0x90(%ecx), %ecx
2572 POP (%edi)
2573
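	/* Stream 128 bytes per iteration: eight unaligned 16-byte loads
	   followed by eight non-temporal 16-byte stores.  */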
2574 .p2align 4
2575 L(large_page_loop):
2576 movdqu (%eax), %xmm0
2577 movdqu 0x10(%eax), %xmm1
2578 movdqu 0x20(%eax), %xmm2
2579 movdqu 0x30(%eax), %xmm3
2580 movdqu 0x40(%eax), %xmm4
2581 movdqu 0x50(%eax), %xmm5
2582 movdqu 0x60(%eax), %xmm6
2583 movdqu 0x70(%eax), %xmm7
2584 lea 0x80(%eax), %eax
2585
2586 sub $0x80, %ecx
2587 movntdq %xmm0, (%edx)
2588 movntdq %xmm1, 0x10(%edx)
2589 movntdq %xmm2, 0x20(%edx)
2590 movntdq %xmm3, 0x30(%edx)
2591 movntdq %xmm4, 0x40(%edx)
2592 movntdq %xmm5, 0x50(%edx)
2593 movntdq %xmm6, 0x60(%edx)
2594 movntdq %xmm7, 0x70(%edx)
2595 lea 0x80(%edx), %edx
2596 jae L(large_page_loop)
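	/* Fewer than 128 bytes left.  The LEA restores the true remaining
	   count without disturbing the flags set by the CMP, so the JL
	   below still tests whether fewer than 64 bytes remain.  */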
2597 cmp $-0x40, %ecx
2598 lea 0x80(%ecx), %ecx
2599 jl L(large_page_less_64bytes)
2600
2601 movdqu (%eax), %xmm0
2602 movdqu 0x10(%eax), %xmm1
2603 movdqu 0x20(%eax), %xmm2
2604 movdqu 0x30(%eax), %xmm3
2605 lea 0x40(%eax), %eax
2606
2607 movntdq %xmm0, (%edx)
2608 movntdq %xmm1, 0x10(%edx)
2609 movntdq %xmm2, 0x20(%edx)
2610 movntdq %xmm3, 0x30(%edx)
2611 lea 0x40(%edx), %edx
2612 sub $0x40, %ecx
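	/* Fewer than 64 bytes remain.  */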
2613 L(large_page_less_64bytes):
2614 cmp $32, %ecx
2615 jb L(large_page_less_32bytes)
2616 movdqu (%eax), %xmm0
2617 movdqu 0x10(%eax), %xmm1
2618 lea 0x20(%eax), %eax
2619 movntdq %xmm0, (%edx)
2620 movntdq %xmm1, 0x10(%edx)
2621 lea 0x20(%edx), %edx
2622 sub $0x20, %ecx
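	/* Fewer than 32 bytes remain: advance both pointers to the end of
	   the buffer, drain the non-temporal stores with SFENCE, and finish
	   through the forward tail table.  */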
2623 L(large_page_less_32bytes):
2624 add %ecx, %edx
2625 add %ecx, %eax
2626 sfence
2627 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2628
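	/* Tail stubs for the backward copy: each L(bk_write_N) entry copies
	   the first N remaining bytes, with EAX and EDX pointing at the
	   start of the data.  They are dispatched through
	   L(table_48_bytes_bwd).  memcpy/memmove return the original
	   destination; mempcpy returns DEST + LEN.  */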
2629 .p2align 4
2630 L(bk_write_44bytes):
2631 movq 36(%eax), %xmm0
2632 movq %xmm0, 36(%edx)
2633 L(bk_write_36bytes):
2634 movq 28(%eax), %xmm0
2635 movq %xmm0, 28(%edx)
2636 L(bk_write_28bytes):
2637 movq 20(%eax), %xmm0
2638 movq %xmm0, 20(%edx)
2639 L(bk_write_20bytes):
2640 movq 12(%eax), %xmm0
2641 movq %xmm0, 12(%edx)
2642 L(bk_write_12bytes):
2643 movq 4(%eax), %xmm0
2644 movq %xmm0, 4(%edx)
2645 L(bk_write_4bytes):
2646 movl (%eax), %ecx
2647 movl %ecx, (%edx)
2648 L(bk_write_0bytes):
2649 # ifndef USE_AS_BCOPY
2650 movl DEST(%esp), %eax
2651 # ifdef USE_AS_MEMPCPY
2652 movl LEN(%esp), %ecx
2653 add %ecx, %eax
2654 # endif
2655 # endif
2656 RETURN
2657
2658 .p2align 4
2659 L(bk_write_40bytes):
2660 movq 32(%eax), %xmm0
2661 movq %xmm0, 32(%edx)
2662 L(bk_write_32bytes):
2663 movq 24(%eax), %xmm0
2664 movq %xmm0, 24(%edx)
2665 L(bk_write_24bytes):
2666 movq 16(%eax), %xmm0
2667 movq %xmm0, 16(%edx)
2668 L(bk_write_16bytes):
2669 movq 8(%eax), %xmm0
2670 movq %xmm0, 8(%edx)
2671 L(bk_write_8bytes):
2672 movq (%eax), %xmm0
2673 movq %xmm0, (%edx)
2674 # ifndef USE_AS_BCOPY
2675 movl DEST(%esp), %eax
2676 # ifdef USE_AS_MEMPCPY
2677 movl LEN(%esp), %ecx
2678 add %ecx, %eax
2679 # endif
2680 # endif
2681 RETURN
2682
2683 .p2align 4
2684 L(bk_write_45bytes):
2685 movq 37(%eax), %xmm0
2686 movq %xmm0, 37(%edx)
2687 L(bk_write_37bytes):
2688 movq 29(%eax), %xmm0
2689 movq %xmm0, 29(%edx)
2690 L(bk_write_29bytes):
2691 movq 21(%eax), %xmm0
2692 movq %xmm0, 21(%edx)
2693 L(bk_write_21bytes):
2694 movq 13(%eax), %xmm0
2695 movq %xmm0, 13(%edx)
2696 L(bk_write_13bytes):
2697 movq 5(%eax), %xmm0
2698 movq %xmm0, 5(%edx)
2699 L(bk_write_5bytes):
2700 movl 1(%eax), %ecx
2701 movl %ecx, 1(%edx)
2702 L(bk_write_1bytes):
2703 movzbl (%eax), %ecx
2704 movb %cl, (%edx)
2705 # ifndef USE_AS_BCOPY
2706 movl DEST(%esp), %eax
2707 # ifdef USE_AS_MEMPCPY
2708 movl LEN(%esp), %ecx
2709 add %ecx, %eax
2710 # endif
2711 # endif
2712 RETURN
2713
2714 .p2align 4
2715 L(bk_write_41bytes):
2716 movq 33(%eax), %xmm0
2717 movq %xmm0, 33(%edx)
2718 L(bk_write_33bytes):
2719 movq 25(%eax), %xmm0
2720 movq %xmm0, 25(%edx)
2721 L(bk_write_25bytes):
2722 movq 17(%eax), %xmm0
2723 movq %xmm0, 17(%edx)
2724 L(bk_write_17bytes):
2725 movq 9(%eax), %xmm0
2726 movq %xmm0, 9(%edx)
2727 L(bk_write_9bytes):
2728 movq 1(%eax), %xmm0
2729 movq %xmm0, 1(%edx)
2730 movzbl (%eax), %ecx
2731 movb %cl, (%edx)
2732 # ifndef USE_AS_BCOPY
2733 movl DEST(%esp), %eax
2734 # ifdef USE_AS_MEMPCPY
2735 movl LEN(%esp), %ecx
2736 add %ecx, %eax
2737 # endif
2738 # endif
2739 RETURN
2740
2741 .p2align 4
2742 L(bk_write_46bytes):
2743 movq 38(%eax), %xmm0
2744 movq %xmm0, 38(%edx)
2745 L(bk_write_38bytes):
2746 movq 30(%eax), %xmm0
2747 movq %xmm0, 30(%edx)
2748 L(bk_write_30bytes):
2749 movq 22(%eax), %xmm0
2750 movq %xmm0, 22(%edx)
2751 L(bk_write_22bytes):
2752 movq 14(%eax), %xmm0
2753 movq %xmm0, 14(%edx)
2754 L(bk_write_14bytes):
2755 movq 6(%eax), %xmm0
2756 movq %xmm0, 6(%edx)
2757 L(bk_write_6bytes):
2758 movl 2(%eax), %ecx
2759 movl %ecx, 2(%edx)
2760 movzwl (%eax), %ecx
2761 movw %cx, (%edx)
2762 # ifndef USE_AS_BCOPY
2763 movl DEST(%esp), %eax
2764 # ifdef USE_AS_MEMPCPY
2765 movl LEN(%esp), %ecx
2766 add %ecx, %eax
2767 # endif
2768 # endif
2769 RETURN
2770
2771 .p2align 4
2772 L(bk_write_42bytes):
2773 movq 34(%eax), %xmm0
2774 movq %xmm0, 34(%edx)
2775 L(bk_write_34bytes):
2776 movq 26(%eax), %xmm0
2777 movq %xmm0, 26(%edx)
2778 L(bk_write_26bytes):
2779 movq 18(%eax), %xmm0
2780 movq %xmm0, 18(%edx)
2781 L(bk_write_18bytes):
2782 movq 10(%eax), %xmm0
2783 movq %xmm0, 10(%edx)
2784 L(bk_write_10bytes):
2785 movq 2(%eax), %xmm0
2786 movq %xmm0, 2(%edx)
2787 L(bk_write_2bytes):
2788 movzwl (%eax), %ecx
2789 movw %cx, (%edx)
2790 # ifndef USE_AS_BCOPY
2791 movl DEST(%esp), %eax
2792 # ifdef USE_AS_MEMPCPY
2793 movl LEN(%esp), %ecx
2794 add %ecx, %eax
2795 # endif
2796 # endif
2797 RETURN
2798
2799 .p2align 4
2800 L(bk_write_47bytes):
2801 movq 39(%eax), %xmm0
2802 movq %xmm0, 39(%edx)
2803 L(bk_write_39bytes):
2804 movq 31(%eax), %xmm0
2805 movq %xmm0, 31(%edx)
2806 L(bk_write_31bytes):
2807 movq 23(%eax), %xmm0
2808 movq %xmm0, 23(%edx)
2809 L(bk_write_23bytes):
2810 movq 15(%eax), %xmm0
2811 movq %xmm0, 15(%edx)
2812 L(bk_write_15bytes):
2813 movq 7(%eax), %xmm0
2814 movq %xmm0, 7(%edx)
2815 L(bk_write_7bytes):
2816 movl 3(%eax), %ecx
2817 movl %ecx, 3(%edx)
2818 movzwl 1(%eax), %ecx
2819 movw %cx, 1(%edx)
2820 movzbl (%eax), %eax
2821 movb %al, (%edx)
2822 # ifndef USE_AS_BCOPY
2823 movl DEST(%esp), %eax
2824 # ifdef USE_AS_MEMPCPY
2825 movl LEN(%esp), %ecx
2826 add %ecx, %eax
2827 # endif
2828 # endif
2829 RETURN
2830
2831 .p2align 4
2832 L(bk_write_43bytes):
2833 movq 35(%eax), %xmm0
2834 movq %xmm0, 35(%edx)
2835 L(bk_write_35bytes):
2836 movq 27(%eax), %xmm0
2837 movq %xmm0, 27(%edx)
2838 L(bk_write_27bytes):
2839 movq 19(%eax), %xmm0
2840 movq %xmm0, 19(%edx)
2841 L(bk_write_19bytes):
2842 movq 11(%eax), %xmm0
2843 movq %xmm0, 11(%edx)
2844 L(bk_write_11bytes):
2845 movq 3(%eax), %xmm0
2846 movq %xmm0, 3(%edx)
2847 L(bk_write_3bytes):
2848 movzwl 1(%eax), %ecx
2849 movw %cx, 1(%edx)
2850 movzbl (%eax), %eax
2851 movb %al, (%edx)
2852 # ifndef USE_AS_BCOPY
2853 movl DEST(%esp), %eax
2854 # ifdef USE_AS_MEMPCPY
2855 movl LEN(%esp), %ecx
2856 add %ecx, %eax
2857 # endif
2858 # endif
2859 RETURN_END
2860
2861
2862 .pushsection .rodata.ssse3,"a",@progbits
2863 .p2align 2
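	/* Jump table for the forward tail copies: entry N selects the stub
	   that copies the final N bytes.  */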
2864 L(table_48bytes_fwd):
2865 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2866 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2878 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2879 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2880 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2881 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2882 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2883 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2884 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2885 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2886 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2887 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2888 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2889 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2890 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2891 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2892 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2893 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2894 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2895 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2896 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2897 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2898 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2899 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2900 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2901 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2902 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2903 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2904 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2913
2914 .p2align 2
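	/* As above, but for the stubs that can use 16-byte aligned stores.  */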
2915 L(table_48bytes_fwd_align):
2916 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2917 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2918 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2919 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2920 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2921 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2922 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2923 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2924 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2925 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2926 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2927 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2928 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2929 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2930 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2931 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2932 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2933 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2934 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2935 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2936 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2937 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2938 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2939 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2940 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2941 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2942 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2943 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2944 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2945 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2946 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2947 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2948 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2949 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2950 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2951 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2952 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2953 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2954 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2955 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2964
2965 .p2align 2
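	/* Dispatch table for the sixteen SSSE3 shifted-copy loops, one for
	   each possible source/destination misalignment (PALIGNR shift
	   count).  */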
2966 L(shl_table):
2967 .int JMPTBL (L(shl_0), L(shl_table))
2968 .int JMPTBL (L(shl_1), L(shl_table))
2969 .int JMPTBL (L(shl_2), L(shl_table))
2970 .int JMPTBL (L(shl_3), L(shl_table))
2971 .int JMPTBL (L(shl_4), L(shl_table))
2972 .int JMPTBL (L(shl_5), L(shl_table))
2973 .int JMPTBL (L(shl_6), L(shl_table))
2974 .int JMPTBL (L(shl_7), L(shl_table))
2975 .int JMPTBL (L(shl_8), L(shl_table))
2976 .int JMPTBL (L(shl_9), L(shl_table))
2977 .int JMPTBL (L(shl_10), L(shl_table))
2978 .int JMPTBL (L(shl_11), L(shl_table))
2979 .int JMPTBL (L(shl_12), L(shl_table))
2980 .int JMPTBL (L(shl_13), L(shl_table))
2981 .int JMPTBL (L(shl_14), L(shl_table))
2982 .int JMPTBL (L(shl_15), L(shl_table))
2983
2984 .p2align 2
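	/* Jump table for the backward tail copies.  */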
2985 L(table_48_bytes_bwd):
2986 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2987 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2988 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2989 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2990 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2991 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2992 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2993 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2994 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2995 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2996 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2997 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2998 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
2999 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3000 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3001 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3002 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3003 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3004 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3005 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3006 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3007 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3008 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3009 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3010 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3011 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3012 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3013 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3014 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3015 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3016 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3017 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3018 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3019 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3020 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3021 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3022 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3023 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3024 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3025 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
3034
3035 .popsection
3036
3037 # ifdef USE_AS_MEMMOVE
3038 .p2align 4
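	/* memmove backward path: EDI and EDX are advanced one past the end
	   of the source and destination, then the data is copied from high
	   addresses to low so an overlapping destination above the source
	   is handled correctly.  */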
3039 L(copy_backward):
3040 PUSH (%edi)
3041 movl %eax, %edi
3042 	lea	(%ecx,%edx,1), %edx
3043 	lea	(%ecx,%edi,1), %edi
3044 testl $0x3, %edx
3045 jnz L(bk_align)
3046
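	/* The destination end (EDX) is 4-byte aligned.  */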
3047 L(bk_aligned_4):
3048 cmp $64, %ecx
3049 jae L(bk_write_more64bytes)
3050
3051 L(bk_write_64bytesless):
3052 cmp $32, %ecx
3053 jb L(bk_write_less32bytes)
3054
3055 L(bk_write_more32bytes):
3056 	/* Copy 32 bytes backward, eight bytes at a time.  */
3057 sub $32, %ecx
3058 movq -8(%edi), %xmm0
3059 movq %xmm0, -8(%edx)
3060 movq -16(%edi), %xmm0
3061 movq %xmm0, -16(%edx)
3062 movq -24(%edi), %xmm0
3063 movq %xmm0, -24(%edx)
3064 movq -32(%edi), %xmm0
3065 movq %xmm0, -32(%edx)
3066 sub $32, %edx
3067 sub $32, %edi
3068
3069 L(bk_write_less32bytes):
3070 movl %edi, %eax
3071 sub %ecx, %edx
3072 sub %ecx, %eax
3073 POP (%edi)
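	/* Also entered directly from the top of memmove when fewer than
	   32 bytes are copied backward.  */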
3074 L(bk_write_less32bytes_2):
3075 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3076
3077 CFI_PUSH (%edi)
3078
3079 .p2align 4
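	/* The destination end is not 4-byte aligned: copy one or two bytes
	   backward until it is, then rejoin the aligned path.  Copies of at
	   most 8 bytes go straight to the tail code.  */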
3080 L(bk_align):
3081 cmp $8, %ecx
3082 jbe L(bk_write_less32bytes)
3083 testl $1, %edx
3084 	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
3085 	   then (EDX & 2) must be != 0.  */
3086 jz L(bk_got2)
3087 sub $1, %edi
3088 sub $1, %ecx
3089 sub $1, %edx
3090 movzbl (%edi), %eax
3091 movb %al, (%edx)
3092
3093 testl $2, %edx
3094 jz L(bk_aligned_4)
3095
3096 L(bk_got2):
3097 sub $2, %edi
3098 sub $2, %ecx
3099 sub $2, %edx
3100 movzwl (%edi), %eax
3101 movw %ax, (%edx)
3102 jmp L(bk_aligned_4)
3103
3104 .p2align 4
3105 L(bk_write_more64bytes):
3106 	/* Check whether the destination end (EDX) is 16-byte aligned.  */
3107 testl $15, %edx
3108 jz L(bk_ssse3_cpy_pre)
3109
3110 	/* EDX is 4-byte aligned, but not 16-byte aligned.  */
3111 L(bk_ssse3_align):
3112 sub $4, %edi
3113 sub $4, %ecx
3114 sub $4, %edx
3115 movl (%edi), %eax
3116 movl %eax, (%edx)
3117
3118 testl $15, %edx
3119 jz L(bk_ssse3_cpy_pre)
3120
3121 sub $4, %edi
3122 sub $4, %ecx
3123 sub $4, %edx
3124 movl (%edi), %eax
3125 movl %eax, (%edx)
3126
3127 testl $15, %edx
3128 jz L(bk_ssse3_cpy_pre)
3129
3130 sub $4, %edi
3131 sub $4, %ecx
3132 sub $4, %edx
3133 movl (%edi), %eax
3134 movl %eax, (%edx)
3135
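	/* EDX is now 16-byte aligned; fall into the 64-byte loop if enough
	   bytes remain.  */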
3136 L(bk_ssse3_cpy_pre):
3137 cmp $64, %ecx
3138 jb L(bk_write_more32bytes)
3139
3140 .p2align 4
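	/* Main backward loop: copy 64 bytes per iteration with unaligned
	   loads and 16-byte aligned stores.  */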
3141 L(bk_ssse3_cpy):
3142 sub $64, %edi
3143 sub $64, %ecx
3144 sub $64, %edx
3145 movdqu 0x30(%edi), %xmm3
3146 movdqa %xmm3, 0x30(%edx)
3147 movdqu 0x20(%edi), %xmm2
3148 movdqa %xmm2, 0x20(%edx)
3149 movdqu 0x10(%edi), %xmm1
3150 movdqa %xmm1, 0x10(%edx)
3151 movdqu (%edi), %xmm0
3152 movdqa %xmm0, (%edx)
3153 cmp $64, %ecx
3154 jae L(bk_ssse3_cpy)
3155 jmp L(bk_write_64bytesless)
3156
3157 # endif
3158
3159 END (MEMCPY)
3160
3161 #endif