sysdeps/i386/i686/multiarch/memcpy-ssse3.S
1 /* memcpy with SSSE3
2 Copyright (C) 2010-2015 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc) \
21 && (defined SHARED \
22 || defined USE_AS_MEMMOVE \
23 || !defined USE_MULTIARCH)
24
25 # include <sysdep.h>
26 # include "asm-syntax.h"
27
28 # ifndef MEMCPY
29 # define MEMCPY __memcpy_ssse3
30 # define MEMCPY_CHK __memcpy_chk_ssse3
31 # endif
32
33 # ifdef USE_AS_BCOPY
34 # define SRC PARMS
35 # define DEST SRC+4
36 # define LEN DEST+4
37 # else
38 # define DEST PARMS
39 # define SRC DEST+4
40 # define LEN SRC+4
41 # endif
42
43 # define CFI_PUSH(REG) \
44 cfi_adjust_cfa_offset (4); \
45 cfi_rel_offset (REG, 0)
46
47 # define CFI_POP(REG) \
48 cfi_adjust_cfa_offset (-4); \
49 cfi_restore (REG)
50
51 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
52 # define POP(REG) popl REG; CFI_POP (REG)
53
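/* CFI_PUSH and CFI_POP keep the DWARF call-frame information in step with
   the explicit pushl/popl of callee-saved registers, so stack unwinding
   remains correct inside these functions.  */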
54 # ifdef SHARED
55 # define PARMS 8 /* Preserve EBX. */
56 # define ENTRANCE PUSH (%ebx);
57 # define RETURN_END POP (%ebx); ret
58 # define RETURN RETURN_END; CFI_PUSH (%ebx)
59 # define JMPTBL(I, B) I - B
60
61 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
62 jump table with relative offsets. INDEX is a register containing the
63 index into the jump table. SCALE is the scale of INDEX. */
64
65 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
66 /* We first load PC into EBX. */ \
67 SETUP_PIC_REG(bx); \
68 /* Get the address of the jump table. */ \
69 addl $(TABLE - .), %ebx; \
70 /* Get the entry and convert the relative offset to the \
71 absolute address. */ \
72 addl (%ebx, INDEX, SCALE), %ebx; \
73 /* We loaded the jump table. Go. */ \
74 jmp *%ebx
75 # else
76
77 # define PARMS 4
78 # define ENTRANCE
79 # define RETURN_END ret
80 # define RETURN RETURN_END
81 # define JMPTBL(I, B) I
82
83 /* Branch to an entry in a jump table. TABLE is a jump table with
84 absolute offsets. INDEX is a register containing the index into the
85 jump table. SCALE is the scale of INDEX. */
86
87 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
88 jmp *TABLE(, INDEX, SCALE)
89 # endif
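/* Summary of the two dispatch flavours: in the SHARED (PIC) build the jump
   tables hold offsets relative to the table itself and are reached through
   EBX, which ENTRANCE preserves; in the static build the tables hold
   absolute addresses and the dispatch is a single indirect jump through
   the table.  */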
90
91 .section .text.ssse3,"ax",@progbits
92 # if !defined USE_AS_BCOPY
93 ENTRY (MEMCPY_CHK)
94 movl 12(%esp), %eax
95 cmpl %eax, 16(%esp)
96 jb HIDDEN_JUMPTARGET (__chk_fail)
97 END (MEMCPY_CHK)
98 # endif
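/* MEMCPY_CHK: call __chk_fail when the reported size of the destination
   object (16(%esp)) is smaller than the copy length (12(%esp)); otherwise
   fall straight through into the memcpy entry point below.  */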
99 ENTRY (MEMCPY)
100 ENTRANCE
101 movl LEN(%esp), %ecx
102 movl SRC(%esp), %eax
103 movl DEST(%esp), %edx
104
105 # ifdef USE_AS_MEMMOVE
106 cmp %eax, %edx
107 jb L(copy_forward)
108 je L(fwd_write_0bytes)
109 cmp $32, %ecx
110 jae L(memmove_bwd)
111 jmp L(bk_write_less32bytes_2)
112
113 .p2align 4
114 L(memmove_bwd):
115 add %ecx, %eax
116 cmp %eax, %edx
117 movl SRC(%esp), %eax
118 jb L(copy_backward)
119
120 L(copy_forward):
121 # endif
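/* memmove dispatch above: copy forward when the destination is below the
   source or the length is zero.  When the destination is above the source,
   lengths below 32 bytes go straight to the small backward-copy path;
   otherwise the destination is compared against src + len, and only a real
   overlap takes the full backward path, while non-overlapping buffers fall
   through to the forward copy.  */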
122 cmp $48, %ecx
123 jae L(48bytesormore)
124
125 L(fwd_write_less32bytes):
126 # ifndef USE_AS_MEMMOVE
127 cmp %dl, %al
128 jb L(bk_write)
129 # endif
130 add %ecx, %edx
131 add %ecx, %eax
132 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
133 # ifndef USE_AS_MEMMOVE
134 .p2align 4
135 L(bk_write):
136 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
137 # endif
138
139 .p2align 4
140 L(48bytesormore):
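/* At least 48 bytes.  Copy the first 16 bytes of the destination now
   (memcpy), or for memmove only load them into %xmm0; the memmove store
   happens later at the selected shl_* entry.  Then round %edx up to the
   next 16-byte boundary: %edi temporarily holds minus the number of head
   bytes already covered, and %ecx and %eax are adjusted to match so the
   main loops can use aligned 16-byte stores.  */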
141 # ifndef USE_AS_MEMMOVE
142 movlpd (%eax), %xmm0
143 movlpd 8(%eax), %xmm1
144 movlpd %xmm0, (%edx)
145 movlpd %xmm1, 8(%edx)
146 # else
147 movdqu (%eax), %xmm0
148 # endif
149 PUSH (%edi)
150 movl %edx, %edi
151 and $-16, %edx
152 add $16, %edx
153 sub %edx, %edi
154 add %edi, %ecx
155 sub %edi, %eax
156
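/* Choose between the cached copy loops and the large-copy path: the
   remaining length is compared with half of the shared cache size, either
   the compile-time SHARED_CACHE_SIZE_HALF constant or the runtime
   __x86_shared_cache_size_half value (read through the GOT in PIC builds).
   Copies at least that large branch to L(large_page); smaller ones are
   dispatched on the low four bits of the adjusted source address to one of
   the shl_0..shl_15 loops.  */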
157 # ifdef SHARED_CACHE_SIZE_HALF
158 cmp $SHARED_CACHE_SIZE_HALF, %ecx
159 # else
160 # ifdef SHARED
161 SETUP_PIC_REG(bx)
162 add $_GLOBAL_OFFSET_TABLE_, %ebx
163 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
164 # else
165 cmp __x86_shared_cache_size_half, %ecx
166 # endif
167 # endif
168
169 mov %eax, %edi
170 jae L(large_page)
171 and $0xf, %edi
172 jz L(shl_0)
173 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
174
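/* shl_0: source and destination are mutually 16-byte aligned.  Up to 127
   remaining bytes are handled by the unrolled 32-bytes-per-iteration loop
   below; larger copies go to shl_0_gobble, which uses a
   128-bytes-per-iteration loop and adds prefetcht0 hints when the
   remaining length does not fit in half the data cache.  */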
175 .p2align 4
176 L(shl_0):
177 # ifdef USE_AS_MEMMOVE
178 movl DEST+4(%esp), %edi
179 movdqu %xmm0, (%edi)
180 # endif
181 xor %edi, %edi
182 cmp $127, %ecx
183 ja L(shl_0_gobble)
184 lea -32(%ecx), %ecx
185
186 .p2align 4
187 L(shl_0_loop):
188 movdqa (%eax, %edi), %xmm0
189 movdqa 16(%eax, %edi), %xmm1
190 sub $32, %ecx
191 movdqa %xmm0, (%edx, %edi)
192 movdqa %xmm1, 16(%edx, %edi)
193 lea 32(%edi), %edi
194 jb L(shl_0_end)
195
196 movdqa (%eax, %edi), %xmm0
197 movdqa 16(%eax, %edi), %xmm1
198 sub $32, %ecx
199 movdqa %xmm0, (%edx, %edi)
200 movdqa %xmm1, 16(%edx, %edi)
201 lea 32(%edi), %edi
202 jb L(shl_0_end)
203
204 movdqa (%eax, %edi), %xmm0
205 movdqa 16(%eax, %edi), %xmm1
206 sub $32, %ecx
207 movdqa %xmm0, (%edx, %edi)
208 movdqa %xmm1, 16(%edx, %edi)
209 lea 32(%edi), %edi
210 jb L(shl_0_end)
211
212 movdqa (%eax, %edi), %xmm0
213 movdqa 16(%eax, %edi), %xmm1
214 sub $32, %ecx
215 movdqa %xmm0, (%edx, %edi)
216 movdqa %xmm1, 16(%edx, %edi)
217 lea 32(%edi), %edi
218
219 L(shl_0_end):
220 lea 32(%ecx), %ecx
221 add %ecx, %edi
222 add %edi, %edx
223 add %edi, %eax
224 POP (%edi)
225 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
226
227 CFI_PUSH (%edi)
228
229 .p2align 4
230 L(shl_0_gobble):
231 # ifdef DATA_CACHE_SIZE_HALF
232 cmp $DATA_CACHE_SIZE_HALF, %ecx
233 # else
234 # ifdef SHARED
235 SETUP_PIC_REG(bx)
236 add $_GLOBAL_OFFSET_TABLE_, %ebx
237 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
238 # else
239 cmp __x86_data_cache_size_half, %ecx
240 # endif
241 # endif
242 POP (%edi)
243 lea -128(%ecx), %ecx
244 jae L(shl_0_gobble_mem_loop)
245
246 .p2align 4
247 L(shl_0_gobble_cache_loop):
248 movdqa (%eax), %xmm0
249 movdqa 0x10(%eax), %xmm1
250 movdqa 0x20(%eax), %xmm2
251 movdqa 0x30(%eax), %xmm3
252 movdqa 0x40(%eax), %xmm4
253 movdqa 0x50(%eax), %xmm5
254 movdqa 0x60(%eax), %xmm6
255 movdqa 0x70(%eax), %xmm7
256 lea 0x80(%eax), %eax
257 sub $128, %ecx
258 movdqa %xmm0, (%edx)
259 movdqa %xmm1, 0x10(%edx)
260 movdqa %xmm2, 0x20(%edx)
261 movdqa %xmm3, 0x30(%edx)
262 movdqa %xmm4, 0x40(%edx)
263 movdqa %xmm5, 0x50(%edx)
264 movdqa %xmm6, 0x60(%edx)
265 movdqa %xmm7, 0x70(%edx)
266 lea 0x80(%edx), %edx
267
268 jae L(shl_0_gobble_cache_loop)
269 cmp $-0x40, %ecx
270 lea 0x80(%ecx), %ecx
271 jl L(shl_0_cache_less_64bytes)
272
273 movdqa (%eax), %xmm0
274 sub $0x40, %ecx
275 movdqa 0x10(%eax), %xmm1
276 movdqa %xmm0, (%edx)
277 movdqa %xmm1, 0x10(%edx)
278 movdqa 0x20(%eax), %xmm0
279 movdqa 0x30(%eax), %xmm1
280 add $0x40, %eax
281 movdqa %xmm0, 0x20(%edx)
282 movdqa %xmm1, 0x30(%edx)
283 add $0x40, %edx
284
285 L(shl_0_cache_less_64bytes):
286 cmp $0x20, %ecx
287 jb L(shl_0_cache_less_32bytes)
288 movdqa (%eax), %xmm0
289 sub $0x20, %ecx
290 movdqa 0x10(%eax), %xmm1
291 add $0x20, %eax
292 movdqa %xmm0, (%edx)
293 movdqa %xmm1, 0x10(%edx)
294 add $0x20, %edx
295
296 L(shl_0_cache_less_32bytes):
297 cmp $0x10, %ecx
298 jb L(shl_0_cache_less_16bytes)
299 sub $0x10, %ecx
300 movdqa (%eax), %xmm0
301 add $0x10, %eax
302 movdqa %xmm0, (%edx)
303 add $0x10, %edx
304
305 L(shl_0_cache_less_16bytes):
306 add %ecx, %edx
307 add %ecx, %eax
308 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
309
310 .p2align 4
311 L(shl_0_gobble_mem_loop):
312 prefetcht0 0x1c0(%eax)
313 prefetcht0 0x280(%eax)
314 prefetcht0 0x1c0(%edx)
315
316 movdqa (%eax), %xmm0
317 movdqa 0x10(%eax), %xmm1
318 movdqa 0x20(%eax), %xmm2
319 movdqa 0x30(%eax), %xmm3
320 movdqa 0x40(%eax), %xmm4
321 movdqa 0x50(%eax), %xmm5
322 movdqa 0x60(%eax), %xmm6
323 movdqa 0x70(%eax), %xmm7
324 lea 0x80(%eax), %eax
325 sub $0x80, %ecx
326 movdqa %xmm0, (%edx)
327 movdqa %xmm1, 0x10(%edx)
328 movdqa %xmm2, 0x20(%edx)
329 movdqa %xmm3, 0x30(%edx)
330 movdqa %xmm4, 0x40(%edx)
331 movdqa %xmm5, 0x50(%edx)
332 movdqa %xmm6, 0x60(%edx)
333 movdqa %xmm7, 0x70(%edx)
334 lea 0x80(%edx), %edx
335
336 jae L(shl_0_gobble_mem_loop)
337 cmp $-0x40, %ecx
338 lea 0x80(%ecx), %ecx
339 jl L(shl_0_mem_less_64bytes)
340
341 movdqa (%eax), %xmm0
342 sub $0x40, %ecx
343 movdqa 0x10(%eax), %xmm1
344
345 movdqa %xmm0, (%edx)
346 movdqa %xmm1, 0x10(%edx)
347
348 movdqa 0x20(%eax), %xmm0
349 movdqa 0x30(%eax), %xmm1
350 add $0x40, %eax
351
352 movdqa %xmm0, 0x20(%edx)
353 movdqa %xmm1, 0x30(%edx)
354 add $0x40, %edx
355
356 L(shl_0_mem_less_64bytes):
357 cmp $0x20, %ecx
358 jb L(shl_0_mem_less_32bytes)
359 movdqa (%eax), %xmm0
360 sub $0x20, %ecx
361 movdqa 0x10(%eax), %xmm1
362 add $0x20, %eax
363 movdqa %xmm0, (%edx)
364 movdqa %xmm1, 0x10(%edx)
365 add $0x20, %edx
366
367 L(shl_0_mem_less_32bytes):
368 cmp $0x10, %ecx
369 jb L(shl_0_mem_less_16bytes)
370 sub $0x10, %ecx
371 movdqa (%eax), %xmm0
372 add $0x10, %eax
373 movdqa %xmm0, (%edx)
374 add $0x10, %edx
375
376 L(shl_0_mem_less_16bytes):
377 add %ecx, %edx
378 add %ecx, %eax
379 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
380
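/* shl_N, N = 1..15: the source is N bytes past a 16-byte boundary while
   the destination is now 16-byte aligned.  Each variant reads aligned
   16-byte chunks starting at src - N and splices neighbouring chunks with
   palignr $N, so every load and store stays aligned.  When the remaining
   length is at least half the data cache size the prefetcht0 loop is used,
   otherwise the plain sh_N_no_prefetch loop.  In the memmove build each
   entry first stores the 16 head bytes saved in %xmm0 to the original
   destination.  */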
381 .p2align 4
382 L(shl_1):
383 # ifndef USE_AS_MEMMOVE
384 movaps -1(%eax), %xmm1
385 # else
386 movl DEST+4(%esp), %edi
387 movaps -1(%eax), %xmm1
388 movdqu %xmm0, (%edi)
389 # endif
390 # ifdef DATA_CACHE_SIZE_HALF
391 cmp $DATA_CACHE_SIZE_HALF, %ecx
392 # else
393 # ifdef SHARED
394 SETUP_PIC_REG(bx)
395 add $_GLOBAL_OFFSET_TABLE_, %ebx
396 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
397 # else
398 cmp __x86_data_cache_size_half, %ecx
399 # endif
400 # endif
401 jb L(sh_1_no_prefetch)
402
403 lea -64(%ecx), %ecx
404
405 .p2align 4
406 L(Shl1LoopStart):
407 prefetcht0 0x1c0(%eax)
408 prefetcht0 0x1c0(%edx)
409 movaps 15(%eax), %xmm2
410 movaps 31(%eax), %xmm3
411 movaps 47(%eax), %xmm4
412 movaps 63(%eax), %xmm5
413 movaps %xmm5, %xmm7
414 palignr $1, %xmm4, %xmm5
415 palignr $1, %xmm3, %xmm4
416 movaps %xmm5, 48(%edx)
417 palignr $1, %xmm2, %xmm3
418 lea 64(%eax), %eax
419 palignr $1, %xmm1, %xmm2
420 movaps %xmm4, 32(%edx)
421 movaps %xmm3, 16(%edx)
422 movaps %xmm7, %xmm1
423 movaps %xmm2, (%edx)
424 lea 64(%edx), %edx
425 sub $64, %ecx
426 ja L(Shl1LoopStart)
427
428 L(Shl1LoopLeave):
429 add $32, %ecx
430 jle L(shl_end_0)
431
432 movaps 15(%eax), %xmm2
433 movaps 31(%eax), %xmm3
434 palignr $1, %xmm2, %xmm3
435 palignr $1, %xmm1, %xmm2
436 movaps %xmm2, (%edx)
437 movaps %xmm3, 16(%edx)
438 lea 32(%edx, %ecx), %edx
439 lea 32(%eax, %ecx), %eax
440 POP (%edi)
441 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
442
443 CFI_PUSH (%edi)
444
445 .p2align 4
446 L(sh_1_no_prefetch):
447 lea -32(%ecx), %ecx
448 lea -1(%eax), %eax
449 xor %edi, %edi
450
451 .p2align 4
452 L(sh_1_no_prefetch_loop):
453 movdqa 16(%eax, %edi), %xmm2
454 sub $32, %ecx
455 movdqa 32(%eax, %edi), %xmm3
456 movdqa %xmm3, %xmm4
457 palignr $1, %xmm2, %xmm3
458 palignr $1, %xmm1, %xmm2
459 lea 32(%edi), %edi
460 movdqa %xmm2, -32(%edx, %edi)
461 movdqa %xmm3, -16(%edx, %edi)
462 jb L(sh_1_end_no_prefetch_loop)
463
464 movdqa 16(%eax, %edi), %xmm2
465 sub $32, %ecx
466 movdqa 32(%eax, %edi), %xmm3
467 movdqa %xmm3, %xmm1
468 palignr $1, %xmm2, %xmm3
469 palignr $1, %xmm4, %xmm2
470 lea 32(%edi), %edi
471 movdqa %xmm2, -32(%edx, %edi)
472 movdqa %xmm3, -16(%edx, %edi)
473 jae L(sh_1_no_prefetch_loop)
474
475 L(sh_1_end_no_prefetch_loop):
476 lea 32(%ecx), %ecx
477 add %ecx, %edi
478 add %edi, %edx
479 lea 1(%edi, %eax), %eax
480 POP (%edi)
481 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
482
483 CFI_PUSH (%edi)
484
485 .p2align 4
486 L(shl_2):
487 # ifndef USE_AS_MEMMOVE
488 movaps -2(%eax), %xmm1
489 # else
490 movl DEST+4(%esp), %edi
491 movaps -2(%eax), %xmm1
492 movdqu %xmm0, (%edi)
493 # endif
494 # ifdef DATA_CACHE_SIZE_HALF
495 cmp $DATA_CACHE_SIZE_HALF, %ecx
496 # else
497 # ifdef SHARED
498 SETUP_PIC_REG(bx)
499 add $_GLOBAL_OFFSET_TABLE_, %ebx
500 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
501 # else
502 cmp __x86_data_cache_size_half, %ecx
503 # endif
504 # endif
505 jb L(sh_2_no_prefetch)
506
507 lea -64(%ecx), %ecx
508
509 .p2align 4
510 L(Shl2LoopStart):
511 prefetcht0 0x1c0(%eax)
512 prefetcht0 0x1c0(%edx)
513 movaps 14(%eax), %xmm2
514 movaps 30(%eax), %xmm3
515 movaps 46(%eax), %xmm4
516 movaps 62(%eax), %xmm5
517 movaps %xmm5, %xmm7
518 palignr $2, %xmm4, %xmm5
519 palignr $2, %xmm3, %xmm4
520 movaps %xmm5, 48(%edx)
521 palignr $2, %xmm2, %xmm3
522 lea 64(%eax), %eax
523 palignr $2, %xmm1, %xmm2
524 movaps %xmm4, 32(%edx)
525 movaps %xmm3, 16(%edx)
526 movaps %xmm7, %xmm1
527 movaps %xmm2, (%edx)
528 lea 64(%edx), %edx
529 sub $64, %ecx
530 ja L(Shl2LoopStart)
531
532 L(Shl2LoopLeave):
533 add $32, %ecx
534 jle L(shl_end_0)
535
536 movaps 14(%eax), %xmm2
537 movaps 30(%eax), %xmm3
538 palignr $2, %xmm2, %xmm3
539 palignr $2, %xmm1, %xmm2
540 movaps %xmm2, (%edx)
541 movaps %xmm3, 16(%edx)
542 lea 32(%edx, %ecx), %edx
543 lea 32(%eax, %ecx), %eax
544 POP (%edi)
545 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
546
547 CFI_PUSH (%edi)
548
549 .p2align 4
550 L(sh_2_no_prefetch):
551 lea -32(%ecx), %ecx
552 lea -2(%eax), %eax
553 xor %edi, %edi
554
555 .p2align 4
556 L(sh_2_no_prefetch_loop):
557 movdqa 16(%eax, %edi), %xmm2
558 sub $32, %ecx
559 movdqa 32(%eax, %edi), %xmm3
560 movdqa %xmm3, %xmm4
561 palignr $2, %xmm2, %xmm3
562 palignr $2, %xmm1, %xmm2
563 lea 32(%edi), %edi
564 movdqa %xmm2, -32(%edx, %edi)
565 movdqa %xmm3, -16(%edx, %edi)
566 jb L(sh_2_end_no_prefetch_loop)
567
568 movdqa 16(%eax, %edi), %xmm2
569 sub $32, %ecx
570 movdqa 32(%eax, %edi), %xmm3
571 movdqa %xmm3, %xmm1
572 palignr $2, %xmm2, %xmm3
573 palignr $2, %xmm4, %xmm2
574 lea 32(%edi), %edi
575 movdqa %xmm2, -32(%edx, %edi)
576 movdqa %xmm3, -16(%edx, %edi)
577 jae L(sh_2_no_prefetch_loop)
578
579 L(sh_2_end_no_prefetch_loop):
580 lea 32(%ecx), %ecx
581 add %ecx, %edi
582 add %edi, %edx
583 lea 2(%edi, %eax), %eax
584 POP (%edi)
585 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
586
587 CFI_PUSH (%edi)
588
589 .p2align 4
590 L(shl_3):
591 # ifndef USE_AS_MEMMOVE
592 movaps -3(%eax), %xmm1
593 # else
594 movl DEST+4(%esp), %edi
595 movaps -3(%eax), %xmm1
596 movdqu %xmm0, (%edi)
597 # endif
598 # ifdef DATA_CACHE_SIZE_HALF
599 cmp $DATA_CACHE_SIZE_HALF, %ecx
600 # else
601 # ifdef SHARED
602 SETUP_PIC_REG(bx)
603 add $_GLOBAL_OFFSET_TABLE_, %ebx
604 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
605 # else
606 cmp __x86_data_cache_size_half, %ecx
607 # endif
608 # endif
609 jb L(sh_3_no_prefetch)
610
611 lea -64(%ecx), %ecx
612
613 .p2align 4
614 L(Shl3LoopStart):
615 prefetcht0 0x1c0(%eax)
616 prefetcht0 0x1c0(%edx)
617 movaps 13(%eax), %xmm2
618 movaps 29(%eax), %xmm3
619 movaps 45(%eax), %xmm4
620 movaps 61(%eax), %xmm5
621 movaps %xmm5, %xmm7
622 palignr $3, %xmm4, %xmm5
623 palignr $3, %xmm3, %xmm4
624 movaps %xmm5, 48(%edx)
625 palignr $3, %xmm2, %xmm3
626 lea 64(%eax), %eax
627 palignr $3, %xmm1, %xmm2
628 movaps %xmm4, 32(%edx)
629 movaps %xmm3, 16(%edx)
630 movaps %xmm7, %xmm1
631 movaps %xmm2, (%edx)
632 lea 64(%edx), %edx
633 sub $64, %ecx
634 ja L(Shl3LoopStart)
635
636 L(Shl3LoopLeave):
637 add $32, %ecx
638 jle L(shl_end_0)
639
640 movaps 13(%eax), %xmm2
641 movaps 29(%eax), %xmm3
642 palignr $3, %xmm2, %xmm3
643 palignr $3, %xmm1, %xmm2
644 movaps %xmm2, (%edx)
645 movaps %xmm3, 16(%edx)
646 lea 32(%edx, %ecx), %edx
647 lea 32(%eax, %ecx), %eax
648 POP (%edi)
649 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
650
651 CFI_PUSH (%edi)
652
653 .p2align 4
654 L(sh_3_no_prefetch):
655 lea -32(%ecx), %ecx
656 lea -3(%eax), %eax
657 xor %edi, %edi
658
659 .p2align 4
660 L(sh_3_no_prefetch_loop):
661 movdqa 16(%eax, %edi), %xmm2
662 sub $32, %ecx
663 movdqa 32(%eax, %edi), %xmm3
664 movdqa %xmm3, %xmm4
665 palignr $3, %xmm2, %xmm3
666 palignr $3, %xmm1, %xmm2
667 lea 32(%edi), %edi
668 movdqa %xmm2, -32(%edx, %edi)
669 movdqa %xmm3, -16(%edx, %edi)
670
671 jb L(sh_3_end_no_prefetch_loop)
672
673 movdqa 16(%eax, %edi), %xmm2
674 sub $32, %ecx
675 movdqa 32(%eax, %edi), %xmm3
676 movdqa %xmm3, %xmm1
677 palignr $3, %xmm2, %xmm3
678 palignr $3, %xmm4, %xmm2
679 lea 32(%edi), %edi
680 movdqa %xmm2, -32(%edx, %edi)
681 movdqa %xmm3, -16(%edx, %edi)
682
683 jae L(sh_3_no_prefetch_loop)
684
685 L(sh_3_end_no_prefetch_loop):
686 lea 32(%ecx), %ecx
687 add %ecx, %edi
688 add %edi, %edx
689 lea 3(%edi, %eax), %eax
690 POP (%edi)
691 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
692
693 CFI_PUSH (%edi)
694
695 .p2align 4
696 L(shl_4):
697 # ifndef USE_AS_MEMMOVE
698 movaps -4(%eax), %xmm1
699 # else
700 movl DEST+4(%esp), %edi
701 movaps -4(%eax), %xmm1
702 movdqu %xmm0, (%edi)
703 # endif
704 # ifdef DATA_CACHE_SIZE_HALF
705 cmp $DATA_CACHE_SIZE_HALF, %ecx
706 # else
707 # ifdef SHARED
708 SETUP_PIC_REG(bx)
709 add $_GLOBAL_OFFSET_TABLE_, %ebx
710 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
711 # else
712 cmp __x86_data_cache_size_half, %ecx
713 # endif
714 # endif
715 jb L(sh_4_no_prefetch)
716
717 lea -64(%ecx), %ecx
718
719 .p2align 4
720 L(Shl4LoopStart):
721 prefetcht0 0x1c0(%eax)
722 prefetcht0 0x1c0(%edx)
723 movaps 12(%eax), %xmm2
724 movaps 28(%eax), %xmm3
725 movaps 44(%eax), %xmm4
726 movaps 60(%eax), %xmm5
727 movaps %xmm5, %xmm7
728 palignr $4, %xmm4, %xmm5
729 palignr $4, %xmm3, %xmm4
730 movaps %xmm5, 48(%edx)
731 palignr $4, %xmm2, %xmm3
732 lea 64(%eax), %eax
733 palignr $4, %xmm1, %xmm2
734 movaps %xmm4, 32(%edx)
735 movaps %xmm3, 16(%edx)
736 movaps %xmm7, %xmm1
737 movaps %xmm2, (%edx)
738 lea 64(%edx), %edx
739 sub $64, %ecx
740 ja L(Shl4LoopStart)
741
742 L(Shl4LoopLeave):
743 add $32, %ecx
744 jle L(shl_end_0)
745
746 movaps 12(%eax), %xmm2
747 movaps 28(%eax), %xmm3
748 palignr $4, %xmm2, %xmm3
749 palignr $4, %xmm1, %xmm2
750 movaps %xmm2, (%edx)
751 movaps %xmm3, 16(%edx)
752 lea 32(%edx, %ecx), %edx
753 lea 32(%eax, %ecx), %eax
754 POP (%edi)
755 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
756
757 CFI_PUSH (%edi)
758
759 .p2align 4
760 L(sh_4_no_prefetch):
761 lea -32(%ecx), %ecx
762 lea -4(%eax), %eax
763 xor %edi, %edi
764
765 .p2align 4
766 L(sh_4_no_prefetch_loop):
767 movdqa 16(%eax, %edi), %xmm2
768 sub $32, %ecx
769 movdqa 32(%eax, %edi), %xmm3
770 movdqa %xmm3, %xmm4
771 palignr $4, %xmm2, %xmm3
772 palignr $4, %xmm1, %xmm2
773 lea 32(%edi), %edi
774 movdqa %xmm2, -32(%edx, %edi)
775 movdqa %xmm3, -16(%edx, %edi)
776
777 jb L(sh_4_end_no_prefetch_loop)
778
779 movdqa 16(%eax, %edi), %xmm2
780 sub $32, %ecx
781 movdqa 32(%eax, %edi), %xmm3
782 movdqa %xmm3, %xmm1
783 palignr $4, %xmm2, %xmm3
784 palignr $4, %xmm4, %xmm2
785 lea 32(%edi), %edi
786 movdqa %xmm2, -32(%edx, %edi)
787 movdqa %xmm3, -16(%edx, %edi)
788
789 jae L(sh_4_no_prefetch_loop)
790
791 L(sh_4_end_no_prefetch_loop):
792 lea 32(%ecx), %ecx
793 add %ecx, %edi
794 add %edi, %edx
795 lea 4(%edi, %eax), %eax
796 POP (%edi)
797 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
798
799 CFI_PUSH (%edi)
800
801 .p2align 4
802 L(shl_5):
803 # ifndef USE_AS_MEMMOVE
804 movaps -5(%eax), %xmm1
805 # else
806 movl DEST+4(%esp), %edi
807 movaps -5(%eax), %xmm1
808 movdqu %xmm0, (%edi)
809 # endif
810 # ifdef DATA_CACHE_SIZE_HALF
811 cmp $DATA_CACHE_SIZE_HALF, %ecx
812 # else
813 # ifdef SHARED
814 SETUP_PIC_REG(bx)
815 add $_GLOBAL_OFFSET_TABLE_, %ebx
816 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
817 # else
818 cmp __x86_data_cache_size_half, %ecx
819 # endif
820 # endif
821 jb L(sh_5_no_prefetch)
822
823 lea -64(%ecx), %ecx
824
825 .p2align 4
826 L(Shl5LoopStart):
827 prefetcht0 0x1c0(%eax)
828 prefetcht0 0x1c0(%edx)
829 movaps 11(%eax), %xmm2
830 movaps 27(%eax), %xmm3
831 movaps 43(%eax), %xmm4
832 movaps 59(%eax), %xmm5
833 movaps %xmm5, %xmm7
834 palignr $5, %xmm4, %xmm5
835 palignr $5, %xmm3, %xmm4
836 movaps %xmm5, 48(%edx)
837 palignr $5, %xmm2, %xmm3
838 lea 64(%eax), %eax
839 palignr $5, %xmm1, %xmm2
840 movaps %xmm4, 32(%edx)
841 movaps %xmm3, 16(%edx)
842 movaps %xmm7, %xmm1
843 movaps %xmm2, (%edx)
844 lea 64(%edx), %edx
845 sub $64, %ecx
846 ja L(Shl5LoopStart)
847
848 L(Shl5LoopLeave):
849 add $32, %ecx
850 jle L(shl_end_0)
851
852 movaps 11(%eax), %xmm2
853 movaps 27(%eax), %xmm3
854 palignr $5, %xmm2, %xmm3
855 palignr $5, %xmm1, %xmm2
856 movaps %xmm2, (%edx)
857 movaps %xmm3, 16(%edx)
858 lea 32(%edx, %ecx), %edx
859 lea 32(%eax, %ecx), %eax
860 POP (%edi)
861 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
862
863 CFI_PUSH (%edi)
864
865 .p2align 4
866 L(sh_5_no_prefetch):
867 lea -32(%ecx), %ecx
868 lea -5(%eax), %eax
869 xor %edi, %edi
870
871 .p2align 4
872 L(sh_5_no_prefetch_loop):
873 movdqa 16(%eax, %edi), %xmm2
874 sub $32, %ecx
875 movdqa 32(%eax, %edi), %xmm3
876 movdqa %xmm3, %xmm4
877 palignr $5, %xmm2, %xmm3
878 palignr $5, %xmm1, %xmm2
879 lea 32(%edi), %edi
880 movdqa %xmm2, -32(%edx, %edi)
881 movdqa %xmm3, -16(%edx, %edi)
882
883 jb L(sh_5_end_no_prefetch_loop)
884
885 movdqa 16(%eax, %edi), %xmm2
886 sub $32, %ecx
887 movdqa 32(%eax, %edi), %xmm3
888 movdqa %xmm3, %xmm1
889 palignr $5, %xmm2, %xmm3
890 palignr $5, %xmm4, %xmm2
891 lea 32(%edi), %edi
892 movdqa %xmm2, -32(%edx, %edi)
893 movdqa %xmm3, -16(%edx, %edi)
894
895 jae L(sh_5_no_prefetch_loop)
896
897 L(sh_5_end_no_prefetch_loop):
898 lea 32(%ecx), %ecx
899 add %ecx, %edi
900 add %edi, %edx
901 lea 5(%edi, %eax), %eax
902 POP (%edi)
903 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
904
905 CFI_PUSH (%edi)
906
907 .p2align 4
908 L(shl_6):
909 # ifndef USE_AS_MEMMOVE
910 movaps -6(%eax), %xmm1
911 # else
912 movl DEST+4(%esp), %edi
913 movaps -6(%eax), %xmm1
914 movdqu %xmm0, (%edi)
915 # endif
916 # ifdef DATA_CACHE_SIZE_HALF
917 cmp $DATA_CACHE_SIZE_HALF, %ecx
918 # else
919 # ifdef SHARED
920 SETUP_PIC_REG(bx)
921 add $_GLOBAL_OFFSET_TABLE_, %ebx
922 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
923 # else
924 cmp __x86_data_cache_size_half, %ecx
925 # endif
926 # endif
927 jb L(sh_6_no_prefetch)
928
929 lea -64(%ecx), %ecx
930
931 .p2align 4
932 L(Shl6LoopStart):
933 prefetcht0 0x1c0(%eax)
934 prefetcht0 0x1c0(%edx)
935 movaps 10(%eax), %xmm2
936 movaps 26(%eax), %xmm3
937 movaps 42(%eax), %xmm4
938 movaps 58(%eax), %xmm5
939 movaps %xmm5, %xmm7
940 palignr $6, %xmm4, %xmm5
941 palignr $6, %xmm3, %xmm4
942 movaps %xmm5, 48(%edx)
943 palignr $6, %xmm2, %xmm3
944 lea 64(%eax), %eax
945 palignr $6, %xmm1, %xmm2
946 movaps %xmm4, 32(%edx)
947 movaps %xmm3, 16(%edx)
948 movaps %xmm7, %xmm1
949 movaps %xmm2, (%edx)
950 lea 64(%edx), %edx
951 sub $64, %ecx
952 ja L(Shl6LoopStart)
953
954 L(Shl6LoopLeave):
955 add $32, %ecx
956 jle L(shl_end_0)
957
958 movaps 10(%eax), %xmm2
959 movaps 26(%eax), %xmm3
960 palignr $6, %xmm2, %xmm3
961 palignr $6, %xmm1, %xmm2
962 movaps %xmm2, (%edx)
963 movaps %xmm3, 16(%edx)
964 lea 32(%edx, %ecx), %edx
965 lea 32(%eax, %ecx), %eax
966 POP (%edi)
967 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
968
969 CFI_PUSH (%edi)
970
971 .p2align 4
972 L(sh_6_no_prefetch):
973 lea -32(%ecx), %ecx
974 lea -6(%eax), %eax
975 xor %edi, %edi
976
977 .p2align 4
978 L(sh_6_no_prefetch_loop):
979 movdqa 16(%eax, %edi), %xmm2
980 sub $32, %ecx
981 movdqa 32(%eax, %edi), %xmm3
982 movdqa %xmm3, %xmm4
983 palignr $6, %xmm2, %xmm3
984 palignr $6, %xmm1, %xmm2
985 lea 32(%edi), %edi
986 movdqa %xmm2, -32(%edx, %edi)
987 movdqa %xmm3, -16(%edx, %edi)
988
989 jb L(sh_6_end_no_prefetch_loop)
990
991 movdqa 16(%eax, %edi), %xmm2
992 sub $32, %ecx
993 movdqa 32(%eax, %edi), %xmm3
994 movdqa %xmm3, %xmm1
995 palignr $6, %xmm2, %xmm3
996 palignr $6, %xmm4, %xmm2
997 lea 32(%edi), %edi
998 movdqa %xmm2, -32(%edx, %edi)
999 movdqa %xmm3, -16(%edx, %edi)
1000
1001 jae L(sh_6_no_prefetch_loop)
1002
1003 L(sh_6_end_no_prefetch_loop):
1004 lea 32(%ecx), %ecx
1005 add %ecx, %edi
1006 add %edi, %edx
1007 lea 6(%edi, %eax), %eax
1008 POP (%edi)
1009 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1010
1011 CFI_PUSH (%edi)
1012
1013 .p2align 4
1014 L(shl_7):
1015 # ifndef USE_AS_MEMMOVE
1016 movaps -7(%eax), %xmm1
1017 # else
1018 movl DEST+4(%esp), %edi
1019 movaps -7(%eax), %xmm1
1020 movdqu %xmm0, (%edi)
1021 # endif
1022 # ifdef DATA_CACHE_SIZE_HALF
1023 cmp $DATA_CACHE_SIZE_HALF, %ecx
1024 # else
1025 # ifdef SHARED
1026 SETUP_PIC_REG(bx)
1027 add $_GLOBAL_OFFSET_TABLE_, %ebx
1028 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1029 # else
1030 cmp __x86_data_cache_size_half, %ecx
1031 # endif
1032 # endif
1033 jb L(sh_7_no_prefetch)
1034
1035 lea -64(%ecx), %ecx
1036
1037 .p2align 4
1038 L(Shl7LoopStart):
1039 prefetcht0 0x1c0(%eax)
1040 prefetcht0 0x1c0(%edx)
1041 movaps 9(%eax), %xmm2
1042 movaps 25(%eax), %xmm3
1043 movaps 41(%eax), %xmm4
1044 movaps 57(%eax), %xmm5
1045 movaps %xmm5, %xmm7
1046 palignr $7, %xmm4, %xmm5
1047 palignr $7, %xmm3, %xmm4
1048 movaps %xmm5, 48(%edx)
1049 palignr $7, %xmm2, %xmm3
1050 lea 64(%eax), %eax
1051 palignr $7, %xmm1, %xmm2
1052 movaps %xmm4, 32(%edx)
1053 movaps %xmm3, 16(%edx)
1054 movaps %xmm7, %xmm1
1055 movaps %xmm2, (%edx)
1056 lea 64(%edx), %edx
1057 sub $64, %ecx
1058 ja L(Shl7LoopStart)
1059
1060 L(Shl7LoopLeave):
1061 add $32, %ecx
1062 jle L(shl_end_0)
1063
1064 movaps 9(%eax), %xmm2
1065 movaps 25(%eax), %xmm3
1066 palignr $7, %xmm2, %xmm3
1067 palignr $7, %xmm1, %xmm2
1068 movaps %xmm2, (%edx)
1069 movaps %xmm3, 16(%edx)
1070 lea 32(%edx, %ecx), %edx
1071 lea 32(%eax, %ecx), %eax
1072 POP (%edi)
1073 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1074
1075 CFI_PUSH (%edi)
1076
1077 .p2align 4
1078 L(sh_7_no_prefetch):
1079 lea -32(%ecx), %ecx
1080 lea -7(%eax), %eax
1081 xor %edi, %edi
1082
1083 .p2align 4
1084 L(sh_7_no_prefetch_loop):
1085 movdqa 16(%eax, %edi), %xmm2
1086 sub $32, %ecx
1087 movdqa 32(%eax, %edi), %xmm3
1088 movdqa %xmm3, %xmm4
1089 palignr $7, %xmm2, %xmm3
1090 palignr $7, %xmm1, %xmm2
1091 lea 32(%edi), %edi
1092 movdqa %xmm2, -32(%edx, %edi)
1093 movdqa %xmm3, -16(%edx, %edi)
1094 jb L(sh_7_end_no_prefetch_loop)
1095
1096 movdqa 16(%eax, %edi), %xmm2
1097 sub $32, %ecx
1098 movdqa 32(%eax, %edi), %xmm3
1099 movdqa %xmm3, %xmm1
1100 palignr $7, %xmm2, %xmm3
1101 palignr $7, %xmm4, %xmm2
1102 lea 32(%edi), %edi
1103 movdqa %xmm2, -32(%edx, %edi)
1104 movdqa %xmm3, -16(%edx, %edi)
1105 jae L(sh_7_no_prefetch_loop)
1106
1107 L(sh_7_end_no_prefetch_loop):
1108 lea 32(%ecx), %ecx
1109 add %ecx, %edi
1110 add %edi, %edx
1111 lea 7(%edi, %eax), %eax
1112 POP (%edi)
1113 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1114
1115 CFI_PUSH (%edi)
1116
1117 .p2align 4
1118 L(shl_8):
1119 # ifndef USE_AS_MEMMOVE
1120 movaps -8(%eax), %xmm1
1121 # else
1122 movl DEST+4(%esp), %edi
1123 movaps -8(%eax), %xmm1
1124 movdqu %xmm0, (%edi)
1125 # endif
1126 # ifdef DATA_CACHE_SIZE_HALF
1127 cmp $DATA_CACHE_SIZE_HALF, %ecx
1128 # else
1129 # ifdef SHARED
1130 SETUP_PIC_REG(bx)
1131 add $_GLOBAL_OFFSET_TABLE_, %ebx
1132 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1133 # else
1134 cmp __x86_data_cache_size_half, %ecx
1135 # endif
1136 # endif
1137 jb L(sh_8_no_prefetch)
1138
1139 lea -64(%ecx), %ecx
1140
1141 .p2align 4
1142 L(Shl8LoopStart):
1143 prefetcht0 0x1c0(%eax)
1144 prefetcht0 0x1c0(%edx)
1145 movaps 8(%eax), %xmm2
1146 movaps 24(%eax), %xmm3
1147 movaps 40(%eax), %xmm4
1148 movaps 56(%eax), %xmm5
1149 movaps %xmm5, %xmm7
1150 palignr $8, %xmm4, %xmm5
1151 palignr $8, %xmm3, %xmm4
1152 movaps %xmm5, 48(%edx)
1153 palignr $8, %xmm2, %xmm3
1154 lea 64(%eax), %eax
1155 palignr $8, %xmm1, %xmm2
1156 movaps %xmm4, 32(%edx)
1157 movaps %xmm3, 16(%edx)
1158 movaps %xmm7, %xmm1
1159 movaps %xmm2, (%edx)
1160 lea 64(%edx), %edx
1161 sub $64, %ecx
1162 ja L(Shl8LoopStart)
1163
1164 L(LoopLeave8):
1165 add $32, %ecx
1166 jle L(shl_end_0)
1167
1168 movaps 8(%eax), %xmm2
1169 movaps 24(%eax), %xmm3
1170 palignr $8, %xmm2, %xmm3
1171 palignr $8, %xmm1, %xmm2
1172 movaps %xmm2, (%edx)
1173 movaps %xmm3, 16(%edx)
1174 lea 32(%edx, %ecx), %edx
1175 lea 32(%eax, %ecx), %eax
1176 POP (%edi)
1177 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1178
1179 CFI_PUSH (%edi)
1180
1181 .p2align 4
1182 L(sh_8_no_prefetch):
1183 lea -32(%ecx), %ecx
1184 lea -8(%eax), %eax
1185 xor %edi, %edi
1186
1187 .p2align 4
1188 L(sh_8_no_prefetch_loop):
1189 movdqa 16(%eax, %edi), %xmm2
1190 sub $32, %ecx
1191 movdqa 32(%eax, %edi), %xmm3
1192 movdqa %xmm3, %xmm4
1193 palignr $8, %xmm2, %xmm3
1194 palignr $8, %xmm1, %xmm2
1195 lea 32(%edi), %edi
1196 movdqa %xmm2, -32(%edx, %edi)
1197 movdqa %xmm3, -16(%edx, %edi)
1198 jb L(sh_8_end_no_prefetch_loop)
1199
1200 movdqa 16(%eax, %edi), %xmm2
1201 sub $32, %ecx
1202 movdqa 32(%eax, %edi), %xmm3
1203 movdqa %xmm3, %xmm1
1204 palignr $8, %xmm2, %xmm3
1205 palignr $8, %xmm4, %xmm2
1206 lea 32(%edi), %edi
1207 movdqa %xmm2, -32(%edx, %edi)
1208 movdqa %xmm3, -16(%edx, %edi)
1209 jae L(sh_8_no_prefetch_loop)
1210
1211 L(sh_8_end_no_prefetch_loop):
1212 lea 32(%ecx), %ecx
1213 add %ecx, %edi
1214 add %edi, %edx
1215 lea 8(%edi, %eax), %eax
1216 POP (%edi)
1217 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1218
1219 CFI_PUSH (%edi)
1220
1221 .p2align 4
1222 L(shl_9):
1223 # ifndef USE_AS_MEMMOVE
1224 movaps -9(%eax), %xmm1
1225 # else
1226 movl DEST+4(%esp), %edi
1227 movaps -9(%eax), %xmm1
1228 movdqu %xmm0, (%edi)
1229 # endif
1230 # ifdef DATA_CACHE_SIZE_HALF
1231 cmp $DATA_CACHE_SIZE_HALF, %ecx
1232 # else
1233 # ifdef SHARED
1234 SETUP_PIC_REG(bx)
1235 add $_GLOBAL_OFFSET_TABLE_, %ebx
1236 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1237 # else
1238 cmp __x86_data_cache_size_half, %ecx
1239 # endif
1240 # endif
1241 jb L(sh_9_no_prefetch)
1242
1243 lea -64(%ecx), %ecx
1244
1245 .p2align 4
1246 L(Shl9LoopStart):
1247 prefetcht0 0x1c0(%eax)
1248 prefetcht0 0x1c0(%edx)
1249 movaps 7(%eax), %xmm2
1250 movaps 23(%eax), %xmm3
1251 movaps 39(%eax), %xmm4
1252 movaps 55(%eax), %xmm5
1253 movaps %xmm5, %xmm7
1254 palignr $9, %xmm4, %xmm5
1255 palignr $9, %xmm3, %xmm4
1256 movaps %xmm5, 48(%edx)
1257 palignr $9, %xmm2, %xmm3
1258 lea 64(%eax), %eax
1259 palignr $9, %xmm1, %xmm2
1260 movaps %xmm4, 32(%edx)
1261 movaps %xmm3, 16(%edx)
1262 movaps %xmm7, %xmm1
1263 movaps %xmm2, (%edx)
1264 lea 64(%edx), %edx
1265 sub $64, %ecx
1266 ja L(Shl9LoopStart)
1267
1268 L(Shl9LoopLeave):
1269 add $32, %ecx
1270 jle L(shl_end_0)
1271
1272 movaps 7(%eax), %xmm2
1273 movaps 23(%eax), %xmm3
1274 palignr $9, %xmm2, %xmm3
1275 palignr $9, %xmm1, %xmm2
1276
1277 movaps %xmm2, (%edx)
1278 movaps %xmm3, 16(%edx)
1279 lea 32(%edx, %ecx), %edx
1280 lea 32(%eax, %ecx), %eax
1281 POP (%edi)
1282 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1283
1284 CFI_PUSH (%edi)
1285
1286 .p2align 4
1287 L(sh_9_no_prefetch):
1288 lea -32(%ecx), %ecx
1289 lea -9(%eax), %eax
1290 xor %edi, %edi
1291
1292 .p2align 4
1293 L(sh_9_no_prefetch_loop):
1294 movdqa 16(%eax, %edi), %xmm2
1295 sub $32, %ecx
1296 movdqa 32(%eax, %edi), %xmm3
1297 movdqa %xmm3, %xmm4
1298 palignr $9, %xmm2, %xmm3
1299 palignr $9, %xmm1, %xmm2
1300 lea 32(%edi), %edi
1301 movdqa %xmm2, -32(%edx, %edi)
1302 movdqa %xmm3, -16(%edx, %edi)
1303 jb L(sh_9_end_no_prefetch_loop)
1304
1305 movdqa 16(%eax, %edi), %xmm2
1306 sub $32, %ecx
1307 movdqa 32(%eax, %edi), %xmm3
1308 movdqa %xmm3, %xmm1
1309 palignr $9, %xmm2, %xmm3
1310 palignr $9, %xmm4, %xmm2
1311 lea 32(%edi), %edi
1312 movdqa %xmm2, -32(%edx, %edi)
1313 movdqa %xmm3, -16(%edx, %edi)
1314 jae L(sh_9_no_prefetch_loop)
1315
1316 L(sh_9_end_no_prefetch_loop):
1317 lea 32(%ecx), %ecx
1318 add %ecx, %edi
1319 add %edi, %edx
1320 lea 9(%edi, %eax), %eax
1321 POP (%edi)
1322 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1323
1324 CFI_PUSH (%edi)
1325
1326 .p2align 4
1327 L(shl_10):
1328 # ifndef USE_AS_MEMMOVE
1329 movaps -10(%eax), %xmm1
1330 # else
1331 movl DEST+4(%esp), %edi
1332 movaps -10(%eax), %xmm1
1333 movdqu %xmm0, (%edi)
1334 # endif
1335 # ifdef DATA_CACHE_SIZE_HALF
1336 cmp $DATA_CACHE_SIZE_HALF, %ecx
1337 # else
1338 # ifdef SHARED
1339 SETUP_PIC_REG(bx)
1340 add $_GLOBAL_OFFSET_TABLE_, %ebx
1341 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1342 # else
1343 cmp __x86_data_cache_size_half, %ecx
1344 # endif
1345 # endif
1346 jb L(sh_10_no_prefetch)
1347
1348 lea -64(%ecx), %ecx
1349
1350 .p2align 4
1351 L(Shl10LoopStart):
1352 prefetcht0 0x1c0(%eax)
1353 prefetcht0 0x1c0(%edx)
1354 movaps 6(%eax), %xmm2
1355 movaps 22(%eax), %xmm3
1356 movaps 38(%eax), %xmm4
1357 movaps 54(%eax), %xmm5
1358 movaps %xmm5, %xmm7
1359 palignr $10, %xmm4, %xmm5
1360 palignr $10, %xmm3, %xmm4
1361 movaps %xmm5, 48(%edx)
1362 palignr $10, %xmm2, %xmm3
1363 lea 64(%eax), %eax
1364 palignr $10, %xmm1, %xmm2
1365 movaps %xmm4, 32(%edx)
1366 movaps %xmm3, 16(%edx)
1367 movaps %xmm7, %xmm1
1368 movaps %xmm2, (%edx)
1369 lea 64(%edx), %edx
1370 sub $64, %ecx
1371 ja L(Shl10LoopStart)
1372
1373 L(Shl10LoopLeave):
1374 add $32, %ecx
1375 jle L(shl_end_0)
1376
1377 movaps 6(%eax), %xmm2
1378 movaps 22(%eax), %xmm3
1379 palignr $10, %xmm2, %xmm3
1380 palignr $10, %xmm1, %xmm2
1381
1382 movaps %xmm2, (%edx)
1383 movaps %xmm3, 16(%edx)
1384 lea 32(%edx, %ecx), %edx
1385 lea 32(%eax, %ecx), %eax
1386 POP (%edi)
1387 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1388
1389 CFI_PUSH (%edi)
1390
1391 .p2align 4
1392 L(sh_10_no_prefetch):
1393 lea -32(%ecx), %ecx
1394 lea -10(%eax), %eax
1395 xor %edi, %edi
1396
1397 .p2align 4
1398 L(sh_10_no_prefetch_loop):
1399 movdqa 16(%eax, %edi), %xmm2
1400 sub $32, %ecx
1401 movdqa 32(%eax, %edi), %xmm3
1402 movdqa %xmm3, %xmm4
1403 palignr $10, %xmm2, %xmm3
1404 palignr $10, %xmm1, %xmm2
1405 lea 32(%edi), %edi
1406 movdqa %xmm2, -32(%edx, %edi)
1407 movdqa %xmm3, -16(%edx, %edi)
1408 jb L(sh_10_end_no_prefetch_loop)
1409
1410 movdqa 16(%eax, %edi), %xmm2
1411 sub $32, %ecx
1412 movdqa 32(%eax, %edi), %xmm3
1413 movdqa %xmm3, %xmm1
1414 palignr $10, %xmm2, %xmm3
1415 palignr $10, %xmm4, %xmm2
1416 lea 32(%edi), %edi
1417 movdqa %xmm2, -32(%edx, %edi)
1418 movdqa %xmm3, -16(%edx, %edi)
1419 jae L(sh_10_no_prefetch_loop)
1420
1421 L(sh_10_end_no_prefetch_loop):
1422 lea 32(%ecx), %ecx
1423 add %ecx, %edi
1424 add %edi, %edx
1425 lea 10(%edi, %eax), %eax
1426 POP (%edi)
1427 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1428
1429 CFI_PUSH (%edi)
1430
1431 .p2align 4
1432 L(shl_11):
1433 # ifndef USE_AS_MEMMOVE
1434 movaps -11(%eax), %xmm1
1435 # else
1436 movl DEST+4(%esp), %edi
1437 movaps -11(%eax), %xmm1
1438 movdqu %xmm0, (%edi)
1439 # endif
1440 # ifdef DATA_CACHE_SIZE_HALF
1441 cmp $DATA_CACHE_SIZE_HALF, %ecx
1442 # else
1443 # ifdef SHARED
1444 SETUP_PIC_REG(bx)
1445 add $_GLOBAL_OFFSET_TABLE_, %ebx
1446 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1447 # else
1448 cmp __x86_data_cache_size_half, %ecx
1449 # endif
1450 # endif
1451 jb L(sh_11_no_prefetch)
1452
1453 lea -64(%ecx), %ecx
1454
1455 .p2align 4
1456 L(Shl11LoopStart):
1457 prefetcht0 0x1c0(%eax)
1458 prefetcht0 0x1c0(%edx)
1459 movaps 5(%eax), %xmm2
1460 movaps 21(%eax), %xmm3
1461 movaps 37(%eax), %xmm4
1462 movaps 53(%eax), %xmm5
1463 movaps %xmm5, %xmm7
1464 palignr $11, %xmm4, %xmm5
1465 palignr $11, %xmm3, %xmm4
1466 movaps %xmm5, 48(%edx)
1467 palignr $11, %xmm2, %xmm3
1468 lea 64(%eax), %eax
1469 palignr $11, %xmm1, %xmm2
1470 movaps %xmm4, 32(%edx)
1471 movaps %xmm3, 16(%edx)
1472 movaps %xmm7, %xmm1
1473 movaps %xmm2, (%edx)
1474 lea 64(%edx), %edx
1475 sub $64, %ecx
1476 ja L(Shl11LoopStart)
1477
1478 L(Shl11LoopLeave):
1479 add $32, %ecx
1480 jle L(shl_end_0)
1481
1482 movaps 5(%eax), %xmm2
1483 movaps 21(%eax), %xmm3
1484 palignr $11, %xmm2, %xmm3
1485 palignr $11, %xmm1, %xmm2
1486
1487 movaps %xmm2, (%edx)
1488 movaps %xmm3, 16(%edx)
1489 lea 32(%edx, %ecx), %edx
1490 lea 32(%eax, %ecx), %eax
1491 POP (%edi)
1492 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1493
1494 CFI_PUSH (%edi)
1495
1496 .p2align 4
1497 L(sh_11_no_prefetch):
1498 lea -32(%ecx), %ecx
1499 lea -11(%eax), %eax
1500 xor %edi, %edi
1501
1502 .p2align 4
1503 L(sh_11_no_prefetch_loop):
1504 movdqa 16(%eax, %edi), %xmm2
1505 sub $32, %ecx
1506 movdqa 32(%eax, %edi), %xmm3
1507 movdqa %xmm3, %xmm4
1508 palignr $11, %xmm2, %xmm3
1509 palignr $11, %xmm1, %xmm2
1510 lea 32(%edi), %edi
1511 movdqa %xmm2, -32(%edx, %edi)
1512 movdqa %xmm3, -16(%edx, %edi)
1513 jb L(sh_11_end_no_prefetch_loop)
1514
1515 movdqa 16(%eax, %edi), %xmm2
1516 sub $32, %ecx
1517 movdqa 32(%eax, %edi), %xmm3
1518 movdqa %xmm3, %xmm1
1519 palignr $11, %xmm2, %xmm3
1520 palignr $11, %xmm4, %xmm2
1521 lea 32(%edi), %edi
1522 movdqa %xmm2, -32(%edx, %edi)
1523 movdqa %xmm3, -16(%edx, %edi)
1524 jae L(sh_11_no_prefetch_loop)
1525
1526 L(sh_11_end_no_prefetch_loop):
1527 lea 32(%ecx), %ecx
1528 add %ecx, %edi
1529 add %edi, %edx
1530 lea 11(%edi, %eax), %eax
1531 POP (%edi)
1532 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1533
1534 CFI_PUSH (%edi)
1535
1536 .p2align 4
1537 L(shl_12):
1538 # ifndef USE_AS_MEMMOVE
1539 movaps -12(%eax), %xmm1
1540 # else
1541 movl DEST+4(%esp), %edi
1542 movaps -12(%eax), %xmm1
1543 movdqu %xmm0, (%edi)
1544 # endif
1545 # ifdef DATA_CACHE_SIZE_HALF
1546 cmp $DATA_CACHE_SIZE_HALF, %ecx
1547 # else
1548 # ifdef SHARED
1549 SETUP_PIC_REG(bx)
1550 add $_GLOBAL_OFFSET_TABLE_, %ebx
1551 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1552 # else
1553 cmp __x86_data_cache_size_half, %ecx
1554 # endif
1555 # endif
1556 jb L(sh_12_no_prefetch)
1557
1558 lea -64(%ecx), %ecx
1559
1560 .p2align 4
1561 L(Shl12LoopStart):
1562 prefetcht0 0x1c0(%eax)
1563 prefetcht0 0x1c0(%edx)
1564 movaps 4(%eax), %xmm2
1565 movaps 20(%eax), %xmm3
1566 movaps 36(%eax), %xmm4
1567 movaps 52(%eax), %xmm5
1568 movaps %xmm5, %xmm7
1569 palignr $12, %xmm4, %xmm5
1570 palignr $12, %xmm3, %xmm4
1571 movaps %xmm5, 48(%edx)
1572 palignr $12, %xmm2, %xmm3
1573 lea 64(%eax), %eax
1574 palignr $12, %xmm1, %xmm2
1575 movaps %xmm4, 32(%edx)
1576 movaps %xmm3, 16(%edx)
1577 movaps %xmm7, %xmm1
1578 movaps %xmm2, (%edx)
1579 lea 64(%edx), %edx
1580 sub $64, %ecx
1581 ja L(Shl12LoopStart)
1582
1583 L(Shl12LoopLeave):
1584 add $32, %ecx
1585 jle L(shl_end_0)
1586
1587 movaps 4(%eax), %xmm2
1588 movaps 20(%eax), %xmm3
1589 palignr $12, %xmm2, %xmm3
1590 palignr $12, %xmm1, %xmm2
1591
1592 movaps %xmm2, (%edx)
1593 movaps %xmm3, 16(%edx)
1594 lea 32(%edx, %ecx), %edx
1595 lea 32(%eax, %ecx), %eax
1596 POP (%edi)
1597 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1598
1599 CFI_PUSH (%edi)
1600
1601 .p2align 4
1602 L(sh_12_no_prefetch):
1603 lea -32(%ecx), %ecx
1604 lea -12(%eax), %eax
1605 xor %edi, %edi
1606
1607 .p2align 4
1608 L(sh_12_no_prefetch_loop):
1609 movdqa 16(%eax, %edi), %xmm2
1610 sub $32, %ecx
1611 movdqa 32(%eax, %edi), %xmm3
1612 movdqa %xmm3, %xmm4
1613 palignr $12, %xmm2, %xmm3
1614 palignr $12, %xmm1, %xmm2
1615 lea 32(%edi), %edi
1616 movdqa %xmm2, -32(%edx, %edi)
1617 movdqa %xmm3, -16(%edx, %edi)
1618 jb L(sh_12_end_no_prefetch_loop)
1619
1620 movdqa 16(%eax, %edi), %xmm2
1621 sub $32, %ecx
1622 movdqa 32(%eax, %edi), %xmm3
1623 movdqa %xmm3, %xmm1
1624 palignr $12, %xmm2, %xmm3
1625 palignr $12, %xmm4, %xmm2
1626 lea 32(%edi), %edi
1627 movdqa %xmm2, -32(%edx, %edi)
1628 movdqa %xmm3, -16(%edx, %edi)
1629 jae L(sh_12_no_prefetch_loop)
1630
1631 L(sh_12_end_no_prefetch_loop):
1632 lea 32(%ecx), %ecx
1633 add %ecx, %edi
1634 add %edi, %edx
1635 lea 12(%edi, %eax), %eax
1636 POP (%edi)
1637 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1638
1639 CFI_PUSH (%edi)
1640
1641 .p2align 4
1642 L(shl_13):
1643 # ifndef USE_AS_MEMMOVE
1644 movaps -13(%eax), %xmm1
1645 # else
1646 movl DEST+4(%esp), %edi
1647 movaps -13(%eax), %xmm1
1648 movdqu %xmm0, (%edi)
1649 # endif
1650 # ifdef DATA_CACHE_SIZE_HALF
1651 cmp $DATA_CACHE_SIZE_HALF, %ecx
1652 # else
1653 # ifdef SHARED
1654 SETUP_PIC_REG(bx)
1655 add $_GLOBAL_OFFSET_TABLE_, %ebx
1656 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1657 # else
1658 cmp __x86_data_cache_size_half, %ecx
1659 # endif
1660 # endif
1661 jb L(sh_13_no_prefetch)
1662
1663 lea -64(%ecx), %ecx
1664
1665 .p2align 4
1666 L(Shl13LoopStart):
1667 prefetcht0 0x1c0(%eax)
1668 prefetcht0 0x1c0(%edx)
1669 movaps 3(%eax), %xmm2
1670 movaps 19(%eax), %xmm3
1671 movaps 35(%eax), %xmm4
1672 movaps 51(%eax), %xmm5
1673 movaps %xmm5, %xmm7
1674 palignr $13, %xmm4, %xmm5
1675 palignr $13, %xmm3, %xmm4
1676 movaps %xmm5, 48(%edx)
1677 palignr $13, %xmm2, %xmm3
1678 lea 64(%eax), %eax
1679 palignr $13, %xmm1, %xmm2
1680 movaps %xmm4, 32(%edx)
1681 movaps %xmm3, 16(%edx)
1682 movaps %xmm7, %xmm1
1683 movaps %xmm2, (%edx)
1684 lea 64(%edx), %edx
1685 sub $64, %ecx
1686 ja L(Shl13LoopStart)
1687
1688 L(Shl13LoopLeave):
1689 add $32, %ecx
1690 jle L(shl_end_0)
1691
1692 movaps 3(%eax), %xmm2
1693 movaps 19(%eax), %xmm3
1694 palignr $13, %xmm2, %xmm3
1695 palignr $13, %xmm1, %xmm2
1696
1697 movaps %xmm2, (%edx)
1698 movaps %xmm3, 16(%edx)
1699 lea 32(%edx, %ecx), %edx
1700 lea 32(%eax, %ecx), %eax
1701 POP (%edi)
1702 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1703
1704 CFI_PUSH (%edi)
1705
1706 .p2align 4
1707 L(sh_13_no_prefetch):
1708 lea -32(%ecx), %ecx
1709 lea -13(%eax), %eax
1710 xor %edi, %edi
1711
1712 .p2align 4
1713 L(sh_13_no_prefetch_loop):
1714 movdqa 16(%eax, %edi), %xmm2
1715 sub $32, %ecx
1716 movdqa 32(%eax, %edi), %xmm3
1717 movdqa %xmm3, %xmm4
1718 palignr $13, %xmm2, %xmm3
1719 palignr $13, %xmm1, %xmm2
1720 lea 32(%edi), %edi
1721 movdqa %xmm2, -32(%edx, %edi)
1722 movdqa %xmm3, -16(%edx, %edi)
1723 jb L(sh_13_end_no_prefetch_loop)
1724
1725 movdqa 16(%eax, %edi), %xmm2
1726 sub $32, %ecx
1727 movdqa 32(%eax, %edi), %xmm3
1728 movdqa %xmm3, %xmm1
1729 palignr $13, %xmm2, %xmm3
1730 palignr $13, %xmm4, %xmm2
1731 lea 32(%edi), %edi
1732 movdqa %xmm2, -32(%edx, %edi)
1733 movdqa %xmm3, -16(%edx, %edi)
1734 jae L(sh_13_no_prefetch_loop)
1735
1736 L(sh_13_end_no_prefetch_loop):
1737 lea 32(%ecx), %ecx
1738 add %ecx, %edi
1739 add %edi, %edx
1740 lea 13(%edi, %eax), %eax
1741 POP (%edi)
1742 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1743
1744 CFI_PUSH (%edi)
1745
1746 .p2align 4
1747 L(shl_14):
1748 # ifndef USE_AS_MEMMOVE
1749 movaps -14(%eax), %xmm1
1750 # else
1751 movl DEST+4(%esp), %edi
1752 movaps -14(%eax), %xmm1
1753 movdqu %xmm0, (%edi)
1754 # endif
1755 # ifdef DATA_CACHE_SIZE_HALF
1756 cmp $DATA_CACHE_SIZE_HALF, %ecx
1757 # else
1758 # ifdef SHARED
1759 SETUP_PIC_REG(bx)
1760 add $_GLOBAL_OFFSET_TABLE_, %ebx
1761 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1762 # else
1763 cmp __x86_data_cache_size_half, %ecx
1764 # endif
1765 # endif
1766 jb L(sh_14_no_prefetch)
1767
1768 lea -64(%ecx), %ecx
1769
1770 .p2align 4
1771 L(Shl14LoopStart):
1772 prefetcht0 0x1c0(%eax)
1773 prefetcht0 0x1c0(%edx)
1774 movaps 2(%eax), %xmm2
1775 movaps 18(%eax), %xmm3
1776 movaps 34(%eax), %xmm4
1777 movaps 50(%eax), %xmm5
1778 movaps %xmm5, %xmm7
1779 palignr $14, %xmm4, %xmm5
1780 palignr $14, %xmm3, %xmm4
1781 movaps %xmm5, 48(%edx)
1782 palignr $14, %xmm2, %xmm3
1783 lea 64(%eax), %eax
1784 palignr $14, %xmm1, %xmm2
1785 movaps %xmm4, 32(%edx)
1786 movaps %xmm3, 16(%edx)
1787 movaps %xmm7, %xmm1
1788 movaps %xmm2, (%edx)
1789 lea 64(%edx), %edx
1790 sub $64, %ecx
1791 ja L(Shl14LoopStart)
1792
1793 L(Shl14LoopLeave):
1794 add $32, %ecx
1795 jle L(shl_end_0)
1796
1797 movaps 2(%eax), %xmm2
1798 movaps 18(%eax), %xmm3
1799 palignr $14, %xmm2, %xmm3
1800 palignr $14, %xmm1, %xmm2
1801
1802 movaps %xmm2, (%edx)
1803 movaps %xmm3, 16(%edx)
1804 lea 32(%edx, %ecx), %edx
1805 lea 32(%eax, %ecx), %eax
1806 POP (%edi)
1807 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1808
1809 CFI_PUSH (%edi)
1810
1811 .p2align 4
1812 L(sh_14_no_prefetch):
1813 lea -32(%ecx), %ecx
1814 lea -14(%eax), %eax
1815 xor %edi, %edi
1816
1817 .p2align 4
1818 L(sh_14_no_prefetch_loop):
1819 movdqa 16(%eax, %edi), %xmm2
1820 sub $32, %ecx
1821 movdqa 32(%eax, %edi), %xmm3
1822 movdqa %xmm3, %xmm4
1823 palignr $14, %xmm2, %xmm3
1824 palignr $14, %xmm1, %xmm2
1825 lea 32(%edi), %edi
1826 movdqa %xmm2, -32(%edx, %edi)
1827 movdqa %xmm3, -16(%edx, %edi)
1828 jb L(sh_14_end_no_prefetch_loop)
1829
1830 movdqa 16(%eax, %edi), %xmm2
1831 sub $32, %ecx
1832 movdqa 32(%eax, %edi), %xmm3
1833 movdqa %xmm3, %xmm1
1834 palignr $14, %xmm2, %xmm3
1835 palignr $14, %xmm4, %xmm2
1836 lea 32(%edi), %edi
1837 movdqa %xmm2, -32(%edx, %edi)
1838 movdqa %xmm3, -16(%edx, %edi)
1839 jae L(sh_14_no_prefetch_loop)
1840
1841 L(sh_14_end_no_prefetch_loop):
1842 lea 32(%ecx), %ecx
1843 add %ecx, %edi
1844 add %edi, %edx
1845 lea 14(%edi, %eax), %eax
1846 POP (%edi)
1847 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1848
1849 CFI_PUSH (%edi)
1850
1851 .p2align 4
1852 L(shl_15):
1853 # ifndef USE_AS_MEMMOVE
1854 movaps -15(%eax), %xmm1
1855 # else
1856 movl DEST+4(%esp), %edi
1857 movaps -15(%eax), %xmm1
1858 movdqu %xmm0, (%edi)
1859 # endif
1860 # ifdef DATA_CACHE_SIZE_HALF
1861 cmp $DATA_CACHE_SIZE_HALF, %ecx
1862 # else
1863 # ifdef SHARED
1864 SETUP_PIC_REG(bx)
1865 add $_GLOBAL_OFFSET_TABLE_, %ebx
1866 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1867 # else
1868 cmp __x86_data_cache_size_half, %ecx
1869 # endif
1870 # endif
1871 jb L(sh_15_no_prefetch)
1872
1873 lea -64(%ecx), %ecx
1874
1875 .p2align 4
1876 L(Shl15LoopStart):
1877 prefetcht0 0x1c0(%eax)
1878 prefetcht0 0x1c0(%edx)
1879 movaps 1(%eax), %xmm2
1880 movaps 17(%eax), %xmm3
1881 movaps 33(%eax), %xmm4
1882 movaps 49(%eax), %xmm5
1883 movaps %xmm5, %xmm7
1884 palignr $15, %xmm4, %xmm5
1885 palignr $15, %xmm3, %xmm4
1886 movaps %xmm5, 48(%edx)
1887 palignr $15, %xmm2, %xmm3
1888 lea 64(%eax), %eax
1889 palignr $15, %xmm1, %xmm2
1890 movaps %xmm4, 32(%edx)
1891 movaps %xmm3, 16(%edx)
1892 movaps %xmm7, %xmm1
1893 movaps %xmm2, (%edx)
1894 lea 64(%edx), %edx
1895 sub $64, %ecx
1896 ja L(Shl15LoopStart)
1897
1898 L(Shl15LoopLeave):
1899 add $32, %ecx
1900 jle L(shl_end_0)
1901
1902 movaps 1(%eax), %xmm2
1903 movaps 17(%eax), %xmm3
1904 palignr $15, %xmm2, %xmm3
1905 palignr $15, %xmm1, %xmm2
1906
1907 movaps %xmm2, (%edx)
1908 movaps %xmm3, 16(%edx)
1909 lea 32(%edx, %ecx), %edx
1910 lea 32(%eax, %ecx), %eax
1911 POP (%edi)
1912 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1913
1914 CFI_PUSH (%edi)
1915
1916 .p2align 4
1917 L(sh_15_no_prefetch):
1918 lea -32(%ecx), %ecx
1919 lea -15(%eax), %eax
1920 xor %edi, %edi
1921
1922 .p2align 4
1923 L(sh_15_no_prefetch_loop):
1924 movdqa 16(%eax, %edi), %xmm2
1925 sub $32, %ecx
1926 movdqa 32(%eax, %edi), %xmm3
1927 movdqa %xmm3, %xmm4
1928 palignr $15, %xmm2, %xmm3
1929 palignr $15, %xmm1, %xmm2
1930 lea 32(%edi), %edi
1931 movdqa %xmm2, -32(%edx, %edi)
1932 movdqa %xmm3, -16(%edx, %edi)
1933 jb L(sh_15_end_no_prefetch_loop)
1934
1935 movdqa 16(%eax, %edi), %xmm2
1936 sub $32, %ecx
1937 movdqa 32(%eax, %edi), %xmm3
1938 movdqa %xmm3, %xmm1
1939 palignr $15, %xmm2, %xmm3
1940 palignr $15, %xmm4, %xmm2
1941 lea 32(%edi), %edi
1942 movdqa %xmm2, -32(%edx, %edi)
1943 movdqa %xmm3, -16(%edx, %edi)
1944 jae L(sh_15_no_prefetch_loop)
1945
1946 L(sh_15_end_no_prefetch_loop):
1947 lea 32(%ecx), %ecx
1948 add %ecx, %edi
1949 add %edi, %edx
1950 lea 15(%edi, %eax), %eax
1951 POP (%edi)
1952 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1953
1954 CFI_PUSH (%edi)
1955
1956 .p2align 4
1957 L(shl_end_0):
1958 lea 32(%ecx), %ecx
1959 lea (%edx, %ecx), %edx
1960 lea (%eax, %ecx), %eax
1961 POP (%edi)
1962 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1963
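/* Tail copies, reached through L(table_48bytes_fwd).  On entry %eax and
   %edx point just past the end of the source and destination and %ecx
   holds the remaining length (up to 47), so each L(fwd_write_N) entry
   copies the final N bytes using negative offsets.  Unless built as bcopy,
   the return value is the original destination, or the end of the
   destination for mempcpy.  */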
1964 .p2align 4
1965 L(fwd_write_44bytes):
1966 movq -44(%eax), %xmm0
1967 movq %xmm0, -44(%edx)
1968 L(fwd_write_36bytes):
1969 movq -36(%eax), %xmm0
1970 movq %xmm0, -36(%edx)
1971 L(fwd_write_28bytes):
1972 movq -28(%eax), %xmm0
1973 movq %xmm0, -28(%edx)
1974 L(fwd_write_20bytes):
1975 movq -20(%eax), %xmm0
1976 movq %xmm0, -20(%edx)
1977 L(fwd_write_12bytes):
1978 movq -12(%eax), %xmm0
1979 movq %xmm0, -12(%edx)
1980 L(fwd_write_4bytes):
1981 movl -4(%eax), %ecx
1982 movl %ecx, -4(%edx)
1983 # ifndef USE_AS_BCOPY
1984 # ifdef USE_AS_MEMPCPY
1985 movl %edx, %eax
1986 # else
1987 movl DEST(%esp), %eax
1988 # endif
1989 # endif
1990 RETURN
1991
1992 .p2align 4
1993 L(fwd_write_40bytes):
1994 movq -40(%eax), %xmm0
1995 movq %xmm0, -40(%edx)
1996 L(fwd_write_32bytes):
1997 movq -32(%eax), %xmm0
1998 movq %xmm0, -32(%edx)
1999 L(fwd_write_24bytes):
2000 movq -24(%eax), %xmm0
2001 movq %xmm0, -24(%edx)
2002 L(fwd_write_16bytes):
2003 movq -16(%eax), %xmm0
2004 movq %xmm0, -16(%edx)
2005 L(fwd_write_8bytes):
2006 movq -8(%eax), %xmm0
2007 movq %xmm0, -8(%edx)
2008 L(fwd_write_0bytes):
2009 # ifndef USE_AS_BCOPY
2010 # ifdef USE_AS_MEMPCPY
2011 movl %edx, %eax
2012 # else
2013 movl DEST(%esp), %eax
2014 # endif
2015 # endif
2016 RETURN
2017
2018 .p2align 4
2019 L(fwd_write_5bytes):
2020 movl -5(%eax), %ecx
2021 movl -4(%eax), %eax
2022 movl %ecx, -5(%edx)
2023 movl %eax, -4(%edx)
2024 # ifndef USE_AS_BCOPY
2025 # ifdef USE_AS_MEMPCPY
2026 movl %edx, %eax
2027 # else
2028 movl DEST(%esp), %eax
2029 # endif
2030 # endif
2031 RETURN
2032
2033 .p2align 4
2034 L(fwd_write_45bytes):
2035 movq -45(%eax), %xmm0
2036 movq %xmm0, -45(%edx)
2037 L(fwd_write_37bytes):
2038 movq -37(%eax), %xmm0
2039 movq %xmm0, -37(%edx)
2040 L(fwd_write_29bytes):
2041 movq -29(%eax), %xmm0
2042 movq %xmm0, -29(%edx)
2043 L(fwd_write_21bytes):
2044 movq -21(%eax), %xmm0
2045 movq %xmm0, -21(%edx)
2046 L(fwd_write_13bytes):
2047 movq -13(%eax), %xmm0
2048 movq %xmm0, -13(%edx)
2049 movl -5(%eax), %ecx
2050 movl %ecx, -5(%edx)
2051 movzbl -1(%eax), %ecx
2052 movb %cl, -1(%edx)
2053 # ifndef USE_AS_BCOPY
2054 # ifdef USE_AS_MEMPCPY
2055 movl %edx, %eax
2056 # else
2057 movl DEST(%esp), %eax
2058 # endif
2059 # endif
2060 RETURN
2061
2062 .p2align 4
2063 L(fwd_write_41bytes):
2064 movq -41(%eax), %xmm0
2065 movq %xmm0, -41(%edx)
2066 L(fwd_write_33bytes):
2067 movq -33(%eax), %xmm0
2068 movq %xmm0, -33(%edx)
2069 L(fwd_write_25bytes):
2070 movq -25(%eax), %xmm0
2071 movq %xmm0, -25(%edx)
2072 L(fwd_write_17bytes):
2073 movq -17(%eax), %xmm0
2074 movq %xmm0, -17(%edx)
2075 L(fwd_write_9bytes):
2076 movq -9(%eax), %xmm0
2077 movq %xmm0, -9(%edx)
2078 L(fwd_write_1bytes):
2079 movzbl -1(%eax), %ecx
2080 movb %cl, -1(%edx)
2081 # ifndef USE_AS_BCOPY
2082 # ifdef USE_AS_MEMPCPY
2083 movl %edx, %eax
2084 # else
2085 movl DEST(%esp), %eax
2086 # endif
2087 # endif
2088 RETURN
2089
2090 .p2align 4
2091 L(fwd_write_46bytes):
2092 movq -46(%eax), %xmm0
2093 movq %xmm0, -46(%edx)
2094 L(fwd_write_38bytes):
2095 movq -38(%eax), %xmm0
2096 movq %xmm0, -38(%edx)
2097 L(fwd_write_30bytes):
2098 movq -30(%eax), %xmm0
2099 movq %xmm0, -30(%edx)
2100 L(fwd_write_22bytes):
2101 movq -22(%eax), %xmm0
2102 movq %xmm0, -22(%edx)
2103 L(fwd_write_14bytes):
2104 movq -14(%eax), %xmm0
2105 movq %xmm0, -14(%edx)
2106 L(fwd_write_6bytes):
2107 movl -6(%eax), %ecx
2108 movl %ecx, -6(%edx)
2109 movzwl -2(%eax), %ecx
2110 movw %cx, -2(%edx)
2111 # ifndef USE_AS_BCOPY
2112 # ifdef USE_AS_MEMPCPY
2113 movl %edx, %eax
2114 # else
2115 movl DEST(%esp), %eax
2116 # endif
2117 # endif
2118 RETURN
2119
2120 .p2align 4
2121 L(fwd_write_42bytes):
2122 movq -42(%eax), %xmm0
2123 movq %xmm0, -42(%edx)
2124 L(fwd_write_34bytes):
2125 movq -34(%eax), %xmm0
2126 movq %xmm0, -34(%edx)
2127 L(fwd_write_26bytes):
2128 movq -26(%eax), %xmm0
2129 movq %xmm0, -26(%edx)
2130 L(fwd_write_18bytes):
2131 movq -18(%eax), %xmm0
2132 movq %xmm0, -18(%edx)
2133 L(fwd_write_10bytes):
2134 movq -10(%eax), %xmm0
2135 movq %xmm0, -10(%edx)
2136 L(fwd_write_2bytes):
2137 movzwl -2(%eax), %ecx
2138 movw %cx, -2(%edx)
2139 # ifndef USE_AS_BCOPY
2140 # ifdef USE_AS_MEMPCPY
2141 movl %edx, %eax
2142 # else
2143 movl DEST(%esp), %eax
2144 # endif
2145 # endif
2146 RETURN
2147
2148 .p2align 4
2149 L(fwd_write_47bytes):
2150 movq -47(%eax), %xmm0
2151 movq %xmm0, -47(%edx)
2152 L(fwd_write_39bytes):
2153 movq -39(%eax), %xmm0
2154 movq %xmm0, -39(%edx)
2155 L(fwd_write_31bytes):
2156 movq -31(%eax), %xmm0
2157 movq %xmm0, -31(%edx)
2158 L(fwd_write_23bytes):
2159 movq -23(%eax), %xmm0
2160 movq %xmm0, -23(%edx)
2161 L(fwd_write_15bytes):
2162 movq -15(%eax), %xmm0
2163 movq %xmm0, -15(%edx)
2164 L(fwd_write_7bytes):
2165 movl -7(%eax), %ecx
2166 movl %ecx, -7(%edx)
2167 movzwl -3(%eax), %ecx
2168 movzbl -1(%eax), %eax
2169 movw %cx, -3(%edx)
2170 movb %al, -1(%edx)
2171 # ifndef USE_AS_BCOPY
2172 # ifdef USE_AS_MEMPCPY
2173 movl %edx, %eax
2174 # else
2175 movl DEST(%esp), %eax
2176 # endif
2177 # endif
2178 RETURN
2179
2180 .p2align 4
2181 L(fwd_write_43bytes):
2182 movq -43(%eax), %xmm0
2183 movq %xmm0, -43(%edx)
2184 L(fwd_write_35bytes):
2185 movq -35(%eax), %xmm0
2186 movq %xmm0, -35(%edx)
2187 L(fwd_write_27bytes):
2188 movq -27(%eax), %xmm0
2189 movq %xmm0, -27(%edx)
2190 L(fwd_write_19bytes):
2191 movq -19(%eax), %xmm0
2192 movq %xmm0, -19(%edx)
2193 L(fwd_write_11bytes):
2194 movq -11(%eax), %xmm0
2195 movq %xmm0, -11(%edx)
2196 L(fwd_write_3bytes):
2197 movzwl -3(%eax), %ecx
2198 movzbl -1(%eax), %eax
2199 movw %cx, -3(%edx)
2200 movb %al, -1(%edx)
2201 # ifndef USE_AS_BCOPY
2202 # ifdef USE_AS_MEMPCPY
2203 movl %edx, %eax
2204 # else
2205 movl DEST(%esp), %eax
2206 # endif
2207 # endif
2208 RETURN
2209
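/* Aligned tail copies, reached through L(table_48bytes_fwd_align) from the
   shl_0 paths: here the remaining bytes start at a 16-byte-aligned address
   in both source and destination, so whole 16-byte pieces can be moved
   with movdqa.  */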
2210 .p2align 4
2211 L(fwd_write_40bytes_align):
2212 movdqa -40(%eax), %xmm0
2213 movdqa %xmm0, -40(%edx)
2214 L(fwd_write_24bytes_align):
2215 movdqa -24(%eax), %xmm0
2216 movdqa %xmm0, -24(%edx)
2217 L(fwd_write_8bytes_align):
2218 movq -8(%eax), %xmm0
2219 movq %xmm0, -8(%edx)
2220 L(fwd_write_0bytes_align):
2221 # ifndef USE_AS_BCOPY
2222 # ifdef USE_AS_MEMPCPY
2223 movl %edx, %eax
2224 # else
2225 movl DEST(%esp), %eax
2226 # endif
2227 # endif
2228 RETURN
2229
2230 .p2align 4
2231 L(fwd_write_32bytes_align):
2232 movdqa -32(%eax), %xmm0
2233 movdqa %xmm0, -32(%edx)
2234 L(fwd_write_16bytes_align):
2235 movdqa -16(%eax), %xmm0
2236 movdqa %xmm0, -16(%edx)
2237 # ifndef USE_AS_BCOPY
2238 # ifdef USE_AS_MEMPCPY
2239 movl %edx, %eax
2240 # else
2241 movl DEST(%esp), %eax
2242 # endif
2243 # endif
2244 RETURN
2245
2246 .p2align 4
2247 L(fwd_write_5bytes_align):
2248 movl -5(%eax), %ecx
2249 movl -4(%eax), %eax
2250 movl %ecx, -5(%edx)
2251 movl %eax, -4(%edx)
2252 # ifndef USE_AS_BCOPY
2253 # ifdef USE_AS_MEMPCPY
2254 movl %edx, %eax
2255 # else
2256 movl DEST(%esp), %eax
2257 # endif
2258 # endif
2259 RETURN
2260
2261 .p2align 4
2262 L(fwd_write_45bytes_align):
2263 movdqa -45(%eax), %xmm0
2264 movdqa %xmm0, -45(%edx)
2265 L(fwd_write_29bytes_align):
2266 movdqa -29(%eax), %xmm0
2267 movdqa %xmm0, -29(%edx)
2268 L(fwd_write_13bytes_align):
2269 movq -13(%eax), %xmm0
2270 movq %xmm0, -13(%edx)
2271 movl -5(%eax), %ecx
2272 movl %ecx, -5(%edx)
2273 movzbl -1(%eax), %ecx
2274 movb %cl, -1(%edx)
2275 # ifndef USE_AS_BCOPY
2276 # ifdef USE_AS_MEMPCPY
2277 movl %edx, %eax
2278 # else
2279 movl DEST(%esp), %eax
2280 # endif
2281 # endif
2282 RETURN
2283
2284 .p2align 4
2285 L(fwd_write_37bytes_align):
2286 movdqa -37(%eax), %xmm0
2287 movdqa %xmm0, -37(%edx)
2288 L(fwd_write_21bytes_align):
2289 movdqa -21(%eax), %xmm0
2290 movdqa %xmm0, -21(%edx)
2291 movl -5(%eax), %ecx
2292 movl %ecx, -5(%edx)
2293 movzbl -1(%eax), %ecx
2294 movb %cl, -1(%edx)
2295 # ifndef USE_AS_BCOPY
2296 # ifdef USE_AS_MEMPCPY
2297 movl %edx, %eax
2298 # else
2299 movl DEST(%esp), %eax
2300 # endif
2301 # endif
2302 RETURN
2303
2304 .p2align 4
2305 L(fwd_write_41bytes_align):
2306 movdqa -41(%eax), %xmm0
2307 movdqa %xmm0, -41(%edx)
2308 L(fwd_write_25bytes_align):
2309 movdqa -25(%eax), %xmm0
2310 movdqa %xmm0, -25(%edx)
2311 L(fwd_write_9bytes_align):
2312 movq -9(%eax), %xmm0
2313 movq %xmm0, -9(%edx)
2314 L(fwd_write_1bytes_align):
2315 movzbl -1(%eax), %ecx
2316 movb %cl, -1(%edx)
2317 # ifndef USE_AS_BCOPY
2318 # ifdef USE_AS_MEMPCPY
2319 movl %edx, %eax
2320 # else
2321 movl DEST(%esp), %eax
2322 # endif
2323 # endif
2324 RETURN
2325
2326 .p2align 4
2327 L(fwd_write_33bytes_align):
2328 movdqa -33(%eax), %xmm0
2329 movdqa %xmm0, -33(%edx)
2330 L(fwd_write_17bytes_align):
2331 movdqa -17(%eax), %xmm0
2332 movdqa %xmm0, -17(%edx)
2333 movzbl -1(%eax), %ecx
2334 movb %cl, -1(%edx)
2335 # ifndef USE_AS_BCOPY
2336 # ifdef USE_AS_MEMPCPY
2337 movl %edx, %eax
2338 # else
2339 movl DEST(%esp), %eax
2340 # endif
2341 # endif
2342 RETURN
2343
2344 .p2align 4
2345 L(fwd_write_46bytes_align):
2346 movdqa -46(%eax), %xmm0
2347 movdqa %xmm0, -46(%edx)
2348 L(fwd_write_30bytes_align):
2349 movdqa -30(%eax), %xmm0
2350 movdqa %xmm0, -30(%edx)
2351 L(fwd_write_14bytes_align):
2352 movq -14(%eax), %xmm0
2353 movq %xmm0, -14(%edx)
2354 L(fwd_write_6bytes_align):
2355 movl -6(%eax), %ecx
2356 movl %ecx, -6(%edx)
2357 movzwl -2(%eax), %ecx
2358 movw %cx, -2(%edx)
2359 # ifndef USE_AS_BCOPY
2360 # ifdef USE_AS_MEMPCPY
2361 movl %edx, %eax
2362 # else
2363 movl DEST(%esp), %eax
2364 # endif
2365 # endif
2366 RETURN
2367
2368 .p2align 4
2369 L(fwd_write_38bytes_align):
2370 movdqa -38(%eax), %xmm0
2371 movdqa %xmm0, -38(%edx)
2372 L(fwd_write_22bytes_align):
2373 movdqa -22(%eax), %xmm0
2374 movdqa %xmm0, -22(%edx)
2375 movl -6(%eax), %ecx
2376 movl %ecx, -6(%edx)
2377 movzwl -2(%eax), %ecx
2378 movw %cx, -2(%edx)
2379 # ifndef USE_AS_BCOPY
2380 # ifdef USE_AS_MEMPCPY
2381 movl %edx, %eax
2382 # else
2383 movl DEST(%esp), %eax
2384 # endif
2385 # endif
2386 RETURN
2387
2388 .p2align 4
2389 L(fwd_write_42bytes_align):
2390 movdqa -42(%eax), %xmm0
2391 movdqa %xmm0, -42(%edx)
2392 L(fwd_write_26bytes_align):
2393 movdqa -26(%eax), %xmm0
2394 movdqa %xmm0, -26(%edx)
2395 L(fwd_write_10bytes_align):
2396 movq -10(%eax), %xmm0
2397 movq %xmm0, -10(%edx)
2398 L(fwd_write_2bytes_align):
2399 movzwl -2(%eax), %ecx
2400 movw %cx, -2(%edx)
2401 # ifndef USE_AS_BCOPY
2402 # ifdef USE_AS_MEMPCPY
2403 movl %edx, %eax
2404 # else
2405 movl DEST(%esp), %eax
2406 # endif
2407 # endif
2408 RETURN
2409
2410 .p2align 4
2411 L(fwd_write_34bytes_align):
2412 movdqa -34(%eax), %xmm0
2413 movdqa %xmm0, -34(%edx)
2414 L(fwd_write_18bytes_align):
2415 movdqa -18(%eax), %xmm0
2416 movdqa %xmm0, -18(%edx)
2417 movzwl -2(%eax), %ecx
2418 movw %cx, -2(%edx)
2419 # ifndef USE_AS_BCOPY
2420 # ifdef USE_AS_MEMPCPY
2421 movl %edx, %eax
2422 # else
2423 movl DEST(%esp), %eax
2424 # endif
2425 # endif
2426 RETURN
2427
2428 .p2align 4
2429 L(fwd_write_47bytes_align):
2430 movdqa -47(%eax), %xmm0
2431 movdqa %xmm0, -47(%edx)
2432 L(fwd_write_31bytes_align):
2433 movdqa -31(%eax), %xmm0
2434 movdqa %xmm0, -31(%edx)
2435 L(fwd_write_15bytes_align):
2436 movq -15(%eax), %xmm0
2437 movq %xmm0, -15(%edx)
2438 L(fwd_write_7bytes_align):
2439 movl -7(%eax), %ecx
2440 movl %ecx, -7(%edx)
2441 movzwl -3(%eax), %ecx
2442 movzbl -1(%eax), %eax
2443 movw %cx, -3(%edx)
2444 movb %al, -1(%edx)
2445 # ifndef USE_AS_BCOPY
2446 # ifdef USE_AS_MEMPCPY
2447 movl %edx, %eax
2448 # else
2449 movl DEST(%esp), %eax
2450 # endif
2451 # endif
2452 RETURN
2453
2454 .p2align 4
2455 L(fwd_write_39bytes_align):
2456 movdqa -39(%eax), %xmm0
2457 movdqa %xmm0, -39(%edx)
2458 L(fwd_write_23bytes_align):
2459 movdqa -23(%eax), %xmm0
2460 movdqa %xmm0, -23(%edx)
2461 movl -7(%eax), %ecx
2462 movl %ecx, -7(%edx)
2463 movzwl -3(%eax), %ecx
2464 movzbl -1(%eax), %eax
2465 movw %cx, -3(%edx)
2466 movb %al, -1(%edx)
2467 # ifndef USE_AS_BCOPY
2468 # ifdef USE_AS_MEMPCPY
2469 movl %edx, %eax
2470 # else
2471 movl DEST(%esp), %eax
2472 # endif
2473 # endif
2474 RETURN
2475
2476 .p2align 4
2477 L(fwd_write_43bytes_align):
2478 movdqa -43(%eax), %xmm0
2479 movdqa %xmm0, -43(%edx)
2480 L(fwd_write_27bytes_align):
2481 movdqa -27(%eax), %xmm0
2482 movdqa %xmm0, -27(%edx)
2483 L(fwd_write_11bytes_align):
2484 movq -11(%eax), %xmm0
2485 movq %xmm0, -11(%edx)
2486 L(fwd_write_3bytes_align):
2487 movzwl -3(%eax), %ecx
2488 movzbl -1(%eax), %eax
2489 movw %cx, -3(%edx)
2490 movb %al, -1(%edx)
2491 # ifndef USE_AS_BCOPY
2492 # ifdef USE_AS_MEMPCPY
2493 movl %edx, %eax
2494 # else
2495 movl DEST(%esp), %eax
2496 # endif
2497 # endif
2498 RETURN
2499
2500 .p2align 4
2501 L(fwd_write_35bytes_align):
2502 movdqa -35(%eax), %xmm0
2503 movdqa %xmm0, -35(%edx)
2504 L(fwd_write_19bytes_align):
2505 movdqa -19(%eax), %xmm0
2506 movdqa %xmm0, -19(%edx)
2507 movzwl -3(%eax), %ecx
2508 movzbl -1(%eax), %eax
2509 movw %cx, -3(%edx)
2510 movb %al, -1(%edx)
2511 # ifndef USE_AS_BCOPY
2512 # ifdef USE_AS_MEMPCPY
2513 movl %edx, %eax
2514 # else
2515 movl DEST(%esp), %eax
2516 # endif
2517 # endif
2518 RETURN
2519
2520 .p2align 4
2521 L(fwd_write_44bytes_align):
2522 movdqa -44(%eax), %xmm0
2523 movdqa %xmm0, -44(%edx)
2524 L(fwd_write_28bytes_align):
2525 movdqa -28(%eax), %xmm0
2526 movdqa %xmm0, -28(%edx)
2527 L(fwd_write_12bytes_align):
2528 movq -12(%eax), %xmm0
2529 movq %xmm0, -12(%edx)
2530 L(fwd_write_4bytes_align):
2531 movl -4(%eax), %ecx
2532 movl %ecx, -4(%edx)
2533 # ifndef USE_AS_BCOPY
2534 # ifdef USE_AS_MEMPCPY
2535 movl %edx, %eax
2536 # else
2537 movl DEST(%esp), %eax
2538 # endif
2539 # endif
2540 RETURN
2541
2542 .p2align 4
2543 L(fwd_write_36bytes_align):
2544 movdqa -36(%eax), %xmm0
2545 movdqa %xmm0, -36(%edx)
2546 L(fwd_write_20bytes_align):
2547 movdqa -20(%eax), %xmm0
2548 movdqa %xmm0, -20(%edx)
2549 movl -4(%eax), %ecx
2550 movl %ecx, -4(%edx)
2551 # ifndef USE_AS_BCOPY
2552 # ifdef USE_AS_MEMPCPY
2553 movl %edx, %eax
2554 # else
2555 movl DEST(%esp), %eax
2556 # endif
2557 # endif
2558 RETURN_END
2559
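/* The code below is reached with %edi pushed on the stack; restore
   the matching unwind annotations.  */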
2560 CFI_PUSH (%edi)
2561
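/* Streaming path for very large copies: the destination is written
   with non-temporal stores (movntdq) to avoid polluting the cache.
   The first 16 bytes are streamed here, the loop below moves 128
   bytes per iteration, and the remainder is finished through
   L(table_48bytes_fwd) after an sfence.  For memmove the contents of
   %xmm0 are first written to the original destination (DEST+4
   because %edi is pushed).  */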
2562 .p2align 4
2563 L(large_page):
2564 movdqu (%eax), %xmm1
2565 # ifdef USE_AS_MEMMOVE
2566 movl DEST+4(%esp), %edi
2567 movdqu %xmm0, (%edi)
2568 # endif
2569 lea 16(%eax), %eax
2570 movntdq %xmm1, (%edx)
2571 lea 16(%edx), %edx
2572 lea -0x90(%ecx), %ecx
2573 POP (%edi)
2574
2575 .p2align 4
2576 L(large_page_loop):
2577 movdqu (%eax), %xmm0
2578 movdqu 0x10(%eax), %xmm1
2579 movdqu 0x20(%eax), %xmm2
2580 movdqu 0x30(%eax), %xmm3
2581 movdqu 0x40(%eax), %xmm4
2582 movdqu 0x50(%eax), %xmm5
2583 movdqu 0x60(%eax), %xmm6
2584 movdqu 0x70(%eax), %xmm7
2585 lea 0x80(%eax), %eax
2586
2587 sub $0x80, %ecx
2588 movntdq %xmm0, (%edx)
2589 movntdq %xmm1, 0x10(%edx)
2590 movntdq %xmm2, 0x20(%edx)
2591 movntdq %xmm3, 0x30(%edx)
2592 movntdq %xmm4, 0x40(%edx)
2593 movntdq %xmm5, 0x50(%edx)
2594 movntdq %xmm6, 0x60(%edx)
2595 movntdq %xmm7, 0x70(%edx)
2596 lea 0x80(%edx), %edx
2597 jae L(large_page_loop)
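/* ECX now holds the remaining byte count minus 0x80, i.e. a negative
   value.  The cmp checks for at least 64 remaining bytes; the lea
   restores the true count without touching the flags, and the jl
   skips the 64-byte block below.  */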
2598 cmp $-0x40, %ecx
2599 lea 0x80(%ecx), %ecx
2600 jl L(large_page_less_64bytes)
2601
2602 movdqu (%eax), %xmm0
2603 movdqu 0x10(%eax), %xmm1
2604 movdqu 0x20(%eax), %xmm2
2605 movdqu 0x30(%eax), %xmm3
2606 lea 0x40(%eax), %eax
2607
2608 movntdq %xmm0, (%edx)
2609 movntdq %xmm1, 0x10(%edx)
2610 movntdq %xmm2, 0x20(%edx)
2611 movntdq %xmm3, 0x30(%edx)
2612 lea 0x40(%edx), %edx
2613 sub $0x40, %ecx
2614 L(large_page_less_64bytes):
2615 cmp $32, %ecx
2616 jb L(large_page_less_32bytes)
2617 movdqu (%eax), %xmm0
2618 movdqu 0x10(%eax), %xmm1
2619 lea 0x20(%eax), %eax
2620 movntdq %xmm0, (%edx)
2621 movntdq %xmm1, 0x10(%edx)
2622 lea 0x20(%edx), %edx
2623 sub $0x20, %ecx
2624 L(large_page_less_32bytes):
2625 add %ecx, %edx
2626 add %ecx, %eax
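/* Order the non-temporal stores before the ordinary stores of the
   0-31 byte tail, then dispatch on the remainder.  */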
2627 sfence
2628 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2629
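/* Backward tail copies for the last 0-47 bytes, dispatched through
   L(table_48_bytes_bwd).  EAX and EDX point at the start of the
   bytes still to be moved.  */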
2630 .p2align 4
2631 L(bk_write_44bytes):
2632 movq 36(%eax), %xmm0
2633 movq %xmm0, 36(%edx)
2634 L(bk_write_36bytes):
2635 movq 28(%eax), %xmm0
2636 movq %xmm0, 28(%edx)
2637 L(bk_write_28bytes):
2638 movq 20(%eax), %xmm0
2639 movq %xmm0, 20(%edx)
2640 L(bk_write_20bytes):
2641 movq 12(%eax), %xmm0
2642 movq %xmm0, 12(%edx)
2643 L(bk_write_12bytes):
2644 movq 4(%eax), %xmm0
2645 movq %xmm0, 4(%edx)
2646 L(bk_write_4bytes):
2647 movl (%eax), %ecx
2648 movl %ecx, (%edx)
2649 L(bk_write_0bytes):
2650 # ifndef USE_AS_BCOPY
2651 movl DEST(%esp), %eax
2652 # ifdef USE_AS_MEMPCPY
2653 movl LEN(%esp), %ecx
2654 add %ecx, %eax
2655 # endif
2656 # endif
2657 RETURN
2658
2659 .p2align 4
2660 L(bk_write_40bytes):
2661 movq 32(%eax), %xmm0
2662 movq %xmm0, 32(%edx)
2663 L(bk_write_32bytes):
2664 movq 24(%eax), %xmm0
2665 movq %xmm0, 24(%edx)
2666 L(bk_write_24bytes):
2667 movq 16(%eax), %xmm0
2668 movq %xmm0, 16(%edx)
2669 L(bk_write_16bytes):
2670 movq 8(%eax), %xmm0
2671 movq %xmm0, 8(%edx)
2672 L(bk_write_8bytes):
2673 movq (%eax), %xmm0
2674 movq %xmm0, (%edx)
2675 # ifndef USE_AS_BCOPY
2676 movl DEST(%esp), %eax
2677 # ifdef USE_AS_MEMPCPY
2678 movl LEN(%esp), %ecx
2679 add %ecx, %eax
2680 # endif
2681 # endif
2682 RETURN
2683
2684 .p2align 4
2685 L(bk_write_45bytes):
2686 movq 37(%eax), %xmm0
2687 movq %xmm0, 37(%edx)
2688 L(bk_write_37bytes):
2689 movq 29(%eax), %xmm0
2690 movq %xmm0, 29(%edx)
2691 L(bk_write_29bytes):
2692 movq 21(%eax), %xmm0
2693 movq %xmm0, 21(%edx)
2694 L(bk_write_21bytes):
2695 movq 13(%eax), %xmm0
2696 movq %xmm0, 13(%edx)
2697 L(bk_write_13bytes):
2698 movq 5(%eax), %xmm0
2699 movq %xmm0, 5(%edx)
2700 L(bk_write_5bytes):
2701 movl 1(%eax), %ecx
2702 movl %ecx, 1(%edx)
2703 L(bk_write_1bytes):
2704 movzbl (%eax), %ecx
2705 movb %cl, (%edx)
2706 # ifndef USE_AS_BCOPY
2707 movl DEST(%esp), %eax
2708 # ifdef USE_AS_MEMPCPY
2709 movl LEN(%esp), %ecx
2710 add %ecx, %eax
2711 # endif
2712 # endif
2713 RETURN
2714
2715 .p2align 4
2716 L(bk_write_41bytes):
2717 movq 33(%eax), %xmm0
2718 movq %xmm0, 33(%edx)
2719 L(bk_write_33bytes):
2720 movq 25(%eax), %xmm0
2721 movq %xmm0, 25(%edx)
2722 L(bk_write_25bytes):
2723 movq 17(%eax), %xmm0
2724 movq %xmm0, 17(%edx)
2725 L(bk_write_17bytes):
2726 movq 9(%eax), %xmm0
2727 movq %xmm0, 9(%edx)
2728 L(bk_write_9bytes):
2729 movq 1(%eax), %xmm0
2730 movq %xmm0, 1(%edx)
2731 movzbl (%eax), %ecx
2732 movb %cl, (%edx)
2733 # ifndef USE_AS_BCOPY
2734 movl DEST(%esp), %eax
2735 # ifdef USE_AS_MEMPCPY
2736 movl LEN(%esp), %ecx
2737 add %ecx, %eax
2738 # endif
2739 # endif
2740 RETURN
2741
2742 .p2align 4
2743 L(bk_write_46bytes):
2744 movq 38(%eax), %xmm0
2745 movq %xmm0, 38(%edx)
2746 L(bk_write_38bytes):
2747 movq 30(%eax), %xmm0
2748 movq %xmm0, 30(%edx)
2749 L(bk_write_30bytes):
2750 movq 22(%eax), %xmm0
2751 movq %xmm0, 22(%edx)
2752 L(bk_write_22bytes):
2753 movq 14(%eax), %xmm0
2754 movq %xmm0, 14(%edx)
2755 L(bk_write_14bytes):
2756 movq 6(%eax), %xmm0
2757 movq %xmm0, 6(%edx)
2758 L(bk_write_6bytes):
2759 movl 2(%eax), %ecx
2760 movl %ecx, 2(%edx)
2761 movzwl (%eax), %ecx
2762 movw %cx, (%edx)
2763 # ifndef USE_AS_BCOPY
2764 movl DEST(%esp), %eax
2765 # ifdef USE_AS_MEMPCPY
2766 movl LEN(%esp), %ecx
2767 add %ecx, %eax
2768 # endif
2769 # endif
2770 RETURN
2771
2772 .p2align 4
2773 L(bk_write_42bytes):
2774 movq 34(%eax), %xmm0
2775 movq %xmm0, 34(%edx)
2776 L(bk_write_34bytes):
2777 movq 26(%eax), %xmm0
2778 movq %xmm0, 26(%edx)
2779 L(bk_write_26bytes):
2780 movq 18(%eax), %xmm0
2781 movq %xmm0, 18(%edx)
2782 L(bk_write_18bytes):
2783 movq 10(%eax), %xmm0
2784 movq %xmm0, 10(%edx)
2785 L(bk_write_10bytes):
2786 movq 2(%eax), %xmm0
2787 movq %xmm0, 2(%edx)
2788 L(bk_write_2bytes):
2789 movzwl (%eax), %ecx
2790 movw %cx, (%edx)
2791 # ifndef USE_AS_BCOPY
2792 movl DEST(%esp), %eax
2793 # ifdef USE_AS_MEMPCPY
2794 movl LEN(%esp), %ecx
2795 add %ecx, %eax
2796 # endif
2797 # endif
2798 RETURN
2799
2800 .p2align 4
2801 L(bk_write_47bytes):
2802 movq 39(%eax), %xmm0
2803 movq %xmm0, 39(%edx)
2804 L(bk_write_39bytes):
2805 movq 31(%eax), %xmm0
2806 movq %xmm0, 31(%edx)
2807 L(bk_write_31bytes):
2808 movq 23(%eax), %xmm0
2809 movq %xmm0, 23(%edx)
2810 L(bk_write_23bytes):
2811 movq 15(%eax), %xmm0
2812 movq %xmm0, 15(%edx)
2813 L(bk_write_15bytes):
2814 movq 7(%eax), %xmm0
2815 movq %xmm0, 7(%edx)
2816 L(bk_write_7bytes):
2817 movl 3(%eax), %ecx
2818 movl %ecx, 3(%edx)
2819 movzwl 1(%eax), %ecx
2820 movw %cx, 1(%edx)
2821 movzbl (%eax), %eax
2822 movb %al, (%edx)
2823 # ifndef USE_AS_BCOPY
2824 movl DEST(%esp), %eax
2825 # ifdef USE_AS_MEMPCPY
2826 movl LEN(%esp), %ecx
2827 add %ecx, %eax
2828 # endif
2829 # endif
2830 RETURN
2831
2832 .p2align 4
2833 L(bk_write_43bytes):
2834 movq 35(%eax), %xmm0
2835 movq %xmm0, 35(%edx)
2836 L(bk_write_35bytes):
2837 movq 27(%eax), %xmm0
2838 movq %xmm0, 27(%edx)
2839 L(bk_write_27bytes):
2840 movq 19(%eax), %xmm0
2841 movq %xmm0, 19(%edx)
2842 L(bk_write_19bytes):
2843 movq 11(%eax), %xmm0
2844 movq %xmm0, 11(%edx)
2845 L(bk_write_11bytes):
2846 movq 3(%eax), %xmm0
2847 movq %xmm0, 3(%edx)
2848 L(bk_write_3bytes):
2849 movzwl 1(%eax), %ecx
2850 movw %cx, 1(%edx)
2851 movzbl (%eax), %eax
2852 movb %al, (%edx)
2853 # ifndef USE_AS_BCOPY
2854 movl DEST(%esp), %eax
2855 # ifdef USE_AS_MEMPCPY
2856 movl LEN(%esp), %ecx
2857 add %ecx, %eax
2858 # endif
2859 # endif
2860 RETURN_END
2861
2862
2863 .pushsection .rodata.ssse3,"a",@progbits
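/* Dispatch tables for the tail copies.  The 48-entry tables are
   indexed by the number of remaining bytes (0-47); L(shl_table) is
   indexed 0-15 and selects the matching L(shl_N) copy variant.
   Entries are emitted with JMPTBL and read by
   BRANCH_TO_JMPTBL_ENTRY.  */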
2864 .p2align 2
2865 L(table_48bytes_fwd):
2866 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2867 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2868 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2869 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2870 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2871 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2872 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2873 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2874 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2875 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2876 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2877 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2878 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2879 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2880 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2881 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2882 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2883 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2884 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2885 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2886 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2887 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2888 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2889 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2890 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2891 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2892 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2893 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2894 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2895 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2896 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2897 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2898 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2899 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2900 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2901 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2902 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2903 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2904 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2913 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2914
2915 .p2align 2
2916 L(table_48bytes_fwd_align):
2917 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2918 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2919 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2920 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2921 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2922 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2923 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2924 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2925 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2926 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2927 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2928 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2929 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2930 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2931 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2932 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2933 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2934 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2935 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2936 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2937 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2938 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2939 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2940 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2941 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2942 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2943 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2944 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2945 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2946 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2947 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2948 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2949 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2950 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2951 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2952 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2953 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2954 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2955 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
2964 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2965
2966 .p2align 2
2967 L(shl_table):
2968 .int JMPTBL (L(shl_0), L(shl_table))
2969 .int JMPTBL (L(shl_1), L(shl_table))
2970 .int JMPTBL (L(shl_2), L(shl_table))
2971 .int JMPTBL (L(shl_3), L(shl_table))
2972 .int JMPTBL (L(shl_4), L(shl_table))
2973 .int JMPTBL (L(shl_5), L(shl_table))
2974 .int JMPTBL (L(shl_6), L(shl_table))
2975 .int JMPTBL (L(shl_7), L(shl_table))
2976 .int JMPTBL (L(shl_8), L(shl_table))
2977 .int JMPTBL (L(shl_9), L(shl_table))
2978 .int JMPTBL (L(shl_10), L(shl_table))
2979 .int JMPTBL (L(shl_11), L(shl_table))
2980 .int JMPTBL (L(shl_12), L(shl_table))
2981 .int JMPTBL (L(shl_13), L(shl_table))
2982 .int JMPTBL (L(shl_14), L(shl_table))
2983 .int JMPTBL (L(shl_15), L(shl_table))
2984
2985 .p2align 2
2986 L(table_48_bytes_bwd):
2987 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
2988 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
2989 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
2990 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
2991 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
2992 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
2993 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
2994 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
2995 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
2996 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
2997 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
2998 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
2999 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
3000 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3001 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3002 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3003 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3004 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3005 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3006 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3007 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3008 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3009 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3010 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3011 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3012 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3013 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3014 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3015 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3016 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3017 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3018 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3019 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3020 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3021 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3022 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3023 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3024 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3025 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3034 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
3035
3036 .popsection
3037
3038 # ifdef USE_AS_MEMMOVE
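/* Backward copy path for memmove: %edi serves as the source pointer,
   EDX and EDI are advanced to the ends of the buffers, the
   destination end is aligned first to 4 and then to 16 bytes, the
   bulk is moved 64 bytes at a time in L(bk_ssse3_cpy), and the last
   0-47 bytes go through L(table_48_bytes_bwd).  */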
3039 .p2align 4
3040 L(copy_backward):
3041 PUSH (%edi)
3042 movl %eax, %edi
3043 lea (%ecx,%edx,1),%edx
3044 lea (%ecx,%edi,1),%edi
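/* EDX and EDI now point one byte past the end of the destination and
   the source; everything below works downward from here.  */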
3045 testl $0x3, %edx
3046 jnz L(bk_align)
3047
3048 L(bk_aligned_4):
3049 cmp $64, %ecx
3050 jae L(bk_write_more64bytes)
3051
3052 L(bk_write_64bytesless):
3053 cmp $32, %ecx
3054 jb L(bk_write_less32bytes)
3055
3056 L(bk_write_more32bytes):
3057 /* Copy 32 bytes at a time. */
3058 sub $32, %ecx
3059 movq -8(%edi), %xmm0
3060 movq %xmm0, -8(%edx)
3061 movq -16(%edi), %xmm0
3062 movq %xmm0, -16(%edx)
3063 movq -24(%edi), %xmm0
3064 movq %xmm0, -24(%edx)
3065 movq -32(%edi), %xmm0
3066 movq %xmm0, -32(%edx)
3067 sub $32, %edx
3068 sub $32, %edi
3069
3070 L(bk_write_less32bytes):
3071 movl %edi, %eax
3072 sub %ecx, %edx
3073 sub %ecx, %eax
3074 POP (%edi)
3075 L(bk_write_less32bytes_2):
3076 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3077
3078 CFI_PUSH (%edi)
3079
3080 .p2align 4
3081 L(bk_align):
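/* With eight or fewer bytes left there is nothing to gain from
   aligning; go straight to the tail.  */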
3082 cmp $8, %ecx
3083 jbe L(bk_write_less32bytes)
3084 testl $1, %edx
3085 /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
3086 then (EDX & 2) must be != 0. */
3087 jz L(bk_got2)
3088 sub $1, %edi
3089 sub $1, %ecx
3090 sub $1, %edx
3091 movzbl (%edi), %eax
3092 movb %al, (%edx)
3093
3094 testl $2, %edx
3095 jz L(bk_aligned_4)
3096
3097 L(bk_got2):
3098 sub $2, %edi
3099 sub $2, %ecx
3100 sub $2, %edx
3101 movzwl (%edi), %eax
3102 movw %ax, (%edx)
3103 jmp L(bk_aligned_4)
3104
3105 .p2align 4
3106 L(bk_write_more64bytes):
3107 /* Is the end of the destination 16-byte aligned? */
3108 testl $15, %edx
3109 jz L(bk_ssse3_cpy_pre)
3110
3111 /* EDX is 4-byte aligned but not 16-byte aligned; copy 4 bytes at a time (at most three times) until it is. */
3112 L(bk_ssse3_align):
3113 sub $4, %edi
3114 sub $4, %ecx
3115 sub $4, %edx
3116 movl (%edi), %eax
3117 movl %eax, (%edx)
3118
3119 testl $15, %edx
3120 jz L(bk_ssse3_cpy_pre)
3121
3122 sub $4, %edi
3123 sub $4, %ecx
3124 sub $4, %edx
3125 movl (%edi), %eax
3126 movl %eax, (%edx)
3127
3128 testl $15, %edx
3129 jz L(bk_ssse3_cpy_pre)
3130
3131 sub $4, %edi
3132 sub $4, %ecx
3133 sub $4, %edx
3134 movl (%edi), %eax
3135 movl %eax, (%edx)
3136
3137 L(bk_ssse3_cpy_pre):
3138 cmp $64, %ecx
3139 jb L(bk_write_more32bytes)
3140
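/* Bulk backward loop: 64 bytes per iteration, unaligned loads from
   the source and 16-byte aligned stores to the destination.  */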
3141 .p2align 4
3142 L(bk_ssse3_cpy):
3143 sub $64, %edi
3144 sub $64, %ecx
3145 sub $64, %edx
3146 movdqu 0x30(%edi), %xmm3
3147 movdqa %xmm3, 0x30(%edx)
3148 movdqu 0x20(%edi), %xmm2
3149 movdqa %xmm2, 0x20(%edx)
3150 movdqu 0x10(%edi), %xmm1
3151 movdqa %xmm1, 0x10(%edx)
3152 movdqu (%edi), %xmm0
3153 movdqa %xmm0, (%edx)
3154 cmp $64, %ecx
3155 jae L(bk_ssse3_cpy)
3156 jmp L(bk_write_64bytesless)
3157
3158 # endif
3159
3160 END (MEMCPY)
3161
3162 #endif