]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memcpy-ssse3-rep.S
1 /* memcpy with SSSE3 and REP string.
2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
22 #if IS_IN (libc) \
23 && (defined SHARED \
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
26
27 #include "asm-syntax.h"
28
29 #ifndef MEMCPY /* Default symbol names; skipped when MEMCPY is already defined at this point. */
30 # define MEMCPY __memcpy_ssse3_rep /* Name passed to ENTRY (MEMCPY) in this file. */
31 # define MEMCPY_CHK __memcpy_chk_ssse3_rep /* Name passed to ENTRY (MEMCPY_CHK) in this file. */
32 #endif
33
34 #ifdef USE_AS_BCOPY /* Argument offsets; the code uses them as SRC(%esp), DEST(%esp) and LEN(%esp). */
35 # define SRC PARMS /* With USE_AS_BCOPY the source is the first argument. */
36 # define DEST SRC+4 /* Destination is the second argument. */
37 # define LEN DEST+4 /* Length is the third argument. */
38 #else
39 # define DEST PARMS /* Otherwise the destination is the first argument. */
40 # define SRC DEST+4 /* Source is the second argument. */
41 # define LEN SRC+4 /* Length is the third argument. */
42 #endif
43
44 #define CFI_PUSH(REG) /* Unwind bookkeeping for a 4-byte push of REG: adjust the CFA offset by 4 and record REG at offset 0. */ \
45 cfi_adjust_cfa_offset (4); \
46 cfi_rel_offset (REG, 0)
47
48 #define CFI_POP(REG) /* Unwind bookkeeping for the matching pop: adjust the CFA offset by -4 and mark REG restored. */ \
49 cfi_adjust_cfa_offset (-4); \
50 cfi_restore (REG)
51
52 #define PUSH(REG) pushl REG; CFI_PUSH (REG) /* pushl plus its unwind bookkeeping. */
53 #define POP(REG) popl REG; CFI_POP (REG) /* popl plus its unwind bookkeeping. */
54
55 #ifdef PIC
56 # define PARMS 8 /* Preserve EBX: arguments start at 8(%esp) once ENTRANCE has pushed it. */
57 # define ENTRANCE PUSH (%ebx); /* Save EBX, which the PIC jump-table macros overwrite. */
58 # define RETURN_END POP (%ebx); ret /* Restore EBX and return. */
59 # define RETURN RETURN_END; CFI_PUSH (%ebx) /* Return, then re-declare EBX as pushed for the code that follows the ret. */
60 # define JMPTBL(I, B) I - B /* Table entry: label I stored as an offset from base B. */
61
62 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
63 jump table with relative offsets. INDEX is a register that contains the
64 index into the jump table. SCALE is the scale of INDEX. */
65 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
66 /* We first load PC into EBX. */ \
67 SETUP_PIC_REG(bx); \
68 /* Get the address of the jump table. */ \
69 addl $(TABLE - .), %ebx; \
70 /* Get the entry and convert the relative offset to the \
71 absolute address. */ \
72 addl (%ebx,INDEX,SCALE), %ebx; \
73 /* We loaded the jump table. Go. */ \
74 _CET_NOTRACK jmp *%ebx
75
76 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) /* First half of a split table jump: add TABLE - . to EBX. Assumes EBX already holds this instruction's address - TODO confirm at each use. */ \
77 addl $(TABLE - .), %ebx
78
79 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) /* Second half of a split table jump: EBX must already hold the address of TABLE. */ \
80 addl (%ebx,INDEX,SCALE), %ebx; \
81 /* We loaded the jump table. Go. */ \
82 _CET_NOTRACK jmp *%ebx
83 #else
84 # define PARMS 4 /* Nothing is pushed on entry, so arguments start at 4(%esp). */
85 # define ENTRANCE /* Empty: EBX is not used by the non-PIC macros below. */
86 # define RETURN_END ret
87 # define RETURN RETURN_END
88 # define JMPTBL(I, B) I /* Table entry: label I stored as is; B is unused. */
89
90 /* Branch to an entry in a jump table. TABLE is a jump table with
91 absolute offsets. INDEX is a register that contains the index into the
92 jump table. SCALE is the scale of INDEX. */
93 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
94 _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
95
96 # define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) /* Empty: nothing to precompute when TABLE is addressed absolutely. */
97
98 # define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) /* Same indirect jump as BRANCH_TO_JMPTBL_ENTRY in this configuration. */ \
99 _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
100 #endif
101
102 .section .text.ssse3,"ax",@progbits
103 #if !defined USE_AS_BCOPY && defined SHARED
104 ENTRY (MEMCPY_CHK) /* Checked entry point: compares two stack arguments, then continues into the code after END. */
105 movl 12(%esp), %eax /* eax = third stack argument (presumably the copy length - confirm against the caller). */
106 cmpl %eax, 16(%esp) /* Compare the fourth stack argument (presumably the destination buffer size) with eax. */
107 jb HIDDEN_JUMPTARGET (__chk_fail) /* Fourth argument below the third (unsigned): branch to __chk_fail. */
108 END (MEMCPY_CHK) /* No ret or unconditional jmp precedes this, so the passing case runs on into what follows. */
109 #endif
110 ENTRY (MEMCPY) /* Loads the three arguments and dispatches on length (and, under USE_AS_MEMMOVE, on overlap). */
111 ENTRANCE
112 movl LEN(%esp), %ecx /* ecx = byte count. */
113 movl SRC(%esp), %eax /* eax = source pointer. */
114 movl DEST(%esp), %edx /* edx = destination pointer. */
115
116 #ifdef USE_AS_MEMMOVE
117 cmp %eax, %edx
118 jb L(copy_forward) /* dst below src (unsigned): take the forward path. */
119 je L(fwd_write_0bytes) /* dst == src: go straight to the return sequence. */
120 cmp $48, %ecx
121 jb L(bk_write_less48bytes) /* dst above src and fewer than 48 bytes. */
122 add %ecx, %eax /* eax = src + len, used only for the overlap test. */
123 cmp %eax, %edx
124 movl SRC(%esp), %eax /* Reload src; movl does not change the flags set by the cmp. */
125 jb L(copy_backward) /* dst below src + len: the two ranges overlap. */
126
127 L(copy_forward):
128 #endif
129 cmp $48, %ecx
130 jae L(48bytesormore)
131
132 L(fwd_write_less32bytes): /* Falling through the jae above means ecx is below 48 here, despite the label name. */
133 #ifndef USE_AS_MEMMOVE
134 cmp %dl, %al
135 jb L(bk_write) /* Low byte of src below low byte of dst: dispatch through the other tail table. */
136 #endif
137 add %ecx, %edx /* Point both registers one past the end; the fwd_write_* tails address with negative offsets. */
138 add %ecx, %eax
139 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* Indexed by byte count, scale 4. */
140 #ifndef USE_AS_MEMMOVE
141 L(bk_write):
142 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* Same index, pointers left at the start of the buffers. */
143 #endif
144
145 ALIGN (4)
146 /* ECX >= 48 on the jae path above. EDX is said to be 4 byte aligned, but nothing in view checks that. */
147 L(48bytesormore):
148 movdqu (%eax), %xmm0 /* xmm0 = first 16 source bytes, kept for the unaligned head of dst. */
149 PUSH (%edi)
150 movl %edx, %edi /* edi = original dst. */
151 and $-16, %edx
152 PUSH (%esi)
153 cfi_remember_state /* Snapshot the unwind state with both EDI and ESI pushed. */
154 add $16, %edx /* edx = first 16-byte boundary strictly above dst. */
155 movl %edi, %esi /* esi = original dst. */
156 sub %edx, %edi /* edi = dst - aligned dst, a value from -16 to -1. */
157 add %edi, %ecx /* ecx = bytes left, counted from the aligned dst. */
158 sub %edi, %eax /* eax = src advanced by the same number of bytes. */
159
160 #ifdef SHARED_CACHE_SIZE_HALF
161 cmp $SHARED_CACHE_SIZE_HALF, %ecx
162 #else
163 # ifdef PIC
164 SETUP_PIC_REG(bx)
165 add $_GLOBAL_OFFSET_TABLE_, %ebx
166 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
167 # else
168 cmp __x86_shared_cache_size_half, %ecx
169 # endif
170 #endif
171
172 mov %eax, %edi /* mov leaves the cmp flags intact for the jae below. */
173 jae L(large_page) /* Remaining count at or above the shared-cache threshold. */
174 and $0xf, %edi /* edi = offset of src within its 16-byte block. */
175 jz L(shl_0) /* src is 16-byte aligned as well. */
176
177 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) /* Dispatch on the src misalignment through L(shl_table). */
178
179 ALIGN (4)
180 L(shl_0): /* Copy path using movdqa on both (%eax) and (%edx), so both must be 16-byte aligned here. */
181 movdqu %xmm0, (%esi) /* Write the saved first 16 bytes to the original, possibly unaligned, dst. */
182 xor %edi, %edi /* edi = running byte offset into both buffers. */
183 cmp $127, %ecx
184 ja L(shl_0_gobble) /* More than 127 bytes left: use the 128-byte loops. */
185 lea -32(%ecx), %ecx /* Bias the count so each sub $32 below borrows once fewer than 32 bytes would remain. */
186 L(shl_0_loop):
187 movdqa (%eax, %edi), %xmm0
188 movdqa 16(%eax, %edi), %xmm1
189 sub $32, %ecx
190 movdqa %xmm0, (%edx, %edi)
191 movdqa %xmm1, 16(%edx, %edi)
192 lea 32(%edi), %edi /* lea does not write the flags, so the jb still tests the sub $32. */
193 jb L(shl_0_end)
194
195 movdqa (%eax, %edi), %xmm0
196 movdqa 16(%eax, %edi), %xmm1
197 sub $32, %ecx
198 movdqa %xmm0, (%edx, %edi)
199 movdqa %xmm1, 16(%edx, %edi)
200 lea 32(%edi), %edi
201 jb L(shl_0_end)
202
203 movdqa (%eax, %edi), %xmm0
204 movdqa 16(%eax, %edi), %xmm1
205 sub $32, %ecx
206 movdqa %xmm0, (%edx, %edi)
207 movdqa %xmm1, 16(%edx, %edi)
208 lea 32(%edi), %edi
209 jb L(shl_0_end)
210
211 movdqa (%eax, %edi), %xmm0
212 movdqa 16(%eax, %edi), %xmm1
213 sub $32, %ecx
214 movdqa %xmm0, (%edx, %edi)
215 movdqa %xmm1, 16(%edx, %edi)
216 lea 32(%edi), %edi
217 L(shl_0_end):
218 lea 32(%ecx), %ecx /* Undo the bias: ecx = leftover byte count. */
219 add %ecx, %edi /* edi = bytes copied by this path plus the leftover. */
220 add %edi, %edx /* Advance both pointers to one past the end. */
221 add %edi, %eax
222 POP (%esi)
223 POP (%edi)
224 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* Finish the leftover bytes through the tail table. */
225
226 cfi_restore_state
227 cfi_remember_state
228 L(shl_0_gobble): /* Reached with more than 127 bytes left: choose between the cache loop that follows and L(shl_0_gobble_mem_start). */
229
230 #ifdef DATA_CACHE_SIZE_HALF
231 mov $DATA_CACHE_SIZE_HALF, %edi /* edi = compile-time cache size; the code after #endif expects it in edi, as the runtime loads below provide. */
232 #else
233 # ifdef PIC
234 SETUP_PIC_REG(bx)
235 add $_GLOBAL_OFFSET_TABLE_, %ebx
236 mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi /* edi = runtime cache size, read GOT-relative. */
237 # else
238 mov __x86_data_cache_size_half, %edi /* edi = runtime cache size. */
239 # endif
240 #endif
241 mov %edi, %esi
242 shr $3, %esi /* esi = size / 8. */
243 sub %esi, %edi /* edi = size - size / 8, the switch-over threshold. */
244 cmp %edi, %ecx
245 jae L(shl_0_gobble_mem_start) /* Count at or above the threshold: use the prefetching loop. */
246 sub $128, %ecx /* Bias the count for the 128-byte cache loop that follows. */
247 ALIGN (4)
248 L(shl_0_gobble_cache_loop):
249 movdqa (%eax), %xmm0
250 movaps 0x10(%eax), %xmm1
251 movaps 0x20(%eax), %xmm2
252 movaps 0x30(%eax), %xmm3
253 movaps 0x40(%eax), %xmm4
254 movaps 0x50(%eax), %xmm5
255 movaps 0x60(%eax), %xmm6
256 movaps 0x70(%eax), %xmm7
257 lea 0x80(%eax), %eax
258 sub $128, %ecx
259 movdqa %xmm0, (%edx)
260 movaps %xmm1, 0x10(%edx)
261 movaps %xmm2, 0x20(%edx)
262 movaps %xmm3, 0x30(%edx)
263 movaps %xmm4, 0x40(%edx)
264 movaps %xmm5, 0x50(%edx)
265 movaps %xmm6, 0x60(%edx)
266 movaps %xmm7, 0x70(%edx)
267 lea 0x80(%edx), %edx
268
269 jae L(shl_0_gobble_cache_loop)
270 add $0x80, %ecx
271 cmp $0x40, %ecx
272 jb L(shl_0_cache_less_64bytes)
273
274 movdqa (%eax), %xmm0
275 sub $0x40, %ecx
276 movdqa 0x10(%eax), %xmm1
277
278 movdqa %xmm0, (%edx)
279 movdqa %xmm1, 0x10(%edx)
280
281 movdqa 0x20(%eax), %xmm0
282 movdqa 0x30(%eax), %xmm1
283 add $0x40, %eax
284
285 movdqa %xmm0, 0x20(%edx)
286 movdqa %xmm1, 0x30(%edx)
287 add $0x40, %edx
288 L(shl_0_cache_less_64bytes):
289 cmp $0x20, %ecx
290 jb L(shl_0_cache_less_32bytes)
291 movdqa (%eax), %xmm0
292 sub $0x20, %ecx
293 movdqa 0x10(%eax), %xmm1
294 add $0x20, %eax
295 movdqa %xmm0, (%edx)
296 movdqa %xmm1, 0x10(%edx)
297 add $0x20, %edx
298 L(shl_0_cache_less_32bytes):
299 cmp $0x10, %ecx
300 jb L(shl_0_cache_less_16bytes)
301 sub $0x10, %ecx
302 movdqa (%eax), %xmm0
303 add $0x10, %eax
304 movdqa %xmm0, (%edx)
305 add $0x10, %edx
306 L(shl_0_cache_less_16bytes):
307 add %ecx, %edx
308 add %ecx, %eax
309 POP (%esi)
310 POP (%edi)
311 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
312
313 cfi_restore_state
314 cfi_remember_state
315 ALIGN (4)
316 L(shl_0_gobble_mem_start):
317 cmp %al, %dl
318 je L(copy_page_by_rep)
319 sub $128, %ecx
320 L(shl_0_gobble_mem_loop):
321 prefetchnta 0x1c0(%eax)
322 prefetchnta 0x280(%eax)
323 prefetchnta 0x1c0(%edx)
324 prefetchnta 0x280(%edx)
325
326 movdqa (%eax), %xmm0
327 movaps 0x10(%eax), %xmm1
328 movaps 0x20(%eax), %xmm2
329 movaps 0x30(%eax), %xmm3
330 movaps 0x40(%eax), %xmm4
331 movaps 0x50(%eax), %xmm5
332 movaps 0x60(%eax), %xmm6
333 movaps 0x70(%eax), %xmm7
334 lea 0x80(%eax), %eax
335 sub $0x80, %ecx
336 movdqa %xmm0, (%edx)
337 movaps %xmm1, 0x10(%edx)
338 movaps %xmm2, 0x20(%edx)
339 movaps %xmm3, 0x30(%edx)
340 movaps %xmm4, 0x40(%edx)
341 movaps %xmm5, 0x50(%edx)
342 movaps %xmm6, 0x60(%edx)
343 movaps %xmm7, 0x70(%edx)
344 lea 0x80(%edx), %edx
345
346 jae L(shl_0_gobble_mem_loop)
347 add $0x80, %ecx
348 cmp $0x40, %ecx
349 jb L(shl_0_mem_less_64bytes)
350
351 movdqa (%eax), %xmm0
352 sub $0x40, %ecx
353 movdqa 0x10(%eax), %xmm1
354
355 movdqa %xmm0, (%edx)
356 movdqa %xmm1, 0x10(%edx)
357
358 movdqa 0x20(%eax), %xmm0
359 movdqa 0x30(%eax), %xmm1
360 add $0x40, %eax
361
362 movdqa %xmm0, 0x20(%edx)
363 movdqa %xmm1, 0x30(%edx)
364 add $0x40, %edx
365 L(shl_0_mem_less_64bytes):
366 cmp $0x20, %ecx
367 jb L(shl_0_mem_less_32bytes)
368 movdqa (%eax), %xmm0
369 sub $0x20, %ecx
370 movdqa 0x10(%eax), %xmm1
371 add $0x20, %eax
372 movdqa %xmm0, (%edx)
373 movdqa %xmm1, 0x10(%edx)
374 add $0x20, %edx
375 L(shl_0_mem_less_32bytes):
376 cmp $0x10, %ecx
377 jb L(shl_0_mem_less_16bytes)
378 sub $0x10, %ecx
379 movdqa (%eax), %xmm0
380 add $0x10, %eax
381 movdqa %xmm0, (%edx)
382 add $0x10, %edx
383 L(shl_0_mem_less_16bytes):
384 add %ecx, %edx
385 add %ecx, %eax
386 POP (%esi)
387 POP (%edi)
388 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
389
390 cfi_restore_state
391 cfi_remember_state
392 ALIGN (4)
393 L(shl_1): /* Copy path for a src that sits 1 byte past a 16-byte boundary; dst (edx) is written with movdqa. */
394 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* PIC only: turn EBX into the tail-table address for the final jump. */
395 sub $1, %eax /* Step src back by its misalignment so the movaps/movdqa loads below are aligned. */
396 movaps (%eax), %xmm1 /* xmm1 = aligned block holding the first source bytes. */
397 xor %edi, %edi /* edi = running byte offset. */
398 sub $32, %ecx /* Bias the count by 32 for the borrow tests in the loop. */
399 movdqu %xmm0, (%esi) /* Write the saved first 16 bytes to the original dst. */
400 POP (%esi) /* ESI is no longer needed; only EDI stays pushed. */
401 L(shl_1_loop):
402
403 movdqa 16(%eax, %edi), %xmm2
404 sub $32, %ecx
405 movdqa 32(%eax, %edi), %xmm3
406 movdqa %xmm3, %xmm4 /* Keep the newest block; it is the "previous" block of the next half. */
407 palignr $1, %xmm2, %xmm3 /* xmm3 = bytes 1..15 of xmm2 followed by byte 0 of xmm3. */
408 palignr $1, %xmm1, %xmm2 /* xmm2 = bytes 1..15 of xmm1 followed by byte 0 of xmm2. */
409 lea 32(%edi), %edi
410 movdqa %xmm2, -32(%edx, %edi)
411 movdqa %xmm3, -16(%edx, %edi)
412
413 jb L(shl_1_end) /* Borrow from the sub $32 above; movdqa, palignr and lea do not write the flags. */
414
415 movdqa 16(%eax, %edi), %xmm2
416 sub $32, %ecx
417 movdqa 32(%eax, %edi), %xmm3
418 movdqa %xmm3, %xmm1 /* This half keeps the newest block in xmm1 for the first half. */
419 palignr $1, %xmm2, %xmm3
420 palignr $1, %xmm4, %xmm2 /* Previous block comes from xmm4 here. */
421 lea 32(%edi), %edi
422 movdqa %xmm2, -32(%edx, %edi)
423 movdqa %xmm3, -16(%edx, %edi)
424
425 jae L(shl_1_loop)
426
427 L(shl_1_end):
428 add $32, %ecx /* Undo the bias: ecx = leftover byte count. */
429 add %ecx, %edi /* edi = bytes copied by the loop plus the leftover. */
430 add %edi, %edx /* dst = one past the end. */
431 lea 1(%edi, %eax), %eax /* src = one past the end, adding back the 1 removed on entry. */
432 POP (%edi)
433 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Finish the leftover bytes through the tail table. */
434
435 cfi_restore_state
436 cfi_remember_state
437 ALIGN (4)
438 L(shl_2):
439 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
440 sub $2, %eax
441 movaps (%eax), %xmm1
442 xor %edi, %edi
443 sub $32, %ecx
444 movdqu %xmm0, (%esi)
445 POP (%esi)
446 L(shl_2_loop):
447
448 movdqa 16(%eax, %edi), %xmm2
449 sub $32, %ecx
450 movdqa 32(%eax, %edi), %xmm3
451 movdqa %xmm3, %xmm4
452 palignr $2, %xmm2, %xmm3
453 palignr $2, %xmm1, %xmm2
454 lea 32(%edi), %edi
455 movdqa %xmm2, -32(%edx, %edi)
456 movdqa %xmm3, -16(%edx, %edi)
457
458 jb L(shl_2_end)
459
460 movdqa 16(%eax, %edi), %xmm2
461 sub $32, %ecx
462 movdqa 32(%eax, %edi), %xmm3
463 movdqa %xmm3, %xmm1
464 palignr $2, %xmm2, %xmm3
465 palignr $2, %xmm4, %xmm2
466 lea 32(%edi), %edi
467 movdqa %xmm2, -32(%edx, %edi)
468 movdqa %xmm3, -16(%edx, %edi)
469
470 jae L(shl_2_loop)
471
472 L(shl_2_end):
473 add $32, %ecx
474 add %ecx, %edi
475 add %edi, %edx
476 lea 2(%edi, %eax), %eax
477 POP (%edi)
478 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
479
480 cfi_restore_state
481 cfi_remember_state
482 ALIGN (4)
483 L(shl_3):
484 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
485 sub $3, %eax
486 movaps (%eax), %xmm1
487 xor %edi, %edi
488 sub $32, %ecx
489 movdqu %xmm0, (%esi)
490 POP (%esi)
491 L(shl_3_loop):
492
493 movdqa 16(%eax, %edi), %xmm2
494 sub $32, %ecx
495 movdqa 32(%eax, %edi), %xmm3
496 movdqa %xmm3, %xmm4
497 palignr $3, %xmm2, %xmm3
498 palignr $3, %xmm1, %xmm2
499 lea 32(%edi), %edi
500 movdqa %xmm2, -32(%edx, %edi)
501 movdqa %xmm3, -16(%edx, %edi)
502
503 jb L(shl_3_end)
504
505 movdqa 16(%eax, %edi), %xmm2
506 sub $32, %ecx
507 movdqa 32(%eax, %edi), %xmm3
508 movdqa %xmm3, %xmm1
509 palignr $3, %xmm2, %xmm3
510 palignr $3, %xmm4, %xmm2
511 lea 32(%edi), %edi
512 movdqa %xmm2, -32(%edx, %edi)
513 movdqa %xmm3, -16(%edx, %edi)
514
515 jae L(shl_3_loop)
516
517 L(shl_3_end):
518 add $32, %ecx
519 add %ecx, %edi
520 add %edi, %edx
521 lea 3(%edi, %eax), %eax
522 POP (%edi)
523 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
524
525 cfi_restore_state
526 cfi_remember_state
527 ALIGN (4)
528 L(shl_4):
529 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
530 sub $4, %eax
531 movaps (%eax), %xmm1
532 xor %edi, %edi
533 sub $32, %ecx
534 movdqu %xmm0, (%esi)
535 POP (%esi)
536 L(shl_4_loop):
537
538 movdqa 16(%eax, %edi), %xmm2
539 sub $32, %ecx
540 movdqa 32(%eax, %edi), %xmm3
541 movdqa %xmm3, %xmm4
542 palignr $4, %xmm2, %xmm3
543 palignr $4, %xmm1, %xmm2
544 lea 32(%edi), %edi
545 movdqa %xmm2, -32(%edx, %edi)
546 movdqa %xmm3, -16(%edx, %edi)
547
548 jb L(shl_4_end)
549
550 movdqa 16(%eax, %edi), %xmm2
551 sub $32, %ecx
552 movdqa 32(%eax, %edi), %xmm3
553 movdqa %xmm3, %xmm1
554 palignr $4, %xmm2, %xmm3
555 palignr $4, %xmm4, %xmm2
556 lea 32(%edi), %edi
557 movdqa %xmm2, -32(%edx, %edi)
558 movdqa %xmm3, -16(%edx, %edi)
559
560 jae L(shl_4_loop)
561
562 L(shl_4_end):
563 add $32, %ecx
564 add %ecx, %edi
565 add %edi, %edx
566 lea 4(%edi, %eax), %eax
567 POP (%edi)
568 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
569
570 cfi_restore_state
571 cfi_remember_state
572 ALIGN (4)
573 L(shl_5):
574 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
575 sub $5, %eax
576 movaps (%eax), %xmm1
577 xor %edi, %edi
578 sub $32, %ecx
579 movdqu %xmm0, (%esi)
580 POP (%esi)
581 L(shl_5_loop):
582
583 movdqa 16(%eax, %edi), %xmm2
584 sub $32, %ecx
585 movdqa 32(%eax, %edi), %xmm3
586 movdqa %xmm3, %xmm4
587 palignr $5, %xmm2, %xmm3
588 palignr $5, %xmm1, %xmm2
589 lea 32(%edi), %edi
590 movdqa %xmm2, -32(%edx, %edi)
591 movdqa %xmm3, -16(%edx, %edi)
592
593 jb L(shl_5_end)
594
595 movdqa 16(%eax, %edi), %xmm2
596 sub $32, %ecx
597 movdqa 32(%eax, %edi), %xmm3
598 movdqa %xmm3, %xmm1
599 palignr $5, %xmm2, %xmm3
600 palignr $5, %xmm4, %xmm2
601 lea 32(%edi), %edi
602 movdqa %xmm2, -32(%edx, %edi)
603 movdqa %xmm3, -16(%edx, %edi)
604
605 jae L(shl_5_loop)
606
607 L(shl_5_end):
608 add $32, %ecx
609 add %ecx, %edi
610 add %edi, %edx
611 lea 5(%edi, %eax), %eax
612 POP (%edi)
613 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
614
615 cfi_restore_state
616 cfi_remember_state
617 ALIGN (4)
618 L(shl_6):
619 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
620 sub $6, %eax
621 movaps (%eax), %xmm1
622 xor %edi, %edi
623 sub $32, %ecx
624 movdqu %xmm0, (%esi)
625 POP (%esi)
626 L(shl_6_loop):
627
628 movdqa 16(%eax, %edi), %xmm2
629 sub $32, %ecx
630 movdqa 32(%eax, %edi), %xmm3
631 movdqa %xmm3, %xmm4
632 palignr $6, %xmm2, %xmm3
633 palignr $6, %xmm1, %xmm2
634 lea 32(%edi), %edi
635 movdqa %xmm2, -32(%edx, %edi)
636 movdqa %xmm3, -16(%edx, %edi)
637
638 jb L(shl_6_end)
639
640 movdqa 16(%eax, %edi), %xmm2
641 sub $32, %ecx
642 movdqa 32(%eax, %edi), %xmm3
643 movdqa %xmm3, %xmm1
644 palignr $6, %xmm2, %xmm3
645 palignr $6, %xmm4, %xmm2
646 lea 32(%edi), %edi
647 movdqa %xmm2, -32(%edx, %edi)
648 movdqa %xmm3, -16(%edx, %edi)
649
650 jae L(shl_6_loop)
651
652 L(shl_6_end):
653 add $32, %ecx
654 add %ecx, %edi
655 add %edi, %edx
656 lea 6(%edi, %eax), %eax
657 POP (%edi)
658 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
659
660 cfi_restore_state
661 cfi_remember_state
662 ALIGN (4)
663 L(shl_7):
664 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
665 sub $7, %eax
666 movaps (%eax), %xmm1
667 xor %edi, %edi
668 sub $32, %ecx
669 movdqu %xmm0, (%esi)
670 POP (%esi)
671 L(shl_7_loop):
672
673 movdqa 16(%eax, %edi), %xmm2
674 sub $32, %ecx
675 movdqa 32(%eax, %edi), %xmm3
676 movdqa %xmm3, %xmm4
677 palignr $7, %xmm2, %xmm3
678 palignr $7, %xmm1, %xmm2
679 lea 32(%edi), %edi
680 movdqa %xmm2, -32(%edx, %edi)
681 movdqa %xmm3, -16(%edx, %edi)
682
683 jb L(shl_7_end)
684
685 movdqa 16(%eax, %edi), %xmm2
686 sub $32, %ecx
687 movdqa 32(%eax, %edi), %xmm3
688 movdqa %xmm3, %xmm1
689 palignr $7, %xmm2, %xmm3
690 palignr $7, %xmm4, %xmm2
691 lea 32(%edi), %edi
692 movdqa %xmm2, -32(%edx, %edi)
693 movdqa %xmm3, -16(%edx, %edi)
694
695 jae L(shl_7_loop)
696
697 L(shl_7_end):
698 add $32, %ecx
699 add %ecx, %edi
700 add %edi, %edx
701 lea 7(%edi, %eax), %eax
702 POP (%edi)
703 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
704
705 cfi_restore_state
706 cfi_remember_state
707 ALIGN (4)
708 L(shl_8):
709 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
710 sub $8, %eax
711 movaps (%eax), %xmm1
712 xor %edi, %edi
713 sub $32, %ecx
714 movdqu %xmm0, (%esi)
715 POP (%esi)
716 L(shl_8_loop):
717
718 movdqa 16(%eax, %edi), %xmm2
719 sub $32, %ecx
720 movdqa 32(%eax, %edi), %xmm3
721 movdqa %xmm3, %xmm4
722 palignr $8, %xmm2, %xmm3
723 palignr $8, %xmm1, %xmm2
724 lea 32(%edi), %edi
725 movdqa %xmm2, -32(%edx, %edi)
726 movdqa %xmm3, -16(%edx, %edi)
727
728 jb L(shl_8_end)
729
730 movdqa 16(%eax, %edi), %xmm2
731 sub $32, %ecx
732 movdqa 32(%eax, %edi), %xmm3
733 movdqa %xmm3, %xmm1
734 palignr $8, %xmm2, %xmm3
735 palignr $8, %xmm4, %xmm2
736 lea 32(%edi), %edi
737 movdqa %xmm2, -32(%edx, %edi)
738 movdqa %xmm3, -16(%edx, %edi)
739
740 jae L(shl_8_loop)
741
742 L(shl_8_end):
743 add $32, %ecx
744 add %ecx, %edi
745 add %edi, %edx
746 lea 8(%edi, %eax), %eax
747 POP (%edi)
748 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
749
750 cfi_restore_state
751 cfi_remember_state
752 ALIGN (4)
753 L(shl_9):
754 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
755 sub $9, %eax
756 movaps (%eax), %xmm1
757 xor %edi, %edi
758 sub $32, %ecx
759 movdqu %xmm0, (%esi)
760 POP (%esi)
761 L(shl_9_loop):
762
763 movdqa 16(%eax, %edi), %xmm2
764 sub $32, %ecx
765 movdqa 32(%eax, %edi), %xmm3
766 movdqa %xmm3, %xmm4
767 palignr $9, %xmm2, %xmm3
768 palignr $9, %xmm1, %xmm2
769 lea 32(%edi), %edi
770 movdqa %xmm2, -32(%edx, %edi)
771 movdqa %xmm3, -16(%edx, %edi)
772
773 jb L(shl_9_end)
774
775 movdqa 16(%eax, %edi), %xmm2
776 sub $32, %ecx
777 movdqa 32(%eax, %edi), %xmm3
778 movdqa %xmm3, %xmm1
779 palignr $9, %xmm2, %xmm3
780 palignr $9, %xmm4, %xmm2
781 lea 32(%edi), %edi
782 movdqa %xmm2, -32(%edx, %edi)
783 movdqa %xmm3, -16(%edx, %edi)
784
785 jae L(shl_9_loop)
786
787 L(shl_9_end):
788 add $32, %ecx
789 add %ecx, %edi
790 add %edi, %edx
791 lea 9(%edi, %eax), %eax
792 POP (%edi)
793 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
794
795 cfi_restore_state
796 cfi_remember_state
797 ALIGN (4)
798 L(shl_10):
799 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
800 sub $10, %eax
801 movaps (%eax), %xmm1
802 xor %edi, %edi
803 sub $32, %ecx
804 movdqu %xmm0, (%esi)
805 POP (%esi)
806 L(shl_10_loop):
807
808 movdqa 16(%eax, %edi), %xmm2
809 sub $32, %ecx
810 movdqa 32(%eax, %edi), %xmm3
811 movdqa %xmm3, %xmm4
812 palignr $10, %xmm2, %xmm3
813 palignr $10, %xmm1, %xmm2
814 lea 32(%edi), %edi
815 movdqa %xmm2, -32(%edx, %edi)
816 movdqa %xmm3, -16(%edx, %edi)
817
818 jb L(shl_10_end)
819
820 movdqa 16(%eax, %edi), %xmm2
821 sub $32, %ecx
822 movdqa 32(%eax, %edi), %xmm3
823 movdqa %xmm3, %xmm1
824 palignr $10, %xmm2, %xmm3
825 palignr $10, %xmm4, %xmm2
826 lea 32(%edi), %edi
827 movdqa %xmm2, -32(%edx, %edi)
828 movdqa %xmm3, -16(%edx, %edi)
829
830 jae L(shl_10_loop)
831
832 L(shl_10_end):
833 add $32, %ecx
834 add %ecx, %edi
835 add %edi, %edx
836 lea 10(%edi, %eax), %eax
837 POP (%edi)
838 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
839
840 cfi_restore_state
841 cfi_remember_state
842 ALIGN (4)
843 L(shl_11):
844 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
845 sub $11, %eax
846 movaps (%eax), %xmm1
847 xor %edi, %edi
848 sub $32, %ecx
849 movdqu %xmm0, (%esi)
850 POP (%esi)
851 L(shl_11_loop):
852
853 movdqa 16(%eax, %edi), %xmm2
854 sub $32, %ecx
855 movdqa 32(%eax, %edi), %xmm3
856 movdqa %xmm3, %xmm4
857 palignr $11, %xmm2, %xmm3
858 palignr $11, %xmm1, %xmm2
859 lea 32(%edi), %edi
860 movdqa %xmm2, -32(%edx, %edi)
861 movdqa %xmm3, -16(%edx, %edi)
862
863 jb L(shl_11_end)
864
865 movdqa 16(%eax, %edi), %xmm2
866 sub $32, %ecx
867 movdqa 32(%eax, %edi), %xmm3
868 movdqa %xmm3, %xmm1
869 palignr $11, %xmm2, %xmm3
870 palignr $11, %xmm4, %xmm2
871 lea 32(%edi), %edi
872 movdqa %xmm2, -32(%edx, %edi)
873 movdqa %xmm3, -16(%edx, %edi)
874
875 jae L(shl_11_loop)
876
877 L(shl_11_end):
878 add $32, %ecx
879 add %ecx, %edi
880 add %edi, %edx
881 lea 11(%edi, %eax), %eax
882 POP (%edi)
883 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
884
885 cfi_restore_state
886 cfi_remember_state
887 ALIGN (4)
888 L(shl_12):
889 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
890 sub $12, %eax
891 movaps (%eax), %xmm1
892 xor %edi, %edi
893 sub $32, %ecx
894 movdqu %xmm0, (%esi)
895 POP (%esi)
896 L(shl_12_loop):
897
898 movdqa 16(%eax, %edi), %xmm2
899 sub $32, %ecx
900 movdqa 32(%eax, %edi), %xmm3
901 movdqa %xmm3, %xmm4
902 palignr $12, %xmm2, %xmm3
903 palignr $12, %xmm1, %xmm2
904 lea 32(%edi), %edi
905 movdqa %xmm2, -32(%edx, %edi)
906 movdqa %xmm3, -16(%edx, %edi)
907
908 jb L(shl_12_end)
909
910 movdqa 16(%eax, %edi), %xmm2
911 sub $32, %ecx
912 movdqa 32(%eax, %edi), %xmm3
913 movdqa %xmm3, %xmm1
914 palignr $12, %xmm2, %xmm3
915 palignr $12, %xmm4, %xmm2
916 lea 32(%edi), %edi
917 movdqa %xmm2, -32(%edx, %edi)
918 movdqa %xmm3, -16(%edx, %edi)
919
920 jae L(shl_12_loop)
921
922 L(shl_12_end):
923 add $32, %ecx
924 add %ecx, %edi
925 add %edi, %edx
926 lea 12(%edi, %eax), %eax
927 POP (%edi)
928 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
929
930 cfi_restore_state
931 cfi_remember_state
932 ALIGN (4)
933 L(shl_13):
934 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
935 sub $13, %eax
936 movaps (%eax), %xmm1
937 xor %edi, %edi
938 sub $32, %ecx
939 movdqu %xmm0, (%esi)
940 POP (%esi)
941 L(shl_13_loop):
942
943 movdqa 16(%eax, %edi), %xmm2
944 sub $32, %ecx
945 movdqa 32(%eax, %edi), %xmm3
946 movdqa %xmm3, %xmm4
947 palignr $13, %xmm2, %xmm3
948 palignr $13, %xmm1, %xmm2
949 lea 32(%edi), %edi
950 movdqa %xmm2, -32(%edx, %edi)
951 movdqa %xmm3, -16(%edx, %edi)
952
953 jb L(shl_13_end)
954
955 movdqa 16(%eax, %edi), %xmm2
956 sub $32, %ecx
957 movdqa 32(%eax, %edi), %xmm3
958 movdqa %xmm3, %xmm1
959 palignr $13, %xmm2, %xmm3
960 palignr $13, %xmm4, %xmm2
961 lea 32(%edi), %edi
962 movdqa %xmm2, -32(%edx, %edi)
963 movdqa %xmm3, -16(%edx, %edi)
964
965 jae L(shl_13_loop)
966
967 L(shl_13_end):
968 add $32, %ecx
969 add %ecx, %edi
970 add %edi, %edx
971 lea 13(%edi, %eax), %eax
972 POP (%edi)
973 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
974
975 cfi_restore_state
976 cfi_remember_state
977 ALIGN (4)
978 L(shl_14):
979 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
980 sub $14, %eax
981 movaps (%eax), %xmm1
982 xor %edi, %edi
983 sub $32, %ecx
984 movdqu %xmm0, (%esi)
985 POP (%esi)
986 L(shl_14_loop):
987
988 movdqa 16(%eax, %edi), %xmm2
989 sub $32, %ecx
990 movdqa 32(%eax, %edi), %xmm3
991 movdqa %xmm3, %xmm4
992 palignr $14, %xmm2, %xmm3
993 palignr $14, %xmm1, %xmm2
994 lea 32(%edi), %edi
995 movdqa %xmm2, -32(%edx, %edi)
996 movdqa %xmm3, -16(%edx, %edi)
997
998 jb L(shl_14_end)
999
1000 movdqa 16(%eax, %edi), %xmm2
1001 sub $32, %ecx
1002 movdqa 32(%eax, %edi), %xmm3
1003 movdqa %xmm3, %xmm1
1004 palignr $14, %xmm2, %xmm3
1005 palignr $14, %xmm4, %xmm2
1006 lea 32(%edi), %edi
1007 movdqa %xmm2, -32(%edx, %edi)
1008 movdqa %xmm3, -16(%edx, %edi)
1009
1010 jae L(shl_14_loop)
1011
1012 L(shl_14_end):
1013 add $32, %ecx
1014 add %ecx, %edi
1015 add %edi, %edx
1016 lea 14(%edi, %eax), %eax
1017 POP (%edi)
1018 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1019
1020 cfi_restore_state
1021 cfi_remember_state
1022 ALIGN (4)
1023 L(shl_15):
1024 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1025 sub $15, %eax
1026 movaps (%eax), %xmm1
1027 xor %edi, %edi
1028 sub $32, %ecx
1029 movdqu %xmm0, (%esi)
1030 POP (%esi)
1031 L(shl_15_loop):
1032
1033 movdqa 16(%eax, %edi), %xmm2
1034 sub $32, %ecx
1035 movdqa 32(%eax, %edi), %xmm3
1036 movdqa %xmm3, %xmm4
1037 palignr $15, %xmm2, %xmm3
1038 palignr $15, %xmm1, %xmm2
1039 lea 32(%edi), %edi
1040 movdqa %xmm2, -32(%edx, %edi)
1041 movdqa %xmm3, -16(%edx, %edi)
1042
1043 jb L(shl_15_end)
1044
1045 movdqa 16(%eax, %edi), %xmm2
1046 sub $32, %ecx
1047 movdqa 32(%eax, %edi), %xmm3
1048 movdqa %xmm3, %xmm1
1049 palignr $15, %xmm2, %xmm3
1050 palignr $15, %xmm4, %xmm2
1051 lea 32(%edi), %edi
1052 movdqa %xmm2, -32(%edx, %edi)
1053 movdqa %xmm3, -16(%edx, %edi)
1054
1055 jae L(shl_15_loop)
1056
1057 L(shl_15_end):
1058 add $32, %ecx
1059 add %ecx, %edi
1060 add %edi, %edx
1061 lea 15(%edi, %eax), %eax
1062 POP (%edi)
1063 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1064
1065
1066 ALIGN (4)
1067 L(fwd_write_44bytes):
1068 movl -44(%eax), %ecx
1069 movl %ecx, -44(%edx)
1070 L(fwd_write_40bytes):
1071 movl -40(%eax), %ecx
1072 movl %ecx, -40(%edx)
1073 L(fwd_write_36bytes):
1074 movl -36(%eax), %ecx
1075 movl %ecx, -36(%edx)
1076 L(fwd_write_32bytes):
1077 movl -32(%eax), %ecx
1078 movl %ecx, -32(%edx)
1079 L(fwd_write_28bytes):
1080 movl -28(%eax), %ecx
1081 movl %ecx, -28(%edx)
1082 L(fwd_write_24bytes):
1083 movl -24(%eax), %ecx
1084 movl %ecx, -24(%edx)
1085 L(fwd_write_20bytes):
1086 movl -20(%eax), %ecx
1087 movl %ecx, -20(%edx)
1088 L(fwd_write_16bytes):
1089 movl -16(%eax), %ecx
1090 movl %ecx, -16(%edx)
1091 L(fwd_write_12bytes):
1092 movl -12(%eax), %ecx
1093 movl %ecx, -12(%edx)
1094 L(fwd_write_8bytes):
1095 movl -8(%eax), %ecx
1096 movl %ecx, -8(%edx)
1097 L(fwd_write_4bytes):
1098 movl -4(%eax), %ecx
1099 movl %ecx, -4(%edx)
1100 L(fwd_write_0bytes):
1101 #ifndef USE_AS_BCOPY
1102 # ifdef USE_AS_MEMPCPY
1103 movl %edx, %eax
1104 # else
1105 movl DEST(%esp), %eax
1106 # endif
1107 #endif
1108 RETURN
1109
1110 ALIGN (4)
1111 L(fwd_write_5bytes):
1112 movl -5(%eax), %ecx
1113 movl -4(%eax), %eax
1114 movl %ecx, -5(%edx)
1115 movl %eax, -4(%edx)
1116 #ifndef USE_AS_BCOPY
1117 # ifdef USE_AS_MEMPCPY
1118 movl %edx, %eax
1119 # else
1120 movl DEST(%esp), %eax
1121 # endif
1122 #endif
1123 RETURN
1124
1125 ALIGN (4)
1126 L(fwd_write_45bytes):
1127 movl -45(%eax), %ecx
1128 movl %ecx, -45(%edx)
1129 L(fwd_write_41bytes):
1130 movl -41(%eax), %ecx
1131 movl %ecx, -41(%edx)
1132 L(fwd_write_37bytes):
1133 movl -37(%eax), %ecx
1134 movl %ecx, -37(%edx)
1135 L(fwd_write_33bytes):
1136 movl -33(%eax), %ecx
1137 movl %ecx, -33(%edx)
1138 L(fwd_write_29bytes):
1139 movl -29(%eax), %ecx
1140 movl %ecx, -29(%edx)
1141 L(fwd_write_25bytes):
1142 movl -25(%eax), %ecx
1143 movl %ecx, -25(%edx)
1144 L(fwd_write_21bytes):
1145 movl -21(%eax), %ecx
1146 movl %ecx, -21(%edx)
1147 L(fwd_write_17bytes):
1148 movl -17(%eax), %ecx
1149 movl %ecx, -17(%edx)
1150 L(fwd_write_13bytes):
1151 movl -13(%eax), %ecx
1152 movl %ecx, -13(%edx)
1153 L(fwd_write_9bytes):
1154 movl -9(%eax), %ecx
1155 movl %ecx, -9(%edx)
1156 movl -5(%eax), %ecx
1157 movl %ecx, -5(%edx)
1158 L(fwd_write_1bytes):
1159 movzbl -1(%eax), %ecx
1160 movb %cl, -1(%edx)
1161 #ifndef USE_AS_BCOPY
1162 # ifdef USE_AS_MEMPCPY
1163 movl %edx, %eax
1164 # else
1165 movl DEST(%esp), %eax
1166 # endif
1167 #endif
1168 RETURN
1169
1170 ALIGN (4)
1171 L(fwd_write_46bytes):
1172 movl -46(%eax), %ecx
1173 movl %ecx, -46(%edx)
1174 L(fwd_write_42bytes):
1175 movl -42(%eax), %ecx
1176 movl %ecx, -42(%edx)
1177 L(fwd_write_38bytes):
1178 movl -38(%eax), %ecx
1179 movl %ecx, -38(%edx)
1180 L(fwd_write_34bytes):
1181 movl -34(%eax), %ecx
1182 movl %ecx, -34(%edx)
1183 L(fwd_write_30bytes):
1184 movl -30(%eax), %ecx
1185 movl %ecx, -30(%edx)
1186 L(fwd_write_26bytes):
1187 movl -26(%eax), %ecx
1188 movl %ecx, -26(%edx)
1189 L(fwd_write_22bytes):
1190 movl -22(%eax), %ecx
1191 movl %ecx, -22(%edx)
1192 L(fwd_write_18bytes):
1193 movl -18(%eax), %ecx
1194 movl %ecx, -18(%edx)
1195 L(fwd_write_14bytes):
1196 movl -14(%eax), %ecx
1197 movl %ecx, -14(%edx)
1198 L(fwd_write_10bytes):
1199 movl -10(%eax), %ecx
1200 movl %ecx, -10(%edx)
1201 L(fwd_write_6bytes):
1202 movl -6(%eax), %ecx
1203 movl %ecx, -6(%edx)
1204 L(fwd_write_2bytes):
1205 movzwl -2(%eax), %ecx
1206 movw %cx, -2(%edx)
1207 #ifndef USE_AS_BCOPY
1208 # ifdef USE_AS_MEMPCPY
1209 movl %edx, %eax
1210 # else
1211 movl DEST(%esp), %eax
1212 # endif
1213 #endif
1214 RETURN
1215
1216 ALIGN (4)
1217 L(fwd_write_47bytes):
1218 movl -47(%eax), %ecx
1219 movl %ecx, -47(%edx)
1220 L(fwd_write_43bytes):
1221 movl -43(%eax), %ecx
1222 movl %ecx, -43(%edx)
1223 L(fwd_write_39bytes):
1224 movl -39(%eax), %ecx
1225 movl %ecx, -39(%edx)
1226 L(fwd_write_35bytes):
1227 movl -35(%eax), %ecx
1228 movl %ecx, -35(%edx)
1229 L(fwd_write_31bytes):
1230 movl -31(%eax), %ecx
1231 movl %ecx, -31(%edx)
1232 L(fwd_write_27bytes):
1233 movl -27(%eax), %ecx
1234 movl %ecx, -27(%edx)
1235 L(fwd_write_23bytes):
1236 movl -23(%eax), %ecx
1237 movl %ecx, -23(%edx)
1238 L(fwd_write_19bytes):
1239 movl -19(%eax), %ecx
1240 movl %ecx, -19(%edx)
1241 L(fwd_write_15bytes):
1242 movl -15(%eax), %ecx
1243 movl %ecx, -15(%edx)
1244 L(fwd_write_11bytes):
1245 movl -11(%eax), %ecx
1246 movl %ecx, -11(%edx)
1247 L(fwd_write_7bytes):
1248 movl -7(%eax), %ecx
1249 movl %ecx, -7(%edx)
1250 L(fwd_write_3bytes):
1251 movzwl -3(%eax), %ecx
1252 movzbl -1(%eax), %eax
1253 movw %cx, -3(%edx)
1254 movb %al, -1(%edx)
1255 #ifndef USE_AS_BCOPY
1256 # ifdef USE_AS_MEMPCPY
1257 movl %edx, %eax
1258 # else
1259 movl DEST(%esp), %eax
1260 # endif
1261 #endif
1262 RETURN_END
1263
1264 cfi_restore_state
1265 cfi_remember_state
1266 ALIGN (4)
1267 L(large_page): /* Path for counts at or above the shared-cache threshold: unaligned loads, non-temporal stores. */
1268 movdqu (%eax), %xmm1
1269 movdqu %xmm0, (%esi) /* Write the saved first 16 bytes to the original dst. */
1270 movntdq %xmm1, (%edx) /* First aligned 16 bytes; movntdq needs a 16-byte aligned address. */
1271 add $0x10, %eax
1272 add $0x10, %edx
1273 sub $0x10, %ecx
1274 cmp %al, %dl
1275 je L(copy_page_by_rep) /* Low bytes of src and dst are equal: hand over to the rep movsl path. */
1276 L(large_page_loop_init):
1277 POP (%esi)
1278 sub $0x80, %ecx /* Bias the count for the 128-byte loop. */
1279 POP (%edi)
1280 L(large_page_loop):
1281 prefetchnta 0x1c0(%eax)
1282 prefetchnta 0x280(%eax)
1283 movdqu (%eax), %xmm0
1284 movdqu 0x10(%eax), %xmm1
1285 movdqu 0x20(%eax), %xmm2
1286 movdqu 0x30(%eax), %xmm3
1287 movdqu 0x40(%eax), %xmm4
1288 movdqu 0x50(%eax), %xmm5
1289 movdqu 0x60(%eax), %xmm6
1290 movdqu 0x70(%eax), %xmm7
1291 lea 0x80(%eax), %eax
1292 lfence /* Load fence between the eight loads and the eight stores. */
1293 sub $0x80, %ecx
1294 movntdq %xmm0, (%edx)
1295 movntdq %xmm1, 0x10(%edx)
1296 movntdq %xmm2, 0x20(%edx)
1297 movntdq %xmm3, 0x30(%edx)
1298 movntdq %xmm4, 0x40(%edx)
1299 movntdq %xmm5, 0x50(%edx)
1300 movntdq %xmm6, 0x60(%edx)
1301 movntdq %xmm7, 0x70(%edx)
1302 lea 0x80(%edx), %edx
1303 jae L(large_page_loop) /* Tests the sub $0x80 above; movntdq and lea do not write the flags. */
1304 add $0x80, %ecx /* Undo the bias: ecx = bytes left, below 128. */
1305 cmp $0x40, %ecx
1306 jb L(large_page_less_64bytes)
1307
1308 movdqu (%eax), %xmm0
1309 movdqu 0x10(%eax), %xmm1
1310 movdqu 0x20(%eax), %xmm2
1311 movdqu 0x30(%eax), %xmm3
1312 lea 0x40(%eax), %eax
1313
1314 movntdq %xmm0, (%edx)
1315 movntdq %xmm1, 0x10(%edx)
1316 movntdq %xmm2, 0x20(%edx)
1317 movntdq %xmm3, 0x30(%edx)
1318 lea 0x40(%edx), %edx
1319 sub $0x40, %ecx
1320 L(large_page_less_64bytes):
1321 cmp $32, %ecx
1322 jb L(large_page_less_32bytes)
1323 movdqu (%eax), %xmm0
1324 movdqu 0x10(%eax), %xmm1
1325 lea 0x20(%eax), %eax
1326 movntdq %xmm0, (%edx)
1327 movntdq %xmm1, 0x10(%edx)
1328 lea 0x20(%edx), %edx
1329 sub $0x20, %ecx
1330 L(large_page_less_32bytes):
1331 add %ecx, %edx /* Point both registers one past the end for the tail table. */
1332 add %ecx, %eax
1333 sfence /* Store fence after the non-temporal stores, before the ordinary tail stores. */
1334 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* Fewer than 32 bytes left: finish through the tail table. */
1335
1336 cfi_restore_state
1337 cfi_remember_state
1338 ALIGN (4)
/* Copy ECX bytes from EAX to EDX with rep movsl, then finish the 0-3
   leftover bytes by hand.  ESI and EDI are popped at the exit label, so
   they are expected to be saved on the stack when this label is reached.  */
1339 L(copy_page_by_rep):
1340 mov %eax, %esi
1341 mov %edx, %edi
1342 mov %ecx, %edx
1343 shr $2, %ecx /* ECX = number of whole dwords. */
1344 and $3, %edx /* EDX = leftover bytes; ZF set when there are none. */
1345 rep movsl /* String moves do not modify the flags. */
1346 jz L(copy_page_by_rep_exit) /* Uses ZF from the and above. */
1347 cmp $2, %edx
1348 jb L(copy_page_by_rep_left_1) /* Exactly one byte left. */
/* Two or three bytes left: copy a word first.  */
1349 movzwl (%esi), %eax
1350 movw %ax, (%edi)
1351 add $2, %esi
1352 add $2, %edi
1353 sub $2, %edx
1354 jz L(copy_page_by_rep_exit)
1355 L(copy_page_by_rep_left_1):
1356 movzbl (%esi), %eax
1357 movb %al, (%edi)
1358 L(copy_page_by_rep_exit):
1359 POP (%esi)
1360 POP (%edi)
/* Return value: the DEST argument reloaded from the stack; the mempcpy
   build adds LEN to it.  The bcopy build sets no return value.  */
1361 #ifndef USE_AS_BCOPY
1362 movl DEST(%esp), %eax
1363 # ifdef USE_AS_MEMPCPY
1364 movl LEN(%esp), %ecx
1365 add %ecx, %eax
1366 # endif
1367 #endif
1368 RETURN
1369
1370 ALIGN (4)
/* Tail handlers for sizes that are a multiple of 4 (44 down to 0),
   referenced from L(table_48_bytes_bwd).  EAX = source and EDX =
   destination, both pointing at the first byte of the tail.  Each label
   copies one dword at the highest remaining offset and falls through to
   the next smaller size.  ECX is used as scratch.  */
1371 L(bk_write_44bytes):
1372 movl 40(%eax), %ecx
1373 movl %ecx, 40(%edx)
1374 L(bk_write_40bytes):
1375 movl 36(%eax), %ecx
1376 movl %ecx, 36(%edx)
1377 L(bk_write_36bytes):
1378 movl 32(%eax), %ecx
1379 movl %ecx, 32(%edx)
1380 L(bk_write_32bytes):
1381 movl 28(%eax), %ecx
1382 movl %ecx, 28(%edx)
1383 L(bk_write_28bytes):
1384 movl 24(%eax), %ecx
1385 movl %ecx, 24(%edx)
1386 L(bk_write_24bytes):
1387 movl 20(%eax), %ecx
1388 movl %ecx, 20(%edx)
1389 L(bk_write_20bytes):
1390 movl 16(%eax), %ecx
1391 movl %ecx, 16(%edx)
1392 L(bk_write_16bytes):
1393 movl 12(%eax), %ecx
1394 movl %ecx, 12(%edx)
1395 L(bk_write_12bytes):
1396 movl 8(%eax), %ecx
1397 movl %ecx, 8(%edx)
1398 L(bk_write_8bytes):
1399 movl 4(%eax), %ecx
1400 movl %ecx, 4(%edx)
1401 L(bk_write_4bytes):
1402 movl (%eax), %ecx
1403 movl %ecx, (%edx)
/* Nothing left to copy: only set up the return value.  */
1404 L(bk_write_0bytes):
/* Return value: the DEST argument reloaded from the stack; the mempcpy
   build adds LEN to it.  The bcopy build sets no return value.  */
1405 #ifndef USE_AS_BCOPY
1406 movl DEST(%esp), %eax
1407 # ifdef USE_AS_MEMPCPY
1408 movl LEN(%esp), %ecx
1409 add %ecx, %eax
1410 # endif
1411 #endif
1412 RETURN
1413
1414 ALIGN (4)
/* Tail handlers for sizes of the form 4k+1 (45 down to 1), referenced
   from L(table_48_bytes_bwd).  EAX = source and EDX = destination, both
   pointing at the first byte of the tail.  Each label copies one dword
   at the highest remaining offset and falls through; the final single
   byte at offset 0 is copied last.  ECX is used as scratch.  */
1415 L(bk_write_45bytes):
1416 movl 41(%eax), %ecx
1417 movl %ecx, 41(%edx)
1418 L(bk_write_41bytes):
1419 movl 37(%eax), %ecx
1420 movl %ecx, 37(%edx)
1421 L(bk_write_37bytes):
1422 movl 33(%eax), %ecx
1423 movl %ecx, 33(%edx)
1424 L(bk_write_33bytes):
1425 movl 29(%eax), %ecx
1426 movl %ecx, 29(%edx)
1427 L(bk_write_29bytes):
1428 movl 25(%eax), %ecx
1429 movl %ecx, 25(%edx)
1430 L(bk_write_25bytes):
1431 movl 21(%eax), %ecx
1432 movl %ecx, 21(%edx)
1433 L(bk_write_21bytes):
1434 movl 17(%eax), %ecx
1435 movl %ecx, 17(%edx)
1436 L(bk_write_17bytes):
1437 movl 13(%eax), %ecx
1438 movl %ecx, 13(%edx)
1439 L(bk_write_13bytes):
1440 movl 9(%eax), %ecx
1441 movl %ecx, 9(%edx)
1442 L(bk_write_9bytes):
1443 movl 5(%eax), %ecx
1444 movl %ecx, 5(%edx)
1445 L(bk_write_5bytes):
1446 movl 1(%eax), %ecx
1447 movl %ecx, 1(%edx)
1448 L(bk_write_1bytes):
1449 movzbl (%eax), %ecx
1450 movb %cl, (%edx)
/* Return value: the DEST argument reloaded from the stack; the mempcpy
   build adds LEN to it.  The bcopy build sets no return value.  */
1451 #ifndef USE_AS_BCOPY
1452 movl DEST(%esp), %eax
1453 # ifdef USE_AS_MEMPCPY
1454 movl LEN(%esp), %ecx
1455 add %ecx, %eax
1456 # endif
1457 #endif
1458 RETURN
1459
1460 ALIGN (4)
/* Tail handlers for sizes of the form 4k+2 (46 down to 2), referenced
   from L(table_48_bytes_bwd).  EAX = source and EDX = destination, both
   pointing at the first byte of the tail.  Each label copies one dword
   at the highest remaining offset and falls through; the final word at
   offset 0 is copied last.  ECX is used as scratch.  */
1461 L(bk_write_46bytes):
1462 movl 42(%eax), %ecx
1463 movl %ecx, 42(%edx)
1464 L(bk_write_42bytes):
1465 movl 38(%eax), %ecx
1466 movl %ecx, 38(%edx)
1467 L(bk_write_38bytes):
1468 movl 34(%eax), %ecx
1469 movl %ecx, 34(%edx)
1470 L(bk_write_34bytes):
1471 movl 30(%eax), %ecx
1472 movl %ecx, 30(%edx)
1473 L(bk_write_30bytes):
1474 movl 26(%eax), %ecx
1475 movl %ecx, 26(%edx)
1476 L(bk_write_26bytes):
1477 movl 22(%eax), %ecx
1478 movl %ecx, 22(%edx)
1479 L(bk_write_22bytes):
1480 movl 18(%eax), %ecx
1481 movl %ecx, 18(%edx)
1482 L(bk_write_18bytes):
1483 movl 14(%eax), %ecx
1484 movl %ecx, 14(%edx)
1485 L(bk_write_14bytes):
1486 movl 10(%eax), %ecx
1487 movl %ecx, 10(%edx)
1488 L(bk_write_10bytes):
1489 movl 6(%eax), %ecx
1490 movl %ecx, 6(%edx)
1491 L(bk_write_6bytes):
1492 movl 2(%eax), %ecx
1493 movl %ecx, 2(%edx)
1494 L(bk_write_2bytes):
1495 movzwl (%eax), %ecx
1496 movw %cx, (%edx)
/* Return value: the DEST argument reloaded from the stack; the mempcpy
   build adds LEN to it.  The bcopy build sets no return value.  */
1497 #ifndef USE_AS_BCOPY
1498 movl DEST(%esp), %eax
1499 # ifdef USE_AS_MEMPCPY
1500 movl LEN(%esp), %ecx
1501 add %ecx, %eax
1502 # endif
1503 #endif
1504 RETURN
1505
1506 ALIGN (4)
/* Tail handlers for sizes of the form 4k+3 (47 down to 3), referenced
   from L(table_48_bytes_bwd).  EAX = source and EDX = destination, both
   pointing at the first byte of the tail.  Each label copies one dword
   at the highest remaining offset and falls through; the last three
   bytes are copied as a word at offset 1 and a byte at offset 0.
   ECX is used as scratch.  */
1507 L(bk_write_47bytes):
1508 movl 43(%eax), %ecx
1509 movl %ecx, 43(%edx)
1510 L(bk_write_43bytes):
1511 movl 39(%eax), %ecx
1512 movl %ecx, 39(%edx)
1513 L(bk_write_39bytes):
1514 movl 35(%eax), %ecx
1515 movl %ecx, 35(%edx)
1516 L(bk_write_35bytes):
1517 movl 31(%eax), %ecx
1518 movl %ecx, 31(%edx)
1519 L(bk_write_31bytes):
1520 movl 27(%eax), %ecx
1521 movl %ecx, 27(%edx)
1522 L(bk_write_27bytes):
1523 movl 23(%eax), %ecx
1524 movl %ecx, 23(%edx)
1525 L(bk_write_23bytes):
1526 movl 19(%eax), %ecx
1527 movl %ecx, 19(%edx)
1528 L(bk_write_19bytes):
1529 movl 15(%eax), %ecx
1530 movl %ecx, 15(%edx)
1531 L(bk_write_15bytes):
1532 movl 11(%eax), %ecx
1533 movl %ecx, 11(%edx)
1534 L(bk_write_11bytes):
1535 movl 7(%eax), %ecx
1536 movl %ecx, 7(%edx)
1537 L(bk_write_7bytes):
1538 movl 3(%eax), %ecx
1539 movl %ecx, 3(%edx)
1540 L(bk_write_3bytes):
1541 movzwl 1(%eax), %ecx
1542 movw %cx, 1(%edx)
1543 movzbl (%eax), %eax /* EAX is overwritten; the source pointer is no longer needed. */
1544 movb %al, (%edx)
/* Return value: the DEST argument reloaded from the stack; the mempcpy
   build adds LEN to it.  The bcopy build sets no return value.  */
1545 #ifndef USE_AS_BCOPY
1546 movl DEST(%esp), %eax
1547 # ifdef USE_AS_MEMPCPY
1548 movl LEN(%esp), %ecx
1549 add %ecx, %eax
1550 # endif
1551 #endif
1552 RETURN_END
1553
1554
1555 .pushsection .rodata.ssse3,"a",@progbits
1556 ALIGN (2)
/* Jump table for the forward tail copy: entry N is the JMPTBL value for
   L(fwd_write_Nbytes), N = 0..47, one 32-bit .int per entry.  With the
   PIC definition of JMPTBL at the top of the file each entry is the
   handler's offset from the start of this table.  L(large_page) indexes
   it with the remaining byte count and a scale of 4.  */
1557 L(table_48bytes_fwd):
1558 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1567 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1568 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1569 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1570 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1571 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1572 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1573 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1574 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1575 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1576 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1577 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1578 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1579 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1580 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1581 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1582 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1583 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1584 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1585 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1586 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1587 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1588 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1589 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1590 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1591 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1592 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1593 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1594 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1595 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1596 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1597 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1598 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1599 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1600 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1601 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1602 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1603 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1604 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1605 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1606
1607 ALIGN (2)
/* Jump table with 16 entries, L(shl_0) .. L(shl_15), one 32-bit .int
   each.  The handlers and the dispatch site are outside this chunk;
   presumably the index is the source misalignment relative to a
   16-byte boundary -- TODO confirm at the dispatch site.  */
1608 L(shl_table):
1609 .int JMPTBL (L(shl_0), L(shl_table))
1610 .int JMPTBL (L(shl_1), L(shl_table))
1611 .int JMPTBL (L(shl_2), L(shl_table))
1612 .int JMPTBL (L(shl_3), L(shl_table))
1613 .int JMPTBL (L(shl_4), L(shl_table))
1614 .int JMPTBL (L(shl_5), L(shl_table))
1615 .int JMPTBL (L(shl_6), L(shl_table))
1616 .int JMPTBL (L(shl_7), L(shl_table))
1617 .int JMPTBL (L(shl_8), L(shl_table))
1618 .int JMPTBL (L(shl_9), L(shl_table))
1619 .int JMPTBL (L(shl_10), L(shl_table))
1620 .int JMPTBL (L(shl_11), L(shl_table))
1621 .int JMPTBL (L(shl_12), L(shl_table))
1622 .int JMPTBL (L(shl_13), L(shl_table))
1623 .int JMPTBL (L(shl_14), L(shl_table))
1624 .int JMPTBL (L(shl_15), L(shl_table))
1625
1626 ALIGN (2)
/* Jump table for the backward tail copy: entry N is the JMPTBL value
   for L(bk_write_Nbytes), N = 0..47, one 32-bit .int per entry.
   L(bk_write_less48bytes) indexes it with the remaining byte count and
   a scale of 4.  */
1627 L(table_48_bytes_bwd):
1628 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1637 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1638 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1639 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1640 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1641 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1642 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1643 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1644 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1645 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1646 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1647 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1648 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1649 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1650 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1651 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1652 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1653 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1654 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1655 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1656 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1657 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1658 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1659 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1660 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1661 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1662 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1663 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1664 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1665 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1666 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1667 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1668 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1669 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1670 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1671 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1672 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1673 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1674 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1675 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1676
1677 .popsection
1678
1679 #ifdef USE_AS_MEMMOVE
1680 ALIGN (4)
/* Backward (high-to-low address) copy, built only for memmove.
   As used below: EAX = source start, EDX = destination start, ECX =
   length on entry (set up outside this chunk -- TODO confirm).  The
   code converts both pointers to one-past-the-end cursors (ESI for the
   source, EDX for the destination) and walks downwards.  ESI is saved
   on the stack for the duration.  */
1681 L(copy_backward):
1682 PUSH (%esi)
1683 movl %eax, %esi
1684 add %ecx, %edx /* EDX = one past the last destination byte. */
1685 add %ecx, %esi /* ESI = one past the last source byte. */
1686 testl $0x3, %edx
1687 jnz L(bk_align) /* Destination end is not 4-byte aligned. */
1688
1689 L(bk_aligned_4):
1690 cmp $64, %ecx
1691 jae L(bk_write_more64bytes)
1692
1693 L(bk_write_64bytesless):
1694 cmp $32, %ecx
1695 jb L(bk_write_less32bytes)
1696
1697 L(bk_write_more32bytes):
1698 /* Copy 32 bytes at a time. */
1699 sub $32, %ecx
1700 movl -4(%esi), %eax
1701 movl %eax, -4(%edx)
1702 movl -8(%esi), %eax
1703 movl %eax, -8(%edx)
1704 movl -12(%esi), %eax
1705 movl %eax, -12(%edx)
1706 movl -16(%esi), %eax
1707 movl %eax, -16(%edx)
1708 movl -20(%esi), %eax
1709 movl %eax, -20(%edx)
1710 movl -24(%esi), %eax
1711 movl %eax, -24(%edx)
1712 movl -28(%esi), %eax
1713 movl %eax, -28(%edx)
1714 movl -32(%esi), %eax
1715 movl %eax, -32(%edx)
1716 sub $32, %edx
1717 sub $32, %esi
1718
/* Rewind both cursors by the remaining count so that EAX/EDX point at
   the first byte of the tail, which is what the bk_write_* handlers
   expect (they use non-negative offsets).  */
1719 L(bk_write_less32bytes):
1720 movl %esi, %eax
1721 sub %ecx, %edx
1722 sub %ecx, %eax
1723 POP (%esi)
1724 L(bk_write_less48bytes):
1725 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1726
1727 CFI_PUSH (%esi) /* ESI is still pushed on the paths that reach the code below. */
1728 ALIGN (4)
/* Bring the destination end cursor (EDX) down to a 4-byte boundary by
   copying 1, 2 or 3 bytes, keeping ESI and ECX in step.  Lengths of 8
   or less skip the alignment and go straight to the tail dispatch.  */
1729 L(bk_align):
1730 cmp $8, %ecx
1731 jbe L(bk_write_less32bytes)
1732 testl $1, %edx
1733 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
1734 then (EDX & 2) must be != 0. */
1735 jz L(bk_got2)
/* Odd address: copy one byte.  */
1736 sub $1, %esi
1737 sub $1, %ecx
1738 sub $1, %edx
1739 movzbl (%esi), %eax
1740 movb %al, (%edx)
1741
1742 testl $2, %edx
1743 jz L(bk_aligned_4) /* Already 4-byte aligned after the single byte. */
1744
/* Copy one word to reach the 4-byte boundary.  */
1745 L(bk_got2):
1746 sub $2, %esi
1747 sub $2, %ecx
1748 sub $2, %edx
1749 movzwl (%esi), %eax
1750 movw %ax, (%edx)
1751 jmp L(bk_aligned_4)
1752
1753 ALIGN (4)
/* Backward copy of 64 bytes or more.  The destination end cursor (EDX)
   is first brought to a 16-byte boundary one dword at a time, then the
   bulk is moved 64 bytes per iteration with unaligned loads and
   aligned (movdqa) stores.  */
1754 L(bk_write_more64bytes):
1755 /* Check alignment of last byte. */
1756 testl $15, %edx
1757 jz L(bk_ssse3_cpy_pre)
1758
1759 /* EDX is aligned 4 bytes, but not 16 bytes. */
/* At most three dword steps are needed to reach a 16-byte boundary, so
   the third step is not followed by another alignment test.  */
1760 L(bk_ssse3_align):
1761 sub $4, %esi
1762 sub $4, %ecx
1763 sub $4, %edx
1764 movl (%esi), %eax
1765 movl %eax, (%edx)
1766
1767 testl $15, %edx
1768 jz L(bk_ssse3_cpy_pre)
1769
1770 sub $4, %esi
1771 sub $4, %ecx
1772 sub $4, %edx
1773 movl (%esi), %eax
1774 movl %eax, (%edx)
1775
1776 testl $15, %edx
1777 jz L(bk_ssse3_cpy_pre)
1778
1779 sub $4, %esi
1780 sub $4, %ecx
1781 sub $4, %edx
1782 movl (%esi), %eax
1783 movl %eax, (%edx)
1784
/* The alignment steps may have taken the count below 64.  */
1785 L(bk_ssse3_cpy_pre):
1786 cmp $64, %ecx
1787 jb L(bk_write_more32bytes)
1788
/* 64 bytes per iteration, highest 16-byte lane first.  */
1789 L(bk_ssse3_cpy):
1790 sub $64, %esi
1791 sub $64, %ecx
1792 sub $64, %edx
1793 movdqu 0x30(%esi), %xmm3
1794 movdqa %xmm3, 0x30(%edx)
1795 movdqu 0x20(%esi), %xmm2
1796 movdqa %xmm2, 0x20(%edx)
1797 movdqu 0x10(%esi), %xmm1
1798 movdqa %xmm1, 0x10(%edx)
1799 movdqu (%esi), %xmm0
1800 movdqa %xmm0, (%edx)
1801 cmp $64, %ecx
1802 jae L(bk_ssse3_cpy)
1803 jmp L(bk_write_64bytesless) /* Fewer than 64 bytes left. */
1804
1805 #endif
1806
1807 END (MEMCPY)
1808
1809 #endif