/* memcpy optimized with SSE2 unaligned memory access instructions.
   Copyright (C) 2014-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc) \
    && (defined SHARED \
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

# include <sysdep.h>
# include "asm-syntax.h"

# ifndef MEMCPY
#  define MEMCPY __memcpy_sse2_unaligned
#  define MEMCPY_CHK __memcpy_chk_sse2_unaligned
# endif

# ifdef USE_AS_BCOPY
#  define SRC PARMS
#  define DEST SRC+4
#  define LEN DEST+4
# else
#  define DEST PARMS
#  define SRC DEST+4
#  define LEN SRC+4
# endif

# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

# define PARMS 8	/* Preserve EBX.  */
# define ENTRANCE PUSH (%ebx);
# define RETURN_END POP (%ebx); ret
# define RETURN RETURN_END; CFI_PUSH (%ebx)

	.section .text.sse2,"ax",@progbits
# if !defined USE_AS_BCOPY && defined SHARED
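/* __memcpy_chk (dst, src, len, dstlen): jump to __chk_fail when the
   destination buffer is smaller than the number of bytes to copy,
   otherwise fall through into MEMCPY.  Nothing has been pushed yet,
   so LEN is at 12(%esp) and DSTLEN at 16(%esp).  */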
ENTRY (MEMCPY_CHK)
	movl 12(%esp), %eax
	cmpl %eax, 16(%esp)
	jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
# endif

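/* memcpy/mempcpy/bcopy/memmove entry.  DEST is loaded into %edx, SRC
   into %eax and LEN into %ecx.  For memmove the two pointers are
   compared to pick an overlap-safe copy direction; for plain memcpy
   the result of the comparison is ignored and control falls through
   to L(forward).  */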
ENTRY (MEMCPY)
	ENTRANCE
	movl LEN(%esp), %ecx
	movl SRC(%esp), %eax
	movl DEST(%esp), %edx
	cmp %edx, %eax

# ifdef USE_AS_MEMMOVE
	jg L(check_forward)

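/* SRC <= DST: the regions may overlap, so copy from the end towards
   the start.  For lengths up to 128 bytes every load is issued before
   the store that could overwrite it (or the tail is copied before the
   head), so the unaligned SSE copies below are overlap-safe.  */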
L(mm_len_0_or_more_backward):
	/* Dispatch on the length.  The ranges [0..16], (16..32], (32..64],
	   (64..128] and above 128 bytes are handled separately.  */
	cmp $16, %ecx
	jbe L(mm_len_0_16_bytes_backward)

	cmpl $32, %ecx
	jg L(mm_len_32_or_more_backward)

	/* Copy 17..32 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu -16(%eax, %ecx), %xmm1
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jmp L(return)

L(mm_len_32_or_more_backward):
	cmpl $64, %ecx
	jg L(mm_len_64_or_more_backward)

	/* Copy 33..64 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu -16(%eax, %ecx), %xmm2
	movdqu -32(%eax, %ecx), %xmm3
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, -16(%edx, %ecx)
	movdqu %xmm3, -32(%edx, %ecx)
	jmp L(return)

L(mm_len_64_or_more_backward):
	cmpl $128, %ecx
	jg L(mm_len_128_or_more_backward)

	/* Copy 65..128 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqu -64(%eax, %ecx), %xmm4
	movdqu -48(%eax, %ecx), %xmm5
	movdqu -32(%eax, %ecx), %xmm6
	movdqu -16(%eax, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)
	jmp L(return)

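/* More than 128 bytes.  If SRC + LEN <= DST the regions do not
   overlap and the plain forward memcpy code can be used; otherwise
   copy backward, 64 bytes per iteration.  */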
L(mm_len_128_or_more_backward):
	add %ecx, %eax
	cmp %edx, %eax
	movl SRC(%esp), %eax
	jle L(forward)
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

	/* Align the destination.  The first 64 bytes of the source are
	   kept in %xmm4-%xmm7 and the last 16 bytes on the stack so they
	   can be stored after the aligned loop; %ebx ends up holding the
	   number of 64-byte chunks to copy.  */
	movdqu (%eax), %xmm4
	movdqu 16(%eax), %xmm5
	movdqu 32(%eax), %xmm6
	movdqu 48(%eax), %xmm7
	leal (%edx, %ecx), %esi
	movdqu -16(%eax, %ecx), %xmm0
	subl $16, %esp
	movdqu %xmm0, (%esp)
	mov %ecx, %edi
	movl %esi, %ecx
	andl $-16, %ecx
	leal (%ecx), %ebx
	subl %edx, %ebx
	leal (%eax, %ebx), %eax
	shrl $6, %ebx

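/* Copies of at least half the shared cache size bypass the cache with
   non-temporal stores; smaller ones use the ordinary cached loop.  */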
#  ifdef SHARED_CACHE_SIZE_HALF
	cmp $SHARED_CACHE_SIZE_HALF, %edi
#  else
#   ifdef SHARED
	PUSH (%ebx)
	SETUP_PIC_REG (bx)
	add $_GLOBAL_OFFSET_TABLE_, %ebx
	cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#   else
	cmp __x86_shared_cache_size_half, %edi
#   endif
#  endif
	jae L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%eax)

	movdqu -64(%eax), %xmm0
	movdqu -48(%eax), %xmm1
	movdqu -32(%eax), %xmm2
	movdqu -16(%eax), %xmm3
	movaps %xmm0, -64(%ecx)
	subl $64, %eax
	movaps %xmm1, -48(%ecx)
	movaps %xmm2, -32(%ecx)
	movaps %xmm3, -16(%ecx)
	subl $64, %ecx
	sub $1, %ebx
	jnz L(mm_main_loop_backward)
	movdqu (%esp), %xmm0
	addl $16, %esp
	movdqu %xmm0, -16(%esi)
	movdqu %xmm4, (%edx)
	movdqu %xmm5, 16(%edx)
	movdqu %xmm6, 32(%edx)
	movdqu %xmm7, 48(%edx)
	POP (%ebx)
	jmp L(mm_return_pop_all)

	/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb $24, %cl
	jnz L(mm_len_9_16_bytes_backward)
	testb $4, %cl
	.p2align 4,,5
	jnz L(mm_len_5_8_bytes_backward)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(return)
	testb $2, %cl
	.p2align 4,,1
	jne L(mm_len_3_4_bytes_backward)
	movzbl -1(%eax,%ecx), %ebx
	movzbl (%eax), %eax
	movb %bl, -1(%edx,%ecx)
	movb %al, (%edx)
	jmp L(return)

L(mm_len_3_4_bytes_backward):
	movzwl -2(%eax,%ecx), %ebx
	movzwl (%eax), %eax
	movw %bx, -2(%edx,%ecx)
	movw %ax, (%edx)
	jmp L(return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl -4(%eax,%ecx), %ebx
	movl -8(%eax,%ecx), %esi
	movl %ebx, -4(%edx,%ecx)
	movl %esi, -8(%edx,%ecx)
	subl $8, %ecx
	POP (%esi)
	jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl (%eax), %ebx
	movl -4(%eax,%ecx), %eax
	movl %ebx, (%edx)
	movl %eax, -4(%edx,%ecx)
	jmp L(return)

	/* Backward copy of big lengths using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu -64(%eax), %xmm0
	movdqu -48(%eax), %xmm1
	movdqu -32(%eax), %xmm2
	movdqu -16(%eax), %xmm3
	movntdq %xmm0, -64(%ecx)
	subl $64, %eax
	movntdq %xmm1, -48(%ecx)
	movntdq %xmm2, -32(%ecx)
	movntdq %xmm3, -16(%ecx)
	subl $64, %ecx
	sub $1, %ebx
	jnz L(mm_large_page_loop_backward)
	sfence
	movdqu (%esp), %xmm0
	addl $16, %esp
	movdqu %xmm0, -16(%esi)
	movdqu %xmm4, (%edx)
	movdqu %xmm5, 16(%edx)
	movdqu %xmm6, 32(%edx)
	movdqu %xmm7, 48(%edx)
	POP (%ebx)
	jmp L(mm_return_pop_all)

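/* SRC > DST: a forward copy is overlap-safe here.  When the regions
   are completely disjoint (DST + LEN <= SRC) the plain memcpy path is
   taken; that path stores the tail of the destination early, so it is
   only used when there is no overlap.  Otherwise the forward memmove
   code below is used.  */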
L(check_forward):
	add %edx, %ecx
	cmp %eax, %ecx
	movl LEN(%esp), %ecx
	jle L(forward)

	/* Dispatch on the length.  The ranges [0..16], (16..32], (32..64],
	   (64..128] and above 128 bytes are handled separately.  */
	cmp $16, %ecx
	jbe L(mm_len_0_16_bytes_forward)

	cmpl $32, %ecx
	ja L(mm_len_32_or_more_forward)

	/* Copy 17..32 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu -16(%eax, %ecx), %xmm1
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jmp L(return)

L(mm_len_32_or_more_forward):
	cmpl $64, %ecx
	ja L(mm_len_64_or_more_forward)

	/* Copy 33..64 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu -16(%eax, %ecx), %xmm2
	movdqu -32(%eax, %ecx), %xmm3
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, -16(%edx, %ecx)
	movdqu %xmm3, -32(%edx, %ecx)
	jmp L(return)

L(mm_len_64_or_more_forward):
	cmpl $128, %ecx
	ja L(mm_len_128_or_more_forward)

	/* Copy 65..128 bytes and return.  */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqu -64(%eax, %ecx), %xmm4
	movdqu -48(%eax, %ecx), %xmm5
	movdqu -32(%eax, %ecx), %xmm6
	movdqu -16(%eax, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)
	jmp L(return)

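/* More than 128 bytes with a partial overlap: copy forward, 64 bytes
   per iteration.  */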
L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)
	PUSH (%ebx)

	/* Align the destination.  The last 64 bytes of the source are kept
	   in %xmm4-%xmm7 and the first 16 bytes on the stack so they can be
	   stored after the aligned loop; %ebx ends up holding the number of
	   64-byte chunks to copy.  */
	movdqu -16(%eax, %ecx), %xmm4
	movdqu -32(%eax, %ecx), %xmm5
	movdqu -48(%eax, %ecx), %xmm6
	movdqu -64(%eax, %ecx), %xmm7
	leal (%edx, %ecx), %esi
	movdqu (%eax), %xmm0
	subl $16, %esp
	movdqu %xmm0, (%esp)
	mov %ecx, %edi
	leal 16(%edx), %ecx
	andl $-16, %ecx
	movl %ecx, %ebx
	subl %edx, %ebx
	addl %ebx, %eax
	movl %esi, %ebx
	subl %ecx, %ebx
	shrl $6, %ebx

#  ifdef SHARED_CACHE_SIZE_HALF
	cmp $SHARED_CACHE_SIZE_HALF, %edi
#  else
#   ifdef SHARED
	PUSH (%ebx)
	SETUP_PIC_REG(bx)
	add $_GLOBAL_OFFSET_TABLE_, %ebx
	cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
	POP (%ebx)
#   else
	cmp __x86_shared_cache_size_half, %edi
#   endif
#  endif
	jae L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax)

	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqa %xmm0, (%ecx)
	addl $64, %eax
	movaps %xmm1, 16(%ecx)
	movaps %xmm2, 32(%ecx)
	movaps %xmm3, 48(%ecx)
	addl $64, %ecx
	sub $1, %ebx
	jnz L(mm_main_loop_forward)
	movdqu (%esp), %xmm0
	addl $16, %esp
	movdqu %xmm0, (%edx)
	movdqu %xmm4, -16(%esi)
	movdqu %xmm5, -32(%esi)
	movdqu %xmm6, -48(%esi)
	movdqu %xmm7, -64(%esi)
	POP (%ebx)
	jmp L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb $24, %cl
	jne L(mm_len_9_16_bytes_forward)
	testb $4, %cl
	.p2align 4,,5
	jne L(mm_len_5_8_bytes_forward)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(return)
	testb $2, %cl
	.p2align 4,,1
	jne L(mm_len_2_4_bytes_forward)
	movzbl -1(%eax,%ecx), %ebx
	movzbl (%eax), %eax
	movb %bl, -1(%edx,%ecx)
	movb %al, (%edx)
	jmp L(return)

L(mm_len_2_4_bytes_forward):
	movzwl -2(%eax,%ecx), %ebx
	movzwl (%eax), %eax
	movw %bx, -2(%edx,%ecx)
	movw %ax, (%edx)
	jmp L(return)

L(mm_len_5_8_bytes_forward):
	movl (%eax), %ebx
	movl -4(%eax,%ecx), %eax
	movl %ebx, (%edx)
	movl %eax, -4(%edx,%ecx)
	jmp L(return)

L(mm_len_9_16_bytes_forward):
	movq (%eax), %xmm0
	movq -8(%eax, %ecx), %xmm1
	movq %xmm0, (%edx)
	movq %xmm1, -8(%edx, %ecx)
	jmp L(return)

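/* Common memmove exit: set the return value and restore the registers
   pushed by the 128-byte-or-more paths.  */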
L(mm_return_pop_all):
	movl %edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

	/* Forward copy of big lengths using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movntdq %xmm0, (%ecx)
	addl $64, %eax
	movntdq %xmm1, 16(%ecx)
	movntdq %xmm2, 32(%ecx)
	movntdq %xmm3, 48(%ecx)
	addl $64, %ecx
	sub $1, %ebx
	jnz L(mm_large_page_loop_forward)
	sfence
	movdqu (%esp), %xmm0
	addl $16, %esp
	movdqu %xmm0, (%edx)
	movdqu %xmm4, -16(%esi)
	movdqu %xmm5, -32(%esi)
	movdqu %xmm6, -48(%esi)
	movdqu %xmm7, -64(%esi)
	POP (%ebx)
	jmp L(mm_return_pop_all)
# endif

L(forward):
	cmp $16, %ecx
	jbe L(len_0_16_bytes)

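/* Forward memcpy.  Copies of at least half the shared cache size use
   the non-temporal L(large_page) path; everything else stays in the
   cached path below.  */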
# ifdef SHARED_CACHE_SIZE_HALF
	cmp $SHARED_CACHE_SIZE_HALF, %ecx
# else
#  ifdef SHARED
	SETUP_PIC_REG(bx)
	add $_GLOBAL_OFFSET_TABLE_, %ebx
	cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
#  else
	cmp __x86_shared_cache_size_half, %ecx
#  endif
# endif
	jae L(large_page)

	movdqu (%eax), %xmm0
	movdqu -16(%eax, %ecx), %xmm1
	cmpl $32, %ecx
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jbe L(return)

	movdqu 16(%eax), %xmm0
	movdqu -32(%eax, %ecx), %xmm1
	cmpl $64, %ecx
	movdqu %xmm0, 16(%edx)
	movdqu %xmm1, -32(%edx, %ecx)
	jbe L(return)

	movdqu 32(%eax), %xmm0
	movdqu 48(%eax), %xmm1
	movdqu -48(%eax, %ecx), %xmm2
	movdqu -64(%eax, %ecx), %xmm3
	cmpl $128, %ecx
	movdqu %xmm0, 32(%edx)
	movdqu %xmm1, 48(%edx)
	movdqu %xmm2, -48(%edx, %ecx)
	movdqu %xmm3, -64(%edx, %ecx)
	jbe L(return)

	/* Main loop.  %ebx is DEST advanced to the next 64-byte boundary,
	   %ecx is DEST + LEN rounded down to a 64-byte boundary, and %eax
	   is rewritten as SRC - DEST so that (%ebx, %eax) addresses the
	   corresponding source bytes.  */
	leal 64(%edx), %ebx
	andl $-64, %ebx

	addl %edx, %ecx
	andl $-64, %ecx

	subl %edx, %eax

	/* Stop the main loop two 64-byte iterations before the end so
	   that the prefetch, which runs 128 bytes ahead, does not
	   prefetch past the end of the source.  */
	subl $64, %ecx
	cmpl %ebx, %ecx
	je L(main_loop_just_one_iteration)

	subl $64, %ecx
	cmpl %ebx, %ecx
	je L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):

	prefetcht0 128(%ebx, %eax)

	movdqu (%ebx, %eax), %xmm0
	movdqu 16(%ebx, %eax), %xmm1
	movdqu 32(%ebx, %eax), %xmm2
	movdqu 48(%ebx, %eax), %xmm3
	movdqa %xmm0, (%ebx)
	movaps %xmm1, 16(%ebx)
	movaps %xmm2, 32(%ebx)
	movaps %xmm3, 48(%ebx)
	lea 64(%ebx), %ebx
	cmpl %ebx, %ecx
	jne L(main_loop_cache)

L(main_loop_last_two_iterations):
	movdqu (%ebx, %eax), %xmm0
	movdqu 16(%ebx, %eax), %xmm1
	movdqu 32(%ebx, %eax), %xmm2
	movdqu 48(%ebx, %eax), %xmm3
	movdqu 64(%ebx, %eax), %xmm4
	movdqu 80(%ebx, %eax), %xmm5
	movdqu 96(%ebx, %eax), %xmm6
	movdqu 112(%ebx, %eax), %xmm7
	movdqa %xmm0, (%ebx)
	movaps %xmm1, 16(%ebx)
	movaps %xmm2, 32(%ebx)
	movaps %xmm3, 48(%ebx)
	movaps %xmm4, 64(%ebx)
	movaps %xmm5, 80(%ebx)
	movaps %xmm6, 96(%ebx)
	movaps %xmm7, 112(%ebx)
	jmp L(return)

L(main_loop_just_one_iteration):
	movdqu (%ebx, %eax), %xmm0
	movdqu 16(%ebx, %eax), %xmm1
	movdqu 32(%ebx, %eax), %xmm2
	movdqu 48(%ebx, %eax), %xmm3
	movdqa %xmm0, (%ebx)
	movaps %xmm1, 16(%ebx)
	movaps %xmm2, 32(%ebx)
	movaps %xmm3, 48(%ebx)
	jmp L(return)

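/* Very large copy: the first and last 128 bytes are copied with
   ordinary unaligned stores, then the 128-byte-aligned middle is
   streamed with non-temporal stores so the copy does not evict the
   whole cache.  */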
L(large_page):
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqu -64(%eax, %ecx), %xmm4
	movdqu -48(%eax, %ecx), %xmm5
	movdqu -32(%eax, %ecx), %xmm6
	movdqu -16(%eax, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)

	movdqu 64(%eax), %xmm0
	movdqu 80(%eax), %xmm1
	movdqu 96(%eax), %xmm2
	movdqu 112(%eax), %xmm3
	movdqu -128(%eax, %ecx), %xmm4
	movdqu -112(%eax, %ecx), %xmm5
	movdqu -96(%eax, %ecx), %xmm6
	movdqu -80(%eax, %ecx), %xmm7
	movdqu %xmm0, 64(%edx)
	movdqu %xmm1, 80(%edx)
	movdqu %xmm2, 96(%edx)
	movdqu %xmm3, 112(%edx)
	movdqu %xmm4, -128(%edx, %ecx)
	movdqu %xmm5, -112(%edx, %ecx)
	movdqu %xmm6, -96(%edx, %ecx)
	movdqu %xmm7, -80(%edx, %ecx)

	/* Main loop with non-temporal stores.  %ebx is DEST advanced to
	   the next 128-byte boundary, %ecx is DEST + LEN rounded down to
	   a 128-byte boundary, and %eax becomes SRC - DEST.  */
	leal 128(%edx), %ebx
	andl $-128, %ebx

	addl %edx, %ecx
	andl $-128, %ecx

	subl %edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu (%ebx, %eax), %xmm0
	movdqu 16(%ebx, %eax), %xmm1
	movdqu 32(%ebx, %eax), %xmm2
	movdqu 48(%ebx, %eax), %xmm3
	movdqu 64(%ebx, %eax), %xmm4
	movdqu 80(%ebx, %eax), %xmm5
	movdqu 96(%ebx, %eax), %xmm6
	movdqu 112(%ebx, %eax), %xmm7
	movntdq %xmm0, (%ebx)
	movntdq %xmm1, 16(%ebx)
	movntdq %xmm2, 32(%ebx)
	movntdq %xmm3, 48(%ebx)
	movntdq %xmm4, 64(%ebx)
	movntdq %xmm5, 80(%ebx)
	movntdq %xmm6, 96(%ebx)
	movntdq %xmm7, 112(%ebx)
	lea 128(%ebx), %ebx
	cmpl %ebx, %ecx
	jne L(main_loop_large_page)
	sfence
	jmp L(return)

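/* Copies of at most 16 bytes, dispatched on the bits of LEN.  */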
L(len_0_16_bytes):
	testb $24, %cl
	jne L(len_9_16_bytes)
	testb $4, %cl
	.p2align 4,,5
	jne L(len_5_8_bytes)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(return)
	movzbl (%eax), %ebx
	testb $2, %cl
	movb %bl, (%edx)
	je L(return)
	movzwl -2(%eax,%ecx), %ebx
	movw %bx, -2(%edx,%ecx)
	jmp L(return)

L(len_9_16_bytes):
	movq (%eax), %xmm0
	movq -8(%eax, %ecx), %xmm1
	movq %xmm0, (%edx)
	movq %xmm1, -8(%edx, %ecx)
	jmp L(return)

L(len_5_8_bytes):
	movl (%eax), %ebx
	movl %ebx, (%edx)
	movl -4(%eax,%ecx), %ebx
	movl %ebx, -4(%edx,%ecx)

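/* Return the destination pointer (DEST + LEN for mempcpy).  */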
L(return):
	movl %edx, %eax
# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
	movl LEN(%esp), %ecx
	add %ecx, %eax
# endif
	RETURN

END (MEMCPY)
#endif