]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memcmp-ssse3.S
546342892a2ffa19fc3f6c3924d4e3e846b6bc34
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memcmp-ssse3.S
1 /* memcmp with SSSE3, wmemcmp with SSSE3
2 Copyright (C) 2010-2014 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23
24 # ifndef MEMCMP
25 # define MEMCMP __memcmp_ssse3
26 # endif
27
/* Unwind-info bookkeeping to accompany a 4-byte register push: bump the
   CFA offset by 4 and note REG at relative offset 0.  The cfi_* names
   are not defined in this file; presumably they are the <sysdep.h>
   wrappers around the assembler's .cfi_* directives.  */
28 # define CFI_PUSH(REG) \
29 cfi_adjust_cfa_offset (4); \
30 cfi_rel_offset (REG, 0)
31
/* Inverse of CFI_PUSH, to accompany a 4-byte register pop: shrink the
   CFA offset by 4 and mark REG as restored.  */
32 # define CFI_POP(REG) \
33 cfi_adjust_cfa_offset (-4); \
34 cfi_restore (REG)
35
/* Push/pop a 32-bit register and emit the matching unwind annotation in
   the same statement, so the two cannot drift apart.  */
36 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
37 # define POP(REG) popl REG; CFI_POP (REG)
38
/* Displacements from %esp of the three arguments.  They are only used at
   function entry, before any PUSH has moved %esp: BLK1 = 4 (first
   block), BLK2 = 8 (second block), LEN = 12 (length).  The expansions
   are deliberately left unparenthesized because they are used directly
   as the displacement in operands such as LEN(%esp).  */
39 # define PARMS 4
40 # define BLK1 PARMS
41 # define BLK2 BLK1+4
42 # define LEN BLK2+4
/* Epilogue for paths that have %ebx, %esi and %edi saved on the stack:
   pop them in the reverse of the order they are pushed (ebx, esi, edi)
   and return.  */
43 # define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
/* Same epilogue for use in the middle of the function: after the ret,
   restore the remembered unwind state and remember it again so that the
   code that follows is still described as having the three registers
   saved.  Use RETURN_END instead where no such re-synchronisation is
   wanted after the ret.  */
44 # define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
45
46 /* Warning!
47 wmemcmp has to use SIGNED comparison for elements.
48 memcmp has to use UNSIGNED comparison for elements.
49 */
50
51 atom_text_section
52 ENTRY (MEMCMP)
53 movl LEN(%esp), %ecx
54
55 # ifdef USE_AS_WMEMCMP
56 shl $2, %ecx
57 test %ecx, %ecx
58 jz L(zero)
59 # endif
60
61 movl BLK1(%esp), %eax
62 cmp $48, %ecx
63 movl BLK2(%esp), %edx
64 jae L(48bytesormore)
65
66 # ifndef USE_AS_WMEMCMP
67 cmp $1, %ecx
68 jbe L(less1bytes)
69 # endif
70
71 PUSH (%ebx)
72 add %ecx, %edx
73 add %ecx, %eax
74 jmp L(less48bytes)
75
76 CFI_POP (%ebx)
77
78 # ifndef USE_AS_WMEMCMP
79 .p2align 4
80 L(less1bytes):
81 jb L(zero)
82 movb (%eax), %cl
83 cmp (%edx), %cl
84 je L(zero)
85 mov $1, %eax
86 ja L(1bytesend)
87 neg %eax
88 L(1bytesend):
89 ret
90 # endif
91
92 .p2align 4
93 L(zero):
94 xor %eax, %eax
95 ret
96
97 .p2align 4
98 L(48bytesormore):
99 PUSH (%ebx)
100 PUSH (%esi)
101 PUSH (%edi)
102 cfi_remember_state
103 movdqu (%eax), %xmm3
104 movdqu (%edx), %xmm0
105 movl %eax, %edi
106 movl %edx, %esi
107 pcmpeqb %xmm0, %xmm3
108 pmovmskb %xmm3, %edx
109 lea 16(%edi), %edi
110
111 sub $0xffff, %edx
112 lea 16(%esi), %esi
113 jnz L(less16bytes)
114 mov %edi, %edx
115 and $0xf, %edx
116 xor %edx, %edi
117 sub %edx, %esi
118 add %edx, %ecx
119 mov %esi, %edx
120 and $0xf, %edx
121 jz L(shr_0)
122 xor %edx, %esi
123
124 # ifndef USE_AS_WMEMCMP
125 cmp $8, %edx
126 jae L(next_unaligned_table)
127 cmp $0, %edx
128 je L(shr_0)
129 cmp $1, %edx
130 je L(shr_1)
131 cmp $2, %edx
132 je L(shr_2)
133 cmp $3, %edx
134 je L(shr_3)
135 cmp $4, %edx
136 je L(shr_4)
137 cmp $5, %edx
138 je L(shr_5)
139 cmp $6, %edx
140 je L(shr_6)
141 jmp L(shr_7)
142
143 .p2align 2
144 L(next_unaligned_table):
145 cmp $8, %edx
146 je L(shr_8)
147 cmp $9, %edx
148 je L(shr_9)
149 cmp $10, %edx
150 je L(shr_10)
151 cmp $11, %edx
152 je L(shr_11)
153 cmp $12, %edx
154 je L(shr_12)
155 cmp $13, %edx
156 je L(shr_13)
157 cmp $14, %edx
158 je L(shr_14)
159 jmp L(shr_15)
160 # else
161 cmp $0, %edx
162 je L(shr_0)
163 cmp $4, %edx
164 je L(shr_4)
165 cmp $8, %edx
166 je L(shr_8)
167 jmp L(shr_12)
168 # endif
169
170 .p2align 4
171 L(shr_0):
172 cmp $80, %ecx
173 jae L(shr_0_gobble)
174 lea -48(%ecx), %ecx
175 xor %eax, %eax
176 movaps (%esi), %xmm1
177 pcmpeqb (%edi), %xmm1
178 movaps 16(%esi), %xmm2
179 pcmpeqb 16(%edi), %xmm2
180 pand %xmm1, %xmm2
181 pmovmskb %xmm2, %edx
182 add $32, %edi
183 add $32, %esi
184 sub $0xffff, %edx
185 jnz L(exit)
186
187 lea (%ecx, %edi,1), %eax
188 lea (%ecx, %esi,1), %edx
189 POP (%edi)
190 POP (%esi)
191 jmp L(less48bytes)
192
193 cfi_restore_state
194 cfi_remember_state
195 .p2align 4
196 L(shr_0_gobble):
197 lea -48(%ecx), %ecx
198 movdqa (%esi), %xmm0
199 xor %eax, %eax
200 pcmpeqb (%edi), %xmm0
201 sub $32, %ecx
202 movdqa 16(%esi), %xmm2
203 pcmpeqb 16(%edi), %xmm2
204 L(shr_0_gobble_loop):
205 pand %xmm0, %xmm2
206 sub $32, %ecx
207 pmovmskb %xmm2, %edx
208 movdqa %xmm0, %xmm1
209 movdqa 32(%esi), %xmm0
210 movdqa 48(%esi), %xmm2
211 sbb $0xffff, %edx
212 pcmpeqb 32(%edi), %xmm0
213 pcmpeqb 48(%edi), %xmm2
214 lea 32(%edi), %edi
215 lea 32(%esi), %esi
216 jz L(shr_0_gobble_loop)
217
218 pand %xmm0, %xmm2
219 cmp $0, %ecx
220 jge L(shr_0_gobble_loop_next)
221 inc %edx
222 add $32, %ecx
223 L(shr_0_gobble_loop_next):
224 test %edx, %edx
225 jnz L(exit)
226
227 pmovmskb %xmm2, %edx
228 movdqa %xmm0, %xmm1
229 lea 32(%edi), %edi
230 lea 32(%esi), %esi
231 sub $0xffff, %edx
232 jnz L(exit)
233 lea (%ecx, %edi,1), %eax
234 lea (%ecx, %esi,1), %edx
235 POP (%edi)
236 POP (%esi)
237 jmp L(less48bytes)
238
239 # ifndef USE_AS_WMEMCMP
240 cfi_restore_state
241 cfi_remember_state
242 .p2align 4
243 L(shr_1):
244 cmp $80, %ecx
245 lea -48(%ecx), %ecx
246 mov %edx, %eax
247 jae L(shr_1_gobble)
248
249 movdqa 16(%esi), %xmm1
250 movdqa %xmm1, %xmm2
251 palignr $1,(%esi), %xmm1
252 pcmpeqb (%edi), %xmm1
253
254 movdqa 32(%esi), %xmm3
255 palignr $1,%xmm2, %xmm3
256 pcmpeqb 16(%edi), %xmm3
257
258 pand %xmm1, %xmm3
259 pmovmskb %xmm3, %edx
260 lea 32(%edi), %edi
261 lea 32(%esi), %esi
262 sub $0xffff, %edx
263 jnz L(exit)
264 lea (%ecx, %edi,1), %eax
265 lea 1(%ecx, %esi,1), %edx
266 POP (%edi)
267 POP (%esi)
268 jmp L(less48bytes)
269
270 cfi_restore_state
271 cfi_remember_state
272 .p2align 4
273 L(shr_1_gobble):
274 sub $32, %ecx
275 movdqa 16(%esi), %xmm0
276 palignr $1,(%esi), %xmm0
277 pcmpeqb (%edi), %xmm0
278
279 movdqa 32(%esi), %xmm3
280 palignr $1,16(%esi), %xmm3
281 pcmpeqb 16(%edi), %xmm3
282
283 L(shr_1_gobble_loop):
284 pand %xmm0, %xmm3
285 sub $32, %ecx
286 pmovmskb %xmm3, %edx
287 movdqa %xmm0, %xmm1
288
289 movdqa 64(%esi), %xmm3
290 palignr $1,48(%esi), %xmm3
291 sbb $0xffff, %edx
292 movdqa 48(%esi), %xmm0
293 palignr $1,32(%esi), %xmm0
294 pcmpeqb 32(%edi), %xmm0
295 lea 32(%esi), %esi
296 pcmpeqb 48(%edi), %xmm3
297
298 lea 32(%edi), %edi
299 jz L(shr_1_gobble_loop)
300 pand %xmm0, %xmm3
301
302 cmp $0, %ecx
303 jge L(shr_1_gobble_next)
304 inc %edx
305 add $32, %ecx
306 L(shr_1_gobble_next):
307 test %edx, %edx
308 jnz L(exit)
309
310 pmovmskb %xmm3, %edx
311 movdqa %xmm0, %xmm1
312 lea 32(%edi), %edi
313 lea 32(%esi), %esi
314 sub $0xffff, %edx
315 jnz L(exit)
316
317 lea (%ecx, %edi,1), %eax
318 lea 1(%ecx, %esi,1), %edx
319 POP (%edi)
320 POP (%esi)
321 jmp L(less48bytes)
322
323
324 cfi_restore_state
325 cfi_remember_state
326 .p2align 4
327 L(shr_2):
328 cmp $80, %ecx
329 lea -48(%ecx), %ecx
330 mov %edx, %eax
331 jae L(shr_2_gobble)
332
333 movdqa 16(%esi), %xmm1
334 movdqa %xmm1, %xmm2
335 palignr $2,(%esi), %xmm1
336 pcmpeqb (%edi), %xmm1
337
338 movdqa 32(%esi), %xmm3
339 palignr $2,%xmm2, %xmm3
340 pcmpeqb 16(%edi), %xmm3
341
342 pand %xmm1, %xmm3
343 pmovmskb %xmm3, %edx
344 lea 32(%edi), %edi
345 lea 32(%esi), %esi
346 sub $0xffff, %edx
347 jnz L(exit)
348 lea (%ecx, %edi,1), %eax
349 lea 2(%ecx, %esi,1), %edx
350 POP (%edi)
351 POP (%esi)
352 jmp L(less48bytes)
353
354 cfi_restore_state
355 cfi_remember_state
356 .p2align 4
357 L(shr_2_gobble):
358 sub $32, %ecx
359 movdqa 16(%esi), %xmm0
360 palignr $2,(%esi), %xmm0
361 pcmpeqb (%edi), %xmm0
362
363 movdqa 32(%esi), %xmm3
364 palignr $2,16(%esi), %xmm3
365 pcmpeqb 16(%edi), %xmm3
366
367 L(shr_2_gobble_loop):
368 pand %xmm0, %xmm3
369 sub $32, %ecx
370 pmovmskb %xmm3, %edx
371 movdqa %xmm0, %xmm1
372
373 movdqa 64(%esi), %xmm3
374 palignr $2,48(%esi), %xmm3
375 sbb $0xffff, %edx
376 movdqa 48(%esi), %xmm0
377 palignr $2,32(%esi), %xmm0
378 pcmpeqb 32(%edi), %xmm0
379 lea 32(%esi), %esi
380 pcmpeqb 48(%edi), %xmm3
381
382 lea 32(%edi), %edi
383 jz L(shr_2_gobble_loop)
384 pand %xmm0, %xmm3
385
386 cmp $0, %ecx
387 jge L(shr_2_gobble_next)
388 inc %edx
389 add $32, %ecx
390 L(shr_2_gobble_next):
391 test %edx, %edx
392 jnz L(exit)
393
394 pmovmskb %xmm3, %edx
395 movdqa %xmm0, %xmm1
396 lea 32(%edi), %edi
397 lea 32(%esi), %esi
398 sub $0xffff, %edx
399 jnz L(exit)
400
401 lea (%ecx, %edi,1), %eax
402 lea 2(%ecx, %esi,1), %edx
403 POP (%edi)
404 POP (%esi)
405 jmp L(less48bytes)
406
407 cfi_restore_state
408 cfi_remember_state
409 .p2align 4
410 L(shr_3):
411 cmp $80, %ecx
412 lea -48(%ecx), %ecx
413 mov %edx, %eax
414 jae L(shr_3_gobble)
415
416 movdqa 16(%esi), %xmm1
417 movdqa %xmm1, %xmm2
418 palignr $3,(%esi), %xmm1
419 pcmpeqb (%edi), %xmm1
420
421 movdqa 32(%esi), %xmm3
422 palignr $3,%xmm2, %xmm3
423 pcmpeqb 16(%edi), %xmm3
424
425 pand %xmm1, %xmm3
426 pmovmskb %xmm3, %edx
427 lea 32(%edi), %edi
428 lea 32(%esi), %esi
429 sub $0xffff, %edx
430 jnz L(exit)
431 lea (%ecx, %edi,1), %eax
432 lea 3(%ecx, %esi,1), %edx
433 POP (%edi)
434 POP (%esi)
435 jmp L(less48bytes)
436
437 cfi_restore_state
438 cfi_remember_state
439 .p2align 4
440 L(shr_3_gobble):
441 sub $32, %ecx
442 movdqa 16(%esi), %xmm0
443 palignr $3,(%esi), %xmm0
444 pcmpeqb (%edi), %xmm0
445
446 movdqa 32(%esi), %xmm3
447 palignr $3,16(%esi), %xmm3
448 pcmpeqb 16(%edi), %xmm3
449
450 L(shr_3_gobble_loop):
451 pand %xmm0, %xmm3
452 sub $32, %ecx
453 pmovmskb %xmm3, %edx
454 movdqa %xmm0, %xmm1
455
456 movdqa 64(%esi), %xmm3
457 palignr $3,48(%esi), %xmm3
458 sbb $0xffff, %edx
459 movdqa 48(%esi), %xmm0
460 palignr $3,32(%esi), %xmm0
461 pcmpeqb 32(%edi), %xmm0
462 lea 32(%esi), %esi
463 pcmpeqb 48(%edi), %xmm3
464
465 lea 32(%edi), %edi
466 jz L(shr_3_gobble_loop)
467 pand %xmm0, %xmm3
468
469 cmp $0, %ecx
470 jge L(shr_3_gobble_next)
471 inc %edx
472 add $32, %ecx
473 L(shr_3_gobble_next):
474 test %edx, %edx
475 jnz L(exit)
476
477 pmovmskb %xmm3, %edx
478 movdqa %xmm0, %xmm1
479 lea 32(%edi), %edi
480 lea 32(%esi), %esi
481 sub $0xffff, %edx
482 jnz L(exit)
483
484 lea (%ecx, %edi,1), %eax
485 lea 3(%ecx, %esi,1), %edx
486 POP (%edi)
487 POP (%esi)
488 jmp L(less48bytes)
489 # endif
490
491 cfi_restore_state
492 cfi_remember_state
493 .p2align 4
494 L(shr_4):
495 cmp $80, %ecx
496 lea -48(%ecx), %ecx
497 mov %edx, %eax
498 jae L(shr_4_gobble)
499
500 movdqa 16(%esi), %xmm1
501 movdqa %xmm1, %xmm2
502 palignr $4,(%esi), %xmm1
503 pcmpeqb (%edi), %xmm1
504
505 movdqa 32(%esi), %xmm3
506 palignr $4,%xmm2, %xmm3
507 pcmpeqb 16(%edi), %xmm3
508
509 pand %xmm1, %xmm3
510 pmovmskb %xmm3, %edx
511 lea 32(%edi), %edi
512 lea 32(%esi), %esi
513 sub $0xffff, %edx
514 jnz L(exit)
515 lea (%ecx, %edi,1), %eax
516 lea 4(%ecx, %esi,1), %edx
517 POP (%edi)
518 POP (%esi)
519 jmp L(less48bytes)
520
521 cfi_restore_state
522 cfi_remember_state
523 .p2align 4
524 L(shr_4_gobble):
525 sub $32, %ecx
526 movdqa 16(%esi), %xmm0
527 palignr $4,(%esi), %xmm0
528 pcmpeqb (%edi), %xmm0
529
530 movdqa 32(%esi), %xmm3
531 palignr $4,16(%esi), %xmm3
532 pcmpeqb 16(%edi), %xmm3
533
534 L(shr_4_gobble_loop):
535 pand %xmm0, %xmm3
536 sub $32, %ecx
537 pmovmskb %xmm3, %edx
538 movdqa %xmm0, %xmm1
539
540 movdqa 64(%esi), %xmm3
541 palignr $4,48(%esi), %xmm3
542 sbb $0xffff, %edx
543 movdqa 48(%esi), %xmm0
544 palignr $4,32(%esi), %xmm0
545 pcmpeqb 32(%edi), %xmm0
546 lea 32(%esi), %esi
547 pcmpeqb 48(%edi), %xmm3
548
549 lea 32(%edi), %edi
550 jz L(shr_4_gobble_loop)
551 pand %xmm0, %xmm3
552
553 cmp $0, %ecx
554 jge L(shr_4_gobble_next)
555 inc %edx
556 add $32, %ecx
557 L(shr_4_gobble_next):
558 test %edx, %edx
559 jnz L(exit)
560
561 pmovmskb %xmm3, %edx
562 movdqa %xmm0, %xmm1
563 lea 32(%edi), %edi
564 lea 32(%esi), %esi
565 sub $0xffff, %edx
566 jnz L(exit)
567
568 lea (%ecx, %edi,1), %eax
569 lea 4(%ecx, %esi,1), %edx
570 POP (%edi)
571 POP (%esi)
572 jmp L(less48bytes)
573
574 # ifndef USE_AS_WMEMCMP
575 cfi_restore_state
576 cfi_remember_state
577 .p2align 4
578 L(shr_5):
579 cmp $80, %ecx
580 lea -48(%ecx), %ecx
581 mov %edx, %eax
582 jae L(shr_5_gobble)
583
584 movdqa 16(%esi), %xmm1
585 movdqa %xmm1, %xmm2
586 palignr $5,(%esi), %xmm1
587 pcmpeqb (%edi), %xmm1
588
589 movdqa 32(%esi), %xmm3
590 palignr $5,%xmm2, %xmm3
591 pcmpeqb 16(%edi), %xmm3
592
593 pand %xmm1, %xmm3
594 pmovmskb %xmm3, %edx
595 lea 32(%edi), %edi
596 lea 32(%esi), %esi
597 sub $0xffff, %edx
598 jnz L(exit)
599 lea (%ecx, %edi,1), %eax
600 lea 5(%ecx, %esi,1), %edx
601 POP (%edi)
602 POP (%esi)
603 jmp L(less48bytes)
604
605 cfi_restore_state
606 cfi_remember_state
607 .p2align 4
608 L(shr_5_gobble):
609 sub $32, %ecx
610 movdqa 16(%esi), %xmm0
611 palignr $5,(%esi), %xmm0
612 pcmpeqb (%edi), %xmm0
613
614 movdqa 32(%esi), %xmm3
615 palignr $5,16(%esi), %xmm3
616 pcmpeqb 16(%edi), %xmm3
617
618 L(shr_5_gobble_loop):
619 pand %xmm0, %xmm3
620 sub $32, %ecx
621 pmovmskb %xmm3, %edx
622 movdqa %xmm0, %xmm1
623
624 movdqa 64(%esi), %xmm3
625 palignr $5,48(%esi), %xmm3
626 sbb $0xffff, %edx
627 movdqa 48(%esi), %xmm0
628 palignr $5,32(%esi), %xmm0
629 pcmpeqb 32(%edi), %xmm0
630 lea 32(%esi), %esi
631 pcmpeqb 48(%edi), %xmm3
632
633 lea 32(%edi), %edi
634 jz L(shr_5_gobble_loop)
635 pand %xmm0, %xmm3
636
637 cmp $0, %ecx
638 jge L(shr_5_gobble_next)
639 inc %edx
640 add $32, %ecx
641 L(shr_5_gobble_next):
642 test %edx, %edx
643 jnz L(exit)
644
645 pmovmskb %xmm3, %edx
646 movdqa %xmm0, %xmm1
647 lea 32(%edi), %edi
648 lea 32(%esi), %esi
649 sub $0xffff, %edx
650 jnz L(exit)
651
652 lea (%ecx, %edi,1), %eax
653 lea 5(%ecx, %esi,1), %edx
654 POP (%edi)
655 POP (%esi)
656 jmp L(less48bytes)
657
658 cfi_restore_state
659 cfi_remember_state
660 .p2align 4
661 L(shr_6):
662 cmp $80, %ecx
663 lea -48(%ecx), %ecx
664 mov %edx, %eax
665 jae L(shr_6_gobble)
666
667 movdqa 16(%esi), %xmm1
668 movdqa %xmm1, %xmm2
669 palignr $6,(%esi), %xmm1
670 pcmpeqb (%edi), %xmm1
671
672 movdqa 32(%esi), %xmm3
673 palignr $6,%xmm2, %xmm3
674 pcmpeqb 16(%edi), %xmm3
675
676 pand %xmm1, %xmm3
677 pmovmskb %xmm3, %edx
678 lea 32(%edi), %edi
679 lea 32(%esi), %esi
680 sub $0xffff, %edx
681 jnz L(exit)
682 lea (%ecx, %edi,1), %eax
683 lea 6(%ecx, %esi,1), %edx
684 POP (%edi)
685 POP (%esi)
686 jmp L(less48bytes)
687
688 cfi_restore_state
689 cfi_remember_state
690 .p2align 4
691 L(shr_6_gobble):
692 sub $32, %ecx
693 movdqa 16(%esi), %xmm0
694 palignr $6,(%esi), %xmm0
695 pcmpeqb (%edi), %xmm0
696
697 movdqa 32(%esi), %xmm3
698 palignr $6,16(%esi), %xmm3
699 pcmpeqb 16(%edi), %xmm3
700
701 L(shr_6_gobble_loop):
702 pand %xmm0, %xmm3
703 sub $32, %ecx
704 pmovmskb %xmm3, %edx
705 movdqa %xmm0, %xmm1
706
707 movdqa 64(%esi), %xmm3
708 palignr $6,48(%esi), %xmm3
709 sbb $0xffff, %edx
710 movdqa 48(%esi), %xmm0
711 palignr $6,32(%esi), %xmm0
712 pcmpeqb 32(%edi), %xmm0
713 lea 32(%esi), %esi
714 pcmpeqb 48(%edi), %xmm3
715
716 lea 32(%edi), %edi
717 jz L(shr_6_gobble_loop)
718 pand %xmm0, %xmm3
719
720 cmp $0, %ecx
721 jge L(shr_6_gobble_next)
722 inc %edx
723 add $32, %ecx
724 L(shr_6_gobble_next):
725 test %edx, %edx
726 jnz L(exit)
727
728 pmovmskb %xmm3, %edx
729 movdqa %xmm0, %xmm1
730 lea 32(%edi), %edi
731 lea 32(%esi), %esi
732 sub $0xffff, %edx
733 jnz L(exit)
734
735 lea (%ecx, %edi,1), %eax
736 lea 6(%ecx, %esi,1), %edx
737 POP (%edi)
738 POP (%esi)
739 jmp L(less48bytes)
740
741 cfi_restore_state
742 cfi_remember_state
743 .p2align 4
744 L(shr_7):
745 cmp $80, %ecx
746 lea -48(%ecx), %ecx
747 mov %edx, %eax
748 jae L(shr_7_gobble)
749
750 movdqa 16(%esi), %xmm1
751 movdqa %xmm1, %xmm2
752 palignr $7,(%esi), %xmm1
753 pcmpeqb (%edi), %xmm1
754
755 movdqa 32(%esi), %xmm3
756 palignr $7,%xmm2, %xmm3
757 pcmpeqb 16(%edi), %xmm3
758
759 pand %xmm1, %xmm3
760 pmovmskb %xmm3, %edx
761 lea 32(%edi), %edi
762 lea 32(%esi), %esi
763 sub $0xffff, %edx
764 jnz L(exit)
765 lea (%ecx, %edi,1), %eax
766 lea 7(%ecx, %esi,1), %edx
767 POP (%edi)
768 POP (%esi)
769 jmp L(less48bytes)
770
771 cfi_restore_state
772 cfi_remember_state
773 .p2align 4
774 L(shr_7_gobble):
775 sub $32, %ecx
776 movdqa 16(%esi), %xmm0
777 palignr $7,(%esi), %xmm0
778 pcmpeqb (%edi), %xmm0
779
780 movdqa 32(%esi), %xmm3
781 palignr $7,16(%esi), %xmm3
782 pcmpeqb 16(%edi), %xmm3
783
784 L(shr_7_gobble_loop):
785 pand %xmm0, %xmm3
786 sub $32, %ecx
787 pmovmskb %xmm3, %edx
788 movdqa %xmm0, %xmm1
789
790 movdqa 64(%esi), %xmm3
791 palignr $7,48(%esi), %xmm3
792 sbb $0xffff, %edx
793 movdqa 48(%esi), %xmm0
794 palignr $7,32(%esi), %xmm0
795 pcmpeqb 32(%edi), %xmm0
796 lea 32(%esi), %esi
797 pcmpeqb 48(%edi), %xmm3
798
799 lea 32(%edi), %edi
800 jz L(shr_7_gobble_loop)
801 pand %xmm0, %xmm3
802
803 cmp $0, %ecx
804 jge L(shr_7_gobble_next)
805 inc %edx
806 add $32, %ecx
807 L(shr_7_gobble_next):
808 test %edx, %edx
809 jnz L(exit)
810
811 pmovmskb %xmm3, %edx
812 movdqa %xmm0, %xmm1
813 lea 32(%edi), %edi
814 lea 32(%esi), %esi
815 sub $0xffff, %edx
816 jnz L(exit)
817
818 lea (%ecx, %edi,1), %eax
819 lea 7(%ecx, %esi,1), %edx
820 POP (%edi)
821 POP (%esi)
822 jmp L(less48bytes)
823 # endif
824
825 cfi_restore_state
826 cfi_remember_state
827 .p2align 4
828 L(shr_8):
829 cmp $80, %ecx
830 lea -48(%ecx), %ecx
831 mov %edx, %eax
832 jae L(shr_8_gobble)
833
834 movdqa 16(%esi), %xmm1
835 movdqa %xmm1, %xmm2
836 palignr $8,(%esi), %xmm1
837 pcmpeqb (%edi), %xmm1
838
839 movdqa 32(%esi), %xmm3
840 palignr $8,%xmm2, %xmm3
841 pcmpeqb 16(%edi), %xmm3
842
843 pand %xmm1, %xmm3
844 pmovmskb %xmm3, %edx
845 lea 32(%edi), %edi
846 lea 32(%esi), %esi
847 sub $0xffff, %edx
848 jnz L(exit)
849 lea (%ecx, %edi,1), %eax
850 lea 8(%ecx, %esi,1), %edx
851 POP (%edi)
852 POP (%esi)
853 jmp L(less48bytes)
854
855 cfi_restore_state
856 cfi_remember_state
857 .p2align 4
858 L(shr_8_gobble):
859 sub $32, %ecx
860 movdqa 16(%esi), %xmm0
861 palignr $8,(%esi), %xmm0
862 pcmpeqb (%edi), %xmm0
863
864 movdqa 32(%esi), %xmm3
865 palignr $8,16(%esi), %xmm3
866 pcmpeqb 16(%edi), %xmm3
867
868 L(shr_8_gobble_loop):
869 pand %xmm0, %xmm3
870 sub $32, %ecx
871 pmovmskb %xmm3, %edx
872 movdqa %xmm0, %xmm1
873
874 movdqa 64(%esi), %xmm3
875 palignr $8,48(%esi), %xmm3
876 sbb $0xffff, %edx
877 movdqa 48(%esi), %xmm0
878 palignr $8,32(%esi), %xmm0
879 pcmpeqb 32(%edi), %xmm0
880 lea 32(%esi), %esi
881 pcmpeqb 48(%edi), %xmm3
882
883 lea 32(%edi), %edi
884 jz L(shr_8_gobble_loop)
885 pand %xmm0, %xmm3
886
887 cmp $0, %ecx
888 jge L(shr_8_gobble_next)
889 inc %edx
890 add $32, %ecx
891 L(shr_8_gobble_next):
892 test %edx, %edx
893 jnz L(exit)
894
895 pmovmskb %xmm3, %edx
896 movdqa %xmm0, %xmm1
897 lea 32(%edi), %edi
898 lea 32(%esi), %esi
899 sub $0xffff, %edx
900 jnz L(exit)
901
902 lea (%ecx, %edi,1), %eax
903 lea 8(%ecx, %esi,1), %edx
904 POP (%edi)
905 POP (%esi)
906 jmp L(less48bytes)
907
908 # ifndef USE_AS_WMEMCMP
909 cfi_restore_state
910 cfi_remember_state
911 .p2align 4
912 L(shr_9):
913 cmp $80, %ecx
914 lea -48(%ecx), %ecx
915 mov %edx, %eax
916 jae L(shr_9_gobble)
917
918 movdqa 16(%esi), %xmm1
919 movdqa %xmm1, %xmm2
920 palignr $9,(%esi), %xmm1
921 pcmpeqb (%edi), %xmm1
922
923 movdqa 32(%esi), %xmm3
924 palignr $9,%xmm2, %xmm3
925 pcmpeqb 16(%edi), %xmm3
926
927 pand %xmm1, %xmm3
928 pmovmskb %xmm3, %edx
929 lea 32(%edi), %edi
930 lea 32(%esi), %esi
931 sub $0xffff, %edx
932 jnz L(exit)
933 lea (%ecx, %edi,1), %eax
934 lea 9(%ecx, %esi,1), %edx
935 POP (%edi)
936 POP (%esi)
937 jmp L(less48bytes)
938
939 cfi_restore_state
940 cfi_remember_state
941 .p2align 4
942 L(shr_9_gobble):
943 sub $32, %ecx
944 movdqa 16(%esi), %xmm0
945 palignr $9,(%esi), %xmm0
946 pcmpeqb (%edi), %xmm0
947
948 movdqa 32(%esi), %xmm3
949 palignr $9,16(%esi), %xmm3
950 pcmpeqb 16(%edi), %xmm3
951
952 L(shr_9_gobble_loop):
953 pand %xmm0, %xmm3
954 sub $32, %ecx
955 pmovmskb %xmm3, %edx
956 movdqa %xmm0, %xmm1
957
958 movdqa 64(%esi), %xmm3
959 palignr $9,48(%esi), %xmm3
960 sbb $0xffff, %edx
961 movdqa 48(%esi), %xmm0
962 palignr $9,32(%esi), %xmm0
963 pcmpeqb 32(%edi), %xmm0
964 lea 32(%esi), %esi
965 pcmpeqb 48(%edi), %xmm3
966
967 lea 32(%edi), %edi
968 jz L(shr_9_gobble_loop)
969 pand %xmm0, %xmm3
970
971 cmp $0, %ecx
972 jge L(shr_9_gobble_next)
973 inc %edx
974 add $32, %ecx
975 L(shr_9_gobble_next):
976 test %edx, %edx
977 jnz L(exit)
978
979 pmovmskb %xmm3, %edx
980 movdqa %xmm0, %xmm1
981 lea 32(%edi), %edi
982 lea 32(%esi), %esi
983 sub $0xffff, %edx
984 jnz L(exit)
985
986 lea (%ecx, %edi,1), %eax
987 lea 9(%ecx, %esi,1), %edx
988 POP (%edi)
989 POP (%esi)
990 jmp L(less48bytes)
991
992 cfi_restore_state
993 cfi_remember_state
994 .p2align 4
995 L(shr_10):
996 cmp $80, %ecx
997 lea -48(%ecx), %ecx
998 mov %edx, %eax
999 jae L(shr_10_gobble)
1000
1001 movdqa 16(%esi), %xmm1
1002 movdqa %xmm1, %xmm2
1003 palignr $10, (%esi), %xmm1
1004 pcmpeqb (%edi), %xmm1
1005
1006 movdqa 32(%esi), %xmm3
1007 palignr $10,%xmm2, %xmm3
1008 pcmpeqb 16(%edi), %xmm3
1009
1010 pand %xmm1, %xmm3
1011 pmovmskb %xmm3, %edx
1012 lea 32(%edi), %edi
1013 lea 32(%esi), %esi
1014 sub $0xffff, %edx
1015 jnz L(exit)
1016 lea (%ecx, %edi,1), %eax
1017 lea 10(%ecx, %esi,1), %edx
1018 POP (%edi)
1019 POP (%esi)
1020 jmp L(less48bytes)
1021
1022 cfi_restore_state
1023 cfi_remember_state
1024 .p2align 4
1025 L(shr_10_gobble):
1026 sub $32, %ecx
1027 movdqa 16(%esi), %xmm0
1028 palignr $10, (%esi), %xmm0
1029 pcmpeqb (%edi), %xmm0
1030
1031 movdqa 32(%esi), %xmm3
1032 palignr $10, 16(%esi), %xmm3
1033 pcmpeqb 16(%edi), %xmm3
1034
1035 L(shr_10_gobble_loop):
1036 pand %xmm0, %xmm3
1037 sub $32, %ecx
1038 pmovmskb %xmm3, %edx
1039 movdqa %xmm0, %xmm1
1040
1041 movdqa 64(%esi), %xmm3
1042 palignr $10,48(%esi), %xmm3
1043 sbb $0xffff, %edx
1044 movdqa 48(%esi), %xmm0
1045 palignr $10,32(%esi), %xmm0
1046 pcmpeqb 32(%edi), %xmm0
1047 lea 32(%esi), %esi
1048 pcmpeqb 48(%edi), %xmm3
1049
1050 lea 32(%edi), %edi
1051 jz L(shr_10_gobble_loop)
1052 pand %xmm0, %xmm3
1053
1054 cmp $0, %ecx
1055 jge L(shr_10_gobble_next)
1056 inc %edx
1057 add $32, %ecx
1058 L(shr_10_gobble_next):
1059 test %edx, %edx
1060 jnz L(exit)
1061
1062 pmovmskb %xmm3, %edx
1063 movdqa %xmm0, %xmm1
1064 lea 32(%edi), %edi
1065 lea 32(%esi), %esi
1066 sub $0xffff, %edx
1067 jnz L(exit)
1068
1069 lea (%ecx, %edi,1), %eax
1070 lea 10(%ecx, %esi,1), %edx
1071 POP (%edi)
1072 POP (%esi)
1073 jmp L(less48bytes)
1074
1075 cfi_restore_state
1076 cfi_remember_state
1077 .p2align 4
1078 L(shr_11):
1079 cmp $80, %ecx
1080 lea -48(%ecx), %ecx
1081 mov %edx, %eax
1082 jae L(shr_11_gobble)
1083
1084 movdqa 16(%esi), %xmm1
1085 movdqa %xmm1, %xmm2
1086 palignr $11, (%esi), %xmm1
1087 pcmpeqb (%edi), %xmm1
1088
1089 movdqa 32(%esi), %xmm3
1090 palignr $11, %xmm2, %xmm3
1091 pcmpeqb 16(%edi), %xmm3
1092
1093 pand %xmm1, %xmm3
1094 pmovmskb %xmm3, %edx
1095 lea 32(%edi), %edi
1096 lea 32(%esi), %esi
1097 sub $0xffff, %edx
1098 jnz L(exit)
1099 lea (%ecx, %edi,1), %eax
1100 lea 11(%ecx, %esi,1), %edx
1101 POP (%edi)
1102 POP (%esi)
1103 jmp L(less48bytes)
1104
1105 cfi_restore_state
1106 cfi_remember_state
1107 .p2align 4
1108 L(shr_11_gobble):
1109 sub $32, %ecx
1110 movdqa 16(%esi), %xmm0
1111 palignr $11, (%esi), %xmm0
1112 pcmpeqb (%edi), %xmm0
1113
1114 movdqa 32(%esi), %xmm3
1115 palignr $11, 16(%esi), %xmm3
1116 pcmpeqb 16(%edi), %xmm3
1117
1118 L(shr_11_gobble_loop):
1119 pand %xmm0, %xmm3
1120 sub $32, %ecx
1121 pmovmskb %xmm3, %edx
1122 movdqa %xmm0, %xmm1
1123
1124 movdqa 64(%esi), %xmm3
1125 palignr $11,48(%esi), %xmm3
1126 sbb $0xffff, %edx
1127 movdqa 48(%esi), %xmm0
1128 palignr $11,32(%esi), %xmm0
1129 pcmpeqb 32(%edi), %xmm0
1130 lea 32(%esi), %esi
1131 pcmpeqb 48(%edi), %xmm3
1132
1133 lea 32(%edi), %edi
1134 jz L(shr_11_gobble_loop)
1135 pand %xmm0, %xmm3
1136
1137 cmp $0, %ecx
1138 jge L(shr_11_gobble_next)
1139 inc %edx
1140 add $32, %ecx
1141 L(shr_11_gobble_next):
1142 test %edx, %edx
1143 jnz L(exit)
1144
1145 pmovmskb %xmm3, %edx
1146 movdqa %xmm0, %xmm1
1147 lea 32(%edi), %edi
1148 lea 32(%esi), %esi
1149 sub $0xffff, %edx
1150 jnz L(exit)
1151
1152 lea (%ecx, %edi,1), %eax
1153 lea 11(%ecx, %esi,1), %edx
1154 POP (%edi)
1155 POP (%esi)
1156 jmp L(less48bytes)
1157 # endif
1158
1159 cfi_restore_state
1160 cfi_remember_state
1161 .p2align 4
1162 L(shr_12):
1163 cmp $80, %ecx
1164 lea -48(%ecx), %ecx
1165 mov %edx, %eax
1166 jae L(shr_12_gobble)
1167
1168 movdqa 16(%esi), %xmm1
1169 movdqa %xmm1, %xmm2
1170 palignr $12, (%esi), %xmm1
1171 pcmpeqb (%edi), %xmm1
1172
1173 movdqa 32(%esi), %xmm3
1174 palignr $12, %xmm2, %xmm3
1175 pcmpeqb 16(%edi), %xmm3
1176
1177 pand %xmm1, %xmm3
1178 pmovmskb %xmm3, %edx
1179 lea 32(%edi), %edi
1180 lea 32(%esi), %esi
1181 sub $0xffff, %edx
1182 jnz L(exit)
1183 lea (%ecx, %edi,1), %eax
1184 lea 12(%ecx, %esi,1), %edx
1185 POP (%edi)
1186 POP (%esi)
1187 jmp L(less48bytes)
1188
1189 cfi_restore_state
1190 cfi_remember_state
1191 .p2align 4
1192 L(shr_12_gobble):
1193 sub $32, %ecx
1194 movdqa 16(%esi), %xmm0
1195 palignr $12, (%esi), %xmm0
1196 pcmpeqb (%edi), %xmm0
1197
1198 movdqa 32(%esi), %xmm3
1199 palignr $12, 16(%esi), %xmm3
1200 pcmpeqb 16(%edi), %xmm3
1201
1202 L(shr_12_gobble_loop):
1203 pand %xmm0, %xmm3
1204 sub $32, %ecx
1205 pmovmskb %xmm3, %edx
1206 movdqa %xmm0, %xmm1
1207
1208 movdqa 64(%esi), %xmm3
1209 palignr $12,48(%esi), %xmm3
1210 sbb $0xffff, %edx
1211 movdqa 48(%esi), %xmm0
1212 palignr $12,32(%esi), %xmm0
1213 pcmpeqb 32(%edi), %xmm0
1214 lea 32(%esi), %esi
1215 pcmpeqb 48(%edi), %xmm3
1216
1217 lea 32(%edi), %edi
1218 jz L(shr_12_gobble_loop)
1219 pand %xmm0, %xmm3
1220
1221 cmp $0, %ecx
1222 jge L(shr_12_gobble_next)
1223 inc %edx
1224 add $32, %ecx
1225 L(shr_12_gobble_next):
1226 test %edx, %edx
1227 jnz L(exit)
1228
1229 pmovmskb %xmm3, %edx
1230 movdqa %xmm0, %xmm1
1231 lea 32(%edi), %edi
1232 lea 32(%esi), %esi
1233 sub $0xffff, %edx
1234 jnz L(exit)
1235
1236 lea (%ecx, %edi,1), %eax
1237 lea 12(%ecx, %esi,1), %edx
1238 POP (%edi)
1239 POP (%esi)
1240 jmp L(less48bytes)
1241
1242 # ifndef USE_AS_WMEMCMP
1243 cfi_restore_state
1244 cfi_remember_state
1245 .p2align 4
1246 L(shr_13):
1247 cmp $80, %ecx
1248 lea -48(%ecx), %ecx
1249 mov %edx, %eax
1250 jae L(shr_13_gobble)
1251
1252 movdqa 16(%esi), %xmm1
1253 movdqa %xmm1, %xmm2
1254 palignr $13, (%esi), %xmm1
1255 pcmpeqb (%edi), %xmm1
1256
1257 movdqa 32(%esi), %xmm3
1258 palignr $13, %xmm2, %xmm3
1259 pcmpeqb 16(%edi), %xmm3
1260
1261 pand %xmm1, %xmm3
1262 pmovmskb %xmm3, %edx
1263 lea 32(%edi), %edi
1264 lea 32(%esi), %esi
1265 sub $0xffff, %edx
1266 jnz L(exit)
1267 lea (%ecx, %edi,1), %eax
1268 lea 13(%ecx, %esi,1), %edx
1269 POP (%edi)
1270 POP (%esi)
1271 jmp L(less48bytes)
1272
1273 cfi_restore_state
1274 cfi_remember_state
1275 .p2align 4
1276 L(shr_13_gobble):
1277 sub $32, %ecx
1278 movdqa 16(%esi), %xmm0
1279 palignr $13, (%esi), %xmm0
1280 pcmpeqb (%edi), %xmm0
1281
1282 movdqa 32(%esi), %xmm3
1283 palignr $13, 16(%esi), %xmm3
1284 pcmpeqb 16(%edi), %xmm3
1285
1286 L(shr_13_gobble_loop):
1287 pand %xmm0, %xmm3
1288 sub $32, %ecx
1289 pmovmskb %xmm3, %edx
1290 movdqa %xmm0, %xmm1
1291
1292 movdqa 64(%esi), %xmm3
1293 palignr $13,48(%esi), %xmm3
1294 sbb $0xffff, %edx
1295 movdqa 48(%esi), %xmm0
1296 palignr $13,32(%esi), %xmm0
1297 pcmpeqb 32(%edi), %xmm0
1298 lea 32(%esi), %esi
1299 pcmpeqb 48(%edi), %xmm3
1300
1301 lea 32(%edi), %edi
1302 jz L(shr_13_gobble_loop)
1303 pand %xmm0, %xmm3
1304
1305 cmp $0, %ecx
1306 jge L(shr_13_gobble_next)
1307 inc %edx
1308 add $32, %ecx
1309 L(shr_13_gobble_next):
1310 test %edx, %edx
1311 jnz L(exit)
1312
1313 pmovmskb %xmm3, %edx
1314 movdqa %xmm0, %xmm1
1315 lea 32(%edi), %edi
1316 lea 32(%esi), %esi
1317 sub $0xffff, %edx
1318 jnz L(exit)
1319
1320 lea (%ecx, %edi,1), %eax
1321 lea 13(%ecx, %esi,1), %edx
1322 POP (%edi)
1323 POP (%esi)
1324 jmp L(less48bytes)
1325
1326 cfi_restore_state
1327 cfi_remember_state
1328 .p2align 4
1329 L(shr_14):
1330 cmp $80, %ecx
1331 lea -48(%ecx), %ecx
1332 mov %edx, %eax
1333 jae L(shr_14_gobble)
1334
1335 movdqa 16(%esi), %xmm1
1336 movdqa %xmm1, %xmm2
1337 palignr $14, (%esi), %xmm1
1338 pcmpeqb (%edi), %xmm1
1339
1340 movdqa 32(%esi), %xmm3
1341 palignr $14, %xmm2, %xmm3
1342 pcmpeqb 16(%edi), %xmm3
1343
1344 pand %xmm1, %xmm3
1345 pmovmskb %xmm3, %edx
1346 lea 32(%edi), %edi
1347 lea 32(%esi), %esi
1348 sub $0xffff, %edx
1349 jnz L(exit)
1350 lea (%ecx, %edi,1), %eax
1351 lea 14(%ecx, %esi,1), %edx
1352 POP (%edi)
1353 POP (%esi)
1354 jmp L(less48bytes)
1355
1356 cfi_restore_state
1357 cfi_remember_state
1358 .p2align 4
1359 L(shr_14_gobble):
1360 sub $32, %ecx
1361 movdqa 16(%esi), %xmm0
1362 palignr $14, (%esi), %xmm0
1363 pcmpeqb (%edi), %xmm0
1364
1365 movdqa 32(%esi), %xmm3
1366 palignr $14, 16(%esi), %xmm3
1367 pcmpeqb 16(%edi), %xmm3
1368
1369 L(shr_14_gobble_loop):
1370 pand %xmm0, %xmm3
1371 sub $32, %ecx
1372 pmovmskb %xmm3, %edx
1373 movdqa %xmm0, %xmm1
1374
1375 movdqa 64(%esi), %xmm3
1376 palignr $14,48(%esi), %xmm3
1377 sbb $0xffff, %edx
1378 movdqa 48(%esi), %xmm0
1379 palignr $14,32(%esi), %xmm0
1380 pcmpeqb 32(%edi), %xmm0
1381 lea 32(%esi), %esi
1382 pcmpeqb 48(%edi), %xmm3
1383
1384 lea 32(%edi), %edi
1385 jz L(shr_14_gobble_loop)
1386 pand %xmm0, %xmm3
1387
1388 cmp $0, %ecx
1389 jge L(shr_14_gobble_next)
1390 inc %edx
1391 add $32, %ecx
1392 L(shr_14_gobble_next):
1393 test %edx, %edx
1394 jnz L(exit)
1395
1396 pmovmskb %xmm3, %edx
1397 movdqa %xmm0, %xmm1
1398 lea 32(%edi), %edi
1399 lea 32(%esi), %esi
1400 sub $0xffff, %edx
1401 jnz L(exit)
1402
1403 lea (%ecx, %edi,1), %eax
1404 lea 14(%ecx, %esi,1), %edx
1405 POP (%edi)
1406 POP (%esi)
1407 jmp L(less48bytes)
1408
1409 cfi_restore_state
1410 cfi_remember_state
1411 .p2align 4
1412 L(shr_15):
1413 cmp $80, %ecx
1414 lea -48(%ecx), %ecx
1415 mov %edx, %eax
1416 jae L(shr_15_gobble)
1417
1418 movdqa 16(%esi), %xmm1
1419 movdqa %xmm1, %xmm2
1420 palignr $15, (%esi), %xmm1
1421 pcmpeqb (%edi), %xmm1
1422
1423 movdqa 32(%esi), %xmm3
1424 palignr $15, %xmm2, %xmm3
1425 pcmpeqb 16(%edi), %xmm3
1426
1427 pand %xmm1, %xmm3
1428 pmovmskb %xmm3, %edx
1429 lea 32(%edi), %edi
1430 lea 32(%esi), %esi
1431 sub $0xffff, %edx
1432 jnz L(exit)
1433 lea (%ecx, %edi,1), %eax
1434 lea 15(%ecx, %esi,1), %edx
1435 POP (%edi)
1436 POP (%esi)
1437 jmp L(less48bytes)
1438
1439 cfi_restore_state
1440 cfi_remember_state
1441 .p2align 4
1442 L(shr_15_gobble):
1443 sub $32, %ecx
1444 movdqa 16(%esi), %xmm0
1445 palignr $15, (%esi), %xmm0
1446 pcmpeqb (%edi), %xmm0
1447
1448 movdqa 32(%esi), %xmm3
1449 palignr $15, 16(%esi), %xmm3
1450 pcmpeqb 16(%edi), %xmm3
1451
1452 L(shr_15_gobble_loop):
1453 pand %xmm0, %xmm3
1454 sub $32, %ecx
1455 pmovmskb %xmm3, %edx
1456 movdqa %xmm0, %xmm1
1457
1458 movdqa 64(%esi), %xmm3
1459 palignr $15,48(%esi), %xmm3
1460 sbb $0xffff, %edx
1461 movdqa 48(%esi), %xmm0
1462 palignr $15,32(%esi), %xmm0
1463 pcmpeqb 32(%edi), %xmm0
1464 lea 32(%esi), %esi
1465 pcmpeqb 48(%edi), %xmm3
1466
1467 lea 32(%edi), %edi
1468 jz L(shr_15_gobble_loop)
1469 pand %xmm0, %xmm3
1470
1471 cmp $0, %ecx
1472 jge L(shr_15_gobble_next)
1473 inc %edx
1474 add $32, %ecx
1475 L(shr_15_gobble_next):
1476 test %edx, %edx
1477 jnz L(exit)
1478
1479 pmovmskb %xmm3, %edx
1480 movdqa %xmm0, %xmm1
1481 lea 32(%edi), %edi
1482 lea 32(%esi), %esi
1483 sub $0xffff, %edx
1484 jnz L(exit)
1485
1486 lea (%ecx, %edi,1), %eax
1487 lea 15(%ecx, %esi,1), %edx
1488 POP (%edi)
1489 POP (%esi)
1490 jmp L(less48bytes)
1491 # endif
1492
1493 cfi_restore_state
1494 cfi_remember_state
1495 .p2align 4
/* A 32-byte step found a mismatch.  On entry %xmm1 holds the equality
   mask of the first 16 bytes of that step, %edx the non-zero
   (mask - 0xffff) value that was branched on, and %edi / %esi have
   already been advanced past the 32 bytes.  */
1496 L(exit):
1497 pmovmskb %xmm1, %ebx
1498 sub $0xffff, %ebx
/* Zero: the first 16 bytes were all equal, so keep the pointers and %edx
   and look in the second 16 bytes.  */
1499 jz L(first16bytes)
/* Otherwise step both pointers back by 16 and use the first-half value.  */
1500 lea -16(%esi), %esi
1501 lea -16(%edi), %edi
1502 mov %ebx, %edx
1503
1504 L(first16bytes):
/* NOTE(review): %eax is set outside this chunk; presumably it is the
   byte offset (shift count) of the shr_N path being undone here --
   confirm against the shr_N entry code.  */
1505 add %eax, %esi
1506 L(less16bytes):
1507
1508 # ifndef USE_AS_WMEMCMP
/* memcmp build.  mask - 0xffff is the two's-complement negation of the
   mismatch bitmap, so its lowest set bit marks the first differing byte
   of the 16 bytes ending at %edi / %esi.  %dl covers bytes 0..7; if it is
   zero the search continues in %dh.  */
1509 test %dl, %dl
1510 jz L(next_24_bytes)
1511
/* Bits 0..6 select the handlers for offsets -16..-10.  */
1512 test $0x01, %dl
1513 jnz L(Byte16)
1514
1515 test $0x02, %dl
1516 jnz L(Byte17)
1517
1518 test $0x04, %dl
1519 jnz L(Byte18)
1520
1521 test $0x08, %dl
1522 jnz L(Byte19)
1523
1524 test $0x10, %dl
1525 jnz L(Byte20)
1526
1527 test $0x20, %dl
1528 jnz L(Byte21)
1529
1530 test $0x40, %dl
1531 jnz L(Byte22)
/* None of bits 0..6 set while %dl is non-zero: the byte at offset -9
   differs.  Return the difference of the two zero-extended bytes.  */
1532 L(Byte23):
1533 movzbl -9(%edi), %eax
1534 movzbl -9(%esi), %edx
1535 sub %edx, %eax
1536 RETURN
1537
1538 .p2align 4
/* L(Byte16) .. L(Byte22): return handlers for a mismatch at offsets
   -16 .. -10 from %edi / %esi.  Each loads the two bytes zero-extended,
   returns (%edi byte) - (%esi byte) in %eax, and leaves through RETURN,
   which pops %edi, %esi and %ebx before ret.  */
1539 L(Byte16):
1540 movzbl -16(%edi), %eax
1541 movzbl -16(%esi), %edx
1542 sub %edx, %eax
1543 RETURN
1544
1545 .p2align 4
1546 L(Byte17):
1547 movzbl -15(%edi), %eax
1548 movzbl -15(%esi), %edx
1549 sub %edx, %eax
1550 RETURN
1551
1552 .p2align 4
1553 L(Byte18):
1554 movzbl -14(%edi), %eax
1555 movzbl -14(%esi), %edx
1556 sub %edx, %eax
1557 RETURN
1558
1559 .p2align 4
1560 L(Byte19):
1561 movzbl -13(%edi), %eax
1562 movzbl -13(%esi), %edx
1563 sub %edx, %eax
1564 RETURN
1565
1566 .p2align 4
1567 L(Byte20):
1568 movzbl -12(%edi), %eax
1569 movzbl -12(%esi), %edx
1570 sub %edx, %eax
1571 RETURN
1572
1573 .p2align 4
1574 L(Byte21):
1575 movzbl -11(%edi), %eax
1576 movzbl -11(%esi), %edx
1577 sub %edx, %eax
1578 RETURN
1579
1580 .p2align 4
1581 L(Byte22):
1582 movzbl -10(%edi), %eax
1583 movzbl -10(%esi), %edx
1584 sub %edx, %eax
1585 RETURN
1586
1587 .p2align 4
/* %dl was zero, so the first difference lies in bytes 8..15 of the
   16-byte block.  Both pointers are moved forward by 8 so that the
   L(Byte16) .. L(Byte22) handlers (offsets -16 .. -10) address bytes
   8..14; the handler is selected by the matching bit of %dh.  */
1588 L(next_24_bytes):
1589 lea 8(%edi), %edi
1590 lea 8(%esi), %esi
1591 test $0x01, %dh
1592 jnz L(Byte16)
1593
1594 test $0x02, %dh
1595 jnz L(Byte17)
1596
1597 test $0x04, %dh
1598 jnz L(Byte18)
1599
1600 test $0x08, %dh
1601 jnz L(Byte19)
1602
1603 test $0x10, %dh
1604 jnz L(Byte20)
1605
1606 test $0x20, %dh
1607 jnz L(Byte21)
1608
1609 test $0x40, %dh
1610 jnz L(Byte22)
1611
/* None of bits 0..6 of %dh set: fall through and compare the last byte of
   the block (offset -9 after the adjustment above).  */
1612 .p2align 4
1613 L(Byte31):
1614 movzbl -9(%edi), %eax
1615 movzbl -9(%esi), %edx
1616 sub %edx, %eax
/* RETURN_END is RETURN without the trailing cfi_restore_state /
   cfi_remember_state pair.  */
1617 RETURN_END
1618 # else
1619
1620 /* special for wmemcmp */
/* wmemcmp build of the L(less16bytes) exit.  Elements are 4 bytes wide,
   so only the dword containing the first mismatch has to be found: each
   nibble of %dx covers one dword of the 16 bytes ending at %edi / %esi.
   The selected dwords are compared as signed values (see L(nequal)).
   NOTE(review): the RETURN after each jne is reached only if the selected
   dwords compare equal; %eax then still holds the dword just loaded.
   Presumably that cannot happen on this path -- confirm.  */
1621 xor %eax, %eax
1622 test %dl, %dl
/* %dl == 0: the mismatch is in the third or fourth dword.  */
1623 jz L(next_two_double_words)
/* Low nibble clear: the first dword matched, so the second one differs.  */
1624 and $15, %dl
1625 jz L(second_double_word)
1626 mov -16(%edi), %eax
1627 cmp -16(%esi), %eax
1628 jne L(nequal)
1629 RETURN
1630
1631 .p2align 4
1632 L(second_double_word):
1633 mov -12(%edi), %eax
1634 cmp -12(%esi), %eax
1635 jne L(nequal)
1636 RETURN
1637
1638 .p2align 4
1639 L(next_two_double_words):
/* Low nibble of %dh clear: the third dword matched, take the fourth.  */
1640 and $15, %dh
1641 jz L(fourth_double_word)
1642 mov -8(%edi), %eax
1643 cmp -8(%esi), %eax
1644 jne L(nequal)
1645 RETURN
1646
1647 .p2align 4
1648 L(fourth_double_word):
1649 mov -4(%edi), %eax
1650 cmp -4(%esi), %eax
1651 jne L(nequal)
1652 RETURN
1653
/* Turn the flags of the cmp at the jump site into the result.  mov does
   not modify EFLAGS, so jg still sees that signed comparison: return 1
   when the %edi-side dword is greater, otherwise -1.  */
1654 .p2align 4
1655 L(nequal):
1656 mov $1, %eax
1657 jg L(nequal_bigger)
1658 neg %eax
1659 RETURN
1660
1661 .p2align 4
1662 L(nequal_bigger):
1663 RETURN_END
1664 # endif
1665
1666 CFI_PUSH (%ebx)
1667
1668 .p2align 4
/* Tail dispatch for 8 <= %ecx (L(less48bytes) branches here with jae).
   Counts of 16 and above are passed on to L(more16bytes); 8..15 jump to
   the L(Nbytes) ladder entry for exactly that many bytes.  */
1669 L(more8bytes):
1670 cmp $16, %ecx
1671 jae L(more16bytes)
1672 cmp $8, %ecx
1673 je L(8bytes)
1674 # ifndef USE_AS_WMEMCMP
1675 cmp $9, %ecx
1676 je L(9bytes)
1677 cmp $10, %ecx
1678 je L(10bytes)
1679 cmp $11, %ecx
1680 je L(11bytes)
1681 cmp $12, %ecx
1682 je L(12bytes)
1683 cmp $13, %ecx
1684 je L(13bytes)
1685 cmp $14, %ecx
1686 je L(14bytes)
/* Only 15 is left.  */
1687 jmp L(15bytes)
1688 # else
/* wmemcmp build: any count other than 8 enters at L(12bytes); presumably
   the count is always a multiple of 4 here (it is scaled with shl $2 at
   function entry).  */
1689 jmp L(12bytes)
1690 # endif
1691
1692 .p2align 4
/* Tail dispatch for 16 <= %ecx.  Counts of 24 and above are passed on to
   L(more24bytes); 16..23 jump to the matching L(Nbytes) ladder entry.  */
1693 L(more16bytes):
1694 cmp $24, %ecx
1695 jae L(more24bytes)
1696 cmp $16, %ecx
1697 je L(16bytes)
1698 # ifndef USE_AS_WMEMCMP
1699 cmp $17, %ecx
1700 je L(17bytes)
1701 cmp $18, %ecx
1702 je L(18bytes)
1703 cmp $19, %ecx
1704 je L(19bytes)
1705 cmp $20, %ecx
1706 je L(20bytes)
1707 cmp $21, %ecx
1708 je L(21bytes)
1709 cmp $22, %ecx
1710 je L(22bytes)
/* Only 23 is left.  */
1711 jmp L(23bytes)
1712 # else
/* wmemcmp build: any count other than 16 enters at L(20bytes).  */
1713 jmp L(20bytes)
1714 # endif
1715
1716 .p2align 4
/* Tail dispatch for 24 <= %ecx.  Counts of 32 and above are passed on to
   L(more32bytes); 24..31 jump to the matching L(Nbytes) ladder entry.  */
1717 L(more24bytes):
1718 cmp $32, %ecx
1719 jae L(more32bytes)
1720 cmp $24, %ecx
1721 je L(24bytes)
1722 # ifndef USE_AS_WMEMCMP
1723 cmp $25, %ecx
1724 je L(25bytes)
1725 cmp $26, %ecx
1726 je L(26bytes)
1727 cmp $27, %ecx
1728 je L(27bytes)
1729 cmp $28, %ecx
1730 je L(28bytes)
1731 cmp $29, %ecx
1732 je L(29bytes)
1733 cmp $30, %ecx
1734 je L(30bytes)
/* Only 31 is left.  */
1735 jmp L(31bytes)
1736 # else
/* wmemcmp build: any count other than 24 enters at L(28bytes).  */
1737 jmp L(28bytes)
1738 # endif
1739
1740 .p2align 4
/* Tail dispatch for 32 <= %ecx.  Counts of 40 and above are passed on to
   L(more40bytes); 32..39 jump to the matching L(Nbytes) ladder entry.  */
1741 L(more32bytes):
1742 cmp $40, %ecx
1743 jae L(more40bytes)
1744 cmp $32, %ecx
1745 je L(32bytes)
1746 # ifndef USE_AS_WMEMCMP
1747 cmp $33, %ecx
1748 je L(33bytes)
1749 cmp $34, %ecx
1750 je L(34bytes)
1751 cmp $35, %ecx
1752 je L(35bytes)
1753 cmp $36, %ecx
1754 je L(36bytes)
1755 cmp $37, %ecx
1756 je L(37bytes)
1757 cmp $38, %ecx
1758 je L(38bytes)
/* Only 39 is left.  */
1759 jmp L(39bytes)
1760 # else
/* wmemcmp build: any count other than 32 enters at L(36bytes).  */
1761 jmp L(36bytes)
1762 # endif
1763
1764 .p2align 4
/* Entry of the tail dispatcher.  %ecx is the number of bytes still to be
   compared; the L(Nbytes) ladders address them with negative offsets from
   %eax and %edx.  Counts of 8 and above go to L(more8bytes).
   NOTE(review): in the memcmp build every count that is not 2..6
   (including 0 and 1) ends up at L(7bytes), and in the wmemcmp build every
   count below 8 ends up at L(4bytes); presumably callers guarantee that
   the bytes read this way are valid and, where already compared, equal --
   confirm against the code that jumps here.  */
1765 L(less48bytes):
1766 cmp $8, %ecx
1767 jae L(more8bytes)
1768 # ifndef USE_AS_WMEMCMP
1769 cmp $2, %ecx
1770 je L(2bytes)
1771 cmp $3, %ecx
1772 je L(3bytes)
1773 cmp $4, %ecx
1774 je L(4bytes)
1775 cmp $5, %ecx
1776 je L(5bytes)
1777 cmp $6, %ecx
1778 je L(6bytes)
1779 jmp L(7bytes)
1780 # else
1781 jmp L(4bytes)
1782 # endif
1783
1784 .p2align 4
/* Tail dispatch for 40 <= %ecx (L(more32bytes) branches here with jae).
   No upper bound is checked: in the memcmp build every count that is not
   40..46 is treated as 47.  In the wmemcmp build nothing follows the je
   inside the conditional, so a count other than 40 falls through into the
   wmemcmp L(44bytes) ladder.  */
1785 L(more40bytes):
1786 cmp $40, %ecx
1787 je L(40bytes)
1788 # ifndef USE_AS_WMEMCMP
1789 cmp $41, %ecx
1790 je L(41bytes)
1791 cmp $42, %ecx
1792 je L(42bytes)
1793 cmp $43, %ecx
1794 je L(43bytes)
1795 cmp $44, %ecx
1796 je L(44bytes)
1797 cmp $45, %ecx
1798 je L(45bytes)
1799 cmp $46, %ecx
1800 je L(46bytes)
1801 jmp L(47bytes)
1802
1803 .p2align 4
/* memcmp build: dword ladder for tails whose length is a multiple of 4.
   Entering at L(Nbytes) compares the dwords at offsets -N, -(N-4), ...,
   -4 from %eax with those at the same offsets from %edx.  The first pair
   that differs is handed to L(find_diff) in %ecx (%eax side) and %ebx
   (%edx side).  */
1804 L(44bytes):
1805 mov -44(%eax), %ecx
1806 mov -44(%edx), %ebx
1807 cmp %ebx, %ecx
1808 jne L(find_diff)
1809 L(40bytes):
1810 mov -40(%eax), %ecx
1811 mov -40(%edx), %ebx
1812 cmp %ebx, %ecx
1813 jne L(find_diff)
1814 L(36bytes):
1815 mov -36(%eax), %ecx
1816 mov -36(%edx), %ebx
1817 cmp %ebx, %ecx
1818 jne L(find_diff)
1819 L(32bytes):
1820 mov -32(%eax), %ecx
1821 mov -32(%edx), %ebx
1822 cmp %ebx, %ecx
1823 jne L(find_diff)
1824 L(28bytes):
1825 mov -28(%eax), %ecx
1826 mov -28(%edx), %ebx
1827 cmp %ebx, %ecx
1828 jne L(find_diff)
1829 L(24bytes):
1830 mov -24(%eax), %ecx
1831 mov -24(%edx), %ebx
1832 cmp %ebx, %ecx
1833 jne L(find_diff)
1834 L(20bytes):
1835 mov -20(%eax), %ecx
1836 mov -20(%edx), %ebx
1837 cmp %ebx, %ecx
1838 jne L(find_diff)
1839 L(16bytes):
1840 mov -16(%eax), %ecx
1841 mov -16(%edx), %ebx
1842 cmp %ebx, %ecx
1843 jne L(find_diff)
1844 L(12bytes):
1845 mov -12(%eax), %ecx
1846 mov -12(%edx), %ebx
1847 cmp %ebx, %ecx
1848 jne L(find_diff)
1849 L(8bytes):
1850 mov -8(%eax), %ecx
1851 mov -8(%edx), %ebx
1852 cmp %ebx, %ecx
1853 jne L(find_diff)
1854 L(4bytes):
1855 mov -4(%eax), %ecx
1856 mov -4(%edx), %ebx
1857 cmp %ebx, %ecx
/* %eax is cleared with mov rather than xor because mov leaves the flags
   of the cmp intact for the jne below.  */
1858 mov $0, %eax
1859 jne L(find_diff)
/* Every dword matched: restore %ebx and return 0.  */
1860 POP (%ebx)
1861 ret
/* Unwind information only: the code that follows is again entered with
   %ebx on the stack.  */
1862 CFI_PUSH (%ebx)
1863 # else
1864 .p2align 4
/* wmemcmp build of the dword ladder.  Entering at L(Nbytes) compares the
   dwords at offsets -N, -(N-4), ..., -4 from %eax with those from %edx,
   using cmp with a memory operand so %ebx is not used as a temporary.
   On the first difference L(find_diff) derives the result from the flags
   of that cmp.  */
1865 L(44bytes):
1866 mov -44(%eax), %ecx
1867 cmp -44(%edx), %ecx
1868 jne L(find_diff)
1869 L(40bytes):
1870 mov -40(%eax), %ecx
1871 cmp -40(%edx), %ecx
1872 jne L(find_diff)
1873 L(36bytes):
1874 mov -36(%eax), %ecx
1875 cmp -36(%edx), %ecx
1876 jne L(find_diff)
1877 L(32bytes):
1878 mov -32(%eax), %ecx
1879 cmp -32(%edx), %ecx
1880 jne L(find_diff)
1881 L(28bytes):
1882 mov -28(%eax), %ecx
1883 cmp -28(%edx), %ecx
1884 jne L(find_diff)
1885 L(24bytes):
1886 mov -24(%eax), %ecx
1887 cmp -24(%edx), %ecx
1888 jne L(find_diff)
1889 L(20bytes):
1890 mov -20(%eax), %ecx
1891 cmp -20(%edx), %ecx
1892 jne L(find_diff)
1893 L(16bytes):
1894 mov -16(%eax), %ecx
1895 cmp -16(%edx), %ecx
1896 jne L(find_diff)
1897 L(12bytes):
1898 mov -12(%eax), %ecx
1899 cmp -12(%edx), %ecx
1900 jne L(find_diff)
1901 L(8bytes):
1902 mov -8(%eax), %ecx
1903 cmp -8(%edx), %ecx
1904 jne L(find_diff)
1905 L(4bytes):
1906 mov -4(%eax), %ecx
/* The return value is cleared before the cmp so that the xor does not
   disturb the flags the jne depends on.  */
1907 xor %eax, %eax
1908 cmp -4(%edx), %ecx
1909 jne L(find_diff)
/* Every dword matched: restore %ebx and return 0.  */
1910 POP (%ebx)
1911 ret
/* Unwind information only: the code that follows is again entered with
   %ebx on the stack.  */
1912 CFI_PUSH (%ebx)
1913 # endif
1914
1915 # ifndef USE_AS_WMEMCMP
1916
1917 .p2align 4
/* memcmp only: ladder for tails of N bytes with N % 4 == 1.  Entering at
   L(Nbytes) compares the dwords at offsets -N ... -5 from %eax and %edx,
   then the single remaining byte at offset -1.  A differing dword goes to
   L(find_diff) in %ecx / %ebx.  */
1918 L(45bytes):
1919 mov -45(%eax), %ecx
1920 mov -45(%edx), %ebx
1921 cmp %ebx, %ecx
1922 jne L(find_diff)
1923 L(41bytes):
1924 mov -41(%eax), %ecx
1925 mov -41(%edx), %ebx
1926 cmp %ebx, %ecx
1927 jne L(find_diff)
1928 L(37bytes):
1929 mov -37(%eax), %ecx
1930 mov -37(%edx), %ebx
1931 cmp %ebx, %ecx
1932 jne L(find_diff)
1933 L(33bytes):
1934 mov -33(%eax), %ecx
1935 mov -33(%edx), %ebx
1936 cmp %ebx, %ecx
1937 jne L(find_diff)
1938 L(29bytes):
1939 mov -29(%eax), %ecx
1940 mov -29(%edx), %ebx
1941 cmp %ebx, %ecx
1942 jne L(find_diff)
1943 L(25bytes):
1944 mov -25(%eax), %ecx
1945 mov -25(%edx), %ebx
1946 cmp %ebx, %ecx
1947 jne L(find_diff)
1948 L(21bytes):
1949 mov -21(%eax), %ecx
1950 mov -21(%edx), %ebx
1951 cmp %ebx, %ecx
1952 jne L(find_diff)
1953 L(17bytes):
1954 mov -17(%eax), %ecx
1955 mov -17(%edx), %ebx
1956 cmp %ebx, %ecx
1957 jne L(find_diff)
1958 L(13bytes):
1959 mov -13(%eax), %ecx
1960 mov -13(%edx), %ebx
1961 cmp %ebx, %ecx
1962 jne L(find_diff)
1963 L(9bytes):
1964 mov -9(%eax), %ecx
1965 mov -9(%edx), %ebx
1966 cmp %ebx, %ecx
1967 jne L(find_diff)
1968 L(5bytes):
1969 mov -5(%eax), %ecx
1970 mov -5(%edx), %ebx
1971 cmp %ebx, %ecx
1972 jne L(find_diff)
/* Last byte.  A difference goes straight to L(end), which only needs the
   flags; mov clears %eax without touching them.  */
1973 movzbl -1(%eax), %ecx
1974 cmp -1(%edx), %cl
1975 mov $0, %eax
1976 jne L(end)
/* Everything matched: restore %ebx and return 0.  */
1977 POP (%ebx)
1978 ret
/* Unwind information only: the code that follows is again entered with
   %ebx on the stack.  */
1979 CFI_PUSH (%ebx)
1980
1981 .p2align 4
/* memcmp only: ladder for tails of N bytes with N % 4 == 2.  Entering at
   L(Nbytes) compares the dwords at offsets -N ... -6 from %eax and %edx,
   then the remaining two bytes at offset -2.  A differing dword goes to
   L(find_diff) in %ecx / %ebx.  */
1982 L(46bytes):
1983 mov -46(%eax), %ecx
1984 mov -46(%edx), %ebx
1985 cmp %ebx, %ecx
1986 jne L(find_diff)
1987 L(42bytes):
1988 mov -42(%eax), %ecx
1989 mov -42(%edx), %ebx
1990 cmp %ebx, %ecx
1991 jne L(find_diff)
1992 L(38bytes):
1993 mov -38(%eax), %ecx
1994 mov -38(%edx), %ebx
1995 cmp %ebx, %ecx
1996 jne L(find_diff)
1997 L(34bytes):
1998 mov -34(%eax), %ecx
1999 mov -34(%edx), %ebx
2000 cmp %ebx, %ecx
2001 jne L(find_diff)
2002 L(30bytes):
2003 mov -30(%eax), %ecx
2004 mov -30(%edx), %ebx
2005 cmp %ebx, %ecx
2006 jne L(find_diff)
2007 L(26bytes):
2008 mov -26(%eax), %ecx
2009 mov -26(%edx), %ebx
2010 cmp %ebx, %ecx
2011 jne L(find_diff)
2012 L(22bytes):
2013 mov -22(%eax), %ecx
2014 mov -22(%edx), %ebx
2015 cmp %ebx, %ecx
2016 jne L(find_diff)
2017 L(18bytes):
2018 mov -18(%eax), %ecx
2019 mov -18(%edx), %ebx
2020 cmp %ebx, %ecx
2021 jne L(find_diff)
2022 L(14bytes):
2023 mov -14(%eax), %ecx
2024 mov -14(%edx), %ebx
2025 cmp %ebx, %ecx
2026 jne L(find_diff)
2027 L(10bytes):
2028 mov -10(%eax), %ecx
2029 mov -10(%edx), %ebx
2030 cmp %ebx, %ecx
2031 jne L(find_diff)
2032 L(6bytes):
2033 mov -6(%eax), %ecx
2034 mov -6(%edx), %ebx
2035 cmp %ebx, %ecx
2036 jne L(find_diff)
2037 L(2bytes):
/* Last two bytes, loaded as one word each.  The low byte (lower address)
   is compared first, then the high byte; L(end) turns the flags of the
   deciding compare into the result.  */
2038 movzwl -2(%eax), %ecx
2039 movzwl -2(%edx), %ebx
2040 cmp %bl, %cl
2041 jne L(end)
2042 cmp %bh, %ch
/* mov keeps the flags of the cmp above intact for the jne.  */
2043 mov $0, %eax
2044 jne L(end)
/* Everything matched: restore %ebx and return 0.  */
2045 POP (%ebx)
2046 ret
/* Unwind information only: the code that follows is again entered with
   %ebx on the stack.  */
2047 CFI_PUSH (%ebx)
2048
2049 .p2align 4
/* memcmp only: ladder for tails of N bytes with N % 4 == 3.  Entering at
   L(Nbytes) compares the dwords at offsets -N ... -7 from %eax and %edx,
   then the remaining three bytes (a word at -3 and a byte at -1).  A
   differing dword goes to L(find_diff) in %ecx / %ebx.  */
2050 L(47bytes):
2051 movl -47(%eax), %ecx
2052 movl -47(%edx), %ebx
2053 cmp %ebx, %ecx
2054 jne L(find_diff)
2055 L(43bytes):
2056 movl -43(%eax), %ecx
2057 movl -43(%edx), %ebx
2058 cmp %ebx, %ecx
2059 jne L(find_diff)
2060 L(39bytes):
2061 movl -39(%eax), %ecx
2062 movl -39(%edx), %ebx
2063 cmp %ebx, %ecx
2064 jne L(find_diff)
2065 L(35bytes):
2066 movl -35(%eax), %ecx
2067 movl -35(%edx), %ebx
2068 cmp %ebx, %ecx
2069 jne L(find_diff)
2070 L(31bytes):
2071 movl -31(%eax), %ecx
2072 movl -31(%edx), %ebx
2073 cmp %ebx, %ecx
2074 jne L(find_diff)
2075 L(27bytes):
2076 movl -27(%eax), %ecx
2077 movl -27(%edx), %ebx
2078 cmp %ebx, %ecx
2079 jne L(find_diff)
2080 L(23bytes):
2081 movl -23(%eax), %ecx
2082 movl -23(%edx), %ebx
2083 cmp %ebx, %ecx
2084 jne L(find_diff)
2085 L(19bytes):
2086 movl -19(%eax), %ecx
2087 movl -19(%edx), %ebx
2088 cmp %ebx, %ecx
2089 jne L(find_diff)
2090 L(15bytes):
2091 movl -15(%eax), %ecx
2092 movl -15(%edx), %ebx
2093 cmp %ebx, %ecx
2094 jne L(find_diff)
2095 L(11bytes):
2096 movl -11(%eax), %ecx
2097 movl -11(%edx), %ebx
2098 cmp %ebx, %ecx
2099 jne L(find_diff)
2100 L(7bytes):
2101 movl -7(%eax), %ecx
2102 movl -7(%edx), %ebx
2103 cmp %ebx, %ecx
2104 jne L(find_diff)
2105 L(3bytes):
/* Bytes at -3 and -2, loaded as one word each: the low byte (lower
   address) decides first; if it is equal, the 16-bit compare is decided
   by the high byte.  */
2106 movzwl -3(%eax), %ecx
2107 movzwl -3(%edx), %ebx
2108 cmpb %bl, %cl
2109 jne L(end)
2110 cmp %bx, %cx
2111 jne L(end)
/* Last byte.  %eax is no longer needed as a pointer and is reused for the
   loaded byte; mov then clears it without touching the flags of the
   cmpb.  */
2112 movzbl -1(%eax), %eax
2113 cmpb -1(%edx), %al
2114 mov $0, %eax
2115 jne L(end)
/* Everything matched: restore %ebx and return 0.  */
2116 POP (%ebx)
2117 ret
/* Unwind information only: the code that follows is again entered with
   %ebx on the stack.  */
2118 CFI_PUSH (%ebx)
2119
2120 .p2align 4
/* memcmp only.  %ecx (%eax side) and %ebx (%edx side) hold two dwords that
   the ladders found to differ.  Locate the first differing byte in memory
   order: x86 is little-endian, so that is the lowest byte first.  Each
   16-bit compare is reached only when the low byte is equal, so it is
   decided by the high byte.  */
2121 L(find_diff):
2122 cmpb %bl, %cl
2123 jne L(end)
2124 cmp %bx, %cx
2125 jne L(end)
/* Bytes 0 and 1 match: repeat for bytes 2 and 3.  */
2126 shr $16,%ecx
2127 shr $16,%ebx
2128 cmp %bl, %cl
2129 jne L(end)
2130 cmp %bx, %cx
2131
/* Falls through with the flags of the last compare.  */
2132 .p2align 4
2133 L(end):
/* popl and mov do not modify EFLAGS, so ja still sees the compare made
   before L(end) was reached.  Unsigned: return 1 when the %eax-side value
   is above the %edx-side value, otherwise -1.  */
2134 POP (%ebx)
2135 mov $1, %eax
2136 ja L(bigger)
2137 neg %eax
2138 L(bigger):
2139 ret
2140 # else
2141
2142 /* for wmemcmp */
/* Reached from the wmemcmp ladder with the flags of the cmp that found
   two differing dwords.  popl and mov do not modify EFLAGS, so jg still
   sees that compare.  Signed: return 1 when the %eax-side element is
   greater, otherwise -1.  */
2143 .p2align 4
2144 L(find_diff):
2145 POP (%ebx)
2146 mov $1, %eax
2147 jg L(find_diff_bigger)
2148 neg %eax
2149 ret
2150
2151 .p2align 4
2152 L(find_diff_bigger):
2153 ret
2154
2155 # endif
2156 END (MEMCMP)
2157 #endif