1 /* Optimized wcscmp for x86-64 with SSE2.
2 Copyright (C) 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26 * This implementation uses SSE2 to compare up to 16 bytes (four double_words) at a time.
/* Entry setup: zero the register used for null checks and mask eax/edx
   down to an offset inside a 64-byte cache line.  */
30 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
33 and $63, %eax /* rsi alignment in cache line: keep only the low 6 bits (0..63) */
34 and $63, %edx /* rdi alignment in cache line: keep only the low 6 bits (0..63) */
/* Check the 16-byte chunks at offsets 16, 32 and 48 of both strings using
   unaligned loads.  A non-zero edx after each sub leaves through the exit
   label for that offset; otherwise control reaches the jmp at the end.  */
80 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
81 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
82 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
83 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
84 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
86 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
87 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
89 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
90 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
91 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
92 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
93 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
95 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
96 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
98 movdqu 48(%rdi), %xmm1 /* unaligned 16-byte load from rdi+48 */
99 movdqu 48(%rsi), %xmm2 /* unaligned 16-byte load from rsi+48 */
100 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
101 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
102 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
104 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
105 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
109 jmp L(continue_48_48) /* no exit taken: go on at continue_48_48 */
/* Check the 16-byte chunks at offsets 16 and 32 of both strings using
   unaligned loads; a non-zero edx leaves through the matching exit label.  */
147 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
148 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
149 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
150 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
151 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
153 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
154 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
156 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
157 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
158 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
159 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
160 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
162 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
163 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
/* Per-double_word compare of xmm0 against the 16 bytes at (%rdi).  A
   legacy-SSE pcmpeqd with a memory operand requires that address to be
   16-byte aligned.  */
206 pcmpeqd (%rdi), %xmm0 /* xmm0 = per-double_word equality mask of xmm0 vs. memory at rdi */
210 jnz L(less4_double_words1) /* ZF clear: leave through less4_double_words1 */
/* Check the chunks at offsets 16, 32 and 48.  The rsi side is loaded with
   movdqu (no alignment requirement); the rdi side is used directly as a
   pcmpeqd memory operand, which requires a 16-byte-aligned address.  */
227 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
228 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
229 pcmpeqd 16(%rdi), %xmm2 /* compare with the 4 double_words at rdi+16 for equality */
230 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
232 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
233 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
235 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
236 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
237 pcmpeqd 32(%rdi), %xmm2 /* compare with the 4 double_words at rdi+32 for equality */
238 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
240 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
241 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
243 movdqu 48(%rsi), %xmm2 /* unaligned 16-byte load from rsi+48 */
244 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
245 pcmpeqd 48(%rdi), %xmm2 /* compare with the 4 double_words at rdi+48 for equality */
246 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
248 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
249 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
253 jmp L(continue_00_48) /* no exit taken: go on at continue_00_48 */
/* Check the 16-byte chunks at offsets 32 and 48 of both strings using
   unaligned loads, then continue at continue_32_48 if neither exit fires.  */
316 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
317 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
318 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
319 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
320 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
322 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
323 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
325 movdqu 48(%rdi), %xmm1 /* unaligned 16-byte load from rdi+48 */
326 movdqu 48(%rsi), %xmm2 /* unaligned 16-byte load from rsi+48 */
327 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
328 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
329 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
331 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
332 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
336 jmp L(continue_32_48) /* no exit taken: go on at continue_32_48 */
/* Check the 16-byte chunk at offset 16 of both strings (unaligned loads).  */
375 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
376 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
377 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
378 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
379 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
381 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
382 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
/* Check the 16-byte chunk at offset 48 of both strings (unaligned loads),
   then continue at continue_16_48 if the exit is not taken.  */
408 movdqu 48(%rdi), %xmm1 /* unaligned 16-byte load from rdi+48 */
409 movdqu 48(%rsi), %xmm2 /* unaligned 16-byte load from rsi+48 */
410 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
411 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
412 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
414 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
415 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
419 jmp L(continue_16_48) /* no exit taken: go on at continue_16_48 */
/* Check the chunks at offsets 0, 16, 32 and 48 with aligned accesses on
   both sides: movdqa for the rdi loads and pcmpeqd memory operands for
   rsi.  Both forms require 16-byte-aligned addresses.  */
424 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
425 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words (memory at rsi) for equality */
426 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
428 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
429 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
431 movdqa 16(%rdi), %xmm3 /* aligned 16-byte load from rdi+16 */
432 pcmpeqd %xmm3, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
433 pcmpeqd 16(%rsi), %xmm3 /* compare with the 4 double_words at rsi+16 for equality */
434 psubb %xmm0, %xmm3 /* packed sub of comparison results: equality mask minus null mask, bytewise */
436 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
437 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
439 movdqa 32(%rdi), %xmm5 /* aligned 16-byte load from rdi+32 */
440 pcmpeqd %xmm5, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
441 pcmpeqd 32(%rsi), %xmm5 /* compare with the 4 double_words at rsi+32 for equality */
442 psubb %xmm0, %xmm5 /* packed sub of comparison results: equality mask minus null mask, bytewise */
444 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
445 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
447 movdqa 48(%rdi), %xmm1 /* aligned 16-byte load from rdi+48 */
448 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
449 pcmpeqd 48(%rsi), %xmm1 /* compare with the 4 double_words at rsi+48 for equality */
450 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
452 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
453 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
457 jmp L(continue_00_00) /* no exit taken: go on at continue_00_00 */
/* Check the chunk at offset 0: xmm2 against the 16 bytes at (%rdi), which
   pcmpeqd requires to be 16-byte aligned.  Then go on at continue_00_48.  */
462 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
463 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words (memory at rdi) for equality */
464 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
466 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
467 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
471 jmp L(continue_00_48) /* no exit taken: go on at continue_00_48 */
/* Check the chunks at offsets 0 and 16.  The rsi side is held in xmm2
   (loaded unaligned for offset 16); the rdi side is a pcmpeqd memory
   operand and so must be 16-byte aligned.  Then go on at continue_00_48.  */
476 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
477 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words (memory at rdi) for equality */
478 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
480 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
481 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
483 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
484 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
485 pcmpeqd 16(%rdi), %xmm2 /* compare with the 4 double_words at rdi+16 for equality */
486 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
488 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
489 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
493 jmp L(continue_00_48) /* no exit taken: go on at continue_00_48 */
/* Check the chunks at offsets 0, 16 and 32.  The rsi side is held in xmm2
   (loaded unaligned for offsets 16 and 32); the rdi side is a pcmpeqd
   memory operand and so must be 16-byte aligned.  Then go on at
   continue_00_48.  */
498 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
499 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words (memory at rdi) for equality */
500 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
502 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
503 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
505 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
506 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
507 pcmpeqd 16(%rdi), %xmm2 /* compare with the 4 double_words at rdi+16 for equality */
508 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
510 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
511 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
513 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
514 pcmpeqd %xmm2, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
515 pcmpeqd 32(%rdi), %xmm2 /* compare with the 4 double_words at rdi+32 for equality */
516 psubb %xmm0, %xmm2 /* packed sub of comparison results: equality mask minus null mask, bytewise */
518 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
519 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
523 jmp L(continue_00_48) /* no exit taken: go on at continue_00_48 */
/* Per-double_word compare of xmm0 against the 16 bytes at (%rsi).  A
   legacy-SSE pcmpeqd with a memory operand requires that address to be
   16-byte aligned.  */
527 pcmpeqd (%rsi), %xmm0 /* xmm0 = per-double_word equality mask of xmm0 vs. memory at rsi */
531 jnz L(less4_double_words1) /* ZF clear: leave through less4_double_words1 */
/* Check the chunks at offsets 16, 32 and 48.  The rdi side is loaded with
   movdqu (no alignment requirement); the rsi side is used directly as a
   pcmpeqd memory operand, which requires a 16-byte-aligned address.  */
548 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
549 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
550 pcmpeqd 16(%rsi), %xmm1 /* compare with the 4 double_words at rsi+16 for equality */
551 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
553 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
554 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
556 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
557 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
558 pcmpeqd 32(%rsi), %xmm1 /* compare with the 4 double_words at rsi+32 for equality */
559 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
561 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
562 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
564 movdqu 48(%rdi), %xmm1 /* unaligned 16-byte load from rdi+48 */
565 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
566 pcmpeqd 48(%rsi), %xmm1 /* compare with the 4 double_words at rsi+48 for equality */
567 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
569 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
570 jnz L(less4_double_words_48) /* edx != 0: take the offset-48 exit */
574 jmp L(continue_48_00) /* no exit taken: go on at continue_48_00 */
/* Check the chunk at offset 0: xmm1 against the 16 bytes at (%rsi), which
   pcmpeqd requires to be 16-byte aligned.  Then go on at continue_48_00.  */
579 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
580 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words (memory at rsi) for equality */
581 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
583 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
584 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
588 jmp L(continue_48_00) /* no exit taken: go on at continue_48_00 */
/* Check the chunks at offsets 0 and 16.  The rdi side is held in xmm1
   (loaded unaligned for offset 16); the rsi side is a pcmpeqd memory
   operand and so must be 16-byte aligned.  Then go on at continue_48_00.  */
593 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
594 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words (memory at rsi) for equality */
595 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
597 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
598 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
600 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
601 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
602 pcmpeqd 16(%rsi), %xmm1 /* compare with the 4 double_words at rsi+16 for equality */
603 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
605 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
606 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
610 jmp L(continue_48_00) /* no exit taken: go on at continue_48_00 */
/* Check the chunks at offsets 0, 16 and 32.  The rdi side is held in xmm1
   (loaded unaligned for offsets 16 and 32); the rsi side is a pcmpeqd
   memory operand and so must be 16-byte aligned.  Then go on at
   continue_48_00.  */
615 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
616 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words (memory at rsi) for equality */
617 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
619 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
620 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
622 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
623 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
624 pcmpeqd 16(%rsi), %xmm1 /* compare with the 4 double_words at rsi+16 for equality */
625 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
627 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
628 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
630 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
631 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
632 pcmpeqd 32(%rsi), %xmm1 /* compare with the 4 double_words at rsi+32 for equality */
633 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
635 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
636 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
640 jmp L(continue_48_00) /* no exit taken: go on at continue_48_00 */
/* Check the chunk at offset 0, register against register (xmm1 vs. xmm2),
   then go on at continue_48_48.  */
646 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
647 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
648 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
650 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
651 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
655 jmp L(continue_48_48) /* no exit taken: go on at continue_48_48 */
/* Check the chunks at offsets 0 and 16, register against register.  The
   offset-16 chunk is loaded unaligned into xmm3/xmm4.  Then go on at
   continue_48_48.  */
661 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
662 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
663 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
665 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
666 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
668 movdqu 16(%rdi), %xmm3 /* unaligned 16-byte load from rdi+16 */
669 movdqu 16(%rsi), %xmm4 /* unaligned 16-byte load from rsi+16 */
670 pcmpeqd %xmm3, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
671 pcmpeqd %xmm4, %xmm3 /* compare the 4 double_words at offset 16 for equality */
672 psubb %xmm0, %xmm3 /* packed sub of comparison results: equality mask minus null mask, bytewise */
674 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
675 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
679 jmp L(continue_48_48) /* no exit taken: go on at continue_48_48 */
/* Check the chunks at offsets 0, 16 and 32, register against register.
   The later chunks are loaded unaligned from both strings.  Then go on at
   continue_48_48.  */
685 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
686 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
687 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
689 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
690 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
692 movdqu 16(%rdi), %xmm3 /* unaligned 16-byte load from rdi+16 */
693 movdqu 16(%rsi), %xmm4 /* unaligned 16-byte load from rsi+16 */
694 pcmpeqd %xmm3, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
695 pcmpeqd %xmm4, %xmm3 /* compare the 4 double_words at offset 16 for equality */
696 psubb %xmm0, %xmm3 /* packed sub of comparison results: equality mask minus null mask, bytewise */
698 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
699 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
701 movdqu 32(%rdi), %xmm1 /* unaligned 16-byte load from rdi+32 */
702 movdqu 32(%rsi), %xmm2 /* unaligned 16-byte load from rsi+32 */
703 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
704 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
705 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
707 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
708 jnz L(less4_double_words_32) /* edx != 0: take the offset-32 exit */
712 jmp L(continue_48_48) /* no exit taken: go on at continue_48_48 */
/* Check the chunks at offsets 0 and 16, register against register.  The
   offset-16 chunk is loaded unaligned into xmm1/xmm2.  Then go on at
   continue_32_48.  */
718 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
719 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
720 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
722 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
723 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
725 movdqu 16(%rdi), %xmm1 /* unaligned 16-byte load from rdi+16 */
726 movdqu 16(%rsi), %xmm2 /* unaligned 16-byte load from rsi+16 */
727 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
728 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
729 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
731 sub $0xffff, %edx /* if these 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
732 jnz L(less4_double_words_16) /* edx != 0: take the offset-16 exit */
736 jmp L(continue_32_48) /* no exit taken: go on at continue_32_48 */
/* Check the chunk at offset 0, register against register (xmm1 vs. xmm2),
   then go on at continue_16_48.  */
742 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
743 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
744 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
746 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
747 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
751 jmp L(continue_16_48) /* no exit taken: go on at continue_16_48 */
/* Check the chunk at offset 0, register against register (xmm1 vs. xmm2),
   then go on at continue_32_48.  */
757 pcmpeqd %xmm1, %xmm0 /* Any null double_word? Result mask overwrites xmm0 */
758 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
759 psubb %xmm0, %xmm1 /* packed sub of comparison results: equality mask minus null mask, bytewise */
761 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff; the sub then leaves 0 */
762 jnz L(less4_double_words) /* edx != 0: take the offset-0 exit */
766 jmp L(continue_32_48) /* no exit taken: go on at continue_32_48 */
/* Exit used by the two jnz sites that directly follow a
   `pcmpeqd (%rdi), %xmm0' / `pcmpeqd (%rsi), %xmm0' check.  */
769 L(less4_double_words1):
/* Exit family for the chunk at offset 0: target of every
   `jnz L(less4_double_words)' that follows a check of the first 4
   double_words.  The labels below are named after the double_word
   position they deal with (presumably where the difference or the null
   was found -- TODO confirm against the handler bodies).  */
793 L(less4_double_words):
795 jz L(next_two_double_words) /* ZF set: go to the handler for the upper two double_words */
797 jz L(second_double_word) /* ZF set: go to the second double_word handler */
803 L(second_double_word):
809 L(next_two_double_words):
811 jz L(fourth_double_word) /* ZF set: go to the fourth double_word handler */
817 L(fourth_double_word):
/* Exit family for the chunk at offset 16: target of every
   `jnz L(less4_double_words_16)'.  Same label layout as the offset-0
   family, with a _16 suffix.  */
823 L(less4_double_words_16):
825 jz L(next_two_double_words_16) /* ZF set: go to the handler for the upper two double_words */
827 jz L(second_double_word_16) /* ZF set: go to the second double_word handler */
833 L(second_double_word_16):
839 L(next_two_double_words_16):
841 jz L(fourth_double_word_16) /* ZF set: go to the fourth double_word handler */
847 L(fourth_double_word_16):
/* Exit family for the chunk at offset 32: target of every
   `jnz L(less4_double_words_32)'.  Same label layout as the offset-0
   family, with a _32 suffix.  */
853 L(less4_double_words_32):
855 jz L(next_two_double_words_32) /* ZF set: go to the handler for the upper two double_words */
857 jz L(second_double_word_32) /* ZF set: go to the second double_word handler */
863 L(second_double_word_32):
869 L(next_two_double_words_32):
871 jz L(fourth_double_word_32) /* ZF set: go to the fourth double_word handler */
877 L(fourth_double_word_32):
/* Exit family for the chunk at offset 48: target of every
   `jnz L(less4_double_words_48)'.  Same label layout as the offset-0
   family, with a _48 suffix.  */
883 L(less4_double_words_48):
885 jz L(next_two_double_words_48) /* ZF set: go to the handler for the upper two double_words */
887 jz L(second_double_word_48) /* ZF set: go to the second double_word handler */
893 L(second_double_word_48):
899 L(next_two_double_words_48):
901 jz L(fourth_double_word_48) /* ZF set: go to the fourth double_word handler */
907 L(fourth_double_word_48):
/* NOTE(review): libc_hidden_def is a macro defined outside this file;
   presumably it provides the hidden (library-internal) definition of
   wcscmp -- confirm against the glibc headers.  */
931 libc_hidden_def (wcscmp)