1 /* wcscmp optimized with SSE2.
2 Copyright (C) 2018-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
21 /* ISA level >= 2 because there is no wcscmp-sse4 implementation. */
22 #if ISA_SHOULD_BUILD (2)
25 /* Needed to get right name. */
26 # define USE_AS_WCSCMP
27 # define STRCMP_ISA _sse2
28 # include "strcmp-naming.h"
30 /* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. */
35 * This implementation uses SSE to compare up to 16 bytes at a time.
/* Register setup followed by three 16-byte compare steps.
   %xmm0 is zeroed so that pcmpeqd against it flags null wide characters;
   %eax and %edx are masked to their low 6 bits.
   NOTE(review): the instructions that load %eax/%edx, and the one that
   refreshes %edx before each `sub`, are not visible in this excerpt --
   confirm against the full file.  */
39 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
42 and $63, %eax /* rsi alignment in cache line */
43 and $63, %edx /* rdi alignment in cache line */
/* Unaligned loads (movdqu) of the groups at offsets 16, 32 and 48.  Each
   step leaves through a less4_double_words_NN label when the group differs
   or contains a null.  */
89 movdqu 16(%rdi), %xmm1
90 movdqu 16(%rsi), %xmm2
91 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
92 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
93 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
95 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
96 jnz L(less4_double_words_16)
98 movdqu 32(%rdi), %xmm1
99 movdqu 32(%rsi), %xmm2
100 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
101 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
102 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
104 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
105 jnz L(less4_double_words_32)
107 movdqu 48(%rdi), %xmm1
108 movdqu 48(%rsi), %xmm2
109 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
110 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
111 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
113 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
114 jnz L(less4_double_words_48)
118 jmp L(continue_48_48)
/* Compare steps using unaligned loads from both strings at offsets 16 and
   32.  NOTE(review): the label that starts this run and the instruction
   that sets %edx before each `sub` are not visible in this excerpt.  */
156 movdqu 16(%rdi), %xmm1
157 movdqu 16(%rsi), %xmm2
158 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
159 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
160 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
162 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
163 jnz L(less4_double_words_16)
165 movdqu 32(%rdi), %xmm1
166 movdqu 32(%rsi), %xmm2
167 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
168 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
169 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
171 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
172 jnz L(less4_double_words_32)
/* Compare %xmm0 directly against the 16 bytes at (%rdi); presumably a null
   check with %xmm0 still zero here -- the instruction that sets the flags
   for the jnz is not visible in this excerpt.  */
215 pcmpeqd (%rdi), %xmm0
219 jnz L(less4_double_words1)
/* From here the %rsi side is loaded with movdqu and the %rdi side is used
   as a memory operand of pcmpeqd.  */
236 movdqu 16(%rsi), %xmm2
237 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
238 pcmpeqd 16(%rdi), %xmm2 /* compare the 4 double_words at offset 16 for equality */
239 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
241 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
242 jnz L(less4_double_words_16)
244 movdqu 32(%rsi), %xmm2
245 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
246 pcmpeqd 32(%rdi), %xmm2 /* compare the 4 double_words at offset 32 for equality */
247 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
249 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
250 jnz L(less4_double_words_32)
252 movdqu 48(%rsi), %xmm2
253 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
254 pcmpeqd 48(%rdi), %xmm2 /* compare the 4 double_words at offset 48 for equality */
255 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
257 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
258 jnz L(less4_double_words_48)
262 jmp L(continue_00_48)
/* Two runs of unaligned compare steps; the first ends by jumping to
   continue_32_48, the second to continue_16_48.
   NOTE(review): the labels that start these runs and the instruction that
   sets %edx before each `sub` are not visible in this excerpt.  */
325 movdqu 32(%rdi), %xmm1
326 movdqu 32(%rsi), %xmm2
327 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
328 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
329 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
331 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
332 jnz L(less4_double_words_32)
334 movdqu 48(%rdi), %xmm1
335 movdqu 48(%rsi), %xmm2
336 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
337 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
338 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
340 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
341 jnz L(less4_double_words_48)
345 jmp L(continue_32_48)
384 movdqu 16(%rdi), %xmm1
385 movdqu 16(%rsi), %xmm2
386 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
387 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
388 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
390 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
391 jnz L(less4_double_words_16)
417 movdqu 48(%rdi), %xmm1
418 movdqu 48(%rsi), %xmm2
419 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
420 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 48 for equality */
421 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
423 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
424 jnz L(less4_double_words_48)
428 jmp L(continue_16_48)
/* Four compare steps (offsets 0, 16, 32, 48) that use movdqa for the %rdi
   side and a pcmpeqd memory operand for the %rsi side, then jump to
   continue_00_00.
   NOTE(review): the load of %xmm1 for offset 0 and the instruction that
   sets %edx before each `sub` are not visible in this excerpt.  */
433 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
434 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
435 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
437 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
438 jnz L(less4_double_words)
440 movdqa 16(%rdi), %xmm3
441 pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
442 pcmpeqd 16(%rsi), %xmm3 /* compare the 4 double_words at offset 16 for equality */
443 psubb %xmm0, %xmm3 /* packed sub of comparison results*/
445 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
446 jnz L(less4_double_words_16)
448 movdqa 32(%rdi), %xmm5
449 pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
450 pcmpeqd 32(%rsi), %xmm5 /* compare the 4 double_words at offset 32 for equality */
451 psubb %xmm0, %xmm5 /* packed sub of comparison results*/
453 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
454 jnz L(less4_double_words_32)
456 movdqa 48(%rdi), %xmm1
457 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
458 pcmpeqd 48(%rsi), %xmm1 /* compare the 4 double_words at offset 48 for equality */
459 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
461 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
462 jnz L(less4_double_words_48)
466 jmp L(continue_00_00)
/* Three runs that each end by jumping to continue_00_48.  They perform one,
   two and three compare steps respectively; the %rsi side is held in %xmm2
   and the %rdi side is a pcmpeqd memory operand.
   NOTE(review): the labels that start each run, the offset-0 load of
   %xmm2 and the instruction that sets %edx before each `sub` are not
   visible in this excerpt.  */
471 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
472 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
473 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
475 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
476 jnz L(less4_double_words)
480 jmp L(continue_00_48)
485 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
486 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
487 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
489 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
490 jnz L(less4_double_words)
492 movdqu 16(%rsi), %xmm2
493 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
494 pcmpeqd 16(%rdi), %xmm2 /* compare the 4 double_words at offset 16 for equality */
495 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
497 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
498 jnz L(less4_double_words_16)
502 jmp L(continue_00_48)
507 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
508 pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
509 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
511 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
512 jnz L(less4_double_words)
514 movdqu 16(%rsi), %xmm2
515 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
516 pcmpeqd 16(%rdi), %xmm2 /* compare the 4 double_words at offset 16 for equality */
517 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
519 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
520 jnz L(less4_double_words_16)
522 movdqu 32(%rsi), %xmm2
523 pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
524 pcmpeqd 32(%rdi), %xmm2 /* compare the 4 double_words at offset 32 for equality */
525 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
527 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
528 jnz L(less4_double_words_32)
532 jmp L(continue_00_48)
/* Runs that end by jumping to continue_48_00.  Here the %rdi side is loaded
   with movdqu into %xmm1 and the %rsi side is used as a pcmpeqd memory
   operand.
   NOTE(review): the labels that start each run, the offset-0 loads of
   %xmm1 and the instruction that sets %edx before each `sub` are not
   visible in this excerpt.  */
/* Compare %xmm0 directly against the 16 bytes at (%rsi); presumably a null
   check with %xmm0 still zero here -- the instruction that sets the flags
   for the jnz is not visible in this excerpt.  */
536 pcmpeqd (%rsi), %xmm0
540 jnz L(less4_double_words1)
557 movdqu 16(%rdi), %xmm1
558 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
559 pcmpeqd 16(%rsi), %xmm1 /* compare the 4 double_words at offset 16 for equality */
560 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
562 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
563 jnz L(less4_double_words_16)
565 movdqu 32(%rdi), %xmm1
566 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
567 pcmpeqd 32(%rsi), %xmm1 /* compare the 4 double_words at offset 32 for equality */
568 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
570 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
571 jnz L(less4_double_words_32)
573 movdqu 48(%rdi), %xmm1
574 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
575 pcmpeqd 48(%rsi), %xmm1 /* compare the 4 double_words at offset 48 for equality */
576 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
578 sub $0xffff, %edx /* if the 4 double_words at offset 48 are same, edx == 0xffff before this sub */
579 jnz L(less4_double_words_48)
583 jmp L(continue_48_00)
588 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
589 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
590 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
592 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
593 jnz L(less4_double_words)
597 jmp L(continue_48_00)
602 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
603 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
604 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
606 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
607 jnz L(less4_double_words)
609 movdqu 16(%rdi), %xmm1
610 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
611 pcmpeqd 16(%rsi), %xmm1 /* compare the 4 double_words at offset 16 for equality */
612 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
614 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
615 jnz L(less4_double_words_16)
619 jmp L(continue_48_00)
624 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
625 pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
626 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
628 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
629 jnz L(less4_double_words)
631 movdqu 16(%rdi), %xmm1
632 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
633 pcmpeqd 16(%rsi), %xmm1 /* compare the 4 double_words at offset 16 for equality */
634 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
636 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
637 jnz L(less4_double_words_16)
639 movdqu 32(%rdi), %xmm1
640 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
641 pcmpeqd 32(%rsi), %xmm1 /* compare the 4 double_words at offset 32 for equality */
642 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
644 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
645 jnz L(less4_double_words_32)
649 jmp L(continue_48_00)
/* Three runs that each end by jumping to continue_48_48.  They perform one,
   two and three register-to-register compare steps respectively.
   NOTE(review): the labels that start each run, the loads that fill
   %xmm1/%xmm2 for the first step of each run, and the instruction that
   sets %edx before each `sub` are not visible in this excerpt.  */
655 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
656 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
657 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
659 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
660 jnz L(less4_double_words)
664 jmp L(continue_48_48)
670 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
671 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
672 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
674 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
675 jnz L(less4_double_words)
677 movdqu 16(%rdi), %xmm3
678 movdqu 16(%rsi), %xmm4
679 pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
680 pcmpeqd %xmm4, %xmm3 /* compare the 4 double_words at offset 16 for equality */
681 psubb %xmm0, %xmm3 /* packed sub of comparison results*/
683 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
684 jnz L(less4_double_words_16)
688 jmp L(continue_48_48)
694 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
695 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
696 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
698 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
699 jnz L(less4_double_words)
701 movdqu 16(%rdi), %xmm3
702 movdqu 16(%rsi), %xmm4
703 pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
704 pcmpeqd %xmm4, %xmm3 /* compare the 4 double_words at offset 16 for equality */
705 psubb %xmm0, %xmm3 /* packed sub of comparison results*/
707 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
708 jnz L(less4_double_words_16)
710 movdqu 32(%rdi), %xmm1
711 movdqu 32(%rsi), %xmm2
712 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
713 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 32 for equality */
714 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
716 sub $0xffff, %edx /* if the 4 double_words at offset 32 are same, edx == 0xffff before this sub */
717 jnz L(less4_double_words_32)
721 jmp L(continue_48_48)
/* Three short runs ending in jumps to continue_32_48, continue_16_48 and
   continue_32_48 respectively.
   NOTE(review): the labels that start each run, the loads that fill
   %xmm1/%xmm2 for the first step of each run, and the instruction that
   sets %edx before each `sub` are not visible in this excerpt.  */
727 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
728 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
729 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
731 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
732 jnz L(less4_double_words)
734 movdqu 16(%rdi), %xmm1
735 movdqu 16(%rsi), %xmm2
736 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
737 pcmpeqd %xmm2, %xmm1 /* compare the 4 double_words at offset 16 for equality */
738 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
740 sub $0xffff, %edx /* if the 4 double_words at offset 16 are same, edx == 0xffff before this sub */
741 jnz L(less4_double_words_16)
745 jmp L(continue_32_48)
751 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
752 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
753 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
755 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
756 jnz L(less4_double_words)
760 jmp L(continue_16_48)
766 pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
767 pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
768 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
770 sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
771 jnz L(less4_double_words)
775 jmp L(continue_32_48)
/* Exit labels targeted by the `jnz` after each 16-byte compare step.  At
   every visible jump site the _16/_32/_48 suffix matches the offset of the
   group that was just compared; the unsuffixed labels are used after
   offset-0 compares.
   NOTE(review): only the labels and their internal `jz` dispatch are
   visible in this excerpt; the instructions that set the flags and compute
   the return value are not shown, so the exact per-label behaviour must be
   confirmed against the full file.  */
778 L(less4_double_words1):
803 L(less4_double_words):
806 jz L(next_two_double_words)
808 jz L(second_double_word)
815 L(second_double_word):
822 L(next_two_double_words):
824 jz L(fourth_double_word)
831 L(fourth_double_word):
/* Same label layout for the group at offset 16.  */
838 L(less4_double_words_16):
841 jz L(next_two_double_words_16)
843 jz L(second_double_word_16)
850 L(second_double_word_16):
857 L(next_two_double_words_16):
859 jz L(fourth_double_word_16)
866 L(fourth_double_word_16):
/* Same label layout for the group at offset 32.  */
873 L(less4_double_words_32):
876 jz L(next_two_double_words_32)
878 jz L(second_double_word_32)
885 L(second_double_word_32):
892 L(next_two_double_words_32):
894 jz L(fourth_double_word_32)
901 L(fourth_double_word_32):
/* Same label layout for the group at offset 48.  */
908 L(less4_double_words_48):
911 jz L(next_two_double_words_48)
913 jz L(second_double_word_48)
920 L(second_double_word_48):
927 L(next_two_double_words_48):
929 jz L(fourth_double_word_48)
936 L(fourth_double_word_48):