/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# define STRCMP_ISA	_evex
# include "strcmp-naming.h"
# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP	__strcmp_evex
# endif

# define PAGE_SIZE	4096
	/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / SIZE_OF_CHAR)

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64
# ifdef USE_AS_WCSCMP
#  define TESTEQ	subl $0xff,
	/* Compare packed dwords.  */
#  define VPCMP	vpcmpd
#  define VPMINU	vpminud
#  define VPTESTM	vptestmd
#  define VPTESTNM	vptestnmd
	/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
#  define TESTEQ	incl
	/* Compare packed bytes.  */
#  define VPCMP	vpcmpb
#  define VPMINU	vpminub
#  define VPTESTM	vptestmb
#  define VPTESTNM	vptestnmb
	/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif
# ifdef USE_AS_STRNCMP
#  define LOOP_REG	r9d
#  define LOOP_REG64	r9

#  define OFFSET_REG8	r9b
#  define OFFSET_REG	r9d
#  define OFFSET_REG64	r9
# else
#  define LOOP_REG	edx
#  define LOOP_REG64	rdx

#  define OFFSET_REG8	dl
#  define OFFSET_REG	edx
#  define OFFSET_REG64	rdx
# endif
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
#  define VEC_OFFSET	0
# else
#  define VEC_OFFSET	(-VEC_SIZE)
# endif
# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG	OFFSET_REG
# else
#  define BYTE_LOOP_REG	ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG	rcx
#   define LOCALE_REG_LP	RCX_LP
#  else
#   define LOCALE_REG	rdx
#   define LOCALE_REG_LP	RDX_LP
#  endif
# endif
# define LCASE_MIN_YMM	%YMM12
# define LCASE_MAX_YMM	%YMM13
# define CASE_ADD_YMM	%YMM14

# define LCASE_MIN_XMM	%XMM12
# define LCASE_MAX_XMM	%XMM13
# define CASE_ADD_XMM	%XMM14

/* NB: wcsncmp uses r11, but strcasecmp is never used in
   conjunction with wcscmp.  */
# define TOLOWER_BASE	%r11
# ifdef USE_AS_STRCASECMP_L
#  define _REG(x, y)	x ## y
#  define REG(x, y)	_REG(x, y)
#  define TOLOWER(reg1, reg2, ext)					\
	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);		\
	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);		\
	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;		\
	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;		\
	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};			\
	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
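
/* A rough C model of the range trick above (illustrative only, not
   part of the original source): 0x41 == 'A' and 0x1a == 'Z' - 'A' + 1,
   so per byte the three broadcast constants implement

	if ((unsigned char) (c - 0x41) < 0x1a)
	  c += 0x20;

   i.e. only uppercase ASCII gets the 0x20 case bit added; every other
   byte value passes through unchanged.  */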
#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)

#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)			\
	TOLOWER	(s1_reg, s2_reg, ext);					\
	VPCMP	$0, s1_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)		\
	VMOVU	s2_mem, s2_reg;						\
	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)

#  define CMP_R1_R2_YMM(...)	CMP_R1_R2(__VA_ARGS__, YMM)
#  define CMP_R1_R2_XMM(...)	CMP_R1_R2(__VA_ARGS__, XMM)

#  define CMP_R1_S2_YMM(...)	CMP_R1_S2(__VA_ARGS__, YMM)
#  define CMP_R1_S2_XMM(...)	CMP_R1_S2(__VA_ARGS__, XMM)
# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_YMM(...)
#  define TOLOWER_XMM(...)

#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)			\
	VPCMP	$0, s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_XMM(...)	CMP_R1_R2_YMM(__VA_ARGS__)

#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)		\
	VPCMP	$0, s2_mem, s1_reg, reg_out

#  define CMP_R1_S2_XMM(...)	CMP_R1_S2_YMM(__VA_ARGS__)
# endif
/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   the matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and
   2 KORD).  In general, the costs of comparing VEC_SIZE bytes (32
   bytes) are 3 VPCMP and 2 KORD instructions, together with VMOVU and
   ktestd instructions.  The main loop (away from the page boundary)
   compares 4 vectors at a time, effectively comparing 4 x VEC_SIZE
   bytes (128 bytes) per iteration.

   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP)
   logic is the same as strcmp, except that a maximum offset is
   tracked.  If the maximum offset is reached before a difference is
   found, zero is returned.  */
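
/* An illustrative C model of one VEC_SIZE step (hypothetical helper,
   not part of this file).  VPTESTM first builds a mask of non-null
   CHARs in s1; the compare masked by it then sets a bit only where
   the s1 CHAR is both non-null and equal to the s2 CHAR:

	unsigned int
	vec_cmp_mask (const CHAR *s1, const CHAR *s2)
	{
	  unsigned int k1 = 0;
	  for (unsigned int i = 0; i < CHAR_PER_VEC; i++)
	    if (s1[i] != 0 && s1[i] == s2[i])
	      k1 |= 1u << i;
	  return k1;	// All-ones <=> no null and no mismatch.
	}
*/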
	.section .text.evex, "ax", @progbits
	.type	STRCMP, @function

# ifdef USE_AS_STRCASECMP_L
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %LOCALE_REG_LP

	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */

	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif
# if defined USE_AS_STRCASECMP_L
	/* We have to fall back on the C implementation for locales with
	   encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
	mov	(%LOCALE_REG), %RAX_LP
#  endif
	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
	jne	STRCASECMP_L_NONASCII
	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif
# ifdef USE_AS_STRNCMP
	/* Don't overwrite LOCALE_REG (rcx) until we have passed
	   L(one_or_less).  Otherwise we might use the wrong locale in
	   the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	/* Signed comparison intentional.  We use this branch to also
	   test cases where length >= 2^63.  These very large sizes can
	   be handled with strcmp as there is no way for that length to
	   actually bound the buffer.  */
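	/* For example, a caller passing n == (size_t) -1 appears here
	   as a negative signed length; no object can be that large, so
	   treating the call as plain strcmp is safe.  */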
# if defined USE_AS_STRCASECMP_L
	.section .rodata.cst32, "aM", @progbits, 32
	.align	32
L(lcase_min):
	.quad	0x4141414141414141
	.quad	0x4141414141414141
	.quad	0x4141414141414141
	.quad	0x4141414141414141
L(lcase_max):
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
	.quad	0x1a1a1a1a1a1a1a1a
L(case_add):
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.quad	0x2020202020202020
	.previous
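	/* NB: 0x41 == 'A', 0x1a == 'Z' - 'A' + 1 and 0x20 == 'a' - 'A',
	   broadcast across every byte of the 32-byte vectors.  */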
	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
# endif
	/* Shift out the bits irrelevant to the page boundary
	   ([63:12]).  */
	sall	$20, %eax
	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
	cmpl	$((PAGE_SIZE - (VEC_SIZE * 4)) << 20), %eax
	ja	L(page_cross)
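	/* In C terms the check above is roughly (illustrative only):

		if ((((uintptr_t) s1 | (uintptr_t) s2) % PAGE_SIZE)
		    > PAGE_SIZE - (VEC_SIZE * 4))
		  goto page_cross;

	   Or-ing the pointers is conservative: the check can false
	   positive but never false negative, and the slow path
	   re-checks.  */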
L(no_page_cross):
	/* Safe to compare 4x vectors.  */
	VMOVU	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	/* Each bit cleared in K1 represents a mismatch or a null CHAR
	   in YMM0 and 32 bytes at (%rsi).  */
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(vec_0_test_len)
# endif
	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
	   wcscmp/wcsncmp.  */

	/* All 1s represents all equals.  TESTEQ will overflow to zero
	   in the all-equals case.  Otherwise 1s will carry until the
	   position of the first mismatch.  */
	TESTEQ	%ecx
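	/* Example: with VEC_SIZE == 32, an all-equal strcmp vector
	   yields the k-mask 0xffffffff, which `incl` wraps to zero
	   (setting ZF); wcscmp has only 8 dword lanes, so all-equal is
	   0xff and `subl $0xff` reaches zero the same way.  */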
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	subl	%ecx, %eax
# endif
	ret
# ifdef USE_AS_STRNCMP
L(vec_0_test_len):
	notl	%ecx
	bzhil	%edx, %ecx, %eax
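	/* bzhil zeros all mask bits at or above the length in edx, so
	   a null or mismatch found past the strncmp bound is
	   ignored.  */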
	/* Align if it will cross a fetch block.  */
# ifdef USE_AS_STRCASECMP_L
	/* Set locale argument for strcasecmp.  */
	movq	%LOCALE_REG, %rdx
# endif
	/* 'nbe' covers the case where length is negative (large
	   unsigned).  */
# ifdef USE_AS_WCSCMP
# else
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifdef USE_AS_STRNCMP
	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
	   worrying about underflow.  */
	addq	$-CHAR_PER_VEC, %rdx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifdef USE_AS_STRNCMP
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifndef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# endif
	/* 32 byte align here ensures the main loop is ideally
	   aligned.  */
	.p2align 5

	/* Safe to compare 4x vectors.  */
	VMOVU	(VEC_SIZE)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
# endif

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
# endif
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with
	   0x1.  */
	/* The prepare labels are various entry points from the page
	   cross logic.  */
# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):
	andl	$(VEC_SIZE * 4 - 1), %ecx
	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
#  else
	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
	   the loop.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
#  endif
# else
L(prepare_loop_no_len):
# endif
	/* Align s1 and adjust s2 accordingly.  */
	subq	%rdi, %rsi
	andq	$-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):
	addq	%rdi, %rsi
# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
	subq	%rdi, %rdx
# endif

L(prepare_loop_aligned):
	/* eax stores distance from rsi to next page cross.  These cases
	   need to be handled specially as the 4x loop could potentially
	   read memory past the length of s1 or s2 and across a page
	   boundary.  */
	movl	$-(VEC_SIZE * 4), %eax
	subl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
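	/* i.e. eax = (-(VEC_SIZE * 4) - rsi) % PAGE_SIZE: the distance
	   from rsi to the next page boundary, biased down by the 4x
	   VEC read per iteration.  Once it goes non-negative the next
	   4x VEC of loads from rsi would touch a new page.  */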
	/* Loop 4x comparisons at a time.  */
	.p2align 4
L(loop):

	/* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_zero)
# endif

	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi

	/* Check if rsi loads will cross a page boundary.  */
	addl	$-(VEC_SIZE * 4), %eax
	jnb	L(page_cross_during_loop)

	/* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6

	VPMINU	%YMM0, %YMM2, %YMM8
	VPMINU	%YMM4, %YMM6, %YMM9

	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
	VPMINU	%YMM8, %YMM9, %YMM9

	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
	VPTESTM	%YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
	   oring with YMM1.  Result is stored in YMM6.  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
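	/* The imm8 0xde encodes dst = (dst ^ src3) | src2: evaluating
	   (A ^ C) | B over all eight (A, B, C) input rows gives the
	   truth table 11011110b == 0xde.  */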
# else
	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
	TOLOWER_YMM (%YMM0, %YMM1)
	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
	TOLOWER_YMM (%YMM2, %YMM3)
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq	%YMM0, %YMM1, %YMM1
	vpxorq	%YMM2, %YMM3, %YMM3
	vpxorq	%YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
# endif
	/* Or together YMM3, YMM5, and YMM6.  */
	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
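	/* imm8 0xfe is the plain three-way OR (A | B | C): every truth
	   table row except (0, 0, 0) is 1, i.e. 11111110b == 0xfe.  */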

	/* A non-zero CHAR in YMM6 represents a mismatch.  */
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	/* Find which VEC has the mismatch or end of string.  */
	VPTESTM	%YMM0, %YMM0, %k1
	VPTESTNM %YMM1, %YMM1, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	VPTESTM	%YMM2, %YMM2, %k1
	VPTESTNM %YMM3, %YMM3, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)
	/* Handle VEC 2 and 3 without branches.  */
L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_end)
# endif

	VPTESTM	%YMM4, %YMM4, %k1
	VPTESTNM %YMM5, %YMM5, %k0{%k1}
	kmovd	%k0, %ecx
	TESTEQ	%ecx
# if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %LOOP_REG
	orl	%ecx, %LOOP_REG
# else
	salq	$CHAR_PER_VEC, %LOOP_REG64
	orq	%rcx, %LOOP_REG64
# endif
	/* LOOP_REG contains matches for null/mismatch from the loop.
	   If VEC 0, 1, and 2 all have no null and no mismatches, then
	   any null or mismatch must be entirely from VEC 3, which is
	   fully represented by LOOP_REG64.  */
# if CHAR_PER_VEC <= 16
	tzcntl	%LOOP_REG, %LOOP_REG
# else
	tzcntq	%LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%LOOP_REG64, %rdx
	jbe	L(ret_zero_end)
# endif
# ifdef USE_AS_WCSCMP
	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
# else
	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifdef USE_AS_STRNCMP
L(ret_zero_end):
	xorl	%eax, %eax
	ret
# endif

	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
	   that they use the value of `r8` to negate the return value.
	   This is because the page cross logic can swap `rdi` and
	   `rsi`.  */
# ifdef USE_AS_STRNCMP
#  if CHAR_PER_VEC <= 16
	sall	$CHAR_PER_VEC, %ecx
#  else
	salq	$CHAR_PER_VEC, %rcx
#  endif
# endif
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
	tzcntl	%ecx, %ecx
# else
	tzcntq	%rcx, %rcx
# endif
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %rdx
	jbe	L(ret_zero_end)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
	/* This is the non-zero case for `eax` so just xorl with `r8d`
	   to flip the sign if `rdi` and `rsi` were swapped.  */
	xorl	%r8d, %eax
# else
	movzbl	(%rdi, %rcx), %eax
	movzbl	(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
	/* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
	   logic.  Subtract `r8d` after the xor for the zero case.  */
	xorl	%r8d, %eax
	subl	%r8d, %eax
# endif
	ret
# ifndef USE_AS_STRNCMP
L(return_vec_1_end):
# ifdef USE_AS_WCSCMP
	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	VEC_SIZE(%rdi, %rcx), %eax
	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
	/* Page cross in rsi in next 4x VEC.  */

	/* TODO: Improve logic here.  */
L(page_cross_during_loop):
	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

	/* Optimistically rsi and rdi are both aligned, in which case we
	   don't need any logic here.  */
	cmpl	$-(VEC_SIZE * 4), %eax
	/* Don't adjust eax before jumping back to loop and we will
	   never hit page cross case again.  */
	je	L(loop_skip_page_cross_check)

	/* Check if we can safely load a VEC.  */
	cmpl	$-(VEC_SIZE * 3), %eax
	jle	L(less_1x_vec_till_page_cross)
	VMOVA	(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_0_end)

	/* If distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
	cmpl	$-(VEC_SIZE * 2), %eax
	jg	L(more_2x_vec_till_page_cross)
L(less_1x_vec_till_page_cross):
	subl	$-(VEC_SIZE * 4), %eax
	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
	   concerning case is the first iteration if incoming s1 was
	   near the start of a page and s2 near the end.  If s1 was near
	   the start of the page we already aligned up to the nearest
	   VEC_SIZE * 4, so it is guaranteed safe to read back
	   -VEC_SIZE.  If rdi is truly at the start of a page here, it
	   means the previous page (rdi - VEC_SIZE) has already been
	   loaded earlier so it must be valid.  */
	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}

	/* Mask of potentially valid bits.  The lower bits can be out of
	   range comparisons (but safe regarding page crosses).  */
# ifdef USE_AS_WCSCMP
	andl	$(VEC_SIZE - 1), %ecx
	shlxl	%ecx, %r10d, %ecx
# else
	shlxl	%esi, %ecx, %r10d
# endif
# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
#  endif
	jbe	L(return_page_cross_end_check)
# endif
	movl	%eax, %OFFSET_REG

	/* Readjust eax before potentially returning to the loop.  */
	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax

	jz	L(loop_skip_page_cross_check)
L(return_page_cross_end):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):
# else
	addl	%OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifdef USE_AS_STRNCMP
L(return_page_cross_end_check):
	andl	%r10d, %ecx
	tzcntl	%ecx, %ecx
	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_WCSCMP
	sall	$2, %edx
#  endif
	cmpl	%ecx, %edx
	ja	L(return_page_cross_cmp_mem)
	xorl	%eax, %eax
	ret
# endif
L(more_2x_vec_till_page_cross):
	/* If more 2x vec till cross we will complete a full loop
	   iteration here.  */

	VMOVA	VEC_SIZE(%rdi), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
	cmpq	$(CHAR_PER_VEC * 2), %rdx
	jbe	L(ret_zero_in_loop_page_cross)
# endif
	subl	$-(VEC_SIZE * 4), %eax

	/* Safe to include comparisons from lower bytes.  */
	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_page_cross_0)

	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(return_vec_page_cross_1)
# ifdef USE_AS_STRNCMP
	/* Must check length here as length might preclude reading the
	   next page.  */
#  ifdef USE_AS_WCSCMP
	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
	   safe.  */
#  endif
	jbe	L(ret_zero_in_loop_page_cross)
# endif
	/* Finish the loop.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
	VPMINU	%YMM4, %YMM6, %YMM9
	VPTESTM	%YMM9, %YMM9, %k1
# ifndef USE_AS_STRCASECMP_L
	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
# else
	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
	TOLOWER_YMM (%YMM4, %YMM5)
	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
	TOLOWER_YMM (%YMM6, %YMM7)
	vpxorq	%YMM4, %YMM5, %YMM5
	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
# endif
	VPTESTNM %YMM6, %YMM6, %k0{%k1}
	kmovd	%k0, %LOOP_REG
	TESTEQ	%LOOP_REG
	jnz	L(return_vec_2_3_end)
	/* Best for code size to include an unconditional jmp here.  If
	   this case were hot it would be faster to duplicate the
	   L(return_vec_2_3_end) code as a fall-through and jump back
	   to the loop on mismatch comparison.  */
	subq	$-(VEC_SIZE * 4), %rdi
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
	xorl	%eax, %eax
	ret
# else
	jmp	L(loop_skip_page_cross_check)
# endif
L(return_vec_page_cross_0):
	addl	$-VEC_SIZE, %eax
L(return_vec_page_cross_1):
	tzcntl	%ecx, %ecx
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_STRNCMP
#   ifdef USE_AS_WCSCMP
	/* Must divide ecx instead of multiply rdx due to overflow.  */
	movl	%ecx, %eax
	shrl	$2, %eax
	cmpq	%rax, %rdx
#   else
	cmpq	%rcx, %rdx
#   endif
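	/* i.e. the length is compared against byte_offset / 4 rather
	   than scaling the length by 4: rdx may be within a factor of
	   4 of SIZE_MAX, so rdx * 4 could wrap while ecx / 4
	   cannot.  */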
	jbe	L(ret_zero_in_loop_page_cross)
# ifdef USE_AS_WCSCMP
	movl	VEC_OFFSET(%rdi, %rcx), %edx
	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
# else
	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifndef USE_AS_STRNCMP
	/* If both are VEC aligned we don't need any special logic here.
	   Only valid for strcmp where the stop condition is guaranteed
	   to be reachable by just reading memory.  */
	testl	$((VEC_SIZE - 1) << 20), %eax
	jz	L(no_page_cross)
# endif

	movl	%edi, %eax
	movl	%esi, %ecx
	andl	$(PAGE_SIZE - 1), %eax
	andl	$(PAGE_SIZE - 1), %ecx

	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check which is closer to page cross, s1 or s2.  */

	/* The previous page cross check has false positives.  Check
	   for a true positive as the page cross logic is very
	   expensive.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
	jbe	L(no_page_cross)
	/* Set r8 to not interfere with the normal return value (rdi
	   and rsi did not swap).  */
# ifdef USE_AS_WCSCMP
	/* Any non-zero positive value that doesn't interfere with
	   0x1.  */
	movl	$2, %r8d
# else
	xorl	%r8d, %r8d
# endif

	/* Check if less than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jg	L(less_1x_vec_till_page)
	/* If more than 1x VEC till page cross, loop through safely
	   loadable memory until within 1x VEC of page cross.  */
L(page_cross_loop):
	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
	kmovd	%k1, %ecx
	TESTEQ	%ecx
	jnz	L(check_ret_vec_page_cross)
	addl	$CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross)
# endif
	addl	$VEC_SIZE, %eax
	jl	L(page_cross_loop)
# ifdef USE_AS_WCSCMP
	shrl	$2, %eax
# endif
	subl	%eax, %OFFSET_REG
	/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
	   to not cross page so it is safe to load.  Since we have
	   already loaded at least 1 VEC from rsi it is also guaranteed
	   to be safe.  */
	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
	VPTESTM	%YMM0, %YMM0, %k2
	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}

	kmovd	%k1, %ecx
# ifdef USE_AS_STRNCMP
	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
	cmpq	%rax, %rdx
	jbe	L(check_ret_vec_page_cross2)
#  ifdef USE_AS_WCSCMP
	addq	$-(CHAR_PER_VEC * 2), %rdx
#  endif
	jz	L(prepare_loop_no_len)
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
# else
	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %ecx)
# endif
# ifdef USE_AS_STRNCMP
L(check_ret_vec_page_cross2):
	TESTEQ	%ecx
L(check_ret_vec_page_cross):
	tzcntl	%ecx, %ecx
	addl	%OFFSET_REG, %ecx
	cmpq	%rcx, %rdx
	ja	L(ret_vec_page_cross_cont)
L(ret_zero_page_cross):
	xorl	%eax, %eax
	ret
# endif

L(page_cross):
	/* Ensure this is a true page cross.  */
	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
	jbe	L(no_page_cross)
	/* Set r8 to negate the return value, as rdi and rsi have been
	   swapped.  */
# ifdef USE_AS_WCSCMP

	xorl	%OFFSET_REG, %OFFSET_REG

	/* Check if more than 1x VEC till page cross.  */
	subl	$(VEC_SIZE * 3), %eax
	jle	L(page_cross_loop)
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP
	shrl	$2, %eax
# endif
	/* Find largest load size we can use.  */
	cmpl	$(16 / SIZE_OF_CHAR), %eax
	ja	L(less_16_till_page)

	/* Use 16 byte comparison.  */
	vmovdqu	(%rdi), %xmm0
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	incw	%cx
# endif
	jnz	L(check_ret_vec_page_cross)
	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
	cmpq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subl	%eax, %OFFSET_REG
# else
	/* Explicit check for 16 byte alignment.  */
	subl	%eax, %OFFSET_REG
# endif
	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0xf, %ecx
# else
	incw	%cx
# endif
	jnz	L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp	L(prepare_loop_aligned)
# ifdef USE_AS_STRNCMP
L(ret_zero_page_cross_slow_case0):
	xorl	%eax, %eax
	ret
# endif
L(less_16_till_page):
	cmpl	$(24 / SIZE_OF_CHAR), %eax
	ja	L(less_8_till_page)
	/* Use 8 byte comparison.  */
	vmovq	(%rdi), %xmm0
	vmovq	(%rsi), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
# else
	incb	%cl
# endif
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	cmpq	$(8 / SIZE_OF_CHAR), %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
# endif
	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
	subl	%eax, %OFFSET_REG

	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
# ifdef USE_AS_WCSCMP
	subl	$0x3, %ecx
# else
	incb	%cl
# endif
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case0)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp	L(prepare_loop_aligned)
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
	/* If using wchar then this is the only check before we reach
	   the page boundary.  */
	movl	(%rdi), %eax
	movl	(%rsi), %ecx
	cmpl	%ecx, %eax
	jnz	L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
	addq	$-(CHAR_PER_VEC * 2), %rdx
	/* We already checked for len <= 1 so cannot hit that case
	   here.  */
#  endif
# endif
	movl	%OFFSET_REG, %eax

	cmpl	$(28 / SIZE_OF_CHAR), %eax
	ja	L(less_4_till_page)
	/* Use 4 byte comparison.  */
	vmovd	(%rdi), %xmm0
	vmovd	(%rsi), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)

# ifdef USE_AS_STRNCMP
	cmpq	$(4 / SIZE_OF_CHAR), %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
# endif
	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
	subl	%eax, %OFFSET_REG
	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
	VPTESTM	%xmm0, %xmm0, %k2
	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
	kmovd	%k1, %ecx
	subl	$0xf, %ecx
	jnz	L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
	subq	%OFFSET_REG64, %rdx
	jbe	L(ret_zero_page_cross_slow_case1)
	subq	$-(CHAR_PER_VEC * 4), %rdx

	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
	jmp	L(prepare_loop_aligned)
# ifdef USE_AS_STRNCMP
L(ret_zero_page_cross_slow_case1):
	xorl	%eax, %eax
	ret
# endif
L(less_4_till_page):
	subq	%rdi, %rsi
	/* Extremely slow byte comparison loop.  */
L(less_4_loop):
	movzbl	(%rdi), %eax
	movzbl	(%rsi, %rdi), %ecx
	TOLOWER_gpr (%rax, %eax)
	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
	subl	%BYTE_LOOP_REG, %eax
	jnz	L(ret_less_4_loop)
	testb	%cl, %cl
	jz	L(ret_zero_4_loop)
# ifdef USE_AS_STRNCMP
	decq	%rdx
	jz	L(ret_zero_4_loop)
# endif
	incq	%rdi
	/* End condition is reaching the page boundary (rdi is
	   aligned).  */
	testb	$(VEC_SIZE * 4 - 1), %dil
	jnz	L(less_4_loop)
	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
	addq	$-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_STRNCMP
	subq	$-(CHAR_PER_VEC * 4), %rdx
# endif
	jmp	L(prepare_loop_aligned)
	.size	STRCMP, .-STRCMP
#endif