1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
24 # define STRCMP __strcmp_evex
27 # define PAGE_SIZE 4096
29 /* VEC_SIZE = Number of bytes in a ymm register */
32 /* Shift for dividing by (VEC_SIZE * 4). */
33 # define DIVIDE_BY_VEC_4_SHIFT 7
34 # if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
35 # error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
38 # define VMOVU vmovdqu64
39 # define VMOVA vmovdqa64
42 /* Compare packed dwords. */
44 # define SHIFT_REG32 r8d
45 # define SHIFT_REG64 r8
46 /* 1 dword char == 4 bytes. */
47 # define SIZE_OF_CHAR 4
49 /* Compare packed bytes. */
51 # define SHIFT_REG32 ecx
52 # define SHIFT_REG64 rcx
53 /* 1 byte char == 1 byte. */
54 # define SIZE_OF_CHAR 1
57 # define XMMZERO xmm16
61 # define YMMZERO ymm16
72 wcscmp/wcsncmp have to use SIGNED comparison for elements.
73 strcmp/strncmp have to use UNSIGNED comparison for elements.
76 /* The main idea of the string comparison (byte or dword) using 256-bit
77 EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
78 latter can be on either packed bytes or dwords depending on
79 USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
80 matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
81 KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
82 are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
83 instructions. Main loop (away from page boundary) compares 4
84 vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
87 The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
88 is the same as strcmp, except that a maximum offset is tracked. If
89 the maximum offset is reached before a difference is found, zero is
92 .section .text.evex,"ax",@progbits
94 # ifdef USE_AS_STRNCMP
95 /* Check for simple cases (0 or 1) in offset. */
100 /* Convert units: from wide to byte char. */
103 /* Register %r11 tracks the maximum offset. */
108 /* Make %XMMZERO (%YMMZERO) all zeros in this function. */
109 vpxorq %XMMZERO, %XMMZERO, %XMMZERO
111 andl $(PAGE_SIZE - 1), %eax
112 cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
114 /* Start comparing 4 vectors. */
118 /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
119 VPCMP $4, %YMM0, %YMM1, %k0
121 /* Check for NULL in YMM0. */
122 VPCMP $0, %YMMZERO, %YMM0, %k1
123 /* Check for NULL in YMM1. */
124 VPCMP $0, %YMMZERO, %YMM1, %k2
125 /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
128 /* Each bit in K1 represents:
129 1. A mismatch in YMM0 and YMM1. Or
130 2. A NULL in YMM0 or YMM1.
138 # ifdef USE_AS_WCSCMP
139 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
142 # ifdef USE_AS_STRNCMP
143 /* Return 0 if the mismatched index (%rdx) is after the maximum
148 # ifdef USE_AS_WCSCMP
150 movl (%rdi, %rdx), %ecx
151 cmpl (%rsi, %rdx), %ecx
159 movzbl (%rdi, %rdx), %eax
160 movzbl (%rsi, %rdx), %edx
169 # ifdef USE_AS_WCSCMP
170 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
173 # ifdef USE_AS_STRNCMP
174 /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
175 the maximum offset (%r11). */
179 # ifdef USE_AS_WCSCMP
181 movl (%rdi, %rdx), %ecx
182 cmpl (%rsi, %rdx), %ecx
185 movzbl (%rdi, %rdx), %eax
186 movzbl (%rsi, %rdx), %edx
190 # ifdef USE_AS_WCSCMP
192 movl VEC_SIZE(%rdi, %rdx), %ecx
193 cmpl VEC_SIZE(%rsi, %rdx), %ecx
196 movzbl VEC_SIZE(%rdi, %rdx), %eax
197 movzbl VEC_SIZE(%rsi, %rdx), %edx
204 L(return_2_vec_size):
207 # ifdef USE_AS_WCSCMP
208 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
211 # ifdef USE_AS_STRNCMP
212 /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
213 after the maximum offset (%r11). */
214 addq $(VEC_SIZE * 2), %rdx
217 # ifdef USE_AS_WCSCMP
219 movl (%rdi, %rdx), %ecx
220 cmpl (%rsi, %rdx), %ecx
223 movzbl (%rdi, %rdx), %eax
224 movzbl (%rsi, %rdx), %edx
228 # ifdef USE_AS_WCSCMP
230 movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
231 cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
234 movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
235 movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
242 L(return_3_vec_size):
245 # ifdef USE_AS_WCSCMP
246 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
249 # ifdef USE_AS_STRNCMP
250 /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
251 after the maximum offset (%r11). */
252 addq $(VEC_SIZE * 3), %rdx
255 # ifdef USE_AS_WCSCMP
257 movl (%rdi, %rdx), %ecx
258 cmpl (%rsi, %rdx), %ecx
261 movzbl (%rdi, %rdx), %eax
262 movzbl (%rsi, %rdx), %edx
266 # ifdef USE_AS_WCSCMP
268 movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
269 cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
272 movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
273 movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
281 VMOVU VEC_SIZE(%rdi), %YMM0
282 VMOVU VEC_SIZE(%rsi), %YMM1
283 /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
284 VPCMP $4, %YMM0, %YMM1, %k0
285 VPCMP $0, %YMMZERO, %YMM0, %k1
286 VPCMP $0, %YMMZERO, %YMM1, %k2
287 /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
289 /* Each bit in K1 represents a NULL or a mismatch. */
292 jne L(return_vec_size)
294 VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
295 VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
296 VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
297 VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
299 /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
300 VPCMP $4, %YMM2, %YMM4, %k0
301 VPCMP $0, %YMMZERO, %YMM2, %k1
302 VPCMP $0, %YMMZERO, %YMM4, %k2
303 /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
305 /* Each bit in K1 represents a NULL or a mismatch. */
308 jne L(return_2_vec_size)
310 /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
311 VPCMP $4, %YMM3, %YMM5, %k0
312 VPCMP $0, %YMMZERO, %YMM3, %k1
313 VPCMP $0, %YMMZERO, %YMM5, %k2
314 /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
316 /* Each bit in K1 represents a NULL or a mismatch. */
319 jne L(return_3_vec_size)
321 leaq (VEC_SIZE * 4)(%rdi), %rdx
322 movl $PAGE_SIZE, %ecx
323 /* Align load via RAX. */
324 andq $-(VEC_SIZE * 4), %rdx
326 leaq (%rdi, %rdx), %rax
327 # ifdef USE_AS_STRNCMP
328 /* Starting from this point, the maximum offset, or simply the
329 'offset', DECREASES by the same amount when base pointers are
330 moved forward. Return 0 when:
331 1) On match: offset <= the matched vector index.
332 2) On mismatch, offset is before the mismatched index.
339 andl $(PAGE_SIZE - 1), %esi
340 /* Number of bytes before page crossing. */
342 /* Number of VEC_SIZE * 4 blocks before page crossing. */
343 shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
344 /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
350 # ifdef USE_AS_STRNCMP
351 /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
352 the maximum offset (%r11) by the same amount. */
353 subq $(VEC_SIZE * 4), %r11
356 addq $(VEC_SIZE * 4), %rax
357 addq $(VEC_SIZE * 4), %rdx
361 je L(loop_cross_page)
363 /* Main loop, comparing 4 vectors at a time. */
365 VMOVA VEC_SIZE(%rax), %YMM2
366 VMOVA (VEC_SIZE * 2)(%rax), %YMM4
367 VMOVA (VEC_SIZE * 3)(%rax), %YMM6
369 VMOVU VEC_SIZE(%rdx), %YMM3
370 VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
371 VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
373 VPCMP $4, %YMM0, %YMM1, %k0
374 VPCMP $0, %YMMZERO, %YMM0, %k1
375 VPCMP $0, %YMMZERO, %YMM1, %k2
377 /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
381 VPCMP $4, %YMM2, %YMM3, %k0
382 VPCMP $0, %YMMZERO, %YMM2, %k1
383 VPCMP $0, %YMMZERO, %YMM3, %k2
385 /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
389 VPCMP $4, %YMM4, %YMM5, %k0
390 VPCMP $0, %YMMZERO, %YMM4, %k1
391 VPCMP $0, %YMMZERO, %YMM5, %k2
393 /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
397 VPCMP $4, %YMM6, %YMM7, %k0
398 VPCMP $0, %YMMZERO, %YMM6, %k1
399 VPCMP $0, %YMMZERO, %YMM7, %k2
401 /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
408 /* Test each mask (32 bits) individually because for VEC_SIZE
409 == 32 is not possible to OR the four masks and keep all bits
410 in a 64-bit integer register, differing from SSE2 strcmp
411 where ORing is possible. */
418 # ifdef USE_AS_WCSCMP
419 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
422 # ifdef USE_AS_STRNCMP
425 # ifdef USE_AS_WCSCMP
428 movl (%rsi, %rcx), %edi
429 cmpl (%rdx, %rcx), %edi
432 movzbl (%rax, %rcx), %eax
433 movzbl (%rdx, %rcx), %edx
437 # ifdef USE_AS_WCSCMP
440 movl (%rsi, %rcx), %edi
441 cmpl (%rdx, %rcx), %edi
444 movzbl (%rax, %rcx), %eax
445 movzbl (%rdx, %rcx), %edx
453 # ifdef USE_AS_STRNCMP
454 /* The first vector matched. Return 0 if the maximum offset
455 (%r11) <= VEC_SIZE. */
463 # ifdef USE_AS_WCSCMP
464 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
467 # ifdef USE_AS_STRNCMP
471 # ifdef USE_AS_WCSCMP
474 movl (%rsi, %rdi), %ecx
475 cmpl (%rdx, %rdi), %ecx
478 movzbl (%rax, %rdi), %eax
479 movzbl (%rdx, %rdi), %edx
483 # ifdef USE_AS_WCSCMP
486 movl VEC_SIZE(%rsi, %rdi), %ecx
487 cmpl VEC_SIZE(%rdx, %rdi), %ecx
490 movzbl VEC_SIZE(%rax, %rdi), %eax
491 movzbl VEC_SIZE(%rdx, %rdi), %edx
499 # ifdef USE_AS_STRNCMP
500 /* The first 2 vectors matched. Return 0 if the maximum offset
501 (%r11) <= 2 * VEC_SIZE. */
502 cmpq $(VEC_SIZE * 2), %r11
509 # ifdef USE_AS_WCSCMP
510 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
513 # ifdef USE_AS_STRNCMP
514 addq $(VEC_SIZE * 2), %rdi
517 # ifdef USE_AS_WCSCMP
520 movl (%rsi, %rdi), %ecx
521 cmpl (%rdx, %rdi), %ecx
524 movzbl (%rax, %rdi), %eax
525 movzbl (%rdx, %rdi), %edx
529 # ifdef USE_AS_WCSCMP
532 movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
533 cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
536 movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
537 movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
545 # ifdef USE_AS_STRNCMP
546 /* The first 3 vectors matched. Return 0 if the maximum offset
547 (%r11) <= 3 * VEC_SIZE. */
548 cmpq $(VEC_SIZE * 3), %r11
553 # ifdef USE_AS_WCSCMP
554 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
557 # ifdef USE_AS_STRNCMP
558 addq $(VEC_SIZE * 3), %rcx
561 # ifdef USE_AS_WCSCMP
564 movl (%rsi, %rcx), %esi
565 cmpl (%rdx, %rcx), %esi
568 movzbl (%rax, %rcx), %eax
569 movzbl (%rdx, %rcx), %edx
573 # ifdef USE_AS_WCSCMP
576 movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
577 cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
580 movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
581 movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
591 /* Align load via RDX. We load the extra ECX bytes which should
593 andl $((VEC_SIZE * 4) - 1), %ecx
597 /* This works only if VEC_SIZE * 2 == 64. */
598 # if (VEC_SIZE * 2) != 64
599 # error (VEC_SIZE * 2) != 64
602 /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
603 cmpl $(VEC_SIZE * 2), %ecx
604 jge L(loop_cross_page_2_vec)
606 VMOVU (%rax, %r10), %YMM2
607 VMOVU VEC_SIZE(%rax, %r10), %YMM3
608 VMOVU (%rdx, %r10), %YMM4
609 VMOVU VEC_SIZE(%rdx, %r10), %YMM5
611 VPCMP $4, %YMM4, %YMM2, %k0
612 VPCMP $0, %YMMZERO, %YMM2, %k1
613 VPCMP $0, %YMMZERO, %YMM4, %k2
615 /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
619 VPCMP $4, %YMM5, %YMM3, %k3
620 VPCMP $0, %YMMZERO, %YMM3, %k4
621 VPCMP $0, %YMMZERO, %YMM5, %k5
623 /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
627 # ifdef USE_AS_WCSCMP
628 /* NB: Each bit in K1/K3 represents 4-byte element. */
629 kshiftlw $8, %k3, %k2
630 /* NB: Divide shift count by 4 since each bit in K1 represent 4
632 movl %ecx, %SHIFT_REG32
633 sarl $2, %SHIFT_REG32
635 kshiftlq $32, %k3, %k2
638 /* Each bit in K1 represents a NULL or a mismatch. */
642 /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
643 shrxq %SHIFT_REG64, %rdi, %rdi
645 je L(loop_cross_page_2_vec)
647 # ifdef USE_AS_WCSCMP
648 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
651 # ifdef USE_AS_STRNCMP
654 # ifdef USE_AS_WCSCMP
657 movl (%rsi, %rcx), %edi
658 cmpl (%rdx, %rcx), %edi
661 movzbl (%rax, %rcx), %eax
662 movzbl (%rdx, %rcx), %edx
666 # ifdef USE_AS_WCSCMP
669 movl (%rsi, %rcx), %edi
670 cmpl (%rdx, %rcx), %edi
673 movzbl (%rax, %rcx), %eax
674 movzbl (%rdx, %rcx), %edx
681 L(loop_cross_page_2_vec):
682 /* The first VEC_SIZE * 2 bytes match or are ignored. */
683 VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
684 VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
685 VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
686 VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
688 VPCMP $4, %YMM0, %YMM2, %k0
689 VPCMP $0, %YMMZERO, %YMM0, %k1
690 VPCMP $0, %YMMZERO, %YMM2, %k2
692 /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
696 VPCMP $4, %YMM1, %YMM3, %k3
697 VPCMP $0, %YMMZERO, %YMM1, %k4
698 VPCMP $0, %YMMZERO, %YMM3, %k5
700 /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
704 # ifdef USE_AS_WCSCMP
705 /* NB: Each bit in K1/K3 represents 4-byte element. */
706 kshiftlw $8, %k3, %k2
708 kshiftlq $32, %k3, %k2
711 /* Each bit in K1 represents a NULL or a mismatch. */
716 /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
717 subl $(VEC_SIZE * 2), %ecx
719 /* R8 has number of bytes skipped. */
721 # ifdef USE_AS_WCSCMP
722 /* NB: Divide shift count by 4 since each bit in K1 represent 4
726 /* Skip ECX bytes. */
729 /* Before jumping back to the loop, set ESI to the number of
730 VEC_SIZE * 4 blocks before page crossing. */
731 movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
734 # ifdef USE_AS_STRNCMP
735 /* At this point, if %rdi value is 0, it has already tested
736 VEC_SIZE*4+%r10 bytes starting from %rax. This label
737 checks whether the strncmp maximum offset has been reached. */
738 je L(string_nbyte_offset_check)
743 # ifdef USE_AS_WCSCMP
744 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
748 /* Adjust for number of bytes skipped. */
750 # ifdef USE_AS_STRNCMP
751 addq $(VEC_SIZE * 2), %rcx
754 # ifdef USE_AS_WCSCMP
757 movl (%rsi, %rcx), %edi
758 cmpl (%rdx, %rcx), %edi
761 movzbl (%rax, %rcx), %eax
762 movzbl (%rdx, %rcx), %edx
766 # ifdef USE_AS_WCSCMP
769 movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
770 cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
773 movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
774 movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
780 # ifdef USE_AS_STRNCMP
781 L(string_nbyte_offset_check):
782 leaq (VEC_SIZE * 4)(%r10), %r10
790 /* Check one byte/dword at a time. */
791 # ifdef USE_AS_WCSCMP
797 addl $SIZE_OF_CHAR, %edx
798 cmpl $(VEC_SIZE * 4), %edx
799 je L(main_loop_header)
800 # ifdef USE_AS_STRNCMP
804 # ifdef USE_AS_WCSCMP
805 movl (%rdi, %rdx), %eax
806 movl (%rsi, %rdx), %ecx
808 movzbl (%rdi, %rdx), %eax
809 movzbl (%rsi, %rdx), %ecx
811 /* Check null char. */
813 jne L(cross_page_loop)
814 /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
817 # ifndef USE_AS_WCSCMP
822 # ifdef USE_AS_WCSCMP
825 /* Use movl to avoid modifying EFLAGS. */
833 # ifdef USE_AS_STRNCMP
841 # ifdef USE_AS_WCSCMP
858 # ifdef USE_AS_STRNCMP
862 # ifdef USE_AS_WCSCMP
863 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
866 # ifdef USE_AS_STRNCMP
870 # ifdef USE_AS_WCSCMP
872 movl (%rdi, %rdx), %ecx
873 cmpl (%rsi, %rdx), %ecx
876 movzbl (%rdi, %rdx), %eax
877 movzbl (%rsi, %rdx), %edx
882 /* Comparing on page boundary region requires special treatment:
884 It must be done one vector at a time, starting with the wider
884 ymm vector if possible, if not, with xmm. If fetching 16 bytes
885 (xmm) still passes the boundary, byte comparison must be done.
889 /* Try one ymm vector at a time. */
890 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
891 jg L(cross_page_1_vector)
893 VMOVU (%rdi, %rdx), %YMM0
894 VMOVU (%rsi, %rdx), %YMM1
896 /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
897 VPCMP $4, %YMM0, %YMM1, %k0
898 VPCMP $0, %YMMZERO, %YMM0, %k1
899 VPCMP $0, %YMMZERO, %YMM1, %k2
900 /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
902 /* Each bit in K1 represents a NULL or a mismatch. */
911 # ifdef USE_AS_STRNCMP
912 /* Return 0 if the current offset (%rdx) >= the maximum offset
917 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
919 L(cross_page_1_vector):
920 /* Less than 32 bytes to check, try one xmm vector. */
921 cmpl $(PAGE_SIZE - 16), %eax
922 jg L(cross_page_1_xmm)
923 VMOVU (%rdi, %rdx), %XMM0
924 VMOVU (%rsi, %rdx), %XMM1
926 /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
927 VPCMP $4, %XMM0, %XMM1, %k0
928 VPCMP $0, %XMMZERO, %XMM0, %k1
929 VPCMP $0, %XMMZERO, %XMM1, %k2
930 /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
932 /* Each bit in K1 represents a NULL or a mismatch. */
939 # ifndef USE_AS_WCSCMP
942 # ifdef USE_AS_STRNCMP
943 /* Return 0 if the current offset (%rdx) >= the maximum offset
950 # ifndef USE_AS_WCSCMP
951 /* Less than 16 bytes to check, try 8 byte vector. NB: No need
952 for wcscmp nor wcsncmp since wide char is 4 bytes. */
953 cmpl $(PAGE_SIZE - 8), %eax
954 jg L(cross_page_8bytes)
955 vmovq (%rdi, %rdx), %XMM0
956 vmovq (%rsi, %rdx), %XMM1
958 /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
959 VPCMP $4, %XMM0, %XMM1, %k0
960 VPCMP $0, %XMMZERO, %XMM0, %k1
961 VPCMP $0, %XMMZERO, %XMM1, %k2
962 /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
964 /* Each bit in K1 represents a NULL or a mismatch. */
968 # ifdef USE_AS_WCSCMP
969 /* Only last 2 bits are valid. */
972 /* Only last 8 bits are valid. */
981 # ifdef USE_AS_STRNCMP
982 /* Return 0 if the current offset (%rdx) >= the maximum offset
988 L(cross_page_8bytes):
989 /* Less than 8 bytes to check, try 4 byte vector. */
990 cmpl $(PAGE_SIZE - 4), %eax
991 jg L(cross_page_4bytes)
992 vmovd (%rdi, %rdx), %XMM0
993 vmovd (%rsi, %rdx), %XMM1
995 /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
996 VPCMP $4, %XMM0, %XMM1, %k0
997 VPCMP $0, %XMMZERO, %XMM0, %k1
998 VPCMP $0, %XMMZERO, %XMM1, %k2
999 /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
1001 /* Each bit in K1 represents a NULL or a mismatch. */
1005 # ifdef USE_AS_WCSCMP
1006 /* Only the last bit is valid. */
1009 /* Only last 4 bits are valid. */
1017 # ifdef USE_AS_STRNCMP
1018 /* Return 0 if the current offset (%rdx) >= the maximum offset
1024 L(cross_page_4bytes):
1026 /* Less than 4 bytes to check, try one byte/dword at a time. */
1027 # ifdef USE_AS_STRNCMP
1031 # ifdef USE_AS_WCSCMP
1032 movl (%rdi, %rdx), %eax
1033 movl (%rsi, %rdx), %ecx
1035 movzbl (%rdi, %rdx), %eax
1036 movzbl (%rsi, %rdx), %ecx
1039 jne L(cross_page_loop)