1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
2 Copyright (C) 2018-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* NOTE(review): this is a line-sampled excerpt; the #ifdef/#else/#endif
   guards that select between the dword (wcscmp) and byte (strcmp)
   macro variants below are not visible here — presumably USE_AS_WCSCMP
   chooses between them; confirm against the full file.  */
24 # define STRCMP __strcmp_avx2
27 # define PAGE_SIZE 4096
29 /* VEC_SIZE = Number of bytes in a ymm register */
32 /* Shift for dividing by (VEC_SIZE * 4). */
33 # define DIVIDE_BY_VEC_4_SHIFT 7
34 # if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
35 # error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
/* Wide-character variant: each "char" is a 4-byte dword.  */
39 /* Compare packed dwords. */
40 # define VPCMPEQ vpcmpeqd
41 /* Compare packed dwords and store minimum. */
42 # define VPMINU vpminud
43 /* 1 dword char == 4 bytes. */
44 # define SIZE_OF_CHAR 4
/* Narrow-character variant: each "char" is a single byte.  */
46 /* Compare packed bytes. */
47 # define VPCMPEQ vpcmpeqb
48 /* Compare packed bytes and store minimum. */
49 # define VPMINU vpminub
50 /* 1 byte char == 1 byte. */
51 # define SIZE_OF_CHAR 1
55 # define VZEROUPPER vzeroupper
59 wcscmp/wcsncmp have to use SIGNED comparison for elements.
60 strcmp/strncmp have to use UNSIGNED comparison for elements.
63 /* The main idea of the string comparison (byte or dword) using AVX2
64 consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on
65 either packed bytes or dwords depending on USE_AS_WCSCMP. In order
66 to check the null char, algorithm keeps the matched bytes/dwords,
67 requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general,
68 the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and
69 one VPMINU instructions, together with movdqu and testl instructions.
70 Main loop (away from page boundary) compares 4 vectors at a time,
71 effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop.
73 The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
74 is the same as strcmp, except that a maximum offset is tracked. If
75 the maximum offset is reached before a difference is found, zero is
/* NOTE(review): line-sampled excerpt — labels, conditional branches and
   #endif directives between the visible lines are missing.  Registers
   %rdi/%rsi hold the two string pointers (SysV AMD64 args).  */
78 .section .text.avx,"ax",@progbits
80 # ifdef USE_AS_STRNCMP
81 /* Check for simple cases (0 or 1) in offset. */
86 /* Convert units: from wide to byte char. */
89 /* Register %r11 tracks the maximum offset. */
94 /* Make %ymm7 all zeros in this function. */
/* %ymm7 == 0 serves as the NUL-character comparand for every
   VPCMPEQ-against-zero below.  */
95 vpxor %ymm7, %ymm7, %ymm7
97 andl $(PAGE_SIZE - 1), %eax
98 cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
100 /* Start comparing 4 vectors. */
/* Trick used throughout: VPCMPEQ leaves all-ones in each lane where the
   two strings match; VPMINU with the source then yields zero in a lane
   iff the chars differ (mask lane was 0) or the source char is NUL.
   The final VPCMPEQ against %ymm7 (zero) marks exactly those
   mismatch-or-NUL lanes, and vpmovmskb extracts them into ECX.  */
101 vmovdqu (%rdi), %ymm1
102 VPCMPEQ (%rsi), %ymm1, %ymm0
103 VPMINU %ymm1, %ymm0, %ymm0
104 VPCMPEQ %ymm7, %ymm0, %ymm0
105 vpmovmskb %ymm0, %ecx
109 # ifdef USE_AS_STRNCMP
110 /* Return 0 if the mismatched index (%rdx) is after the maximum
115 # ifdef USE_AS_WCSCMP
/* Wide chars: full dword loads; the return value is derived from a
   signed dword comparison (wcscmp requires SIGNED element compare).  */
117 movl (%rdi, %rdx), %ecx
118 cmpl (%rsi, %rdx), %ecx
/* Byte chars: movzbl zero-extends so the subtraction below is an
   UNSIGNED byte comparison, as strcmp requires.  */
126 movzbl (%rdi, %rdx), %eax
127 movzbl (%rsi, %rdx), %edx
136 # ifdef USE_AS_STRNCMP
137 /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
138 the maximum offset (%r11). */
142 # ifdef USE_AS_WCSCMP
144 movl (%rdi, %rdx), %ecx
145 cmpl (%rsi, %rdx), %ecx
148 movzbl (%rdi, %rdx), %eax
149 movzbl (%rsi, %rdx), %edx
153 # ifdef USE_AS_WCSCMP
155 movl VEC_SIZE(%rdi, %rdx), %ecx
156 cmpl VEC_SIZE(%rsi, %rdx), %ecx
159 movzbl VEC_SIZE(%rdi, %rdx), %eax
160 movzbl VEC_SIZE(%rsi, %rdx), %edx
/* Mismatch/NUL found in the third of the four head vectors.  */
168 L(return_2_vec_size):
170 # ifdef USE_AS_STRNCMP
171 /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
172 after the maximum offset (%r11). */
173 addq $(VEC_SIZE * 2), %rdx
176 # ifdef USE_AS_WCSCMP
178 movl (%rdi, %rdx), %ecx
179 cmpl (%rsi, %rdx), %ecx
182 movzbl (%rdi, %rdx), %eax
183 movzbl (%rsi, %rdx), %edx
187 # ifdef USE_AS_WCSCMP
189 movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
190 cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
193 movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
194 movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
/* Mismatch/NUL found in the fourth of the four head vectors.  */
202 L(return_3_vec_size):
204 # ifdef USE_AS_STRNCMP
205 /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
206 after the maximum offset (%r11). */
207 addq $(VEC_SIZE * 3), %rdx
210 # ifdef USE_AS_WCSCMP
212 movl (%rdi, %rdx), %ecx
213 cmpl (%rsi, %rdx), %ecx
216 movzbl (%rdi, %rdx), %eax
217 movzbl (%rsi, %rdx), %edx
221 # ifdef USE_AS_WCSCMP
223 movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
224 cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
227 movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
228 movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
/* Compare head vectors 2-4 (unaligned loads), then align both pointers
   to a 4*VEC_SIZE boundary before entering the main loop.
   NOTE(review): sampled excerpt — the testl/compare instructions that
   feed the jne branches below are not visible here.  */
237 vmovdqu VEC_SIZE(%rdi), %ymm6
238 VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
239 VPMINU %ymm6, %ymm3, %ymm3
240 VPCMPEQ %ymm7, %ymm3, %ymm3
241 vpmovmskb %ymm3, %ecx
243 jne L(return_vec_size)
244 vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
245 vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
246 vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
247 VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
248 VPMINU %ymm5, %ymm2, %ymm2
249 VPCMPEQ %ymm4, %ymm0, %ymm0
250 VPCMPEQ %ymm7, %ymm2, %ymm2
251 vpmovmskb %ymm2, %ecx
253 jne L(return_2_vec_size)
254 VPMINU %ymm4, %ymm0, %ymm0
255 VPCMPEQ %ymm7, %ymm0, %ymm0
256 vpmovmskb %ymm0, %ecx
258 jne L(return_3_vec_size)
/* Round %rdi up to the next 4*VEC_SIZE boundary; %rdx becomes the
   adjustment, and %rax the aligned first-string pointer.  */
260 leaq (VEC_SIZE * 4)(%rdi), %rdx
261 movl $PAGE_SIZE, %ecx
262 /* Align load via RAX. */
263 andq $-(VEC_SIZE * 4), %rdx
265 leaq (%rdi, %rdx), %rax
266 # ifdef USE_AS_STRNCMP
267 /* Starting from this point, the maximum offset, or simply the
268 'offset', DECREASES by the same amount when base pointers are
269 moved forward. Return 0 when:
270 1) On match: offset <= the matched vector index.
271 2) On mismatch, offset is before the mismatched index.
278 andl $(PAGE_SIZE - 1), %esi
279 /* Number of bytes before page crossing. */
281 /* Number of VEC_SIZE * 4 blocks before page crossing. */
282 shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
283 /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
289 # ifdef USE_AS_STRNCMP
290 /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
291 the maximum offset (%r11) by the same amount. */
292 subq $(VEC_SIZE * 4), %r11
295 addq $(VEC_SIZE * 4), %rax
296 addq $(VEC_SIZE * 4), %rdx
300 je L(loop_cross_page)
302 /* Main loop, comparing 4 vectors at a time. */
/* %rax (string 1) is 4*VEC_SIZE aligned here, so vmovdqa is safe;
   string 2 (%rdx) is accessed via VPCMPEQ memory operands, which for
   AVX do not require alignment.  The four per-vector min results are
   folded into %ymm0 so a single mask test detects any mismatch/NUL.  */
303 vmovdqa (%rax), %ymm0
304 vmovdqa VEC_SIZE(%rax), %ymm3
305 VPCMPEQ (%rdx), %ymm0, %ymm4
306 VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
307 VPMINU %ymm0, %ymm4, %ymm4
308 VPMINU %ymm3, %ymm1, %ymm1
309 vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
310 VPMINU %ymm1, %ymm4, %ymm0
311 vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
312 VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
313 VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
314 VPMINU %ymm2, %ymm5, %ymm5
315 VPMINU %ymm3, %ymm6, %ymm6
316 VPMINU %ymm5, %ymm0, %ymm0
317 VPMINU %ymm6, %ymm0, %ymm0
318 VPCMPEQ %ymm7, %ymm0, %ymm0
320 /* Test each mask (32 bits) individually because for VEC_SIZE
321 == 32 it is not possible to OR the four masks and keep all bits
322 in a 64-bit integer register, differing from SSE2 strcmp
323 where ORing is possible. */
324 vpmovmskb %ymm0, %ecx
/* Something stopped the loop: retest vector 1 alone to locate it.  */
327 VPCMPEQ %ymm7, %ymm4, %ymm0
328 vpmovmskb %ymm0, %edi
332 # ifdef USE_AS_STRNCMP
335 # ifdef USE_AS_WCSCMP
338 movl (%rsi, %rcx), %edi
339 cmpl (%rdx, %rcx), %edi
342 movzbl (%rax, %rcx), %eax
343 movzbl (%rdx, %rcx), %edx
347 # ifdef USE_AS_WCSCMP
350 movl (%rsi, %rcx), %edi
351 cmpl (%rdx, %rcx), %edi
354 movzbl (%rax, %rcx), %eax
355 movzbl (%rdx, %rcx), %edx
364 # ifdef USE_AS_STRNCMP
365 /* The first vector matched. Return 0 if the maximum offset
366 (%r11) <= VEC_SIZE. */
/* Vector 1 clean: locate the event in vector 2.  */
370 VPCMPEQ %ymm7, %ymm1, %ymm1
371 vpmovmskb %ymm1, %ecx
375 # ifdef USE_AS_STRNCMP
379 # ifdef USE_AS_WCSCMP
382 movl (%rsi, %rdi), %ecx
383 cmpl (%rdx, %rdi), %ecx
386 movzbl (%rax, %rdi), %eax
387 movzbl (%rdx, %rdi), %edx
391 # ifdef USE_AS_WCSCMP
394 movl VEC_SIZE(%rsi, %rdi), %ecx
395 cmpl VEC_SIZE(%rdx, %rdi), %ecx
398 movzbl VEC_SIZE(%rax, %rdi), %eax
399 movzbl VEC_SIZE(%rdx, %rdi), %edx
408 # ifdef USE_AS_STRNCMP
409 /* The first 2 vectors matched. Return 0 if the maximum offset
410 (%r11) <= 2 * VEC_SIZE. */
411 cmpq $(VEC_SIZE * 2), %r11
/* Vectors 1-2 clean: locate the event in vector 3.  */
414 VPCMPEQ %ymm7, %ymm5, %ymm5
415 vpmovmskb %ymm5, %ecx
419 # ifdef USE_AS_STRNCMP
420 addq $(VEC_SIZE * 2), %rdi
423 # ifdef USE_AS_WCSCMP
426 movl (%rsi, %rdi), %ecx
427 cmpl (%rdx, %rdi), %ecx
430 movzbl (%rax, %rdi), %eax
431 movzbl (%rdx, %rdi), %edx
435 # ifdef USE_AS_WCSCMP
438 movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
439 cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
442 movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
443 movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
452 # ifdef USE_AS_STRNCMP
453 /* The first 3 vectors matched. Return 0 if the maximum offset
454 (%r11) <= 3 * VEC_SIZE. */
455 cmpq $(VEC_SIZE * 3), %r11
/* Vectors 1-3 clean: the event must be in vector 4.  */
458 VPCMPEQ %ymm7, %ymm6, %ymm6
459 vpmovmskb %ymm6, %esi
461 # ifdef USE_AS_STRNCMP
462 addq $(VEC_SIZE * 3), %rcx
465 # ifdef USE_AS_WCSCMP
468 movl (%rsi, %rcx), %esi
469 cmpl (%rdx, %rcx), %esi
472 movzbl (%rax, %rcx), %eax
473 movzbl (%rdx, %rcx), %edx
477 # ifdef USE_AS_WCSCMP
480 movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
481 cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
484 movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
485 movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
/* Page-cross handling inside the main loop: string 2 is about to cross
   a page, so loads are re-aligned via %rdx/%r10 and the leading ECX
   bytes (which belong to the previous iteration) are masked off.  */
496 /* Align load via RDX. We load the extra ECX bytes which should
498 andl $((VEC_SIZE * 4) - 1), %ecx
502 /* This works only if VEC_SIZE * 2 == 64. */
503 # if (VEC_SIZE * 2) != 64
504 # error (VEC_SIZE * 2) != 64
507 /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
508 cmpl $(VEC_SIZE * 2), %ecx
509 jge L(loop_cross_page_2_vec)
511 vmovdqu (%rax, %r10), %ymm2
512 vmovdqu VEC_SIZE(%rax, %r10), %ymm3
513 VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
514 VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
515 VPMINU %ymm2, %ymm0, %ymm0
516 VPMINU %ymm3, %ymm1, %ymm1
517 VPCMPEQ %ymm7, %ymm0, %ymm0
518 VPCMPEQ %ymm7, %ymm1, %ymm1
520 vpmovmskb %ymm0, %edi
521 vpmovmskb %ymm1, %esi
526 /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
530 je L(loop_cross_page_2_vec)
532 # ifdef USE_AS_STRNCMP
535 # ifdef USE_AS_WCSCMP
538 movl (%rsi, %rcx), %edi
539 cmpl (%rdx, %rcx), %edi
542 movzbl (%rax, %rcx), %eax
543 movzbl (%rdx, %rcx), %edx
547 # ifdef USE_AS_WCSCMP
550 movl (%rsi, %rcx), %edi
551 cmpl (%rdx, %rcx), %edi
554 movzbl (%rax, %rcx), %eax
555 movzbl (%rdx, %rcx), %edx
563 L(loop_cross_page_2_vec):
564 /* The first VEC_SIZE * 2 bytes match or are ignored. */
565 vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
566 vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
567 VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
568 VPMINU %ymm2, %ymm5, %ymm5
569 VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
570 VPCMPEQ %ymm7, %ymm5, %ymm5
571 VPMINU %ymm3, %ymm6, %ymm6
572 VPCMPEQ %ymm7, %ymm6, %ymm6
574 vpmovmskb %ymm5, %edi
575 vpmovmskb %ymm6, %esi
581 /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
582 subl $(VEC_SIZE * 2), %ecx
584 /* Skip ECX bytes. */
586 /* R8 has number of bytes skipped. */
589 /* Before jumping back to the loop, set ESI to the number of
590 VEC_SIZE * 4 blocks before page crossing. */
591 movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
597 /* Adjust for number of bytes skipped. */
599 # ifdef USE_AS_STRNCMP
600 addq $(VEC_SIZE * 2), %rcx
603 # ifdef USE_AS_WCSCMP
606 movl (%rsi, %rcx), %edi
607 cmpl (%rdx, %rcx), %edi
610 movzbl (%rax, %rcx), %eax
611 movzbl (%rdx, %rcx), %edx
615 # ifdef USE_AS_WCSCMP
618 movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
619 cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
622 movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
623 movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
/* Scalar fallback used near page boundaries: advance one char
   (SIZE_OF_CHAR bytes) per iteration until VEC_SIZE * 4 bytes have
   been handled, then re-enter the vectorized main loop.  */
632 /* Check one byte/dword at a time. */
633 # ifdef USE_AS_WCSCMP
639 addl $SIZE_OF_CHAR, %edx
640 cmpl $(VEC_SIZE * 4), %edx
641 je L(main_loop_header)
642 # ifdef USE_AS_STRNCMP
646 # ifdef USE_AS_WCSCMP
647 movl (%rdi, %rdx), %eax
648 movl (%rsi, %rdx), %ecx
650 movzbl (%rdi, %rdx), %eax
651 movzbl (%rsi, %rdx), %ecx
653 /* Check null char. */
655 jne L(cross_page_loop)
656 /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
659 # ifndef USE_AS_WCSCMP
665 # ifdef USE_AS_WCSCMP
668 /* Use movl to avoid modifying EFLAGS. */
677 # ifdef USE_AS_STRNCMP
686 # ifdef USE_AS_WCSCMP
704 # ifdef USE_AS_STRNCMP
708 # ifdef USE_AS_STRNCMP
712 # ifdef USE_AS_WCSCMP
714 movl (%rdi, %rdx), %ecx
715 cmpl (%rsi, %rdx), %ecx
718 movzbl (%rdi, %rdx), %eax
719 movzbl (%rsi, %rdx), %edx
725 /* Comparing on page boundary region requires special treatment:
726 It must be done one vector at a time, starting with the wider
727 ymm vector if possible, if not, with xmm. If fetching 16 bytes
728 (xmm) still passes the boundary, byte comparison must be done.
/* Each step below narrows the load width (32 -> 16 -> 8 -> 4 bytes,
   then per-char) so no load ever reads past the page boundary.  */
732 /* Try one ymm vector at a time. */
733 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
734 jg L(cross_page_1_vector)
736 vmovdqu (%rdi, %rdx), %ymm1
737 VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
738 VPMINU %ymm1, %ymm0, %ymm0
739 VPCMPEQ %ymm7, %ymm0, %ymm0
740 vpmovmskb %ymm0, %ecx
747 # ifdef USE_AS_STRNCMP
748 /* Return 0 if the current offset (%rdx) >= the maximum offset
753 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
755 L(cross_page_1_vector):
756 /* Less than 32 bytes to check, try one xmm vector. */
757 cmpl $(PAGE_SIZE - 16), %eax
758 jg L(cross_page_1_xmm)
759 vmovdqu (%rdi, %rdx), %xmm1
760 VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
761 VPMINU %xmm1, %xmm0, %xmm0
762 VPCMPEQ %xmm7, %xmm0, %xmm0
763 vpmovmskb %xmm0, %ecx
768 # ifndef USE_AS_WCSCMP
771 # ifdef USE_AS_STRNCMP
772 /* Return 0 if the current offset (%rdx) >= the maximum offset
779 # ifndef USE_AS_WCSCMP
780 /* Less than 16 bytes to check, try 8 byte vector. NB: No need
781 for wcscmp nor wcsncmp since wide char is 4 bytes. */
782 cmpl $(PAGE_SIZE - 8), %eax
783 jg L(cross_page_8bytes)
784 vmovq (%rdi, %rdx), %xmm1
785 vmovq (%rsi, %rdx), %xmm0
786 VPCMPEQ %xmm0, %xmm1, %xmm0
787 VPMINU %xmm1, %xmm0, %xmm0
788 VPCMPEQ %xmm7, %xmm0, %xmm0
789 vpmovmskb %xmm0, %ecx
790 /* Only last 8 bits are valid. */
797 # ifdef USE_AS_STRNCMP
798 /* Return 0 if the current offset (%rdx) >= the maximum offset
804 L(cross_page_8bytes):
805 /* Less than 8 bytes to check, try 4 byte vector. */
806 cmpl $(PAGE_SIZE - 4), %eax
807 jg L(cross_page_4bytes)
808 vmovd (%rdi, %rdx), %xmm1
809 vmovd (%rsi, %rdx), %xmm0
810 VPCMPEQ %xmm0, %xmm1, %xmm0
811 VPMINU %xmm1, %xmm0, %xmm0
812 VPCMPEQ %xmm7, %xmm0, %xmm0
813 vpmovmskb %xmm0, %ecx
814 /* Only last 4 bits are valid. */
820 # ifdef USE_AS_STRNCMP
821 /* Return 0 if the current offset (%rdx) >= the maximum offset
827 L(cross_page_4bytes):
829 /* Less than 4 bytes to check, try one byte/dword at a time. */
830 # ifdef USE_AS_STRNCMP
834 # ifdef USE_AS_WCSCMP
835 movl (%rdi, %rdx), %eax
836 movl (%rsi, %rdx), %ecx
838 movzbl (%rdi, %rdx), %eax
839 movzbl (%rsi, %rdx), %ecx
842 jne L(cross_page_loop)