2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
23 # define STRCMP_SSE42 __strcmp_sse42
26 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
27 # include "locale-defines.h"
30 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
31 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
32 if the new counter > the old one or is 0. The candidate new counter
   is formed in %r9 as %r11 + %rcx - 16 by the lea below. */
33 # define UPDATE_STRNCMP_COUNTER \
34 /* %r9 = %r11 - (16 - %rcx): number of bytes left to compare */ \
35 lea -16(%rcx, %r11), %r9; \
37 jb LABEL(strcmp_exitz); \
39 je LABEL(strcmp_exitz); \
42 # define UPDATE_STRNCMP_COUNTER
47 # define GLABEL(l) l##_avx
49 # define SECTION sse4.2
50 # define GLABEL(l) l##_sse42
53 #define LABEL(l) .L##l
57 | _SIDD_CMP_EQUAL_EACH
58 | _SIDD_NEGATIVE_POLARITY
59 | _SIDD_LEAST_SIGNIFICANT
60 on pcmpistri to find out if two 16byte data elements are the same
61 and the offset of the first different byte. There are 4 cases:
63 1. Both 16byte data elements are valid and identical.
64 2. Both 16byte data elements have EOS and are identical.
65 3. Both 16byte data elements are valid and they differ at offset X.
66 4. At least one 16byte data element has EOS at offset X. Two 16byte
67 data elements must differ at or before offset X.
69 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
71 case ECX CFlag ZFlag SFlag
77 We exit from the loop for cases 2, 3 and 4 with jbe which branches
78 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
81 /* Put all SSE 4.2 functions together. */
82 .section .text.SECTION,"ax",@progbits
84 .type STRCMP_SSE42, @function
87 #ifdef USE_AS_STRCASECMP_L
/* __strcasecmp entry stub: read the thread-local __libc_tsd_LOCALE slot
   (addressed relative to %fs) into %RDX_LP, then fall through into the
   code that follows -- presumably so that %RDX_LP serves as the third
   (locale) argument of strcasecmp_l; confirm against the callee.  */
88 ENTRY (GLABEL(__strcasecmp))
/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, from the GOT.  */
89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* %RDX_LP = value stored in that thread-local slot.  */
90 mov %fs:(%rax),%RDX_LP
92 // XXX: these 5 bytes should be placed before the function
/* 0f 1f 44 00 00 is the 5-byte x86 NOP encoding.  */
94 .byte 0x0f,0x1f,0x44,0x00,0x00
95 END (GLABEL(__strcasecmp))
96 /* FALLTHROUGH to strcasecmp_l. */
98 #ifdef USE_AS_STRNCASECMP_L
/* __strncasecmp entry stub: read the thread-local __libc_tsd_LOCALE slot
   (addressed relative to %fs) into %RCX_LP, then fall through into the
   code that follows -- presumably so that %RCX_LP serves as the fourth
   (locale) argument of strncasecmp_l; confirm against the callee.  */
99 ENTRY (GLABEL(__strncasecmp))
/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, from the GOT.  */
100 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* %RCX_LP = value stored in that thread-local slot.  */
101 mov %fs:(%rax),%RCX_LP
103 // XXX: these 5 bytes should be placed before the function
/* 0f 1f 44 00 00 is the 5-byte x86 NOP encoding.  */
105 .byte 0x0f,0x1f,0x44,0x00,0x00
106 END (GLABEL(__strncasecmp))
107 /* FALLTHROUGH to strncasecmp_l. */
/* Alias the SSE mnemonics used in this file to their v-prefixed spellings,
   so the same instruction text assembles to the v-prefixed forms.
   D(arg) expands to "arg, arg", i.e. it repeats an operand -- presumably
   to supply the extra operand that the three-operand v-prefixed forms
   take (TODO: confirm against the alternative definition of D).  */
112 # define movdqa vmovdqa
113 # define movdqu vmovdqu
114 # define pmovmskb vpmovmskb
115 # define pcmpistri vpcmpistri
116 # define psubb vpsubb
117 # define pcmpeqb vpcmpeqb
118 # define psrldq vpsrldq
119 # define pslldq vpslldq
120 # define palignr vpalignr
122 # define D(arg) arg, arg
132 * This implementation uses SSE to compare up to 16 bytes at a time.
134 #ifdef USE_AS_STRCASECMP_L
135 /* We have to fall back on the C implementation for locales
136 with encodings not matching ASCII for single bytes. */
137 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
138 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
142 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
143 jne __strcasecmp_l_nonascii
145 #ifdef USE_AS_STRNCASECMP_L
146 /* We have to fall back on the C implementation for locales
147 with encodings not matching ASCII for single bytes. */
148 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
149 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
153 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
154 jne __strncasecmp_l_nonascii
157 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
159 je LABEL(strcmp_exitz)
166 /* Use 64bit AND here to avoid long NOP padding. */
167 and $0x3f, %rcx /* rsi alignment in cache line */
168 and $0x3f, %rax /* rdi alignment in cache line */
169 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
170 .section .rodata.cst16,"aM",@progbits,16
173 .quad 0x4040404040404040
174 .quad 0x4040404040404040
177 .quad 0x5a5a5a5a5a5a5a5a
178 .quad 0x5a5a5a5a5a5a5a5a
180 .quad 0x5b5b5b5b5b5b5b5b
181 .quad 0x5b5b5b5b5b5b5b5b
184 .quad 0x2020202020202020
185 .quad 0x2020202020202020
187 movdqa LABEL(belowupper)(%rip), %xmm4
188 # define UCLOW_reg %xmm4
189 movdqa LABEL(topupper)(%rip), %xmm5
190 # define UCHIGH_reg %xmm5
191 movdqa LABEL(touppermask)(%rip), %xmm6
192 # define LCQWORD_reg %xmm6
195 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
197 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
200 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2), three-operand (v-prefixed) variant.
   For each of reg1 and reg2, build a per-byte mask that is set where the
   byte is greater than the matching byte of UCLOW_reg and NOT greater than
   the matching byte of UCHIGH_reg (signed byte compares), AND that mask
   with LCQWORD_reg, and OR the result back into the register.  Bytes
   outside that range are left unchanged.
   Scratch registers overwritten: %xmm7, %xmm8, %xmm9, %xmm10.  */
202 # define TOLOWER(reg1, reg2) \
203 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
204 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
205 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
206 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
207 vpandn %xmm7, %xmm8, %xmm8; \
208 vpandn %xmm9, %xmm10, %xmm10; \
209 vpand LCQWORD_reg, %xmm8, %xmm8; \
210 vpand LCQWORD_reg, %xmm10, %xmm10; \
211 vpor reg1, %xmm8, reg1; \
212 vpor reg2, %xmm10, reg2
/* TOLOWER (reg1, reg2), two-operand variant.
   Works on copies held in scratch registers %xmm7-%xmm10:
     %xmm7 / %xmm9  = per-byte mask "reg1 / reg2 > UCLOW_reg"
     %xmm8 / %xmm10 = per-byte mask "UCHIGH_reg > reg1 / reg2"
   (signed byte compares).  The range masks are then ANDed with
   LCQWORD_reg.  */
214 # define TOLOWER(reg1, reg2) \
215 movdqa reg1, %xmm7; \
216 movdqa UCHIGH_reg, %xmm8; \
217 movdqa reg2, %xmm9; \
218 movdqa UCHIGH_reg, %xmm10; \
219 pcmpgtb UCLOW_reg, %xmm7; \
220 pcmpgtb reg1, %xmm8; \
221 pcmpgtb UCLOW_reg, %xmm9; \
222 pcmpgtb reg2, %xmm10; \
224 pand %xmm10, %xmm9; \
225 pand LCQWORD_reg, %xmm7; \
226 pand LCQWORD_reg, %xmm9; \
230 TOLOWER (%xmm1, %xmm2)
232 # define TOLOWER(reg1, reg2)
234 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
235 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
236 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
237 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
239 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
240 jnz LABEL(less16bytes)/* If not, find different value or null char */
241 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
243 jbe LABEL(strcmp_exitz)/* finish comparison */
245 add $16, %rsi /* prepare to search next 16 bytes */
246 add $16, %rdi /* prepare to search next 16 bytes */
249 * Determine source and destination string offsets from 16-byte
250 * alignment. Use relative offset difference between the two to
251 * determine which case below to use.
255 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
256 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
257 mov $0xffff, %edx /* for equivalent offset */
259 and $0xf, %ecx /* offset of rsi */
260 and $0xf, %eax /* offset of rdi */
261 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
263 je LABEL(ashr_0) /* rsi and rdi relative offset same */
265 mov %edx, %r8d /* r8d is offset flag for exit tail */
273 lea LABEL(unaligned_table)(%rip), %r10
274 movslq (%r10, %r9,4), %r9
275 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
276 lea (%r10, %r9), %r10
277 jmp *%r10 /* jump to corresponding case */
280 * The following cases will be handled by ashr_0
281 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
282 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
288 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
289 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
290 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
293 TOLOWER (%xmm1, %xmm2)
294 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
296 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
298 shr %cl, %edx /* adjust 0xffff for offset */
299 shr %cl, %r9d /* adjust for 16-byte offset */
302 * edx must equal r9d if the remaining (16-rcx) bytes match the bytes
303 * starting at (16-rax) and no null char was seen.
305 jne LABEL(less32bytes) /* mismatch or null char */
306 UPDATE_STRNCMP_COUNTER
311 * Now both strings are aligned at a 16-byte boundary. Loop over the
312 * strings, checking 32 bytes per iteration.
314 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
317 movdqa (%rdi,%rdx), %xmm0
318 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
319 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
321 movdqa (%rsi,%rdx), %xmm1
322 TOLOWER (%xmm0, %xmm1)
323 pcmpistri $0x1a, %xmm1, %xmm0
326 jbe LABEL(ashr_0_exit_use)
327 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
329 jbe LABEL(strcmp_exitz)
332 movdqa (%rdi,%rdx), %xmm0
333 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
334 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
336 movdqa (%rsi,%rdx), %xmm1
337 TOLOWER (%xmm0, %xmm1)
338 pcmpistri $0x1a, %xmm1, %xmm0
341 jbe LABEL(ashr_0_exit_use)
342 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
344 jbe LABEL(strcmp_exitz)
346 jmp LABEL(ashr_0_use)
350 LABEL(ashr_0_exit_use):
351 jnc LABEL(strcmp_exitz)
352 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
354 jbe LABEL(strcmp_exitz)
356 lea -16(%rdx, %rcx), %rcx
357 movzbl (%rdi, %rcx), %eax
358 movzbl (%rsi, %rcx), %edx
359 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
360 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
361 movl (%rcx,%rax,4), %eax
362 movl (%rcx,%rdx,4), %edx
370 * The following cases will be handled by ashr_1
371 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
372 * n(15) n -15 0(15 +(n-15) - n) ashr_1
376 pslldq $15, D(%xmm2) /* shift first string to align with second */
377 TOLOWER (%xmm1, %xmm2)
378 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
379 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
381 shr %cl, %edx /* adjust 0xffff for offset */
382 shr %cl, %r9d /* adjust for 16-byte offset */
384 jnz LABEL(less32bytes) /* mismatch or null char seen */
386 UPDATE_STRNCMP_COUNTER
388 mov $16, %rcx /* index for loads*/
389 mov $1, %r9d /* byte position left over from less32bytes case */
391 * Setting up %r10 this way lets us detect crossing a page boundary.
392 * When %r10 goes positive we have crossed a page boundary and
393 * need to do a nibble.
396 and $0xfff, %r10 /* offset into 4K page */
397 sub $0x1000, %r10 /* subtract 4K pagesize */
398 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
401 LABEL(loop_ashr_1_use):
403 jg LABEL(nibble_ashr_1_use)
405 LABEL(nibble_ashr_1_restart_use):
406 movdqa (%rdi, %rdx), %xmm0
407 palignr $1, -16(%rdi, %rdx), D(%xmm0)
408 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
409 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
411 movdqa (%rsi,%rdx), %xmm1
412 TOLOWER (%xmm0, %xmm1)
413 pcmpistri $0x1a, %xmm1, %xmm0
416 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
418 jbe LABEL(strcmp_exitz)
423 jg LABEL(nibble_ashr_1_use)
425 movdqa (%rdi, %rdx), %xmm0
426 palignr $1, -16(%rdi, %rdx), D(%xmm0)
427 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
428 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
430 movdqa (%rsi,%rdx), %xmm1
431 TOLOWER (%xmm0, %xmm1)
432 pcmpistri $0x1a, %xmm1, %xmm0
435 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
437 jbe LABEL(strcmp_exitz)
440 jmp LABEL(loop_ashr_1_use)
443 LABEL(nibble_ashr_1_use):
445 movdqa -16(%rdi, %rdx), %xmm0
447 pcmpistri $0x3a,%xmm0, %xmm0
448 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
450 jae LABEL(nibble_ashr_exit_use)
453 ja LABEL(nibble_ashr_1_restart_use)
455 jmp LABEL(nibble_ashr_exit_use)
458 * The following cases will be handled by ashr_2
459 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
460 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
465 TOLOWER (%xmm1, %xmm2)
466 pcmpeqb %xmm1, D(%xmm2)
467 psubb %xmm0, D(%xmm2)
472 jnz LABEL(less32bytes)
474 UPDATE_STRNCMP_COUNTER
476 mov $16, %rcx /* index for loads */
477 mov $2, %r9d /* byte position left over from less32bytes case */
479 * Setting up %r10 this way lets us detect crossing a page boundary.
480 * When %r10 goes positive we have crossed a page boundary and
481 * need to do a nibble.
484 and $0xfff, %r10 /* offset into 4K page */
485 sub $0x1000, %r10 /* subtract 4K pagesize */
486 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
489 LABEL(loop_ashr_2_use):
491 jg LABEL(nibble_ashr_2_use)
493 LABEL(nibble_ashr_2_restart_use):
494 movdqa (%rdi, %rdx), %xmm0
495 palignr $2, -16(%rdi, %rdx), D(%xmm0)
496 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
497 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
499 movdqa (%rsi,%rdx), %xmm1
500 TOLOWER (%xmm0, %xmm1)
501 pcmpistri $0x1a, %xmm1, %xmm0
504 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
506 jbe LABEL(strcmp_exitz)
511 jg LABEL(nibble_ashr_2_use)
513 movdqa (%rdi, %rdx), %xmm0
514 palignr $2, -16(%rdi, %rdx), D(%xmm0)
515 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
516 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
518 movdqa (%rsi,%rdx), %xmm1
519 TOLOWER (%xmm0, %xmm1)
520 pcmpistri $0x1a, %xmm1, %xmm0
523 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
525 jbe LABEL(strcmp_exitz)
528 jmp LABEL(loop_ashr_2_use)
531 LABEL(nibble_ashr_2_use):
533 movdqa -16(%rdi, %rdx), %xmm0
535 pcmpistri $0x3a,%xmm0, %xmm0
536 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
538 jae LABEL(nibble_ashr_exit_use)
541 ja LABEL(nibble_ashr_2_restart_use)
543 jmp LABEL(nibble_ashr_exit_use)
546 * The following cases will be handled by ashr_3
547 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
548 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
553 TOLOWER (%xmm1, %xmm2)
554 pcmpeqb %xmm1, D(%xmm2)
555 psubb %xmm0, D(%xmm2)
560 jnz LABEL(less32bytes)
563 UPDATE_STRNCMP_COUNTER
565 mov $16, %rcx /* index for loads */
566 mov $3, %r9d /* byte position left over from less32bytes case */
568 * Setting up %r10 this way lets us detect crossing a page boundary.
569 * When %r10 goes positive we have crossed a page boundary and
570 * need to do a nibble.
573 and $0xfff, %r10 /* offset into 4K page */
574 sub $0x1000, %r10 /* subtract 4K pagesize */
575 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
577 LABEL(loop_ashr_3_use):
579 jg LABEL(nibble_ashr_3_use)
581 LABEL(nibble_ashr_3_restart_use):
582 movdqa (%rdi, %rdx), %xmm0
583 palignr $3, -16(%rdi, %rdx), D(%xmm0)
584 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
585 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
587 movdqa (%rsi,%rdx), %xmm1
588 TOLOWER (%xmm0, %xmm1)
589 pcmpistri $0x1a, %xmm1, %xmm0
592 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
594 jbe LABEL(strcmp_exitz)
599 jg LABEL(nibble_ashr_3_use)
601 movdqa (%rdi, %rdx), %xmm0
602 palignr $3, -16(%rdi, %rdx), D(%xmm0)
603 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
604 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
606 movdqa (%rsi,%rdx), %xmm1
607 TOLOWER (%xmm0, %xmm1)
608 pcmpistri $0x1a, %xmm1, %xmm0
611 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
613 jbe LABEL(strcmp_exitz)
616 jmp LABEL(loop_ashr_3_use)
619 LABEL(nibble_ashr_3_use):
621 movdqa -16(%rdi, %rdx), %xmm0
623 pcmpistri $0x3a,%xmm0, %xmm0
624 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
626 jae LABEL(nibble_ashr_exit_use)
629 ja LABEL(nibble_ashr_3_restart_use)
631 jmp LABEL(nibble_ashr_exit_use)
634 * The following cases will be handled by ashr_4
635 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
636 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
641 TOLOWER (%xmm1, %xmm2)
642 pcmpeqb %xmm1, D(%xmm2)
643 psubb %xmm0, D(%xmm2)
648 jnz LABEL(less32bytes)
651 UPDATE_STRNCMP_COUNTER
653 mov $16, %rcx /* index for loads */
654 mov $4, %r9d /* byte position left over from less32bytes case */
656 * Setting up %r10 this way lets us detect crossing a page boundary.
657 * When %r10 goes positive we have crossed a page boundary and
658 * need to do a nibble.
661 and $0xfff, %r10 /* offset into 4K page */
662 sub $0x1000, %r10 /* subtract 4K pagesize */
663 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
666 LABEL(loop_ashr_4_use):
668 jg LABEL(nibble_ashr_4_use)
670 LABEL(nibble_ashr_4_restart_use):
671 movdqa (%rdi, %rdx), %xmm0
672 palignr $4, -16(%rdi, %rdx), D(%xmm0)
673 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
674 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
676 movdqa (%rsi,%rdx), %xmm1
677 TOLOWER (%xmm0, %xmm1)
678 pcmpistri $0x1a, %xmm1, %xmm0
681 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
683 jbe LABEL(strcmp_exitz)
688 jg LABEL(nibble_ashr_4_use)
690 movdqa (%rdi, %rdx), %xmm0
691 palignr $4, -16(%rdi, %rdx), D(%xmm0)
692 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
693 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
695 movdqa (%rsi,%rdx), %xmm1
696 TOLOWER (%xmm0, %xmm1)
697 pcmpistri $0x1a, %xmm1, %xmm0
700 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
702 jbe LABEL(strcmp_exitz)
705 jmp LABEL(loop_ashr_4_use)
708 LABEL(nibble_ashr_4_use):
710 movdqa -16(%rdi, %rdx), %xmm0
712 pcmpistri $0x3a,%xmm0, %xmm0
713 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
715 jae LABEL(nibble_ashr_exit_use)
718 ja LABEL(nibble_ashr_4_restart_use)
720 jmp LABEL(nibble_ashr_exit_use)
723 * The following cases will be handled by ashr_5
724 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
725 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
730 TOLOWER (%xmm1, %xmm2)
731 pcmpeqb %xmm1, D(%xmm2)
732 psubb %xmm0, D(%xmm2)
737 jnz LABEL(less32bytes)
740 UPDATE_STRNCMP_COUNTER
742 mov $16, %rcx /* index for loads */
743 mov $5, %r9d /* byte position left over from less32bytes case */
745 * Setting up %r10 this way lets us detect crossing a page boundary.
746 * When %r10 goes positive we have crossed a page boundary and
747 * need to do a nibble.
750 and $0xfff, %r10 /* offset into 4K page */
751 sub $0x1000, %r10 /* subtract 4K pagesize */
752 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
755 LABEL(loop_ashr_5_use):
757 jg LABEL(nibble_ashr_5_use)
759 LABEL(nibble_ashr_5_restart_use):
760 movdqa (%rdi, %rdx), %xmm0
761 palignr $5, -16(%rdi, %rdx), D(%xmm0)
762 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
763 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
765 movdqa (%rsi,%rdx), %xmm1
766 TOLOWER (%xmm0, %xmm1)
767 pcmpistri $0x1a, %xmm1, %xmm0
770 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
772 jbe LABEL(strcmp_exitz)
777 jg LABEL(nibble_ashr_5_use)
779 movdqa (%rdi, %rdx), %xmm0
781 palignr $5, -16(%rdi, %rdx), D(%xmm0)
782 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
783 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
785 movdqa (%rsi,%rdx), %xmm1
786 TOLOWER (%xmm0, %xmm1)
787 pcmpistri $0x1a, %xmm1, %xmm0
790 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
792 jbe LABEL(strcmp_exitz)
795 jmp LABEL(loop_ashr_5_use)
798 LABEL(nibble_ashr_5_use):
800 movdqa -16(%rdi, %rdx), %xmm0
802 pcmpistri $0x3a,%xmm0, %xmm0
803 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
805 jae LABEL(nibble_ashr_exit_use)
808 ja LABEL(nibble_ashr_5_restart_use)
810 jmp LABEL(nibble_ashr_exit_use)
813 * The following cases will be handled by ashr_6
814 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
815 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
820 TOLOWER (%xmm1, %xmm2)
821 pcmpeqb %xmm1, D(%xmm2)
822 psubb %xmm0, D(%xmm2)
827 jnz LABEL(less32bytes)
830 UPDATE_STRNCMP_COUNTER
832 mov $16, %rcx /* index for loads */
833 mov $6, %r9d /* byte position left over from less32bytes case */
835 * Setting up %r10 this way lets us detect crossing a page boundary.
836 * When %r10 goes positive we have crossed a page boundary and
837 * need to do a nibble.
840 and $0xfff, %r10 /* offset into 4K page */
841 sub $0x1000, %r10 /* subtract 4K pagesize */
842 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
845 LABEL(loop_ashr_6_use):
847 jg LABEL(nibble_ashr_6_use)
849 LABEL(nibble_ashr_6_restart_use):
850 movdqa (%rdi, %rdx), %xmm0
851 palignr $6, -16(%rdi, %rdx), D(%xmm0)
852 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
853 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
855 movdqa (%rsi,%rdx), %xmm1
856 TOLOWER (%xmm0, %xmm1)
857 pcmpistri $0x1a, %xmm1, %xmm0
860 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
862 jbe LABEL(strcmp_exitz)
867 jg LABEL(nibble_ashr_6_use)
869 movdqa (%rdi, %rdx), %xmm0
870 palignr $6, -16(%rdi, %rdx), D(%xmm0)
871 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
872 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
874 movdqa (%rsi,%rdx), %xmm1
875 TOLOWER (%xmm0, %xmm1)
876 pcmpistri $0x1a, %xmm1, %xmm0
879 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
881 jbe LABEL(strcmp_exitz)
884 jmp LABEL(loop_ashr_6_use)
887 LABEL(nibble_ashr_6_use):
889 movdqa -16(%rdi, %rdx), %xmm0
891 pcmpistri $0x3a,%xmm0, %xmm0
892 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
894 jae LABEL(nibble_ashr_exit_use)
897 ja LABEL(nibble_ashr_6_restart_use)
899 jmp LABEL(nibble_ashr_exit_use)
902 * The following cases will be handled by ashr_7
903 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
904 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
909 TOLOWER (%xmm1, %xmm2)
910 pcmpeqb %xmm1, D(%xmm2)
911 psubb %xmm0, D(%xmm2)
916 jnz LABEL(less32bytes)
919 UPDATE_STRNCMP_COUNTER
921 mov $16, %rcx /* index for loads */
922 mov $7, %r9d /* byte position left over from less32bytes case */
924 * Setting up %r10 this way lets us detect crossing a page boundary.
925 * When %r10 goes positive we have crossed a page boundary and
926 * need to do a nibble.
929 and $0xfff, %r10 /* offset into 4K page */
930 sub $0x1000, %r10 /* subtract 4K pagesize */
931 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
934 LABEL(loop_ashr_7_use):
936 jg LABEL(nibble_ashr_7_use)
938 LABEL(nibble_ashr_7_restart_use):
939 movdqa (%rdi, %rdx), %xmm0
940 palignr $7, -16(%rdi, %rdx), D(%xmm0)
941 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
942 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
944 movdqa (%rsi,%rdx), %xmm1
945 TOLOWER (%xmm0, %xmm1)
946 pcmpistri $0x1a, %xmm1, %xmm0
949 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
951 jbe LABEL(strcmp_exitz)
956 jg LABEL(nibble_ashr_7_use)
958 movdqa (%rdi, %rdx), %xmm0
959 palignr $7, -16(%rdi, %rdx), D(%xmm0)
960 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
961 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
963 movdqa (%rsi,%rdx), %xmm1
964 TOLOWER (%xmm0, %xmm1)
965 pcmpistri $0x1a, %xmm1, %xmm0
968 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
970 jbe LABEL(strcmp_exitz)
973 jmp LABEL(loop_ashr_7_use)
976 LABEL(nibble_ashr_7_use):
978 movdqa -16(%rdi, %rdx), %xmm0
980 pcmpistri $0x3a,%xmm0, %xmm0
981 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
983 jae LABEL(nibble_ashr_exit_use)
986 ja LABEL(nibble_ashr_7_restart_use)
988 jmp LABEL(nibble_ashr_exit_use)
991 * The following cases will be handled by ashr_8
992 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
993 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
998 TOLOWER (%xmm1, %xmm2)
999 pcmpeqb %xmm1, D(%xmm2)
1000 psubb %xmm0, D(%xmm2)
1001 pmovmskb %xmm2, %r9d
1005 jnz LABEL(less32bytes)
1006 movdqa (%rdi), %xmm3
1008 UPDATE_STRNCMP_COUNTER
1010 mov $16, %rcx /* index for loads */
1011 mov $8, %r9d /* byte position left over from less32bytes case */
1013 * Setting up %r10 this way lets us detect crossing a page boundary.
1014 * When %r10 goes positive we have crossed a page boundary and
1015 * need to do a nibble.
1018 and $0xfff, %r10 /* offset into 4K page */
1019 sub $0x1000, %r10 /* subtract 4K pagesize */
1020 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1023 LABEL(loop_ashr_8_use):
1025 jg LABEL(nibble_ashr_8_use)
1027 LABEL(nibble_ashr_8_restart_use):
1028 movdqa (%rdi, %rdx), %xmm0
1029 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1030 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1031 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1033 movdqa (%rsi,%rdx), %xmm1
1034 TOLOWER (%xmm0, %xmm1)
1035 pcmpistri $0x1a, %xmm1, %xmm0
1038 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1040 jbe LABEL(strcmp_exitz)
1045 jg LABEL(nibble_ashr_8_use)
1047 movdqa (%rdi, %rdx), %xmm0
1048 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1049 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1050 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1052 movdqa (%rsi,%rdx), %xmm1
1053 TOLOWER (%xmm0, %xmm1)
1054 pcmpistri $0x1a, %xmm1, %xmm0
1057 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1059 jbe LABEL(strcmp_exitz)
1062 jmp LABEL(loop_ashr_8_use)
1065 LABEL(nibble_ashr_8_use):
1067 movdqa -16(%rdi, %rdx), %xmm0
1069 pcmpistri $0x3a,%xmm0, %xmm0
1070 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1072 jae LABEL(nibble_ashr_exit_use)
1075 ja LABEL(nibble_ashr_8_restart_use)
1077 jmp LABEL(nibble_ashr_exit_use)
1080 * The following cases will be handled by ashr_9
1081 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1082 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1087 TOLOWER (%xmm1, %xmm2)
1088 pcmpeqb %xmm1, D(%xmm2)
1089 psubb %xmm0, D(%xmm2)
1090 pmovmskb %xmm2, %r9d
1094 jnz LABEL(less32bytes)
1095 movdqa (%rdi), %xmm3
1097 UPDATE_STRNCMP_COUNTER
1099 mov $16, %rcx /* index for loads */
1100 mov $9, %r9d /* byte position left over from less32bytes case */
1102 * Setting up %r10 this way lets us detect crossing a page boundary.
1103 * When %r10 goes positive we have crossed a page boundary and
1104 * need to do a nibble.
1107 and $0xfff, %r10 /* offset into 4K page */
1108 sub $0x1000, %r10 /* subtract 4K pagesize */
1109 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1112 LABEL(loop_ashr_9_use):
1114 jg LABEL(nibble_ashr_9_use)
1116 LABEL(nibble_ashr_9_restart_use):
1117 movdqa (%rdi, %rdx), %xmm0
1119 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1120 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1121 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1123 movdqa (%rsi,%rdx), %xmm1
1124 TOLOWER (%xmm0, %xmm1)
1125 pcmpistri $0x1a, %xmm1, %xmm0
1128 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1130 jbe LABEL(strcmp_exitz)
1135 jg LABEL(nibble_ashr_9_use)
1137 movdqa (%rdi, %rdx), %xmm0
1138 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1139 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1140 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1142 movdqa (%rsi,%rdx), %xmm1
1143 TOLOWER (%xmm0, %xmm1)
1144 pcmpistri $0x1a, %xmm1, %xmm0
1147 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1149 jbe LABEL(strcmp_exitz)
1152 jmp LABEL(loop_ashr_9_use)
1155 LABEL(nibble_ashr_9_use):
1157 movdqa -16(%rdi, %rdx), %xmm0
1159 pcmpistri $0x3a,%xmm0, %xmm0
1160 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1162 jae LABEL(nibble_ashr_exit_use)
1165 ja LABEL(nibble_ashr_9_restart_use)
1167 jmp LABEL(nibble_ashr_exit_use)
1170 * The following cases will be handled by ashr_10
1171 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1172 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1177 TOLOWER (%xmm1, %xmm2)
1178 pcmpeqb %xmm1, D(%xmm2)
1179 psubb %xmm0, D(%xmm2)
1180 pmovmskb %xmm2, %r9d
1184 jnz LABEL(less32bytes)
1185 movdqa (%rdi), %xmm3
1187 UPDATE_STRNCMP_COUNTER
1189 mov $16, %rcx /* index for loads */
1190 mov $10, %r9d /* byte position left over from less32bytes case */
1192 * Setting up %r10 this way lets us detect crossing a page boundary.
1193 * When %r10 goes positive we have crossed a page boundary and
1194 * need to do a nibble.
1197 and $0xfff, %r10 /* offset into 4K page */
1198 sub $0x1000, %r10 /* subtract 4K pagesize */
1199 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1202 LABEL(loop_ashr_10_use):
1204 jg LABEL(nibble_ashr_10_use)
1206 LABEL(nibble_ashr_10_restart_use):
1207 movdqa (%rdi, %rdx), %xmm0
1208 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1209 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1210 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1212 movdqa (%rsi,%rdx), %xmm1
1213 TOLOWER (%xmm0, %xmm1)
1214 pcmpistri $0x1a, %xmm1, %xmm0
1217 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1219 jbe LABEL(strcmp_exitz)
1224 jg LABEL(nibble_ashr_10_use)
1226 movdqa (%rdi, %rdx), %xmm0
1227 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1228 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1229 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1231 movdqa (%rsi,%rdx), %xmm1
1232 TOLOWER (%xmm0, %xmm1)
1233 pcmpistri $0x1a, %xmm1, %xmm0
1236 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1238 jbe LABEL(strcmp_exitz)
1241 jmp LABEL(loop_ashr_10_use)
1244 LABEL(nibble_ashr_10_use):
1246 movdqa -16(%rdi, %rdx), %xmm0
1247 psrldq $10, D(%xmm0)
1248 pcmpistri $0x3a,%xmm0, %xmm0
1249 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1251 jae LABEL(nibble_ashr_exit_use)
1254 ja LABEL(nibble_ashr_10_restart_use)
1256 jmp LABEL(nibble_ashr_exit_use)
1259 * The following cases will be handled by ashr_11
1260 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1261 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1266 TOLOWER (%xmm1, %xmm2)
1267 pcmpeqb %xmm1, D(%xmm2)
1268 psubb %xmm0, D(%xmm2)
1269 pmovmskb %xmm2, %r9d
1273 jnz LABEL(less32bytes)
1274 movdqa (%rdi), %xmm3
1276 UPDATE_STRNCMP_COUNTER
1278 mov $16, %rcx /* index for loads */
1279 mov $11, %r9d /* byte position left over from less32bytes case */
1281 * Setting up %r10 this way lets us detect crossing a page boundary.
1282 * When %r10 goes positive we have crossed a page boundary and
1283 * need to do a nibble.
1286 and $0xfff, %r10 /* offset into 4K page */
1287 sub $0x1000, %r10 /* subtract 4K pagesize */
1288 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1291 LABEL(loop_ashr_11_use):
1293 jg LABEL(nibble_ashr_11_use)
1295 LABEL(nibble_ashr_11_restart_use):
1296 movdqa (%rdi, %rdx), %xmm0
1297 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1298 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1299 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1301 movdqa (%rsi,%rdx), %xmm1
1302 TOLOWER (%xmm0, %xmm1)
1303 pcmpistri $0x1a, %xmm1, %xmm0
1306 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1308 jbe LABEL(strcmp_exitz)
1313 jg LABEL(nibble_ashr_11_use)
1315 movdqa (%rdi, %rdx), %xmm0
1316 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1317 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1318 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1320 movdqa (%rsi,%rdx), %xmm1
1321 TOLOWER (%xmm0, %xmm1)
1322 pcmpistri $0x1a, %xmm1, %xmm0
1325 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1327 jbe LABEL(strcmp_exitz)
1330 jmp LABEL(loop_ashr_11_use)
1333 LABEL(nibble_ashr_11_use):
1335 movdqa -16(%rdi, %rdx), %xmm0
1336 psrldq $11, D(%xmm0)
1337 pcmpistri $0x3a,%xmm0, %xmm0
1338 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1340 jae LABEL(nibble_ashr_exit_use)
1343 ja LABEL(nibble_ashr_11_restart_use)
1345 jmp LABEL(nibble_ashr_exit_use)
1348 * The following cases will be handled by ashr_12
1349 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1350 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1355 TOLOWER (%xmm1, %xmm2)
1356 pcmpeqb %xmm1, D(%xmm2)
1357 psubb %xmm0, D(%xmm2)
1358 pmovmskb %xmm2, %r9d
1362 jnz LABEL(less32bytes)
1363 movdqa (%rdi), %xmm3
1365 UPDATE_STRNCMP_COUNTER
1367 mov $16, %rcx /* index for loads */
1368 mov $12, %r9d /* byte position left over from less32bytes case */
1370 * Setting up %r10 this way lets us detect crossing a page boundary.
1371 * When %r10 goes positive we have crossed a page boundary and
1372 * need to do a nibble.
1375 and $0xfff, %r10 /* offset into 4K page */
1376 sub $0x1000, %r10 /* subtract 4K pagesize */
1377 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main 16-bytes-per-step compare loop for the shift-by-12 case.  The visible
   body contains the same load/palignr/pcmpistri group twice before jumping
   back to the top, i.e. it is unrolled by two.
   NOTE(review): this listing omits lines (the #else/#endif separating the
   case-sensitive and case-insensitive variants, and the instructions that
   set the flags tested by jg/jbe), so those conditions are not described
   here -- confirm against the complete file.  */
1380 LABEL(loop_ashr_12_use):
/* Leaves the loop for the page-boundary check.  */
1382 jg LABEL(nibble_ashr_12_use)
/* Re-entry point used by nibble_ashr_12_use when the loop may continue.  */
1384 LABEL(nibble_ashr_12_restart_use):
1385 movdqa (%rdi, %rdx), %xmm0
/* Combine two adjacent aligned blocks of %rdi: the top 4 bytes of the
   previous block followed by the low 12 bytes of the current one.  */
1386 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1387 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variant: compare directly against memory at (%rsi,%rdx).  */
1388 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variant: load, apply TOLOWER (macro defined outside this
   excerpt) to both operands, then compare registers.  */
1390 movdqa (%rsi,%rdx), %xmm1
1391 TOLOWER (%xmm0, %xmm1)
1392 pcmpistri $0x1a, %xmm1, %xmm0
1395 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1397 jbe LABEL(strcmp_exitz)
/* Second, identical iteration of the unrolled body.  */
1402 jg LABEL(nibble_ashr_12_use)
1404 movdqa (%rdi, %rdx), %xmm0
1405 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1406 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1407 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1409 movdqa (%rsi,%rdx), %xmm1
1410 TOLOWER (%xmm0, %xmm1)
1411 pcmpistri $0x1a, %xmm1, %xmm0
1414 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1416 jbe LABEL(strcmp_exitz)
1419 jmp LABEL(loop_ashr_12_use)
/* Page-boundary ("nibble") check for the shift-by-12 case.  It looks at the
   upper bytes of the 16-byte block at -16(%rdi,%rdx) before deciding whether
   to resume the loop or leave through nibble_ashr_exit_use.
   NOTE(review): this listing omits lines (e.g. the compare instructions that
   set the flags tested by jae/ja below), so the exact branch conditions are
   not documented here -- confirm against the complete file.  */
1422 LABEL(nibble_ashr_12_use):
1424 movdqa -16(%rdi, %rdx), %xmm0
/* Byte-wise right shift by 12: the top 4 bytes of the block move to the low
   lanes and the rest is zero-filled.  D() is a macro defined outside this
   excerpt.  */
1425 psrldq $12, D(%xmm0)
/* %xmm0 is compared with itself; with control byte 0x3a the index returned
   in %ecx is that of the first zero byte in %xmm0 (at most 4 here because of
   the zero fill).  */
1426 pcmpistri $0x3a,%xmm0, %xmm0
1427 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1429 jae LABEL(nibble_ashr_exit_use)
1432 ja LABEL(nibble_ashr_12_restart_use)
1434 jmp LABEL(nibble_ashr_exit_use)
1437 * The following cases will be handled by ashr_13
1438 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1439 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
/* Set-up code for the shift-by-13 case (see the existing header comment).
   NOTE(review): the entry label and some instructions of this sequence are
   not visible in this excerpt; the notes below cover only the visible lines.
   TOLOWER and D() are macros defined outside this excerpt.  */
1444 TOLOWER (%xmm1, %xmm2)
/* Per-byte equality of %xmm1 and %xmm2 (0xff where equal), minus %xmm0
   (computed before this excerpt -- presumably the NUL-byte mask, TODO
   confirm); pmovmskb then gathers the byte sign bits into %r9d.  */
1445 pcmpeqb %xmm1, D(%xmm2)
1446 psubb %xmm0, D(%xmm2)
1447 pmovmskb %xmm2, %r9d
/* The flags tested by this jnz are set by instructions not shown here.  */
1451 jnz LABEL(less32bytes)
1452 movdqa (%rdi), %xmm3
/* Expands to nothing unless USE_AS_STRNCMP or USE_AS_STRNCASECMP_L is
   defined; in those builds it updates the remaining-length counter and may
   branch to strcmp_exitz (see the macro definition near the top of the file).  */
1454 UPDATE_STRNCMP_COUNTER
1456 mov $16, %rcx /* index for loads */
1457 mov $13, %r9d /* byte position left over from less32bytes case */
1459 * Setup %r10 value allows us to detect crossing a page boundary.
1460 * When %r10 goes positive we have crossed a page boundary and
1461 * need to do a nibble.
1464 and $0xfff, %r10 /* offset into 4K page */
1465 sub $0x1000, %r10 /* subtract 4K pagesize */
1467 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main 16-bytes-per-step compare loop for the shift-by-13 case.  The visible
   body contains the same load/palignr/pcmpistri group twice before jumping
   back to the top, i.e. it is unrolled by two.
   NOTE(review): this listing omits lines (the #else/#endif separating the
   case-sensitive and case-insensitive variants, and the instructions that
   set the flags tested by jg/jbe), so those conditions are not described
   here -- confirm against the complete file.  */
1470 LABEL(loop_ashr_13_use):
/* Leaves the loop for the page-boundary check.  */
1472 jg LABEL(nibble_ashr_13_use)
/* Re-entry point used by nibble_ashr_13_use when the loop may continue.  */
1474 LABEL(nibble_ashr_13_restart_use):
1475 movdqa (%rdi, %rdx), %xmm0
/* Combine two adjacent aligned blocks of %rdi: the top 3 bytes of the
   previous block followed by the low 13 bytes of the current one.  */
1476 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1477 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variant: compare directly against memory at (%rsi,%rdx).  */
1478 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variant: load, apply TOLOWER (macro defined outside this
   excerpt) to both operands, then compare registers.  */
1480 movdqa (%rsi,%rdx), %xmm1
1481 TOLOWER (%xmm0, %xmm1)
1482 pcmpistri $0x1a, %xmm1, %xmm0
1485 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1487 jbe LABEL(strcmp_exitz)
/* Second, identical iteration of the unrolled body.  */
1492 jg LABEL(nibble_ashr_13_use)
1494 movdqa (%rdi, %rdx), %xmm0
1495 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1496 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1497 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1499 movdqa (%rsi,%rdx), %xmm1
1500 TOLOWER (%xmm0, %xmm1)
1501 pcmpistri $0x1a, %xmm1, %xmm0
1504 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1506 jbe LABEL(strcmp_exitz)
1509 jmp LABEL(loop_ashr_13_use)
/* Page-boundary ("nibble") check for the shift-by-13 case.  It looks at the
   upper bytes of the 16-byte block at -16(%rdi,%rdx) before deciding whether
   to resume the loop or leave through nibble_ashr_exit_use.
   NOTE(review): this listing omits lines (e.g. the compare instructions that
   set the flags tested by jae/ja below), so the exact branch conditions are
   not documented here -- confirm against the complete file.  */
1512 LABEL(nibble_ashr_13_use):
1514 movdqa -16(%rdi, %rdx), %xmm0
/* Byte-wise right shift by 13: the top 3 bytes of the block move to the low
   lanes and the rest is zero-filled.  D() is a macro defined outside this
   excerpt.  */
1515 psrldq $13, D(%xmm0)
/* %xmm0 is compared with itself; with control byte 0x3a the index returned
   in %ecx is that of the first zero byte in %xmm0 (at most 3 here because of
   the zero fill).  */
1516 pcmpistri $0x3a,%xmm0, %xmm0
1517 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1519 jae LABEL(nibble_ashr_exit_use)
1522 ja LABEL(nibble_ashr_13_restart_use)
1524 jmp LABEL(nibble_ashr_exit_use)
1527 * The following cases will be handled by ashr_14
1528 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1529 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
/* Set-up code for the shift-by-14 case (see the existing header comment).
   NOTE(review): the entry label and some instructions of this sequence are
   not visible in this excerpt; the notes below cover only the visible lines.
   TOLOWER and D() are macros defined outside this excerpt.  */
1534 TOLOWER (%xmm1, %xmm2)
/* Per-byte equality of %xmm1 and %xmm2 (0xff where equal), minus %xmm0
   (computed before this excerpt -- presumably the NUL-byte mask, TODO
   confirm); pmovmskb then gathers the byte sign bits into %r9d.  */
1535 pcmpeqb %xmm1, D(%xmm2)
1536 psubb %xmm0, D(%xmm2)
1537 pmovmskb %xmm2, %r9d
/* The flags tested by this jnz are set by instructions not shown here.  */
1541 jnz LABEL(less32bytes)
1542 movdqa (%rdi), %xmm3
/* Expands to nothing unless USE_AS_STRNCMP or USE_AS_STRNCASECMP_L is
   defined; in those builds it updates the remaining-length counter and may
   branch to strcmp_exitz (see the macro definition near the top of the file).  */
1544 UPDATE_STRNCMP_COUNTER
1546 mov $16, %rcx /* index for loads */
1547 mov $14, %r9d /* byte position left over from less32bytes case */
1549 * Setup %r10 value allows us to detect crossing a page boundary.
1550 * When %r10 goes positive we have crossed a page boundary and
1551 * need to do a nibble.
1554 and $0xfff, %r10 /* offset into 4K page */
1555 sub $0x1000, %r10 /* subtract 4K pagesize */
1557 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main 16-bytes-per-step compare loop for the shift-by-14 case.  The visible
   body contains the same load/palignr/pcmpistri group twice before jumping
   back to the top, i.e. it is unrolled by two.
   NOTE(review): this listing omits lines (the #else/#endif separating the
   case-sensitive and case-insensitive variants, and the instructions that
   set the flags tested by jg/jbe), so those conditions are not described
   here -- confirm against the complete file.  */
1560 LABEL(loop_ashr_14_use):
/* Leaves the loop for the page-boundary check.  */
1562 jg LABEL(nibble_ashr_14_use)
/* Re-entry point used by nibble_ashr_14_use when the loop may continue.  */
1564 LABEL(nibble_ashr_14_restart_use):
1565 movdqa (%rdi, %rdx), %xmm0
/* Combine two adjacent aligned blocks of %rdi: the top 2 bytes of the
   previous block followed by the low 14 bytes of the current one.  */
1566 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1567 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variant: compare directly against memory at (%rsi,%rdx).  */
1568 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variant: load, apply TOLOWER (macro defined outside this
   excerpt) to both operands, then compare registers.  */
1570 movdqa (%rsi,%rdx), %xmm1
1571 TOLOWER (%xmm0, %xmm1)
1572 pcmpistri $0x1a, %xmm1, %xmm0
1575 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1577 jbe LABEL(strcmp_exitz)
/* Second, identical iteration of the unrolled body.  */
1582 jg LABEL(nibble_ashr_14_use)
1584 movdqa (%rdi, %rdx), %xmm0
1585 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1586 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1587 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1589 movdqa (%rsi,%rdx), %xmm1
1590 TOLOWER (%xmm0, %xmm1)
1591 pcmpistri $0x1a, %xmm1, %xmm0
1594 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1596 jbe LABEL(strcmp_exitz)
1599 jmp LABEL(loop_ashr_14_use)
/* Page-boundary ("nibble") check for the shift-by-14 case.  It looks at the
   upper bytes of the 16-byte block at -16(%rdi,%rdx) before deciding whether
   to resume the loop or leave through nibble_ashr_exit_use.
   NOTE(review): this listing omits lines (e.g. the compare instructions that
   set the flags tested by jae/ja below), so the exact branch conditions are
   not documented here -- confirm against the complete file.  */
1602 LABEL(nibble_ashr_14_use):
1604 movdqa -16(%rdi, %rdx), %xmm0
/* Byte-wise right shift by 14: the top 2 bytes of the block move to the low
   lanes and the rest is zero-filled.  D() is a macro defined outside this
   excerpt.  */
1605 psrldq $14, D(%xmm0)
/* %xmm0 is compared with itself; with control byte 0x3a the index returned
   in %ecx is that of the first zero byte in %xmm0 (at most 2 here because of
   the zero fill).  */
1606 pcmpistri $0x3a,%xmm0, %xmm0
1607 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1609 jae LABEL(nibble_ashr_exit_use)
1612 ja LABEL(nibble_ashr_14_restart_use)
1614 jmp LABEL(nibble_ashr_exit_use)
1617 * The following cases will be handled by ashr_15
1618 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1619 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
/* Set-up code for the shift-by-15 case (see the existing header comment).
   NOTE(review): the entry label and some instructions of this sequence are
   not visible in this excerpt; the notes below cover only the visible lines.
   TOLOWER and D() are macros defined outside this excerpt.  */
1624 TOLOWER (%xmm1, %xmm2)
/* Per-byte equality of %xmm1 and %xmm2 (0xff where equal), minus %xmm0
   (computed before this excerpt -- presumably the NUL-byte mask, TODO
   confirm); pmovmskb then gathers the byte sign bits into %r9d.  */
1625 pcmpeqb %xmm1, D(%xmm2)
1626 psubb %xmm0, D(%xmm2)
1627 pmovmskb %xmm2, %r9d
/* The flags tested by this jnz are set by instructions not shown here.  */
1631 jnz LABEL(less32bytes)
1633 movdqa (%rdi), %xmm3
/* Expands to nothing unless USE_AS_STRNCMP or USE_AS_STRNCASECMP_L is
   defined; in those builds it updates the remaining-length counter and may
   branch to strcmp_exitz (see the macro definition near the top of the file).  */
1635 UPDATE_STRNCMP_COUNTER
1637 mov $16, %rcx /* index for loads */
1638 mov $15, %r9d /* byte position left over from less32bytes case */
1640 * Setup %r10 value allows us to detect crossing a page boundary.
1641 * When %r10 goes positive we have crossed a page boundary and
1642 * need to do a nibble.
1645 and $0xfff, %r10 /* offset into 4K page */
1647 sub $0x1000, %r10 /* subtract 4K pagesize */
1649 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main 16-bytes-per-step compare loop for the shift-by-15 case.  The visible
   body contains the same load/palignr/pcmpistri group twice before jumping
   back to the top, i.e. it is unrolled by two.
   NOTE(review): this listing omits lines (the #else/#endif separating the
   case-sensitive and case-insensitive variants, and the instructions that
   set the flags tested by jg/jbe), so those conditions are not described
   here -- confirm against the complete file.  */
1652 LABEL(loop_ashr_15_use):
/* Leaves the loop for the page-boundary check.  */
1654 jg LABEL(nibble_ashr_15_use)
/* Re-entry point used by nibble_ashr_15_use when the loop may continue.  */
1656 LABEL(nibble_ashr_15_restart_use):
1657 movdqa (%rdi, %rdx), %xmm0
/* Combine two adjacent aligned blocks of %rdi: the top byte of the previous
   block followed by the low 15 bytes of the current one.  */
1658 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1659 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive variant: compare directly against memory at (%rsi,%rdx).  */
1660 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Case-insensitive variant: load, apply TOLOWER (macro defined outside this
   excerpt) to both operands, then compare registers.  */
1662 movdqa (%rsi,%rdx), %xmm1
1663 TOLOWER (%xmm0, %xmm1)
1664 pcmpistri $0x1a, %xmm1, %xmm0
1667 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1669 jbe LABEL(strcmp_exitz)
/* Second, identical iteration of the unrolled body.  */
1674 jg LABEL(nibble_ashr_15_use)
1676 movdqa (%rdi, %rdx), %xmm0
1677 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1678 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1679 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1681 movdqa (%rsi,%rdx), %xmm1
1682 TOLOWER (%xmm0, %xmm1)
1683 pcmpistri $0x1a, %xmm1, %xmm0
1686 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1688 jbe LABEL(strcmp_exitz)
1691 jmp LABEL(loop_ashr_15_use)
/* Page-boundary ("nibble") check for the shift-by-15 case.  It looks at the
   top byte of the 16-byte block at -16(%rdi,%rdx) before deciding whether to
   resume the loop or take the exit path.
   NOTE(review): this listing omits lines (e.g. the compare instructions that
   set the flags tested by jae/ja below), so the exact branch conditions are
   not documented here -- confirm against the complete file.  */
1694 LABEL(nibble_ashr_15_use):
1696 movdqa -16(%rdi, %rdx), %xmm0
/* Byte-wise right shift by 15: only the top byte of the block remains, in
   lane 0, with the rest zero-filled.  D() is a macro defined outside this
   excerpt.  */
1697 psrldq $15, D(%xmm0)
/* %xmm0 is compared with itself; with control byte 0x3a the index returned
   in %ecx is that of the first zero byte in %xmm0 (at most 1 here because of
   the zero fill).  */
1698 pcmpistri $0x3a,%xmm0, %xmm0
1699 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1701 jae LABEL(nibble_ashr_exit_use)
/* No explicit jmp follows this ja; the next label in the file is
   nibble_ashr_exit_use, so the not-taken path appears to fall through
   into it.  */
1704 ja LABEL(nibble_ashr_15_restart_use)
/* Shared exit for the page-boundary paths: compare %xmm0 once more against
   the 16 bytes at (%rsi,%rdx), then fetch the byte from each string that
   decides the result.
   NOTE(review): this listing omits lines (the #else/#endif lines, at least
   one label, the instructions that set the flags for jbe, and the final
   instructions that produce the return value), so only the visible steps
   are described -- confirm against the complete file.  */
1706 LABEL(nibble_ashr_exit_use):
1707 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1708 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* Case-insensitive variant: TOLOWER is a macro defined outside this excerpt.  */
1710 movdqa (%rsi,%rdx), %xmm1
1711 TOLOWER (%xmm0, %xmm1)
1712 pcmpistri $0x1a, %xmm1, %xmm0
/* Carry clear after the pcmpistri above means no differing byte was
   reported, so the strings are treated as equal.  */
1716 jnc LABEL(strcmp_exitz)
1717 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1719 jbe LABEL(strcmp_exitz)
/* Rebase %rdi by (%r9 - 16); %r9 is the per-case byte position loaded by the
   set-up code (e.g. "mov $12, %r9d").  */
1722 lea -16(%rdi, %r9), %rdi
/* Load one byte from each string, zero-extended.  */
1723 movzbl (%rdi, %rdx), %eax
1724 movzbl (%rsi, %rdx), %edx
1729 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map both bytes through _nl_C_LC_CTYPE_tolower,
   indexed as 4-byte entries with the base biased by 128 entries.  */
1730 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1731 movl (%rcx,%rdx,4), %edx
1732 movl (%rcx,%rax,4), %eax
/* Tail that locates and loads the first differing byte of each string.
   NOTE(review): the labels for this code are not visible in this excerpt;
   it is presumably the less32bytes target branched to by the ashr_N set-up
   code -- TODO confirm.  The instructions that set the flags for jbe and
   that produce the return value are also not shown.  */
1739 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1740 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1743 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1748 bsf %rdx, %rdx /* find and store bit index in %rdx */
1750 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1752 jbe LABEL(strcmp_exitz)
/* Load the byte at that index from each string, zero-extended.  */
1754 movzbl (%rsi, %rdx), %ecx
1755 movzbl (%rdi, %rdx), %eax
1757 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map both bytes through _nl_C_LC_CTYPE_tolower,
   indexed as 4-byte entries with the base biased by 128 entries.  %rdx is
   reused as the table base, so the bit index is no longer needed here.  */
1758 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1759 movl (%rdx,%rcx,4), %ecx
1760 movl (%rdx,%rax,4), %eax
/* Shared exit label; in this part of the file it is the target of the jbe
   branches in the strncmp-style builds and of the jnc after the final
   pcmpistri.
   NOTE(review): the instructions that follow this label, and the label and
   loads belonging to the code after the "XXX" note, are not visible in this
   excerpt -- confirm against the complete file.  */
1766 LABEL(strcmp_exitz):
1771 // XXX Same as code above
1776 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map %ecx and %eax through _nl_C_LC_CTYPE_tolower,
   indexed as 4-byte entries with the base biased by 128 entries.  */
1777 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1778 movl (%rdx,%rcx,4), %ecx
1779 movl (%rdx,%rax,4), %eax
/* Record the symbol size of STRCMP_SSE42 as the distance from its start to
   this point.  */
1785 .size STRCMP_SSE42, .-STRCMP_SSE42
1792 /* Put all SSE 4.2 functions together. */
1793 .section .rodata.SECTION,"a",@progbits
/* Dispatch table with 16 entries.  Each entry is the offset of an ashr_N
   handler measured from the start of the table, i.e. a label difference
   rather than an absolute address.  Entries run ashr_1 .. ashr_15, with
   ashr_0 in the last slot.  The code that indexes this table is outside this
   excerpt, so the index computation is not described here.  */
1795 LABEL(unaligned_table):
1796 .int LABEL(ashr_1) - LABEL(unaligned_table)
1797 .int LABEL(ashr_2) - LABEL(unaligned_table)
1798 .int LABEL(ashr_3) - LABEL(unaligned_table)
1799 .int LABEL(ashr_4) - LABEL(unaligned_table)
1800 .int LABEL(ashr_5) - LABEL(unaligned_table)
1801 .int LABEL(ashr_6) - LABEL(unaligned_table)
1802 .int LABEL(ashr_7) - LABEL(unaligned_table)
1803 .int LABEL(ashr_8) - LABEL(unaligned_table)
1804 .int LABEL(ashr_9) - LABEL(unaligned_table)
1805 .int LABEL(ashr_10) - LABEL(unaligned_table)
1806 .int LABEL(ashr_11) - LABEL(unaligned_table)
1807 .int LABEL(ashr_12) - LABEL(unaligned_table)
1808 .int LABEL(ashr_13) - LABEL(unaligned_table)
1809 .int LABEL(ashr_14) - LABEL(unaligned_table)
1810 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* ashr_0 occupies the final (16th) slot.  */
1811 .int LABEL(ashr_0) - LABEL(unaligned_table)