2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24 | _SIDD_CMP_EQUAL_EACH
25 | _SIDD_NEGATIVE_POLARITY
26 | _SIDD_LEAST_SIGNIFICANT
27 on pcmpistri to find out if two 16byte data elements are the same
28 and the offset of the first different byte. There are 4 cases:
30 1. Both 16-byte data elements are valid and identical.
31 2. Both 16-byte data elements have EOS and are identical.
32 3. Both 16-byte data elements are valid and they differ at offset X.
33 4. At least one 16-byte data element has EOS at offset X. The two 16-byte
34 data elements must differ at or before offset X.
36 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
38 case ECX CFlag ZFlag SFlag
44 We exit from the loop for cases 2, 3 and 4 with jbe which branches
45 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
48 /* Put all SSE 4.2 functions together. */
49 .section .text.SECTION,"ax",@progbits
51 .type STRCMP_SSE42, @function
/* Entry stub for the variant without an explicit locale argument.  It
   loads the initial-exec TLS offset (@gottpoff) of __libc_tsd_LOCALE into
   %rax and then runs straight into the strcasecmp_l code that follows.
   The instructions that consume %rax are not visible in this listing.  */
52 #ifdef USE_AS_STRCASECMP_L
53 ENTRY (GLABEL(__strcasecmp))
54 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
57 // XXX 5 byte should be before the function
59 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP: nopl 0x0(%rax,%rax,1) */
60 END (GLABEL(__strcasecmp))
61 /* FALLTHROUGH to strcasecmp_l. */
/* Same stub shape for the length-limited variant; it runs into
   strncasecmp_l.  */
63 #ifdef USE_AS_STRNCASECMP_L
64 ENTRY (GLABEL(__strncasecmp))
65 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
68 // XXX 5 byte should be before the function
70 .byte 0x0f,0x1f,0x44,0x00,0x00 /* 5-byte NOP: nopl 0x0(%rax,%rax,1) */
71 END (GLABEL(__strncasecmp))
72 /* FALLTHROUGH to strncasecmp_l. */
/* Remap the legacy SSE mnemonics used in the rest of the file to their
   VEX-encoded (v-prefixed) equivalents, so one source body serves both
   encodings.  The conditional that enables this remapping is not visible
   in this listing.  */
77 # define movdqa vmovdqa
78 # define movdqu vmovdqu
79 # define pmovmskb vpmovmskb
80 # define pcmpistri vpcmpistri
82 # define pcmpeqb vpcmpeqb
83 # define psrldq vpsrldq
84 # define pslldq vpslldq
85 # define palignr vpalignr
/* D(reg) spells the destination twice: the VEX forms are three-operand,
   so "op src, D(dst)" becomes "vop src, dst, dst".  */
87 # define D(arg) arg, arg
97 * This implementation uses SSE to compare up to 16 bytes at a time.
/* Locale gate for the case-insensitive variants.  The locale pointer is
   read from %rdx for strcasecmp_l and from %rcx for strncasecmp_l (the
   third and fourth SysV integer argument registers respectively).  If
   bit 0 of the locale's _NL_CTYPE_NONASCII_CASE value is set, control
   leaves for the *_nonascii routine instead of the SSE path.  */
99 #ifdef USE_AS_STRCASECMP_L
100 /* We have to fall back on the C implementation for locales
101 with encodings not matching ASCII for single bytes. */
102 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
103 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
107 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
108 jne __strcasecmp_l_nonascii
110 #ifdef USE_AS_STRNCASECMP_L
111 /* We have to fall back on the C implementation for locales
112 with encodings not matching ASCII for single bytes. */
113 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
114 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax
118 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
119 jne __strncasecmp_l_nonascii
/* Common prologue.  For the length-limited variants there is an early
   exit to strcmp_exitz; the test that sets ZF for it is not visible in
   this listing (presumably a zero-length check).  */
122 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
124 je LABEL(strcmp_exitz)
131 /* Use 64bit AND here to avoid long NOP padding. */
132 and $0x3f, %rcx /* rsi alignment in cache line */
133 and $0x3f, %rax /* rdi alignment in cache line */
134 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
135 .section .rodata.cst16,"aM",@progbits,16
/* 16-byte constants for TOLOWER.  Their labels are not visible in this
   listing; the loads below name them belowupper, topupper and
   touppermask.  0x40 is one below 'A' in every byte.  */
138 .quad 0x4040404040404040
139 .quad 0x4040404040404040
/* Two alternative upper bounds: 0x5a is 'Z', 0x5b is one above 'Z'.  The
   conditional that selects one of them is not visible in this listing.  */
142 .quad 0x5a5a5a5a5a5a5a5a
143 .quad 0x5a5a5a5a5a5a5a5a
145 .quad 0x5b5b5b5b5b5b5b5b
146 .quad 0x5b5b5b5b5b5b5b5b
/* 0x20 in every byte: the ASCII bit that separates upper from lower case.  */
149 .quad 0x2020202020202020
150 .quad 0x2020202020202020
/* Keep the three constants resident in %xmm4-%xmm6 under symbolic names.  */
152 movdqa LABEL(belowupper)(%rip), %xmm4
153 # define UCLOW_reg %xmm4
154 movdqa LABEL(topupper)(%rip), %xmm5
155 # define UCHIGH_reg %xmm5
156 movdqa LABEL(touppermask)(%rip), %xmm6
157 # define LCQWORD_reg %xmm6
160 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
162 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
165 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER(reg1, reg2), VEX form: lower-case the ASCII upper-case letters
   in both 16-byte registers, in place.
   For each register: %xmm7/%xmm9 = (byte > UCLOW), %xmm8/%xmm10 =
   (byte > UCHIGH); vpandn keeps the bytes that are above UCLOW but not
   above UCHIGH, vpand reduces that mask to LCQWORD (0x20) and vpor sets
   the bit in the source register.  The byte compares are signed, so
   bytes of 0x80 and above never match.
   Clobbers %xmm7, %xmm8, %xmm9 and %xmm10.  */
167 # define TOLOWER(reg1, reg2) \
168 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
169 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
170 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
171 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
172 vpandn %xmm7, %xmm8, %xmm8; \
173 vpandn %xmm9, %xmm10, %xmm10; \
174 vpand LCQWORD_reg, %xmm8, %xmm8; \
175 vpand LCQWORD_reg, %xmm10, %xmm10; \
176 vpor reg1, %xmm8, reg1; \
177 vpor reg2, %xmm10, reg2
/* TOLOWER(reg1, reg2), legacy two-operand SSE form.  The inputs are
   copied to %xmm7/%xmm9 and UCHIGH to %xmm8/%xmm10 because pcmpgtb
   overwrites its destination: %xmm7/%xmm9 = (byte > UCLOW),
   %xmm8/%xmm10 = (UCHIGH > byte).  The masks are then combined and
   reduced to LCQWORD.  Part of the macro body, including the final merge
   into reg1/reg2, is not visible in this listing.
   Clobbers %xmm7, %xmm8, %xmm9 and %xmm10.  */
179 # define TOLOWER(reg1, reg2) \
180 movdqa reg1, %xmm7; \
181 movdqa UCHIGH_reg, %xmm8; \
182 movdqa reg2, %xmm9; \
183 movdqa UCHIGH_reg, %xmm10; \
184 pcmpgtb UCLOW_reg, %xmm7; \
185 pcmpgtb reg1, %xmm8; \
186 pcmpgtb UCLOW_reg, %xmm9; \
187 pcmpgtb reg2, %xmm10; \
189 pand %xmm10, %xmm9; \
190 pand LCQWORD_reg, %xmm7; \
191 pand LCQWORD_reg, %xmm9; \
195 TOLOWER (%xmm1, %xmm2)
/* Case-sensitive builds: TOLOWER expands to nothing, so the shared code
   below can invoke it unconditionally.  */
197 # define TOLOWER(reg1, reg2)
/* Compare the first 16 bytes held in %xmm1 and %xmm2.  After the two
   pcmpeqb and the psubb, a byte of %xmm1 has its top bit set only where
   the two inputs are equal and the byte is not NUL, so a byte mask of
   0xffff means "16 equal, non-NUL bytes".  The instruction that moves
   that mask into %edx is not visible in this listing.  */
199 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
200 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
201 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
202 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
204 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
205 jnz LABEL(less16bytes)/* If not, find different value or null char */
206 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
208 jbe LABEL(strcmp_exitz)/* finish comparison */
210 add $16, %rsi /* prepare to search next 16 bytes */
211 add $16, %rdi /* prepare to search next 16 bytes */
214 * Determine source and destination string offsets from 16-byte
215 * alignment. Use relative offset difference between the two to
216 * determine which case below to use.
/* Cross-cache-line path (its label is not visible in this listing).
   Both pointers are rounded down to a 16-byte boundary and their
   original low four bits are kept in %ecx / %eax.  Equal offsets go to
   ashr_0; otherwise the case is chosen through unaligned_table, which
   holds 32-bit offsets relative to the table's own address.  The
   computation of the table index in %r9 is not visible here.  */
220 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
221 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
222 mov $0xffff, %edx /* for equivalent offset */
224 and $0xf, %ecx /* offset of rsi */
225 and $0xf, %eax /* offset of rdi */
226 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
228 je LABEL(ashr_0) /* rsi and rdi relative offset same */
230 mov %edx, %r8d /* r8d is offset flag for exit tail */
238 lea LABEL(unaligned_table)(%rip), %r10
239 movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit table entry */
240 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
241 lea (%r10, %r9), %r10 /* table base + entry = target address */
242 jmp *%r10 /* jump to corresponding case */
245 * The following cases will be handled by ashr_0
246 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
247 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
/* ashr_0 (its label is not visible in this listing): both strings have
   the same offset inside a 16-byte block, so no byte shifting is needed.
   The first, partially valid block is checked with pcmpeqb and a byte
   mask shifted right by %cl; the main loop then uses pcmpistri.  */
253 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
254 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
255 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
258 TOLOWER (%xmm1, %xmm2)
259 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
261 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
263 shr %cl, %edx /* adjust 0xffff for offset */
264 shr %cl, %r9d /* adjust for 16-byte offset */
267 * edx must be the same with r9d if in left byte (16-rcx) is equal to
268 * the start from (16-rax) and no null char was seen.
270 jne LABEL(less32bytes) /* mismatch or null char */
271 UPDATE_STRNCMP_COUNTER
276 * Now both strings are aligned at 16-byte boundary. Loop over strings
277 * checking 32-bytes per iteration.
279 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* First half of the two-step loop body.  pcmpistri $0x1a does a
   negated equal-each compare and leaves the index of the first differing
   byte in %ecx.  */
282 movdqa (%rdi,%rdx), %xmm0
283 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
284 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
286 movdqa (%rsi,%rdx), %xmm1
287 TOLOWER (%xmm0, %xmm1)
288 pcmpistri $0x1a, %xmm1, %xmm0
291 jbe LABEL(ashr_0_exit_use) /* CF or ZF set: mismatch or end of string */
292 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
294 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second half: the same step repeated before looping back.  */
297 movdqa (%rdi,%rdx), %xmm0
298 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
299 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
301 movdqa (%rsi,%rdx), %xmm1
302 TOLOWER (%xmm0, %xmm1)
303 pcmpistri $0x1a, %xmm1, %xmm0
306 jbe LABEL(ashr_0_exit_use) /* CF or ZF set: mismatch or end of string */
307 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
309 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
311 jmp LABEL(ashr_0_use)
/* Exit from the ashr_0 loop after pcmpistri reported CF or ZF.  With CF
   clear no differing byte was found, so the result is "equal".
   Otherwise %rcx (index inside the block) and %rdx (running offset) are
   combined into the index of the differing byte and both bytes are
   loaded, zero-extended, into %eax and %edx.  The case-insensitive
   builds map each byte through a table of 4-byte entries.  The final
   subtraction and return are not visible in this listing.  */
315 LABEL(ashr_0_exit_use):
316 jnc LABEL(strcmp_exitz) /* CF == 0: no difference found */
317 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
319 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
321 lea -16(%rdx, %rcx), %rcx /* rcx = rdx + rcx - 16 */
322 movzbl (%rdi, %rcx), %eax
323 movzbl (%rsi, %rcx), %edx
324 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
325 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table base, biased by 128 entries */
326 movl (%rcx,%rax,4), %eax
327 movl (%rcx,%rdx,4), %edx
335 * The following cases will be handled by ashr_1
336 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
337 * n(15) n -15 0(15 +(n-15) - n) ashr_1
/* ashr_1: the %rdi data is re-aligned by 1 byte with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (mask extraction, flag-setting compares, counter updates)
   are not visible in this listing.  */
341 pslldq $15, D(%xmm2) /* shift first string to align with second */
342 TOLOWER (%xmm1, %xmm2)
343 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
344 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
346 shr %cl, %edx /* adjust 0xffff for offset */
347 shr %cl, %r9d /* adjust for 16-byte offset */
349 jnz LABEL(less32bytes) /* mismatch or null char seen */
351 UPDATE_STRNCMP_COUNTER
353 mov $16, %rcx /* index for loads*/
354 mov $1, %r9d /* byte position left over from less32bytes case */
356 * Setup %r10 value allows us to detect crossing a page boundary.
357 * When %r10 goes positive we have crossed a page boundary and
358 * need to do a nibble.
361 and $0xfff, %r10 /* offset into 4K page */
362 sub $0x1000, %r10 /* subtract 4K pagesize */
363 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
366 LABEL(loop_ashr_1_use):
368 jg LABEL(nibble_ashr_1_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
370 LABEL(nibble_ashr_1_restart_use):
371 movdqa (%rdi, %rdx), %xmm0
372 palignr $1, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 1..15, then this block byte 0 */
373 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
374 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
376 movdqa (%rsi,%rdx), %xmm1
377 TOLOWER (%xmm0, %xmm1)
378 pcmpistri $0x1a, %xmm1, %xmm0
381 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
383 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
388 jg LABEL(nibble_ashr_1_use)
390 movdqa (%rdi, %rdx), %xmm0
391 palignr $1, -16(%rdi, %rdx), D(%xmm0)
392 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
393 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
395 movdqa (%rsi,%rdx), %xmm1
396 TOLOWER (%xmm0, %xmm1)
397 pcmpistri $0x1a, %xmm1, %xmm0
400 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
402 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
405 jmp LABEL(loop_ashr_1_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
408 LABEL(nibble_ashr_1_use):
410 movdqa -16(%rdi, %rdx), %xmm0
412 pcmpistri $0x3a,%xmm0, %xmm0
413 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
415 jae LABEL(nibble_ashr_exit_use)
418 ja LABEL(nibble_ashr_1_restart_use) /* resume the loop at the restart label */
420 jmp LABEL(nibble_ashr_exit_use)
423 * The following cases will be handled by ashr_2
424 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
425 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
/* ashr_2: the %rdi data is re-aligned by 2 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
430 TOLOWER (%xmm1, %xmm2)
431 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
432 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
437 jnz LABEL(less32bytes) /* mismatch or null char seen */
439 UPDATE_STRNCMP_COUNTER
441 mov $16, %rcx /* index for loads */
442 mov $2, %r9d /* byte position left over from less32bytes case */
444 * Setup %r10 value allows us to detect crossing a page boundary.
445 * When %r10 goes positive we have crossed a page boundary and
446 * need to do a nibble.
449 and $0xfff, %r10 /* offset into 4K page */
450 sub $0x1000, %r10 /* subtract 4K pagesize */
451 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
454 LABEL(loop_ashr_2_use):
456 jg LABEL(nibble_ashr_2_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
458 LABEL(nibble_ashr_2_restart_use):
459 movdqa (%rdi, %rdx), %xmm0
460 palignr $2, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 2..15, then this block bytes 0..1 */
461 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
462 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
464 movdqa (%rsi,%rdx), %xmm1
465 TOLOWER (%xmm0, %xmm1)
466 pcmpistri $0x1a, %xmm1, %xmm0
469 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
471 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
476 jg LABEL(nibble_ashr_2_use)
478 movdqa (%rdi, %rdx), %xmm0
479 palignr $2, -16(%rdi, %rdx), D(%xmm0)
480 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
481 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
483 movdqa (%rsi,%rdx), %xmm1
484 TOLOWER (%xmm0, %xmm1)
485 pcmpistri $0x1a, %xmm1, %xmm0
488 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
490 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
493 jmp LABEL(loop_ashr_2_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
496 LABEL(nibble_ashr_2_use):
498 movdqa -16(%rdi, %rdx), %xmm0
500 pcmpistri $0x3a,%xmm0, %xmm0
501 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
503 jae LABEL(nibble_ashr_exit_use)
506 ja LABEL(nibble_ashr_2_restart_use) /* resume the loop at the restart label */
508 jmp LABEL(nibble_ashr_exit_use)
511 * The following cases will be handled by ashr_3
512 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
513 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
/* ashr_3: the %rdi data is re-aligned by 3 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
518 TOLOWER (%xmm1, %xmm2)
519 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
520 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
525 jnz LABEL(less32bytes) /* mismatch or null char seen */
528 UPDATE_STRNCMP_COUNTER
530 mov $16, %rcx /* index for loads */
531 mov $3, %r9d /* byte position left over from less32bytes case */
533 * Setup %r10 value allows us to detect crossing a page boundary.
534 * When %r10 goes positive we have crossed a page boundary and
535 * need to do a nibble.
538 and $0xfff, %r10 /* offset into 4K page */
539 sub $0x1000, %r10 /* subtract 4K pagesize */
540 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
542 LABEL(loop_ashr_3_use):
544 jg LABEL(nibble_ashr_3_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
546 LABEL(nibble_ashr_3_restart_use):
547 movdqa (%rdi, %rdx), %xmm0
548 palignr $3, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 3..15, then this block bytes 0..2 */
549 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
550 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
552 movdqa (%rsi,%rdx), %xmm1
553 TOLOWER (%xmm0, %xmm1)
554 pcmpistri $0x1a, %xmm1, %xmm0
557 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
559 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
564 jg LABEL(nibble_ashr_3_use)
566 movdqa (%rdi, %rdx), %xmm0
567 palignr $3, -16(%rdi, %rdx), D(%xmm0)
568 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
569 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
571 movdqa (%rsi,%rdx), %xmm1
572 TOLOWER (%xmm0, %xmm1)
573 pcmpistri $0x1a, %xmm1, %xmm0
576 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
578 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
581 jmp LABEL(loop_ashr_3_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
584 LABEL(nibble_ashr_3_use):
586 movdqa -16(%rdi, %rdx), %xmm0
588 pcmpistri $0x3a,%xmm0, %xmm0
589 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
591 jae LABEL(nibble_ashr_exit_use)
594 ja LABEL(nibble_ashr_3_restart_use) /* resume the loop at the restart label */
596 jmp LABEL(nibble_ashr_exit_use)
599 * The following cases will be handled by ashr_4
600 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
601 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
/* ashr_4: the %rdi data is re-aligned by 4 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
606 TOLOWER (%xmm1, %xmm2)
607 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
608 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
613 jnz LABEL(less32bytes) /* mismatch or null char seen */
616 UPDATE_STRNCMP_COUNTER
618 mov $16, %rcx /* index for loads */
619 mov $4, %r9d /* byte position left over from less32bytes case */
621 * Setup %r10 value allows us to detect crossing a page boundary.
622 * When %r10 goes positive we have crossed a page boundary and
623 * need to do a nibble.
626 and $0xfff, %r10 /* offset into 4K page */
627 sub $0x1000, %r10 /* subtract 4K pagesize */
628 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
631 LABEL(loop_ashr_4_use):
633 jg LABEL(nibble_ashr_4_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
635 LABEL(nibble_ashr_4_restart_use):
636 movdqa (%rdi, %rdx), %xmm0
637 palignr $4, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 4..15, then this block bytes 0..3 */
638 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
639 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
641 movdqa (%rsi,%rdx), %xmm1
642 TOLOWER (%xmm0, %xmm1)
643 pcmpistri $0x1a, %xmm1, %xmm0
646 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
648 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
653 jg LABEL(nibble_ashr_4_use)
655 movdqa (%rdi, %rdx), %xmm0
656 palignr $4, -16(%rdi, %rdx), D(%xmm0)
657 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
658 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
660 movdqa (%rsi,%rdx), %xmm1
661 TOLOWER (%xmm0, %xmm1)
662 pcmpistri $0x1a, %xmm1, %xmm0
665 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
667 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
670 jmp LABEL(loop_ashr_4_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
673 LABEL(nibble_ashr_4_use):
675 movdqa -16(%rdi, %rdx), %xmm0
677 pcmpistri $0x3a,%xmm0, %xmm0
678 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
680 jae LABEL(nibble_ashr_exit_use)
683 ja LABEL(nibble_ashr_4_restart_use) /* resume the loop at the restart label */
685 jmp LABEL(nibble_ashr_exit_use)
688 * The following cases will be handled by ashr_5
689 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
690 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
/* ashr_5: the %rdi data is re-aligned by 5 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
695 TOLOWER (%xmm1, %xmm2)
696 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
697 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
702 jnz LABEL(less32bytes) /* mismatch or null char seen */
705 UPDATE_STRNCMP_COUNTER
707 mov $16, %rcx /* index for loads */
708 mov $5, %r9d /* byte position left over from less32bytes case */
710 * Setup %r10 value allows us to detect crossing a page boundary.
711 * When %r10 goes positive we have crossed a page boundary and
712 * need to do a nibble.
715 and $0xfff, %r10 /* offset into 4K page */
716 sub $0x1000, %r10 /* subtract 4K pagesize */
717 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
720 LABEL(loop_ashr_5_use):
722 jg LABEL(nibble_ashr_5_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
724 LABEL(nibble_ashr_5_restart_use):
725 movdqa (%rdi, %rdx), %xmm0
726 palignr $5, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 5..15, then this block bytes 0..4 */
727 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
728 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
730 movdqa (%rsi,%rdx), %xmm1
731 TOLOWER (%xmm0, %xmm1)
732 pcmpistri $0x1a, %xmm1, %xmm0
735 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
737 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
742 jg LABEL(nibble_ashr_5_use)
744 movdqa (%rdi, %rdx), %xmm0
746 palignr $5, -16(%rdi, %rdx), D(%xmm0)
747 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
748 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
750 movdqa (%rsi,%rdx), %xmm1
751 TOLOWER (%xmm0, %xmm1)
752 pcmpistri $0x1a, %xmm1, %xmm0
755 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
757 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
760 jmp LABEL(loop_ashr_5_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
763 LABEL(nibble_ashr_5_use):
765 movdqa -16(%rdi, %rdx), %xmm0
767 pcmpistri $0x3a,%xmm0, %xmm0
768 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
770 jae LABEL(nibble_ashr_exit_use)
773 ja LABEL(nibble_ashr_5_restart_use) /* resume the loop at the restart label */
775 jmp LABEL(nibble_ashr_exit_use)
778 * The following cases will be handled by ashr_6
779 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
780 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
/* ashr_6: the %rdi data is re-aligned by 6 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
785 TOLOWER (%xmm1, %xmm2)
786 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
787 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
792 jnz LABEL(less32bytes) /* mismatch or null char seen */
795 UPDATE_STRNCMP_COUNTER
797 mov $16, %rcx /* index for loads */
798 mov $6, %r9d /* byte position left over from less32bytes case */
800 * Setup %r10 value allows us to detect crossing a page boundary.
801 * When %r10 goes positive we have crossed a page boundary and
802 * need to do a nibble.
805 and $0xfff, %r10 /* offset into 4K page */
806 sub $0x1000, %r10 /* subtract 4K pagesize */
807 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
810 LABEL(loop_ashr_6_use):
812 jg LABEL(nibble_ashr_6_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
814 LABEL(nibble_ashr_6_restart_use):
815 movdqa (%rdi, %rdx), %xmm0
816 palignr $6, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 6..15, then this block bytes 0..5 */
817 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
818 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
820 movdqa (%rsi,%rdx), %xmm1
821 TOLOWER (%xmm0, %xmm1)
822 pcmpistri $0x1a, %xmm1, %xmm0
825 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
827 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
832 jg LABEL(nibble_ashr_6_use)
834 movdqa (%rdi, %rdx), %xmm0
835 palignr $6, -16(%rdi, %rdx), D(%xmm0)
836 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
837 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
839 movdqa (%rsi,%rdx), %xmm1
840 TOLOWER (%xmm0, %xmm1)
841 pcmpistri $0x1a, %xmm1, %xmm0
844 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
846 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
849 jmp LABEL(loop_ashr_6_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
852 LABEL(nibble_ashr_6_use):
854 movdqa -16(%rdi, %rdx), %xmm0
856 pcmpistri $0x3a,%xmm0, %xmm0
857 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
859 jae LABEL(nibble_ashr_exit_use)
862 ja LABEL(nibble_ashr_6_restart_use) /* resume the loop at the restart label */
864 jmp LABEL(nibble_ashr_exit_use)
867 * The following cases will be handled by ashr_7
868 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
869 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
/* ashr_7: the %rdi data is re-aligned by 7 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
874 TOLOWER (%xmm1, %xmm2)
875 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
876 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
881 jnz LABEL(less32bytes) /* mismatch or null char seen */
884 UPDATE_STRNCMP_COUNTER
886 mov $16, %rcx /* index for loads */
887 mov $7, %r9d /* byte position left over from less32bytes case */
889 * Setup %r10 value allows us to detect crossing a page boundary.
890 * When %r10 goes positive we have crossed a page boundary and
891 * need to do a nibble.
894 and $0xfff, %r10 /* offset into 4K page */
895 sub $0x1000, %r10 /* subtract 4K pagesize */
896 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
899 LABEL(loop_ashr_7_use):
901 jg LABEL(nibble_ashr_7_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
903 LABEL(nibble_ashr_7_restart_use):
904 movdqa (%rdi, %rdx), %xmm0
905 palignr $7, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 7..15, then this block bytes 0..6 */
906 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
907 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
909 movdqa (%rsi,%rdx), %xmm1
910 TOLOWER (%xmm0, %xmm1)
911 pcmpistri $0x1a, %xmm1, %xmm0
914 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
916 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
921 jg LABEL(nibble_ashr_7_use)
923 movdqa (%rdi, %rdx), %xmm0
924 palignr $7, -16(%rdi, %rdx), D(%xmm0)
925 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
926 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
928 movdqa (%rsi,%rdx), %xmm1
929 TOLOWER (%xmm0, %xmm1)
930 pcmpistri $0x1a, %xmm1, %xmm0
933 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
935 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
938 jmp LABEL(loop_ashr_7_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
941 LABEL(nibble_ashr_7_use):
943 movdqa -16(%rdi, %rdx), %xmm0
945 pcmpistri $0x3a,%xmm0, %xmm0
946 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
948 jae LABEL(nibble_ashr_exit_use)
951 ja LABEL(nibble_ashr_7_restart_use) /* resume the loop at the restart label */
953 jmp LABEL(nibble_ashr_exit_use)
956 * The following cases will be handled by ashr_8
957 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
958 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
/* ashr_8: the %rdi data is re-aligned by 8 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, mask extraction, flag-setting compares,
   counter updates) are not visible in this listing.  */
963 TOLOWER (%xmm1, %xmm2)
964 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
965 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
970 jnz LABEL(less32bytes) /* mismatch or null char seen */
973 UPDATE_STRNCMP_COUNTER
975 mov $16, %rcx /* index for loads */
976 mov $8, %r9d /* byte position left over from less32bytes case */
978 * Setup %r10 value allows us to detect crossing a page boundary.
979 * When %r10 goes positive we have crossed a page boundary and
980 * need to do a nibble.
983 and $0xfff, %r10 /* offset into 4K page */
984 sub $0x1000, %r10 /* subtract 4K pagesize */
985 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
988 LABEL(loop_ashr_8_use):
990 jg LABEL(nibble_ashr_8_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
992 LABEL(nibble_ashr_8_restart_use):
993 movdqa (%rdi, %rdx), %xmm0
994 palignr $8, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 8..15, then this block bytes 0..7 */
995 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
996 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
998 movdqa (%rsi,%rdx), %xmm1
999 TOLOWER (%xmm0, %xmm1)
1000 pcmpistri $0x1a, %xmm1, %xmm0
1003 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1005 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
1010 jg LABEL(nibble_ashr_8_use)
1012 movdqa (%rdi, %rdx), %xmm0
1013 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1014 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1015 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1017 movdqa (%rsi,%rdx), %xmm1
1018 TOLOWER (%xmm0, %xmm1)
1019 pcmpistri $0x1a, %xmm1, %xmm0
1022 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1024 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
1027 jmp LABEL(loop_ashr_8_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
1030 LABEL(nibble_ashr_8_use):
1032 movdqa -16(%rdi, %rdx), %xmm0
1034 pcmpistri $0x3a,%xmm0, %xmm0
1035 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1037 jae LABEL(nibble_ashr_exit_use)
1040 ja LABEL(nibble_ashr_8_restart_use) /* resume the loop at the restart label */
1042 jmp LABEL(nibble_ashr_exit_use)
1045 * The following cases will be handled by ashr_9
1046 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1047 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
/* ashr_9: the %rdi data is re-aligned by 9 bytes with palignr so that it
   lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, flag-setting compares, counter updates)
   are not visible in this listing.  */
1052 TOLOWER (%xmm1, %xmm2)
1053 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
1054 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
1055 pmovmskb %xmm2, %r9d /* one bit per byte: set where equal and not NUL */
1059 jnz LABEL(less32bytes) /* mismatch or null char seen */
1060 movdqa (%rdi), %xmm3
1062 UPDATE_STRNCMP_COUNTER
1064 mov $16, %rcx /* index for loads */
1065 mov $9, %r9d /* byte position left over from less32bytes case */
1067 * Setup %r10 value allows us to detect crossing a page boundary.
1068 * When %r10 goes positive we have crossed a page boundary and
1069 * need to do a nibble.
1072 and $0xfff, %r10 /* offset into 4K page */
1073 sub $0x1000, %r10 /* subtract 4K pagesize */
1074 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
1077 LABEL(loop_ashr_9_use):
1079 jg LABEL(nibble_ashr_9_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
1081 LABEL(nibble_ashr_9_restart_use):
1082 movdqa (%rdi, %rdx), %xmm0
1084 palignr $9, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 9..15, then this block bytes 0..8 */
1085 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1086 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1088 movdqa (%rsi,%rdx), %xmm1
1089 TOLOWER (%xmm0, %xmm1)
1090 pcmpistri $0x1a, %xmm1, %xmm0
1093 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1095 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
1100 jg LABEL(nibble_ashr_9_use)
1102 movdqa (%rdi, %rdx), %xmm0
1103 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1104 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1105 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1107 movdqa (%rsi,%rdx), %xmm1
1108 TOLOWER (%xmm0, %xmm1)
1109 pcmpistri $0x1a, %xmm1, %xmm0
1112 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1114 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
1117 jmp LABEL(loop_ashr_9_use)
/* Page-boundary handling: re-examine the previous %rdi block before
   reading further.  pcmpistri compares the register with itself, so only
   the position of a NUL byte influences the result.  The cmp
   instructions feeding the jae/ja below are not visible in this listing.  */
1120 LABEL(nibble_ashr_9_use):
1122 movdqa -16(%rdi, %rdx), %xmm0
1124 pcmpistri $0x3a,%xmm0, %xmm0
1125 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1127 jae LABEL(nibble_ashr_exit_use)
1130 ja LABEL(nibble_ashr_9_restart_use) /* resume the loop at the restart label */
1132 jmp LABEL(nibble_ashr_exit_use)
1135 * The following cases will be handled by ashr_10
1136 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1137 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
/* ashr_10: the %rdi data is re-aligned by 10 bytes with palignr so that
   it lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, flag-setting compares, counter updates)
   are not visible in this listing.  */
1142 TOLOWER (%xmm1, %xmm2)
1143 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
1144 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
1145 pmovmskb %xmm2, %r9d /* one bit per byte: set where equal and not NUL */
1149 jnz LABEL(less32bytes) /* mismatch or null char seen */
1150 movdqa (%rdi), %xmm3
1152 UPDATE_STRNCMP_COUNTER
1154 mov $16, %rcx /* index for loads */
1155 mov $10, %r9d /* byte position left over from less32bytes case */
1157 * Setup %r10 value allows us to detect crossing a page boundary.
1158 * When %r10 goes positive we have crossed a page boundary and
1159 * need to do a nibble.
1162 and $0xfff, %r10 /* offset into 4K page */
1163 sub $0x1000, %r10 /* subtract 4K pagesize */
1164 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
1167 LABEL(loop_ashr_10_use):
1169 jg LABEL(nibble_ashr_10_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
1171 LABEL(nibble_ashr_10_restart_use):
1172 movdqa (%rdi, %rdx), %xmm0
1173 palignr $10, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 10..15, then this block bytes 0..9 */
1174 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1175 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1177 movdqa (%rsi,%rdx), %xmm1
1178 TOLOWER (%xmm0, %xmm1)
1179 pcmpistri $0x1a, %xmm1, %xmm0
1182 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1184 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
1189 jg LABEL(nibble_ashr_10_use)
1191 movdqa (%rdi, %rdx), %xmm0
1192 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1193 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1194 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1196 movdqa (%rsi,%rdx), %xmm1
1197 TOLOWER (%xmm0, %xmm1)
1198 pcmpistri $0x1a, %xmm1, %xmm0
1201 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1203 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
1206 jmp LABEL(loop_ashr_10_use)
/* Page-boundary handling: re-examine the not-yet-compared tail of the
   previous %rdi block before reading further.  pcmpistri compares the
   register with itself, so only the position of a NUL byte influences
   the result.  The cmp instructions feeding the jae/ja below are not
   visible in this listing.  */
1209 LABEL(nibble_ashr_10_use):
1211 movdqa -16(%rdi, %rdx), %xmm0
1212 psrldq $10, D(%xmm0) /* drop the 10 bytes already consumed */
1213 pcmpistri $0x3a,%xmm0, %xmm0
1214 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1216 jae LABEL(nibble_ashr_exit_use)
1219 ja LABEL(nibble_ashr_10_restart_use) /* resume the loop at the restart label */
1221 jmp LABEL(nibble_ashr_exit_use)
1224 * The following cases will be handled by ashr_11
1225 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1226 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
/* ashr_11: the %rdi data is re-aligned by 11 bytes with palignr so that
   it lines up with the %rsi blocks.  The case label and several
   instructions (initial shift, flag-setting compares, counter updates)
   are not visible in this listing.  */
1231 TOLOWER (%xmm1, %xmm2)
1232 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
1233 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
1234 pmovmskb %xmm2, %r9d /* one bit per byte: set where equal and not NUL */
1238 jnz LABEL(less32bytes) /* mismatch or null char seen */
1239 movdqa (%rdi), %xmm3
1241 UPDATE_STRNCMP_COUNTER
1243 mov $16, %rcx /* index for loads */
1244 mov $11, %r9d /* byte position left over from less32bytes case */
1246 * Setup %r10 value allows us to detect crossing a page boundary.
1247 * When %r10 goes positive we have crossed a page boundary and
1248 * need to do a nibble.
1251 and $0xfff, %r10 /* offset into 4K page */
1252 sub $0x1000, %r10 /* subtract 4K pagesize */
1253 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main loop: two identical steps per iteration.  */
1256 LABEL(loop_ashr_11_use):
1258 jg LABEL(nibble_ashr_11_use) /* %r10 positive: page boundary ahead; the add that sets the flags is not shown */
1260 LABEL(nibble_ashr_11_restart_use):
1261 movdqa (%rdi, %rdx), %xmm0
1262 palignr $11, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = previous block bytes 11..15, then this block bytes 0..10 */
1263 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1264 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1266 movdqa (%rsi,%rdx), %xmm1
1267 TOLOWER (%xmm0, %xmm1)
1268 pcmpistri $0x1a, %xmm1, %xmm0
1271 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1273 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
/* Second step of the iteration.  */
1278 jg LABEL(nibble_ashr_11_use)
1280 movdqa (%rdi, %rdx), %xmm0
1281 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1282 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1283 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1285 movdqa (%rsi,%rdx), %xmm1
1286 TOLOWER (%xmm0, %xmm1)
1287 pcmpistri $0x1a, %xmm1, %xmm0
1290 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1292 jbe LABEL(strcmp_exitz) /* flags come from a counter update not shown here */
1295 jmp LABEL(loop_ashr_11_use)
/* Page-boundary handling: re-examine the not-yet-compared tail of the
   previous %rdi block before reading further.  pcmpistri compares the
   register with itself, so only the position of a NUL byte influences
   the result.  The cmp instructions feeding the jae/ja below are not
   visible in this listing.  */
1298 LABEL(nibble_ashr_11_use):
1300 movdqa -16(%rdi, %rdx), %xmm0
1301 psrldq $11, D(%xmm0) /* drop the 11 bytes already consumed */
1302 pcmpistri $0x3a,%xmm0, %xmm0
1303 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1305 jae LABEL(nibble_ashr_exit_use)
1308 ja LABEL(nibble_ashr_11_restart_use) /* resume the loop at the restart label */
1310 jmp LABEL(nibble_ashr_exit_use)
1313 * The following cases will be handled by ashr_12
1314 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1315 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1320 TOLOWER (%xmm1, %xmm2)
1321 pcmpeqb %xmm1, D(%xmm2)
1322 psubb %xmm0, D(%xmm2)
1323 pmovmskb %xmm2, %r9d
1327 jnz LABEL(less32bytes)
1328 movdqa (%rdi), %xmm3
1330 UPDATE_STRNCMP_COUNTER
1332 mov $16, %rcx /* index for loads */
1333 mov $12, %r9d /* byte position left over from less32bytes case */
1335 * Setup %r10 value allows us to detect crossing a page boundary.
1336 * When %r10 goes positive we have crossed a page boundary and
1337 * need to do a nibble.
1340 and $0xfff, %r10 /* offset into 4K page */
1341 sub $0x1000, %r10 /* subtract 4K pagesize */
1342 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the ashr_12 case.  The 16-byte compare step is
   written out twice before the jump back to the top.  The index/counter
   updates and the #else/#endif lines of the conditionals are not present
   in this excerpt.  */
1345 LABEL(loop_ashr_12_use):
/* Divert to the nibble path on a signed "greater" result; the instruction
   that sets the flags is not in this excerpt.  */
1347 jg LABEL(nibble_ashr_12_use)
/* Re-entry point used by nibble_ashr_12_use when it decides to continue.  */
1349 LABEL(nibble_ashr_12_restart_use):
/* Build 16 consecutive bytes of the %rdi string: bytes 12..15 of the
   previous aligned block followed by bytes 0..11 of the current one.  */
1350 movdqa (%rdi, %rdx), %xmm0
1351 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1352 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare directly against the %rsi block.  */
1353 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative: load the %rsi block, run
   TOLOWER (defined elsewhere) on both operands, then compare.  */
1355 movdqa (%rsi,%rdx), %xmm1
1356 TOLOWER (%xmm0, %xmm1)
1357 pcmpistri $0x1a, %xmm1, %xmm0
1360 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1362 jbe LABEL(strcmp_exitz)
/* Second copy of the step: same nibble check, load, align and compare.  */
1367 jg LABEL(nibble_ashr_12_use)
1369 movdqa (%rdi, %rdx), %xmm0
1370 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1371 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1372 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1374 movdqa (%rsi,%rdx), %xmm1
1375 TOLOWER (%xmm0, %xmm1)
1376 pcmpistri $0x1a, %xmm1, %xmm0
1379 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1381 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of steps.  */
1384 jmp LABEL(loop_ashr_12_use)
/* Nibble path of the ashr_12 loop, entered through the jg branches in
   loop_ashr_12_use.  It looks only at bytes of the previous 16-byte block
   of the %rdi string.  */
1387 LABEL(nibble_ashr_12_use):
/* Aligned reload of the previous 16-byte block.  */
1389 movdqa -16(%rdi, %rdx), %xmm0
/* Byte shift right by 12: the block's last 4 bytes move to the low end
   and the rest of %xmm0 becomes zero.  */
1390 psrldq $12, D(%xmm0)
/* Self-compare of %xmm0.  imm8 0x3a differs from the loop's 0x1a only in
   the polarity bits; presumably this is a terminator check on the
   leftover bytes -- confirm against the PCMPISTRI description.  */
1391 pcmpistri $0x3a,%xmm0, %xmm0
1392 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: leave when CF is clear.  The instruction
   feeding this branch and the closing #endif are not in this excerpt.  */
1394 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop when CF and ZF are both clear (the flag-setting
   instruction is not in this excerpt); otherwise take the common exit.  */
1397 ja LABEL(nibble_ashr_12_restart_use)
1399 jmp LABEL(nibble_ashr_exit_use)
1402 * The following cases will be handled by ashr_13
1403 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1404 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
/* ashr_13 entry sequence.  The LABEL(ashr_13) line and several
   neighbouring instructions are not present in this excerpt, so the notes
   below describe only the instructions that are visible.  */
/* TOLOWER is a macro defined elsewhere in the file; by its name it
   lower-cases both registers in the case-insensitive builds -- confirm
   against its definition.  */
1409 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality test of %xmm1 against %xmm2, result in %xmm2.
   D() is a macro defined elsewhere, presumably a plain operand wrapper.  */
1410 pcmpeqb %xmm1, D(%xmm2)
/* Byte-wise subtraction of %xmm0 from that equality mask.  */
1411 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into %r9d.  */
1412 pmovmskb %xmm2, %r9d
/* Leave for less32bytes on a non-zero result; the instruction that
   produces that result is not part of this excerpt.  */
1416 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1417 movdqa (%rdi), %xmm3
/* Macro defined elsewhere; by its name it adjusts the length counter of
   the strncmp-style builds -- confirm against its definition.  */
1419 UPDATE_STRNCMP_COUNTER
1421 mov $16, %rcx /* index for loads */
1422 mov $13, %r9d /* byte position left over from less32bytes case */
1424 * Setup %r10 value allows us to detect crossing a page boundary.
1425 * When %r10 goes positive we have crossed a page boundary and
1426 * need to do a nibble.
1429 and $0xfff, %r10 /* offset into 4K page */
/* Together with the mask above this leaves %r10 in the range -4096..-1.
   The instruction that loads %r10 beforehand is not in this excerpt.  */
1430 sub $0x1000, %r10 /* subtract 4K pagesize */
1432 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the ashr_13 case.  The 16-byte compare step is
   written out twice before the jump back to the top.  The index/counter
   updates and the #else/#endif lines of the conditionals are not present
   in this excerpt.  */
1435 LABEL(loop_ashr_13_use):
/* Divert to the nibble path on a signed "greater" result; the instruction
   that sets the flags is not in this excerpt.  */
1437 jg LABEL(nibble_ashr_13_use)
/* Re-entry point used by nibble_ashr_13_use when it decides to continue.  */
1439 LABEL(nibble_ashr_13_restart_use):
/* Build 16 consecutive bytes of the %rdi string: bytes 13..15 of the
   previous aligned block followed by bytes 0..12 of the current one.  */
1440 movdqa (%rdi, %rdx), %xmm0
1441 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1442 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare directly against the %rsi block.  */
1443 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative: load the %rsi block, run
   TOLOWER (defined elsewhere) on both operands, then compare.  */
1445 movdqa (%rsi,%rdx), %xmm1
1446 TOLOWER (%xmm0, %xmm1)
1447 pcmpistri $0x1a, %xmm1, %xmm0
1450 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1452 jbe LABEL(strcmp_exitz)
/* Second copy of the step: same nibble check, load, align and compare.  */
1457 jg LABEL(nibble_ashr_13_use)
1459 movdqa (%rdi, %rdx), %xmm0
1460 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1461 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1462 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1464 movdqa (%rsi,%rdx), %xmm1
1465 TOLOWER (%xmm0, %xmm1)
1466 pcmpistri $0x1a, %xmm1, %xmm0
1469 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1471 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of steps.  */
1474 jmp LABEL(loop_ashr_13_use)
/* Nibble path of the ashr_13 loop, entered through the jg branches in
   loop_ashr_13_use.  It looks only at bytes of the previous 16-byte block
   of the %rdi string.  */
1477 LABEL(nibble_ashr_13_use):
/* Aligned reload of the previous 16-byte block.  */
1479 movdqa -16(%rdi, %rdx), %xmm0
/* Byte shift right by 13: the block's last 3 bytes move to the low end
   and the rest of %xmm0 becomes zero.  */
1480 psrldq $13, D(%xmm0)
/* Self-compare of %xmm0.  imm8 0x3a differs from the loop's 0x1a only in
   the polarity bits; presumably this is a terminator check on the
   leftover bytes -- confirm against the PCMPISTRI description.  */
1481 pcmpistri $0x3a,%xmm0, %xmm0
1482 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: leave when CF is clear.  The instruction
   feeding this branch and the closing #endif are not in this excerpt.  */
1484 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop when CF and ZF are both clear (the flag-setting
   instruction is not in this excerpt); otherwise take the common exit.  */
1487 ja LABEL(nibble_ashr_13_restart_use)
1489 jmp LABEL(nibble_ashr_exit_use)
1492 * The following cases will be handled by ashr_14
1493 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1494 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
/* ashr_14 entry sequence.  The LABEL(ashr_14) line and several
   neighbouring instructions are not present in this excerpt, so the notes
   below describe only the instructions that are visible.  */
/* TOLOWER is a macro defined elsewhere in the file; by its name it
   lower-cases both registers in the case-insensitive builds -- confirm
   against its definition.  */
1499 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality test of %xmm1 against %xmm2, result in %xmm2.
   D() is a macro defined elsewhere, presumably a plain operand wrapper.  */
1500 pcmpeqb %xmm1, D(%xmm2)
/* Byte-wise subtraction of %xmm0 from that equality mask.  */
1501 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into %r9d.  */
1502 pmovmskb %xmm2, %r9d
/* Leave for less32bytes on a non-zero result; the instruction that
   produces that result is not part of this excerpt.  */
1506 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1507 movdqa (%rdi), %xmm3
/* Macro defined elsewhere; by its name it adjusts the length counter of
   the strncmp-style builds -- confirm against its definition.  */
1509 UPDATE_STRNCMP_COUNTER
1511 mov $16, %rcx /* index for loads */
1512 mov $14, %r9d /* byte position left over from less32bytes case */
1514 * Setup %r10 value allows us to detect crossing a page boundary.
1515 * When %r10 goes positive we have crossed a page boundary and
1516 * need to do a nibble.
1519 and $0xfff, %r10 /* offset into 4K page */
/* Together with the mask above this leaves %r10 in the range -4096..-1.
   The instruction that loads %r10 beforehand is not in this excerpt.  */
1520 sub $0x1000, %r10 /* subtract 4K pagesize */
1522 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the ashr_14 case.  The 16-byte compare step is
   written out twice before the jump back to the top.  The index/counter
   updates and the #else/#endif lines of the conditionals are not present
   in this excerpt.  */
1525 LABEL(loop_ashr_14_use):
/* Divert to the nibble path on a signed "greater" result; the instruction
   that sets the flags is not in this excerpt.  */
1527 jg LABEL(nibble_ashr_14_use)
/* Re-entry point used by nibble_ashr_14_use when it decides to continue.  */
1529 LABEL(nibble_ashr_14_restart_use):
/* Build 16 consecutive bytes of the %rdi string: bytes 14..15 of the
   previous aligned block followed by bytes 0..13 of the current one.  */
1530 movdqa (%rdi, %rdx), %xmm0
1531 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1532 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare directly against the %rsi block.  */
1533 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative: load the %rsi block, run
   TOLOWER (defined elsewhere) on both operands, then compare.  */
1535 movdqa (%rsi,%rdx), %xmm1
1536 TOLOWER (%xmm0, %xmm1)
1537 pcmpistri $0x1a, %xmm1, %xmm0
1540 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1542 jbe LABEL(strcmp_exitz)
/* Second copy of the step: same nibble check, load, align and compare.  */
1547 jg LABEL(nibble_ashr_14_use)
1549 movdqa (%rdi, %rdx), %xmm0
1550 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1551 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1552 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1554 movdqa (%rsi,%rdx), %xmm1
1555 TOLOWER (%xmm0, %xmm1)
1556 pcmpistri $0x1a, %xmm1, %xmm0
1559 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1561 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of steps.  */
1564 jmp LABEL(loop_ashr_14_use)
/* Nibble path of the ashr_14 loop, entered through the jg branches in
   loop_ashr_14_use.  It looks only at bytes of the previous 16-byte block
   of the %rdi string.  */
1567 LABEL(nibble_ashr_14_use):
/* Aligned reload of the previous 16-byte block.  */
1569 movdqa -16(%rdi, %rdx), %xmm0
/* Byte shift right by 14: the block's last 2 bytes move to the low end
   and the rest of %xmm0 becomes zero.  */
1570 psrldq $14, D(%xmm0)
/* Self-compare of %xmm0.  imm8 0x3a differs from the loop's 0x1a only in
   the polarity bits; presumably this is a terminator check on the
   leftover bytes -- confirm against the PCMPISTRI description.  */
1571 pcmpistri $0x3a,%xmm0, %xmm0
1572 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: leave when CF is clear.  The instruction
   feeding this branch and the closing #endif are not in this excerpt.  */
1574 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop when CF and ZF are both clear (the flag-setting
   instruction is not in this excerpt); otherwise take the common exit.  */
1577 ja LABEL(nibble_ashr_14_restart_use)
1579 jmp LABEL(nibble_ashr_exit_use)
1582 * The following cases will be handled by ashr_15
1583 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1584 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
/* ashr_15 entry sequence.  The LABEL(ashr_15) line and several
   neighbouring instructions are not present in this excerpt, so the notes
   below describe only the instructions that are visible.  */
/* TOLOWER is a macro defined elsewhere in the file; by its name it
   lower-cases both registers in the case-insensitive builds -- confirm
   against its definition.  */
1589 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality test of %xmm1 against %xmm2, result in %xmm2.
   D() is a macro defined elsewhere, presumably a plain operand wrapper.  */
1590 pcmpeqb %xmm1, D(%xmm2)
/* Byte-wise subtraction of %xmm0 from that equality mask.  */
1591 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into %r9d.  */
1592 pmovmskb %xmm2, %r9d
/* Leave for less32bytes on a non-zero result; the instruction that
   produces that result is not part of this excerpt.  */
1596 jnz LABEL(less32bytes)
/* Aligned load of the first 16 bytes at %rdi.  */
1598 movdqa (%rdi), %xmm3
/* Macro defined elsewhere; by its name it adjusts the length counter of
   the strncmp-style builds -- confirm against its definition.  */
1600 UPDATE_STRNCMP_COUNTER
1602 mov $16, %rcx /* index for loads */
1603 mov $15, %r9d /* byte position left over from less32bytes case */
1605 * Setup %r10 value allows us to detect crossing a page boundary.
1606 * When %r10 goes positive we have crossed a page boundary and
1607 * need to do a nibble.
1610 and $0xfff, %r10 /* offset into 4K page */
/* Together with the mask above this leaves %r10 in the range -4096..-1.
   The instruction that loads %r10 beforehand is not in this excerpt.  */
1612 sub $0x1000, %r10 /* subtract 4K pagesize */
1614 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the ashr_15 case.  The 16-byte compare step is
   written out twice before the jump back to the top.  The index/counter
   updates and the #else/#endif lines of the conditionals are not present
   in this excerpt.  */
1617 LABEL(loop_ashr_15_use):
/* Divert to the nibble path on a signed "greater" result; the instruction
   that sets the flags is not in this excerpt.  */
1619 jg LABEL(nibble_ashr_15_use)
/* Re-entry point used by nibble_ashr_15_use when it decides to continue.  */
1621 LABEL(nibble_ashr_15_restart_use):
/* Build 16 consecutive bytes of the %rdi string: byte 15 of the previous
   aligned block followed by bytes 0..14 of the current one.  */
1622 movdqa (%rdi, %rdx), %xmm0
1623 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1624 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare directly against the %rsi block.  */
1625 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative: load the %rsi block, run
   TOLOWER (defined elsewhere) on both operands, then compare.  */
1627 movdqa (%rsi,%rdx), %xmm1
1628 TOLOWER (%xmm0, %xmm1)
1629 pcmpistri $0x1a, %xmm1, %xmm0
1632 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1634 jbe LABEL(strcmp_exitz)
/* Second copy of the step: same nibble check, load, align and compare.  */
1639 jg LABEL(nibble_ashr_15_use)
1641 movdqa (%rdi, %rdx), %xmm0
1642 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1643 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1644 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1646 movdqa (%rsi,%rdx), %xmm1
1647 TOLOWER (%xmm0, %xmm1)
1648 pcmpistri $0x1a, %xmm1, %xmm0
1651 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1653 jbe LABEL(strcmp_exitz)
/* Back to the top for the next pair of steps.  */
1656 jmp LABEL(loop_ashr_15_use)
/* Nibble path of the ashr_15 loop, entered through the jg branches in
   loop_ashr_15_use.  It looks only at bytes of the previous 16-byte block
   of the %rdi string.  Unlike the other nibble paths there is no trailing
   jmp: when the final branch is not taken, execution runs on into the
   code that follows this block.  */
1659 LABEL(nibble_ashr_15_use):
/* Aligned reload of the previous 16-byte block.  */
1661 movdqa -16(%rdi, %rdx), %xmm0
/* Byte shift right by 15: the block's last byte moves to the low end and
   the rest of %xmm0 becomes zero.  */
1662 psrldq $15, D(%xmm0)
/* Self-compare of %xmm0.  imm8 0x3a differs from the loop's 0x1a only in
   the polarity bits; presumably this is a terminator check on the
   leftover byte -- confirm against the PCMPISTRI description.  */
1663 pcmpistri $0x3a,%xmm0, %xmm0
1664 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: leave when CF is clear.  The instruction
   feeding this branch and the closing #endif are not in this excerpt.  */
1666 jae LABEL(nibble_ashr_exit_use)
/* Resume the main loop when CF and ZF are both clear (the flag-setting
   instruction is not in this excerpt).  */
1669 ja LABEL(nibble_ashr_15_restart_use)
/* Common exit of the nibble_ashr_N_use paths: compare the leftover bytes
   in %xmm0 with the %rsi block, then fetch the two deciding bytes.
   Several original lines (the #else/#endif lines, a further label and
   some arithmetic) are not present in this excerpt, so the branch notes
   below are limited to what is visible.  */
1671 LABEL(nibble_ashr_exit_use):
1672 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare directly against the %rsi block.  */
1673 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative: load the %rsi block, run
   TOLOWER (defined elsewhere) on both operands, then compare.  */
1675 movdqa (%rsi,%rdx), %xmm1
1676 TOLOWER (%xmm0, %xmm1)
1677 pcmpistri $0x1a, %xmm1, %xmm0
/* CF clear goes to strcmp_exitz.  The file's header comment says a clear
   CFlag after this kind of compare means the result is 0.  */
1681 jnc LABEL(strcmp_exitz)
1682 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1684 jbe LABEL(strcmp_exitz)
/* %rdi = %rdi + %r9 - 16.  %r9 holds the per-case byte position loaded by
   the ashr_N entry sequences.  */
1687 lea -16(%rdi, %r9), %rdi
/* Zero-extended load of one byte from each string at index %rdx.  */
1688 movzbl (%rdi, %rdx), %eax
1689 movzbl (%rsi, %rdx), %edx
1694 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace both byte values with their 32-bit
   entries from the table at _nl_C_LC_CTYPE_tolower + 128*4.  */
1695 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1696 movl (%rcx,%rdx,4), %edx
1697 movl (%rcx,%rax,4), %eax
/* Short-string exit path; presumably the less32bytes target used by the
   ashr_N entry sequences, although its label line is not present in this
   excerpt.  Several instructions between the visible ones are missing as
   well, so only the visible steps are annotated.  */
1704 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1705 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1708 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1713 bsf %rdx, %rdx /* find and store bit index in %rdx */
1715 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only: go to strcmp_exitz on "below or equal";
   the instruction feeding this branch is not in this excerpt.  */
1717 jbe LABEL(strcmp_exitz)
/* Zero-extended load of the byte at index %rdx from each string.  */
1719 movzbl (%rsi, %rdx), %ecx
1720 movzbl (%rdi, %rdx), %eax
1722 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace both byte values with their 32-bit
   entries from the table at _nl_C_LC_CTYPE_tolower + 128*4.  */
1723 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1724 movl (%rdx,%rcx,4), %ecx
1725 movl (%rdx,%rax,4), %eax
/* strcmp_exitz is the branch target of the loop and exit paths.  Per the
   file's header comment the CFlag == 0 case returns 0; the instructions
   that would do so are not present in this excerpt.  The table lookup
   below most likely belongs to a further exit label whose line is also
   missing here.  */
1731 LABEL(strcmp_exitz):
1736 // XXX Same as code above
1741 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace the values in %ecx and %eax with their
   32-bit entries from the table at _nl_C_LC_CTYPE_tolower + 128*4.  */
1742 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1743 movl (%rdx,%rcx,4), %ecx
1744 movl (%rdx,%rax,4), %eax
/* Record the symbol size of STRCMP_SSE42: everything from its start up to
   this point.  */
1750 .size STRCMP_SSE42, .-STRCMP_SSE42
1757 /* Put all SSE 4.2 functions together. */
1758 .section .rodata.SECTION,"a",@progbits
/* Table of sixteen 32-bit entries, one per ashr_N handler.  Each entry is
   the handler's distance from the start of the table, not an absolute
   address.  Entries for ashr_1 through ashr_15 come first and ashr_0 is
   the last entry.  The code that indexes this table is outside this
   excerpt.  */
1760 LABEL(unaligned_table):
1761 .int LABEL(ashr_1) - LABEL(unaligned_table)
1762 .int LABEL(ashr_2) - LABEL(unaligned_table)
1763 .int LABEL(ashr_3) - LABEL(unaligned_table)
1764 .int LABEL(ashr_4) - LABEL(unaligned_table)
1765 .int LABEL(ashr_5) - LABEL(unaligned_table)
1766 .int LABEL(ashr_6) - LABEL(unaligned_table)
1767 .int LABEL(ashr_7) - LABEL(unaligned_table)
1768 .int LABEL(ashr_8) - LABEL(unaligned_table)
1769 .int LABEL(ashr_9) - LABEL(unaligned_table)
1770 .int LABEL(ashr_10) - LABEL(unaligned_table)
1771 .int LABEL(ashr_11) - LABEL(unaligned_table)
1772 .int LABEL(ashr_12) - LABEL(unaligned_table)
1773 .int LABEL(ashr_13) - LABEL(unaligned_table)
1774 .int LABEL(ashr_14) - LABEL(unaligned_table)
1775 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* ashr_0 occupies the sixteenth slot.  */
1776 .int LABEL(ashr_0) - LABEL(unaligned_table)