2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
23 # define STRCMP_SSE42 __strcmp_sse42
26 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
27 # include "locale-defines.h"
30 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
31 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
32 if the new counter > the old one or is 0. */
33 # define UPDATE_STRNCMP_COUNTER \
34 /* calculate left number to compare */ \
35 lea -16(%rcx, %r11), %r9; \
37 jb LABEL(strcmp_exitz); \
39 je LABEL(strcmp_exitz); \
42 # define UPDATE_STRNCMP_COUNTER
47 # define GLABEL(l) l##_avx
49 # define SECTION sse4.2
50 # define GLABEL(l) l##_sse42
53 #define LABEL(l) .L##l
57 | _SIDD_CMP_EQUAL_EACH
58 | _SIDD_NEGATIVE_POLARITY
59 | _SIDD_LEAST_SIGNIFICANT
60 on pcmpistri to find out if two 16-byte data elements are the same
61 and the offset of the first different byte. There are 4 cases:
63 1. Both 16-byte data elements are valid and identical.
64 2. Both 16-byte data elements have EOS and are identical.
65 3. Both 16-byte data elements are valid and they differ at offset X.
66 4. At least one 16-byte data element has EOS at offset X. Two 16-byte
67 data elements must differ at or before offset X.
69 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
71 case ECX CFlag ZFlag SFlag
77 We exit from the loop for cases 2, 3 and 4 with jbe which branches
78 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
81 /* Put all SSE 4.2 functions together. */
82 .section .text.SECTION,"ax",@progbits
84 .type STRCMP_SSE42, @function
87 #ifdef USE_AS_STRCASECMP_L
/* Entry stub for strcasecmp: load the locale pointer found through
   the __libc_tsd_LOCALE thread-local slot into %RDX_LP (the third
   integer-argument register in the SysV AMD64 ABI), then run into
   the strcasecmp_l code that follows this stub.  */
88 ENTRY (GLABEL(__strcasecmp))
/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE,
   fetched from its GOT entry.  */
89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* Read the locale pointer stored at %fs:(%rax).  */
90 mov %fs:(%rax),%RDX_LP
92 // XXX 5 byte should be before the function
/* Five raw bytes that encode a long NOP; they pad the gap between
   this stub and the code it falls through to.  */
94 .byte 0x0f,0x1f,0x44,0x00,0x00
95 END (GLABEL(__strcasecmp))
96 /* FALLTHROUGH to strcasecmp_l. */
98 #ifdef USE_AS_STRNCASECMP_L
/* Entry stub for strncasecmp: load the locale pointer found through
   the __libc_tsd_LOCALE thread-local slot into %RCX_LP (the fourth
   integer-argument register in the SysV AMD64 ABI), then run into
   the strncasecmp_l code that follows this stub.  */
99 ENTRY (GLABEL(__strncasecmp))
/* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE,
   fetched from its GOT entry.  */
100 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
/* Read the locale pointer stored at %fs:(%rax).  */
101 mov %fs:(%rax),%RCX_LP
103 // XXX 5 byte should be before the function
/* Five raw bytes that encode a long NOP; they pad the gap between
   this stub and the code it falls through to.  */
105 .byte 0x0f,0x1f,0x44,0x00,0x00
106 END (GLABEL(__strncasecmp))
107 /* FALLTHROUGH to strncasecmp_l. */
/* Redirect the legacy SSE mnemonics used throughout this file to
   their v-prefixed (VEX-encoded) counterparts.
   NOTE(review): the conditional that enables these aliases is not
   visible in this part of the file; presumably an AVX build guard --
   confirm.  */
112 # define movdqa vmovdqa
113 # define movdqu vmovdqu
114 # define pmovmskb vpmovmskb
115 # define pcmpistri vpcmpistri
116 # define psubb vpsubb
117 # define pcmpeqb vpcmpeqb
118 # define psrldq vpsrldq
119 # define pslldq vpslldq
120 # define palignr vpalignr
/* D(arg) repeats its operand, so "op src, D(dst)" expands to the
   three-operand form "op src, dst, dst" that the aliased
   instructions take.  */
122 # define D(arg) arg, arg
133 * This implementation uses SSE to compare up to 16 bytes at a time.
135 #ifdef USE_AS_STRCASECMP_L
136 /* We have to fall back on the C implementation for locales
137 with encodings not matching ASCII for single bytes. */
138 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
139 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
143 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
144 jne __strcasecmp_l_nonascii
146 #ifdef USE_AS_STRNCASECMP_L
147 /* We have to fall back on the C implementation for locales
148 with encodings not matching ASCII for single bytes. */
149 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
150 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
154 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
155 jne __strncasecmp_l_nonascii
158 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
159 test %RDX_LP, %RDX_LP
160 je LABEL(strcmp_exitz)
167 /* Use 64bit AND here to avoid long NOP padding. */
168 and $0x3f, %rcx /* rsi alignment in cache line */
169 and $0x3f, %rax /* rdi alignment in cache line */
170 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
171 .section .rodata.cst16,"aM",@progbits,16
174 .quad 0x4040404040404040
175 .quad 0x4040404040404040
178 .quad 0x5a5a5a5a5a5a5a5a
179 .quad 0x5a5a5a5a5a5a5a5a
181 .quad 0x5b5b5b5b5b5b5b5b
182 .quad 0x5b5b5b5b5b5b5b5b
185 .quad 0x2020202020202020
186 .quad 0x2020202020202020
188 movdqa LABEL(belowupper)(%rip), %xmm4
189 # define UCLOW_reg %xmm4
190 movdqa LABEL(topupper)(%rip), %xmm5
191 # define UCHIGH_reg %xmm5
192 movdqa LABEL(touppermask)(%rip), %xmm6
193 # define LCQWORD_reg %xmm6
196 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
198 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
201 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2), three-operand (v-prefixed) variant.
   For each byte of reg1 and of reg2: if the byte compares (signed)
   greater than the matching byte of UCLOW_reg and NOT greater than
   the matching byte of UCHIGH_reg, OR the bits of LCQWORD_reg into
   it; every other byte is left unchanged.
   Scratch registers overwritten: %xmm7, %xmm8, %xmm9, %xmm10.
   NOTE(review): presumably UCLOW_reg/UCHIGH_reg bracket the ASCII
   upper-case range and LCQWORD_reg holds the 0x20 case bit; the
   labels of those constants are not visible here -- confirm.  */
203 # define TOLOWER(reg1, reg2) \
204 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
205 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
206 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
207 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
208 vpandn %xmm7, %xmm8, %xmm8; \
209 vpandn %xmm9, %xmm10, %xmm10; \
210 vpand LCQWORD_reg, %xmm8, %xmm8; \
211 vpand LCQWORD_reg, %xmm10, %xmm10; \
212 vpor reg1, %xmm8, reg1; \
213 vpor reg2, %xmm10, reg2
215 # define TOLOWER(reg1, reg2) \
216 movdqa reg1, %xmm7; \
217 movdqa UCHIGH_reg, %xmm8; \
218 movdqa reg2, %xmm9; \
219 movdqa UCHIGH_reg, %xmm10; \
220 pcmpgtb UCLOW_reg, %xmm7; \
221 pcmpgtb reg1, %xmm8; \
222 pcmpgtb UCLOW_reg, %xmm9; \
223 pcmpgtb reg2, %xmm10; \
225 pand %xmm10, %xmm9; \
226 pand LCQWORD_reg, %xmm7; \
227 pand LCQWORD_reg, %xmm9; \
231 TOLOWER (%xmm1, %xmm2)
233 # define TOLOWER(reg1, reg2)
235 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
236 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
237 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
238 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
240 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
241 jnz LABEL(less16bytes)/* If not, find different value or null char */
242 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
244 jbe LABEL(strcmp_exitz)/* finish comparison */
246 add $16, %rsi /* prepare to search next 16 bytes */
247 add $16, %rdi /* prepare to search next 16 bytes */
250 * Determine source and destination string offsets from 16-byte
251 * alignment. Use relative offset difference between the two to
252 * determine which case below to use.
256 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
257 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
258 mov $0xffff, %edx /* for equivalent offset */
260 and $0xf, %ecx /* offset of rsi */
261 and $0xf, %eax /* offset of rdi */
262 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
264 je LABEL(ashr_0) /* rsi and rdi relative offset same */
266 mov %edx, %r8d /* r8d is offset flag for exit tail */
274 lea LABEL(unaligned_table)(%rip), %r10
275 movslq (%r10, %r9,4), %r9
276 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
277 lea (%r10, %r9), %r10
278 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
281 * The following cases will be handled by ashr_0
282 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
283 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
289 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
290 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
291 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
294 TOLOWER (%xmm1, %xmm2)
295 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
297 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
299 shr %cl, %edx /* adjust 0xffff for offset */
300 shr %cl, %r9d /* adjust for 16-byte offset */
303 * edx must equal r9d if the remaining (16 - rcx) bytes match those starting
304 * at (16 - rax) and no null char was seen.
306 jne LABEL(less32bytes) /* mismatch or null char */
307 UPDATE_STRNCMP_COUNTER
312 * Now both strings are aligned at a 16-byte boundary. Loop over the strings
313 * checking 32 bytes per iteration.
315 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
318 movdqa (%rdi,%rdx), %xmm0
319 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
320 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
322 movdqa (%rsi,%rdx), %xmm1
323 TOLOWER (%xmm0, %xmm1)
324 pcmpistri $0x1a, %xmm1, %xmm0
327 jbe LABEL(ashr_0_exit_use)
328 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
330 jbe LABEL(strcmp_exitz)
333 movdqa (%rdi,%rdx), %xmm0
334 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
335 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
337 movdqa (%rsi,%rdx), %xmm1
338 TOLOWER (%xmm0, %xmm1)
339 pcmpistri $0x1a, %xmm1, %xmm0
342 jbe LABEL(ashr_0_exit_use)
343 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
345 jbe LABEL(strcmp_exitz)
347 jmp LABEL(ashr_0_use)
351 LABEL(ashr_0_exit_use):
352 jnc LABEL(strcmp_exitz)
353 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
355 jbe LABEL(strcmp_exitz)
357 lea -16(%rdx, %rcx), %rcx
358 movzbl (%rdi, %rcx), %eax
359 movzbl (%rsi, %rcx), %edx
360 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
361 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
362 movl (%rcx,%rax,4), %eax
363 movl (%rcx,%rdx,4), %edx
371 * The following cases will be handled by ashr_1
372 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
373 * n(15) n -15 0(15 +(n-15) - n) ashr_1
377 pslldq $15, D(%xmm2) /* shift first string to align with second */
378 TOLOWER (%xmm1, %xmm2)
379 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
380 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
382 shr %cl, %edx /* adjust 0xffff for offset */
383 shr %cl, %r9d /* adjust for 16-byte offset */
385 jnz LABEL(less32bytes) /* mismatch or null char seen */
387 UPDATE_STRNCMP_COUNTER
389 mov $16, %rcx /* index for loads*/
390 mov $1, %r9d /* byte position left over from less32bytes case */
392 * Set up %r10 so that we can detect crossing a page boundary.
393 * When %r10 goes positive we have crossed a page boundary and
394 * need to do a nibble.
397 and $0xfff, %r10 /* offset into 4K page */
398 sub $0x1000, %r10 /* subtract 4K pagesize */
399 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
402 LABEL(loop_ashr_1_use):
404 jg LABEL(nibble_ashr_1_use)
406 LABEL(nibble_ashr_1_restart_use):
407 movdqa (%rdi, %rdx), %xmm0
408 palignr $1, -16(%rdi, %rdx), D(%xmm0)
409 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
410 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
412 movdqa (%rsi,%rdx), %xmm1
413 TOLOWER (%xmm0, %xmm1)
414 pcmpistri $0x1a, %xmm1, %xmm0
417 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
419 jbe LABEL(strcmp_exitz)
424 jg LABEL(nibble_ashr_1_use)
426 movdqa (%rdi, %rdx), %xmm0
427 palignr $1, -16(%rdi, %rdx), D(%xmm0)
428 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
429 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
431 movdqa (%rsi,%rdx), %xmm1
432 TOLOWER (%xmm0, %xmm1)
433 pcmpistri $0x1a, %xmm1, %xmm0
436 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
438 jbe LABEL(strcmp_exitz)
441 jmp LABEL(loop_ashr_1_use)
444 LABEL(nibble_ashr_1_use):
446 movdqa -16(%rdi, %rdx), %xmm0
448 pcmpistri $0x3a,%xmm0, %xmm0
449 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
451 jae LABEL(nibble_ashr_exit_use)
454 ja LABEL(nibble_ashr_1_restart_use)
456 jmp LABEL(nibble_ashr_exit_use)
459 * The following cases will be handled by ashr_2
460 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
461 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
466 TOLOWER (%xmm1, %xmm2)
467 pcmpeqb %xmm1, D(%xmm2)
468 psubb %xmm0, D(%xmm2)
473 jnz LABEL(less32bytes)
475 UPDATE_STRNCMP_COUNTER
477 mov $16, %rcx /* index for loads */
478 mov $2, %r9d /* byte position left over from less32bytes case */
480 * Set up %r10 so that we can detect crossing a page boundary.
481 * When %r10 goes positive we have crossed a page boundary and
482 * need to do a nibble.
485 and $0xfff, %r10 /* offset into 4K page */
486 sub $0x1000, %r10 /* subtract 4K pagesize */
487 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
490 LABEL(loop_ashr_2_use):
492 jg LABEL(nibble_ashr_2_use)
494 LABEL(nibble_ashr_2_restart_use):
495 movdqa (%rdi, %rdx), %xmm0
496 palignr $2, -16(%rdi, %rdx), D(%xmm0)
497 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
498 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
500 movdqa (%rsi,%rdx), %xmm1
501 TOLOWER (%xmm0, %xmm1)
502 pcmpistri $0x1a, %xmm1, %xmm0
505 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
507 jbe LABEL(strcmp_exitz)
512 jg LABEL(nibble_ashr_2_use)
514 movdqa (%rdi, %rdx), %xmm0
515 palignr $2, -16(%rdi, %rdx), D(%xmm0)
516 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
517 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
519 movdqa (%rsi,%rdx), %xmm1
520 TOLOWER (%xmm0, %xmm1)
521 pcmpistri $0x1a, %xmm1, %xmm0
524 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
526 jbe LABEL(strcmp_exitz)
529 jmp LABEL(loop_ashr_2_use)
532 LABEL(nibble_ashr_2_use):
534 movdqa -16(%rdi, %rdx), %xmm0
536 pcmpistri $0x3a,%xmm0, %xmm0
537 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
539 jae LABEL(nibble_ashr_exit_use)
542 ja LABEL(nibble_ashr_2_restart_use)
544 jmp LABEL(nibble_ashr_exit_use)
547 * The following cases will be handled by ashr_3
548 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
549 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
554 TOLOWER (%xmm1, %xmm2)
555 pcmpeqb %xmm1, D(%xmm2)
556 psubb %xmm0, D(%xmm2)
561 jnz LABEL(less32bytes)
564 UPDATE_STRNCMP_COUNTER
566 mov $16, %rcx /* index for loads */
567 mov $3, %r9d /* byte position left over from less32bytes case */
569 * Set up %r10 so that we can detect crossing a page boundary.
570 * When %r10 goes positive we have crossed a page boundary and
571 * need to do a nibble.
574 and $0xfff, %r10 /* offset into 4K page */
575 sub $0x1000, %r10 /* subtract 4K pagesize */
576 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
578 LABEL(loop_ashr_3_use):
580 jg LABEL(nibble_ashr_3_use)
582 LABEL(nibble_ashr_3_restart_use):
583 movdqa (%rdi, %rdx), %xmm0
584 palignr $3, -16(%rdi, %rdx), D(%xmm0)
585 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
586 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
588 movdqa (%rsi,%rdx), %xmm1
589 TOLOWER (%xmm0, %xmm1)
590 pcmpistri $0x1a, %xmm1, %xmm0
593 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
595 jbe LABEL(strcmp_exitz)
600 jg LABEL(nibble_ashr_3_use)
602 movdqa (%rdi, %rdx), %xmm0
603 palignr $3, -16(%rdi, %rdx), D(%xmm0)
604 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
605 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
607 movdqa (%rsi,%rdx), %xmm1
608 TOLOWER (%xmm0, %xmm1)
609 pcmpistri $0x1a, %xmm1, %xmm0
612 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
614 jbe LABEL(strcmp_exitz)
617 jmp LABEL(loop_ashr_3_use)
620 LABEL(nibble_ashr_3_use):
622 movdqa -16(%rdi, %rdx), %xmm0
624 pcmpistri $0x3a,%xmm0, %xmm0
625 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
627 jae LABEL(nibble_ashr_exit_use)
630 ja LABEL(nibble_ashr_3_restart_use)
632 jmp LABEL(nibble_ashr_exit_use)
635 * The following cases will be handled by ashr_4
636 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
637 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
642 TOLOWER (%xmm1, %xmm2)
643 pcmpeqb %xmm1, D(%xmm2)
644 psubb %xmm0, D(%xmm2)
649 jnz LABEL(less32bytes)
652 UPDATE_STRNCMP_COUNTER
654 mov $16, %rcx /* index for loads */
655 mov $4, %r9d /* byte position left over from less32bytes case */
657 * Set up %r10 so that we can detect crossing a page boundary.
658 * When %r10 goes positive we have crossed a page boundary and
659 * need to do a nibble.
662 and $0xfff, %r10 /* offset into 4K page */
663 sub $0x1000, %r10 /* subtract 4K pagesize */
664 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
667 LABEL(loop_ashr_4_use):
669 jg LABEL(nibble_ashr_4_use)
671 LABEL(nibble_ashr_4_restart_use):
672 movdqa (%rdi, %rdx), %xmm0
673 palignr $4, -16(%rdi, %rdx), D(%xmm0)
674 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
675 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
677 movdqa (%rsi,%rdx), %xmm1
678 TOLOWER (%xmm0, %xmm1)
679 pcmpistri $0x1a, %xmm1, %xmm0
682 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
684 jbe LABEL(strcmp_exitz)
689 jg LABEL(nibble_ashr_4_use)
691 movdqa (%rdi, %rdx), %xmm0
692 palignr $4, -16(%rdi, %rdx), D(%xmm0)
693 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
694 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
696 movdqa (%rsi,%rdx), %xmm1
697 TOLOWER (%xmm0, %xmm1)
698 pcmpistri $0x1a, %xmm1, %xmm0
701 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
703 jbe LABEL(strcmp_exitz)
706 jmp LABEL(loop_ashr_4_use)
709 LABEL(nibble_ashr_4_use):
711 movdqa -16(%rdi, %rdx), %xmm0
713 pcmpistri $0x3a,%xmm0, %xmm0
714 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
716 jae LABEL(nibble_ashr_exit_use)
719 ja LABEL(nibble_ashr_4_restart_use)
721 jmp LABEL(nibble_ashr_exit_use)
724 * The following cases will be handled by ashr_5
725 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
726 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
731 TOLOWER (%xmm1, %xmm2)
732 pcmpeqb %xmm1, D(%xmm2)
733 psubb %xmm0, D(%xmm2)
738 jnz LABEL(less32bytes)
741 UPDATE_STRNCMP_COUNTER
743 mov $16, %rcx /* index for loads */
744 mov $5, %r9d /* byte position left over from less32bytes case */
746 * Set up %r10 so that we can detect crossing a page boundary.
747 * When %r10 goes positive we have crossed a page boundary and
748 * need to do a nibble.
751 and $0xfff, %r10 /* offset into 4K page */
752 sub $0x1000, %r10 /* subtract 4K pagesize */
753 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
756 LABEL(loop_ashr_5_use):
758 jg LABEL(nibble_ashr_5_use)
760 LABEL(nibble_ashr_5_restart_use):
761 movdqa (%rdi, %rdx), %xmm0
762 palignr $5, -16(%rdi, %rdx), D(%xmm0)
763 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
764 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
766 movdqa (%rsi,%rdx), %xmm1
767 TOLOWER (%xmm0, %xmm1)
768 pcmpistri $0x1a, %xmm1, %xmm0
771 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
773 jbe LABEL(strcmp_exitz)
778 jg LABEL(nibble_ashr_5_use)
780 movdqa (%rdi, %rdx), %xmm0
782 palignr $5, -16(%rdi, %rdx), D(%xmm0)
783 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
784 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
786 movdqa (%rsi,%rdx), %xmm1
787 TOLOWER (%xmm0, %xmm1)
788 pcmpistri $0x1a, %xmm1, %xmm0
791 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
793 jbe LABEL(strcmp_exitz)
796 jmp LABEL(loop_ashr_5_use)
799 LABEL(nibble_ashr_5_use):
801 movdqa -16(%rdi, %rdx), %xmm0
803 pcmpistri $0x3a,%xmm0, %xmm0
804 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
806 jae LABEL(nibble_ashr_exit_use)
809 ja LABEL(nibble_ashr_5_restart_use)
811 jmp LABEL(nibble_ashr_exit_use)
814 * The following cases will be handled by ashr_6
815 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
816 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
821 TOLOWER (%xmm1, %xmm2)
822 pcmpeqb %xmm1, D(%xmm2)
823 psubb %xmm0, D(%xmm2)
828 jnz LABEL(less32bytes)
831 UPDATE_STRNCMP_COUNTER
833 mov $16, %rcx /* index for loads */
834 mov $6, %r9d /* byte position left over from less32bytes case */
836 * Set up %r10 so that we can detect crossing a page boundary.
837 * When %r10 goes positive we have crossed a page boundary and
838 * need to do a nibble.
841 and $0xfff, %r10 /* offset into 4K page */
842 sub $0x1000, %r10 /* subtract 4K pagesize */
843 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
846 LABEL(loop_ashr_6_use):
848 jg LABEL(nibble_ashr_6_use)
850 LABEL(nibble_ashr_6_restart_use):
851 movdqa (%rdi, %rdx), %xmm0
852 palignr $6, -16(%rdi, %rdx), D(%xmm0)
853 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
854 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
856 movdqa (%rsi,%rdx), %xmm1
857 TOLOWER (%xmm0, %xmm1)
858 pcmpistri $0x1a, %xmm1, %xmm0
861 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
863 jbe LABEL(strcmp_exitz)
868 jg LABEL(nibble_ashr_6_use)
870 movdqa (%rdi, %rdx), %xmm0
871 palignr $6, -16(%rdi, %rdx), D(%xmm0)
872 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
873 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
875 movdqa (%rsi,%rdx), %xmm1
876 TOLOWER (%xmm0, %xmm1)
877 pcmpistri $0x1a, %xmm1, %xmm0
880 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
882 jbe LABEL(strcmp_exitz)
885 jmp LABEL(loop_ashr_6_use)
888 LABEL(nibble_ashr_6_use):
890 movdqa -16(%rdi, %rdx), %xmm0
892 pcmpistri $0x3a,%xmm0, %xmm0
893 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
895 jae LABEL(nibble_ashr_exit_use)
898 ja LABEL(nibble_ashr_6_restart_use)
900 jmp LABEL(nibble_ashr_exit_use)
903 * The following cases will be handled by ashr_7
904 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
905 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
910 TOLOWER (%xmm1, %xmm2)
911 pcmpeqb %xmm1, D(%xmm2)
912 psubb %xmm0, D(%xmm2)
917 jnz LABEL(less32bytes)
920 UPDATE_STRNCMP_COUNTER
922 mov $16, %rcx /* index for loads */
923 mov $7, %r9d /* byte position left over from less32bytes case */
925 * Set up %r10 so that we can detect crossing a page boundary.
926 * When %r10 goes positive we have crossed a page boundary and
927 * need to do a nibble.
930 and $0xfff, %r10 /* offset into 4K page */
931 sub $0x1000, %r10 /* subtract 4K pagesize */
932 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
935 LABEL(loop_ashr_7_use):
937 jg LABEL(nibble_ashr_7_use)
939 LABEL(nibble_ashr_7_restart_use):
940 movdqa (%rdi, %rdx), %xmm0
941 palignr $7, -16(%rdi, %rdx), D(%xmm0)
942 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
943 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
945 movdqa (%rsi,%rdx), %xmm1
946 TOLOWER (%xmm0, %xmm1)
947 pcmpistri $0x1a, %xmm1, %xmm0
950 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
952 jbe LABEL(strcmp_exitz)
957 jg LABEL(nibble_ashr_7_use)
959 movdqa (%rdi, %rdx), %xmm0
960 palignr $7, -16(%rdi, %rdx), D(%xmm0)
961 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
962 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
964 movdqa (%rsi,%rdx), %xmm1
965 TOLOWER (%xmm0, %xmm1)
966 pcmpistri $0x1a, %xmm1, %xmm0
969 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
971 jbe LABEL(strcmp_exitz)
974 jmp LABEL(loop_ashr_7_use)
977 LABEL(nibble_ashr_7_use):
979 movdqa -16(%rdi, %rdx), %xmm0
981 pcmpistri $0x3a,%xmm0, %xmm0
982 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
984 jae LABEL(nibble_ashr_exit_use)
987 ja LABEL(nibble_ashr_7_restart_use)
989 jmp LABEL(nibble_ashr_exit_use)
992 * The following cases will be handled by ashr_8
993 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
994 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
999 TOLOWER (%xmm1, %xmm2)
1000 pcmpeqb %xmm1, D(%xmm2)
1001 psubb %xmm0, D(%xmm2)
1002 pmovmskb %xmm2, %r9d
1006 jnz LABEL(less32bytes)
1007 movdqa (%rdi), %xmm3
1009 UPDATE_STRNCMP_COUNTER
1011 mov $16, %rcx /* index for loads */
1012 mov $8, %r9d /* byte position left over from less32bytes case */
1014 * Set up %r10 so that we can detect crossing a page boundary.
1015 * When %r10 goes positive we have crossed a page boundary and
1016 * need to do a nibble.
1019 and $0xfff, %r10 /* offset into 4K page */
1020 sub $0x1000, %r10 /* subtract 4K pagesize */
1021 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1024 LABEL(loop_ashr_8_use):
1026 jg LABEL(nibble_ashr_8_use)
1028 LABEL(nibble_ashr_8_restart_use):
1029 movdqa (%rdi, %rdx), %xmm0
1030 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1031 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1032 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1034 movdqa (%rsi,%rdx), %xmm1
1035 TOLOWER (%xmm0, %xmm1)
1036 pcmpistri $0x1a, %xmm1, %xmm0
1039 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1041 jbe LABEL(strcmp_exitz)
1046 jg LABEL(nibble_ashr_8_use)
1048 movdqa (%rdi, %rdx), %xmm0
1049 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1050 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1051 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1053 movdqa (%rsi,%rdx), %xmm1
1054 TOLOWER (%xmm0, %xmm1)
1055 pcmpistri $0x1a, %xmm1, %xmm0
1058 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1060 jbe LABEL(strcmp_exitz)
1063 jmp LABEL(loop_ashr_8_use)
1066 LABEL(nibble_ashr_8_use):
1068 movdqa -16(%rdi, %rdx), %xmm0
1070 pcmpistri $0x3a,%xmm0, %xmm0
1071 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1073 jae LABEL(nibble_ashr_exit_use)
1076 ja LABEL(nibble_ashr_8_restart_use)
1078 jmp LABEL(nibble_ashr_exit_use)
1081 * The following cases will be handled by ashr_9
1082 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1083 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1088 TOLOWER (%xmm1, %xmm2)
1089 pcmpeqb %xmm1, D(%xmm2)
1090 psubb %xmm0, D(%xmm2)
1091 pmovmskb %xmm2, %r9d
1095 jnz LABEL(less32bytes)
1096 movdqa (%rdi), %xmm3
1098 UPDATE_STRNCMP_COUNTER
1100 mov $16, %rcx /* index for loads */
1101 mov $9, %r9d /* byte position left over from less32bytes case */
1103 * Set up %r10 so that we can detect crossing a page boundary.
1104 * When %r10 goes positive we have crossed a page boundary and
1105 * need to do a nibble.
1108 and $0xfff, %r10 /* offset into 4K page */
1109 sub $0x1000, %r10 /* subtract 4K pagesize */
1110 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1113 LABEL(loop_ashr_9_use):
1115 jg LABEL(nibble_ashr_9_use)
1117 LABEL(nibble_ashr_9_restart_use):
1118 movdqa (%rdi, %rdx), %xmm0
1120 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1121 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1122 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1124 movdqa (%rsi,%rdx), %xmm1
1125 TOLOWER (%xmm0, %xmm1)
1126 pcmpistri $0x1a, %xmm1, %xmm0
1129 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1131 jbe LABEL(strcmp_exitz)
1136 jg LABEL(nibble_ashr_9_use)
1138 movdqa (%rdi, %rdx), %xmm0
1139 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1140 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1141 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1143 movdqa (%rsi,%rdx), %xmm1
1144 TOLOWER (%xmm0, %xmm1)
1145 pcmpistri $0x1a, %xmm1, %xmm0
1148 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1150 jbe LABEL(strcmp_exitz)
1153 jmp LABEL(loop_ashr_9_use)
1156 LABEL(nibble_ashr_9_use):
1158 movdqa -16(%rdi, %rdx), %xmm0
1160 pcmpistri $0x3a,%xmm0, %xmm0
1161 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1163 jae LABEL(nibble_ashr_exit_use)
1166 ja LABEL(nibble_ashr_9_restart_use)
1168 jmp LABEL(nibble_ashr_exit_use)
1171 * The following cases will be handled by ashr_10
1172 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1173 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1178 TOLOWER (%xmm1, %xmm2)
1179 pcmpeqb %xmm1, D(%xmm2)
1180 psubb %xmm0, D(%xmm2)
1181 pmovmskb %xmm2, %r9d
1185 jnz LABEL(less32bytes)
1186 movdqa (%rdi), %xmm3
1188 UPDATE_STRNCMP_COUNTER
1190 mov $16, %rcx /* index for loads */
1191 mov $10, %r9d /* byte position left over from less32bytes case */
1193 * Set up %r10 so that we can detect crossing a page boundary.
1194 * When %r10 goes positive we have crossed a page boundary and
1195 * need to do a nibble.
1198 and $0xfff, %r10 /* offset into 4K page */
1199 sub $0x1000, %r10 /* subtract 4K pagesize */
1200 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1203 LABEL(loop_ashr_10_use):
1205 jg LABEL(nibble_ashr_10_use)
1207 LABEL(nibble_ashr_10_restart_use):
1208 movdqa (%rdi, %rdx), %xmm0
1209 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1210 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1211 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1213 movdqa (%rsi,%rdx), %xmm1
1214 TOLOWER (%xmm0, %xmm1)
1215 pcmpistri $0x1a, %xmm1, %xmm0
1218 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1220 jbe LABEL(strcmp_exitz)
1225 jg LABEL(nibble_ashr_10_use)
1227 movdqa (%rdi, %rdx), %xmm0
1228 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1229 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1230 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1232 movdqa (%rsi,%rdx), %xmm1
1233 TOLOWER (%xmm0, %xmm1)
1234 pcmpistri $0x1a, %xmm1, %xmm0
1237 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1239 jbe LABEL(strcmp_exitz)
1242 jmp LABEL(loop_ashr_10_use)
1245 LABEL(nibble_ashr_10_use):
1247 movdqa -16(%rdi, %rdx), %xmm0
1248 psrldq $10, D(%xmm0)
1249 pcmpistri $0x3a,%xmm0, %xmm0
1250 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1252 jae LABEL(nibble_ashr_exit_use)
1255 ja LABEL(nibble_ashr_10_restart_use)
1257 jmp LABEL(nibble_ashr_exit_use)
1260 * The following cases will be handled by ashr_11
1261 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1262 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1267 TOLOWER (%xmm1, %xmm2)
1268 pcmpeqb %xmm1, D(%xmm2)
1269 psubb %xmm0, D(%xmm2)
1270 pmovmskb %xmm2, %r9d
1274 jnz LABEL(less32bytes)
1275 movdqa (%rdi), %xmm3
1277 UPDATE_STRNCMP_COUNTER
1279 mov $16, %rcx /* index for loads */
1280 mov $11, %r9d /* byte position left over from less32bytes case */
1282 * Set up %r10 so that we can detect crossing a page boundary.
1283 * When %r10 goes positive we have crossed a page boundary and
1284 * need to do a nibble.
1287 and $0xfff, %r10 /* offset into 4K page */
1288 sub $0x1000, %r10 /* subtract 4K pagesize */
1289 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1292 LABEL(loop_ashr_11_use):
1294 jg LABEL(nibble_ashr_11_use)
1296 LABEL(nibble_ashr_11_restart_use):
1297 movdqa (%rdi, %rdx), %xmm0
1298 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1299 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1300 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1302 movdqa (%rsi,%rdx), %xmm1
1303 TOLOWER (%xmm0, %xmm1)
1304 pcmpistri $0x1a, %xmm1, %xmm0
1307 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1309 jbe LABEL(strcmp_exitz)
1314 jg LABEL(nibble_ashr_11_use)
1316 movdqa (%rdi, %rdx), %xmm0
1317 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1318 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1319 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1321 movdqa (%rsi,%rdx), %xmm1
1322 TOLOWER (%xmm0, %xmm1)
1323 pcmpistri $0x1a, %xmm1, %xmm0
1326 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1328 jbe LABEL(strcmp_exitz)
1331 jmp LABEL(loop_ashr_11_use)
1334 LABEL(nibble_ashr_11_use):
1336 movdqa -16(%rdi, %rdx), %xmm0
1337 psrldq $11, D(%xmm0)
1338 pcmpistri $0x3a,%xmm0, %xmm0
1339 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1341 jae LABEL(nibble_ashr_exit_use)
1344 ja LABEL(nibble_ashr_11_restart_use)
1346 jmp LABEL(nibble_ashr_exit_use)
1349 * The following cases will be handled by ashr_12
1350 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1351 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1356 TOLOWER (%xmm1, %xmm2)
1357 pcmpeqb %xmm1, D(%xmm2)
1358 psubb %xmm0, D(%xmm2)
1359 pmovmskb %xmm2, %r9d
1363 jnz LABEL(less32bytes)
1364 movdqa (%rdi), %xmm3
1366 UPDATE_STRNCMP_COUNTER
1368 mov $16, %rcx /* index for loads */
1369 mov $12, %r9d /* byte position left over from less32bytes case */
1371 * Set up %r10 so that we can detect crossing a page boundary.
1372 * When %r10 goes positive we have crossed a page boundary and
1373 * need to do a nibble.
1376 and $0xfff, %r10 /* offset into 4K page */
1377 sub $0x1000, %r10 /* subtract 4K pagesize */
1378 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the 12-byte shift case.  The compare body appears
   twice before the jump back to the top.  */
1381 LABEL(loop_ashr_12_use):
/* Divert to the nibble path on signed "greater"; per the setup comment,
   %r10 going positive means a page boundary was crossed.  NOTE(review): the
   instruction that sets these flags is not visible in this listing.  */
1383 jg LABEL(nibble_ashr_12_use)
/* Re-entry point used by nibble_ashr_12_use when it branches back.  */
1385 LABEL(nibble_ashr_12_restart_use):
/* Assemble 16 bytes of the %rdi string: palignr joins the block at
   (%rdi, %rdx) (high half) with the preceding block (low half) and shifts
   right by 12 bytes, giving the last 4 bytes of the previous block followed
   by the first 12 bytes of the current one.  D() is a macro defined outside
   this chunk.  */
1386 movdqa (%rdi, %rdx), %xmm0
1387 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1388 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds compare directly against 16 bytes at (%rsi, %rdx).
   imm 0x1a = signed bytes, equal-each, negative polarity, least-significant
   index (the mode described in the comment at the top of the file).  */
1389 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative (the preprocessor line that
   separates the two variants is not visible in this listing): load the %rsi
   block and pass both operands through TOLOWER, a macro defined elsewhere
   in the file, before the same compare.  */
1391 movdqa (%rsi,%rdx), %xmm1
1392 TOLOWER (%xmm0, %xmm1)
1393 pcmpistri $0x1a, %xmm1, %xmm0
1396 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1398 jbe LABEL(strcmp_exitz)
/* Second copy of the page-crossing check and of the compare body.  */
1403 jg LABEL(nibble_ashr_12_use)
1405 movdqa (%rdi, %rdx), %xmm0
1406 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1407 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1408 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1410 movdqa (%rsi,%rdx), %xmm1
1411 TOLOWER (%xmm0, %xmm1)
1412 pcmpistri $0x1a, %xmm1, %xmm0
1415 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1417 jbe LABEL(strcmp_exitz)
/* Back to the top of the loop.  */
1420 jmp LABEL(loop_ashr_12_use)
/* Nibble path for the 12-byte shift case, reached from the `jg' checks in
   loop_ashr_12_use.  It looks only at the tail of the previous %rdi block
   before deciding whether the loop may load the next block.  */
1423 LABEL(nibble_ashr_12_use):
/* Reload the aligned 16-byte block that precedes (%rdi, %rdx).  */
1425 movdqa -16(%rdi, %rdx), %xmm0
/* Shift right by 12 bytes: the top 4 bytes move to the bottom of %xmm0 and
   the vacated bytes become zero.  D() is a macro defined outside this chunk.  */
1426 psrldq $12, D(%xmm0)
/* Compare %xmm0 with itself.  imm 0x3a = signed bytes, equal-each,
   masked-negative polarity, least-significant index, so %ecx receives the
   index of the first NUL byte in %xmm0.  */
1427 pcmpistri $0x3a,%xmm0, %xmm0
1428 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* NOTE(review): the compare feeding this `jae' is not visible in this
   listing; presumably it checks the remaining length counter -- confirm.  */
1430 jae LABEL(nibble_ashr_exit_use)
/* NOTE(review): the flags for `ja' come from an instruction not shown here,
   presumably a test of %ecx against the number of surviving bytes --
   confirm.  When taken, the loop resumes at its restart label.  */
1433 ja LABEL(nibble_ashr_12_restart_use)
/* Otherwise finish through the shared nibble exit.  */
1435 jmp LABEL(nibble_ashr_exit_use)
1438 * The following cases will be handled by ashr_13
1439 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1440 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
/* Entry sequence for the ashr_13 case.  Only part of it is visible in this
   listing; %xmm0, %xmm1 and %xmm2 are presumably prepared by code outside
   this view.  TOLOWER is a macro defined elsewhere in the file; presumably
   it lower-cases both registers in the case-insensitive builds -- confirm.  */
1445 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality: 0xff in each byte of %xmm2 that matches %xmm1, 0x00
   elsewhere.  D() is a macro defined outside this chunk.  */
1446 pcmpeqb %xmm1, D(%xmm2)
/* Subtract %xmm0 byte-wise from that mask.  NOTE(review): %xmm0 presumably
   carries an end-of-string mask computed earlier -- confirm.  */
1447 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into the low 16 bits of %r9d.  */
1448 pmovmskb %xmm2, %r9d
/* NOTE(review): the flag-setting instruction for this branch is not
   visible in this listing.  */
1452 jnz LABEL(less32bytes)
/* Aligned load of the 16 bytes at %rdi into %xmm3.  */
1453 movdqa (%rdi), %xmm3
/* Expands to the %r11 counter update (with a possible branch to
   strcmp_exitz) in the strncmp/strncasecmp builds and to nothing otherwise;
   see its definition near the top of the file.  */
1455 UPDATE_STRNCMP_COUNTER
1457 mov $16, %rcx /* index for loads */
1458 mov $13, %r9d /* byte position left over from less32bytes case */
1460 * Setup %r10 value allows us to detect crossing a page boundary.
1461 * When %r10 goes positive we have crossed a page boundary and
1462 * need to do a nibble.
1465 and $0xfff, %r10 /* offset into 4K page */
1466 sub $0x1000, %r10 /* subtract 4K pagesize */
1468 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the 13-byte shift case.  The compare body appears
   twice before the jump back to the top.  */
1471 LABEL(loop_ashr_13_use):
/* Divert to the nibble path on signed "greater"; per the setup comment,
   %r10 going positive means a page boundary was crossed.  NOTE(review): the
   instruction that sets these flags is not visible in this listing.  */
1473 jg LABEL(nibble_ashr_13_use)
/* Re-entry point used by nibble_ashr_13_use when it branches back.  */
1475 LABEL(nibble_ashr_13_restart_use):
/* Assemble 16 bytes of the %rdi string: palignr joins the block at
   (%rdi, %rdx) (high half) with the preceding block (low half) and shifts
   right by 13 bytes, giving the last 3 bytes of the previous block followed
   by the first 13 bytes of the current one.  D() is a macro defined outside
   this chunk.  */
1476 movdqa (%rdi, %rdx), %xmm0
1477 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1478 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds compare directly against 16 bytes at (%rsi, %rdx).
   imm 0x1a = signed bytes, equal-each, negative polarity, least-significant
   index (the mode described in the comment at the top of the file).  */
1479 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative (the preprocessor line that
   separates the two variants is not visible in this listing): load the %rsi
   block and pass both operands through TOLOWER, a macro defined elsewhere
   in the file, before the same compare.  */
1481 movdqa (%rsi,%rdx), %xmm1
1482 TOLOWER (%xmm0, %xmm1)
1483 pcmpistri $0x1a, %xmm1, %xmm0
1486 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1488 jbe LABEL(strcmp_exitz)
/* Second copy of the page-crossing check and of the compare body.  */
1493 jg LABEL(nibble_ashr_13_use)
1495 movdqa (%rdi, %rdx), %xmm0
1496 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1497 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1498 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1500 movdqa (%rsi,%rdx), %xmm1
1501 TOLOWER (%xmm0, %xmm1)
1502 pcmpistri $0x1a, %xmm1, %xmm0
1505 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1507 jbe LABEL(strcmp_exitz)
/* Back to the top of the loop.  */
1510 jmp LABEL(loop_ashr_13_use)
/* Nibble path for the 13-byte shift case, reached from the `jg' checks in
   loop_ashr_13_use.  It looks only at the tail of the previous %rdi block
   before deciding whether the loop may load the next block.  */
1513 LABEL(nibble_ashr_13_use):
/* Reload the aligned 16-byte block that precedes (%rdi, %rdx).  */
1515 movdqa -16(%rdi, %rdx), %xmm0
/* Shift right by 13 bytes: the top 3 bytes move to the bottom of %xmm0 and
   the vacated bytes become zero.  D() is a macro defined outside this chunk.  */
1516 psrldq $13, D(%xmm0)
/* Compare %xmm0 with itself.  imm 0x3a = signed bytes, equal-each,
   masked-negative polarity, least-significant index, so %ecx receives the
   index of the first NUL byte in %xmm0.  */
1517 pcmpistri $0x3a,%xmm0, %xmm0
1518 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* NOTE(review): the compare feeding this `jae' is not visible in this
   listing; presumably it checks the remaining length counter -- confirm.  */
1520 jae LABEL(nibble_ashr_exit_use)
/* NOTE(review): the flags for `ja' come from an instruction not shown here,
   presumably a test of %ecx against the number of surviving bytes --
   confirm.  When taken, the loop resumes at its restart label.  */
1523 ja LABEL(nibble_ashr_13_restart_use)
/* Otherwise finish through the shared nibble exit.  */
1525 jmp LABEL(nibble_ashr_exit_use)
1528 * The following cases will be handled by ashr_14
1529 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1530 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
/* Entry sequence for the ashr_14 case.  Only part of it is visible in this
   listing; %xmm0, %xmm1 and %xmm2 are presumably prepared by code outside
   this view.  TOLOWER is a macro defined elsewhere in the file; presumably
   it lower-cases both registers in the case-insensitive builds -- confirm.  */
1535 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality: 0xff in each byte of %xmm2 that matches %xmm1, 0x00
   elsewhere.  D() is a macro defined outside this chunk.  */
1536 pcmpeqb %xmm1, D(%xmm2)
/* Subtract %xmm0 byte-wise from that mask.  NOTE(review): %xmm0 presumably
   carries an end-of-string mask computed earlier -- confirm.  */
1537 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into the low 16 bits of %r9d.  */
1538 pmovmskb %xmm2, %r9d
/* NOTE(review): the flag-setting instruction for this branch is not
   visible in this listing.  */
1542 jnz LABEL(less32bytes)
/* Aligned load of the 16 bytes at %rdi into %xmm3.  */
1543 movdqa (%rdi), %xmm3
/* Expands to the %r11 counter update (with a possible branch to
   strcmp_exitz) in the strncmp/strncasecmp builds and to nothing otherwise;
   see its definition near the top of the file.  */
1545 UPDATE_STRNCMP_COUNTER
1547 mov $16, %rcx /* index for loads */
1548 mov $14, %r9d /* byte position left over from less32bytes case */
1550 * Setup %r10 value allows us to detect crossing a page boundary.
1551 * When %r10 goes positive we have crossed a page boundary and
1552 * need to do a nibble.
1555 and $0xfff, %r10 /* offset into 4K page */
1556 sub $0x1000, %r10 /* subtract 4K pagesize */
1558 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the 14-byte shift case.  The compare body appears
   twice before the jump back to the top.  */
1561 LABEL(loop_ashr_14_use):
/* Divert to the nibble path on signed "greater"; per the setup comment,
   %r10 going positive means a page boundary was crossed.  NOTE(review): the
   instruction that sets these flags is not visible in this listing.  */
1563 jg LABEL(nibble_ashr_14_use)
/* Re-entry point used by nibble_ashr_14_use when it branches back.  */
1565 LABEL(nibble_ashr_14_restart_use):
/* Assemble 16 bytes of the %rdi string: palignr joins the block at
   (%rdi, %rdx) (high half) with the preceding block (low half) and shifts
   right by 14 bytes, giving the last 2 bytes of the previous block followed
   by the first 14 bytes of the current one.  D() is a macro defined outside
   this chunk.  */
1566 movdqa (%rdi, %rdx), %xmm0
1567 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1568 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds compare directly against 16 bytes at (%rsi, %rdx).
   imm 0x1a = signed bytes, equal-each, negative polarity, least-significant
   index (the mode described in the comment at the top of the file).  */
1569 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative (the preprocessor line that
   separates the two variants is not visible in this listing): load the %rsi
   block and pass both operands through TOLOWER, a macro defined elsewhere
   in the file, before the same compare.  */
1571 movdqa (%rsi,%rdx), %xmm1
1572 TOLOWER (%xmm0, %xmm1)
1573 pcmpistri $0x1a, %xmm1, %xmm0
1576 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1578 jbe LABEL(strcmp_exitz)
/* Second copy of the page-crossing check and of the compare body.  */
1583 jg LABEL(nibble_ashr_14_use)
1585 movdqa (%rdi, %rdx), %xmm0
1586 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1587 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1588 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1590 movdqa (%rsi,%rdx), %xmm1
1591 TOLOWER (%xmm0, %xmm1)
1592 pcmpistri $0x1a, %xmm1, %xmm0
1595 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1597 jbe LABEL(strcmp_exitz)
/* Back to the top of the loop.  */
1600 jmp LABEL(loop_ashr_14_use)
/* Nibble path for the 14-byte shift case, reached from the `jg' checks in
   loop_ashr_14_use.  It looks only at the tail of the previous %rdi block
   before deciding whether the loop may load the next block.  */
1603 LABEL(nibble_ashr_14_use):
/* Reload the aligned 16-byte block that precedes (%rdi, %rdx).  */
1605 movdqa -16(%rdi, %rdx), %xmm0
/* Shift right by 14 bytes: the top 2 bytes move to the bottom of %xmm0 and
   the vacated bytes become zero.  D() is a macro defined outside this chunk.  */
1606 psrldq $14, D(%xmm0)
/* Compare %xmm0 with itself.  imm 0x3a = signed bytes, equal-each,
   masked-negative polarity, least-significant index, so %ecx receives the
   index of the first NUL byte in %xmm0.  */
1607 pcmpistri $0x3a,%xmm0, %xmm0
1608 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* NOTE(review): the compare feeding this `jae' is not visible in this
   listing; presumably it checks the remaining length counter -- confirm.  */
1610 jae LABEL(nibble_ashr_exit_use)
/* NOTE(review): the flags for `ja' come from an instruction not shown here,
   presumably a test of %ecx against the number of surviving bytes --
   confirm.  When taken, the loop resumes at its restart label.  */
1613 ja LABEL(nibble_ashr_14_restart_use)
/* Otherwise finish through the shared nibble exit.  */
1615 jmp LABEL(nibble_ashr_exit_use)
1618 * The following cases will be handled by ashr_15
1619 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1620 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
/* Entry sequence for the ashr_15 case.  Only part of it is visible in this
   listing; %xmm0, %xmm1 and %xmm2 are presumably prepared by code outside
   this view.  TOLOWER is a macro defined elsewhere in the file; presumably
   it lower-cases both registers in the case-insensitive builds -- confirm.  */
1625 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality: 0xff in each byte of %xmm2 that matches %xmm1, 0x00
   elsewhere.  D() is a macro defined outside this chunk.  */
1626 pcmpeqb %xmm1, D(%xmm2)
/* Subtract %xmm0 byte-wise from that mask.  NOTE(review): %xmm0 presumably
   carries an end-of-string mask computed earlier -- confirm.  */
1627 psubb %xmm0, D(%xmm2)
/* Gather the top bit of every byte of %xmm2 into the low 16 bits of %r9d.  */
1628 pmovmskb %xmm2, %r9d
/* NOTE(review): the flag-setting instruction for this branch is not
   visible in this listing.  */
1632 jnz LABEL(less32bytes)
/* Aligned load of the 16 bytes at %rdi into %xmm3.  */
1634 movdqa (%rdi), %xmm3
/* Expands to the %r11 counter update (with a possible branch to
   strcmp_exitz) in the strncmp/strncasecmp builds and to nothing otherwise;
   see its definition near the top of the file.  */
1636 UPDATE_STRNCMP_COUNTER
1638 mov $16, %rcx /* index for loads */
1639 mov $15, %r9d /* byte position left over from less32bytes case */
1641 * Setup %r10 value allows us to detect crossing a page boundary.
1642 * When %r10 goes positive we have crossed a page boundary and
1643 * need to do a nibble.
1646 and $0xfff, %r10 /* offset into 4K page */
1648 sub $0x1000, %r10 /* subtract 4K pagesize */
1650 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the 15-byte shift case.  The compare body appears
   twice before the jump back to the top.  */
1653 LABEL(loop_ashr_15_use):
/* Divert to the nibble path on signed "greater"; per the setup comment,
   %r10 going positive means a page boundary was crossed.  NOTE(review): the
   instruction that sets these flags is not visible in this listing.  */
1655 jg LABEL(nibble_ashr_15_use)
/* Re-entry point used by nibble_ashr_15_use when it branches back.  */
1657 LABEL(nibble_ashr_15_restart_use):
/* Assemble 16 bytes of the %rdi string: palignr joins the block at
   (%rdi, %rdx) (high half) with the preceding block (low half) and shifts
   right by 15 bytes, giving the last byte of the previous block followed by
   the first 15 bytes of the current one.  D() is a macro defined outside
   this chunk.  */
1658 movdqa (%rdi, %rdx), %xmm0
1659 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1660 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds compare directly against 16 bytes at (%rsi, %rdx).
   imm 0x1a = signed bytes, equal-each, negative polarity, least-significant
   index (the mode described in the comment at the top of the file).  */
1661 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative (the preprocessor line that
   separates the two variants is not visible in this listing): load the %rsi
   block and pass both operands through TOLOWER, a macro defined elsewhere
   in the file, before the same compare.  */
1663 movdqa (%rsi,%rdx), %xmm1
1664 TOLOWER (%xmm0, %xmm1)
1665 pcmpistri $0x1a, %xmm1, %xmm0
1668 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1670 jbe LABEL(strcmp_exitz)
/* Second copy of the page-crossing check and of the compare body.  */
1675 jg LABEL(nibble_ashr_15_use)
1677 movdqa (%rdi, %rdx), %xmm0
1678 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1679 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1680 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1682 movdqa (%rsi,%rdx), %xmm1
1683 TOLOWER (%xmm0, %xmm1)
1684 pcmpistri $0x1a, %xmm1, %xmm0
1687 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1689 jbe LABEL(strcmp_exitz)
/* Back to the top of the loop.  */
1692 jmp LABEL(loop_ashr_15_use)
/* Nibble path for the 15-byte shift case, reached from the `jg' checks in
   loop_ashr_15_use.  It looks only at the tail of the previous %rdi block
   before deciding whether the loop may load the next block.  Unlike the
   other nibble paths it has no trailing jmp: when `ja' is not taken,
   execution continues into the label that follows it in the file.  */
1695 LABEL(nibble_ashr_15_use):
/* Reload the aligned 16-byte block that precedes (%rdi, %rdx).  */
1697 movdqa -16(%rdi, %rdx), %xmm0
/* Shift right by 15 bytes: the top byte moves to the bottom of %xmm0 and
   the vacated bytes become zero.  D() is a macro defined outside this chunk.  */
1698 psrldq $15, D(%xmm0)
/* Compare %xmm0 with itself.  imm 0x3a = signed bytes, equal-each,
   masked-negative polarity, least-significant index, so %ecx receives the
   index of the first NUL byte in %xmm0.  */
1699 pcmpistri $0x3a,%xmm0, %xmm0
1700 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* NOTE(review): the compare feeding this `jae' is not visible in this
   listing; presumably it checks the remaining length counter -- confirm.  */
1702 jae LABEL(nibble_ashr_exit_use)
/* NOTE(review): the flags for `ja' come from an instruction not shown here,
   presumably a test of %ecx against the number of surviving bytes --
   confirm.  When taken, the loop resumes at its restart label.  */
1705 ja LABEL(nibble_ashr_15_restart_use)
/* Shared exit for the nibble_ashr_N_use paths that do not branch back to
   their restart label.  It repeats the 16-byte compare using the shifted
   data already in %xmm0, then produces the result bytes.  */
1707 LABEL(nibble_ashr_exit_use):
1708 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
/* Case-sensitive builds: compare %xmm0 with the 16 bytes at (%rsi, %rdx).
   imm 0x1a = signed bytes, equal-each, negative polarity, least-significant
   index (the mode described in the comment at the top of the file).  */
1709 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* Presumably the case-insensitive alternative (the preprocessor line that
   separates the two variants is not visible in this listing); TOLOWER is a
   macro defined elsewhere in the file.  */
1711 movdqa (%rsi,%rdx), %xmm1
1712 TOLOWER (%xmm0, %xmm1)
1713 pcmpistri $0x1a, %xmm1, %xmm0
/* pcmpistri clears CF when its result mask is empty, i.e. no differing
   position was reported.  NOTE(review): lines between the compare and this
   branch (possibly another label) are not visible in this listing.  */
1717 jnc LABEL(strcmp_exitz)
1718 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1720 jbe LABEL(strcmp_exitz)
/* %rdi = %rdi + %r9 - 16, where %r9 is the per-case "byte position left
   over" value loaded in the ashr_N setup code (e.g. $12 for ashr_12).  */
1723 lea -16(%rdi, %r9), %rdi
/* Zero-extend one byte from each string into %eax and %edx.
   NOTE(review): any adjustment of %rdx before these loads is not visible
   in this listing.  */
1724 movzbl (%rdi, %rdx), %eax
1725 movzbl (%rsi, %rdx), %edx
1730 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace both bytes by their entries in the
   _nl_C_LC_CTYPE_tolower table.  Entries are 4 bytes wide (scale 4, movl)
   and the base is biased by 128 entries.  */
1731 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1732 movl (%rcx,%rdx,4), %edx
1733 movl (%rcx,%rax,4), %eax
/* NOTE(review): the label line(s) for this exit sequence are not visible in
   this listing; it is presumably the less32bytes path targeted by the
   ashr_N entry code -- confirm.  */
1740 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1741 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
/* NOTE(review): the test of %r8d that decides whether this swap runs is not
   visible in this listing.  */
1744 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1749 bsf %rdx, %rdx /* find and store bit index in %rdx */
1751 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds only.  NOTE(review): the instruction that sets the
   flags for this branch is not visible here; presumably it updates the %r11
   counter -- confirm.  */
1753 jbe LABEL(strcmp_exitz)
/* Zero-extend the byte at index %rdx of each string into %ecx and %eax.  */
1755 movzbl (%rsi, %rdx), %ecx
1756 movzbl (%rdi, %rdx), %eax
1758 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace both bytes by their entries in the
   _nl_C_LC_CTYPE_tolower table.  Entries are 4 bytes wide (scale 4, movl)
   and the base is biased by 128 entries.  %rdx is reused as the table
   pointer, so the index it held is overwritten here.  */
1759 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1760 movl (%rdx,%rcx,4), %ecx
1761 movl (%rdx,%rax,4), %eax
/* Exit label targeted by the count-exhausted and "no difference" branches
   throughout the routine.  NOTE(review): its instructions are not visible
   in this listing; from the name it is presumably the return-zero exit --
   confirm.  */
1767 LABEL(strcmp_exitz):
/* NOTE(review): the label and the byte loads of the sequence this XXX
   remark refers to are not visible in this listing.  */
1772 // XXX Same as code above
1777 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: replace the bytes in %ecx and %eax by their
   entries in the _nl_C_LC_CTYPE_tolower table (4-byte entries, base biased
   by 128 entries).  */
1778 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1779 movl (%rdx,%rcx,4), %ecx
1780 movl (%rdx,%rax,4), %eax
/* Record the symbol size of STRCMP_SSE42 as the distance from its start to
   this point.  */
1786 .size STRCMP_SSE42, .-STRCMP_SSE42
1793 /* Put all SSE 4.2 functions together. */
/* Read-only, allocatable ("a") data section; SECTION is a macro chosen
   near the top of the file (e.g. sse4.2).  */
1794 .section .rodata.SECTION,"a",@progbits
/* Table of 32-bit offsets to the ashr_N entry points.  Each entry is the
   target label minus the address of the table itself, not an absolute
   address.  Entries 0..14 refer to ashr_1..ashr_15 and entry 15 refers to
   ashr_0.  NOTE(review): the code that indexes this table is outside this
   chunk.  */
1796 LABEL(unaligned_table):
1797 .int LABEL(ashr_1) - LABEL(unaligned_table)
1798 .int LABEL(ashr_2) - LABEL(unaligned_table)
1799 .int LABEL(ashr_3) - LABEL(unaligned_table)
1800 .int LABEL(ashr_4) - LABEL(unaligned_table)
1801 .int LABEL(ashr_5) - LABEL(unaligned_table)
1802 .int LABEL(ashr_6) - LABEL(unaligned_table)
1803 .int LABEL(ashr_7) - LABEL(unaligned_table)
1804 .int LABEL(ashr_8) - LABEL(unaligned_table)
1805 .int LABEL(ashr_9) - LABEL(unaligned_table)
1806 .int LABEL(ashr_10) - LABEL(unaligned_table)
1807 .int LABEL(ashr_11) - LABEL(unaligned_table)
1808 .int LABEL(ashr_12) - LABEL(unaligned_table)
1809 .int LABEL(ashr_13) - LABEL(unaligned_table)
1810 .int LABEL(ashr_14) - LABEL(unaligned_table)
1811 .int LABEL(ashr_15) - LABEL(unaligned_table)
1812 .int LABEL(ashr_0) - LABEL(unaligned_table)