]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/multiarch/strcmp-sse42.S
Add optimized str{,n}casecmp for AVX on x86-64
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strcmp-sse42.S
CommitLineData
d9a4d2ab
UD
1/* strcmp with SSE4.2
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21
22/* We use 0x1a:
23 _SIDD_SBYTE_OPS
24 | _SIDD_CMP_EQUAL_EACH
25 | _SIDD_NEGATIVE_POLARITY
26 | _SIDD_LEAST_SIGNIFICANT
27 on pcmpistri to find out if two 16byte data elements are the same
28 and the offset of the first different byte. There are 4 cases:
29
30 1. Both 16-byte data elements are valid and identical.
31 2. Both 16-byte data elements contain EOS and are identical.
32 3. Both 16-byte data elements are valid and they differ at offset X.
33 4. At least one 16-byte data element has EOS at offset X. The two
34 16-byte data elements must differ at or before offset X.
35
36 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
37
38 case ECX CFlag ZFlag SFlag
39 1 16 0 0 0
40 2 16 0 1 1
41 3 X 1 0 0
42 4 0 <= X 1 0/1 0/1
43
44 We exit from the loop for cases 2, 3 and 4 with jbe which branches
45 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
46 case 2. */
47
48 /* Put all SSE 4.2 functions together. */
49 .section .text.SECTION,"ax",@progbits
50 .align 16
51 .type STRCMP_SSE42, @function
52#ifdef USE_AS_STRCASECMP_L
/* Entry stub: load the value stored in the TLS slot __libc_tsd_LOCALE
   into %rdx, then run straight on into the code that follows.  */
53ENTRY (GLABEL(__strcasecmp))
54 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = TLS offset of __libc_tsd_LOCALE */
55 movq %fs:(%rax),%rdx /* rdx = slot contents; read below as the locale pointer */
56
57 // XXX 5 byte should be before the function
58 /* 5-byte NOP. */
59 .byte 0x0f,0x1f,0x44,0x00,0x00
60END (GLABEL(__strcasecmp))
61 /* FALLTHROUGH to strcasecmp_l. */
62#endif
63#ifdef USE_AS_STRNCASECMP_L
/* Same stub for the length-limited variant; here the slot contents go
   to %rcx because %rdx is tested as the byte count further down.  */
64ENTRY (GLABEL(__strncasecmp))
65 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = TLS offset of __libc_tsd_LOCALE */
66 movq %fs:(%rax),%rcx /* rcx = slot contents; read below as the locale pointer */
67
68 // XXX 5 byte should be before the function
69 /* 5-byte NOP. */
70 .byte 0x0f,0x1f,0x44,0x00,0x00
71END (GLABEL(__strncasecmp))
72 /* FALLTHROUGH to strncasecmp_l. */
73#endif
74
75STRCMP_SSE42:
76 cfi_startproc
77 CALL_MCOUNT
78
79/*
80 * This implementation uses SSE to compare up to 16 bytes at a time.
81 */
82#ifdef USE_AS_STRCASECMP_L
83 /* We have to fall back on the C implementation for locales
84 with encodings not matching ASCII for single bytes. */
85# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
86 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax /* rax = LC_CTYPE data of the locale in rdx */
87# else
88 movq (%rdx), %rax /* same load when both offsets are zero */
89# endif
 /* Test bit 0 of the _NL_CTYPE_NONASCII_CASE value. The mask must be
 non-zero: with a mask of 0 testl always sets ZF and the jne below
 could never be taken, so the fallback would be unreachable. */
90 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
91 jne __strcasecmp_l_nonascii /* bit set: use the C implementation */
92#endif
93#ifdef USE_AS_STRNCASECMP_L
94 /* We have to fall back on the C implementation for locales
95 with encodings not matching ASCII for single bytes. */
96# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
97 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax /* rax = LC_CTYPE data of the locale in rcx */
98# else
99 movq (%rcx), %rax /* same load when both offsets are zero */
100# endif
 /* Same bit-0 test as in the strcasecmp_l variant; a zero mask would
 make the fallback branch dead. */
101 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
102 jne __strncasecmp_l_nonascii /* bit set: use the C implementation */
103#endif
104
105#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
106 test %rdx, %rdx /* count == 0? */
107 je LABEL(strcmp_exitz)
108 cmp $1, %rdx /* count == 1 is handled at Byte0 */
109 je LABEL(Byte0)
110 mov %rdx, %r11 /* r11 = count, decremented as blocks are consumed */
111#endif
112 mov %esi, %ecx
113 mov %edi, %eax
114/* Use 64bit AND here to avoid long NOP padding. */
115 and $0x3f, %rcx /* rsi alignment in cache line */
116 and $0x3f, %rax /* rdi alignment in cache line */
117#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* 16-byte constants used by TOLOWER, kept in xmm4-xmm6 from here on. */
118 .section .rodata.cst16,"aM",@progbits,16
119 .align 16
120LABEL(belowupper):
121 .quad 0x4040404040404040 /* 0x40 = 'A' - 1 in every byte */
122 .quad 0x4040404040404040
123LABEL(topupper):
124# ifdef USE_AVX
125 .quad 0x5a5a5a5a5a5a5a5a /* 0x5a = 'Z'; AVX TOLOWER tests reg > 'Z' */
126 .quad 0x5a5a5a5a5a5a5a5a
127# else
128 .quad 0x5b5b5b5b5b5b5b5b /* 0x5b = 'Z' + 1; SSE TOLOWER tests 'Z' + 1 > reg */
129 .quad 0x5b5b5b5b5b5b5b5b
130# endif
131LABEL(touppermask):
132 .quad 0x2020202020202020 /* 0x20 is OR-ed into bytes found to be 'A'..'Z' */
133 .quad 0x2020202020202020
134 .previous
135 movdqa LABEL(belowupper)(%rip), %xmm4
136# define UCLOW_reg %xmm4
137 movdqa LABEL(topupper)(%rip), %xmm5
138# define UCHIGH_reg %xmm5
139 movdqa LABEL(touppermask)(%rip), %xmm6
140# define LCQWORD_reg %xmm6
141#endif
142 cmp $0x30, %ecx
143 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
144 cmp $0x30, %eax
145 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
146 movdqu (%rdi), %xmm1 /* unaligned loads of the first 16 bytes of each string */
147 movdqu (%rsi), %xmm2
148#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
149# ifdef USE_AVX
 /* TOLOWER (reg1, reg2): OR 0x20 into every byte of reg1 and reg2 that
 is greater than UCLOW_reg and not greater than UCHIGH_reg.
 Clobbers xmm7-xmm10. */
150# define TOLOWER(reg1, reg2) \
151 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
152 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
153 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
154 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
155 vpandn %xmm7, %xmm8, %xmm8; \
156 vpandn %xmm9, %xmm10, %xmm10; \
157 vpand LCQWORD_reg, %xmm8, %xmm8; \
158 vpand LCQWORD_reg, %xmm10, %xmm10; \
159 vpor reg1, %xmm8, reg1; \
160 vpor reg2, %xmm10, reg2
161# else
 /* TOLOWER (reg1, reg2): OR 0x20 into every byte of reg1 and reg2 that
 is greater than UCLOW_reg and less than UCHIGH_reg.
 Clobbers xmm7-xmm10. */
162# define TOLOWER(reg1, reg2) \
163 movdqa reg1, %xmm7; \
164 movdqa UCHIGH_reg, %xmm8; \
165 movdqa reg2, %xmm9; \
166 movdqa UCHIGH_reg, %xmm10; \
167 pcmpgtb UCLOW_reg, %xmm7; \
168 pcmpgtb reg1, %xmm8; \
169 pcmpgtb UCLOW_reg, %xmm9; \
170 pcmpgtb reg2, %xmm10; \
171 pand %xmm8, %xmm7; \
172 pand %xmm10, %xmm9; \
173 pand LCQWORD_reg, %xmm7; \
174 pand LCQWORD_reg, %xmm9; \
175 por %xmm7, reg1; \
176 por %xmm9, reg2
177# endif
178 TOLOWER (%xmm1, %xmm2)
179#else
 /* Case-sensitive builds: TOLOWER expands to nothing. */
180# define TOLOWER(reg1, reg2)
181#endif
182 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
183 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
184 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
185 psubb %xmm0, %xmm1 /* packed sub of comparison results */
186 pmovmskb %xmm1, %edx /* one mask bit per byte */
187 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
188 jnz LABEL(less16bytes)/* If not, find different value or null char */
189#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
190 sub $16, %r11 /* 16 bytes of the count consumed */
191 jbe LABEL(strcmp_exitz)/* finish comparison */
192#endif
193 add $16, %rsi /* prepare to search next 16 bytes */
194 add $16, %rdi /* prepare to search next 16 bytes */
195
196 /*
197 * Determine source and destination string offsets from 16-byte
198 * alignment. Use relative offset difference between the two to
199 * determine which case below to use.
200 */
201 .p2align 4
202LABEL(crosscache):
203 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
204 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
205 mov $0xffff, %edx /* for equivalent offset */
206 xor %r8d, %r8d /* r8d = 0: pointers not swapped */
207 and $0xf, %ecx /* offset of rsi */
208 and $0xf, %eax /* offset of rdi */
209 cmp %eax, %ecx
210 je LABEL(ashr_0) /* rsi and rdi relative offset same */
211 ja LABEL(bigger)
 /* ecx < eax: swap offsets and pointers so that ecx holds the larger
 offset, and record the swap in r8d. */
212 mov %edx, %r8d /* r8d is offset flag for exit tail */
213 xchg %ecx, %eax
214 xchg %rsi, %rdi
215LABEL(bigger):
216 lea 15(%rax), %r9
217 sub %rcx, %r9 /* r9 = 15 + rax - rcx, index into unaligned_table */
218 lea LABEL(unaligned_table)(%rip), %r10
219 movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit entry, relative to the table base */
220 lea (%r10, %r9), %r10 /* r10 = table base + entry */
221 jmp *%r10 /* jump to corresponding case */
222
223/*
224 * The following cases will be handled by ashr_0
225 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
226 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
227 */
228 .p2align 4
229LABEL(ashr_0):
230
231 movdqa (%rsi), %xmm1
232 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
233 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
234#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
235 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
236#else
237 movdqa (%rdi), %xmm2
238 TOLOWER (%xmm1, %xmm2)
239 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
240#endif
241 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
242 pmovmskb %xmm1, %r9d
243 shr %cl, %edx /* adjust 0xffff for offset */
244 shr %cl, %r9d /* adjust for 16-byte offset */
245 sub %r9d, %edx
246 /*
247 * edx must be the same with r9d if in left byte (16-rcx) is equal to
248 * the start from (16-rax) and no null char was seen.
249 */
250 jne LABEL(less32bytes) /* mismatch or null char */
251 UPDATE_STRNCMP_COUNTER
252 mov $16, %rcx
253 mov $16, %r9
254 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
255
256 /*
257 * Now both strings are aligned at 16-byte boundary. Loop over strings
258 * checking 32-bytes per iteration.
259 */
260 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
261 .p2align 4
262LABEL(ashr_0_use):
263 movdqa (%rdi,%rdx), %xmm0
264#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
265 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
266#else
267 movdqa (%rsi,%rdx), %xmm1
268 TOLOWER (%xmm0, %xmm1)
269 pcmpistri $0x1a, %xmm1, %xmm0
270#endif
271 lea 16(%rdx), %rdx /* advance offset; lea keeps the pcmpistri flags */
272 jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: mismatch or EOS */
273#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
274 sub $16, %r11 /* 16 more bytes of the count consumed */
275 jbe LABEL(strcmp_exitz)
276#endif
277
 /* Second, unrolled copy of the step above. */
278 movdqa (%rdi,%rdx), %xmm0
279#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
280 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
281#else
282 movdqa (%rsi,%rdx), %xmm1
283 TOLOWER (%xmm0, %xmm1)
284 pcmpistri $0x1a, %xmm1, %xmm0
285#endif
286 lea 16(%rdx), %rdx
287 jbe LABEL(ashr_0_exit_use)
288#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
289 sub $16, %r11
290 jbe LABEL(strcmp_exitz)
291#endif
292 jmp LABEL(ashr_0_use)
293
294
295 .p2align 4
296LABEL(ashr_0_exit_use):
297 jnc LABEL(strcmp_exitz) /* CFlag == 0: case 2, both ended identically */
298#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
299 sub %rcx, %r11 /* rcx = offset reported by pcmpistri */
300 jbe LABEL(strcmp_exitz) /* that offset is at or past the remaining count */
301#endif
302 lea -16(%rdx, %rcx), %rcx /* rdx was already advanced by 16: undo and add the offset */
303 movzbl (%rdi, %rcx), %eax
304 movzbl (%rsi, %rcx), %edx
305#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
306 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
307 movl (%rcx,%rax,4), %eax /* replace each byte by its 4-byte table entry */
308 movl (%rcx,%rdx,4), %edx
309#endif
310 sub %edx, %eax /* return value = difference of the two bytes */
311 ret
312
313
314
315/*
316 * The following cases will be handled by ashr_1
317 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
318 * n(15) n -15 0(15 +(n-15) - n) ashr_1
319 */
320 .p2align 4
321LABEL(ashr_1):
322 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
323 movdqa (%rdi), %xmm2
324 movdqa (%rsi), %xmm1
325 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
326 pslldq $15, %xmm2 /* shift first string to align with second */
327 TOLOWER (%xmm1, %xmm2)
328 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
329 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
330 pmovmskb %xmm2, %r9d
331 shr %cl, %edx /* adjust 0xffff for offset */
332 shr %cl, %r9d /* adjust for 16-byte offset */
333 sub %r9d, %edx
334 jnz LABEL(less32bytes) /* mismatch or null char seen */
335 movdqa (%rdi), %xmm3
336 UPDATE_STRNCMP_COUNTER
337
338 pxor %xmm0, %xmm0
339 mov $16, %rcx /* index for loads*/
340 mov $1, %r9d /* byte position left over from less32bytes case */
341 /*
342 * Setup %r10 value allows us to detect crossing a page boundary.
343 * When %r10 goes positive we have crossed a page boundary and
344 * need to do a nibble.
345 */
346 lea 1(%rdi), %r10
347 and $0xfff, %r10 /* offset into 4K page */
348 sub $0x1000, %r10 /* subtract 4K pagesize */
349 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
350
351 .p2align 4
352LABEL(loop_ashr_1_use):
353 add $16, %r10
354 jg LABEL(nibble_ashr_1_use) /* r10 > 0: page boundary crossed, take the nibble path */
355
356LABEL(nibble_ashr_1_restart_use):
357 movdqa (%rdi, %rdx), %xmm0
358 palignr $1, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 15 bytes of previous block + first byte of this one */
359#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
360 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
361#else
362 movdqa (%rsi,%rdx), %xmm1
363 TOLOWER (%xmm0, %xmm1)
364 pcmpistri $0x1a, %xmm1, %xmm0
365#endif
366 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
367#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
368 sub $16, %r11 /* 16 more bytes of the count consumed */
369 jbe LABEL(strcmp_exitz)
370#endif
371
372 add $16, %rdx
373 add $16, %r10
374 jg LABEL(nibble_ashr_1_use) /* same page check for the second, unrolled step */
375
376 movdqa (%rdi, %rdx), %xmm0
377 palignr $1, -16(%rdi, %rdx), %xmm0
378#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
379 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
380#else
381 movdqa (%rsi,%rdx), %xmm1
382 TOLOWER (%xmm0, %xmm1)
383 pcmpistri $0x1a, %xmm1, %xmm0
384#endif
385 jbe LABEL(exit_use)
386#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
387 sub $16, %r11
388 jbe LABEL(strcmp_exitz)
389#endif
390 add $16, %rdx
391 jmp LABEL(loop_ashr_1_use)
392
393 .p2align 4
394LABEL(nibble_ashr_1_use):
395 sub $0x1000, %r10 /* rearm the page-crossing counter */
396 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
397 psrldq $1, %xmm0 /* drop the 1 byte already compared */
398 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
399#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
400 cmp %r11, %rcx
401 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
402#endif
403 cmp $14, %ecx
404 ja LABEL(nibble_ashr_1_restart_use) /* no null char in the 15 leftover bytes: keep going */
405
406 jmp LABEL(nibble_ashr_exit_use)
407
408/*
409 * The following cases will be handled by ashr_2
410 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
411 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
412 */
413 .p2align 4
414LABEL(ashr_2):
415 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
416 movdqa (%rdi), %xmm2
417 movdqa (%rsi), %xmm1
418 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
419 pslldq $14, %xmm2 /* shift first string to align with second */
420 TOLOWER (%xmm1, %xmm2)
421 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
422 psubb %xmm0, %xmm2 /* packed sub of comparison results */
423 pmovmskb %xmm2, %r9d
424 shr %cl, %edx /* adjust 0xffff for offset */
425 shr %cl, %r9d /* adjust for 16-byte offset */
426 sub %r9d, %edx
427 jnz LABEL(less32bytes) /* mismatch or null char seen */
428 movdqa (%rdi), %xmm3
429 UPDATE_STRNCMP_COUNTER
430
431 pxor %xmm0, %xmm0
432 mov $16, %rcx /* index for loads */
433 mov $2, %r9d /* byte position left over from less32bytes case */
434 /*
435 * Setup %r10 value allows us to detect crossing a page boundary.
436 * When %r10 goes positive we have crossed a page boundary and
437 * need to do a nibble.
438 */
439 lea 2(%rdi), %r10
440 and $0xfff, %r10 /* offset into 4K page */
441 sub $0x1000, %r10 /* subtract 4K pagesize */
442 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
443
444 .p2align 4
445LABEL(loop_ashr_2_use):
446 add $16, %r10
447 jg LABEL(nibble_ashr_2_use) /* r10 > 0: page boundary crossed, take the nibble path */
448
449LABEL(nibble_ashr_2_restart_use):
450 movdqa (%rdi, %rdx), %xmm0
451 palignr $2, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 14 bytes of previous block + first 2 of this one */
452#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
453 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
454#else
455 movdqa (%rsi,%rdx), %xmm1
456 TOLOWER (%xmm0, %xmm1)
457 pcmpistri $0x1a, %xmm1, %xmm0
458#endif
459 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
460#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
461 sub $16, %r11 /* 16 more bytes of the count consumed */
462 jbe LABEL(strcmp_exitz)
463#endif
464
465 add $16, %rdx
466 add $16, %r10
467 jg LABEL(nibble_ashr_2_use) /* same page check for the second, unrolled step */
468
469 movdqa (%rdi, %rdx), %xmm0
470 palignr $2, -16(%rdi, %rdx), %xmm0
471#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
472 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
473#else
474 movdqa (%rsi,%rdx), %xmm1
475 TOLOWER (%xmm0, %xmm1)
476 pcmpistri $0x1a, %xmm1, %xmm0
477#endif
478 jbe LABEL(exit_use)
479#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
480 sub $16, %r11
481 jbe LABEL(strcmp_exitz)
482#endif
483 add $16, %rdx
484 jmp LABEL(loop_ashr_2_use)
485
486 .p2align 4
487LABEL(nibble_ashr_2_use):
488 sub $0x1000, %r10 /* rearm the page-crossing counter */
489 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
490 psrldq $2, %xmm0 /* drop the 2 bytes already compared */
491 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
492#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
493 cmp %r11, %rcx
494 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
495#endif
496 cmp $13, %ecx
497 ja LABEL(nibble_ashr_2_restart_use) /* no null char in the 14 leftover bytes: keep going */
498
499 jmp LABEL(nibble_ashr_exit_use)
500
501/*
502 * The following cases will be handled by ashr_3
503 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
504 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
505 */
506 .p2align 4
507LABEL(ashr_3):
508 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
509 movdqa (%rdi), %xmm2
510 movdqa (%rsi), %xmm1
511 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
512 pslldq $13, %xmm2 /* shift first string to align with second */
513 TOLOWER (%xmm1, %xmm2)
514 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
515 psubb %xmm0, %xmm2 /* packed sub of comparison results */
516 pmovmskb %xmm2, %r9d
517 shr %cl, %edx /* adjust 0xffff for offset */
518 shr %cl, %r9d /* adjust for 16-byte offset */
519 sub %r9d, %edx
520 jnz LABEL(less32bytes) /* mismatch or null char seen */
521 movdqa (%rdi), %xmm3
522
523 UPDATE_STRNCMP_COUNTER
524
525 pxor %xmm0, %xmm0
526 mov $16, %rcx /* index for loads */
527 mov $3, %r9d /* byte position left over from less32bytes case */
528 /*
529 * Setup %r10 value allows us to detect crossing a page boundary.
530 * When %r10 goes positive we have crossed a page boundary and
531 * need to do a nibble.
532 */
533 lea 3(%rdi), %r10
534 and $0xfff, %r10 /* offset into 4K page */
535 sub $0x1000, %r10 /* subtract 4K pagesize */
536 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
537
538LABEL(loop_ashr_3_use):
539 add $16, %r10
540 jg LABEL(nibble_ashr_3_use) /* r10 > 0: page boundary crossed, take the nibble path */
541
542LABEL(nibble_ashr_3_restart_use):
543 movdqa (%rdi, %rdx), %xmm0
544 palignr $3, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 13 bytes of previous block + first 3 of this one */
545#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
546 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
547#else
548 movdqa (%rsi,%rdx), %xmm1
549 TOLOWER (%xmm0, %xmm1)
550 pcmpistri $0x1a, %xmm1, %xmm0
551#endif
552 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
553#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
554 sub $16, %r11 /* 16 more bytes of the count consumed */
555 jbe LABEL(strcmp_exitz)
556#endif
557
558 add $16, %rdx
559 add $16, %r10
560 jg LABEL(nibble_ashr_3_use) /* same page check for the second, unrolled step */
561
562 movdqa (%rdi, %rdx), %xmm0
563 palignr $3, -16(%rdi, %rdx), %xmm0
564#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
565 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
566#else
567 movdqa (%rsi,%rdx), %xmm1
568 TOLOWER (%xmm0, %xmm1)
569 pcmpistri $0x1a, %xmm1, %xmm0
570#endif
571 jbe LABEL(exit_use)
572#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
573 sub $16, %r11
574 jbe LABEL(strcmp_exitz)
575#endif
576 add $16, %rdx
577 jmp LABEL(loop_ashr_3_use)
578
579 .p2align 4
580LABEL(nibble_ashr_3_use):
581 sub $0x1000, %r10 /* rearm the page-crossing counter */
582 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
583 psrldq $3, %xmm0 /* drop the 3 bytes already compared */
584 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
585#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
586 cmp %r11, %rcx
587 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
588#endif
589 cmp $12, %ecx
590 ja LABEL(nibble_ashr_3_restart_use) /* no null char in the 13 leftover bytes: keep going */
591
592 jmp LABEL(nibble_ashr_exit_use)
593
594/*
595 * The following cases will be handled by ashr_4
596 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
597 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
598 */
599 .p2align 4
600LABEL(ashr_4):
601 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
602 movdqa (%rdi), %xmm2
603 movdqa (%rsi), %xmm1
604 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
605 pslldq $12, %xmm2 /* shift first string to align with second */
606 TOLOWER (%xmm1, %xmm2)
607 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
608 psubb %xmm0, %xmm2 /* packed sub of comparison results */
609 pmovmskb %xmm2, %r9d
610 shr %cl, %edx /* adjust 0xffff for offset */
611 shr %cl, %r9d /* adjust for 16-byte offset */
612 sub %r9d, %edx
613 jnz LABEL(less32bytes) /* mismatch or null char seen */
614 movdqa (%rdi), %xmm3
615
616 UPDATE_STRNCMP_COUNTER
617
618 pxor %xmm0, %xmm0
619 mov $16, %rcx /* index for loads */
620 mov $4, %r9d /* byte position left over from less32bytes case */
621 /*
622 * Setup %r10 value allows us to detect crossing a page boundary.
623 * When %r10 goes positive we have crossed a page boundary and
624 * need to do a nibble.
625 */
626 lea 4(%rdi), %r10
627 and $0xfff, %r10 /* offset into 4K page */
628 sub $0x1000, %r10 /* subtract 4K pagesize */
629 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
630
631 .p2align 4
632LABEL(loop_ashr_4_use):
633 add $16, %r10
634 jg LABEL(nibble_ashr_4_use) /* r10 > 0: page boundary crossed, take the nibble path */
635
636LABEL(nibble_ashr_4_restart_use):
637 movdqa (%rdi, %rdx), %xmm0
638 palignr $4, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 12 bytes of previous block + first 4 of this one */
639#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
640 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
641#else
642 movdqa (%rsi,%rdx), %xmm1
643 TOLOWER (%xmm0, %xmm1)
644 pcmpistri $0x1a, %xmm1, %xmm0
645#endif
646 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
647#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
648 sub $16, %r11 /* 16 more bytes of the count consumed */
649 jbe LABEL(strcmp_exitz)
650#endif
651
652 add $16, %rdx
653 add $16, %r10
654 jg LABEL(nibble_ashr_4_use) /* same page check for the second, unrolled step */
655
656 movdqa (%rdi, %rdx), %xmm0
657 palignr $4, -16(%rdi, %rdx), %xmm0
658#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
659 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
660#else
661 movdqa (%rsi,%rdx), %xmm1
662 TOLOWER (%xmm0, %xmm1)
663 pcmpistri $0x1a, %xmm1, %xmm0
664#endif
665 jbe LABEL(exit_use)
666#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
667 sub $16, %r11
668 jbe LABEL(strcmp_exitz)
669#endif
670 add $16, %rdx
671 jmp LABEL(loop_ashr_4_use)
672
673 .p2align 4
674LABEL(nibble_ashr_4_use):
675 sub $0x1000, %r10 /* rearm the page-crossing counter */
676 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
677 psrldq $4, %xmm0 /* drop the 4 bytes already compared */
678 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
679#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
680 cmp %r11, %rcx
681 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
682#endif
683 cmp $11, %ecx
684 ja LABEL(nibble_ashr_4_restart_use) /* no null char in the 12 leftover bytes: keep going */
685
686 jmp LABEL(nibble_ashr_exit_use)
687
688/*
689 * The following cases will be handled by ashr_5
690 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
691 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
692 */
693 .p2align 4
694LABEL(ashr_5):
695 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
696 movdqa (%rdi), %xmm2
697 movdqa (%rsi), %xmm1
698 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
699 pslldq $11, %xmm2 /* shift first string to align with second */
700 TOLOWER (%xmm1, %xmm2)
701 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
702 psubb %xmm0, %xmm2 /* packed sub of comparison results */
703 pmovmskb %xmm2, %r9d
704 shr %cl, %edx /* adjust 0xffff for offset */
705 shr %cl, %r9d /* adjust for 16-byte offset */
706 sub %r9d, %edx
707 jnz LABEL(less32bytes) /* mismatch or null char seen */
708 movdqa (%rdi), %xmm3
709
710 UPDATE_STRNCMP_COUNTER
711
712 pxor %xmm0, %xmm0
713 mov $16, %rcx /* index for loads */
714 mov $5, %r9d /* byte position left over from less32bytes case */
715 /*
716 * Setup %r10 value allows us to detect crossing a page boundary.
717 * When %r10 goes positive we have crossed a page boundary and
718 * need to do a nibble.
719 */
720 lea 5(%rdi), %r10
721 and $0xfff, %r10 /* offset into 4K page */
722 sub $0x1000, %r10 /* subtract 4K pagesize */
723 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
724
725 .p2align 4
726LABEL(loop_ashr_5_use):
727 add $16, %r10
728 jg LABEL(nibble_ashr_5_use) /* r10 > 0: page boundary crossed, take the nibble path */
729
730LABEL(nibble_ashr_5_restart_use):
731 movdqa (%rdi, %rdx), %xmm0
732 palignr $5, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 11 bytes of previous block + first 5 of this one */
733#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
734 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
735#else
736 movdqa (%rsi,%rdx), %xmm1
737 TOLOWER (%xmm0, %xmm1)
738 pcmpistri $0x1a, %xmm1, %xmm0
739#endif
740 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
741#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
742 sub $16, %r11 /* 16 more bytes of the count consumed */
743 jbe LABEL(strcmp_exitz)
744#endif
745
746 add $16, %rdx
747 add $16, %r10
748 jg LABEL(nibble_ashr_5_use) /* same page check for the second, unrolled step */
749
750 movdqa (%rdi, %rdx), %xmm0
751
752 palignr $5, -16(%rdi, %rdx), %xmm0
753#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
754 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
755#else
756 movdqa (%rsi,%rdx), %xmm1
757 TOLOWER (%xmm0, %xmm1)
758 pcmpistri $0x1a, %xmm1, %xmm0
759#endif
760 jbe LABEL(exit_use)
761#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
762 sub $16, %r11
763 jbe LABEL(strcmp_exitz)
764#endif
765 add $16, %rdx
766 jmp LABEL(loop_ashr_5_use)
767
768 .p2align 4
769LABEL(nibble_ashr_5_use):
770 sub $0x1000, %r10 /* rearm the page-crossing counter */
771 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
772 psrldq $5, %xmm0 /* drop the 5 bytes already compared */
773 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
774#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
775 cmp %r11, %rcx
776 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
777#endif
778 cmp $10, %ecx
779 ja LABEL(nibble_ashr_5_restart_use) /* no null char in the 11 leftover bytes: keep going */
780
781 jmp LABEL(nibble_ashr_exit_use)
782
783/*
784 * The following cases will be handled by ashr_6
785 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
786 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
787 */
788 .p2align 4
789LABEL(ashr_6):
790 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
791 movdqa (%rdi), %xmm2
792 movdqa (%rsi), %xmm1
793 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
794 pslldq $10, %xmm2 /* shift first string to align with second */
795 TOLOWER (%xmm1, %xmm2)
796 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
797 psubb %xmm0, %xmm2 /* packed sub of comparison results */
798 pmovmskb %xmm2, %r9d
799 shr %cl, %edx /* adjust 0xffff for offset */
800 shr %cl, %r9d /* adjust for 16-byte offset */
801 sub %r9d, %edx
802 jnz LABEL(less32bytes) /* mismatch or null char seen */
803 movdqa (%rdi), %xmm3
804
805 UPDATE_STRNCMP_COUNTER
806
807 pxor %xmm0, %xmm0
808 mov $16, %rcx /* index for loads */
809 mov $6, %r9d /* byte position left over from less32bytes case */
810 /*
811 * Setup %r10 value allows us to detect crossing a page boundary.
812 * When %r10 goes positive we have crossed a page boundary and
813 * need to do a nibble.
814 */
815 lea 6(%rdi), %r10
816 and $0xfff, %r10 /* offset into 4K page */
817 sub $0x1000, %r10 /* subtract 4K pagesize */
818 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
819
820 .p2align 4
821LABEL(loop_ashr_6_use):
822 add $16, %r10
823 jg LABEL(nibble_ashr_6_use) /* r10 > 0: page boundary crossed, take the nibble path */
824
825LABEL(nibble_ashr_6_restart_use):
826 movdqa (%rdi, %rdx), %xmm0
827 palignr $6, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 10 bytes of previous block + first 6 of this one */
828#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
829 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
830#else
831 movdqa (%rsi,%rdx), %xmm1
832 TOLOWER (%xmm0, %xmm1)
833 pcmpistri $0x1a, %xmm1, %xmm0
834#endif
835 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
836#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
837 sub $16, %r11 /* 16 more bytes of the count consumed */
838 jbe LABEL(strcmp_exitz)
839#endif
840
841 add $16, %rdx
842 add $16, %r10
843 jg LABEL(nibble_ashr_6_use) /* same page check for the second, unrolled step */
844
845 movdqa (%rdi, %rdx), %xmm0
846 palignr $6, -16(%rdi, %rdx), %xmm0
847#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
848 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
849#else
850 movdqa (%rsi,%rdx), %xmm1
851 TOLOWER (%xmm0, %xmm1)
852 pcmpistri $0x1a, %xmm1, %xmm0
853#endif
854 jbe LABEL(exit_use)
855#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
856 sub $16, %r11
857 jbe LABEL(strcmp_exitz)
858#endif
859 add $16, %rdx
860 jmp LABEL(loop_ashr_6_use)
861
862 .p2align 4
863LABEL(nibble_ashr_6_use):
864 sub $0x1000, %r10 /* rearm the page-crossing counter */
865 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
866 psrldq $6, %xmm0 /* drop the 6 bytes already compared */
867 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
868#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
869 cmp %r11, %rcx
870 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
871#endif
872 cmp $9, %ecx
873 ja LABEL(nibble_ashr_6_restart_use) /* no null char in the 10 leftover bytes: keep going */
874
875 jmp LABEL(nibble_ashr_exit_use)
876
877/*
878 * The following cases will be handled by ashr_7
879 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
880 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
881 */
882 .p2align 4
883LABEL(ashr_7):
884 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
885 movdqa (%rdi), %xmm2
886 movdqa (%rsi), %xmm1
887 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
888 pslldq $9, %xmm2 /* shift first string to align with second */
889 TOLOWER (%xmm1, %xmm2)
890 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
891 psubb %xmm0, %xmm2 /* packed sub of comparison results */
892 pmovmskb %xmm2, %r9d
893 shr %cl, %edx /* adjust 0xffff for offset */
894 shr %cl, %r9d /* adjust for 16-byte offset */
895 sub %r9d, %edx
896 jnz LABEL(less32bytes) /* mismatch or null char seen */
897 movdqa (%rdi), %xmm3
898
899 UPDATE_STRNCMP_COUNTER
900
901 pxor %xmm0, %xmm0
902 mov $16, %rcx /* index for loads */
903 mov $7, %r9d /* byte position left over from less32bytes case */
904 /*
905 * Setup %r10 value allows us to detect crossing a page boundary.
906 * When %r10 goes positive we have crossed a page boundary and
907 * need to do a nibble.
908 */
909 lea 7(%rdi), %r10
910 and $0xfff, %r10 /* offset into 4K page */
911 sub $0x1000, %r10 /* subtract 4K pagesize */
912 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
913
914 .p2align 4
915LABEL(loop_ashr_7_use):
916 add $16, %r10
917 jg LABEL(nibble_ashr_7_use) /* r10 > 0: page boundary crossed, take the nibble path */
918
919LABEL(nibble_ashr_7_restart_use):
920 movdqa (%rdi, %rdx), %xmm0
921 palignr $7, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 9 bytes of previous block + first 7 of this one */
922#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
923 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
924#else
925 movdqa (%rsi,%rdx), %xmm1
926 TOLOWER (%xmm0, %xmm1)
927 pcmpistri $0x1a, %xmm1, %xmm0
928#endif
929 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
930#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
931 sub $16, %r11 /* 16 more bytes of the count consumed */
932 jbe LABEL(strcmp_exitz)
933#endif
934
935 add $16, %rdx
936 add $16, %r10
937 jg LABEL(nibble_ashr_7_use) /* same page check for the second, unrolled step */
938
939 movdqa (%rdi, %rdx), %xmm0
940 palignr $7, -16(%rdi, %rdx), %xmm0
941#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
942 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
943#else
944 movdqa (%rsi,%rdx), %xmm1
945 TOLOWER (%xmm0, %xmm1)
946 pcmpistri $0x1a, %xmm1, %xmm0
947#endif
948 jbe LABEL(exit_use)
949#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
950 sub $16, %r11
951 jbe LABEL(strcmp_exitz)
952#endif
953 add $16, %rdx
954 jmp LABEL(loop_ashr_7_use)
955
956 .p2align 4
957LABEL(nibble_ashr_7_use):
958 sub $0x1000, %r10 /* rearm the page-crossing counter */
959 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
960 psrldq $7, %xmm0 /* drop the 7 bytes already compared */
961 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
962#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
963 cmp %r11, %rcx
964 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
965#endif
966 cmp $8, %ecx
967 ja LABEL(nibble_ashr_7_restart_use) /* no null char in the 9 leftover bytes: keep going */
968
969 jmp LABEL(nibble_ashr_exit_use)
970
971/*
972 * The following cases will be handled by ashr_8
973 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
974 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
975 */
976 .p2align 4
977LABEL(ashr_8):
978 pxor %xmm0, %xmm0 /* xmm0 = 0 for the null char check */
979 movdqa (%rdi), %xmm2
980 movdqa (%rsi), %xmm1
981 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
982 pslldq $8, %xmm2 /* shift first string to align with second */
983 TOLOWER (%xmm1, %xmm2)
984 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
985 psubb %xmm0, %xmm2 /* packed sub of comparison results */
986 pmovmskb %xmm2, %r9d
987 shr %cl, %edx /* adjust 0xffff for offset */
988 shr %cl, %r9d /* adjust for 16-byte offset */
989 sub %r9d, %edx
990 jnz LABEL(less32bytes) /* mismatch or null char seen */
991 movdqa (%rdi), %xmm3
992
993 UPDATE_STRNCMP_COUNTER
994
995 pxor %xmm0, %xmm0
996 mov $16, %rcx /* index for loads */
997 mov $8, %r9d /* byte position left over from less32bytes case */
998 /*
999 * Setup %r10 value allows us to detect crossing a page boundary.
1000 * When %r10 goes positive we have crossed a page boundary and
1001 * need to do a nibble.
1002 */
1003 lea 8(%rdi), %r10
1004 and $0xfff, %r10 /* offset into 4K page */
1005 sub $0x1000, %r10 /* subtract 4K pagesize */
1006 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1007
1008 .p2align 4
1009LABEL(loop_ashr_8_use):
1010 add $16, %r10
1011 jg LABEL(nibble_ashr_8_use) /* r10 > 0: page boundary crossed, take the nibble path */
1012
1013LABEL(nibble_ashr_8_restart_use):
1014 movdqa (%rdi, %rdx), %xmm0
1015 palignr $8, -16(%rdi, %rdx), %xmm0 /* xmm0 = last 8 bytes of previous block + first 8 of this one */
1016#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1017 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1018#else
1019 movdqa (%rsi,%rdx), %xmm1
1020 TOLOWER (%xmm0, %xmm1)
1021 pcmpistri $0x1a, %xmm1, %xmm0
1022#endif
1023 jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS */
1024#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1025 sub $16, %r11 /* 16 more bytes of the count consumed */
1026 jbe LABEL(strcmp_exitz)
1027#endif
1028
1029 add $16, %rdx
1030 add $16, %r10
1031 jg LABEL(nibble_ashr_8_use) /* same page check for the second, unrolled step */
1032
1033 movdqa (%rdi, %rdx), %xmm0
1034 palignr $8, -16(%rdi, %rdx), %xmm0
1035#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1036 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1037#else
1038 movdqa (%rsi,%rdx), %xmm1
1039 TOLOWER (%xmm0, %xmm1)
1040 pcmpistri $0x1a, %xmm1, %xmm0
1041#endif
1042 jbe LABEL(exit_use)
1043#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1044 sub $16, %r11
1045 jbe LABEL(strcmp_exitz)
1046#endif
1047 add $16, %rdx
1048 jmp LABEL(loop_ashr_8_use)
1049
1050 .p2align 4
1051LABEL(nibble_ashr_8_use):
1052 sub $0x1000, %r10 /* rearm the page-crossing counter */
1053 movdqa -16(%rdi, %rdx), %xmm0 /* reload the previous rdi block */
1054 psrldq $8, %xmm0 /* drop the 8 bytes already compared */
1055 pcmpistri $0x3a,%xmm0, %xmm0 /* ecx = index of first null char in xmm0 */
1056#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1057 cmp %r11, %rcx
1058 jae LABEL(nibble_ashr_exit_use) /* remaining count in r11 ends within these bytes */
1059#endif
1060 cmp $7, %ecx
1061 ja LABEL(nibble_ashr_8_restart_use) /* no null char in the 8 leftover bytes: keep going */
1062
1063 jmp LABEL(nibble_ashr_exit_use)
1064
1065/*
1066 * The following cases will be handled by ashr_9
1067 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1068 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1069 */
1070 .p2align 4
/*
 * ashr_9: comparison path that uses palignr $9 to line the two strings up.
 * Each 16-byte block compared with (%rsi,%rdx) is built from the last 7
 * bytes of the previous aligned (%rdi) chunk followed by the first 9 bytes
 * of the current one.  The first chunk is checked with pcmpeqb/pmovmskb;
 * everything after it is handled by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1071LABEL(ashr_9):
1072 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1073 movdqa (%rdi), %xmm2
1074 movdqa (%rsi), %xmm1
1075 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1076 pslldq $7, %xmm2 /* shift the (%rdi) chunk up by 7 bytes */
1077 TOLOWER (%xmm1, %xmm2)
1078 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1079 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1080 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1081 shr %cl, %edx
1082 shr %cl, %r9d
1083 sub %r9d, %edx
1084 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1085 movdqa (%rdi), %xmm3
1086
1087 UPDATE_STRNCMP_COUNTER
1088
1089 pxor %xmm0, %xmm0
1090 mov $16, %rcx /* index for loads */
1091 mov $9, %r9d /* byte position left over from less32bytes case */
1092 /*
1093 * Setup %r10 value allows us to detect crossing a page boundary.
1094 * When %r10 goes positive we have crossed a page boundary and
1095 * need to do a nibble.
1096 */
1097 lea 9(%rdi), %r10
1098 and $0xfff, %r10 /* offset into 4K page */
1099 sub $0x1000, %r10 /* subtract 4K pagesize */
1100 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1101
1102 .p2align 4
1103LABEL(loop_ashr_9_use):
1104 add $16, %r10
1105 jg LABEL(nibble_ashr_9_use) /* %r10 went positive: take the nibble path */
1106
1107LABEL(nibble_ashr_9_restart_use):
1108 movdqa (%rdi, %rdx), %xmm0
1109
1110 palignr $9, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1111#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1112 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1113#else
1114 movdqa (%rsi,%rdx), %xmm1
1115 TOLOWER (%xmm0, %xmm1)
1116 pcmpistri $0x1a, %xmm1, %xmm0
1117#endif
1118 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1119#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1120 sub $16, %r11
1121 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1122#endif
1123
1124 add $16, %rdx
1125 add $16, %r10
1126 jg LABEL(nibble_ashr_9_use)
1127
1128 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1129 palignr $9, -16(%rdi, %rdx), %xmm0
1130#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1131 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1132#else
1133 movdqa (%rsi,%rdx), %xmm1
1134 TOLOWER (%xmm0, %xmm1)
1135 pcmpistri $0x1a, %xmm1, %xmm0
1136#endif
1137 jbe LABEL(exit_use)
1138#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1139 sub $16, %r11
1140 jbe LABEL(strcmp_exitz)
1141#endif
1142 add $16, %rdx
1143 jmp LABEL(loop_ashr_9_use)
1144
1145 .p2align 4
1146LABEL(nibble_ashr_9_use):
1147 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1148 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1149 psrldq $9, %xmm0 /* keep only its 7 not-yet-compared bytes */
1150 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1151#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1152 cmp %r11, %rcx
1153 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1154#endif
1155 cmp $6, %ecx
1156 ja LABEL(nibble_ashr_9_restart_use) /* no NUL in the 7 bytes: resume the loop */
1157
1158 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1159
1160/*
1161 * The following cases will be handled by ashr_10
1162 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1163 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1164 */
1165 .p2align 4
/*
 * ashr_10: comparison path that uses palignr $10 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * 6 bytes of the previous aligned (%rdi) chunk followed by the first 10
 * bytes of the current one.  The first chunk is checked with
 * pcmpeqb/pmovmskb; the rest by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1166LABEL(ashr_10):
1167 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1168 movdqa (%rdi), %xmm2
1169 movdqa (%rsi), %xmm1
1170 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1171 pslldq $6, %xmm2 /* shift the (%rdi) chunk up by 6 bytes */
1172 TOLOWER (%xmm1, %xmm2)
1173 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1174 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1175 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1176 shr %cl, %edx
1177 shr %cl, %r9d
1178 sub %r9d, %edx
1179 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1180 movdqa (%rdi), %xmm3
1181
1182 UPDATE_STRNCMP_COUNTER
1183
1184 pxor %xmm0, %xmm0
1185 mov $16, %rcx /* index for loads */
1186 mov $10, %r9d /* byte position left over from less32bytes case */
1187 /*
1188 * Setup %r10 value allows us to detect crossing a page boundary.
1189 * When %r10 goes positive we have crossed a page boundary and
1190 * need to do a nibble.
1191 */
1192 lea 10(%rdi), %r10
1193 and $0xfff, %r10 /* offset into 4K page */
1194 sub $0x1000, %r10 /* subtract 4K pagesize */
1195 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1196
1197 .p2align 4
1198LABEL(loop_ashr_10_use):
1199 add $16, %r10
1200 jg LABEL(nibble_ashr_10_use) /* %r10 went positive: take the nibble path */
1201
1202LABEL(nibble_ashr_10_restart_use):
1203 movdqa (%rdi, %rdx), %xmm0
1204 palignr $10, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1205#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1206 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1207#else
1208 movdqa (%rsi,%rdx), %xmm1
1209 TOLOWER (%xmm0, %xmm1)
1210 pcmpistri $0x1a, %xmm1, %xmm0
1211#endif
1212 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1213#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1214 sub $16, %r11
1215 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1216#endif
1217
1218 add $16, %rdx
1219 add $16, %r10
1220 jg LABEL(nibble_ashr_10_use)
1221
1222 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1223 palignr $10, -16(%rdi, %rdx), %xmm0
1224#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1225 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1226#else
1227 movdqa (%rsi,%rdx), %xmm1
1228 TOLOWER (%xmm0, %xmm1)
1229 pcmpistri $0x1a, %xmm1, %xmm0
1230#endif
1231 jbe LABEL(exit_use)
1232#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1233 sub $16, %r11
1234 jbe LABEL(strcmp_exitz)
1235#endif
1236 add $16, %rdx
1237 jmp LABEL(loop_ashr_10_use)
1238
1239 .p2align 4
1240LABEL(nibble_ashr_10_use):
1241 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1242 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1243 psrldq $10, %xmm0 /* keep only its 6 not-yet-compared bytes */
1244 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1245#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1246 cmp %r11, %rcx
1247 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1248#endif
1249 cmp $5, %ecx
1250 ja LABEL(nibble_ashr_10_restart_use) /* no NUL in the 6 bytes: resume the loop */
1251
1252 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1253
1254/*
1255 * The following cases will be handled by ashr_11
1256 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1257 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1258 */
1259 .p2align 4
/*
 * ashr_11: comparison path that uses palignr $11 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * 5 bytes of the previous aligned (%rdi) chunk followed by the first 11
 * bytes of the current one.  The first chunk is checked with
 * pcmpeqb/pmovmskb; the rest by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1260LABEL(ashr_11):
1261 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1262 movdqa (%rdi), %xmm2
1263 movdqa (%rsi), %xmm1
1264 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1265 pslldq $5, %xmm2 /* shift the (%rdi) chunk up by 5 bytes */
1266 TOLOWER (%xmm1, %xmm2)
1267 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1268 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1269 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1270 shr %cl, %edx
1271 shr %cl, %r9d
1272 sub %r9d, %edx
1273 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1274 movdqa (%rdi), %xmm3
1275
1276 UPDATE_STRNCMP_COUNTER
1277
1278 pxor %xmm0, %xmm0
1279 mov $16, %rcx /* index for loads */
1280 mov $11, %r9d /* byte position left over from less32bytes case */
1281 /*
1282 * Setup %r10 value allows us to detect crossing a page boundary.
1283 * When %r10 goes positive we have crossed a page boundary and
1284 * need to do a nibble.
1285 */
1286 lea 11(%rdi), %r10
1287 and $0xfff, %r10 /* offset into 4K page */
1288 sub $0x1000, %r10 /* subtract 4K pagesize */
1289 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1290
1291 .p2align 4
1292LABEL(loop_ashr_11_use):
1293 add $16, %r10
1294 jg LABEL(nibble_ashr_11_use) /* %r10 went positive: take the nibble path */
1295
1296LABEL(nibble_ashr_11_restart_use):
1297 movdqa (%rdi, %rdx), %xmm0
1298 palignr $11, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1299#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1300 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1301#else
1302 movdqa (%rsi,%rdx), %xmm1
1303 TOLOWER (%xmm0, %xmm1)
1304 pcmpistri $0x1a, %xmm1, %xmm0
1305#endif
1306 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1307#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1308 sub $16, %r11
1309 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1310#endif
1311
1312 add $16, %rdx
1313 add $16, %r10
1314 jg LABEL(nibble_ashr_11_use)
1315
1316 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1317 palignr $11, -16(%rdi, %rdx), %xmm0
1318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1319 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1320#else
1321 movdqa (%rsi,%rdx), %xmm1
1322 TOLOWER (%xmm0, %xmm1)
1323 pcmpistri $0x1a, %xmm1, %xmm0
1324#endif
1325 jbe LABEL(exit_use)
1326#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1327 sub $16, %r11
1328 jbe LABEL(strcmp_exitz)
1329#endif
1330 add $16, %rdx
1331 jmp LABEL(loop_ashr_11_use)
1332
1333 .p2align 4
1334LABEL(nibble_ashr_11_use):
1335 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1336 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1337 psrldq $11, %xmm0 /* keep only its 5 not-yet-compared bytes */
1338 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1339#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1340 cmp %r11, %rcx
1341 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1342#endif
1343 cmp $4, %ecx
1344 ja LABEL(nibble_ashr_11_restart_use) /* no NUL in the 5 bytes: resume the loop */
1345
1346 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1347
1348/*
1349 * The following cases will be handled by ashr_12
1350 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1351 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1352 */
1353 .p2align 4
/*
 * ashr_12: comparison path that uses palignr $12 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * 4 bytes of the previous aligned (%rdi) chunk followed by the first 12
 * bytes of the current one.  The first chunk is checked with
 * pcmpeqb/pmovmskb; the rest by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1354LABEL(ashr_12):
1355 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1356 movdqa (%rdi), %xmm2
1357 movdqa (%rsi), %xmm1
1358 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1359 pslldq $4, %xmm2 /* shift the (%rdi) chunk up by 4 bytes */
1360 TOLOWER (%xmm1, %xmm2)
1361 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1362 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1363 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1364 shr %cl, %edx
1365 shr %cl, %r9d
1366 sub %r9d, %edx
1367 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1368 movdqa (%rdi), %xmm3
1369
1370 UPDATE_STRNCMP_COUNTER
1371
1372 pxor %xmm0, %xmm0
1373 mov $16, %rcx /* index for loads */
1374 mov $12, %r9d /* byte position left over from less32bytes case */
1375 /*
1376 * Setup %r10 value allows us to detect crossing a page boundary.
1377 * When %r10 goes positive we have crossed a page boundary and
1378 * need to do a nibble.
1379 */
1380 lea 12(%rdi), %r10
1381 and $0xfff, %r10 /* offset into 4K page */
1382 sub $0x1000, %r10 /* subtract 4K pagesize */
1383 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1384
1385 .p2align 4
1386LABEL(loop_ashr_12_use):
1387 add $16, %r10
1388 jg LABEL(nibble_ashr_12_use) /* %r10 went positive: take the nibble path */
1389
1390LABEL(nibble_ashr_12_restart_use):
1391 movdqa (%rdi, %rdx), %xmm0
1392 palignr $12, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1393#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1394 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1395#else
1396 movdqa (%rsi,%rdx), %xmm1
1397 TOLOWER (%xmm0, %xmm1)
1398 pcmpistri $0x1a, %xmm1, %xmm0
1399#endif
1400 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1401#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1402 sub $16, %r11
1403 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1404#endif
1405
1406 add $16, %rdx
1407 add $16, %r10
1408 jg LABEL(nibble_ashr_12_use)
1409
1410 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1411 palignr $12, -16(%rdi, %rdx), %xmm0
1412#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1413 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1414#else
1415 movdqa (%rsi,%rdx), %xmm1
1416 TOLOWER (%xmm0, %xmm1)
1417 pcmpistri $0x1a, %xmm1, %xmm0
1418#endif
1419 jbe LABEL(exit_use)
1420#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1421 sub $16, %r11
1422 jbe LABEL(strcmp_exitz)
1423#endif
1424 add $16, %rdx
1425 jmp LABEL(loop_ashr_12_use)
1426
1427 .p2align 4
1428LABEL(nibble_ashr_12_use):
1429 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1430 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1431 psrldq $12, %xmm0 /* keep only its 4 not-yet-compared bytes */
1432 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1433#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1434 cmp %r11, %rcx
1435 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1436#endif
1437 cmp $3, %ecx
1438 ja LABEL(nibble_ashr_12_restart_use) /* no NUL in the 4 bytes: resume the loop */
1439
1440 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1441
1442/*
1443 * The following cases will be handled by ashr_13
1444 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1445 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1446 */
1447 .p2align 4
/*
 * ashr_13: comparison path that uses palignr $13 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * 3 bytes of the previous aligned (%rdi) chunk followed by the first 13
 * bytes of the current one.  The first chunk is checked with
 * pcmpeqb/pmovmskb; the rest by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1448LABEL(ashr_13):
1449 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1450 movdqa (%rdi), %xmm2
1451 movdqa (%rsi), %xmm1
1452 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1453 pslldq $3, %xmm2 /* shift the (%rdi) chunk up by 3 bytes */
1454 TOLOWER (%xmm1, %xmm2)
1455 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1456 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1457 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1458 shr %cl, %edx
1459 shr %cl, %r9d
1460 sub %r9d, %edx
1461 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1462 movdqa (%rdi), %xmm3
1463
1464 UPDATE_STRNCMP_COUNTER
1465
1466 pxor %xmm0, %xmm0
1467 mov $16, %rcx /* index for loads */
1468 mov $13, %r9d /* byte position left over from less32bytes case */
1469 /*
1470 * Setup %r10 value allows us to detect crossing a page boundary.
1471 * When %r10 goes positive we have crossed a page boundary and
1472 * need to do a nibble.
1473 */
1474 lea 13(%rdi), %r10
1475 and $0xfff, %r10 /* offset into 4K page */
1476 sub $0x1000, %r10 /* subtract 4K pagesize */
1477
1478 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1479
1480 .p2align 4
1481LABEL(loop_ashr_13_use):
1482 add $16, %r10
1483 jg LABEL(nibble_ashr_13_use) /* %r10 went positive: take the nibble path */
1484
1485LABEL(nibble_ashr_13_restart_use):
1486 movdqa (%rdi, %rdx), %xmm0
1487 palignr $13, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1488#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1489 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1490#else
1491 movdqa (%rsi,%rdx), %xmm1
1492 TOLOWER (%xmm0, %xmm1)
1493 pcmpistri $0x1a, %xmm1, %xmm0
1494#endif
1495 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1496#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1497 sub $16, %r11
1498 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1499#endif
1500
1501 add $16, %rdx
1502 add $16, %r10
1503 jg LABEL(nibble_ashr_13_use)
1504
1505 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1506 palignr $13, -16(%rdi, %rdx), %xmm0
1507#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1508 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1509#else
1510 movdqa (%rsi,%rdx), %xmm1
1511 TOLOWER (%xmm0, %xmm1)
1512 pcmpistri $0x1a, %xmm1, %xmm0
1513#endif
1514 jbe LABEL(exit_use)
1515#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1516 sub $16, %r11
1517 jbe LABEL(strcmp_exitz)
1518#endif
1519 add $16, %rdx
1520 jmp LABEL(loop_ashr_13_use)
1521
1522 .p2align 4
1523LABEL(nibble_ashr_13_use):
1524 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1525 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1526 psrldq $13, %xmm0 /* keep only its 3 not-yet-compared bytes */
1527 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1528#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1529 cmp %r11, %rcx
1530 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1531#endif
1532 cmp $2, %ecx
1533 ja LABEL(nibble_ashr_13_restart_use) /* no NUL in the 3 bytes: resume the loop */
1534
1535 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1536
1537/*
1538 * The following cases will be handled by ashr_14
1539 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1540 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1541 */
1542 .p2align 4
/*
 * ashr_14: comparison path that uses palignr $14 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * 2 bytes of the previous aligned (%rdi) chunk followed by the first 14
 * bytes of the current one.  The first chunk is checked with
 * pcmpeqb/pmovmskb; the rest by the twice-unrolled pcmpistri loop.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1543LABEL(ashr_14):
1544 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1545 movdqa (%rdi), %xmm2
1546 movdqa (%rsi), %xmm1
1547 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1548 pslldq $2, %xmm2 /* shift the (%rdi) chunk up by 2 bytes */
1549 TOLOWER (%xmm1, %xmm2)
1550 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1551 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1552 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1553 shr %cl, %edx
1554 shr %cl, %r9d
1555 sub %r9d, %edx
1556 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1557 movdqa (%rdi), %xmm3
1558
1559 UPDATE_STRNCMP_COUNTER
1560
1561 pxor %xmm0, %xmm0
1562 mov $16, %rcx /* index for loads */
1563 mov $14, %r9d /* byte position left over from less32bytes case */
1564 /*
1565 * Setup %r10 value allows us to detect crossing a page boundary.
1566 * When %r10 goes positive we have crossed a page boundary and
1567 * need to do a nibble.
1568 */
1569 lea 14(%rdi), %r10
1570 and $0xfff, %r10 /* offset into 4K page */
1571 sub $0x1000, %r10 /* subtract 4K pagesize */
1572
1573 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1574
1575 .p2align 4
1576LABEL(loop_ashr_14_use):
1577 add $16, %r10
1578 jg LABEL(nibble_ashr_14_use) /* %r10 went positive: take the nibble path */
1579
1580LABEL(nibble_ashr_14_restart_use):
1581 movdqa (%rdi, %rdx), %xmm0
1582 palignr $14, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1583#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1584 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1585#else
1586 movdqa (%rsi,%rdx), %xmm1
1587 TOLOWER (%xmm0, %xmm1)
1588 pcmpistri $0x1a, %xmm1, %xmm0
1589#endif
1590 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1591#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1592 sub $16, %r11
1593 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1594#endif
1595
1596 add $16, %rdx
1597 add $16, %r10
1598 jg LABEL(nibble_ashr_14_use)
1599
1600 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1601 palignr $14, -16(%rdi, %rdx), %xmm0
1602#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1603 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1604#else
1605 movdqa (%rsi,%rdx), %xmm1
1606 TOLOWER (%xmm0, %xmm1)
1607 pcmpistri $0x1a, %xmm1, %xmm0
1608#endif
1609 jbe LABEL(exit_use)
1610#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1611 sub $16, %r11
1612 jbe LABEL(strcmp_exitz)
1613#endif
1614 add $16, %rdx
1615 jmp LABEL(loop_ashr_14_use)
1616
1617 .p2align 4
1618LABEL(nibble_ashr_14_use):
1619 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1620 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1621 psrldq $14, %xmm0 /* keep only its 2 not-yet-compared bytes */
1622 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL among them */
1623#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1624 cmp %r11, %rcx
1625 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past these bytes */
1626#endif
1627 cmp $1, %ecx
1628 ja LABEL(nibble_ashr_14_restart_use) /* no NUL in the 2 bytes: resume the loop */
1629
1630 jmp LABEL(nibble_ashr_exit_use) /* NUL found: compare without the next chunk */
1631
1632/*
1633 * The following cases will be handled by ashr_15
1634 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1635 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1636 */
1637 .p2align 4
/*
 * ashr_15: comparison path that uses palignr $15 to line the two strings
 * up.  Each 16-byte block compared with (%rsi,%rdx) is built from the last
 * byte of the previous aligned (%rdi) chunk followed by the first 15 bytes
 * of the current one.  The first chunk is checked with pcmpeqb/pmovmskb;
 * the rest by the twice-unrolled pcmpistri loop.  Unlike the other ashr_N
 * paths, the nibble code at the end falls straight through into
 * nibble_ashr_exit_use instead of jumping to it.
 * NOTE(review): %cl, %edx, %r8d and %r11 are prepared before this label,
 * outside the visible part of the file -- confirm their meaning there.
 * TOLOWER and UPDATE_STRNCMP_COUNTER are macros defined elsewhere.
 */
1638LABEL(ashr_15):
1639 pxor %xmm0, %xmm0 /* %xmm0 = 0, used to find NUL bytes */
1640 movdqa (%rdi), %xmm2
1641 movdqa (%rsi), %xmm1
1642 pcmpeqb %xmm1, %xmm0 /* 0xff in every byte where the (%rsi) chunk is NUL */
1643 pslldq $1, %xmm2 /* shift the (%rdi) chunk up by 1 byte */
1644 TOLOWER (%xmm1, %xmm2)
1645 pcmpeqb %xmm1, %xmm2 /* 0xff in every byte where both chunks agree */
1646 psubb %xmm0, %xmm2 /* top bit stays set only for equal, non-NUL bytes */
1647 pmovmskb %xmm2, %r9d /* collect those top bits, one per byte */
1648 shr %cl, %edx
1649 shr %cl, %r9d
1650 sub %r9d, %edx
1651 jnz LABEL(less32bytes) /* masks differ: finish in less32bytes */
1652
1653 movdqa (%rdi), %xmm3
1654
1655 UPDATE_STRNCMP_COUNTER
1656
1657 pxor %xmm0, %xmm0
1658 mov $16, %rcx /* index for loads */
1659 mov $15, %r9d /* byte position left over from less32bytes case */
1660 /*
1661 * Setup %r10 value allows us to detect crossing a page boundary.
1662 * When %r10 goes positive we have crossed a page boundary and
1663 * need to do a nibble.
1664 */
1665 lea 15(%rdi), %r10
1666 and $0xfff, %r10 /* offset into 4K page */
1667
1668 sub $0x1000, %r10 /* subtract 4K pagesize */
1669
1670 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1671
1672 .p2align 4
1673LABEL(loop_ashr_15_use):
1674 add $16, %r10
1675 jg LABEL(nibble_ashr_15_use) /* %r10 went positive: take the nibble path */
1676
1677LABEL(nibble_ashr_15_restart_use):
1678 movdqa (%rdi, %rdx), %xmm0
1679 palignr $15, -16(%rdi, %rdx), %xmm0 /* splice previous and current chunk */
1680#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1681 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1682#else
1683 movdqa (%rsi,%rdx), %xmm1
1684 TOLOWER (%xmm0, %xmm1)
1685 pcmpistri $0x1a, %xmm1, %xmm0
1686#endif
1687 jbe LABEL(exit_use) /* CF or ZF set: difference or end of string */
1688#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1689 sub $16, %r11
1690 jbe LABEL(strcmp_exitz) /* %r11 (presumably the remaining length) used up: return 0 */
1691#endif
1692
1693 add $16, %rdx
1694 add $16, %r10
1695 jg LABEL(nibble_ashr_15_use)
1696
1697 movdqa (%rdi, %rdx), %xmm0 /* second copy of the loop body (unrolled) */
1698 palignr $15, -16(%rdi, %rdx), %xmm0
1699#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1700 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1701#else
1702 movdqa (%rsi,%rdx), %xmm1
1703 TOLOWER (%xmm0, %xmm1)
1704 pcmpistri $0x1a, %xmm1, %xmm0
1705#endif
1706 jbe LABEL(exit_use)
1707#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1708 sub $16, %r11
1709 jbe LABEL(strcmp_exitz)
1710#endif
1711 add $16, %rdx
1712 jmp LABEL(loop_ashr_15_use)
1713
1714 .p2align 4
1715LABEL(nibble_ashr_15_use):
1716 sub $0x1000, %r10 /* re-arm the page-crossing counter */
1717 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi chunk, already read once */
1718 psrldq $15, %xmm0 /* keep only its 1 not-yet-compared byte */
1719 pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of the first NUL (0 or 1 here) */
1720#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1721 cmp %r11, %rcx
1722 jae LABEL(nibble_ashr_exit_use) /* %r11 does not reach past this byte */
1723#endif
1724 cmp $0, %ecx
1725 ja LABEL(nibble_ashr_15_restart_use) /* the byte is not NUL: resume the loop */
1726
/*
 * Common exit for all *_use loops.
 * nibble_ashr_exit_use: reached from the nibble paths with %xmm0 holding
 * the leftover bytes of the first string; it performs one last pcmpistri
 * against (%rsi,%rdx) and continues into exit_use.
 * exit_use: entered with the flags and %ecx of a pcmpistri $0x1a.  Returns
 * 0 when CF is clear; otherwise loads the two differing bytes and returns
 * their difference in %eax.
 * On entry %r9 is the palignr shift of the calling path, %rdx the chunk
 * offset and %r8d the operand-swap flag.
 */
1727LABEL(nibble_ashr_exit_use):
1728#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1729 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1730#else
1731 movdqa (%rsi,%rdx), %xmm1
1732 TOLOWER (%xmm0, %xmm1)
1733 pcmpistri $0x1a, %xmm1, %xmm0
1734#endif
1735 .p2align 4
1736LABEL(exit_use):
1737 jnc LABEL(strcmp_exitz) /* CF == 0: no differing byte, return 0 */
1738#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1739 sub %rcx, %r11
1740 jbe LABEL(strcmp_exitz) /* difference lies at or beyond the %r11 limit */
1741#endif
1742 add %rcx, %rdx /* %rdx = offset of the differing byte */
1743 lea -16(%rdi, %r9), %rdi /* undo the palignr shift so (%rdi,%rdx) pairs with (%rsi,%rdx) */
1744 movzbl (%rdi, %rdx), %eax
1745 movzbl (%rsi, %rdx), %edx
1746 test %r8d, %r8d
1747 jz LABEL(ret_use)
1748 xchg %eax, %edx /* %r8d != 0: put the two bytes back in caller order */
1749LABEL(ret_use):
1750#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1751 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 4-byte entries indexed by byte value */
1752 movl (%rcx,%rdx,4), %edx /* map both bytes through the table before subtracting */
1753 movl (%rcx,%rax,4), %eax
1754#endif
1755
1756 sub %edx, %eax /* return value = first byte - second byte */
1757 ret
1758
/*
 * less32bytes / ret / less16bytes: final byte comparison for the paths
 * that detected a difference or terminator with pcmpeqb/pmovmskb.
 * less32bytes first adds the offsets in %rax and %rcx to the two pointers
 * and swaps them back when %r8d is non-zero; less16bytes then takes the
 * lowest set bit of the mask in %rdx as the byte index and returns the
 * difference of the two bytes at that index in %eax.
 * NOTE(review): the mask in %rdx must be non-zero for bsf to give a
 * defined result -- presumably guaranteed by every caller; confirm.
 */
1759LABEL(less32bytes):
1760 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1761 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1762 test %r8d, %r8d
1763 jz LABEL(ret)
1764 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1765
1766 .p2align 4
1767LABEL(ret):
1768LABEL(less16bytes):
1769 bsf %rdx, %rdx /* find and store bit index in %rdx */
1770
1771#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1772 sub %rdx, %r11
1773 jbe LABEL(strcmp_exitz) /* index is at or beyond the %r11 limit: return 0 */
1774#endif
1775 movzbl (%rsi, %rdx), %ecx
1776 movzbl (%rdi, %rdx), %eax
1777
1778#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1779 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries indexed by byte value */
1780 movl (%rdx,%rcx,4), %ecx /* map both bytes through the table before subtracting */
1781 movl (%rdx,%rax,4), %eax
1782#endif
1783
1784 sub %ecx, %eax /* return value = (%rdi) byte - (%rsi) byte */
1785 ret
1786
/* strcmp_exitz: shared "strings compare equal" exit; returns 0 in %eax. */
1787LABEL(strcmp_exitz):
1788 xor %eax, %eax
1789 ret
1790
1791 .p2align 4
1792 // XXX Same as code above
/*
 * Byte0: compare only the bytes at (%rdi) and (%rsi) and return their
 * difference in %eax.  In the case-insensitive builds both bytes are
 * first mapped through the _nl_C_LC_CTYPE_tolower table.
 * NOTE(review): the jump to this label is outside the visible part of
 * the file -- confirm under which condition it is taken.
 */
1793LABEL(Byte0):
1794 movzx (%rsi), %ecx
1795 movzx (%rdi), %eax
1796
1797#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1798 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries indexed by byte value */
1799 movl (%rdx,%rcx,4), %ecx
1800 movl (%rdx,%rax,4), %eax
1801#endif
1802
1803 sub %ecx, %eax /* return value = (%rdi) byte - (%rsi) byte */
1804 ret
1805 cfi_endproc
1806 .size STRCMP_SSE42, .-STRCMP_SSE42
1807
1808#undef UCLOW_reg
1809#undef UCHIGH_reg
1810#undef LCQWORD_reg
1811#undef TOLOWER
1812
1813 /* Put all SSE 4.2 functions together. */
1814 .section .rodata.SECTION,"a",@progbits
1815 .p2align 3
/*
 * unaligned_table: sixteen 32-bit entries, each the distance from the
 * start of this table to one of the ashr_N entry points.  The entries are
 * label differences rather than absolute addresses.  They are ordered
 * ashr_1 .. ashr_15, with ashr_0 in the last slot.
 * NOTE(review): the code that indexes this table is outside the visible
 * part of the file -- confirm how the index is computed there.
 */
1816LABEL(unaligned_table):
1817 .int LABEL(ashr_1) - LABEL(unaligned_table)
1818 .int LABEL(ashr_2) - LABEL(unaligned_table)
1819 .int LABEL(ashr_3) - LABEL(unaligned_table)
1820 .int LABEL(ashr_4) - LABEL(unaligned_table)
1821 .int LABEL(ashr_5) - LABEL(unaligned_table)
1822 .int LABEL(ashr_6) - LABEL(unaligned_table)
1823 .int LABEL(ashr_7) - LABEL(unaligned_table)
1824 .int LABEL(ashr_8) - LABEL(unaligned_table)
1825 .int LABEL(ashr_9) - LABEL(unaligned_table)
1826 .int LABEL(ashr_10) - LABEL(unaligned_table)
1827 .int LABEL(ashr_11) - LABEL(unaligned_table)
1828 .int LABEL(ashr_12) - LABEL(unaligned_table)
1829 .int LABEL(ashr_13) - LABEL(unaligned_table)
1830 .int LABEL(ashr_14) - LABEL(unaligned_table)
1831 .int LABEL(ashr_15) - LABEL(unaligned_table)
1832 .int LABEL(ashr_0) - LABEL(unaligned_table)
1833
1834#undef LABEL
1835#undef GLABEL
1836#undef SECTION