]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/strcmp-sse42.S
Fix test of non-ASCII locales in x86-64 strcasecmp et.al.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strcmp-sse42.S
1 /* strcmp with SSE4.2
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21
22 /* We use 0x1a:
23 _SIDD_SBYTE_OPS
24 | _SIDD_CMP_EQUAL_EACH
25 | _SIDD_NEGATIVE_POLARITY
26 | _SIDD_LEAST_SIGNIFICANT
27 on pcmpistri to find out if two 16byte data elements are the same
28 and the offset of the first different byte. There are 4 cases:
29
30 1. Both 16byte data elements are valid and identical.
31 2. Both 16byte data elements have EOS and are identical.
32 3. Both 16byte data elements are valid and they differ at offset X.
33 4. At least one 16byte data element has EOS at offset X. Two 16byte
34 data elements must differ at or before offset X.
35
36 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
37
38 case ECX CFlag ZFlag SFlag
39 1 16 0 0 0
40 2 16 0 1 1
41 3 X 1 0 0
42 4 0 <= X 1 0/1 0/1
43
44 We exit from the loop for cases 2, 3 and 4 with jbe which branches
45 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
46 case 2. */
47
48 /* Put all SSE 4.2 functions together. */
49 .section .text.SECTION,"ax",@progbits
50 .align 16
51 .type STRCMP_SSE42, @function
52 #ifdef USE_AS_STRCASECMP_L
53 ENTRY (GLABEL(__strcasecmp)) /* entry without a locale argument: fetch the thread's locale, then fall into the code below */
54 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read through the GOT */
55 movq %fs:(%rax),%rdx /* rdx = contents of that %fs-relative slot; the code below dereferences %rdx as the locale */
56
57 // XXX 5 byte should be before the function
58 /* 5-byte NOP. */
59 .byte 0x0f,0x1f,0x44,0x00,0x00
60 END (GLABEL(__strcasecmp))
61 /* FALLTHROUGH to strcasecmp_l. */
62 #endif
63 #ifdef USE_AS_STRNCASECMP_L
64 ENTRY (GLABEL(__strncasecmp)) /* same stub for the length-limited variant; here the locale goes into %rcx */
65 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read through the GOT */
66 movq %fs:(%rax),%rcx /* rcx = contents of that %fs-relative slot; the code below dereferences %rcx as the locale */
67
68 // XXX 5 byte should be before the function
69 /* 5-byte NOP. */
70 .byte 0x0f,0x1f,0x44,0x00,0x00
71 END (GLABEL(__strncasecmp))
72 /* FALLTHROUGH to strncasecmp_l. */
73 #endif
74
75
76 #ifdef USE_AVX /* AVX build: the SSE mnemonics used in this file are assembled as their v-prefixed forms */
77 # define movdqa vmovdqa
78 # define movdqu vmovdqu
79 # define pmovmskb vpmovmskb
80 # define pcmpistri vpcmpistri
81 # define psubb vpsubb
82 # define pcmpeqb vpcmpeqb
83 # define psrldq vpsrldq
84 # define pslldq vpslldq
85 # define palignr vpalignr
86 # define pxor vpxor
87 # define D(arg) arg, arg /* write the destination twice so a two-operand SSE line becomes a three-operand v* line */
88 #else
89 # define D(arg) arg /* plain SSE: the destination appears once */
90 #endif
91
92 STRCMP_SSE42: /* main entry: %rdi and %rsi are the two strings; the case-insensitive stubs above fall through to here */
93 cfi_startproc
94 CALL_MCOUNT
95
96 /*
97 * This implementation uses SSE to compare up to 16 bytes at a time.
98 */
99 #ifdef USE_AS_STRCASECMP_L
100 /* We have to fall back on the C implementation for locales
101 with encodings not matching ASCII for single bytes. */
102 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
103 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax /* rax = pointer at offset LOCALE_T___LOCALES + LC_CTYPE*8 of the locale in %rdx (LC_CTYPE data, judging by the constant names) */
104 # else
105 movq (%rdx), %rax /* same load when that displacement is 0 */
106 # endif
107 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the _NL_CTYPE_NONASCII_CASE value */
108 jne __strcasecmp_l_nonascii /* bit set: hand off to the non-ASCII implementation */
109 #endif
110 #ifdef USE_AS_STRNCASECMP_L
111 /* We have to fall back on the C implementation for locales
112 with encodings not matching ASCII for single bytes. */
113 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
114 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax /* as above, but the locale is in %rcx for the length-limited variant */
115 # else
116 movq (%rcx), %rax /* same load when that displacement is 0 */
117 # endif
118 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) /* test bit 0 of the _NL_CTYPE_NONASCII_CASE value */
119 jne __strncasecmp_l_nonascii /* bit set: hand off to the non-ASCII implementation */
120 #endif
121
122 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
123 test %rdx, %rdx /* rdx = byte-count limit */
124 je LABEL(strcmp_exitz) /* limit of 0: nothing to compare */
125 cmp $1, %rdx
126 je LABEL(Byte0) /* limit of 1: handled at Byte0 (defined outside this excerpt) */
127 mov %rdx, %r11 /* r11 = remaining byte count; reduced by 16 for every block that matches */
128 #endif
129 mov %esi, %ecx
130 mov %edi, %eax
131 /* Use 64bit AND here to avoid long NOP padding. */
132 and $0x3f, %rcx /* rsi alignment in cache line */
133 and $0x3f, %rax /* rdi alignment in cache line */
134 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
135 .section .rodata.cst16,"aM",@progbits,16 /* 16-byte constants consumed by TOLOWER */
136 .align 16
137 LABEL(belowupper):
138 .quad 0x4040404040404040 /* 0x40 in every byte: 'A' - 1 */
139 .quad 0x4040404040404040
140 LABEL(topupper):
141 # ifdef USE_AVX
142 .quad 0x5a5a5a5a5a5a5a5a /* 0x5a = 'Z': the AVX TOLOWER computes "byte > 'Z'" and inverts it with vpandn */
143 .quad 0x5a5a5a5a5a5a5a5a
144 # else
145 .quad 0x5b5b5b5b5b5b5b5b /* 0x5b = 'Z' + 1: the SSE TOLOWER computes "'Z' + 1 > byte" */
146 .quad 0x5b5b5b5b5b5b5b5b
147 # endif
148 LABEL(touppermask):
149 .quad 0x2020202020202020 /* 0x20: the bit TOLOWER ORs into bytes in 'A'..'Z' */
150 .quad 0x2020202020202020
151 .previous
152 movdqa LABEL(belowupper)(%rip), %xmm4 /* the three constants stay in xmm4-xmm6 for every TOLOWER expansion */
153 # define UCLOW_reg %xmm4
154 movdqa LABEL(topupper)(%rip), %xmm5
155 # define UCHIGH_reg %xmm5
156 movdqa LABEL(touppermask)(%rip), %xmm6
157 # define LCQWORD_reg %xmm6
158 #endif
159 cmp $0x30, %ecx
160 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
161 cmp $0x30, %eax
162 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
163 movdqu (%rdi), %xmm1 /* both offsets are <= 0x30, so each unaligned 16-byte load stays inside its 64-byte line */
164 movdqu (%rsi), %xmm2
165 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L /* TOLOWER(reg1, reg2): OR 0x20 into each byte of reg1 and reg2 that lies in 'A'..'Z'; uses xmm7-xmm10 as scratch */
166 # ifdef USE_AVX
167 # define TOLOWER(reg1, reg2) \
168 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
169 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
170 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
171 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
172 vpandn %xmm7, %xmm8, %xmm8; \
173 vpandn %xmm9, %xmm10, %xmm10; \
174 vpand LCQWORD_reg, %xmm8, %xmm8; \
175 vpand LCQWORD_reg, %xmm10, %xmm10; \
176 vpor reg1, %xmm8, reg1; \
177 vpor reg2, %xmm10, reg2
178 # else
179 # define TOLOWER(reg1, reg2) \
180 movdqa reg1, %xmm7; \
181 movdqa UCHIGH_reg, %xmm8; \
182 movdqa reg2, %xmm9; \
183 movdqa UCHIGH_reg, %xmm10; \
184 pcmpgtb UCLOW_reg, %xmm7; \
185 pcmpgtb reg1, %xmm8; \
186 pcmpgtb UCLOW_reg, %xmm9; \
187 pcmpgtb reg2, %xmm10; \
188 pand %xmm8, %xmm7; \
189 pand %xmm10, %xmm9; \
190 pand LCQWORD_reg, %xmm7; \
191 pand LCQWORD_reg, %xmm9; \
192 por %xmm7, reg1; \
193 por %xmm9, reg2
194 # endif
195 TOLOWER (%xmm1, %xmm2)
196 #else
197 # define TOLOWER(reg1, reg2) /* case-sensitive builds: expands to nothing */
198 #endif
199 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
200 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
201 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
202 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
203 pmovmskb %xmm1, %edx /* one bit per byte: set only where the bytes are equal and not NUL */
204 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
205 jnz LABEL(less16bytes)/* If not, find different value or null char */
206 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
207 sub $16, %r11 /* 16 bytes matched: charge them against the limit */
208 jbe LABEL(strcmp_exitz)/* finish comparison */
209 #endif
210 add $16, %rsi /* prepare to search next 16 bytes */
211 add $16, %rdi /* prepare to search next 16 bytes */
212
213 /*
214 * Determine source and destination string offsets from 16-byte
215 * alignment. Use relative offset difference between the two to
216 * determine which case below to use. On entry %ecx and %eax still hold
 the low address bits of %rsi and %rdi computed at function entry.
217 */
218 .p2align 4
219 LABEL(crosscache):
220 and $0xfffffffffffffff0, %rsi /* round %rsi down to a 16-byte boundary */
221 and $0xfffffffffffffff0, %rdi /* round %rdi down to a 16-byte boundary */
222 mov $0xffff, %edx /* for equivalent offset */
223 xor %r8d, %r8d /* r8d = 0: pointers not swapped (becomes 0xffff below if they are) */
224 and $0xf, %ecx /* offset of rsi */
225 and $0xf, %eax /* offset of rdi */
226 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
227 cmp %eax, %ecx
228 je LABEL(ashr_0) /* rsi and rdi relative offset same */
229 ja LABEL(bigger) /* rsi offset is already the larger one: no swap */
230 mov %edx, %r8d /* r8d is offset flag for exit tail */
231 xchg %ecx, %eax /* swap offsets and pointers so that ecx > eax at bigger */
232 xchg %rsi, %rdi
233 LABEL(bigger):
234 movdqa (%rdi), %xmm2 /* aligned block containing the start of the smaller-offset string */
235 movdqa (%rsi), %xmm1 /* aligned block containing the start of the larger-offset string */
236 lea 15(%rax), %r9
237 sub %rcx, %r9 /* r9 = 15 + rax - rcx, which is 0..14 because rcx > rax here */
238 lea LABEL(unaligned_table)(%rip), %r10
239 movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit table entry: an offset relative to the table base */
240 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
241 lea (%r10, %r9), %r10
242 jmp *%r10 /* jump to corresponding case */
243
244 /*
245 * The following cases will be handled by ashr_0 (both strings have the same offset within a 16-byte block)
246 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
247 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
248 */
249 .p2align 4
250 LABEL(ashr_0):
251
252 movdqa (%rsi), %xmm1
253 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
254 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
255 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
256 #else
257 movdqa (%rdi), %xmm2
258 TOLOWER (%xmm1, %xmm2)
259 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
260 #endif
261 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
262 pmovmskb %xmm1, %r9d
263 shr %cl, %edx /* adjust 0xffff for offset */
264 shr %cl, %r9d /* adjust for 16-byte offset */
265 sub %r9d, %edx
266 /*
267 * edx must be the same with r9d if in left byte (16-rcx) is equal to
268 * the start from (16-rax) and no null char was seen.
269 */
270 jne LABEL(less32bytes) /* mismatch or null char */
271 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
272 mov $16, %rcx /* copied to %rdx below as the starting load index */
273 mov $16, %r9 /* NOTE(review): %r9 is not read in the visible ashr_0 code */
274
275 /*
276 * Now both strings are aligned at 16-byte boundary. Loop over strings
277 * checking 32-bytes per iteration.
278 */
279 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
280 .p2align 4
281 LABEL(ashr_0_use):
282 movdqa (%rdi,%rdx), %xmm0
283 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
284 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
285 #else
286 movdqa (%rsi,%rdx), %xmm1
287 TOLOWER (%xmm0, %xmm1)
288 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
289 #endif
290 lea 16(%rdx), %rdx /* advance the index with lea so the pcmpistri flags reach the jbe unchanged */
291 jbe LABEL(ashr_0_exit_use) /* CF or ZF set: difference or EOS in this block */
292 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
293 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
294 jbe LABEL(strcmp_exitz) /* limit exhausted */
295 #endif
296
297 movdqa (%rdi,%rdx), %xmm0 /* second, unrolled copy of the step above */
298 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
299 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
300 #else
301 movdqa (%rsi,%rdx), %xmm1
302 TOLOWER (%xmm0, %xmm1)
303 pcmpistri $0x1a, %xmm1, %xmm0
304 #endif
305 lea 16(%rdx), %rdx /* flags preserved, as above */
306 jbe LABEL(ashr_0_exit_use)
307 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
308 sub $16, %r11
309 jbe LABEL(strcmp_exitz)
310 #endif
311 jmp LABEL(ashr_0_use)
312
313
314 .p2align 4
315 LABEL(ashr_0_exit_use):
316 jnc LABEL(strcmp_exitz) /* CF = 0: case 2 of the table at the top of the file (identical up to EOS) */
317 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
318 sub %rcx, %r11 /* rcx = byte index reported by pcmpistri */
319 jbe LABEL(strcmp_exitz) /* that index is at or past the remaining limit */
320 #endif
321 lea -16(%rdx, %rcx), %rcx /* rdx was already advanced by 16: rcx = offset of the reported byte */
322 movzbl (%rdi, %rcx), %eax
323 movzbl (%rsi, %rcx), %edx
324 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
325 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 32-bit entries, addressed from its entry 128 */
326 movl (%rcx,%rax,4), %eax /* map both bytes through the table before subtracting */
327 movl (%rcx,%rdx,4), %edx
328 #endif
329 sub %edx, %eax /* result = difference of the two (mapped) bytes */
330 ret
331
332
333
334 /*
335 * The following cases will be handled by ashr_1, i.e. the %rsi offset exceeds the %rdi offset by 15 (after any swap in crosscache)
336 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
337 * n(15) n -15 0(15 +(n-15) - n) ashr_1
338 */
339 .p2align 4
340 LABEL(ashr_1):
341 pslldq $15, D(%xmm2) /* shift first string to align with second */
342 TOLOWER (%xmm1, %xmm2)
343 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
344 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
345 pmovmskb %xmm2, %r9d
346 shr %cl, %edx /* adjust 0xffff for offset */
347 shr %cl, %r9d /* adjust for 16-byte offset */
348 sub %r9d, %edx
349 jnz LABEL(less32bytes) /* mismatch or null char seen */
350 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
351 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
352
353 mov $16, %rcx /* index for loads*/
354 mov $1, %r9d /* byte position left over from less32bytes case */
355 /*
356 * Setup %r10 value allows us to detect crossing a page boundary.
357 * When %r10 goes positive we have crossed a page boundary and
358 * need to do a nibble.
359 */
360 lea 1(%rdi), %r10
361 and $0xfff, %r10 /* offset into 4K page */
362 sub $0x1000, %r10 /* subtract 4K pagesize */
363 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
364
365 .p2align 4
366 LABEL(loop_ashr_1_use):
367 add $16, %r10 /* advance the page counter by one block */
368 jg LABEL(nibble_ashr_1_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
369
370 LABEL(nibble_ashr_1_restart_use):
371 movdqa (%rdi, %rdx), %xmm0
372 palignr $1, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 1..15 of the previous %rdi block followed by the first 1 byte of the current one */
373 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
374 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
375 #else
376 movdqa (%rsi,%rdx), %xmm1
377 TOLOWER (%xmm0, %xmm1)
378 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
379 #endif
380 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
381 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
382 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
383 jbe LABEL(strcmp_exitz) /* limit exhausted */
384 #endif
385
386 add $16, %rdx
387 add $16, %r10
388 jg LABEL(nibble_ashr_1_use)
389
390 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
391 palignr $1, -16(%rdi, %rdx), D(%xmm0)
392 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
393 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
394 #else
395 movdqa (%rsi,%rdx), %xmm1
396 TOLOWER (%xmm0, %xmm1)
397 pcmpistri $0x1a, %xmm1, %xmm0
398 #endif
399 jbe LABEL(exit_use)
400 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
401 sub $16, %r11
402 jbe LABEL(strcmp_exitz)
403 #endif
404 add $16, %rdx
405 jmp LABEL(loop_ashr_1_use)
406
407 .p2align 4
408 LABEL(nibble_ashr_1_use):
409 sub $0x1000, %r10 /* re-arm the counter for the next page */
410 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
411 psrldq $1, D(%xmm0) /* keep its 15 not-yet-compared bytes; zeros fill the top */
412 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (15 if only the shifted-in zeros) */
413 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
414 cmp %r11, %rcx
415 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
416 #endif
417 cmp $14, %ecx
418 ja LABEL(nibble_ashr_1_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
419
420 jmp LABEL(nibble_ashr_exit_use)
421
422 /*
423 * The following cases will be handled by ashr_2, i.e. the %rsi offset exceeds the %rdi offset by 14 (after any swap in crosscache)
424 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
425 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
426 */
427 .p2align 4
428 LABEL(ashr_2):
429 pslldq $14, D(%xmm2) /* shift first string to align with second */
430 TOLOWER (%xmm1, %xmm2)
431 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
432 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
433 pmovmskb %xmm2, %r9d
434 shr %cl, %edx /* adjust 0xffff for offset */
435 shr %cl, %r9d /* adjust for 16-byte offset */
436 sub %r9d, %edx
437 jnz LABEL(less32bytes) /* mismatch or null char seen */
438 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
439 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
440
441 mov $16, %rcx /* index for loads */
442 mov $2, %r9d /* byte position left over from less32bytes case */
443 /*
444 * Setup %r10 value allows us to detect crossing a page boundary.
445 * When %r10 goes positive we have crossed a page boundary and
446 * need to do a nibble.
447 */
448 lea 2(%rdi), %r10
449 and $0xfff, %r10 /* offset into 4K page */
450 sub $0x1000, %r10 /* subtract 4K pagesize */
451 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
452
453 .p2align 4
454 LABEL(loop_ashr_2_use):
455 add $16, %r10 /* advance the page counter by one block */
456 jg LABEL(nibble_ashr_2_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
457
458 LABEL(nibble_ashr_2_restart_use):
459 movdqa (%rdi, %rdx), %xmm0
460 palignr $2, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 2..15 of the previous %rdi block followed by the first 2 bytes of the current one */
461 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
462 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
463 #else
464 movdqa (%rsi,%rdx), %xmm1
465 TOLOWER (%xmm0, %xmm1)
466 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
467 #endif
468 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
469 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
470 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
471 jbe LABEL(strcmp_exitz) /* limit exhausted */
472 #endif
473
474 add $16, %rdx
475 add $16, %r10
476 jg LABEL(nibble_ashr_2_use)
477
478 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
479 palignr $2, -16(%rdi, %rdx), D(%xmm0)
480 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
481 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
482 #else
483 movdqa (%rsi,%rdx), %xmm1
484 TOLOWER (%xmm0, %xmm1)
485 pcmpistri $0x1a, %xmm1, %xmm0
486 #endif
487 jbe LABEL(exit_use)
488 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
489 sub $16, %r11
490 jbe LABEL(strcmp_exitz)
491 #endif
492 add $16, %rdx
493 jmp LABEL(loop_ashr_2_use)
494
495 .p2align 4
496 LABEL(nibble_ashr_2_use):
497 sub $0x1000, %r10 /* re-arm the counter for the next page */
498 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
499 psrldq $2, D(%xmm0) /* keep its 14 not-yet-compared bytes; zeros fill the top */
500 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (14 if only the shifted-in zeros) */
501 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
502 cmp %r11, %rcx
503 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
504 #endif
505 cmp $13, %ecx
506 ja LABEL(nibble_ashr_2_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
507
508 jmp LABEL(nibble_ashr_exit_use)
509
510 /*
511 * The following cases will be handled by ashr_3, i.e. the %rsi offset exceeds the %rdi offset by 13 (after any swap in crosscache)
512 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
513 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
514 */
515 .p2align 4
516 LABEL(ashr_3):
517 pslldq $13, D(%xmm2) /* shift first string to align with second */
518 TOLOWER (%xmm1, %xmm2)
519 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
520 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
521 pmovmskb %xmm2, %r9d
522 shr %cl, %edx /* adjust 0xffff for offset */
523 shr %cl, %r9d /* adjust for 16-byte offset */
524 sub %r9d, %edx
525 jnz LABEL(less32bytes) /* mismatch or null char seen */
526 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
527
528 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
529
530 mov $16, %rcx /* index for loads */
531 mov $3, %r9d /* byte position left over from less32bytes case */
532 /*
533 * Setup %r10 value allows us to detect crossing a page boundary.
534 * When %r10 goes positive we have crossed a page boundary and
535 * need to do a nibble.
536 */
537 lea 3(%rdi), %r10
538 and $0xfff, %r10 /* offset into 4K page */
539 sub $0x1000, %r10 /* subtract 4K pagesize */
540 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
541 .p2align 4 /* align the loop head like every other loop_ashr_N_use in this file */
542 LABEL(loop_ashr_3_use):
543 add $16, %r10 /* advance the page counter by one block */
544 jg LABEL(nibble_ashr_3_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
545
546 LABEL(nibble_ashr_3_restart_use):
547 movdqa (%rdi, %rdx), %xmm0
548 palignr $3, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 3..15 of the previous %rdi block followed by the first 3 bytes of the current one */
549 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
550 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
551 #else
552 movdqa (%rsi,%rdx), %xmm1
553 TOLOWER (%xmm0, %xmm1)
554 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
555 #endif
556 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
557 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
558 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
559 jbe LABEL(strcmp_exitz) /* limit exhausted */
560 #endif
561
562 add $16, %rdx
563 add $16, %r10
564 jg LABEL(nibble_ashr_3_use)
565
566 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
567 palignr $3, -16(%rdi, %rdx), D(%xmm0)
568 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
569 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
570 #else
571 movdqa (%rsi,%rdx), %xmm1
572 TOLOWER (%xmm0, %xmm1)
573 pcmpistri $0x1a, %xmm1, %xmm0
574 #endif
575 jbe LABEL(exit_use)
576 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
577 sub $16, %r11
578 jbe LABEL(strcmp_exitz)
579 #endif
580 add $16, %rdx
581 jmp LABEL(loop_ashr_3_use)
582
583 .p2align 4
584 LABEL(nibble_ashr_3_use):
585 sub $0x1000, %r10 /* re-arm the counter for the next page */
586 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
587 psrldq $3, D(%xmm0) /* keep its 13 not-yet-compared bytes; zeros fill the top */
588 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (13 if only the shifted-in zeros) */
589 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
590 cmp %r11, %rcx
591 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
592 #endif
593 cmp $12, %ecx
594 ja LABEL(nibble_ashr_3_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
595
596 jmp LABEL(nibble_ashr_exit_use)
597
598 /*
599 * The following cases will be handled by ashr_4, i.e. the %rsi offset exceeds the %rdi offset by 12 (after any swap in crosscache)
600 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
601 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
602 */
603 .p2align 4
604 LABEL(ashr_4):
605 pslldq $12, D(%xmm2) /* shift first string to align with second */
606 TOLOWER (%xmm1, %xmm2)
607 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
608 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
609 pmovmskb %xmm2, %r9d
610 shr %cl, %edx /* adjust 0xffff for offset */
611 shr %cl, %r9d /* adjust for 16-byte offset */
612 sub %r9d, %edx
613 jnz LABEL(less32bytes) /* mismatch or null char seen */
614 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
615
616 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
617
618 mov $16, %rcx /* index for loads */
619 mov $4, %r9d /* byte position left over from less32bytes case */
620 /*
621 * Setup %r10 value allows us to detect crossing a page boundary.
622 * When %r10 goes positive we have crossed a page boundary and
623 * need to do a nibble.
624 */
625 lea 4(%rdi), %r10
626 and $0xfff, %r10 /* offset into 4K page */
627 sub $0x1000, %r10 /* subtract 4K pagesize */
628 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
629
630 .p2align 4
631 LABEL(loop_ashr_4_use):
632 add $16, %r10 /* advance the page counter by one block */
633 jg LABEL(nibble_ashr_4_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
634
635 LABEL(nibble_ashr_4_restart_use):
636 movdqa (%rdi, %rdx), %xmm0
637 palignr $4, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 4..15 of the previous %rdi block followed by the first 4 bytes of the current one */
638 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
639 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
640 #else
641 movdqa (%rsi,%rdx), %xmm1
642 TOLOWER (%xmm0, %xmm1)
643 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
644 #endif
645 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
646 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
647 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
648 jbe LABEL(strcmp_exitz) /* limit exhausted */
649 #endif
650
651 add $16, %rdx
652 add $16, %r10
653 jg LABEL(nibble_ashr_4_use)
654
655 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
656 palignr $4, -16(%rdi, %rdx), D(%xmm0)
657 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
658 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
659 #else
660 movdqa (%rsi,%rdx), %xmm1
661 TOLOWER (%xmm0, %xmm1)
662 pcmpistri $0x1a, %xmm1, %xmm0
663 #endif
664 jbe LABEL(exit_use)
665 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
666 sub $16, %r11
667 jbe LABEL(strcmp_exitz)
668 #endif
669 add $16, %rdx
670 jmp LABEL(loop_ashr_4_use)
671
672 .p2align 4
673 LABEL(nibble_ashr_4_use):
674 sub $0x1000, %r10 /* re-arm the counter for the next page */
675 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
676 psrldq $4, D(%xmm0) /* keep its 12 not-yet-compared bytes; zeros fill the top */
677 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (12 if only the shifted-in zeros) */
678 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
679 cmp %r11, %rcx
680 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
681 #endif
682 cmp $11, %ecx
683 ja LABEL(nibble_ashr_4_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
684
685 jmp LABEL(nibble_ashr_exit_use)
686
687 /*
688 * The following cases will be handled by ashr_5, i.e. the %rsi offset exceeds the %rdi offset by 11 (after any swap in crosscache)
689 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
690 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
691 */
692 .p2align 4
693 LABEL(ashr_5):
694 pslldq $11, D(%xmm2) /* shift first string to align with second */
695 TOLOWER (%xmm1, %xmm2)
696 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
697 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
698 pmovmskb %xmm2, %r9d
699 shr %cl, %edx /* adjust 0xffff for offset */
700 shr %cl, %r9d /* adjust for 16-byte offset */
701 sub %r9d, %edx
702 jnz LABEL(less32bytes) /* mismatch or null char seen */
703 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
704
705 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
706
707 mov $16, %rcx /* index for loads */
708 mov $5, %r9d /* byte position left over from less32bytes case */
709 /*
710 * Setup %r10 value allows us to detect crossing a page boundary.
711 * When %r10 goes positive we have crossed a page boundary and
712 * need to do a nibble.
713 */
714 lea 5(%rdi), %r10
715 and $0xfff, %r10 /* offset into 4K page */
716 sub $0x1000, %r10 /* subtract 4K pagesize */
717 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
718
719 .p2align 4
720 LABEL(loop_ashr_5_use):
721 add $16, %r10 /* advance the page counter by one block */
722 jg LABEL(nibble_ashr_5_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
723
724 LABEL(nibble_ashr_5_restart_use):
725 movdqa (%rdi, %rdx), %xmm0
726 palignr $5, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 5..15 of the previous %rdi block followed by the first 5 bytes of the current one */
727 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
728 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
729 #else
730 movdqa (%rsi,%rdx), %xmm1
731 TOLOWER (%xmm0, %xmm1)
732 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
733 #endif
734 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
735 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
736 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
737 jbe LABEL(strcmp_exitz) /* limit exhausted */
738 #endif
739
740 add $16, %rdx
741 add $16, %r10
742 jg LABEL(nibble_ashr_5_use)
743
744 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
745
746 palignr $5, -16(%rdi, %rdx), D(%xmm0)
747 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
748 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
749 #else
750 movdqa (%rsi,%rdx), %xmm1
751 TOLOWER (%xmm0, %xmm1)
752 pcmpistri $0x1a, %xmm1, %xmm0
753 #endif
754 jbe LABEL(exit_use)
755 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
756 sub $16, %r11
757 jbe LABEL(strcmp_exitz)
758 #endif
759 add $16, %rdx
760 jmp LABEL(loop_ashr_5_use)
761
762 .p2align 4
763 LABEL(nibble_ashr_5_use):
764 sub $0x1000, %r10 /* re-arm the counter for the next page */
765 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
766 psrldq $5, D(%xmm0) /* keep its 11 not-yet-compared bytes; zeros fill the top */
767 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (11 if only the shifted-in zeros) */
768 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
769 cmp %r11, %rcx
770 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
771 #endif
772 cmp $10, %ecx
773 ja LABEL(nibble_ashr_5_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
774
775 jmp LABEL(nibble_ashr_exit_use)
776
777 /*
778 * The following cases will be handled by ashr_6, i.e. the %rsi offset exceeds the %rdi offset by 10 (after any swap in crosscache)
779 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
780 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
781 */
782 .p2align 4
783 LABEL(ashr_6):
784 pslldq $10, D(%xmm2) /* shift first string to align with second */
785 TOLOWER (%xmm1, %xmm2)
786 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
787 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
788 pmovmskb %xmm2, %r9d
789 shr %cl, %edx /* adjust 0xffff for offset */
790 shr %cl, %r9d /* adjust for 16-byte offset */
791 sub %r9d, %edx
792 jnz LABEL(less32bytes) /* mismatch or null char seen */
793 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
794
795 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
796
797 mov $16, %rcx /* index for loads */
798 mov $6, %r9d /* byte position left over from less32bytes case */
799 /*
800 * Setup %r10 value allows us to detect crossing a page boundary.
801 * When %r10 goes positive we have crossed a page boundary and
802 * need to do a nibble.
803 */
804 lea 6(%rdi), %r10
805 and $0xfff, %r10 /* offset into 4K page */
806 sub $0x1000, %r10 /* subtract 4K pagesize */
807 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
808
809 .p2align 4
810 LABEL(loop_ashr_6_use):
811 add $16, %r10 /* advance the page counter by one block */
812 jg LABEL(nibble_ashr_6_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
813
814 LABEL(nibble_ashr_6_restart_use):
815 movdqa (%rdi, %rdx), %xmm0
816 palignr $6, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 6..15 of the previous %rdi block followed by the first 6 bytes of the current one */
817 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
818 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
819 #else
820 movdqa (%rsi,%rdx), %xmm1
821 TOLOWER (%xmm0, %xmm1)
822 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
823 #endif
824 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
825 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
826 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
827 jbe LABEL(strcmp_exitz) /* limit exhausted */
828 #endif
829
830 add $16, %rdx
831 add $16, %r10
832 jg LABEL(nibble_ashr_6_use)
833
834 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
835 palignr $6, -16(%rdi, %rdx), D(%xmm0)
836 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
837 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
838 #else
839 movdqa (%rsi,%rdx), %xmm1
840 TOLOWER (%xmm0, %xmm1)
841 pcmpistri $0x1a, %xmm1, %xmm0
842 #endif
843 jbe LABEL(exit_use)
844 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
845 sub $16, %r11
846 jbe LABEL(strcmp_exitz)
847 #endif
848 add $16, %rdx
849 jmp LABEL(loop_ashr_6_use)
850
851 .p2align 4
852 LABEL(nibble_ashr_6_use):
853 sub $0x1000, %r10 /* re-arm the counter for the next page */
854 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
855 psrldq $6, D(%xmm0) /* keep its 10 not-yet-compared bytes; zeros fill the top */
856 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (10 if only the shifted-in zeros) */
857 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
858 cmp %r11, %rcx
859 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
860 #endif
861 cmp $9, %ecx
862 ja LABEL(nibble_ashr_6_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
863
864 jmp LABEL(nibble_ashr_exit_use)
865
866 /*
867 * The following cases will be handled by ashr_7, i.e. the %rsi offset exceeds the %rdi offset by 9 (after any swap in crosscache)
868 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
869 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
870 */
871 .p2align 4
872 LABEL(ashr_7):
873 pslldq $9, D(%xmm2) /* shift first string to align with second */
874 TOLOWER (%xmm1, %xmm2)
875 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
876 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
877 pmovmskb %xmm2, %r9d
878 shr %cl, %edx /* adjust 0xffff for offset */
879 shr %cl, %r9d /* adjust for 16-byte offset */
880 sub %r9d, %edx
881 jnz LABEL(less32bytes) /* mismatch or null char seen */
882 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
883
884 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
885
886 mov $16, %rcx /* index for loads */
887 mov $7, %r9d /* byte position left over from less32bytes case */
888 /*
889 * Setup %r10 value allows us to detect crossing a page boundary.
890 * When %r10 goes positive we have crossed a page boundary and
891 * need to do a nibble.
892 */
893 lea 7(%rdi), %r10
894 and $0xfff, %r10 /* offset into 4K page */
895 sub $0x1000, %r10 /* subtract 4K pagesize */
896 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
897
898 .p2align 4
899 LABEL(loop_ashr_7_use):
900 add $16, %r10 /* advance the page counter by one block */
901 jg LABEL(nibble_ashr_7_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
902
903 LABEL(nibble_ashr_7_restart_use):
904 movdqa (%rdi, %rdx), %xmm0
905 palignr $7, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 7..15 of the previous %rdi block followed by the first 7 bytes of the current one */
906 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
907 pcmpistri $0x1a,(%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
908 #else
909 movdqa (%rsi,%rdx), %xmm1
910 TOLOWER (%xmm0, %xmm1)
911 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
912 #endif
913 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
914 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
915 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
916 jbe LABEL(strcmp_exitz) /* limit exhausted */
917 #endif
918
919 add $16, %rdx
920 add $16, %r10
921 jg LABEL(nibble_ashr_7_use)
922
923 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
924 palignr $7, -16(%rdi, %rdx), D(%xmm0)
925 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
926 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
927 #else
928 movdqa (%rsi,%rdx), %xmm1
929 TOLOWER (%xmm0, %xmm1)
930 pcmpistri $0x1a, %xmm1, %xmm0
931 #endif
932 jbe LABEL(exit_use)
933 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
934 sub $16, %r11
935 jbe LABEL(strcmp_exitz)
936 #endif
937 add $16, %rdx
938 jmp LABEL(loop_ashr_7_use)
939
940 .p2align 4
941 LABEL(nibble_ashr_7_use):
942 sub $0x1000, %r10 /* re-arm the counter for the next page */
943 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
944 psrldq $7, D(%xmm0) /* keep its 9 not-yet-compared bytes; zeros fill the top */
945 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (9 if only the shifted-in zeros) */
946 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
947 cmp %r11, %rcx
948 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
949 #endif
950 cmp $8, %ecx
951 ja LABEL(nibble_ashr_7_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
952
953 jmp LABEL(nibble_ashr_exit_use)
954
955 /*
956 * The following cases will be handled by ashr_8, i.e. the %rsi offset exceeds the %rdi offset by 8 (after any swap in crosscache)
957 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
958 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
959 */
960 .p2align 4
961 LABEL(ashr_8):
962 pslldq $8, D(%xmm2) /* shift first string to align with second */
963 TOLOWER (%xmm1, %xmm2)
964 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
965 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
966 pmovmskb %xmm2, %r9d
967 shr %cl, %edx /* adjust 0xffff for offset */
968 shr %cl, %r9d /* adjust for 16-byte offset */
969 sub %r9d, %edx
970 jnz LABEL(less32bytes) /* mismatch or null char seen */
971 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
972
973 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
974
975 mov $16, %rcx /* index for loads */
976 mov $8, %r9d /* byte position left over from less32bytes case */
977 /*
978 * Setup %r10 value allows us to detect crossing a page boundary.
979 * When %r10 goes positive we have crossed a page boundary and
980 * need to do a nibble.
981 */
982 lea 8(%rdi), %r10
983 and $0xfff, %r10 /* offset into 4K page */
984 sub $0x1000, %r10 /* subtract 4K pagesize */
985 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
986
987 .p2align 4
988 LABEL(loop_ashr_8_use):
989 add $16, %r10 /* advance the page counter by one block */
990 jg LABEL(nibble_ashr_8_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
991
992 LABEL(nibble_ashr_8_restart_use):
993 movdqa (%rdi, %rdx), %xmm0
994 palignr $8, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 8..15 of the previous %rdi block followed by the first 8 bytes of the current one */
995 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
996 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
997 #else
998 movdqa (%rsi,%rdx), %xmm1
999 TOLOWER (%xmm0, %xmm1)
1000 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
1001 #endif
1002 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
1003 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1004 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
1005 jbe LABEL(strcmp_exitz) /* limit exhausted */
1006 #endif
1007
1008 add $16, %rdx
1009 add $16, %r10
1010 jg LABEL(nibble_ashr_8_use)
1011
1012 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
1013 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1014 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1015 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1016 #else
1017 movdqa (%rsi,%rdx), %xmm1
1018 TOLOWER (%xmm0, %xmm1)
1019 pcmpistri $0x1a, %xmm1, %xmm0
1020 #endif
1021 jbe LABEL(exit_use)
1022 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1023 sub $16, %r11
1024 jbe LABEL(strcmp_exitz)
1025 #endif
1026 add $16, %rdx
1027 jmp LABEL(loop_ashr_8_use)
1028
1029 .p2align 4
1030 LABEL(nibble_ashr_8_use):
1031 sub $0x1000, %r10 /* re-arm the counter for the next page */
1032 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
1033 psrldq $8, D(%xmm0) /* keep its 8 not-yet-compared bytes; zeros fill the top */
1034 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (8 if only the shifted-in zeros) */
1035 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1036 cmp %r11, %rcx
1037 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
1038 #endif
1039 cmp $7, %ecx
1040 ja LABEL(nibble_ashr_8_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
1041
1042 jmp LABEL(nibble_ashr_exit_use)
1043
1044 /*
1045 * The following cases will be handled by ashr_9, i.e. the %rsi offset exceeds the %rdi offset by 7 (after any swap in crosscache)
1046 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1047 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1048 */
1049 .p2align 4
1050 LABEL(ashr_9):
1051 pslldq $7, D(%xmm2) /* shift first string to align with second */
1052 TOLOWER (%xmm1, %xmm2)
1053 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
1054 psubb %xmm0, D(%xmm2) /* packed sub of comparison results */
1055 pmovmskb %xmm2, %r9d
1056 shr %cl, %edx /* adjust 0xffff for offset */
1057 shr %cl, %r9d /* adjust for 16-byte offset */
1058 sub %r9d, %edx
1059 jnz LABEL(less32bytes) /* mismatch or null char seen */
1060 movdqa (%rdi), %xmm3 /* NOTE(review): %xmm3 is not read in the visible part of this file */
1061
1062 UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
1063
1064 mov $16, %rcx /* index for loads */
1065 mov $9, %r9d /* byte position left over from less32bytes case */
1066 /*
1067 * Setup %r10 value allows us to detect crossing a page boundary.
1068 * When %r10 goes positive we have crossed a page boundary and
1069 * need to do a nibble.
1070 */
1071 lea 9(%rdi), %r10
1072 and $0xfff, %r10 /* offset into 4K page */
1073 sub $0x1000, %r10 /* subtract 4K pagesize */
1074 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1075
1076 .p2align 4
1077 LABEL(loop_ashr_9_use):
1078 add $16, %r10 /* advance the page counter by one block */
1079 jg LABEL(nibble_ashr_9_use) /* positive: the block at (%rdi,%rdx) starts a new page, check the leftover bytes first */
1080
1081 LABEL(nibble_ashr_9_restart_use):
1082 movdqa (%rdi, %rdx), %xmm0
1083
1084 palignr $9, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 9..15 of the previous %rdi block followed by the first 9 bytes of the current one */
1085 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1086 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
1087 #else
1088 movdqa (%rsi,%rdx), %xmm1
1089 TOLOWER (%xmm0, %xmm1)
1090 pcmpistri $0x1a, %xmm1, %xmm0 /* sets ECX/CF/ZF as in the table at the top of the file */
1091 #endif
1092 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS in this block */
1093 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1094 sub $16, %r11 /* block matched: charge 16 bytes against the limit */
1095 jbe LABEL(strcmp_exitz) /* limit exhausted */
1096 #endif
1097
1098 add $16, %rdx
1099 add $16, %r10
1100 jg LABEL(nibble_ashr_9_use)
1101
1102 movdqa (%rdi, %rdx), %xmm0 /* second, unrolled copy of the step above */
1103 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1104 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1105 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1106 #else
1107 movdqa (%rsi,%rdx), %xmm1
1108 TOLOWER (%xmm0, %xmm1)
1109 pcmpistri $0x1a, %xmm1, %xmm0
1110 #endif
1111 jbe LABEL(exit_use)
1112 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1113 sub $16, %r11
1114 jbe LABEL(strcmp_exitz)
1115 #endif
1116 add $16, %rdx
1117 jmp LABEL(loop_ashr_9_use)
1118
1119 .p2align 4
1120 LABEL(nibble_ashr_9_use):
1121 sub $0x1000, %r10 /* re-arm the counter for the next page */
1122 movdqa -16(%rdi, %rdx), %xmm0 /* previous %rdi block, which has already been read */
1123 psrldq $9, D(%xmm0) /* keep its 7 not-yet-compared bytes; zeros fill the top */
1124 pcmpistri $0x3a,%xmm0, %xmm0 /* comparing the register with itself: ecx = index of its first NUL (7 if only the shifted-in zeros) */
1125 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1126 cmp %r11, %rcx
1127 jae LABEL(nibble_ashr_exit_use) /* remaining limit does not reach past these bytes */
1128 #endif
1129 cmp $6, %ecx
1130 ja LABEL(nibble_ashr_9_restart_use) /* no NUL in the leftover bytes: go on and load the next block */
1131
1132 jmp LABEL(nibble_ashr_exit_use)
1133
1134 /*
1135 * The following cases will be handled by ashr_10
1136 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1137 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
 *
 * Section overview (derived from the code below):
 *  - ashr_10: byte-shift %xmm2 left by 6, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_10_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $10 from two aligned blocks.
 *  - nibble_ashr_10_use: entered when %r10 goes positive; looks for a NUL
 *    in the 6 bytes still pending from the previous %rdi block before the
 *    next block is loaded.
1138 */
1139 .p2align 4
1140 LABEL(ashr_10):
1141 pslldq $6, D(%xmm2) /* shift %xmm2 left by 6 bytes */
1142 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1143 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1144 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1145 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1146 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1147 shr %cl, %r9d /* ... and of the comparison mask */
1148 sub %r9d, %edx /* zero iff both masks agree */
1149 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1150 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1151
1152 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1153
1154 mov $16, %rcx /* index for loads */
1155 mov $10, %r9d /* byte position left over from less32bytes case */
1156 /*
1157 * Set up %r10 so that crossing a page boundary can be detected.
1158 * When %r10 goes positive we have crossed a page boundary and
1159 * need to do a nibble.
1160 */
1161 lea 10(%rdi), %r10
1162 and $0xfff, %r10 /* offset into 4K page */
1163 sub $0x1000, %r10 /* subtract 4K pagesize */
1164 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1165
1166 .p2align 4
1167 LABEL(loop_ashr_10_use):
1168 add $16, %r10 /* advance the page tracker by one block */
1169 jg LABEL(nibble_ashr_10_use) /* went positive: check the pending bytes first */
1170
1171 LABEL(nibble_ashr_10_restart_use):
1172 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1173 palignr $10, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 10..25 of {previous block, this block} */
1174 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1175 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1176 #else
1177 movdqa (%rsi,%rdx), %xmm1
1178 TOLOWER (%xmm0, %xmm1)
1179 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1180 #endif
1181 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1182 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1183 sub $16, %r11 /* 16 more bytes compared equal */
1184 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1185 #endif
1186
1187 add $16, %rdx
1188 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1189 jg LABEL(nibble_ashr_10_use)
1190
1191 movdqa (%rdi, %rdx), %xmm0
1192 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1193 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1194 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1195 #else
1196 movdqa (%rsi,%rdx), %xmm1
1197 TOLOWER (%xmm0, %xmm1)
1198 pcmpistri $0x1a, %xmm1, %xmm0
1199 #endif
1200 jbe LABEL(exit_use)
1201 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1202 sub $16, %r11
1203 jbe LABEL(strcmp_exitz)
1204 #endif
1205 add $16, %rdx
1206 jmp LABEL(loop_ashr_10_use)
1207
1208 .p2align 4
1209 LABEL(nibble_ashr_10_use):
1210 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1211 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1212 psrldq $10, D(%xmm0) /* keep its 6 pending bytes, zero-fill the rest */
1213 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1214 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1215 cmp %r11, %rcx
1216 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1217 #endif
1218 cmp $5, %ecx
1219 ja LABEL(nibble_ashr_10_restart_use) /* no NUL in the 6 pending bytes: safe to load the next block */
1220
1221 jmp LABEL(nibble_ashr_exit_use) /* NUL within the pending bytes: finish without the next block */
1222
1223 /*
1224 * The following cases will be handled by ashr_11
1225 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1226 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
 *
 * Section overview (derived from the code below):
 *  - ashr_11: byte-shift %xmm2 left by 5, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_11_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $11 from two aligned blocks.
 *  - nibble_ashr_11_use: entered when %r10 goes positive; looks for a NUL
 *    in the 5 bytes still pending from the previous %rdi block before the
 *    next block is loaded.
1227 */
1228 .p2align 4
1229 LABEL(ashr_11):
1230 pslldq $5, D(%xmm2) /* shift %xmm2 left by 5 bytes */
1231 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1232 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1233 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1234 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1235 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1236 shr %cl, %r9d /* ... and of the comparison mask */
1237 sub %r9d, %edx /* zero iff both masks agree */
1238 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1239 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1240
1241 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1242
1243 mov $16, %rcx /* index for loads */
1244 mov $11, %r9d /* byte position left over from less32bytes case */
1245 /*
1246 * Set up %r10 so that crossing a page boundary can be detected.
1247 * When %r10 goes positive we have crossed a page boundary and
1248 * need to do a nibble.
1249 */
1250 lea 11(%rdi), %r10
1251 and $0xfff, %r10 /* offset into 4K page */
1252 sub $0x1000, %r10 /* subtract 4K pagesize */
1253 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1254
1255 .p2align 4
1256 LABEL(loop_ashr_11_use):
1257 add $16, %r10 /* advance the page tracker by one block */
1258 jg LABEL(nibble_ashr_11_use) /* went positive: check the pending bytes first */
1259
1260 LABEL(nibble_ashr_11_restart_use):
1261 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1262 palignr $11, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 11..26 of {previous block, this block} */
1263 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1264 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1265 #else
1266 movdqa (%rsi,%rdx), %xmm1
1267 TOLOWER (%xmm0, %xmm1)
1268 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1269 #endif
1270 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1271 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1272 sub $16, %r11 /* 16 more bytes compared equal */
1273 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1274 #endif
1275
1276 add $16, %rdx
1277 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1278 jg LABEL(nibble_ashr_11_use)
1279
1280 movdqa (%rdi, %rdx), %xmm0
1281 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1282 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1283 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1284 #else
1285 movdqa (%rsi,%rdx), %xmm1
1286 TOLOWER (%xmm0, %xmm1)
1287 pcmpistri $0x1a, %xmm1, %xmm0
1288 #endif
1289 jbe LABEL(exit_use)
1290 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1291 sub $16, %r11
1292 jbe LABEL(strcmp_exitz)
1293 #endif
1294 add $16, %rdx
1295 jmp LABEL(loop_ashr_11_use)
1296
1297 .p2align 4
1298 LABEL(nibble_ashr_11_use):
1299 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1300 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1301 psrldq $11, D(%xmm0) /* keep its 5 pending bytes, zero-fill the rest */
1302 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1303 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1304 cmp %r11, %rcx
1305 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1306 #endif
1307 cmp $4, %ecx
1308 ja LABEL(nibble_ashr_11_restart_use) /* no NUL in the 5 pending bytes: safe to load the next block */
1309
1310 jmp LABEL(nibble_ashr_exit_use) /* NUL within the pending bytes: finish without the next block */
1311
1312 /*
1313 * The following cases will be handled by ashr_12
1314 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1315 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
 *
 * Section overview (derived from the code below):
 *  - ashr_12: byte-shift %xmm2 left by 4, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_12_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $12 from two aligned blocks.
 *  - nibble_ashr_12_use: entered when %r10 goes positive; looks for a NUL
 *    in the 4 bytes still pending from the previous %rdi block before the
 *    next block is loaded.
1316 */
1317 .p2align 4
1318 LABEL(ashr_12):
1319 pslldq $4, D(%xmm2) /* shift %xmm2 left by 4 bytes */
1320 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1321 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1322 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1323 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1324 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1325 shr %cl, %r9d /* ... and of the comparison mask */
1326 sub %r9d, %edx /* zero iff both masks agree */
1327 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1328 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1329
1330 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1331
1332 mov $16, %rcx /* index for loads */
1333 mov $12, %r9d /* byte position left over from less32bytes case */
1334 /*
1335 * Set up %r10 so that crossing a page boundary can be detected.
1336 * When %r10 goes positive we have crossed a page boundary and
1337 * need to do a nibble.
1338 */
1339 lea 12(%rdi), %r10
1340 and $0xfff, %r10 /* offset into 4K page */
1341 sub $0x1000, %r10 /* subtract 4K pagesize */
1342 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1343
1344 .p2align 4
1345 LABEL(loop_ashr_12_use):
1346 add $16, %r10 /* advance the page tracker by one block */
1347 jg LABEL(nibble_ashr_12_use) /* went positive: check the pending bytes first */
1348
1349 LABEL(nibble_ashr_12_restart_use):
1350 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1351 palignr $12, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 12..27 of {previous block, this block} */
1352 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1353 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1354 #else
1355 movdqa (%rsi,%rdx), %xmm1
1356 TOLOWER (%xmm0, %xmm1)
1357 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1358 #endif
1359 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1360 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1361 sub $16, %r11 /* 16 more bytes compared equal */
1362 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1363 #endif
1364
1365 add $16, %rdx
1366 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1367 jg LABEL(nibble_ashr_12_use)
1368
1369 movdqa (%rdi, %rdx), %xmm0
1370 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1371 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1372 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1373 #else
1374 movdqa (%rsi,%rdx), %xmm1
1375 TOLOWER (%xmm0, %xmm1)
1376 pcmpistri $0x1a, %xmm1, %xmm0
1377 #endif
1378 jbe LABEL(exit_use)
1379 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1380 sub $16, %r11
1381 jbe LABEL(strcmp_exitz)
1382 #endif
1383 add $16, %rdx
1384 jmp LABEL(loop_ashr_12_use)
1385
1386 .p2align 4
1387 LABEL(nibble_ashr_12_use):
1388 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1389 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1390 psrldq $12, D(%xmm0) /* keep its 4 pending bytes, zero-fill the rest */
1391 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1392 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1393 cmp %r11, %rcx
1394 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1395 #endif
1396 cmp $3, %ecx
1397 ja LABEL(nibble_ashr_12_restart_use) /* no NUL in the 4 pending bytes: safe to load the next block */
1398
1399 jmp LABEL(nibble_ashr_exit_use) /* NUL within the pending bytes: finish without the next block */
1400
1401 /*
1402 * The following cases will be handled by ashr_13
1403 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1404 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
 *
 * Section overview (derived from the code below):
 *  - ashr_13: byte-shift %xmm2 left by 3, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_13_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $13 from two aligned blocks.
 *  - nibble_ashr_13_use: entered when %r10 goes positive; looks for a NUL
 *    in the 3 bytes still pending from the previous %rdi block before the
 *    next block is loaded.
1405 */
1406 .p2align 4
1407 LABEL(ashr_13):
1408 pslldq $3, D(%xmm2) /* shift %xmm2 left by 3 bytes */
1409 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1410 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1411 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1412 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1413 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1414 shr %cl, %r9d /* ... and of the comparison mask */
1415 sub %r9d, %edx /* zero iff both masks agree */
1416 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1417 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1418
1419 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1420
1421 mov $16, %rcx /* index for loads */
1422 mov $13, %r9d /* byte position left over from less32bytes case */
1423 /*
1424 * Set up %r10 so that crossing a page boundary can be detected.
1425 * When %r10 goes positive we have crossed a page boundary and
1426 * need to do a nibble.
1427 */
1428 lea 13(%rdi), %r10
1429 and $0xfff, %r10 /* offset into 4K page */
1430 sub $0x1000, %r10 /* subtract 4K pagesize */
1431
1432 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1433
1434 .p2align 4
1435 LABEL(loop_ashr_13_use):
1436 add $16, %r10 /* advance the page tracker by one block */
1437 jg LABEL(nibble_ashr_13_use) /* went positive: check the pending bytes first */
1438
1439 LABEL(nibble_ashr_13_restart_use):
1440 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1441 palignr $13, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 13..28 of {previous block, this block} */
1442 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1443 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1444 #else
1445 movdqa (%rsi,%rdx), %xmm1
1446 TOLOWER (%xmm0, %xmm1)
1447 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1448 #endif
1449 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1450 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1451 sub $16, %r11 /* 16 more bytes compared equal */
1452 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1453 #endif
1454
1455 add $16, %rdx
1456 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1457 jg LABEL(nibble_ashr_13_use)
1458
1459 movdqa (%rdi, %rdx), %xmm0
1460 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1461 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1462 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1463 #else
1464 movdqa (%rsi,%rdx), %xmm1
1465 TOLOWER (%xmm0, %xmm1)
1466 pcmpistri $0x1a, %xmm1, %xmm0
1467 #endif
1468 jbe LABEL(exit_use)
1469 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1470 sub $16, %r11
1471 jbe LABEL(strcmp_exitz)
1472 #endif
1473 add $16, %rdx
1474 jmp LABEL(loop_ashr_13_use)
1475
1476 .p2align 4
1477 LABEL(nibble_ashr_13_use):
1478 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1479 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1480 psrldq $13, D(%xmm0) /* keep its 3 pending bytes, zero-fill the rest */
1481 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1482 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1483 cmp %r11, %rcx
1484 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1485 #endif
1486 cmp $2, %ecx
1487 ja LABEL(nibble_ashr_13_restart_use) /* no NUL in the 3 pending bytes: safe to load the next block */
1488
1489 jmp LABEL(nibble_ashr_exit_use) /* NUL within the pending bytes: finish without the next block */
1490
1491 /*
1492 * The following cases will be handled by ashr_14
1493 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1494 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
 *
 * Section overview (derived from the code below):
 *  - ashr_14: byte-shift %xmm2 left by 2, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_14_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $14 from two aligned blocks.
 *  - nibble_ashr_14_use: entered when %r10 goes positive; looks for a NUL
 *    in the 2 bytes still pending from the previous %rdi block before the
 *    next block is loaded.
1495 */
1496 .p2align 4
1497 LABEL(ashr_14):
1498 pslldq $2, D(%xmm2) /* shift %xmm2 left by 2 bytes */
1499 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1500 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1501 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1502 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1503 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1504 shr %cl, %r9d /* ... and of the comparison mask */
1505 sub %r9d, %edx /* zero iff both masks agree */
1506 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1507 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1508
1509 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1510
1511 mov $16, %rcx /* index for loads */
1512 mov $14, %r9d /* byte position left over from less32bytes case */
1513 /*
1514 * Set up %r10 so that crossing a page boundary can be detected.
1515 * When %r10 goes positive we have crossed a page boundary and
1516 * need to do a nibble.
1517 */
1518 lea 14(%rdi), %r10
1519 and $0xfff, %r10 /* offset into 4K page */
1520 sub $0x1000, %r10 /* subtract 4K pagesize */
1521
1522 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1523
1524 .p2align 4
1525 LABEL(loop_ashr_14_use):
1526 add $16, %r10 /* advance the page tracker by one block */
1527 jg LABEL(nibble_ashr_14_use) /* went positive: check the pending bytes first */
1528
1529 LABEL(nibble_ashr_14_restart_use):
1530 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1531 palignr $14, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 14..29 of {previous block, this block} */
1532 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1533 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1534 #else
1535 movdqa (%rsi,%rdx), %xmm1
1536 TOLOWER (%xmm0, %xmm1)
1537 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1538 #endif
1539 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1540 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1541 sub $16, %r11 /* 16 more bytes compared equal */
1542 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1543 #endif
1544
1545 add $16, %rdx
1546 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1547 jg LABEL(nibble_ashr_14_use)
1548
1549 movdqa (%rdi, %rdx), %xmm0
1550 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1551 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1552 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1553 #else
1554 movdqa (%rsi,%rdx), %xmm1
1555 TOLOWER (%xmm0, %xmm1)
1556 pcmpistri $0x1a, %xmm1, %xmm0
1557 #endif
1558 jbe LABEL(exit_use)
1559 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1560 sub $16, %r11
1561 jbe LABEL(strcmp_exitz)
1562 #endif
1563 add $16, %rdx
1564 jmp LABEL(loop_ashr_14_use)
1565
1566 .p2align 4
1567 LABEL(nibble_ashr_14_use):
1568 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1569 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1570 psrldq $14, D(%xmm0) /* keep its 2 pending bytes, zero-fill the rest */
1571 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1572 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1573 cmp %r11, %rcx
1574 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1575 #endif
1576 cmp $1, %ecx
1577 ja LABEL(nibble_ashr_14_restart_use) /* no NUL in the 2 pending bytes: safe to load the next block */
1578
1579 jmp LABEL(nibble_ashr_exit_use) /* NUL within the pending bytes: finish without the next block */
1580
1581 /*
1582 * The following cases will be handled by ashr_15
1583 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1584 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
 *
 * Section overview (derived from the code below):
 *  - ashr_15: byte-shift %xmm2 left by 1, compare it with %xmm1 and branch
 *    to less32bytes if the two shifted bit masks (%edx, %r9d) differ.
 *    NOTE(review): %xmm0-%xmm2, %edx, %cl and %rax are prepared by code
 *    that precedes this section; confirm their exact meaning there.
 *  - loop_ashr_15_use: twice-unrolled loop comparing 16 bytes per step; the
 *    %rdi side is re-assembled with palignr $15 from two aligned blocks.
 *  - nibble_ashr_15_use: entered when %r10 goes positive; checks whether
 *    the single byte still pending from the previous %rdi block is a NUL
 *    before the next block is loaded.  Unlike the other ashr_N sections
 *    it falls through into nibble_ashr_exit_use instead of jumping to it.
1585 */
1586 .p2align 4
1587 LABEL(ashr_15):
1588 pslldq $1, D(%xmm2) /* shift %xmm2 left by 1 byte */
1589 TOLOWER (%xmm1, %xmm2) /* case-folding hook; macro defined outside this section */
1590 pcmpeqb %xmm1, D(%xmm2) /* per-byte equality: 0xff where bytes match */
1591 psubb %xmm0, D(%xmm2) /* NOTE(review): %xmm0 presumably holds the EOS mask - confirm */
1592 pmovmskb %xmm2, %r9d /* one result bit per byte into %r9d */
1593 shr %cl, %edx /* drop the low %cl bits of the mask in %edx */
1594 shr %cl, %r9d /* ... and of the comparison mask */
1595 sub %r9d, %edx /* zero iff both masks agree */
1596 jnz LABEL(less32bytes) /* masks differ: result is decided in less32bytes */
1597
1598 movdqa (%rdi), %xmm3 /* first aligned block at %rdi */
1599
1600 UPDATE_STRNCMP_COUNTER /* macro defined outside this section */
1601
1602 mov $16, %rcx /* index for loads */
1603 mov $15, %r9d /* byte position left over from less32bytes case */
1604 /*
1605 * Set up %r10 so that crossing a page boundary can be detected.
1606 * When %r10 goes positive we have crossed a page boundary and
1607 * need to do a nibble.
1608 */
1609 lea 15(%rdi), %r10
1610 and $0xfff, %r10 /* offset into 4K page */
1611
1612 sub $0x1000, %r10 /* subtract 4K pagesize */
1613
1614 mov %rcx, %rdx /* %rdx = running block offset used by the SSE4.2 loop */
1615
1616 .p2align 4
1617 LABEL(loop_ashr_15_use):
1618 add $16, %r10 /* advance the page tracker by one block */
1619 jg LABEL(nibble_ashr_15_use) /* went positive: check the pending byte first */
1620
1621 LABEL(nibble_ashr_15_restart_use):
1622 movdqa (%rdi, %rdx), %xmm0 /* next aligned block of the %rdi string */
1623 palignr $15, -16(%rdi, %rdx), D(%xmm0) /* xmm0 = bytes 15..30 of {previous block, this block} */
1624 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1625 pcmpistri $0x1a, (%rsi,%rdx), %xmm0 /* %ecx/flags as per the table at the top of the file */
1626 #else
1627 movdqa (%rsi,%rdx), %xmm1
1628 TOLOWER (%xmm0, %xmm1)
1629 pcmpistri $0x1a, %xmm1, %xmm0 /* same comparison on the case-folded data */
1630 #endif
1631 jbe LABEL(exit_use) /* CF or ZF set: difference and/or EOS in this block */
1632 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1633 sub $16, %r11 /* 16 more bytes compared equal */
1634 jbe LABEL(strcmp_exitz) /* counter used up: return 0 */
1635 #endif
1636
1637 add $16, %rdx
1638 add $16, %r10 /* second half of the unrolled loop: same steps as above */
1639 jg LABEL(nibble_ashr_15_use)
1640
1641 movdqa (%rdi, %rdx), %xmm0
1642 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1643 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1644 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1645 #else
1646 movdqa (%rsi,%rdx), %xmm1
1647 TOLOWER (%xmm0, %xmm1)
1648 pcmpistri $0x1a, %xmm1, %xmm0
1649 #endif
1650 jbe LABEL(exit_use)
1651 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1652 sub $16, %r11
1653 jbe LABEL(strcmp_exitz)
1654 #endif
1655 add $16, %rdx
1656 jmp LABEL(loop_ashr_15_use)
1657
1658 .p2align 4
1659 LABEL(nibble_ashr_15_use):
1660 sub $0x1000, %r10 /* re-arm the tracker for the next 4K page */
1661 movdqa -16(%rdi, %rdx), %xmm0 /* previous block of the %rdi string */
1662 psrldq $15, D(%xmm0) /* keep its 1 pending byte, zero-fill the rest */
1663 pcmpistri $0x3a,%xmm0, %xmm0 /* operand compared with itself: %ecx = index of first NUL byte */
1664 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1665 cmp %r11, %rcx
1666 jae LABEL(nibble_ashr_exit_use) /* counter in %r11 does not reach past index %rcx */
1667 #endif
1668 cmp $0, %ecx
1669 ja LABEL(nibble_ashr_15_restart_use) /* pending byte is not NUL: safe to load the next block */
1670
/*
 * Common exit for all *_use loops.
 *  - nibble_ashr_exit_use: reached from the nibble_ashr_N_use sections with
 *    the pending bytes of the %rdi string in %xmm0; performs the final
 *    16-byte comparison against (%rsi,%rdx) and continues in exit_use.
 *  - exit_use: reached with the flags and %ecx of a pcmpistri $0x1a still
 *    live; returns 0 or the difference of the first differing bytes.
 *  - ret_use: optional case folding of the two bytes, then the subtraction.
 */
1671 LABEL(nibble_ashr_exit_use):
1672 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1673 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1674 #else
1675 movdqa (%rsi,%rdx), %xmm1
1676 TOLOWER (%xmm0, %xmm1)
1677 pcmpistri $0x1a, %xmm1, %xmm0
1678 #endif
1679 .p2align 4
1680 LABEL(exit_use):
1681 jnc LABEL(strcmp_exitz) /* CF == 0: no differing byte found, return 0 */
1682 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1683 sub %rcx, %r11 /* %rcx = offset of the difference inside this block */
1684 jbe LABEL(strcmp_exitz) /* counter in %r11 ends at or before it: return 0 */
1685 #endif
1686 add %rcx, %rdx /* %rdx = block offset + offset inside the block */
1687 lea -16(%rdi, %r9), %rdi /* rebase %rdi by the shift amount kept in %r9 */
1688 movzbl (%rdi, %rdx), %eax /* differing byte, %rdi side */
1689 movzbl (%rsi, %rdx), %edx /* differing byte, %rsi side */
1690 test %r8d, %r8d /* NOTE(review): %r8d is the operand-order flag also tested in less32bytes */
1691 jz LABEL(ret_use)
1692 xchg %eax, %edx /* flag set: swap the two bytes back */
1693 LABEL(ret_use):
1694 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1695 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx /* table of 4-byte entries indexed by byte value */
1696 movl (%rcx,%rdx,4), %edx
1697 movl (%rcx,%rax,4), %eax
1698 #endif
1699
1700 sub %edx, %eax /* return value: difference of the two bytes */
1701 ret
1702
/*
 * less32bytes: reached from the ashr_N prologues when their mask test is
 * non-zero.  Adds the offsets in %rax/%rcx to the string pointers, restores
 * the original operand order if %r8d is set, and continues in ret.
 * ret/less16bytes: %rdx holds a bit mask; the index of its lowest set bit
 * selects the byte pair whose difference is returned.
 * NOTE(review): bsf leaves its destination undefined for a zero source;
 * callers are presumably expected to arrive with %rdx != 0 - confirm for
 * the entries to ret/less16bytes that lie outside this section.
 */
1703 LABEL(less32bytes):
1704 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1705 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1706 test %r8d, %r8d
1707 jz LABEL(ret)
1708 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1709
1710 .p2align 4
1711 LABEL(ret):
1712 LABEL(less16bytes):
1713 bsf %rdx, %rdx /* find and store bit index in %rdx */
1714
1715 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1716 sub %rdx, %r11
1717 jbe LABEL(strcmp_exitz) /* counter in %r11 ends at or before that index: return 0 */
1718 #endif
1719 movzbl (%rsi, %rdx), %ecx /* byte of the second string at that index */
1720 movzbl (%rdi, %rdx), %eax /* byte of the first string at that index */
1721
1722 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1723 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries indexed by byte value */
1724 movl (%rdx,%rcx,4), %ecx
1725 movl (%rdx,%rax,4), %eax
1726 #endif
1727
1728 sub %ecx, %eax /* return value: first byte minus second byte */
1729 ret
1730
/* Shared "strings compare equal" exit: return 0 in %eax. */
1731 LABEL(strcmp_exitz):
1732 xor %eax, %eax
1733 ret
1734
1735 .p2align 4
1736 // XXX Duplicates the tail of LABEL(less16bytes) above, but with offset 0.
/* Return the difference of the first bytes at (%rdi) and (%rsi),
   case-folded through the tolower table in the strcasecmp variants. */
1737 LABEL(Byte0):
1738 movzx (%rsi), %ecx /* first byte of the second string */
1739 movzx (%rdi), %eax /* first byte of the first string */
1740
1741 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1742 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries indexed by byte value */
1743 movl (%rdx,%rcx,4), %ecx
1744 movl (%rdx,%rax,4), %eax
1745 #endif
1746
1747 sub %ecx, %eax /* return value: first byte minus second byte */
1748 ret
/* End of the STRCMP_SSE42 function body: close the CFI region and record
   the symbol size. */
1749 cfi_endproc
1750 .size STRCMP_SSE42, .-STRCMP_SSE42
1751
/* Drop the helper macros used by the case-insensitive variants.
   NOTE(review): their definitions lie outside this section. */
1752 #undef UCLOW_reg
1753 #undef UCHIGH_reg
1754 #undef LCQWORD_reg
1755 #undef TOLOWER
1756
1757 /* Put all SSE 4.2 functions together. */
/* Table of 32-bit offsets, each relative to the start of the table, of the
   ashr_N entry points.  Entries 0..14 are ashr_1..ashr_15 and entry 15 is
   ashr_0.  NOTE(review): the code that indexes this table lies outside this
   section; confirm how the index is derived.  */
1758 .section .rodata.SECTION,"a",@progbits
1759 .p2align 3
1760 LABEL(unaligned_table):
1761 .int LABEL(ashr_1) - LABEL(unaligned_table)
1762 .int LABEL(ashr_2) - LABEL(unaligned_table)
1763 .int LABEL(ashr_3) - LABEL(unaligned_table)
1764 .int LABEL(ashr_4) - LABEL(unaligned_table)
1765 .int LABEL(ashr_5) - LABEL(unaligned_table)
1766 .int LABEL(ashr_6) - LABEL(unaligned_table)
1767 .int LABEL(ashr_7) - LABEL(unaligned_table)
1768 .int LABEL(ashr_8) - LABEL(unaligned_table)
1769 .int LABEL(ashr_9) - LABEL(unaligned_table)
1770 .int LABEL(ashr_10) - LABEL(unaligned_table)
1771 .int LABEL(ashr_11) - LABEL(unaligned_table)
1772 .int LABEL(ashr_12) - LABEL(unaligned_table)
1773 .int LABEL(ashr_13) - LABEL(unaligned_table)
1774 .int LABEL(ashr_14) - LABEL(unaligned_table)
1775 .int LABEL(ashr_15) - LABEL(unaligned_table)
1776 .int LABEL(ashr_0) - LABEL(unaligned_table)
1777
/* Remove the helper macros used throughout this file, including the
   ones that share their names with SSE instruction mnemonics.
   NOTE(review): presumably so the file can be processed again with a
   different set of definitions - confirm against the code that defines
   them, which lies outside this section.  */
1778 #undef LABEL
1779 #undef GLABEL
1780 #undef SECTION
1781 #undef movdqa
1782 #undef movdqu
1783 #undef pmovmskb
1784 #undef pcmpistri
1785 #undef psubb
1786 #undef pcmpeqb
1787 #undef psrldq
1788 #undef pslldq
1789 #undef palignr
1790 #undef pxor
1791 #undef D