]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/multiarch/strcmp-sse42.S
Prefer https to http for gnu.org and fsf.org URLs
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strcmp-sse42.S
CommitLineData
d9a4d2ab 1/* strcmp with SSE4.2
04277e02 2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
d9a4d2ab
UD
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6 17 License along with the GNU C Library; if not, see
5a82c748 18 <https://www.gnu.org/licenses/>. */
d9a4d2ab 19
11ffcacb
L
20#include <sysdep.h>
21
/* Default symbol name for the routine defined below; it is used only
   when STRCMP_SSE42 has not already been defined.  */
22#ifndef STRCMP_SSE42
23# define STRCMP_SSE42 __strcmp_sse42
24#endif
25
26#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
27# include "locale-defines.h"
28#endif
29
30#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
31/* UPDATE_STRNCMP_COUNTER recomputes the remaining-length counter in
   %r11: it forms %r9 = %r11 + %rcx - 16 and, unless it branches away,
   copies %r9 back into %r11.  It overwrites %r9 and the flags.
   Since the counter, %r11, is unsigned, we branch to strcmp_exitz
32 if the new counter > the old one or is 0. */
33# define UPDATE_STRNCMP_COUNTER \
34 /* %r9 = number of bytes left to compare */ \
35 lea -16(%rcx, %r11), %r9; \
36 cmp %r9, %r11; /* old count below new count: the subtraction wrapped */ \
37 jb LABEL(strcmp_exitz); \
38 test %r9, %r9; /* nothing left to compare */ \
39 je LABEL(strcmp_exitz); \
40 mov %r9, %r11
41#else /* no length limit: the macro expands to nothing */
42# define UPDATE_STRNCMP_COUNTER
43#endif
44
/* SECTION is pasted into the ".text.SECTION" section name used below.
   GLABEL(l) appends the variant suffix (_avx or _sse42) to a global
   symbol name.  */
45#ifdef USE_AVX
46# define SECTION avx
47# define GLABEL(l) l##_avx
48#else
49# define SECTION sse4.2
50# define GLABEL(l) l##_sse42
51#endif
52
/* LABEL(l) prefixes a label with ".L".  */
53#define LABEL(l) .L##l
d9a4d2ab
UD
54
55/* We use 0x1a:
56 _SIDD_SBYTE_OPS
57 | _SIDD_CMP_EQUAL_EACH
58 | _SIDD_NEGATIVE_POLARITY
59 | _SIDD_LEAST_SIGNIFICANT
60 on pcmpistri to find out if two 16byte data elements are the same
61 and the offset of the first different byte. There are 4 cases:
62
63 1. Both 16byte data elements are valid and identical.
64 2. Both 16byte data elements have EOS and are identical.
65 3. Both 16byte data elements are valid and they differ at offset X.
66 4. At least one 16byte data element has EOS at offset X. Two 16byte
67 data elements must differ at or before offset X.
68
69 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
70
71 case ECX CFlag ZFlag SFlag
72 1 16 0 0 0
73 2 16 0 1 1
74 3 X 1 0 0
75 4 0 <= X 1 0/1 0/1
76
77 We exit from the loop for cases 2, 3 and 4 with jbe which branches
78 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
79 case 2. */
80
81 /* Put all SSE 4.2 functions together. */
82 .section .text.SECTION,"ax",@progbits
83 .align 16
84 .type STRCMP_SSE42, @function
ac49ecaf
L
85 .globl STRCMP_SSE42
86 .hidden STRCMP_SSE42
d9a4d2ab
UD
87#ifdef USE_AS_STRCASECMP_L
/* __strcasecmp entry stub: fetch the offset of __libc_tsd_LOCALE via
   @gottpoff, read the value stored at that offset from %fs
   (presumably the calling thread's current locale -- confirm against
   the definition of __libc_tsd_LOCALE) into the third-argument
   register, then run straight on into the code that follows, which
   reads the locale through %rdx.  */
88ENTRY (GLABEL(__strcasecmp))
89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
70bc83b9 90 mov %fs:(%rax),%RDX_LP
d9a4d2ab
UD
91
92 // XXX 5 byte should be before the function
93 /* 5-byte NOP. */
94 .byte 0x0f,0x1f,0x44,0x00,0x00
95END (GLABEL(__strcasecmp))
96 /* FALLTHROUGH to strcasecmp_l. */
97#endif
98#ifdef USE_AS_STRNCASECMP_L
/* __strncasecmp entry stub: fetch the offset of __libc_tsd_LOCALE via
   @gottpoff, read the value stored at that offset from %fs
   (presumably the calling thread's current locale -- confirm against
   the definition of __libc_tsd_LOCALE) into the fourth-argument
   register, then run straight on into the code that follows, which
   reads the locale through %rcx.  */
99ENTRY (GLABEL(__strncasecmp))
100 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
70bc83b9 101 mov %fs:(%rax),%RCX_LP
d9a4d2ab
UD
102
103 // XXX 5 byte should be before the function
104 /* 5-byte NOP. */
105 .byte 0x0f,0x1f,0x44,0x00,0x00
106END (GLABEL(__strncasecmp))
107 /* FALLTHROUGH to strncasecmp_l. */
108#endif
109
618280a1
UD
110
/* When USE_AVX is defined, the SSE mnemonics used in the rest of the
   file are redefined to their v-prefixed counterparts.  D(arg) writes
   the destination operand: under USE_AVX it expands to "arg, arg", so
   "op src, D(dst)" becomes the three-operand "vop src, dst, dst";
   otherwise it expands to plain "arg".  */
111#ifdef USE_AVX
112# define movdqa vmovdqa
113# define movdqu vmovdqu
114# define pmovmskb vpmovmskb
115# define pcmpistri vpcmpistri
116# define psubb vpsubb
117# define pcmpeqb vpcmpeqb
118# define psrldq vpsrldq
119# define pslldq vpslldq
120# define palignr vpalignr
121# define pxor vpxor
122# define D(arg) arg, arg
123#else
124# define D(arg) arg
125#endif
126
d9a4d2ab
UD
127STRCMP_SSE42:
128 cfi_startproc
5efc6777 129 _CET_ENDBR
d9a4d2ab
UD
130 CALL_MCOUNT
131
132/*
133 * This implementation uses SSE to compare up to 16 bytes at a time.
134 */
135#ifdef USE_AS_STRCASECMP_L
136 /* We have to fall back on the C implementation for locales
137 with encodings not matching ASCII for single bytes. */
138# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
70bc83b9 139 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
d9a4d2ab 140# else
70bc83b9 141 mov (%rdx), %RAX_LP
d9a4d2ab 142# endif
34372fc6 143 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
d9a4d2ab
UD
144 jne __strcasecmp_l_nonascii
145#endif
146#ifdef USE_AS_STRNCASECMP_L
147 /* We have to fall back on the C implementation for locales
148 with encodings not matching ASCII for single bytes. */
149# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
70bc83b9 150 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
d9a4d2ab 151# else
70bc83b9 152 mov (%rcx), %RAX_LP
d9a4d2ab 153# endif
34372fc6 154 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
d9a4d2ab
UD
155 jne __strncasecmp_l_nonascii
156#endif
157
158#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
ee915088 159 test %RDX_LP, %RDX_LP
d9a4d2ab 160 je LABEL(strcmp_exitz)
ee915088 161 cmp $1, %RDX_LP
d9a4d2ab 162 je LABEL(Byte0)
ee915088 163 mov %RDX_LP, %R11_LP
d9a4d2ab
UD
164#endif
165 mov %esi, %ecx
166 mov %edi, %eax
167/* Use 64bit AND here to avoid long NOP padding. */
168 and $0x3f, %rcx /* rsi alignment in cache line */
169 and $0x3f, %rax /* rdi alignment in cache line */
170#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
171 .section .rodata.cst16,"aM",@progbits,16
172 .align 16
173LABEL(belowupper):
174 .quad 0x4040404040404040
175 .quad 0x4040404040404040
176LABEL(topupper):
177# ifdef USE_AVX
178 .quad 0x5a5a5a5a5a5a5a5a
179 .quad 0x5a5a5a5a5a5a5a5a
180# else
181 .quad 0x5b5b5b5b5b5b5b5b
182 .quad 0x5b5b5b5b5b5b5b5b
183# endif
184LABEL(touppermask):
185 .quad 0x2020202020202020
186 .quad 0x2020202020202020
187 .previous
188 movdqa LABEL(belowupper)(%rip), %xmm4
189# define UCLOW_reg %xmm4
190 movdqa LABEL(topupper)(%rip), %xmm5
191# define UCHIGH_reg %xmm5
192 movdqa LABEL(touppermask)(%rip), %xmm6
193# define LCQWORD_reg %xmm6
194#endif
195 cmp $0x30, %ecx
196 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
197 cmp $0x30, %eax
198 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
199 movdqu (%rdi), %xmm1
200 movdqu (%rsi), %xmm2
201#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
202# ifdef USE_AVX
203# define TOLOWER(reg1, reg2) \
204 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
205 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
206 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
207 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
208 vpandn %xmm7, %xmm8, %xmm8; \
209 vpandn %xmm9, %xmm10, %xmm10; \
210 vpand LCQWORD_reg, %xmm8, %xmm8; \
211 vpand LCQWORD_reg, %xmm10, %xmm10; \
212 vpor reg1, %xmm8, reg1; \
213 vpor reg2, %xmm10, reg2
214# else
215# define TOLOWER(reg1, reg2) \
216 movdqa reg1, %xmm7; \
217 movdqa UCHIGH_reg, %xmm8; \
218 movdqa reg2, %xmm9; \
219 movdqa UCHIGH_reg, %xmm10; \
220 pcmpgtb UCLOW_reg, %xmm7; \
221 pcmpgtb reg1, %xmm8; \
222 pcmpgtb UCLOW_reg, %xmm9; \
223 pcmpgtb reg2, %xmm10; \
224 pand %xmm8, %xmm7; \
225 pand %xmm10, %xmm9; \
226 pand LCQWORD_reg, %xmm7; \
227 pand LCQWORD_reg, %xmm9; \
228 por %xmm7, reg1; \
229 por %xmm9, reg2
230# endif
231 TOLOWER (%xmm1, %xmm2)
232#else
233# define TOLOWER(reg1, reg2)
234#endif
618280a1
UD
235 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
236 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
237 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
238 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
d9a4d2ab
UD
239 pmovmskb %xmm1, %edx
240 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
241 jnz LABEL(less16bytes)/* If not, find different value or null char */
242#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
243 sub $16, %r11
382466e0 244 jbe LABEL(strcmp_exitz)/* finish comparison */
d9a4d2ab
UD
245#endif
246 add $16, %rsi /* prepare to search next 16 bytes */
247 add $16, %rdi /* prepare to search next 16 bytes */
248
249 /*
250 * Determine source and destination string offsets from 16-byte
251 * alignment. Use relative offset difference between the two to
252 * determine which case below to use.
253 */
254 .p2align 4
255LABEL(crosscache):
256 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
257 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
258 mov $0xffff, %edx /* for equivalent offset */
259 xor %r8d, %r8d
260 and $0xf, %ecx /* offset of rsi */
261 and $0xf, %eax /* offset of rdi */
618280a1 262 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
d9a4d2ab
UD
263 cmp %eax, %ecx
264 je LABEL(ashr_0) /* rsi and rdi relative offset same */
265 ja LABEL(bigger)
266 mov %edx, %r8d /* r8d is offset flag for exit tail */
267 xchg %ecx, %eax
268 xchg %rsi, %rdi
269LABEL(bigger):
618280a1
UD
270 movdqa (%rdi), %xmm2
271 movdqa (%rsi), %xmm1
d9a4d2ab
UD
272 lea 15(%rax), %r9
273 sub %rcx, %r9
274 lea LABEL(unaligned_table)(%rip), %r10
275 movslq (%r10, %r9,4), %r9
618280a1 276 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
d9a4d2ab 277 lea (%r10, %r9), %r10
8817df42 278 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
d9a4d2ab
UD
279
280/*
281 * The following cases will be handled by ashr_0
282 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
283 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
284 */
285 .p2align 4
286LABEL(ashr_0):
287
288 movdqa (%rsi), %xmm1
618280a1 289 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
d9a4d2ab 290#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
618280a1 291 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
d9a4d2ab
UD
292#else
293 movdqa (%rdi), %xmm2
294 TOLOWER (%xmm1, %xmm2)
618280a1 295 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
d9a4d2ab 296#endif
618280a1 297 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
d9a4d2ab
UD
298 pmovmskb %xmm1, %r9d
299 shr %cl, %edx /* adjust 0xffff for offset */
300 shr %cl, %r9d /* adjust for 16-byte offset */
301 sub %r9d, %edx
302 /*
303 * edx equals r9d only if the remaining (16-rcx) bytes of both blocks
304 * are equal and no null char was seen among them.
305 */
306 jne LABEL(less32bytes) /* mismatch or null char */
307 UPDATE_STRNCMP_COUNTER
308 mov $16, %rcx
309 mov $16, %r9
d9a4d2ab
UD
310
311 /*
312 * Now both strings are aligned at 16-byte boundary. Loop over strings
313 * checking 32-bytes per iteration.
314 */
315 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
316 .p2align 4
317LABEL(ashr_0_use):
318 movdqa (%rdi,%rdx), %xmm0
319#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
320 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
321#else
322 movdqa (%rsi,%rdx), %xmm1
323 TOLOWER (%xmm0, %xmm1)
324 pcmpistri $0x1a, %xmm1, %xmm0
325#endif
326 lea 16(%rdx), %rdx
327 jbe LABEL(ashr_0_exit_use)
328#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
329 sub $16, %r11
330 jbe LABEL(strcmp_exitz)
331#endif
332
333 movdqa (%rdi,%rdx), %xmm0
334#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
335 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
336#else
337 movdqa (%rsi,%rdx), %xmm1
338 TOLOWER (%xmm0, %xmm1)
339 pcmpistri $0x1a, %xmm1, %xmm0
340#endif
341 lea 16(%rdx), %rdx
342 jbe LABEL(ashr_0_exit_use)
343#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
344 sub $16, %r11
345 jbe LABEL(strcmp_exitz)
346#endif
347 jmp LABEL(ashr_0_use)
348
349
350 .p2align 4
351LABEL(ashr_0_exit_use):
352 jnc LABEL(strcmp_exitz)
353#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
354 sub %rcx, %r11
355 jbe LABEL(strcmp_exitz)
356#endif
357 lea -16(%rdx, %rcx), %rcx
358 movzbl (%rdi, %rcx), %eax
359 movzbl (%rsi, %rcx), %edx
360#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
361 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
362 movl (%rcx,%rax,4), %eax
363 movl (%rcx,%rdx,4), %edx
364#endif
365 sub %edx, %eax
366 ret
367
368
369
370/*
371 * The following cases will be handled by ashr_1
372 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
373 * n(15) n -15 0(15 +(n-15) - n) ashr_1
374 */
375 .p2align 4
376LABEL(ashr_1):
618280a1 377 pslldq $15, D(%xmm2) /* shift first string to align with second */
d9a4d2ab 378 TOLOWER (%xmm1, %xmm2)
618280a1
UD
379 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
380 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
d9a4d2ab
UD
381 pmovmskb %xmm2, %r9d
382 shr %cl, %edx /* adjust 0xffff for offset */
383 shr %cl, %r9d /* adjust for 16-byte offset */
384 sub %r9d, %edx
385 jnz LABEL(less32bytes) /* mismatch or null char seen */
386 movdqa (%rdi), %xmm3
387 UPDATE_STRNCMP_COUNTER
388
d9a4d2ab
UD
389 mov $16, %rcx /* index for loads*/
390 mov $1, %r9d /* byte position left over from less32bytes case */
391 /*
392 * Setup %r10 value allows us to detect crossing a page boundary.
393 * When %r10 goes positive we have crossed a page boundary and
394 * need to do a nibble.
395 */
396 lea 1(%rdi), %r10
397 and $0xfff, %r10 /* offset into 4K page */
398 sub $0x1000, %r10 /* subtract 4K pagesize */
399 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
400
401 .p2align 4
402LABEL(loop_ashr_1_use):
403 add $16, %r10
404 jg LABEL(nibble_ashr_1_use)
405
406LABEL(nibble_ashr_1_restart_use):
407 movdqa (%rdi, %rdx), %xmm0
618280a1 408 palignr $1, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
409#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
410 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
411#else
412 movdqa (%rsi,%rdx), %xmm1
413 TOLOWER (%xmm0, %xmm1)
414 pcmpistri $0x1a, %xmm1, %xmm0
415#endif
416 jbe LABEL(exit_use)
417#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
418 sub $16, %r11
419 jbe LABEL(strcmp_exitz)
420#endif
421
422 add $16, %rdx
423 add $16, %r10
424 jg LABEL(nibble_ashr_1_use)
425
426 movdqa (%rdi, %rdx), %xmm0
618280a1 427 palignr $1, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
428#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
429 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
430#else
431 movdqa (%rsi,%rdx), %xmm1
432 TOLOWER (%xmm0, %xmm1)
433 pcmpistri $0x1a, %xmm1, %xmm0
434#endif
435 jbe LABEL(exit_use)
436#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
437 sub $16, %r11
438 jbe LABEL(strcmp_exitz)
439#endif
440 add $16, %rdx
441 jmp LABEL(loop_ashr_1_use)
442
443 .p2align 4
444LABEL(nibble_ashr_1_use):
445 sub $0x1000, %r10
446 movdqa -16(%rdi, %rdx), %xmm0
618280a1 447 psrldq $1, D(%xmm0)
d9a4d2ab
UD
448 pcmpistri $0x3a,%xmm0, %xmm0
449#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
450 cmp %r11, %rcx
451 jae LABEL(nibble_ashr_exit_use)
452#endif
453 cmp $14, %ecx
454 ja LABEL(nibble_ashr_1_restart_use)
455
456 jmp LABEL(nibble_ashr_exit_use)
457
458/*
459 * The following cases will be handled by ashr_2
460 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
461 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
462 */
463 .p2align 4
464LABEL(ashr_2):
618280a1 465 pslldq $14, D(%xmm2)
d9a4d2ab 466 TOLOWER (%xmm1, %xmm2)
618280a1
UD
467 pcmpeqb %xmm1, D(%xmm2)
468 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
469 pmovmskb %xmm2, %r9d
470 shr %cl, %edx
471 shr %cl, %r9d
472 sub %r9d, %edx
473 jnz LABEL(less32bytes)
474 movdqa (%rdi), %xmm3
475 UPDATE_STRNCMP_COUNTER
476
d9a4d2ab
UD
477 mov $16, %rcx /* index for loads */
478 mov $2, %r9d /* byte position left over from less32bytes case */
479 /*
480 * Setup %r10 value allows us to detect crossing a page boundary.
481 * When %r10 goes positive we have crossed a page boundary and
482 * need to do a nibble.
483 */
484 lea 2(%rdi), %r10
485 and $0xfff, %r10 /* offset into 4K page */
486 sub $0x1000, %r10 /* subtract 4K pagesize */
487 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
488
489 .p2align 4
490LABEL(loop_ashr_2_use):
491 add $16, %r10
492 jg LABEL(nibble_ashr_2_use)
493
494LABEL(nibble_ashr_2_restart_use):
495 movdqa (%rdi, %rdx), %xmm0
618280a1 496 palignr $2, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
497#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
498 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
499#else
500 movdqa (%rsi,%rdx), %xmm1
501 TOLOWER (%xmm0, %xmm1)
502 pcmpistri $0x1a, %xmm1, %xmm0
503#endif
504 jbe LABEL(exit_use)
505#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
506 sub $16, %r11
507 jbe LABEL(strcmp_exitz)
508#endif
509
510 add $16, %rdx
511 add $16, %r10
512 jg LABEL(nibble_ashr_2_use)
513
514 movdqa (%rdi, %rdx), %xmm0
618280a1 515 palignr $2, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
516#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
517 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
518#else
519 movdqa (%rsi,%rdx), %xmm1
520 TOLOWER (%xmm0, %xmm1)
521 pcmpistri $0x1a, %xmm1, %xmm0
522#endif
523 jbe LABEL(exit_use)
524#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
525 sub $16, %r11
526 jbe LABEL(strcmp_exitz)
527#endif
528 add $16, %rdx
529 jmp LABEL(loop_ashr_2_use)
530
531 .p2align 4
532LABEL(nibble_ashr_2_use):
533 sub $0x1000, %r10
534 movdqa -16(%rdi, %rdx), %xmm0
618280a1 535 psrldq $2, D(%xmm0)
d9a4d2ab
UD
536 pcmpistri $0x3a,%xmm0, %xmm0
537#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
538 cmp %r11, %rcx
539 jae LABEL(nibble_ashr_exit_use)
540#endif
541 cmp $13, %ecx
542 ja LABEL(nibble_ashr_2_restart_use)
543
544 jmp LABEL(nibble_ashr_exit_use)
545
546/*
547 * The following cases will be handled by ashr_3
548 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
549 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
550 */
551 .p2align 4
552LABEL(ashr_3):
618280a1 553 pslldq $13, D(%xmm2)
d9a4d2ab 554 TOLOWER (%xmm1, %xmm2)
618280a1
UD
555 pcmpeqb %xmm1, D(%xmm2)
556 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
557 pmovmskb %xmm2, %r9d
558 shr %cl, %edx
559 shr %cl, %r9d
560 sub %r9d, %edx
561 jnz LABEL(less32bytes)
562 movdqa (%rdi), %xmm3
563
564 UPDATE_STRNCMP_COUNTER
565
d9a4d2ab
UD
566 mov $16, %rcx /* index for loads */
567 mov $3, %r9d /* byte position left over from less32bytes case */
568 /*
569 * Setup %r10 value allows us to detect crossing a page boundary.
570 * When %r10 goes positive we have crossed a page boundary and
571 * need to do a nibble.
572 */
573 lea 3(%rdi), %r10
574 and $0xfff, %r10 /* offset into 4K page */
575 sub $0x1000, %r10 /* subtract 4K pagesize */
576 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
577
578LABEL(loop_ashr_3_use):
579 add $16, %r10
580 jg LABEL(nibble_ashr_3_use)
581
582LABEL(nibble_ashr_3_restart_use):
583 movdqa (%rdi, %rdx), %xmm0
618280a1 584 palignr $3, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
585#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
586 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
587#else
588 movdqa (%rsi,%rdx), %xmm1
589 TOLOWER (%xmm0, %xmm1)
590 pcmpistri $0x1a, %xmm1, %xmm0
591#endif
592 jbe LABEL(exit_use)
593#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
594 sub $16, %r11
595 jbe LABEL(strcmp_exitz)
596#endif
597
598 add $16, %rdx
599 add $16, %r10
600 jg LABEL(nibble_ashr_3_use)
601
602 movdqa (%rdi, %rdx), %xmm0
618280a1 603 palignr $3, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
604#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
605 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
606#else
607 movdqa (%rsi,%rdx), %xmm1
608 TOLOWER (%xmm0, %xmm1)
609 pcmpistri $0x1a, %xmm1, %xmm0
610#endif
611 jbe LABEL(exit_use)
612#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
613 sub $16, %r11
614 jbe LABEL(strcmp_exitz)
615#endif
616 add $16, %rdx
617 jmp LABEL(loop_ashr_3_use)
618
619 .p2align 4
620LABEL(nibble_ashr_3_use):
621 sub $0x1000, %r10
622 movdqa -16(%rdi, %rdx), %xmm0
618280a1 623 psrldq $3, D(%xmm0)
d9a4d2ab
UD
624 pcmpistri $0x3a,%xmm0, %xmm0
625#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
626 cmp %r11, %rcx
627 jae LABEL(nibble_ashr_exit_use)
628#endif
629 cmp $12, %ecx
630 ja LABEL(nibble_ashr_3_restart_use)
631
632 jmp LABEL(nibble_ashr_exit_use)
633
634/*
635 * The following cases will be handled by ashr_4
636 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
637 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
638 */
639 .p2align 4
640LABEL(ashr_4):
618280a1 641 pslldq $12, D(%xmm2)
d9a4d2ab 642 TOLOWER (%xmm1, %xmm2)
618280a1
UD
643 pcmpeqb %xmm1, D(%xmm2)
644 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
645 pmovmskb %xmm2, %r9d
646 shr %cl, %edx
647 shr %cl, %r9d
648 sub %r9d, %edx
649 jnz LABEL(less32bytes)
650 movdqa (%rdi), %xmm3
651
652 UPDATE_STRNCMP_COUNTER
653
d9a4d2ab
UD
654 mov $16, %rcx /* index for loads */
655 mov $4, %r9d /* byte position left over from less32bytes case */
656 /*
657 * Setup %r10 value allows us to detect crossing a page boundary.
658 * When %r10 goes positive we have crossed a page boundary and
659 * need to do a nibble.
660 */
661 lea 4(%rdi), %r10
662 and $0xfff, %r10 /* offset into 4K page */
663 sub $0x1000, %r10 /* subtract 4K pagesize */
664 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
665
666 .p2align 4
667LABEL(loop_ashr_4_use):
668 add $16, %r10
669 jg LABEL(nibble_ashr_4_use)
670
671LABEL(nibble_ashr_4_restart_use):
672 movdqa (%rdi, %rdx), %xmm0
618280a1 673 palignr $4, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
674#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
675 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
676#else
677 movdqa (%rsi,%rdx), %xmm1
678 TOLOWER (%xmm0, %xmm1)
679 pcmpistri $0x1a, %xmm1, %xmm0
680#endif
681 jbe LABEL(exit_use)
682#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
683 sub $16, %r11
684 jbe LABEL(strcmp_exitz)
685#endif
686
687 add $16, %rdx
688 add $16, %r10
689 jg LABEL(nibble_ashr_4_use)
690
691 movdqa (%rdi, %rdx), %xmm0
618280a1 692 palignr $4, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
693#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
694 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
695#else
696 movdqa (%rsi,%rdx), %xmm1
697 TOLOWER (%xmm0, %xmm1)
698 pcmpistri $0x1a, %xmm1, %xmm0
699#endif
700 jbe LABEL(exit_use)
701#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
702 sub $16, %r11
703 jbe LABEL(strcmp_exitz)
704#endif
705 add $16, %rdx
706 jmp LABEL(loop_ashr_4_use)
707
708 .p2align 4
709LABEL(nibble_ashr_4_use):
710 sub $0x1000, %r10
711 movdqa -16(%rdi, %rdx), %xmm0
618280a1 712 psrldq $4, D(%xmm0)
d9a4d2ab
UD
713 pcmpistri $0x3a,%xmm0, %xmm0
714#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
715 cmp %r11, %rcx
716 jae LABEL(nibble_ashr_exit_use)
717#endif
718 cmp $11, %ecx
719 ja LABEL(nibble_ashr_4_restart_use)
720
721 jmp LABEL(nibble_ashr_exit_use)
722
723/*
724 * The following cases will be handled by ashr_5
725 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
726 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
727 */
728 .p2align 4
729LABEL(ashr_5):
618280a1 730 pslldq $11, D(%xmm2)
d9a4d2ab 731 TOLOWER (%xmm1, %xmm2)
618280a1
UD
732 pcmpeqb %xmm1, D(%xmm2)
733 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
734 pmovmskb %xmm2, %r9d
735 shr %cl, %edx
736 shr %cl, %r9d
737 sub %r9d, %edx
738 jnz LABEL(less32bytes)
739 movdqa (%rdi), %xmm3
740
741 UPDATE_STRNCMP_COUNTER
742
d9a4d2ab
UD
743 mov $16, %rcx /* index for loads */
744 mov $5, %r9d /* byte position left over from less32bytes case */
745 /*
746 * Setup %r10 value allows us to detect crossing a page boundary.
747 * When %r10 goes positive we have crossed a page boundary and
748 * need to do a nibble.
749 */
750 lea 5(%rdi), %r10
751 and $0xfff, %r10 /* offset into 4K page */
752 sub $0x1000, %r10 /* subtract 4K pagesize */
753 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
754
755 .p2align 4
756LABEL(loop_ashr_5_use):
757 add $16, %r10
758 jg LABEL(nibble_ashr_5_use)
759
760LABEL(nibble_ashr_5_restart_use):
761 movdqa (%rdi, %rdx), %xmm0
618280a1 762 palignr $5, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
763#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
764 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
765#else
766 movdqa (%rsi,%rdx), %xmm1
767 TOLOWER (%xmm0, %xmm1)
768 pcmpistri $0x1a, %xmm1, %xmm0
769#endif
770 jbe LABEL(exit_use)
771#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
772 sub $16, %r11
773 jbe LABEL(strcmp_exitz)
774#endif
775
776 add $16, %rdx
777 add $16, %r10
778 jg LABEL(nibble_ashr_5_use)
779
780 movdqa (%rdi, %rdx), %xmm0
781
618280a1 782 palignr $5, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
783#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
784 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
785#else
786 movdqa (%rsi,%rdx), %xmm1
787 TOLOWER (%xmm0, %xmm1)
788 pcmpistri $0x1a, %xmm1, %xmm0
789#endif
790 jbe LABEL(exit_use)
791#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
792 sub $16, %r11
793 jbe LABEL(strcmp_exitz)
794#endif
795 add $16, %rdx
796 jmp LABEL(loop_ashr_5_use)
797
798 .p2align 4
799LABEL(nibble_ashr_5_use):
800 sub $0x1000, %r10
801 movdqa -16(%rdi, %rdx), %xmm0
618280a1 802 psrldq $5, D(%xmm0)
d9a4d2ab
UD
803 pcmpistri $0x3a,%xmm0, %xmm0
804#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
805 cmp %r11, %rcx
806 jae LABEL(nibble_ashr_exit_use)
807#endif
808 cmp $10, %ecx
809 ja LABEL(nibble_ashr_5_restart_use)
810
811 jmp LABEL(nibble_ashr_exit_use)
812
813/*
814 * The following cases will be handled by ashr_6
815 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
816 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
817 */
818 .p2align 4
819LABEL(ashr_6):
618280a1 820 pslldq $10, D(%xmm2)
d9a4d2ab 821 TOLOWER (%xmm1, %xmm2)
618280a1
UD
822 pcmpeqb %xmm1, D(%xmm2)
823 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
824 pmovmskb %xmm2, %r9d
825 shr %cl, %edx
826 shr %cl, %r9d
827 sub %r9d, %edx
828 jnz LABEL(less32bytes)
829 movdqa (%rdi), %xmm3
830
831 UPDATE_STRNCMP_COUNTER
832
d9a4d2ab
UD
833 mov $16, %rcx /* index for loads */
834 mov $6, %r9d /* byte position left over from less32bytes case */
835 /*
836 * Setup %r10 value allows us to detect crossing a page boundary.
837 * When %r10 goes positive we have crossed a page boundary and
838 * need to do a nibble.
839 */
840 lea 6(%rdi), %r10
841 and $0xfff, %r10 /* offset into 4K page */
842 sub $0x1000, %r10 /* subtract 4K pagesize */
843 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
844
845 .p2align 4
846LABEL(loop_ashr_6_use):
847 add $16, %r10
848 jg LABEL(nibble_ashr_6_use)
849
850LABEL(nibble_ashr_6_restart_use):
851 movdqa (%rdi, %rdx), %xmm0
618280a1 852 palignr $6, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
853#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
854 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
855#else
856 movdqa (%rsi,%rdx), %xmm1
857 TOLOWER (%xmm0, %xmm1)
858 pcmpistri $0x1a, %xmm1, %xmm0
859#endif
860 jbe LABEL(exit_use)
861#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
862 sub $16, %r11
863 jbe LABEL(strcmp_exitz)
864#endif
865
866 add $16, %rdx
867 add $16, %r10
868 jg LABEL(nibble_ashr_6_use)
869
870 movdqa (%rdi, %rdx), %xmm0
618280a1 871 palignr $6, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
872#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
873 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
874#else
875 movdqa (%rsi,%rdx), %xmm1
876 TOLOWER (%xmm0, %xmm1)
877 pcmpistri $0x1a, %xmm1, %xmm0
878#endif
879 jbe LABEL(exit_use)
880#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
881 sub $16, %r11
882 jbe LABEL(strcmp_exitz)
883#endif
884 add $16, %rdx
885 jmp LABEL(loop_ashr_6_use)
886
887 .p2align 4
888LABEL(nibble_ashr_6_use):
889 sub $0x1000, %r10
890 movdqa -16(%rdi, %rdx), %xmm0
618280a1 891 psrldq $6, D(%xmm0)
d9a4d2ab
UD
892 pcmpistri $0x3a,%xmm0, %xmm0
893#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
894 cmp %r11, %rcx
895 jae LABEL(nibble_ashr_exit_use)
896#endif
897 cmp $9, %ecx
898 ja LABEL(nibble_ashr_6_restart_use)
899
900 jmp LABEL(nibble_ashr_exit_use)
901
902/*
903 * The following cases will be handled by ashr_7
904 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
905 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
906 */
907 .p2align 4
908LABEL(ashr_7):
618280a1 909 pslldq $9, D(%xmm2)
d9a4d2ab 910 TOLOWER (%xmm1, %xmm2)
618280a1
UD
911 pcmpeqb %xmm1, D(%xmm2)
912 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
913 pmovmskb %xmm2, %r9d
914 shr %cl, %edx
915 shr %cl, %r9d
916 sub %r9d, %edx
917 jnz LABEL(less32bytes)
918 movdqa (%rdi), %xmm3
919
920 UPDATE_STRNCMP_COUNTER
921
d9a4d2ab
UD
922 mov $16, %rcx /* index for loads */
923 mov $7, %r9d /* byte position left over from less32bytes case */
924 /*
925 * Setup %r10 value allows us to detect crossing a page boundary.
926 * When %r10 goes positive we have crossed a page boundary and
927 * need to do a nibble.
928 */
929 lea 7(%rdi), %r10
930 and $0xfff, %r10 /* offset into 4K page */
931 sub $0x1000, %r10 /* subtract 4K pagesize */
932 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
933
934 .p2align 4
935LABEL(loop_ashr_7_use):
936 add $16, %r10
937 jg LABEL(nibble_ashr_7_use)
938
939LABEL(nibble_ashr_7_restart_use):
940 movdqa (%rdi, %rdx), %xmm0
618280a1 941 palignr $7, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
942#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
943 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
944#else
945 movdqa (%rsi,%rdx), %xmm1
946 TOLOWER (%xmm0, %xmm1)
947 pcmpistri $0x1a, %xmm1, %xmm0
948#endif
949 jbe LABEL(exit_use)
950#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
951 sub $16, %r11
952 jbe LABEL(strcmp_exitz)
953#endif
954
955 add $16, %rdx
956 add $16, %r10
957 jg LABEL(nibble_ashr_7_use)
958
959 movdqa (%rdi, %rdx), %xmm0
618280a1 960 palignr $7, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
961#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
962 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
963#else
964 movdqa (%rsi,%rdx), %xmm1
965 TOLOWER (%xmm0, %xmm1)
966 pcmpistri $0x1a, %xmm1, %xmm0
967#endif
968 jbe LABEL(exit_use)
969#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
970 sub $16, %r11
971 jbe LABEL(strcmp_exitz)
972#endif
973 add $16, %rdx
974 jmp LABEL(loop_ashr_7_use)
975
976 .p2align 4
977LABEL(nibble_ashr_7_use):
978 sub $0x1000, %r10
979 movdqa -16(%rdi, %rdx), %xmm0
618280a1 980 psrldq $7, D(%xmm0)
d9a4d2ab
UD
981 pcmpistri $0x3a,%xmm0, %xmm0
982#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
983 cmp %r11, %rcx
984 jae LABEL(nibble_ashr_exit_use)
985#endif
986 cmp $8, %ecx
987 ja LABEL(nibble_ashr_7_restart_use)
988
989 jmp LABEL(nibble_ashr_exit_use)
990
991/*
992 * The following cases will be handled by ashr_8
993 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
994 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
995 */
/*
 * ashr_8: check the first pair of blocks, then run loop_ashr_8_use,
 * which compares 16 bytes per step and uses palignr $8 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
996 .p2align 4
997LABEL(ashr_8):
618280a1 998 pslldq $8, D(%xmm2)
d9a4d2ab 999 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1000 pcmpeqb %xmm1, D(%xmm2)
1001 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1002 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1003 shr %cl, %edx
1004 shr %cl, %r9d
1005 sub %r9d, %edx
1006 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1007 movdqa (%rdi), %xmm3
1008
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1009 UPDATE_STRNCMP_COUNTER
1010
d9a4d2ab
UD
1011 mov $16, %rcx /* index for loads */
1012 mov $8, %r9d /* byte position left over from less32bytes case */
1013 /*
1014 * Setup %r10 value allows us to detect crossing a page boundary.
1015 * When %r10 goes positive we have crossed a page boundary and
1016 * need to do a nibble.
1017 */
1018 lea 8(%rdi), %r10
1019 and $0xfff, %r10 /* offset into 4K page */
1020 sub $0x1000, %r10 /* subtract 4K pagesize */
1021 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1022
/* Main loop, unrolled twice; %rdx is the running block offset. */
1023 .p2align 4
1024LABEL(loop_ashr_8_use):
1025 add $16, %r10
1026 jg LABEL(nibble_ashr_8_use)
1027
1028LABEL(nibble_ashr_8_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 8. */
1029 movdqa (%rdi, %rdx), %xmm0
618280a1 1030 palignr $8, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1031#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1032 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1033#else
1034 movdqa (%rsi,%rdx), %xmm1
1035 TOLOWER (%xmm0, %xmm1)
1036 pcmpistri $0x1a, %xmm1, %xmm0
1037#endif
1038 jbe LABEL(exit_use)
1039#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1040 sub $16, %r11
1041 jbe LABEL(strcmp_exitz)
1042#endif
1043
1044 add $16, %rdx
1045 add $16, %r10
1046 jg LABEL(nibble_ashr_8_use)
1047
/* Second copy of the loop body. */
1048 movdqa (%rdi, %rdx), %xmm0
618280a1 1049 palignr $8, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1050#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1051 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1052#else
1053 movdqa (%rsi,%rdx), %xmm1
1054 TOLOWER (%xmm0, %xmm1)
1055 pcmpistri $0x1a, %xmm1, %xmm0
1056#endif
1057 jbe LABEL(exit_use)
1058#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1059 sub $16, %r11
1060 jbe LABEL(strcmp_exitz)
1061#endif
1062 add $16, %rdx
1063 jmp LABEL(loop_ashr_8_use)
1064
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 8 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $8 moves them to the bottom
 * of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself under
 * imm8 0x3a (masked-negative polarity) leaves the index of its first NUL
 * byte in %ecx.  %ecx > 7 means none of those 8 bytes is a NUL, so the
 * loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1065 .p2align 4
1066LABEL(nibble_ashr_8_use):
1067 sub $0x1000, %r10
1068 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1069 psrldq $8, D(%xmm0)
d9a4d2ab
UD
1070 pcmpistri $0x3a,%xmm0, %xmm0
1071#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1072 cmp %r11, %rcx
1073 jae LABEL(nibble_ashr_exit_use)
1074#endif
1075 cmp $7, %ecx
1076 ja LABEL(nibble_ashr_8_restart_use)
1077
1078 jmp LABEL(nibble_ashr_exit_use)
1079
1080/*
1081 * The following cases will be handled by ashr_9
1082 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1083 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1084 */
/*
 * ashr_9: check the first pair of blocks, then run loop_ashr_9_use,
 * which compares 16 bytes per step and uses palignr $9 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1085 .p2align 4
1086LABEL(ashr_9):
618280a1 1087 pslldq $7, D(%xmm2)
d9a4d2ab 1088 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1089 pcmpeqb %xmm1, D(%xmm2)
1090 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1091 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1092 shr %cl, %edx
1093 shr %cl, %r9d
1094 sub %r9d, %edx
1095 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1096 movdqa (%rdi), %xmm3
1097
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1098 UPDATE_STRNCMP_COUNTER
1099
d9a4d2ab
UD
1100 mov $16, %rcx /* index for loads */
1101 mov $9, %r9d /* byte position left over from less32bytes case */
1102 /*
1103 * Setup %r10 value allows us to detect crossing a page boundary.
1104 * When %r10 goes positive we have crossed a page boundary and
1105 * need to do a nibble.
1106 */
1107 lea 9(%rdi), %r10
1108 and $0xfff, %r10 /* offset into 4K page */
1109 sub $0x1000, %r10 /* subtract 4K pagesize */
1110 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1111
/* Main loop, unrolled twice; %rdx is the running block offset. */
1112 .p2align 4
1113LABEL(loop_ashr_9_use):
1114 add $16, %r10
1115 jg LABEL(nibble_ashr_9_use)
1116
1117LABEL(nibble_ashr_9_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 9. */
1118 movdqa (%rdi, %rdx), %xmm0
1119
618280a1 1120 palignr $9, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1121#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1122 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1123#else
1124 movdqa (%rsi,%rdx), %xmm1
1125 TOLOWER (%xmm0, %xmm1)
1126 pcmpistri $0x1a, %xmm1, %xmm0
1127#endif
1128 jbe LABEL(exit_use)
1129#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1130 sub $16, %r11
1131 jbe LABEL(strcmp_exitz)
1132#endif
1133
1134 add $16, %rdx
1135 add $16, %r10
1136 jg LABEL(nibble_ashr_9_use)
1137
/* Second copy of the loop body. */
1138 movdqa (%rdi, %rdx), %xmm0
618280a1 1139 palignr $9, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1140#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1141 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1142#else
1143 movdqa (%rsi,%rdx), %xmm1
1144 TOLOWER (%xmm0, %xmm1)
1145 pcmpistri $0x1a, %xmm1, %xmm0
1146#endif
1147 jbe LABEL(exit_use)
1148#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1149 sub $16, %r11
1150 jbe LABEL(strcmp_exitz)
1151#endif
1152 add $16, %rdx
1153 jmp LABEL(loop_ashr_9_use)
1154
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 7 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $9 moves them to the bottom
 * of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself under
 * imm8 0x3a (masked-negative polarity) leaves the index of its first NUL
 * byte in %ecx.  %ecx > 6 means none of those 7 bytes is a NUL, so the
 * loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1155 .p2align 4
1156LABEL(nibble_ashr_9_use):
1157 sub $0x1000, %r10
1158 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1159 psrldq $9, D(%xmm0)
d9a4d2ab
UD
1160 pcmpistri $0x3a,%xmm0, %xmm0
1161#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1162 cmp %r11, %rcx
1163 jae LABEL(nibble_ashr_exit_use)
1164#endif
1165 cmp $6, %ecx
1166 ja LABEL(nibble_ashr_9_restart_use)
1167
1168 jmp LABEL(nibble_ashr_exit_use)
1169
1170/*
1171 * The following cases will be handled by ashr_10
1172 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1173 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1174 */
/*
 * ashr_10: check the first pair of blocks, then run loop_ashr_10_use,
 * which compares 16 bytes per step and uses palignr $10 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1175 .p2align 4
1176LABEL(ashr_10):
618280a1 1177 pslldq $6, D(%xmm2)
d9a4d2ab 1178 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1179 pcmpeqb %xmm1, D(%xmm2)
1180 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1181 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1182 shr %cl, %edx
1183 shr %cl, %r9d
1184 sub %r9d, %edx
1185 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1186 movdqa (%rdi), %xmm3
1187
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1188 UPDATE_STRNCMP_COUNTER
1189
d9a4d2ab
UD
1190 mov $16, %rcx /* index for loads */
1191 mov $10, %r9d /* byte position left over from less32bytes case */
1192 /*
1193 * Setup %r10 value allows us to detect crossing a page boundary.
1194 * When %r10 goes positive we have crossed a page boundary and
1195 * need to do a nibble.
1196 */
1197 lea 10(%rdi), %r10
1198 and $0xfff, %r10 /* offset into 4K page */
1199 sub $0x1000, %r10 /* subtract 4K pagesize */
1200 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1201
/* Main loop, unrolled twice; %rdx is the running block offset. */
1202 .p2align 4
1203LABEL(loop_ashr_10_use):
1204 add $16, %r10
1205 jg LABEL(nibble_ashr_10_use)
1206
1207LABEL(nibble_ashr_10_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 10. */
1208 movdqa (%rdi, %rdx), %xmm0
618280a1 1209 palignr $10, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1210#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1211 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1212#else
1213 movdqa (%rsi,%rdx), %xmm1
1214 TOLOWER (%xmm0, %xmm1)
1215 pcmpistri $0x1a, %xmm1, %xmm0
1216#endif
1217 jbe LABEL(exit_use)
1218#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1219 sub $16, %r11
1220 jbe LABEL(strcmp_exitz)
1221#endif
1222
1223 add $16, %rdx
1224 add $16, %r10
1225 jg LABEL(nibble_ashr_10_use)
1226
/* Second copy of the loop body. */
1227 movdqa (%rdi, %rdx), %xmm0
618280a1 1228 palignr $10, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1229#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1230 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1231#else
1232 movdqa (%rsi,%rdx), %xmm1
1233 TOLOWER (%xmm0, %xmm1)
1234 pcmpistri $0x1a, %xmm1, %xmm0
1235#endif
1236 jbe LABEL(exit_use)
1237#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1238 sub $16, %r11
1239 jbe LABEL(strcmp_exitz)
1240#endif
1241 add $16, %rdx
1242 jmp LABEL(loop_ashr_10_use)
1243
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 6 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $10 moves them to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 5 means none of those 6 bytes is a
 * NUL, so the loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1244 .p2align 4
1245LABEL(nibble_ashr_10_use):
1246 sub $0x1000, %r10
1247 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1248 psrldq $10, D(%xmm0)
d9a4d2ab
UD
1249 pcmpistri $0x3a,%xmm0, %xmm0
1250#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1251 cmp %r11, %rcx
1252 jae LABEL(nibble_ashr_exit_use)
1253#endif
1254 cmp $5, %ecx
1255 ja LABEL(nibble_ashr_10_restart_use)
1256
1257 jmp LABEL(nibble_ashr_exit_use)
1258
1259/*
1260 * The following cases will be handled by ashr_11
1261 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1262 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1263 */
/*
 * ashr_11: check the first pair of blocks, then run loop_ashr_11_use,
 * which compares 16 bytes per step and uses palignr $11 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1264 .p2align 4
1265LABEL(ashr_11):
618280a1 1266 pslldq $5, D(%xmm2)
d9a4d2ab 1267 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1268 pcmpeqb %xmm1, D(%xmm2)
1269 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1270 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1271 shr %cl, %edx
1272 shr %cl, %r9d
1273 sub %r9d, %edx
1274 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1275 movdqa (%rdi), %xmm3
1276
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1277 UPDATE_STRNCMP_COUNTER
1278
d9a4d2ab
UD
1279 mov $16, %rcx /* index for loads */
1280 mov $11, %r9d /* byte position left over from less32bytes case */
1281 /*
1282 * Setup %r10 value allows us to detect crossing a page boundary.
1283 * When %r10 goes positive we have crossed a page boundary and
1284 * need to do a nibble.
1285 */
1286 lea 11(%rdi), %r10
1287 and $0xfff, %r10 /* offset into 4K page */
1288 sub $0x1000, %r10 /* subtract 4K pagesize */
1289 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1290
/* Main loop, unrolled twice; %rdx is the running block offset. */
1291 .p2align 4
1292LABEL(loop_ashr_11_use):
1293 add $16, %r10
1294 jg LABEL(nibble_ashr_11_use)
1295
1296LABEL(nibble_ashr_11_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 11. */
1297 movdqa (%rdi, %rdx), %xmm0
618280a1 1298 palignr $11, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1299#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1300 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1301#else
1302 movdqa (%rsi,%rdx), %xmm1
1303 TOLOWER (%xmm0, %xmm1)
1304 pcmpistri $0x1a, %xmm1, %xmm0
1305#endif
1306 jbe LABEL(exit_use)
1307#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1308 sub $16, %r11
1309 jbe LABEL(strcmp_exitz)
1310#endif
1311
1312 add $16, %rdx
1313 add $16, %r10
1314 jg LABEL(nibble_ashr_11_use)
1315
/* Second copy of the loop body. */
1316 movdqa (%rdi, %rdx), %xmm0
618280a1 1317 palignr $11, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1319 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1320#else
1321 movdqa (%rsi,%rdx), %xmm1
1322 TOLOWER (%xmm0, %xmm1)
1323 pcmpistri $0x1a, %xmm1, %xmm0
1324#endif
1325 jbe LABEL(exit_use)
1326#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1327 sub $16, %r11
1328 jbe LABEL(strcmp_exitz)
1329#endif
1330 add $16, %rdx
1331 jmp LABEL(loop_ashr_11_use)
1332
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 5 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $11 moves them to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 4 means none of those 5 bytes is a
 * NUL, so the loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1333 .p2align 4
1334LABEL(nibble_ashr_11_use):
1335 sub $0x1000, %r10
1336 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1337 psrldq $11, D(%xmm0)
d9a4d2ab
UD
1338 pcmpistri $0x3a,%xmm0, %xmm0
1339#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1340 cmp %r11, %rcx
1341 jae LABEL(nibble_ashr_exit_use)
1342#endif
1343 cmp $4, %ecx
1344 ja LABEL(nibble_ashr_11_restart_use)
1345
1346 jmp LABEL(nibble_ashr_exit_use)
1347
1348/*
1349 * The following cases will be handled by ashr_12
1350 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1351 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1352 */
/*
 * ashr_12: check the first pair of blocks, then run loop_ashr_12_use,
 * which compares 16 bytes per step and uses palignr $12 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1353 .p2align 4
1354LABEL(ashr_12):
618280a1 1355 pslldq $4, D(%xmm2)
d9a4d2ab 1356 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1357 pcmpeqb %xmm1, D(%xmm2)
1358 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1359 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1360 shr %cl, %edx
1361 shr %cl, %r9d
1362 sub %r9d, %edx
1363 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1364 movdqa (%rdi), %xmm3
1365
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1366 UPDATE_STRNCMP_COUNTER
1367
d9a4d2ab
UD
1368 mov $16, %rcx /* index for loads */
1369 mov $12, %r9d /* byte position left over from less32bytes case */
1370 /*
1371 * Setup %r10 value allows us to detect crossing a page boundary.
1372 * When %r10 goes positive we have crossed a page boundary and
1373 * need to do a nibble.
1374 */
1375 lea 12(%rdi), %r10
1376 and $0xfff, %r10 /* offset into 4K page */
1377 sub $0x1000, %r10 /* subtract 4K pagesize */
1378 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1379
/* Main loop, unrolled twice; %rdx is the running block offset. */
1380 .p2align 4
1381LABEL(loop_ashr_12_use):
1382 add $16, %r10
1383 jg LABEL(nibble_ashr_12_use)
1384
1385LABEL(nibble_ashr_12_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 12. */
1386 movdqa (%rdi, %rdx), %xmm0
618280a1 1387 palignr $12, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1388#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1389 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1390#else
1391 movdqa (%rsi,%rdx), %xmm1
1392 TOLOWER (%xmm0, %xmm1)
1393 pcmpistri $0x1a, %xmm1, %xmm0
1394#endif
1395 jbe LABEL(exit_use)
1396#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1397 sub $16, %r11
1398 jbe LABEL(strcmp_exitz)
1399#endif
1400
1401 add $16, %rdx
1402 add $16, %r10
1403 jg LABEL(nibble_ashr_12_use)
1404
/* Second copy of the loop body. */
1405 movdqa (%rdi, %rdx), %xmm0
618280a1 1406 palignr $12, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1407#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1408 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1409#else
1410 movdqa (%rsi,%rdx), %xmm1
1411 TOLOWER (%xmm0, %xmm1)
1412 pcmpistri $0x1a, %xmm1, %xmm0
1413#endif
1414 jbe LABEL(exit_use)
1415#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1416 sub $16, %r11
1417 jbe LABEL(strcmp_exitz)
1418#endif
1419 add $16, %rdx
1420 jmp LABEL(loop_ashr_12_use)
1421
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 4 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $12 moves them to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 3 means none of those 4 bytes is a
 * NUL, so the loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1422 .p2align 4
1423LABEL(nibble_ashr_12_use):
1424 sub $0x1000, %r10
1425 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1426 psrldq $12, D(%xmm0)
d9a4d2ab
UD
1427 pcmpistri $0x3a,%xmm0, %xmm0
1428#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1429 cmp %r11, %rcx
1430 jae LABEL(nibble_ashr_exit_use)
1431#endif
1432 cmp $3, %ecx
1433 ja LABEL(nibble_ashr_12_restart_use)
1434
1435 jmp LABEL(nibble_ashr_exit_use)
1436
1437/*
1438 * The following cases will be handled by ashr_13
1439 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1440 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1441 */
/*
 * ashr_13: check the first pair of blocks, then run loop_ashr_13_use,
 * which compares 16 bytes per step and uses palignr $13 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1442 .p2align 4
1443LABEL(ashr_13):
618280a1 1444 pslldq $3, D(%xmm2)
d9a4d2ab 1445 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1446 pcmpeqb %xmm1, D(%xmm2)
1447 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1448 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1449 shr %cl, %edx
1450 shr %cl, %r9d
1451 sub %r9d, %edx
1452 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1453 movdqa (%rdi), %xmm3
1454
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1455 UPDATE_STRNCMP_COUNTER
1456
d9a4d2ab
UD
1457 mov $16, %rcx /* index for loads */
1458 mov $13, %r9d /* byte position left over from less32bytes case */
1459 /*
1460 * Setup %r10 value allows us to detect crossing a page boundary.
1461 * When %r10 goes positive we have crossed a page boundary and
1462 * need to do a nibble.
1463 */
1464 lea 13(%rdi), %r10
1465 and $0xfff, %r10 /* offset into 4K page */
1466 sub $0x1000, %r10 /* subtract 4K pagesize */
1467
1468 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1469
/* Main loop, unrolled twice; %rdx is the running block offset. */
1470 .p2align 4
1471LABEL(loop_ashr_13_use):
1472 add $16, %r10
1473 jg LABEL(nibble_ashr_13_use)
1474
1475LABEL(nibble_ashr_13_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 13. */
1476 movdqa (%rdi, %rdx), %xmm0
618280a1 1477 palignr $13, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1478#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1479 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1480#else
1481 movdqa (%rsi,%rdx), %xmm1
1482 TOLOWER (%xmm0, %xmm1)
1483 pcmpistri $0x1a, %xmm1, %xmm0
1484#endif
1485 jbe LABEL(exit_use)
1486#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1487 sub $16, %r11
1488 jbe LABEL(strcmp_exitz)
1489#endif
1490
1491 add $16, %rdx
1492 add $16, %r10
1493 jg LABEL(nibble_ashr_13_use)
1494
/* Second copy of the loop body. */
1495 movdqa (%rdi, %rdx), %xmm0
618280a1 1496 palignr $13, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1497#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1498 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1499#else
1500 movdqa (%rsi,%rdx), %xmm1
1501 TOLOWER (%xmm0, %xmm1)
1502 pcmpistri $0x1a, %xmm1, %xmm0
1503#endif
1504 jbe LABEL(exit_use)
1505#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1506 sub $16, %r11
1507 jbe LABEL(strcmp_exitz)
1508#endif
1509 add $16, %rdx
1510 jmp LABEL(loop_ashr_13_use)
1511
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 3 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $13 moves them to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 2 means none of those 3 bytes is a
 * NUL, so the loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1512 .p2align 4
1513LABEL(nibble_ashr_13_use):
1514 sub $0x1000, %r10
1515 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1516 psrldq $13, D(%xmm0)
d9a4d2ab
UD
1517 pcmpistri $0x3a,%xmm0, %xmm0
1518#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1519 cmp %r11, %rcx
1520 jae LABEL(nibble_ashr_exit_use)
1521#endif
1522 cmp $2, %ecx
1523 ja LABEL(nibble_ashr_13_restart_use)
1524
1525 jmp LABEL(nibble_ashr_exit_use)
1526
1527/*
1528 * The following cases will be handled by ashr_14
1529 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1530 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1531 */
/*
 * ashr_14: check the first pair of blocks, then run loop_ashr_14_use,
 * which compares 16 bytes per step and uses palignr $14 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1532 .p2align 4
1533LABEL(ashr_14):
618280a1 1534 pslldq $2, D(%xmm2)
d9a4d2ab 1535 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1536 pcmpeqb %xmm1, D(%xmm2)
1537 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1538 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1539 shr %cl, %edx
1540 shr %cl, %r9d
1541 sub %r9d, %edx
1542 jnz LABEL(less32bytes)
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1543 movdqa (%rdi), %xmm3
1544
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1545 UPDATE_STRNCMP_COUNTER
1546
d9a4d2ab
UD
1547 mov $16, %rcx /* index for loads */
1548 mov $14, %r9d /* byte position left over from less32bytes case */
1549 /*
1550 * Setup %r10 value allows us to detect crossing a page boundary.
1551 * When %r10 goes positive we have crossed a page boundary and
1552 * need to do a nibble.
1553 */
1554 lea 14(%rdi), %r10
1555 and $0xfff, %r10 /* offset into 4K page */
1556 sub $0x1000, %r10 /* subtract 4K pagesize */
1557
1558 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1559
/* Main loop, unrolled twice; %rdx is the running block offset. */
1560 .p2align 4
1561LABEL(loop_ashr_14_use):
1562 add $16, %r10
1563 jg LABEL(nibble_ashr_14_use)
1564
1565LABEL(nibble_ashr_14_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 14. */
1566 movdqa (%rdi, %rdx), %xmm0
618280a1 1567 palignr $14, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1568#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1569 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1570#else
1571 movdqa (%rsi,%rdx), %xmm1
1572 TOLOWER (%xmm0, %xmm1)
1573 pcmpistri $0x1a, %xmm1, %xmm0
1574#endif
1575 jbe LABEL(exit_use)
1576#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1577 sub $16, %r11
1578 jbe LABEL(strcmp_exitz)
1579#endif
1580
1581 add $16, %rdx
1582 add $16, %r10
1583 jg LABEL(nibble_ashr_14_use)
1584
/* Second copy of the loop body. */
1585 movdqa (%rdi, %rdx), %xmm0
618280a1 1586 palignr $14, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1587#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1588 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1589#else
1590 movdqa (%rsi,%rdx), %xmm1
1591 TOLOWER (%xmm0, %xmm1)
1592 pcmpistri $0x1a, %xmm1, %xmm0
1593#endif
1594 jbe LABEL(exit_use)
1595#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1596 sub $16, %r11
1597 jbe LABEL(strcmp_exitz)
1598#endif
1599 add $16, %rdx
1600 jmp LABEL(loop_ashr_14_use)
1601
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the 2 bytes of the previous %rdi block that have
 * not been compared yet are examined: psrldq $14 moves them to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 1 means neither of those 2 bytes is a
 * NUL, so the loop resumes; otherwise the comparison is finished at
 * nibble_ashr_exit_use without another 16-byte load from %rdi.
 * strncmp variants also finish there when %rcx >= %r11.
 */
1602 .p2align 4
1603LABEL(nibble_ashr_14_use):
1604 sub $0x1000, %r10
1605 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1606 psrldq $14, D(%xmm0)
d9a4d2ab
UD
1607 pcmpistri $0x3a,%xmm0, %xmm0
1608#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1609 cmp %r11, %rcx
1610 jae LABEL(nibble_ashr_exit_use)
1611#endif
1612 cmp $1, %ecx
1613 ja LABEL(nibble_ashr_14_restart_use)
1614
1615 jmp LABEL(nibble_ashr_exit_use)
1616
1617/*
1618 * The following cases will be handled by ashr_15
1619 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1620 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1621 */
/*
 * ashr_15: check the first pair of blocks, then run loop_ashr_15_use,
 * which compares 16 bytes per step and uses palignr $15 to line the
 * %rdi stream up with the block read at (%rsi,%rdx).
 * NOTE(review): %xmm0-%xmm2, %edx, %ecx, %rax, %r8d and %r11 are set up
 * before this excerpt, and D()/TOLOWER are macros defined earlier in the
 * file -- confirm their exact contracts there.
 */
1622 .p2align 4
1623LABEL(ashr_15):
618280a1 1624 pslldq $1, D(%xmm2)
d9a4d2ab 1625 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1626 pcmpeqb %xmm1, D(%xmm2)
1627 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1628 pmovmskb %xmm2, %r9d
/* Drop the low %cl bits of both masks.  A nonzero difference sends the
   first blocks to less32bytes (presumably a mismatch or terminator). */
1629 shr %cl, %edx
1630 shr %cl, %r9d
1631 sub %r9d, %edx
1632 jnz LABEL(less32bytes)
1633
/* NOTE(review): %xmm3 is not read again anywhere in this excerpt. */
1634 movdqa (%rdi), %xmm3
1635
/* strncmp variants only: %r11 += %rcx - 16, returning 0 if that wraps
   or reaches zero.  Expands to nothing for the other variants. */
1636 UPDATE_STRNCMP_COUNTER
1637
d9a4d2ab
UD
1638 mov $16, %rcx /* index for loads */
1639 mov $15, %r9d /* byte position left over from less32bytes case */
1640 /*
1641 * Setup %r10 value allows us to detect crossing a page boundary.
1642 * When %r10 goes positive we have crossed a page boundary and
1643 * need to do a nibble.
1644 */
1645 lea 15(%rdi), %r10
1646 and $0xfff, %r10 /* offset into 4K page */
1647
1648 sub $0x1000, %r10 /* subtract 4K pagesize */
1649
1650 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1651
/* Main loop, unrolled twice; %rdx is the running block offset. */
1652 .p2align 4
1653LABEL(loop_ashr_15_use):
1654 add $16, %r10
1655 jg LABEL(nibble_ashr_15_use)
1656
1657LABEL(nibble_ashr_15_restart_use):
/* %xmm0 = the 16 bytes starting at %rdi + %rdx - 16 + 15. */
1658 movdqa (%rdi, %rdx), %xmm0
618280a1 1659 palignr $15, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
/* imm8 0x1a = signed bytes, equal-each, negative polarity, lowest index:
   CF=1 with %ecx = index of the first mismatching byte; ZF=1 if the
   %rsi-side operand contains a NUL.  jbe exits on either. */
1660#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1661 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1662#else
1663 movdqa (%rsi,%rdx), %xmm1
1664 TOLOWER (%xmm0, %xmm1)
1665 pcmpistri $0x1a, %xmm1, %xmm0
1666#endif
1667 jbe LABEL(exit_use)
1668#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1669 sub $16, %r11
1670 jbe LABEL(strcmp_exitz)
1671#endif
1672
1673 add $16, %rdx
1674 add $16, %r10
1675 jg LABEL(nibble_ashr_15_use)
1676
/* Second copy of the loop body. */
1677 movdqa (%rdi, %rdx), %xmm0
618280a1 1678 palignr $15, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1679#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1680 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1681#else
1682 movdqa (%rsi,%rdx), %xmm1
1683 TOLOWER (%xmm0, %xmm1)
1684 pcmpistri $0x1a, %xmm1, %xmm0
1685#endif
1686 jbe LABEL(exit_use)
1687#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1688 sub $16, %r11
1689 jbe LABEL(strcmp_exitz)
1690#endif
1691 add $16, %rdx
1692 jmp LABEL(loop_ashr_15_use)
1693
/*
 * Reached when %r10 turns positive; %r10 is first moved back by one
 * page (0x1000).  Only the single byte of the previous %rdi block that
 * has not been compared yet is examined: psrldq $15 moves it to the
 * bottom of %xmm0 and zero-fills the top.  Comparing %xmm0 with itself
 * under imm8 0x3a (masked-negative polarity) leaves the index of its
 * first NUL byte in %ecx.  %ecx > 0 means that byte is not a NUL, so the
 * loop resumes; otherwise execution falls through to the
 * nibble_ashr_exit_use label that follows this block, which finishes
 * the comparison without another 16-byte load from %rdi.
 * strncmp variants also go there when %rcx >= %r11.
 */
1694 .p2align 4
1695LABEL(nibble_ashr_15_use):
1696 sub $0x1000, %r10
1697 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1698 psrldq $15, D(%xmm0)
d9a4d2ab
UD
1699 pcmpistri $0x3a,%xmm0, %xmm0
1700#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1701 cmp %r11, %rcx
1702 jae LABEL(nibble_ashr_exit_use)
1703#endif
1704 cmp $0, %ecx
1705 ja LABEL(nibble_ashr_15_restart_use)
1706
/*
 * Common exit path of the ashr_N loops.
 * nibble_ashr_exit_use performs one last 16-byte compare of %xmm0
 * against the block at (%rsi,%rdx) and falls into exit_use.
 * exit_use expects the flags and %ecx left by pcmpistri $0x1a:
 *   - CF clear: no mismatching byte was found, return 0.
 *   - CF set: %ecx is the index of the mismatching byte.
 * The two bytes are then loaded, swapped when %r8d is non-zero, and
 * their difference is returned in %eax.
 */
1707LABEL(nibble_ashr_exit_use):
1708#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1709 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1710#else
1711 movdqa (%rsi,%rdx), %xmm1
1712 TOLOWER (%xmm0, %xmm1)
1713 pcmpistri $0x1a, %xmm1, %xmm0
1714#endif
1715 .p2align 4
1716LABEL(exit_use):
1717 jnc LABEL(strcmp_exitz)
1718#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* strncmp variants: a mismatch at or beyond the remaining count in
   %r11 does not count, so return 0. */
1719 sub %rcx, %r11
1720 jbe LABEL(strcmp_exitz)
1721#endif
/* %rdx = offset of the mismatching byte.  %rdi is rebased by %r9 - 16
   (%r9 is the shift count loaded by the ashr_N code) so that the same
   offset addresses the corresponding byte of the other string. */
1722 add %rcx, %rdx
1723 lea -16(%rdi, %r9), %rdi
1724 movzbl (%rdi, %rdx), %eax
1725 movzbl (%rsi, %rdx), %edx
/* Non-zero %r8d: exchange the two bytes before subtracting.
   NOTE(review): presumably %r8d records an earlier operand swap, as
   the comment in less32bytes suggests -- confirm where it is set. */
1726 test %r8d, %r8d
1727 jz LABEL(ret_use)
1728 xchg %eax, %edx
1729LABEL(ret_use):
1730#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: map both bytes through the 4-byte-entry
   table at _nl_C_LC_CTYPE_tolower + 128*4. */
1731 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1732 movl (%rcx,%rdx,4), %edx
1733 movl (%rcx,%rax,4), %eax
1734#endif
1735
1736 sub %edx, %eax
1737 ret
1738
/*
 * less32bytes: reached from the ashr_N entry checks when the shifted
 * masks differ.  Both pointers are advanced by the offsets held in %rax
 * and %rcx, and swapped back when %r8d is non-zero.
 * ret / less16bytes: bsf turns the mask in %rdx into the index of its
 * lowest set bit, and the difference of the bytes at that index,
 * (%rdi) minus (%rsi), is returned in %eax.
 * NOTE(review): less16bytes may have other callers outside this
 * excerpt; their expectations for %rdx are not visible here.
 */
1739LABEL(less32bytes):
1740 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1741 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1742 test %r8d, %r8d
1743 jz LABEL(ret)
1744 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1745
1746 .p2align 4
1747LABEL(ret):
1748LABEL(less16bytes):
1749 bsf %rdx, %rdx /* find and store bit index in %rdx */
1750
1751#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* strncmp variants: return 0 when that index is at or beyond the
   remaining count in %r11. */
1752 sub %rdx, %r11
1753 jbe LABEL(strcmp_exitz)
1754#endif
1755 movzbl (%rsi, %rdx), %ecx
1756 movzbl (%rdi, %rdx), %eax
1757
1758#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: map both bytes through the 4-byte-entry
   table at _nl_C_LC_CTYPE_tolower + 128*4. */
1759 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1760 movl (%rdx,%rcx,4), %ecx
1761 movl (%rdx,%rax,4), %eax
1762#endif
1763
1764 sub %ecx, %eax
1765 ret
1766
/* strcmp_exitz: shared "strings compare equal" exit; returns 0 in %eax. */
1767LABEL(strcmp_exitz):
1768 xor %eax, %eax
1769 ret
1770
1771 .p2align 4
1772 // XXX Same as code above
/*
 * Byte0: return the difference between the first bytes of the two
 * strings, (%rdi) minus (%rsi), each zero-extended to 32 bits.  The
 * case-insensitive variants first map both bytes through the
 * 4-byte-entry table at _nl_C_LC_CTYPE_tolower + 128*4.
 * movzbl is used instead of the bare movzx mnemonic: with a memory
 * source, movzx does not state the source size in AT&T syntax, while
 * movzbl states the byte load explicitly and matches the loads in
 * less16bytes.
 */
1773LABEL(Byte0):
1774 movzbl (%rsi), %ecx
1775 movzbl (%rdi), %eax
1776
1777#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1778 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1779 movl (%rdx,%rcx,4), %ecx
1780 movl (%rdx,%rax,4), %eax
1781#endif
1782
1783 sub %ecx, %eax
1784 ret
1785 cfi_endproc
1786 .size STRCMP_SSE42, .-STRCMP_SSE42
1787
1788#undef UCLOW_reg
1789#undef UCHIGH_reg
1790#undef LCQWORD_reg
1791#undef TOLOWER
1792
1793 /* Put all SSE 4.2 functions together. */
/*
 * unaligned_table: sixteen 32-bit entries in a read-only data section,
 * aligned to 8 bytes.  Each entry is the offset of one ashr_N entry
 * point relative to the start of the table itself.  The order is
 * ashr_1 .. ashr_15, followed by ashr_0 as the last entry.
 * NOTE(review): the code that indexes this table is outside this
 * excerpt; presumably it dispatches on the relative misalignment of
 * the two strings -- confirm against the function prologue.
 */
1794 .section .rodata.SECTION,"a",@progbits
1795 .p2align 3
1796LABEL(unaligned_table):
1797 .int LABEL(ashr_1) - LABEL(unaligned_table)
1798 .int LABEL(ashr_2) - LABEL(unaligned_table)
1799 .int LABEL(ashr_3) - LABEL(unaligned_table)
1800 .int LABEL(ashr_4) - LABEL(unaligned_table)
1801 .int LABEL(ashr_5) - LABEL(unaligned_table)
1802 .int LABEL(ashr_6) - LABEL(unaligned_table)
1803 .int LABEL(ashr_7) - LABEL(unaligned_table)
1804 .int LABEL(ashr_8) - LABEL(unaligned_table)
1805 .int LABEL(ashr_9) - LABEL(unaligned_table)
1806 .int LABEL(ashr_10) - LABEL(unaligned_table)
1807 .int LABEL(ashr_11) - LABEL(unaligned_table)
1808 .int LABEL(ashr_12) - LABEL(unaligned_table)
1809 .int LABEL(ashr_13) - LABEL(unaligned_table)
1810 .int LABEL(ashr_14) - LABEL(unaligned_table)
1811 .int LABEL(ashr_15) - LABEL(unaligned_table)
1812 .int LABEL(ashr_0) - LABEL(unaligned_table)
1813
1814#undef LABEL
1815#undef GLABEL
1816#undef SECTION
618280a1
UD
1817#undef movdqa
1818#undef movdqu
1819#undef pmovmskb
1820#undef pcmpistri
1821#undef psubb
1822#undef pcmpeqb
1823#undef psrldq
1824#undef pslldq
1825#undef palignr
1826#undef pxor
1827#undef D