]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/x86_64/multiarch/strcmp-sse42.S
Load cache sizes into R*_LP in memcpy-ssse3.S
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strcmp-sse42.S
CommitLineData
d9a4d2ab
UD
1/* strcmp with SSE4.2
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
d9a4d2ab
UD
19
20
21/* We use 0x1a:
22 _SIDD_SBYTE_OPS
23 | _SIDD_CMP_EQUAL_EACH
24 | _SIDD_NEGATIVE_POLARITY
25 | _SIDD_LEAST_SIGNIFICANT
26 on pcmpistri to find out if two 16byte data elements are the same
27 and the offset of the first different byte. There are 4 cases:
28
29 1. Both 16byte data elements are valid and identical.
30 2. Both 16byte data elements have EOS and are identical.
31 3. Both 16byte data elements are valid and they differ at offset X.
32 4. At least one 16byte data element has EOS at offset X. Two 16byte
33 data elements must differ at or before offset X.
34
35 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
36
37 case ECX CFlag ZFlag SFlag
38 1 16 0 0 0
39 2 16 0 1 1
40 3 X 1 0 0
41 4 0 <= X 1 0/1 0/1
42
43 We exit from the loop for cases 2, 3 and 4 with jbe which branches
44 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
45 case 2. */
46
47 /* Put all SSE 4.2 functions together. */
48 .section .text.SECTION,"ax",@progbits
49 .align 16
50 .type STRCMP_SSE42, @function
51#ifdef USE_AS_STRCASECMP_L
 /* Entry point that takes no locale argument: load the locale pointer
    from thread-local storage into %rdx, the register the locale check
    at the top of STRCMP_SSE42 reads it from, then fall through. */
52ENTRY (GLABEL(__strcasecmp))
53 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE */
54 movq %fs:(%rax),%rdx /* %rdx = value stored at that offset from %fs */
55
56 // XXX 5 byte should be before the function
57 /* 5-byte NOP. */
58 .byte 0x0f,0x1f,0x44,0x00,0x00
59END (GLABEL(__strcasecmp))
60 /* FALLTHROUGH to strcasecmp_l. */
61#endif
62#ifdef USE_AS_STRNCASECMP_L
 /* Entry point that takes no locale argument: load the locale pointer
    from thread-local storage into %rcx, the register the locale check
    at the top of STRCMP_SSE42 reads it from, then fall through. */
63ENTRY (GLABEL(__strncasecmp))
64 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = TLS offset of __libc_tsd_LOCALE */
65 movq %fs:(%rax),%rcx /* %rcx = value stored at that offset from %fs */
66
67 // XXX 5 byte should be before the function
68 /* 5-byte NOP. */
69 .byte 0x0f,0x1f,0x44,0x00,0x00
70END (GLABEL(__strncasecmp))
71 /* FALLTHROUGH to strncasecmp_l. */
72#endif
73
618280a1
UD
74
75#ifdef USE_AVX
 /* AVX build of the same source: every SSE mnemonic used below is
    redirected to its v-prefixed counterpart. */
76# define movdqa vmovdqa
77# define movdqu vmovdqu
78# define pmovmskb vpmovmskb
79# define pcmpistri vpcmpistri
80# define psubb vpsubb
81# define pcmpeqb vpcmpeqb
82# define psrldq vpsrldq
83# define pslldq vpslldq
84# define palignr vpalignr
85# define pxor vpxor
 /* D(reg) spells a destination operand.  In the AVX build it expands to
    the register twice, supplying the extra operand of the v-prefixed
    forms; in the SSE build it is the register alone. */
86# define D(arg) arg, arg
87#else
88# define D(arg) arg
89#endif
90
d9a4d2ab
UD
91STRCMP_SSE42:
 /* Register roles set up in this block, as read from the code below:
      %rdi, %rsi   the two strings being compared
      %r11         copy of the count in %rdx (strncmp/strncasecmp_l builds)
      %rdx / %rcx  locale pointer on entry (strcasecmp_l / strncasecmp_l)
      %ecx, %eax   offsets of %rsi and %rdi inside a 64-byte cache line
      %xmm4-%xmm6  TOLOWER constants (case-insensitive builds only)  */
92 cfi_startproc
93 CALL_MCOUNT
94
95/*
96 * This implementation uses SSE to compare up to 16 bytes at a time.
97 */
98#ifdef USE_AS_STRCASECMP_L
99 /* We have to fall back on the C implementation for locales
100 with encodings not matching ASCII for single bytes. */
101# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
102 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
103# else
104 movq (%rdx), %rax
105# endif
34372fc6 106 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
d9a4d2ab
UD
107 jne __strcasecmp_l_nonascii
108#endif
109#ifdef USE_AS_STRNCASECMP_L
110 /* We have to fall back on the C implementation for locales
111 with encodings not matching ASCII for single bytes. */
112# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
113 movq LOCALE_T___LOCALES+LC_CTYPE*8(%rcx), %rax
114# else
115 movq (%rcx), %rax
116# endif
34372fc6 117 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
d9a4d2ab
UD
118 jne __strncasecmp_l_nonascii
119#endif
120
121#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
122 test %rdx, %rdx
123 je LABEL(strcmp_exitz) /* count == 0 */
124 cmp $1, %rdx
125 je LABEL(Byte0) /* count == 1 */
126 mov %rdx, %r11 /* %r11 = count; reduced by 16 per chunk below */
127#endif
128 mov %esi, %ecx
129 mov %edi, %eax
130/* Use 64bit AND here to avoid long NOP padding. */
131 and $0x3f, %rcx /* rsi alignment in cache line */
132 and $0x3f, %rax /* rdi alignment in cache line */
133#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Byte-replicated constants for TOLOWER.  topupper differs between the
    builds because the AVX macro tests "not (byte > topupper)" while the
    SSE macro tests "topupper > byte".  */
134 .section .rodata.cst16,"aM",@progbits,16
135 .align 16
136LABEL(belowupper):
137 .quad 0x4040404040404040
138 .quad 0x4040404040404040
139LABEL(topupper):
140# ifdef USE_AVX
141 .quad 0x5a5a5a5a5a5a5a5a
142 .quad 0x5a5a5a5a5a5a5a5a
143# else
144 .quad 0x5b5b5b5b5b5b5b5b
145 .quad 0x5b5b5b5b5b5b5b5b
146# endif
147LABEL(touppermask):
148 .quad 0x2020202020202020
149 .quad 0x2020202020202020
150 .previous
151 movdqa LABEL(belowupper)(%rip), %xmm4
152# define UCLOW_reg %xmm4
153 movdqa LABEL(topupper)(%rip), %xmm5
154# define UCHIGH_reg %xmm5
155 movdqa LABEL(touppermask)(%rip), %xmm6
156# define LCQWORD_reg %xmm6
157#endif
158 cmp $0x30, %ecx
159 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
160 cmp $0x30, %eax
161 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
162 movdqu (%rdi), %xmm1
163 movdqu (%rsi), %xmm2
164#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* TOLOWER (reg1, reg2): OR LCQWORD_reg (0x20) into every byte of reg1
    and reg2 whose signed value is above UCLOW_reg and not above 0x5a,
    i.e. 0x41..0x5a.  Uses %xmm7-%xmm10 as scratch.  */
165# ifdef USE_AVX
166# define TOLOWER(reg1, reg2) \
167 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
168 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
169 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
170 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
171 vpandn %xmm7, %xmm8, %xmm8; \
172 vpandn %xmm9, %xmm10, %xmm10; \
173 vpand LCQWORD_reg, %xmm8, %xmm8; \
174 vpand LCQWORD_reg, %xmm10, %xmm10; \
175 vpor reg1, %xmm8, reg1; \
176 vpor reg2, %xmm10, reg2
177# else
178# define TOLOWER(reg1, reg2) \
179 movdqa reg1, %xmm7; \
180 movdqa UCHIGH_reg, %xmm8; \
181 movdqa reg2, %xmm9; \
182 movdqa UCHIGH_reg, %xmm10; \
183 pcmpgtb UCLOW_reg, %xmm7; \
184 pcmpgtb reg1, %xmm8; \
185 pcmpgtb UCLOW_reg, %xmm9; \
186 pcmpgtb reg2, %xmm10; \
187 pand %xmm8, %xmm7; \
188 pand %xmm10, %xmm9; \
189 pand LCQWORD_reg, %xmm7; \
190 pand LCQWORD_reg, %xmm9; \
191 por %xmm7, reg1; \
192 por %xmm9, reg2
193# endif
194 TOLOWER (%xmm1, %xmm2)
195#else
 /* Case-sensitive builds: TOLOWER expands to nothing.  */
196# define TOLOWER(reg1, reg2)
197#endif
618280a1
UD
198 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
199 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
200 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
201 psubb %xmm0, D(%xmm1) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
202 pmovmskb %xmm1, %edx
203 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
204 jnz LABEL(less16bytes)/* If not, find different value or null char */
205#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
206 sub $16, %r11
207 jbe LABEL(strcmp_exitz)/* finish comparison */
208#endif
209 add $16, %rsi /* prepare to search next 16 bytes */
210 add $16, %rdi /* prepare to search next 16 bytes */
211
211
212 /*
213 * Determine source and destination string offsets from 16-byte
214 * alignment. Use relative offset difference between the two to
215 * determine which case below to use.
216 */
217 .p2align 4
218LABEL(crosscache):
 /* Round both pointers down to 16 bytes, keep their low four bits as
    offsets, and dispatch on the difference of the two offsets.  */
219 and $0xfffffffffffffff0, %rsi /* round %rsi down to a 16-byte boundary */
220 and $0xfffffffffffffff0, %rdi /* round %rdi down to a 16-byte boundary */
221 mov $0xffff, %edx /* for equivalent offset */
222 xor %r8d, %r8d /* %r8d = 0: operands not swapped */
223 and $0xf, %ecx /* offset of rsi */
224 and $0xf, %eax /* offset of rdi */
618280a1 225 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
d9a4d2ab
UD
226 cmp %eax, %ecx
227 je LABEL(ashr_0) /* rsi and rdi relative offset same */
228 ja LABEL(bigger) /* %ecx > %eax already */
 /* Otherwise swap the two strings and their offsets so that %ecx holds
    the larger offset; %r8d = 0xffff records that this happened.  */
229 mov %edx, %r8d /* r8d is offset flag for exit tail */
230 xchg %ecx, %eax
231 xchg %rsi, %rdi
232LABEL(bigger):
618280a1
UD
233 movdqa (%rdi), %xmm2
234 movdqa (%rsi), %xmm1
d9a4d2ab
UD
235 lea 15(%rax), %r9
236 sub %rcx, %r9 /* %r9 = 15 + %rax - %rcx, the table index */
237 lea LABEL(unaligned_table)(%rip), %r10
238 movslq (%r10, %r9,4), %r9 /* sign-extended 32-bit table entry */
618280a1 239 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
d9a4d2ab
UD
240 lea (%r10, %r9), %r10 /* entry is added to the table's own address */
241 jmp *%r10 /* jump to corresponding case */
242
243/*
244 * The following cases will be handled by ashr_0
245 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
246 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
247 */
248 .p2align 4
249LABEL(ashr_0):
 /* Both strings have the same offset inside their 16-byte chunk, so no
    byte shifting is needed.  %xmm0 is zero and %edx is 0xffff on entry
    (set in crosscache).  */
250
251 movdqa (%rsi), %xmm1
618280a1 252 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
d9a4d2ab 253#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
618280a1 254 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
d9a4d2ab
UD
255#else
256 movdqa (%rdi), %xmm2
257 TOLOWER (%xmm1, %xmm2)
618280a1 258 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
d9a4d2ab 259#endif
618280a1 260 psubb %xmm0, D(%xmm1) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
261 pmovmskb %xmm1, %r9d
262 shr %cl, %edx /* adjust 0xffff for offset */
263 shr %cl, %r9d /* adjust for 16-byte offset */
264 sub %r9d, %edx
265 /*
266 * edx equals r9d only if the remaining (16-rcx) bytes of both chunks
267 * match and no null char was seen.
268 */
269 jne LABEL(less32bytes) /* mismatch or null char */
270 UPDATE_STRNCMP_COUNTER
271 mov $16, %rcx
272 mov $16, %r9
d9a4d2ab
UD
273
274 /*
275 * Now both strings are aligned at 16-byte boundary. Loop over strings
276 * checking 32-bytes per iteration.
277 */
278 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
279 .p2align 4
280LABEL(ashr_0_use):
281 movdqa (%rdi,%rdx), %xmm0
282#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
283 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
284#else
285 movdqa (%rsi,%rdx), %xmm1
286 TOLOWER (%xmm0, %xmm1)
287 pcmpistri $0x1a, %xmm1, %xmm0
288#endif
289 lea 16(%rdx), %rdx /* advance index; lea keeps the pcmpistri flags */
290 jbe LABEL(ashr_0_exit_use) /* CF or ZF set: difference or EOS */
291#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
292 sub $16, %r11
293 jbe LABEL(strcmp_exitz) /* count used up */
294#endif
295
 /* Second half of the unrolled iteration; identical to the first.  */
296 movdqa (%rdi,%rdx), %xmm0
297#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
298 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
299#else
300 movdqa (%rsi,%rdx), %xmm1
301 TOLOWER (%xmm0, %xmm1)
302 pcmpistri $0x1a, %xmm1, %xmm0
303#endif
304 lea 16(%rdx), %rdx /* advance index; lea keeps the pcmpistri flags */
305 jbe LABEL(ashr_0_exit_use) /* CF or ZF set: difference or EOS */
306#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
307 sub $16, %r11
308 jbe LABEL(strcmp_exitz) /* count used up */
309#endif
310 jmp LABEL(ashr_0_use)
311
312
313 .p2align 4
314LABEL(ashr_0_exit_use):
 /* Reached with the pcmpistri flags still live and %ecx holding the
    index pcmpistri reported inside the chunk.  */
315 jnc LABEL(strcmp_exitz) /* CF clear: identical up to EOS (case 2) */
316#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
317 sub %rcx, %r11
318 jbe LABEL(strcmp_exitz) /* index is at or past the remaining count */
319#endif
320 lea -16(%rdx, %rcx), %rcx /* %rdx was already advanced by 16 */
321 movzbl (%rdi, %rcx), %eax
322 movzbl (%rsi, %rcx), %edx
323#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Map both bytes through the table at _nl_C_LC_CTYPE_tolower+128*4
    (4-byte entries) before subtracting.  */
324 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
325 movl (%rcx,%rax,4), %eax
326 movl (%rcx,%rdx,4), %edx
327#endif
328 sub %edx, %eax /* return value = difference of the two bytes */
329 ret
330
331
332
333/*
334 * The following cases will be handled by ashr_1
335 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
336 * n(15) n -15 0(15 +(n-15) - n) ashr_1
337 */
338 .p2align 4
339LABEL(ashr_1):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 340 pslldq $15, D(%xmm2) /* shift first string to align with second */
d9a4d2ab 341 TOLOWER (%xmm1, %xmm2)
618280a1
UD
342 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
343 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
344 pmovmskb %xmm2, %r9d
345 shr %cl, %edx /* adjust 0xffff for offset */
346 shr %cl, %r9d /* adjust for 16-byte offset */
347 sub %r9d, %edx
348 jnz LABEL(less32bytes) /* mismatch or null char seen */
349 movdqa (%rdi), %xmm3
350 UPDATE_STRNCMP_COUNTER
351
d9a4d2ab
UD
352 mov $16, %rcx /* index for loads*/
353 mov $1, %r9d /* byte position left over from less32bytes case */
354 /*
355 * Setup %r10 value allows us to detect crossing a page boundary.
356 * When %r10 goes positive we have crossed a page boundary and
357 * need to do a nibble.
358 */
359 lea 1(%rdi), %r10
360 and $0xfff, %r10 /* offset into 4K page */
361 sub $0x1000, %r10 /* subtract 4K pagesize */
362 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
363
364 .p2align 4
365LABEL(loop_ashr_1_use):
366 add $16, %r10
367 jg LABEL(nibble_ashr_1_use) /* %r10 went positive: page boundary */
368
369LABEL(nibble_ashr_1_restart_use):
370 movdqa (%rdi, %rdx), %xmm0
618280a1 371 palignr $1, -16(%rdi, %rdx), D(%xmm0) /* last 15 bytes of previous chunk + first 1 of this one */
d9a4d2ab
UD
372#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
373 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
374#else
375 movdqa (%rsi,%rdx), %xmm1
376 TOLOWER (%xmm0, %xmm1)
377 pcmpistri $0x1a, %xmm1, %xmm0
378#endif
379 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
380#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
381 sub $16, %r11
382 jbe LABEL(strcmp_exitz) /* count used up */
383#endif
384
 /* Second half of the unrolled iteration; identical to the first.  */
385 add $16, %rdx
386 add $16, %r10
387 jg LABEL(nibble_ashr_1_use) /* %r10 went positive: page boundary */
388
389 movdqa (%rdi, %rdx), %xmm0
618280a1 390 palignr $1, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
391#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
392 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
393#else
394 movdqa (%rsi,%rdx), %xmm1
395 TOLOWER (%xmm0, %xmm1)
396 pcmpistri $0x1a, %xmm1, %xmm0
397#endif
398 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
399#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
400 sub $16, %r11
401 jbe LABEL(strcmp_exitz) /* count used up */
402#endif
403 add $16, %rdx
404 jmp LABEL(loop_ashr_1_use)
405
406 .p2align 4
407LABEL(nibble_ashr_1_use):
 /* Page-boundary path: scan the 15 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 15.  */
408 sub $0x1000, %r10 /* make %r10 negative again for the next page */
409 movdqa -16(%rdi, %rdx), %xmm0
618280a1 410 psrldq $1, D(%xmm0)
d9a4d2ab
UD
411 pcmpistri $0x3a,%xmm0, %xmm0
412#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
413 cmp %r11, %rcx
414 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
415#endif
416 cmp $14, %ecx
417 ja LABEL(nibble_ashr_1_restart_use) /* no null byte found: continue the loop */
418
419 jmp LABEL(nibble_ashr_exit_use)
420
421/*
422 * The following cases will be handled by ashr_2
423 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
424 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
425 */
426 .p2align 4
427LABEL(ashr_2):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 428 pslldq $14, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 429 TOLOWER (%xmm1, %xmm2)
618280a1
UD
430 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
431 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
432 pmovmskb %xmm2, %r9d
433 shr %cl, %edx /* adjust 0xffff for offset */
434 shr %cl, %r9d /* adjust for 16-byte offset */
435 sub %r9d, %edx
436 jnz LABEL(less32bytes) /* mismatch or null char seen */
437 movdqa (%rdi), %xmm3
438 UPDATE_STRNCMP_COUNTER
439
d9a4d2ab
UD
440 mov $16, %rcx /* index for loads */
441 mov $2, %r9d /* byte position left over from less32bytes case */
442 /*
443 * Setup %r10 value allows us to detect crossing a page boundary.
444 * When %r10 goes positive we have crossed a page boundary and
445 * need to do a nibble.
446 */
447 lea 2(%rdi), %r10
448 and $0xfff, %r10 /* offset into 4K page */
449 sub $0x1000, %r10 /* subtract 4K pagesize */
450 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
451
452 .p2align 4
453LABEL(loop_ashr_2_use):
454 add $16, %r10
455 jg LABEL(nibble_ashr_2_use) /* %r10 went positive: page boundary */
456
457LABEL(nibble_ashr_2_restart_use):
458 movdqa (%rdi, %rdx), %xmm0
618280a1 459 palignr $2, -16(%rdi, %rdx), D(%xmm0) /* last 14 bytes of previous chunk + first 2 of this one */
d9a4d2ab
UD
460#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
461 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
462#else
463 movdqa (%rsi,%rdx), %xmm1
464 TOLOWER (%xmm0, %xmm1)
465 pcmpistri $0x1a, %xmm1, %xmm0
466#endif
467 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
468#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
469 sub $16, %r11
470 jbe LABEL(strcmp_exitz) /* count used up */
471#endif
472
 /* Second half of the unrolled iteration; identical to the first.  */
473 add $16, %rdx
474 add $16, %r10
475 jg LABEL(nibble_ashr_2_use) /* %r10 went positive: page boundary */
476
477 movdqa (%rdi, %rdx), %xmm0
618280a1 478 palignr $2, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
479#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
480 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
481#else
482 movdqa (%rsi,%rdx), %xmm1
483 TOLOWER (%xmm0, %xmm1)
484 pcmpistri $0x1a, %xmm1, %xmm0
485#endif
486 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
487#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
488 sub $16, %r11
489 jbe LABEL(strcmp_exitz) /* count used up */
490#endif
491 add $16, %rdx
492 jmp LABEL(loop_ashr_2_use)
493
494 .p2align 4
495LABEL(nibble_ashr_2_use):
 /* Page-boundary path: scan the 14 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 14.  */
496 sub $0x1000, %r10 /* make %r10 negative again for the next page */
497 movdqa -16(%rdi, %rdx), %xmm0
618280a1 498 psrldq $2, D(%xmm0)
d9a4d2ab
UD
499 pcmpistri $0x3a,%xmm0, %xmm0
500#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
501 cmp %r11, %rcx
502 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
503#endif
504 cmp $13, %ecx
505 ja LABEL(nibble_ashr_2_restart_use) /* no null byte found: continue the loop */
506
507 jmp LABEL(nibble_ashr_exit_use)
508
509/*
510 * The following cases will be handled by ashr_3
511 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
512 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
513 */
514 .p2align 4
515LABEL(ashr_3):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 516 pslldq $13, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 517 TOLOWER (%xmm1, %xmm2)
618280a1
UD
518 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
519 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
520 pmovmskb %xmm2, %r9d
521 shr %cl, %edx /* adjust 0xffff for offset */
522 shr %cl, %r9d /* adjust for 16-byte offset */
523 sub %r9d, %edx
524 jnz LABEL(less32bytes) /* mismatch or null char seen */
525 movdqa (%rdi), %xmm3
526
527 UPDATE_STRNCMP_COUNTER
528
d9a4d2ab
UD
529 mov $16, %rcx /* index for loads */
530 mov $3, %r9d /* byte position left over from less32bytes case */
531 /*
532 * Setup %r10 value allows us to detect crossing a page boundary.
533 * When %r10 goes positive we have crossed a page boundary and
534 * need to do a nibble.
535 */
536 lea 3(%rdi), %r10
537 and $0xfff, %r10 /* offset into 4K page */
538 sub $0x1000, %r10 /* subtract 4K pagesize */
539 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
540
541LABEL(loop_ashr_3_use):
542 add $16, %r10
543 jg LABEL(nibble_ashr_3_use) /* %r10 went positive: page boundary */
544
545LABEL(nibble_ashr_3_restart_use):
546 movdqa (%rdi, %rdx), %xmm0
618280a1 547 palignr $3, -16(%rdi, %rdx), D(%xmm0) /* last 13 bytes of previous chunk + first 3 of this one */
d9a4d2ab
UD
548#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
549 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
550#else
551 movdqa (%rsi,%rdx), %xmm1
552 TOLOWER (%xmm0, %xmm1)
553 pcmpistri $0x1a, %xmm1, %xmm0
554#endif
555 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
556#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
557 sub $16, %r11
558 jbe LABEL(strcmp_exitz) /* count used up */
559#endif
560
 /* Second half of the unrolled iteration; identical to the first.  */
561 add $16, %rdx
562 add $16, %r10
563 jg LABEL(nibble_ashr_3_use) /* %r10 went positive: page boundary */
564
565 movdqa (%rdi, %rdx), %xmm0
618280a1 566 palignr $3, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
567#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
568 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
569#else
570 movdqa (%rsi,%rdx), %xmm1
571 TOLOWER (%xmm0, %xmm1)
572 pcmpistri $0x1a, %xmm1, %xmm0
573#endif
574 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
575#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
576 sub $16, %r11
577 jbe LABEL(strcmp_exitz) /* count used up */
578#endif
579 add $16, %rdx
580 jmp LABEL(loop_ashr_3_use)
581
582 .p2align 4
583LABEL(nibble_ashr_3_use):
 /* Page-boundary path: scan the 13 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 13.  */
584 sub $0x1000, %r10 /* make %r10 negative again for the next page */
585 movdqa -16(%rdi, %rdx), %xmm0
618280a1 586 psrldq $3, D(%xmm0)
d9a4d2ab
UD
587 pcmpistri $0x3a,%xmm0, %xmm0
588#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
589 cmp %r11, %rcx
590 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
591#endif
592 cmp $12, %ecx
593 ja LABEL(nibble_ashr_3_restart_use) /* no null byte found: continue the loop */
594
595 jmp LABEL(nibble_ashr_exit_use)
596
597/*
598 * The following cases will be handled by ashr_4
599 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
600 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
601 */
602 .p2align 4
603LABEL(ashr_4):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 604 pslldq $12, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 605 TOLOWER (%xmm1, %xmm2)
618280a1
UD
606 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
607 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
608 pmovmskb %xmm2, %r9d
609 shr %cl, %edx /* adjust 0xffff for offset */
610 shr %cl, %r9d /* adjust for 16-byte offset */
611 sub %r9d, %edx
612 jnz LABEL(less32bytes) /* mismatch or null char seen */
613 movdqa (%rdi), %xmm3
614
615 UPDATE_STRNCMP_COUNTER
616
d9a4d2ab
UD
617 mov $16, %rcx /* index for loads */
618 mov $4, %r9d /* byte position left over from less32bytes case */
619 /*
620 * Setup %r10 value allows us to detect crossing a page boundary.
621 * When %r10 goes positive we have crossed a page boundary and
622 * need to do a nibble.
623 */
624 lea 4(%rdi), %r10
625 and $0xfff, %r10 /* offset into 4K page */
626 sub $0x1000, %r10 /* subtract 4K pagesize */
627 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
628
629 .p2align 4
630LABEL(loop_ashr_4_use):
631 add $16, %r10
632 jg LABEL(nibble_ashr_4_use) /* %r10 went positive: page boundary */
633
634LABEL(nibble_ashr_4_restart_use):
635 movdqa (%rdi, %rdx), %xmm0
618280a1 636 palignr $4, -16(%rdi, %rdx), D(%xmm0) /* last 12 bytes of previous chunk + first 4 of this one */
d9a4d2ab
UD
637#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
638 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
639#else
640 movdqa (%rsi,%rdx), %xmm1
641 TOLOWER (%xmm0, %xmm1)
642 pcmpistri $0x1a, %xmm1, %xmm0
643#endif
644 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
645#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
646 sub $16, %r11
647 jbe LABEL(strcmp_exitz) /* count used up */
648#endif
649
 /* Second half of the unrolled iteration; identical to the first.  */
650 add $16, %rdx
651 add $16, %r10
652 jg LABEL(nibble_ashr_4_use) /* %r10 went positive: page boundary */
653
654 movdqa (%rdi, %rdx), %xmm0
618280a1 655 palignr $4, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
656#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
657 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
658#else
659 movdqa (%rsi,%rdx), %xmm1
660 TOLOWER (%xmm0, %xmm1)
661 pcmpistri $0x1a, %xmm1, %xmm0
662#endif
663 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
664#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
665 sub $16, %r11
666 jbe LABEL(strcmp_exitz) /* count used up */
667#endif
668 add $16, %rdx
669 jmp LABEL(loop_ashr_4_use)
670
671 .p2align 4
672LABEL(nibble_ashr_4_use):
 /* Page-boundary path: scan the 12 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 12.  */
673 sub $0x1000, %r10 /* make %r10 negative again for the next page */
674 movdqa -16(%rdi, %rdx), %xmm0
618280a1 675 psrldq $4, D(%xmm0)
d9a4d2ab
UD
676 pcmpistri $0x3a,%xmm0, %xmm0
677#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
678 cmp %r11, %rcx
679 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
680#endif
681 cmp $11, %ecx
682 ja LABEL(nibble_ashr_4_restart_use) /* no null byte found: continue the loop */
683
684 jmp LABEL(nibble_ashr_exit_use)
685
686/*
687 * The following cases will be handled by ashr_5
688 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
689 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
690 */
691 .p2align 4
692LABEL(ashr_5):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 693 pslldq $11, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 694 TOLOWER (%xmm1, %xmm2)
618280a1
UD
695 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
696 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
697 pmovmskb %xmm2, %r9d
698 shr %cl, %edx /* adjust 0xffff for offset */
699 shr %cl, %r9d /* adjust for 16-byte offset */
700 sub %r9d, %edx
701 jnz LABEL(less32bytes) /* mismatch or null char seen */
702 movdqa (%rdi), %xmm3
703
704 UPDATE_STRNCMP_COUNTER
705
d9a4d2ab
UD
706 mov $16, %rcx /* index for loads */
707 mov $5, %r9d /* byte position left over from less32bytes case */
708 /*
709 * Setup %r10 value allows us to detect crossing a page boundary.
710 * When %r10 goes positive we have crossed a page boundary and
711 * need to do a nibble.
712 */
713 lea 5(%rdi), %r10
714 and $0xfff, %r10 /* offset into 4K page */
715 sub $0x1000, %r10 /* subtract 4K pagesize */
716 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
717
718 .p2align 4
719LABEL(loop_ashr_5_use):
720 add $16, %r10
721 jg LABEL(nibble_ashr_5_use) /* %r10 went positive: page boundary */
722
723LABEL(nibble_ashr_5_restart_use):
724 movdqa (%rdi, %rdx), %xmm0
618280a1 725 palignr $5, -16(%rdi, %rdx), D(%xmm0) /* last 11 bytes of previous chunk + first 5 of this one */
d9a4d2ab
UD
726#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
727 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
728#else
729 movdqa (%rsi,%rdx), %xmm1
730 TOLOWER (%xmm0, %xmm1)
731 pcmpistri $0x1a, %xmm1, %xmm0
732#endif
733 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
734#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
735 sub $16, %r11
736 jbe LABEL(strcmp_exitz) /* count used up */
737#endif
738
 /* Second half of the unrolled iteration; identical to the first.  */
739 add $16, %rdx
740 add $16, %r10
741 jg LABEL(nibble_ashr_5_use) /* %r10 went positive: page boundary */
742
743 movdqa (%rdi, %rdx), %xmm0
744
618280a1 745 palignr $5, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
746#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
747 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
748#else
749 movdqa (%rsi,%rdx), %xmm1
750 TOLOWER (%xmm0, %xmm1)
751 pcmpistri $0x1a, %xmm1, %xmm0
752#endif
753 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
754#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
755 sub $16, %r11
756 jbe LABEL(strcmp_exitz) /* count used up */
757#endif
758 add $16, %rdx
759 jmp LABEL(loop_ashr_5_use)
760
761 .p2align 4
762LABEL(nibble_ashr_5_use):
 /* Page-boundary path: scan the 11 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 11.  */
763 sub $0x1000, %r10 /* make %r10 negative again for the next page */
764 movdqa -16(%rdi, %rdx), %xmm0
618280a1 765 psrldq $5, D(%xmm0)
d9a4d2ab
UD
766 pcmpistri $0x3a,%xmm0, %xmm0
767#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
768 cmp %r11, %rcx
769 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
770#endif
771 cmp $10, %ecx
772 ja LABEL(nibble_ashr_5_restart_use) /* no null byte found: continue the loop */
773
774 jmp LABEL(nibble_ashr_exit_use)
775
776/*
777 * The following cases will be handled by ashr_6
778 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
779 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
780 */
781 .p2align 4
782LABEL(ashr_6):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 783 pslldq $10, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 784 TOLOWER (%xmm1, %xmm2)
618280a1
UD
785 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
786 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
787 pmovmskb %xmm2, %r9d
788 shr %cl, %edx /* adjust 0xffff for offset */
789 shr %cl, %r9d /* adjust for 16-byte offset */
790 sub %r9d, %edx
791 jnz LABEL(less32bytes) /* mismatch or null char seen */
792 movdqa (%rdi), %xmm3
793
794 UPDATE_STRNCMP_COUNTER
795
d9a4d2ab
UD
796 mov $16, %rcx /* index for loads */
797 mov $6, %r9d /* byte position left over from less32bytes case */
798 /*
799 * Setup %r10 value allows us to detect crossing a page boundary.
800 * When %r10 goes positive we have crossed a page boundary and
801 * need to do a nibble.
802 */
803 lea 6(%rdi), %r10
804 and $0xfff, %r10 /* offset into 4K page */
805 sub $0x1000, %r10 /* subtract 4K pagesize */
806 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
807
808 .p2align 4
809LABEL(loop_ashr_6_use):
810 add $16, %r10
811 jg LABEL(nibble_ashr_6_use) /* %r10 went positive: page boundary */
812
813LABEL(nibble_ashr_6_restart_use):
814 movdqa (%rdi, %rdx), %xmm0
618280a1 815 palignr $6, -16(%rdi, %rdx), D(%xmm0) /* last 10 bytes of previous chunk + first 6 of this one */
d9a4d2ab
UD
816#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
817 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
818#else
819 movdqa (%rsi,%rdx), %xmm1
820 TOLOWER (%xmm0, %xmm1)
821 pcmpistri $0x1a, %xmm1, %xmm0
822#endif
823 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
824#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
825 sub $16, %r11
826 jbe LABEL(strcmp_exitz) /* count used up */
827#endif
828
 /* Second half of the unrolled iteration; identical to the first.  */
829 add $16, %rdx
830 add $16, %r10
831 jg LABEL(nibble_ashr_6_use) /* %r10 went positive: page boundary */
832
833 movdqa (%rdi, %rdx), %xmm0
618280a1 834 palignr $6, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
835#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
836 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
837#else
838 movdqa (%rsi,%rdx), %xmm1
839 TOLOWER (%xmm0, %xmm1)
840 pcmpistri $0x1a, %xmm1, %xmm0
841#endif
842 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
843#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
844 sub $16, %r11
845 jbe LABEL(strcmp_exitz) /* count used up */
846#endif
847 add $16, %rdx
848 jmp LABEL(loop_ashr_6_use)
849
850 .p2align 4
851LABEL(nibble_ashr_6_use):
 /* Page-boundary path: scan the 10 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 10.  */
852 sub $0x1000, %r10 /* make %r10 negative again for the next page */
853 movdqa -16(%rdi, %rdx), %xmm0
618280a1 854 psrldq $6, D(%xmm0)
d9a4d2ab
UD
855 pcmpistri $0x3a,%xmm0, %xmm0
856#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
857 cmp %r11, %rcx
858 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
859#endif
860 cmp $9, %ecx
861 ja LABEL(nibble_ashr_6_restart_use) /* no null byte found: continue the loop */
862
863 jmp LABEL(nibble_ashr_exit_use)
864
865/*
866 * The following cases will be handled by ashr_7
867 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
868 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
869 */
870 .p2align 4
871LABEL(ashr_7):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 872 pslldq $9, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 873 TOLOWER (%xmm1, %xmm2)
618280a1
UD
874 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
875 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
876 pmovmskb %xmm2, %r9d
877 shr %cl, %edx /* adjust 0xffff for offset */
878 shr %cl, %r9d /* adjust for 16-byte offset */
879 sub %r9d, %edx
880 jnz LABEL(less32bytes) /* mismatch or null char seen */
881 movdqa (%rdi), %xmm3
882
883 UPDATE_STRNCMP_COUNTER
884
d9a4d2ab
UD
885 mov $16, %rcx /* index for loads */
886 mov $7, %r9d /* byte position left over from less32bytes case */
887 /*
888 * Setup %r10 value allows us to detect crossing a page boundary.
889 * When %r10 goes positive we have crossed a page boundary and
890 * need to do a nibble.
891 */
892 lea 7(%rdi), %r10
893 and $0xfff, %r10 /* offset into 4K page */
894 sub $0x1000, %r10 /* subtract 4K pagesize */
895 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
896
897 .p2align 4
898LABEL(loop_ashr_7_use):
899 add $16, %r10
900 jg LABEL(nibble_ashr_7_use) /* %r10 went positive: page boundary */
901
902LABEL(nibble_ashr_7_restart_use):
903 movdqa (%rdi, %rdx), %xmm0
618280a1 904 palignr $7, -16(%rdi, %rdx), D(%xmm0) /* last 9 bytes of previous chunk + first 7 of this one */
d9a4d2ab
UD
905#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
906 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
907#else
908 movdqa (%rsi,%rdx), %xmm1
909 TOLOWER (%xmm0, %xmm1)
910 pcmpistri $0x1a, %xmm1, %xmm0
911#endif
912 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
913#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
914 sub $16, %r11
915 jbe LABEL(strcmp_exitz) /* count used up */
916#endif
917
 /* Second half of the unrolled iteration; identical to the first.  */
918 add $16, %rdx
919 add $16, %r10
920 jg LABEL(nibble_ashr_7_use) /* %r10 went positive: page boundary */
921
922 movdqa (%rdi, %rdx), %xmm0
618280a1 923 palignr $7, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
924#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
925 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
926#else
927 movdqa (%rsi,%rdx), %xmm1
928 TOLOWER (%xmm0, %xmm1)
929 pcmpistri $0x1a, %xmm1, %xmm0
930#endif
931 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
932#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
933 sub $16, %r11
934 jbe LABEL(strcmp_exitz) /* count used up */
935#endif
936 add $16, %rdx
937 jmp LABEL(loop_ashr_7_use)
938
939 .p2align 4
940LABEL(nibble_ashr_7_use):
 /* Page-boundary path: scan the 9 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 9.  */
941 sub $0x1000, %r10 /* make %r10 negative again for the next page */
942 movdqa -16(%rdi, %rdx), %xmm0
618280a1 943 psrldq $7, D(%xmm0)
d9a4d2ab
UD
944 pcmpistri $0x3a,%xmm0, %xmm0
945#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
946 cmp %r11, %rcx
947 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
948#endif
949 cmp $8, %ecx
950 ja LABEL(nibble_ashr_7_restart_use) /* no null byte found: continue the loop */
951
952 jmp LABEL(nibble_ashr_exit_use)
953
954/*
955 * The following cases will be handled by ashr_8
956 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
957 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
958 */
959 .p2align 4
960LABEL(ashr_8):
 /* Head chunk.  %xmm1/%xmm2 and the null-byte mask in %xmm0 are expected
    to have been set up before the jump-table dispatch to this label.  */
618280a1 961 pslldq $8, D(%xmm2) /* line %xmm2 up with %xmm1 */
d9a4d2ab 962 TOLOWER (%xmm1, %xmm2)
618280a1
UD
963 pcmpeqb %xmm1, D(%xmm2) /* 0xff in each byte that matches */
964 psubb %xmm0, D(%xmm2) /* sign bit survives only for equal, non-null bytes */
d9a4d2ab
UD
965 pmovmskb %xmm2, %r9d
966 shr %cl, %edx /* adjust 0xffff for offset */
967 shr %cl, %r9d /* adjust for 16-byte offset */
968 sub %r9d, %edx
969 jnz LABEL(less32bytes) /* mismatch or null char seen */
970 movdqa (%rdi), %xmm3
971
972 UPDATE_STRNCMP_COUNTER
973
d9a4d2ab
UD
974 mov $16, %rcx /* index for loads */
975 mov $8, %r9d /* byte position left over from less32bytes case */
976 /*
977 * Setup %r10 value allows us to detect crossing a page boundary.
978 * When %r10 goes positive we have crossed a page boundary and
979 * need to do a nibble.
980 */
981 lea 8(%rdi), %r10
982 and $0xfff, %r10 /* offset into 4K page */
983 sub $0x1000, %r10 /* subtract 4K pagesize */
984 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
985
986 .p2align 4
987LABEL(loop_ashr_8_use):
988 add $16, %r10
989 jg LABEL(nibble_ashr_8_use) /* %r10 went positive: page boundary */
990
991LABEL(nibble_ashr_8_restart_use):
992 movdqa (%rdi, %rdx), %xmm0
618280a1 993 palignr $8, -16(%rdi, %rdx), D(%xmm0) /* last 8 bytes of previous chunk + first 8 of this one */
d9a4d2ab
UD
994#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
995 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
996#else
997 movdqa (%rsi,%rdx), %xmm1
998 TOLOWER (%xmm0, %xmm1)
999 pcmpistri $0x1a, %xmm1, %xmm0
1000#endif
1001 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
1002#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1003 sub $16, %r11
1004 jbe LABEL(strcmp_exitz) /* count used up */
1005#endif
1006
 /* Second half of the unrolled iteration; identical to the first.  */
1007 add $16, %rdx
1008 add $16, %r10
1009 jg LABEL(nibble_ashr_8_use) /* %r10 went positive: page boundary */
1010
1011 movdqa (%rdi, %rdx), %xmm0
618280a1 1012 palignr $8, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1013#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1014 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1015#else
1016 movdqa (%rsi,%rdx), %xmm1
1017 TOLOWER (%xmm0, %xmm1)
1018 pcmpistri $0x1a, %xmm1, %xmm0
1019#endif
1020 jbe LABEL(exit_use) /* CF or ZF set: difference or EOS */
1021#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1022 sub $16, %r11
1023 jbe LABEL(strcmp_exitz) /* count used up */
1024#endif
1025 add $16, %rdx
1026 jmp LABEL(loop_ashr_8_use)
1027
1028 .p2align 4
1029LABEL(nibble_ashr_8_use):
 /* Page-boundary path: scan the 8 bytes of the previous chunk that are
    still uncompared before loading the next one.  With imm8 0x3a and
    both operands equal, pcmpistri leaves the index of the first null
    byte in %ecx; psrldq shifted in zeros, so %ecx is at most 8.  */
1030 sub $0x1000, %r10 /* make %r10 negative again for the next page */
1031 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1032 psrldq $8, D(%xmm0)
d9a4d2ab
UD
1033 pcmpistri $0x3a,%xmm0, %xmm0
1034#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1035 cmp %r11, %rcx
1036 jae LABEL(nibble_ashr_exit_use) /* remaining count ends inside these bytes */
1037#endif
1038 cmp $7, %ecx
1039 ja LABEL(nibble_ashr_8_restart_use) /* no null byte found: continue the loop */
1040
1041 jmp LABEL(nibble_ashr_exit_use)
1042
1043/*
1044 * The following cases will be handled by ashr_9
1045 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1046 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1047 */
1048 .p2align 4
1049LABEL(ashr_9):
/* Entry for the ashr_9 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 7 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1050 pslldq $7, D(%xmm2)
d9a4d2ab 1051 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1052 pcmpeqb %xmm1, D(%xmm2)
1053 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1054 pmovmskb %xmm2, %r9d
1055 shr %cl, %edx
1056 shr %cl, %r9d
1057 sub %r9d, %edx
1058 jnz LABEL(less32bytes)
1059 movdqa (%rdi), %xmm3
1060
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1061 UPDATE_STRNCMP_COUNTER
1062
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 9
   is what exit_use later uses to rebase %rdi.  */
1063 mov $16, %rcx /* index for loads */
1064 mov $9, %r9d /* byte position left over from less32bytes case */
1065 /*
1066 * Setup %r10 value allows us to detect crossing a page boundary.
1067 * When %r10 goes positive we have crossed a page boundary and
1068 * need to do a nibble.
1069 */
1070 lea 9(%rdi), %r10
1071 and $0xfff, %r10 /* offset into 4K page */
1072 sub $0x1000, %r10 /* subtract 4K pagesize */
1073 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1074
1075 .p2align 4
1076LABEL(loop_ashr_9_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1077 add $16, %r10
1078 jg LABEL(nibble_ashr_9_use)
1079
1080LABEL(nibble_ashr_9_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 9 of
   the previous chunk.  */
1081 movdqa (%rdi, %rdx), %xmm0
1082
618280a1 1083 palignr $9, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1084#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1085 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1086#else
1087 movdqa (%rsi,%rdx), %xmm1
1088 TOLOWER (%xmm0, %xmm1)
1089 pcmpistri $0x1a, %xmm1, %xmm0
1090#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1091 jbe LABEL(exit_use)
1092#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1093 sub $16, %r11
1094 jbe LABEL(strcmp_exitz)
1095#endif
1096
/* Second unrolled step: same sequence for the next 16 bytes.  */
1097 add $16, %rdx
1098 add $16, %r10
1099 jg LABEL(nibble_ashr_9_use)
1100
1101 movdqa (%rdi, %rdx), %xmm0
618280a1 1102 palignr $9, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1103#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1104 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1105#else
1106 movdqa (%rsi,%rdx), %xmm1
1107 TOLOWER (%xmm0, %xmm1)
1108 pcmpistri $0x1a, %xmm1, %xmm0
1109#endif
1110 jbe LABEL(exit_use)
1111#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1112 sub $16, %r11
1113 jbe LABEL(strcmp_exitz)
1114#endif
1115 add $16, %rdx
1116 jmp LABEL(loop_ashr_9_use)
1117
1118 .p2align 4
1119LABEL(nibble_ashr_9_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 7 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1120 sub $0x1000, %r10
1121 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1122 psrldq $9, D(%xmm0)
d9a4d2ab
UD
1123 pcmpistri $0x3a,%xmm0, %xmm0
1124#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1125 cmp %r11, %rcx
1126 jae LABEL(nibble_ashr_exit_use)
1127#endif
/* %ecx > 6 means all 7 leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1128 cmp $6, %ecx
1129 ja LABEL(nibble_ashr_9_restart_use)
1130
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1131 jmp LABEL(nibble_ashr_exit_use)
1132
1133/*
1134 * The following cases will be handled by ashr_10
1135 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1136 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1137 */
1138 .p2align 4
1139LABEL(ashr_10):
/* Entry for the ashr_10 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 6 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1140 pslldq $6, D(%xmm2)
d9a4d2ab 1141 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1142 pcmpeqb %xmm1, D(%xmm2)
1143 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1144 pmovmskb %xmm2, %r9d
1145 shr %cl, %edx
1146 shr %cl, %r9d
1147 sub %r9d, %edx
1148 jnz LABEL(less32bytes)
1149 movdqa (%rdi), %xmm3
1150
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1151 UPDATE_STRNCMP_COUNTER
1152
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 10
   is what exit_use later uses to rebase %rdi.  */
1153 mov $16, %rcx /* index for loads */
1154 mov $10, %r9d /* byte position left over from less32bytes case */
1155 /*
1156 * Setup %r10 value allows us to detect crossing a page boundary.
1157 * When %r10 goes positive we have crossed a page boundary and
1158 * need to do a nibble.
1159 */
1160 lea 10(%rdi), %r10
1161 and $0xfff, %r10 /* offset into 4K page */
1162 sub $0x1000, %r10 /* subtract 4K pagesize */
1163 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1164
1165 .p2align 4
1166LABEL(loop_ashr_10_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1167 add $16, %r10
1168 jg LABEL(nibble_ashr_10_use)
1169
1170LABEL(nibble_ashr_10_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 10 of
   the previous chunk.  */
1171 movdqa (%rdi, %rdx), %xmm0
618280a1 1172 palignr $10, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1173#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1174 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1175#else
1176 movdqa (%rsi,%rdx), %xmm1
1177 TOLOWER (%xmm0, %xmm1)
1178 pcmpistri $0x1a, %xmm1, %xmm0
1179#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1180 jbe LABEL(exit_use)
1181#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1182 sub $16, %r11
1183 jbe LABEL(strcmp_exitz)
1184#endif
1185
/* Second unrolled step: same sequence for the next 16 bytes.  */
1186 add $16, %rdx
1187 add $16, %r10
1188 jg LABEL(nibble_ashr_10_use)
1189
1190 movdqa (%rdi, %rdx), %xmm0
618280a1 1191 palignr $10, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1192#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1193 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1194#else
1195 movdqa (%rsi,%rdx), %xmm1
1196 TOLOWER (%xmm0, %xmm1)
1197 pcmpistri $0x1a, %xmm1, %xmm0
1198#endif
1199 jbe LABEL(exit_use)
1200#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1201 sub $16, %r11
1202 jbe LABEL(strcmp_exitz)
1203#endif
1204 add $16, %rdx
1205 jmp LABEL(loop_ashr_10_use)
1206
1207 .p2align 4
1208LABEL(nibble_ashr_10_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 6 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1209 sub $0x1000, %r10
1210 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1211 psrldq $10, D(%xmm0)
d9a4d2ab
UD
1212 pcmpistri $0x3a,%xmm0, %xmm0
1213#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1214 cmp %r11, %rcx
1215 jae LABEL(nibble_ashr_exit_use)
1216#endif
/* %ecx > 5 means all 6 leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1217 cmp $5, %ecx
1218 ja LABEL(nibble_ashr_10_restart_use)
1219
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1220 jmp LABEL(nibble_ashr_exit_use)
1221
1222/*
1223 * The following cases will be handled by ashr_11
1224 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1225 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1226 */
1227 .p2align 4
1228LABEL(ashr_11):
/* Entry for the ashr_11 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 5 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1229 pslldq $5, D(%xmm2)
d9a4d2ab 1230 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1231 pcmpeqb %xmm1, D(%xmm2)
1232 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1233 pmovmskb %xmm2, %r9d
1234 shr %cl, %edx
1235 shr %cl, %r9d
1236 sub %r9d, %edx
1237 jnz LABEL(less32bytes)
1238 movdqa (%rdi), %xmm3
1239
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1240 UPDATE_STRNCMP_COUNTER
1241
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 11
   is what exit_use later uses to rebase %rdi.  */
1242 mov $16, %rcx /* index for loads */
1243 mov $11, %r9d /* byte position left over from less32bytes case */
1244 /*
1245 * Setup %r10 value allows us to detect crossing a page boundary.
1246 * When %r10 goes positive we have crossed a page boundary and
1247 * need to do a nibble.
1248 */
1249 lea 11(%rdi), %r10
1250 and $0xfff, %r10 /* offset into 4K page */
1251 sub $0x1000, %r10 /* subtract 4K pagesize */
1252 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1253
1254 .p2align 4
1255LABEL(loop_ashr_11_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1256 add $16, %r10
1257 jg LABEL(nibble_ashr_11_use)
1258
1259LABEL(nibble_ashr_11_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 11 of
   the previous chunk.  */
1260 movdqa (%rdi, %rdx), %xmm0
618280a1 1261 palignr $11, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1262#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1263 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1264#else
1265 movdqa (%rsi,%rdx), %xmm1
1266 TOLOWER (%xmm0, %xmm1)
1267 pcmpistri $0x1a, %xmm1, %xmm0
1268#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1269 jbe LABEL(exit_use)
1270#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1271 sub $16, %r11
1272 jbe LABEL(strcmp_exitz)
1273#endif
1274
/* Second unrolled step: same sequence for the next 16 bytes.  */
1275 add $16, %rdx
1276 add $16, %r10
1277 jg LABEL(nibble_ashr_11_use)
1278
1279 movdqa (%rdi, %rdx), %xmm0
618280a1 1280 palignr $11, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1281#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1282 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1283#else
1284 movdqa (%rsi,%rdx), %xmm1
1285 TOLOWER (%xmm0, %xmm1)
1286 pcmpistri $0x1a, %xmm1, %xmm0
1287#endif
1288 jbe LABEL(exit_use)
1289#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1290 sub $16, %r11
1291 jbe LABEL(strcmp_exitz)
1292#endif
1293 add $16, %rdx
1294 jmp LABEL(loop_ashr_11_use)
1295
1296 .p2align 4
1297LABEL(nibble_ashr_11_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 5 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1298 sub $0x1000, %r10
1299 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1300 psrldq $11, D(%xmm0)
d9a4d2ab
UD
1301 pcmpistri $0x3a,%xmm0, %xmm0
1302#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1303 cmp %r11, %rcx
1304 jae LABEL(nibble_ashr_exit_use)
1305#endif
/* %ecx > 4 means all 5 leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1306 cmp $4, %ecx
1307 ja LABEL(nibble_ashr_11_restart_use)
1308
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1309 jmp LABEL(nibble_ashr_exit_use)
1310
1311/*
1312 * The following cases will be handled by ashr_12
1313 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1314 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1315 */
1316 .p2align 4
1317LABEL(ashr_12):
/* Entry for the ashr_12 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 4 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1318 pslldq $4, D(%xmm2)
d9a4d2ab 1319 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1320 pcmpeqb %xmm1, D(%xmm2)
1321 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1322 pmovmskb %xmm2, %r9d
1323 shr %cl, %edx
1324 shr %cl, %r9d
1325 sub %r9d, %edx
1326 jnz LABEL(less32bytes)
1327 movdqa (%rdi), %xmm3
1328
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1329 UPDATE_STRNCMP_COUNTER
1330
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 12
   is what exit_use later uses to rebase %rdi.  */
1331 mov $16, %rcx /* index for loads */
1332 mov $12, %r9d /* byte position left over from less32bytes case */
1333 /*
1334 * Setup %r10 value allows us to detect crossing a page boundary.
1335 * When %r10 goes positive we have crossed a page boundary and
1336 * need to do a nibble.
1337 */
1338 lea 12(%rdi), %r10
1339 and $0xfff, %r10 /* offset into 4K page */
1340 sub $0x1000, %r10 /* subtract 4K pagesize */
1341 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1342
1343 .p2align 4
1344LABEL(loop_ashr_12_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1345 add $16, %r10
1346 jg LABEL(nibble_ashr_12_use)
1347
1348LABEL(nibble_ashr_12_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 12 of
   the previous chunk.  */
1349 movdqa (%rdi, %rdx), %xmm0
618280a1 1350 palignr $12, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1351#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1352 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1353#else
1354 movdqa (%rsi,%rdx), %xmm1
1355 TOLOWER (%xmm0, %xmm1)
1356 pcmpistri $0x1a, %xmm1, %xmm0
1357#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1358 jbe LABEL(exit_use)
1359#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1360 sub $16, %r11
1361 jbe LABEL(strcmp_exitz)
1362#endif
1363
/* Second unrolled step: same sequence for the next 16 bytes.  */
1364 add $16, %rdx
1365 add $16, %r10
1366 jg LABEL(nibble_ashr_12_use)
1367
1368 movdqa (%rdi, %rdx), %xmm0
618280a1 1369 palignr $12, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1370#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1371 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1372#else
1373 movdqa (%rsi,%rdx), %xmm1
1374 TOLOWER (%xmm0, %xmm1)
1375 pcmpistri $0x1a, %xmm1, %xmm0
1376#endif
1377 jbe LABEL(exit_use)
1378#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1379 sub $16, %r11
1380 jbe LABEL(strcmp_exitz)
1381#endif
1382 add $16, %rdx
1383 jmp LABEL(loop_ashr_12_use)
1384
1385 .p2align 4
1386LABEL(nibble_ashr_12_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 4 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1387 sub $0x1000, %r10
1388 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1389 psrldq $12, D(%xmm0)
d9a4d2ab
UD
1390 pcmpistri $0x3a,%xmm0, %xmm0
1391#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1392 cmp %r11, %rcx
1393 jae LABEL(nibble_ashr_exit_use)
1394#endif
/* %ecx > 3 means all 4 leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1395 cmp $3, %ecx
1396 ja LABEL(nibble_ashr_12_restart_use)
1397
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1398 jmp LABEL(nibble_ashr_exit_use)
1399
1400/*
1401 * The following cases will be handled by ashr_13
1402 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1403 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1404 */
1405 .p2align 4
1406LABEL(ashr_13):
/* Entry for the ashr_13 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 3 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1407 pslldq $3, D(%xmm2)
d9a4d2ab 1408 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1409 pcmpeqb %xmm1, D(%xmm2)
1410 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1411 pmovmskb %xmm2, %r9d
1412 shr %cl, %edx
1413 shr %cl, %r9d
1414 sub %r9d, %edx
1415 jnz LABEL(less32bytes)
1416 movdqa (%rdi), %xmm3
1417
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1418 UPDATE_STRNCMP_COUNTER
1419
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 13
   is what exit_use later uses to rebase %rdi.  */
1420 mov $16, %rcx /* index for loads */
1421 mov $13, %r9d /* byte position left over from less32bytes case */
1422 /*
1423 * Setup %r10 value allows us to detect crossing a page boundary.
1424 * When %r10 goes positive we have crossed a page boundary and
1425 * need to do a nibble.
1426 */
1427 lea 13(%rdi), %r10
1428 and $0xfff, %r10 /* offset into 4K page */
1429 sub $0x1000, %r10 /* subtract 4K pagesize */
1430
1431 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1432
1433 .p2align 4
1434LABEL(loop_ashr_13_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1435 add $16, %r10
1436 jg LABEL(nibble_ashr_13_use)
1437
1438LABEL(nibble_ashr_13_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 13 of
   the previous chunk.  */
1439 movdqa (%rdi, %rdx), %xmm0
618280a1 1440 palignr $13, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1441#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1442 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1443#else
1444 movdqa (%rsi,%rdx), %xmm1
1445 TOLOWER (%xmm0, %xmm1)
1446 pcmpistri $0x1a, %xmm1, %xmm0
1447#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1448 jbe LABEL(exit_use)
1449#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1450 sub $16, %r11
1451 jbe LABEL(strcmp_exitz)
1452#endif
1453
/* Second unrolled step: same sequence for the next 16 bytes.  */
1454 add $16, %rdx
1455 add $16, %r10
1456 jg LABEL(nibble_ashr_13_use)
1457
1458 movdqa (%rdi, %rdx), %xmm0
618280a1 1459 palignr $13, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1460#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1461 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1462#else
1463 movdqa (%rsi,%rdx), %xmm1
1464 TOLOWER (%xmm0, %xmm1)
1465 pcmpistri $0x1a, %xmm1, %xmm0
1466#endif
1467 jbe LABEL(exit_use)
1468#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1469 sub $16, %r11
1470 jbe LABEL(strcmp_exitz)
1471#endif
1472 add $16, %rdx
1473 jmp LABEL(loop_ashr_13_use)
1474
1475 .p2align 4
1476LABEL(nibble_ashr_13_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 3 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1477 sub $0x1000, %r10
1478 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1479 psrldq $13, D(%xmm0)
d9a4d2ab
UD
1480 pcmpistri $0x3a,%xmm0, %xmm0
1481#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1482 cmp %r11, %rcx
1483 jae LABEL(nibble_ashr_exit_use)
1484#endif
/* %ecx > 2 means all 3 leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1485 cmp $2, %ecx
1486 ja LABEL(nibble_ashr_13_restart_use)
1487
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1488 jmp LABEL(nibble_ashr_exit_use)
1489
1490/*
1491 * The following cases will be handled by ashr_14
1492 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1493 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1494 */
1495 .p2align 4
1496LABEL(ashr_14):
/* Entry for the ashr_14 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 2 bytes (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1497 pslldq $2, D(%xmm2)
d9a4d2ab 1498 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1499 pcmpeqb %xmm1, D(%xmm2)
1500 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1501 pmovmskb %xmm2, %r9d
1502 shr %cl, %edx
1503 shr %cl, %r9d
1504 sub %r9d, %edx
1505 jnz LABEL(less32bytes)
1506 movdqa (%rdi), %xmm3
1507
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1508 UPDATE_STRNCMP_COUNTER
1509
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 14
   is what exit_use later uses to rebase %rdi.  */
1510 mov $16, %rcx /* index for loads */
1511 mov $14, %r9d /* byte position left over from less32bytes case */
1512 /*
1513 * Setup %r10 value allows us to detect crossing a page boundary.
1514 * When %r10 goes positive we have crossed a page boundary and
1515 * need to do a nibble.
1516 */
1517 lea 14(%rdi), %r10
1518 and $0xfff, %r10 /* offset into 4K page */
1519 sub $0x1000, %r10 /* subtract 4K pagesize */
1520
1521 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1522
1523 .p2align 4
1524LABEL(loop_ashr_14_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1525 add $16, %r10
1526 jg LABEL(nibble_ashr_14_use)
1527
1528LABEL(nibble_ashr_14_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 14 of
   the previous chunk.  */
1529 movdqa (%rdi, %rdx), %xmm0
618280a1 1530 palignr $14, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1531#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1532 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1533#else
1534 movdqa (%rsi,%rdx), %xmm1
1535 TOLOWER (%xmm0, %xmm1)
1536 pcmpistri $0x1a, %xmm1, %xmm0
1537#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1538 jbe LABEL(exit_use)
1539#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1540 sub $16, %r11
1541 jbe LABEL(strcmp_exitz)
1542#endif
1543
/* Second unrolled step: same sequence for the next 16 bytes.  */
1544 add $16, %rdx
1545 add $16, %r10
1546 jg LABEL(nibble_ashr_14_use)
1547
1548 movdqa (%rdi, %rdx), %xmm0
618280a1 1549 palignr $14, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1550#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1551 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1552#else
1553 movdqa (%rsi,%rdx), %xmm1
1554 TOLOWER (%xmm0, %xmm1)
1555 pcmpistri $0x1a, %xmm1, %xmm0
1556#endif
1557 jbe LABEL(exit_use)
1558#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1559 sub $16, %r11
1560 jbe LABEL(strcmp_exitz)
1561#endif
1562 add $16, %rdx
1563 jmp LABEL(loop_ashr_14_use)
1564
1565 .p2align 4
1566LABEL(nibble_ashr_14_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the 2 bytes of the previous chunk that have not been compared yet
   (psrldq moves them to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1567 sub $0x1000, %r10
1568 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1569 psrldq $14, D(%xmm0)
d9a4d2ab
UD
1570 pcmpistri $0x3a,%xmm0, %xmm0
1571#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1572 cmp %r11, %rcx
1573 jae LABEL(nibble_ashr_exit_use)
1574#endif
/* %ecx > 1 means both leftover bytes are non-NUL, so the string continues
   into the next page and the loop load may proceed.  */
1575 cmp $1, %ecx
1576 ja LABEL(nibble_ashr_14_restart_use)
1577
/* Otherwise the string ends before the page boundary: compare just the
   leftover bytes in %xmm0 on the shared exit path.  */
1578 jmp LABEL(nibble_ashr_exit_use)
1579
1580/*
1581 * The following cases will be handled by ashr_15
1582 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1583 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1584 */
1585 .p2align 4
1586LABEL(ashr_15):
/* Entry for the ashr_15 case.  The first pair of 16-byte chunks is checked
   here: %xmm2 is shifted left by 1 byte (presumably to line it up with
   %xmm1 -- confirm against the dispatch code outside this view), compared
   byte-wise with %xmm1, adjusted by %xmm0 and reduced to a bit mask in
   %r9d.  That mask and the one already held in %edx are both shifted
   right by %cl; if they differ the result is decided in less32bytes.
   TOLOWER and D() are macros defined earlier in the file.  */
618280a1 1587 pslldq $1, D(%xmm2)
d9a4d2ab 1588 TOLOWER (%xmm1, %xmm2)
618280a1
UD
1589 pcmpeqb %xmm1, D(%xmm2)
1590 psubb %xmm0, D(%xmm2)
d9a4d2ab
UD
1591 pmovmskb %xmm2, %r9d
1592 shr %cl, %edx
1593 shr %cl, %r9d
1594 sub %r9d, %edx
1595 jnz LABEL(less32bytes)
1596
1597 movdqa (%rdi), %xmm3
1598
/* UPDATE_STRNCMP_COUNTER is a macro defined earlier in the file (not
   visible in this chunk).  */
1599 UPDATE_STRNCMP_COUNTER
1600
d9a4d2ab
UD
/* Loop setup: %rcx and %rdx start at 16 as the load index, and %r9d = 15
   is what exit_use later uses to rebase %rdi.  */
1601 mov $16, %rcx /* index for loads */
1602 mov $15, %r9d /* byte position left over from less32bytes case */
1603 /*
1604 * Setup %r10 value allows us to detect crossing a page boundary.
1605 * When %r10 goes positive we have crossed a page boundary and
1606 * need to do a nibble.
1607 */
1608 lea 15(%rdi), %r10
1609 and $0xfff, %r10 /* offset into 4K page */
1610
1611 sub $0x1000, %r10 /* subtract 4K pagesize */
1612
1613 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1614
1615 .p2align 4
1616LABEL(loop_ashr_15_use):
/* Main loop, unrolled twice.  Each step first adds 16 to %r10 and diverts
   to the nibble check once it goes positive.  */
1617 add $16, %r10
1618 jg LABEL(nibble_ashr_15_use)
1619
1620LABEL(nibble_ashr_15_restart_use):
/* Rebuild 16 bytes of the %rdi string: palignr joins the current chunk
   with the previous one and keeps the 16 bytes that start at byte 15 of
   the previous chunk.  */
1621 movdqa (%rdi, %rdx), %xmm0
618280a1 1622 palignr $15, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1623#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1624 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1625#else
1626 movdqa (%rsi,%rdx), %xmm1
1627 TOLOWER (%xmm0, %xmm1)
1628 pcmpistri $0x1a, %xmm1, %xmm0
1629#endif
/* CF or ZF set: a difference or a terminator was seen (cases 2-4 of the
   table in the file header).  */
1630 jbe LABEL(exit_use)
1631#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: charge 16 bytes against %r11 (presumably the
   remaining byte limit -- confirm against the setup code); when nothing
   is left the result is 0.  */
1632 sub $16, %r11
1633 jbe LABEL(strcmp_exitz)
1634#endif
1635
/* Second unrolled step: same sequence for the next 16 bytes.  */
1636 add $16, %rdx
1637 add $16, %r10
1638 jg LABEL(nibble_ashr_15_use)
1639
1640 movdqa (%rdi, %rdx), %xmm0
618280a1 1641 palignr $15, -16(%rdi, %rdx), D(%xmm0)
d9a4d2ab
UD
1642#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1643 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1644#else
1645 movdqa (%rsi,%rdx), %xmm1
1646 TOLOWER (%xmm0, %xmm1)
1647 pcmpistri $0x1a, %xmm1, %xmm0
1648#endif
1649 jbe LABEL(exit_use)
1650#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1651 sub $16, %r11
1652 jbe LABEL(strcmp_exitz)
1653#endif
1654 add $16, %rdx
1655 jmp LABEL(loop_ashr_15_use)
1656
1657 .p2align 4
1658LABEL(nibble_ashr_15_use):
/* Page-boundary check.  Re-arm %r10 for the next page, then look only at
   the single byte of the previous chunk that has not been compared yet
   (psrldq moves it to the bottom and zero-fills the rest).  With both
   operands equal, pcmpistri $0x3a leaves in %ecx the index of the first
   NUL byte, i.e. the number of non-NUL leftover bytes.  */
1659 sub $0x1000, %r10
1660 movdqa -16(%rdi, %rdx), %xmm0
618280a1 1661 psrldq $15, D(%xmm0)
d9a4d2ab
UD
1662 pcmpistri $0x3a,%xmm0, %xmm0
1663#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: if that count already reaches the value in
   %r11, finish on the exit path instead of resuming the loop.  */
1664 cmp %r11, %rcx
1665 jae LABEL(nibble_ashr_exit_use)
1666#endif
/* %ecx > 0 means the leftover byte is non-NUL, so the string continues
   into the next page and the loop load may proceed.  Otherwise execution
   falls through to the label that follows this block.  */
1667 cmp $0, %ecx
1668 ja LABEL(nibble_ashr_15_restart_use)
1669
1670LABEL(nibble_ashr_exit_use):
/* Shared tail of the nibble_ashr_N_use checks: compare whatever is in
   %xmm0 against the 16 bytes at (%rsi,%rdx), lower-casing both first in
   the case-insensitive variants, then continue into exit_use.  */
1671#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1672 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1673#else
1674 movdqa (%rsi,%rdx), %xmm1
1675 TOLOWER (%xmm0, %xmm1)
1676 pcmpistri $0x1a, %xmm1, %xmm0
1677#endif
1678 .p2align 4
1679LABEL(exit_use):
/* Common exit of the *_use loops, entered with the flags and %ecx of the
   last pcmpistri $0x1a.  CF clear means no differing byte was found
   (case 2 of the table in the file header), so the result is 0.  */
1680 jnc LABEL(strcmp_exitz)
1681#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: a mismatch index in %rcx that is not below the
   value left in %r11 also yields 0.  */
1682 sub %rcx, %r11
1683 jbe LABEL(strcmp_exitz)
1684#endif
/* %rdx becomes the offset of the differing byte; %rdi is rebased by
   %r9 - 16 so that the same index addresses the matching byte of the
   other string.  */
1685 add %rcx, %rdx
1686 lea -16(%rdi, %r9), %rdi
1687 movzbl (%rdi, %rdx), %eax
1688 movzbl (%rsi, %rdx), %edx
/* A non-zero %r8d means the two byte values must be swapped before the
   subtraction; the flag is set outside this chunk (presumably when the
   operands were exchanged on entry -- confirm).  */
1689 test %r8d, %r8d
1690 jz LABEL(ret_use)
1691 xchg %eax, %edx
1692LABEL(ret_use):
1693#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: translate both bytes through the 4-byte-wide
   _nl_C_LC_CTYPE_tolower table before subtracting.  */
1694 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1695 movl (%rcx,%rdx,4), %edx
1696 movl (%rcx,%rax,4), %eax
1697#endif
1698
/* Return value: difference of the two bytes, in %eax.  */
1699 sub %edx, %eax
1700 ret
1701
1702LABEL(less32bytes):
/* Reached from the ashr_N entry checks when the first chunk pair already
   decides the result.  Adds %rax to %rdi and %rcx to %rsi, then swaps the
   two pointers when the flag in %r8d is non-zero.  */
1703 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1704 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1705 test %r8d, %r8d
1706 jz LABEL(ret)
1707 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1708
1709 .p2align 4
1710LABEL(ret):
1711LABEL(less16bytes):
/* %rdx holds a bit mask on entry; the index of its lowest set bit is the
   offset of the byte that decides the comparison.  */
1712 bsf %rdx, %rdx /* find and store bit index in %rdx */
1713
1714#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited variants: an offset that is not below the value in %r11
   yields 0.  */
1715 sub %rdx, %r11
1716 jbe LABEL(strcmp_exitz)
1717#endif
1718 movzbl (%rsi, %rdx), %ecx
1719 movzbl (%rdi, %rdx), %eax
1720
1721#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: translate both bytes through the 4-byte-wide
   _nl_C_LC_CTYPE_tolower table before subtracting.  */
1722 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1723 movl (%rdx,%rcx,4), %ecx
1724 movl (%rdx,%rax,4), %eax
1725#endif
1726
/* Return value: byte at (%rdi) minus byte at (%rsi), in %eax.  */
1727 sub %ecx, %eax
1728 ret
1729
1730LABEL(strcmp_exitz):
/* Shared "equal" exit: return 0 in %eax.  */
1731 xor %eax, %eax
1732 ret
1733
1734 .p2align 4
1735 // XXX Same as code above
1736LABEL(Byte0):
/* Return the difference of the first bytes at (%rdi) and (%rsi); its
   callers are outside this chunk.  The sequence repeats the tail of
   less16bytes with a fixed offset of 0.  */
1737 movzx (%rsi), %ecx
1738 movzx (%rdi), %eax
1739
1740#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive variants: translate both bytes through the 4-byte-wide
   _nl_C_LC_CTYPE_tolower table before subtracting.  */
1741 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1742 movl (%rdx,%rcx,4), %ecx
1743 movl (%rdx,%rax,4), %eax
1744#endif
1745
1746 sub %ecx, %eax
1747 ret
1748 cfi_endproc
1749 .size STRCMP_SSE42, .-STRCMP_SSE42
1750
/* Drop the case-folding helper macros now that the function body is
   complete (their definitions are earlier in the file).  */
1751#undef UCLOW_reg
1752#undef UCHIGH_reg
1753#undef LCQWORD_reg
1754#undef TOLOWER
1755
1756 /* Put all SSE 4.2 functions together. */
1757 .section .rodata.SECTION,"a",@progbits
1758 .p2align 3
1759LABEL(unaligned_table):
/* Dispatch table with 16 entries, each a 32-bit offset from the start of
   the table to one ashr_N handler.  Entries run ashr_1 .. ashr_15 and end
   with ashr_0.  The code that indexes this table is outside this chunk.  */
1760 .int LABEL(ashr_1) - LABEL(unaligned_table)
1761 .int LABEL(ashr_2) - LABEL(unaligned_table)
1762 .int LABEL(ashr_3) - LABEL(unaligned_table)
1763 .int LABEL(ashr_4) - LABEL(unaligned_table)
1764 .int LABEL(ashr_5) - LABEL(unaligned_table)
1765 .int LABEL(ashr_6) - LABEL(unaligned_table)
1766 .int LABEL(ashr_7) - LABEL(unaligned_table)
1767 .int LABEL(ashr_8) - LABEL(unaligned_table)
1768 .int LABEL(ashr_9) - LABEL(unaligned_table)
1769 .int LABEL(ashr_10) - LABEL(unaligned_table)
1770 .int LABEL(ashr_11) - LABEL(unaligned_table)
1771 .int LABEL(ashr_12) - LABEL(unaligned_table)
1772 .int LABEL(ashr_13) - LABEL(unaligned_table)
1773 .int LABEL(ashr_14) - LABEL(unaligned_table)
1774 .int LABEL(ashr_15) - LABEL(unaligned_table)
1775 .int LABEL(ashr_0) - LABEL(unaligned_table)
1776
/* Remove the file-local naming macros and the instruction-name macros
   (presumably #defined near the top of the file -- confirm) so that they
   do not leak into anything that includes or follows this file.  */
1777#undef LABEL
1778#undef GLABEL
1779#undef SECTION
618280a1
UD
1780#undef movdqa
1781#undef movdqu
1782#undef pmovmskb
1783#undef pcmpistri
1784#undef psubb
1785#undef pcmpeqb
1786#undef psrldq
1787#undef pslldq
1788#undef palignr
1789#undef pxor
1790#undef D
1790#undef D