]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/strcmp.S
Remove "Contributed by" lines
[thirdparty/glibc.git] / sysdeps / x86_64 / strcmp.S
1 /* Highly optimized version for x86-64.
2 Copyright (C) 1999-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include "asm-syntax.h"
21
22 #undef UPDATE_STRNCMP_COUNTER
23 /* LABEL(l) names a label inside this file; it expands to L(l) unless LABEL was already defined before this point. */
24 #ifndef LABEL
25 #define LABEL(l) L(l)
26 #endif
27
28 #ifdef USE_AS_STRNCMP
29 /* UPDATE_STRNCMP_COUNTER replaces the byte counter %r11 with %r11 + %rcx - 16 (computed in %r9). Since the counter, %r11, is unsigned, we branch to strcmp_exitz
30 if the new counter > the old one (the unsigned jb test below) or is 0; otherwise %r9 is copied back into %r11. Clobbers %r9 and the flags. */
31 # define UPDATE_STRNCMP_COUNTER \
32 /* %r9 = %r11 + %rcx - 16, the number of bytes left to compare */ \
33 lea -16(%rcx, %r11), %r9; \
34 cmp %r9, %r11; \
35 jb LABEL(strcmp_exitz); \
36 test %r9, %r9; \
37 je LABEL(strcmp_exitz); \
38 mov %r9, %r11
39
40 #elif defined USE_AS_STRCASECMP_L
41 # include "locale-defines.h"
42 /* This variant keeps no length counter in %r11, so the macro expands to nothing. */
43 # define UPDATE_STRNCMP_COUNTER
44 #elif defined USE_AS_STRNCASECMP_L
45 # include "locale-defines.h"
46 /* Same counter update as in the USE_AS_STRNCMP variant: %r11 = %r11 + %rcx - 16, exiting via strcmp_exitz when the result is larger than the old count or is 0. */
47 # define UPDATE_STRNCMP_COUNTER \
48 /* %r9 = %r11 + %rcx - 16, the number of bytes left to compare */ \
49 lea -16(%rcx, %r11), %r9; \
50 cmp %r9, %r11; \
51 jb LABEL(strcmp_exitz); \
52 test %r9, %r9; \
53 je LABEL(strcmp_exitz); \
54 mov %r9, %r11
55 #else /* none of the variant macros is defined: no counter update, and STRCMP defaults to strcmp */
56 # define UPDATE_STRNCMP_COUNTER
57 # ifndef STRCMP
58 # define STRCMP strcmp
59 # endif
60 #endif
61 /* Emit the code into .text, or into the separate .text.ssse3 section when USE_SSSE3 is defined. */
62 #ifndef USE_SSSE3
63 .text
64 #else
65 .section .text.ssse3,"ax",@progbits
66 #endif
67
68 #ifdef USE_AS_STRCASECMP_L
69 # ifndef ENTRY2
70 # define ENTRY2(name) ENTRY (name)
71 # define END2(name) END (name)
72 # endif
73 /* __strcasecmp: load the thread-local __libc_tsd_LOCALE value into %RDX_LP (read as the locale pointer by the code that follows) and fall through; the stub itself contains no ret or jmp. */
74 ENTRY2 (__strcasecmp)
75 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
76 mov %fs:(%rax),%RDX_LP
77
78 // XXX 5 byte should be before the function
79 /* 5-byte NOP (0f 1f 44 00 00 encodes nopl 0x0(%rax,%rax,1)). NOTE(review): presumably pads the stub ahead of the fall-through entry point -- confirm. */
80 .byte 0x0f,0x1f,0x44,0x00,0x00
81 END2 (__strcasecmp)
82 # ifndef NO_NOLOCALE_ALIAS
83 weak_alias (__strcasecmp, strcasecmp)
84 libc_hidden_def (__strcasecmp)
85 # endif
86 /* FALLTHROUGH to strcasecmp_l: execution continues into the entry point that follows this conditional block. */
87 #elif defined USE_AS_STRNCASECMP_L
88 # ifndef ENTRY2
89 # define ENTRY2(name) ENTRY (name)
90 # define END2(name) END (name)
91 # endif
92 /* __strncasecmp: load the thread-local __libc_tsd_LOCALE value into %RCX_LP (read as the locale pointer by the code that follows) and fall through; the stub itself contains no ret or jmp. */
93 ENTRY2 (__strncasecmp)
94 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
95 mov %fs:(%rax),%RCX_LP
96
97 // XXX 5 byte should be before the function
98 /* 5-byte NOP (0f 1f 44 00 00 encodes nopl 0x0(%rax,%rax,1)). NOTE(review): presumably pads the stub ahead of the fall-through entry point -- confirm. */
99 .byte 0x0f,0x1f,0x44,0x00,0x00
100 END2 (__strncasecmp)
101 # ifndef NO_NOLOCALE_ALIAS
102 weak_alias (__strncasecmp, strncasecmp)
103 libc_hidden_def (__strncasecmp)
104 # endif
105 /* FALLTHROUGH to strncasecmp_l: execution continues into the entry point that follows this conditional block. */
106 #endif
107
108 ENTRY (STRCMP)
109 #ifdef USE_AS_STRCASECMP_L
110 /* We have to fall back on the C implementation for locales
111 with encodings not matching ASCII for single bytes. */
112 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
113 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
114 # else
115 mov (%rdx), %RAX_LP
116 # endif
117 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
118 jne __strcasecmp_l_nonascii
119 #elif defined USE_AS_STRNCASECMP_L
120 /* We have to fall back on the C implementation for locales
121 with encodings not matching ASCII for single bytes. */
122 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
123 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
124 # else
125 mov (%rcx), %RAX_LP
126 # endif
127 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
128 jne __strncasecmp_l_nonascii
129 #endif
130
131 /*
132 * This implementation uses SSE to compare up to 16 bytes at a time.
133 */
134 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
135 test %RDX_LP, %RDX_LP
136 je LABEL(strcmp_exitz)
137 cmp $1, %RDX_LP
138 je LABEL(Byte0)
139 mov %RDX_LP, %R11_LP
140 #endif
141 mov %esi, %ecx
142 mov %edi, %eax
143 /* Use 64bit AND here to avoid long NOP padding. */
144 and $0x3f, %rcx /* rsi alignment in cache line */
145 and $0x3f, %rax /* rdi alignment in cache line */
146 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
147 .section .rodata.cst16,"aM",@progbits,16
148 .align 16
149 .Lbelowupper:
150 .quad 0x4040404040404040
151 .quad 0x4040404040404040
152 .Ltopupper:
153 .quad 0x5b5b5b5b5b5b5b5b
154 .quad 0x5b5b5b5b5b5b5b5b
155 .Ltouppermask:
156 .quad 0x2020202020202020
157 .quad 0x2020202020202020
158 .previous
159 movdqa .Lbelowupper(%rip), %xmm5
160 # define UCLOW_reg %xmm5
161 movdqa .Ltopupper(%rip), %xmm6
162 # define UCHIGH_reg %xmm6
163 movdqa .Ltouppermask(%rip), %xmm7
164 # define LCQWORD_reg %xmm7
165 #endif
166 cmp $0x30, %ecx
167 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
168 cmp $0x30, %eax
169 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
170 movlpd (%rdi), %xmm1
171 movlpd (%rsi), %xmm2
172 movhpd 8(%rdi), %xmm1
173 movhpd 8(%rsi), %xmm2
174 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
175 # define TOLOWER(reg1, reg2) \
176 movdqa reg1, %xmm8; \
177 movdqa UCHIGH_reg, %xmm9; \
178 movdqa reg2, %xmm10; \
179 movdqa UCHIGH_reg, %xmm11; \
180 pcmpgtb UCLOW_reg, %xmm8; \
181 pcmpgtb reg1, %xmm9; \
182 pcmpgtb UCLOW_reg, %xmm10; \
183 pcmpgtb reg2, %xmm11; \
184 pand %xmm9, %xmm8; \
185 pand %xmm11, %xmm10; \
186 pand LCQWORD_reg, %xmm8; \
187 pand LCQWORD_reg, %xmm10; \
188 por %xmm8, reg1; \
189 por %xmm10, reg2
190 TOLOWER (%xmm1, %xmm2)
191 #else
192 # define TOLOWER(reg1, reg2)
193 #endif
194 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
195 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
196 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
197 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
198 pmovmskb %xmm1, %edx
199 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
200 jnz LABEL(less16bytes) /* If not, find different value or null char */
201 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
202 sub $16, %r11
203 jbe LABEL(strcmp_exitz) /* finish comparison */
204 #endif
205 add $16, %rsi /* prepare to search next 16 bytes */
206 add $16, %rdi /* prepare to search next 16 bytes */
207
208 /*
209 * Determine source and destination string offsets from 16-byte alignment.
210 * Use relative offset difference between the two to determine which case
211 * below to use.
212 */
213 .p2align 4
214 LABEL(crosscache):
215 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
216 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
217 mov $0xffff, %edx /* for equivalent offset */
218 xor %r8d, %r8d
219 and $0xf, %ecx /* offset of rsi */
220 and $0xf, %eax /* offset of rdi */
221 cmp %eax, %ecx
222 je LABEL(ashr_0) /* rsi and rdi relative offset same */
223 ja LABEL(bigger)
224 mov %edx, %r8d /* r8d is offset flag for exit tail */
225 xchg %ecx, %eax
226 xchg %rsi, %rdi
227 LABEL(bigger):
228 lea 15(%rax), %r9
229 sub %rcx, %r9
230 lea LABEL(unaligned_table)(%rip), %r10
231 movslq (%r10, %r9,4), %r9
232 lea (%r10, %r9), %r10
233 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
234
235 /*
236 * The following cases will be handled by ashr_0
237 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
238 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
239 */
240 .p2align 4
241 LABEL(ashr_0):
242
243 movdqa (%rsi), %xmm1
244 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
245 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
246 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
247 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
248 #else
249 movdqa (%rdi), %xmm2
250 TOLOWER (%xmm1, %xmm2)
251 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
252 #endif
253 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
254 pmovmskb %xmm1, %r9d
255 shr %cl, %edx /* adjust 0xffff for offset */
256 shr %cl, %r9d /* adjust for 16-byte offset */
257 sub %r9d, %edx
258 /*
259 * edx must equal r9d if the remaining (16-rcx) bytes are equal to the
260 * bytes starting from (16-rax) and no null char was seen.
261 */
262 jne LABEL(less32bytes) /* mismatch or null char */
263 UPDATE_STRNCMP_COUNTER
264 mov $16, %rcx
265 mov $16, %r9
266 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
267
268 /*
269 * Now both strings are aligned at 16-byte boundary. Loop over strings
270 * checking 32-bytes per iteration.
271 */
272 .p2align 4
273 LABEL(loop_ashr_0):
274 movdqa (%rsi, %rcx), %xmm1
275 movdqa (%rdi, %rcx), %xmm2
276 TOLOWER (%xmm1, %xmm2)
277
278 pcmpeqb %xmm1, %xmm0
279 pcmpeqb %xmm2, %xmm1
280 psubb %xmm0, %xmm1
281 pmovmskb %xmm1, %edx
282 sub $0xffff, %edx
283 jnz LABEL(exit) /* mismatch or null char seen */
284
285 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
286 sub $16, %r11
287 jbe LABEL(strcmp_exitz)
288 #endif
289 add $16, %rcx
290 movdqa (%rsi, %rcx), %xmm1
291 movdqa (%rdi, %rcx), %xmm2
292 TOLOWER (%xmm1, %xmm2)
293
294 pcmpeqb %xmm1, %xmm0
295 pcmpeqb %xmm2, %xmm1
296 psubb %xmm0, %xmm1
297 pmovmskb %xmm1, %edx
298 sub $0xffff, %edx
299 jnz LABEL(exit)
300 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
301 sub $16, %r11
302 jbe LABEL(strcmp_exitz)
303 #endif
304 add $16, %rcx
305 jmp LABEL(loop_ashr_0)
306
307 /*
308 * The following cases will be handled by ashr_1
309 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
310 * n(15) n -15 0(15 +(n-15) - n) ashr_1
311 */
312 .p2align 4
313 LABEL(ashr_1):
314 pxor %xmm0, %xmm0
315 movdqa (%rdi), %xmm2
316 movdqa (%rsi), %xmm1
317 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
318 pslldq $15, %xmm2 /* shift first string to align with second */
319 TOLOWER (%xmm1, %xmm2)
320 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
321 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
322 pmovmskb %xmm2, %r9d
323 shr %cl, %edx /* adjust 0xffff for offset */
324 shr %cl, %r9d /* adjust for 16-byte offset */
325 sub %r9d, %edx
326 jnz LABEL(less32bytes) /* mismatch or null char seen */
327 movdqa (%rdi), %xmm3
328 UPDATE_STRNCMP_COUNTER
329
330 pxor %xmm0, %xmm0
331 mov $16, %rcx /* index for loads*/
332 mov $1, %r9d /* byte position left over from less32bytes case */
333 /*
334 * Setup %r10 value allows us to detect crossing a page boundary.
335 * When %r10 goes positive we have crossed a page boundary and
336 * need to do a nibble.
337 */
338 lea 1(%rdi), %r10
339 and $0xfff, %r10 /* offset into 4K page */
340 sub $0x1000, %r10 /* subtract 4K pagesize */
341
342 .p2align 4
343 LABEL(loop_ashr_1):
344 add $16, %r10
345 jg LABEL(nibble_ashr_1) /* cross page boundary */
346
347 LABEL(gobble_ashr_1):
348 movdqa (%rsi, %rcx), %xmm1
349 movdqa (%rdi, %rcx), %xmm2
350 movdqa %xmm2, %xmm4 /* store for next cycle */
351
352 #ifndef USE_SSSE3
353 psrldq $1, %xmm3
354 pslldq $15, %xmm2
355 por %xmm3, %xmm2 /* merge into one 16byte value */
356 #else
357 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
358 #endif
359 TOLOWER (%xmm1, %xmm2)
360
361 pcmpeqb %xmm1, %xmm0
362 pcmpeqb %xmm2, %xmm1
363 psubb %xmm0, %xmm1
364 pmovmskb %xmm1, %edx
365 sub $0xffff, %edx
366 jnz LABEL(exit)
367
368 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
369 sub $16, %r11
370 jbe LABEL(strcmp_exitz)
371 #endif
372 add $16, %rcx
373 movdqa %xmm4, %xmm3
374
375 add $16, %r10
376 jg LABEL(nibble_ashr_1) /* cross page boundary */
377
378 movdqa (%rsi, %rcx), %xmm1
379 movdqa (%rdi, %rcx), %xmm2
380 movdqa %xmm2, %xmm4 /* store for next cycle */
381
382 #ifndef USE_SSSE3
383 psrldq $1, %xmm3
384 pslldq $15, %xmm2
385 por %xmm3, %xmm2 /* merge into one 16byte value */
386 #else
387 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
388 #endif
389 TOLOWER (%xmm1, %xmm2)
390
391 pcmpeqb %xmm1, %xmm0
392 pcmpeqb %xmm2, %xmm1
393 psubb %xmm0, %xmm1
394 pmovmskb %xmm1, %edx
395 sub $0xffff, %edx
396 jnz LABEL(exit)
397
398 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
399 sub $16, %r11
400 jbe LABEL(strcmp_exitz)
401 #endif
402 add $16, %rcx
403 movdqa %xmm4, %xmm3
404 jmp LABEL(loop_ashr_1)
405
406 /*
407 * Nibble avoids loads across page boundary. This is to avoid a potential
408 * access into unmapped memory.
409 */
410 .p2align 4
411 LABEL(nibble_ashr_1):
412 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
413 pmovmskb %xmm0, %edx
414 test $0xfffe, %edx
415 jnz LABEL(ashr_1_exittail) /* find null char*/
416
417 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
418 cmp $15, %r11
419 jbe LABEL(ashr_1_exittail)
420 #endif
421
422 pxor %xmm0, %xmm0
423 sub $0x1000, %r10 /* subtract 4K from %r10 */
424 jmp LABEL(gobble_ashr_1)
425
426 /*
427 * Once a null char is found, determine if there is a string mismatch
428 * before the null char.
429 */
430 .p2align 4
431 LABEL(ashr_1_exittail):
432 movdqa (%rsi, %rcx), %xmm1
433 psrldq $1, %xmm0
434 psrldq $1, %xmm3
435 jmp LABEL(aftertail)
436
437 /*
438 * The following cases will be handled by ashr_2
439 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
440 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
441 */
442 .p2align 4
443 LABEL(ashr_2):
444 pxor %xmm0, %xmm0
445 movdqa (%rdi), %xmm2
446 movdqa (%rsi), %xmm1
447 pcmpeqb %xmm1, %xmm0
448 pslldq $14, %xmm2
449 TOLOWER (%xmm1, %xmm2)
450 pcmpeqb %xmm1, %xmm2
451 psubb %xmm0, %xmm2
452 pmovmskb %xmm2, %r9d
453 shr %cl, %edx
454 shr %cl, %r9d
455 sub %r9d, %edx
456 jnz LABEL(less32bytes)
457 movdqa (%rdi), %xmm3
458 UPDATE_STRNCMP_COUNTER
459
460 pxor %xmm0, %xmm0
461 mov $16, %rcx /* index for loads */
462 mov $2, %r9d /* byte position left over from less32bytes case */
463 /*
464 * Setup %r10 value allows us to detect crossing a page boundary.
465 * When %r10 goes positive we have crossed a page boundary and
466 * need to do a nibble.
467 */
468 lea 2(%rdi), %r10
469 and $0xfff, %r10 /* offset into 4K page */
470 sub $0x1000, %r10 /* subtract 4K pagesize */
471
472 .p2align 4
473 LABEL(loop_ashr_2):
474 add $16, %r10
475 jg LABEL(nibble_ashr_2)
476
477 LABEL(gobble_ashr_2):
478 movdqa (%rsi, %rcx), %xmm1
479 movdqa (%rdi, %rcx), %xmm2
480 movdqa %xmm2, %xmm4
481
482 #ifndef USE_SSSE3
483 psrldq $2, %xmm3
484 pslldq $14, %xmm2
485 por %xmm3, %xmm2 /* merge into one 16byte value */
486 #else
487 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
488 #endif
489 TOLOWER (%xmm1, %xmm2)
490
491 pcmpeqb %xmm1, %xmm0
492 pcmpeqb %xmm2, %xmm1
493 psubb %xmm0, %xmm1
494 pmovmskb %xmm1, %edx
495 sub $0xffff, %edx
496 jnz LABEL(exit)
497
498 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
499 sub $16, %r11
500 jbe LABEL(strcmp_exitz)
501 #endif
502
503 add $16, %rcx
504 movdqa %xmm4, %xmm3
505
506 add $16, %r10
507 jg LABEL(nibble_ashr_2) /* cross page boundary */
508
509 movdqa (%rsi, %rcx), %xmm1
510 movdqa (%rdi, %rcx), %xmm2
511 movdqa %xmm2, %xmm4
512
513 #ifndef USE_SSSE3
514 psrldq $2, %xmm3
515 pslldq $14, %xmm2
516 por %xmm3, %xmm2 /* merge into one 16byte value */
517 #else
518 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
519 #endif
520 TOLOWER (%xmm1, %xmm2)
521
522 pcmpeqb %xmm1, %xmm0
523 pcmpeqb %xmm2, %xmm1
524 psubb %xmm0, %xmm1
525 pmovmskb %xmm1, %edx
526 sub $0xffff, %edx
527 jnz LABEL(exit)
528
529 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
530 sub $16, %r11
531 jbe LABEL(strcmp_exitz)
532 #endif
533
534 add $16, %rcx
535 movdqa %xmm4, %xmm3
536 jmp LABEL(loop_ashr_2)
537
538 .p2align 4
539 LABEL(nibble_ashr_2):
540 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
541 pmovmskb %xmm0, %edx
542 test $0xfffc, %edx
543 jnz LABEL(ashr_2_exittail)
544
545 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
546 cmp $14, %r11
547 jbe LABEL(ashr_2_exittail)
548 #endif
549
550 pxor %xmm0, %xmm0
551 sub $0x1000, %r10
552 jmp LABEL(gobble_ashr_2)
553
554 .p2align 4
555 LABEL(ashr_2_exittail):
556 movdqa (%rsi, %rcx), %xmm1
557 psrldq $2, %xmm0
558 psrldq $2, %xmm3
559 jmp LABEL(aftertail)
560
561 /*
562 * The following cases will be handled by ashr_3
563 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
564 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
565 */
566 .p2align 4
567 LABEL(ashr_3):
568 pxor %xmm0, %xmm0
569 movdqa (%rdi), %xmm2
570 movdqa (%rsi), %xmm1
571 pcmpeqb %xmm1, %xmm0
572 pslldq $13, %xmm2
573 TOLOWER (%xmm1, %xmm2)
574 pcmpeqb %xmm1, %xmm2
575 psubb %xmm0, %xmm2
576 pmovmskb %xmm2, %r9d
577 shr %cl, %edx
578 shr %cl, %r9d
579 sub %r9d, %edx
580 jnz LABEL(less32bytes)
581 movdqa (%rdi), %xmm3
582
583 UPDATE_STRNCMP_COUNTER
584
585 pxor %xmm0, %xmm0
586 mov $16, %rcx /* index for loads */
587 mov $3, %r9d /* byte position left over from less32bytes case */
588 /*
589 * Setup %r10 value allows us to detect crossing a page boundary.
590 * When %r10 goes positive we have crossed a page boundary and
591 * need to do a nibble.
592 */
593 lea 3(%rdi), %r10
594 and $0xfff, %r10 /* offset into 4K page */
595 sub $0x1000, %r10 /* subtract 4K pagesize */
596
597 .p2align 4
598 LABEL(loop_ashr_3):
599 add $16, %r10
600 jg LABEL(nibble_ashr_3)
601
602 LABEL(gobble_ashr_3):
603 movdqa (%rsi, %rcx), %xmm1
604 movdqa (%rdi, %rcx), %xmm2
605 movdqa %xmm2, %xmm4
606
607 #ifndef USE_SSSE3
608 psrldq $3, %xmm3
609 pslldq $13, %xmm2
610 por %xmm3, %xmm2 /* merge into one 16byte value */
611 #else
612 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
613 #endif
614 TOLOWER (%xmm1, %xmm2)
615
616 pcmpeqb %xmm1, %xmm0
617 pcmpeqb %xmm2, %xmm1
618 psubb %xmm0, %xmm1
619 pmovmskb %xmm1, %edx
620 sub $0xffff, %edx
621 jnz LABEL(exit)
622
623 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
624 sub $16, %r11
625 jbe LABEL(strcmp_exitz)
626 #endif
627
628 add $16, %rcx
629 movdqa %xmm4, %xmm3
630
631 add $16, %r10
632 jg LABEL(nibble_ashr_3) /* cross page boundary */
633
634 movdqa (%rsi, %rcx), %xmm1
635 movdqa (%rdi, %rcx), %xmm2
636 movdqa %xmm2, %xmm4
637
638 #ifndef USE_SSSE3
639 psrldq $3, %xmm3
640 pslldq $13, %xmm2
641 por %xmm3, %xmm2 /* merge into one 16byte value */
642 #else
643 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
644 #endif
645 TOLOWER (%xmm1, %xmm2)
646
647 pcmpeqb %xmm1, %xmm0
648 pcmpeqb %xmm2, %xmm1
649 psubb %xmm0, %xmm1
650 pmovmskb %xmm1, %edx
651 sub $0xffff, %edx
652 jnz LABEL(exit)
653
654 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
655 sub $16, %r11
656 jbe LABEL(strcmp_exitz)
657 #endif
658
659 add $16, %rcx
660 movdqa %xmm4, %xmm3
661 jmp LABEL(loop_ashr_3)
662
663 .p2align 4
664 LABEL(nibble_ashr_3):
665 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
666 pmovmskb %xmm0, %edx
667 test $0xfff8, %edx
668 jnz LABEL(ashr_3_exittail)
669
670 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
671 cmp $13, %r11
672 jbe LABEL(ashr_3_exittail)
673 #endif
674
675 pxor %xmm0, %xmm0
676 sub $0x1000, %r10
677 jmp LABEL(gobble_ashr_3)
678
679 .p2align 4
680 LABEL(ashr_3_exittail):
681 movdqa (%rsi, %rcx), %xmm1
682 psrldq $3, %xmm0
683 psrldq $3, %xmm3
684 jmp LABEL(aftertail)
685
686 /*
687 * The following cases will be handled by ashr_4
688 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
689 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
690 */
691 .p2align 4
692 LABEL(ashr_4):
693 pxor %xmm0, %xmm0
694 movdqa (%rdi), %xmm2
695 movdqa (%rsi), %xmm1
696 pcmpeqb %xmm1, %xmm0
697 pslldq $12, %xmm2
698 TOLOWER (%xmm1, %xmm2)
699 pcmpeqb %xmm1, %xmm2
700 psubb %xmm0, %xmm2
701 pmovmskb %xmm2, %r9d
702 shr %cl, %edx
703 shr %cl, %r9d
704 sub %r9d, %edx
705 jnz LABEL(less32bytes)
706 movdqa (%rdi), %xmm3
707
708 UPDATE_STRNCMP_COUNTER
709
710 pxor %xmm0, %xmm0
711 mov $16, %rcx /* index for loads */
712 mov $4, %r9d /* byte position left over from less32bytes case */
713 /*
714 * Setup %r10 value allows us to detect crossing a page boundary.
715 * When %r10 goes positive we have crossed a page boundary and
716 * need to do a nibble.
717 */
718 lea 4(%rdi), %r10
719 and $0xfff, %r10 /* offset into 4K page */
720 sub $0x1000, %r10 /* subtract 4K pagesize */
721
722 .p2align 4
723 LABEL(loop_ashr_4):
724 add $16, %r10
725 jg LABEL(nibble_ashr_4)
726
727 LABEL(gobble_ashr_4):
728 movdqa (%rsi, %rcx), %xmm1
729 movdqa (%rdi, %rcx), %xmm2
730 movdqa %xmm2, %xmm4
731
732 #ifndef USE_SSSE3
733 psrldq $4, %xmm3
734 pslldq $12, %xmm2
735 por %xmm3, %xmm2 /* merge into one 16byte value */
736 #else
737 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
738 #endif
739 TOLOWER (%xmm1, %xmm2)
740
741 pcmpeqb %xmm1, %xmm0
742 pcmpeqb %xmm2, %xmm1
743 psubb %xmm0, %xmm1
744 pmovmskb %xmm1, %edx
745 sub $0xffff, %edx
746 jnz LABEL(exit)
747
748 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
749 sub $16, %r11
750 jbe LABEL(strcmp_exitz)
751 #endif
752
753 add $16, %rcx
754 movdqa %xmm4, %xmm3
755
756 add $16, %r10
757 jg LABEL(nibble_ashr_4) /* cross page boundary */
758
759 movdqa (%rsi, %rcx), %xmm1
760 movdqa (%rdi, %rcx), %xmm2
761 movdqa %xmm2, %xmm4
762
763 #ifndef USE_SSSE3
764 psrldq $4, %xmm3
765 pslldq $12, %xmm2
766 por %xmm3, %xmm2 /* merge into one 16byte value */
767 #else
768 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
769 #endif
770 TOLOWER (%xmm1, %xmm2)
771
772 pcmpeqb %xmm1, %xmm0
773 pcmpeqb %xmm2, %xmm1
774 psubb %xmm0, %xmm1
775 pmovmskb %xmm1, %edx
776 sub $0xffff, %edx
777 jnz LABEL(exit)
778
779 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
780 sub $16, %r11
781 jbe LABEL(strcmp_exitz)
782 #endif
783
784 add $16, %rcx
785 movdqa %xmm4, %xmm3
786 jmp LABEL(loop_ashr_4)
787
788 .p2align 4
789 LABEL(nibble_ashr_4):
790 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
791 pmovmskb %xmm0, %edx
792 test $0xfff0, %edx
793 jnz LABEL(ashr_4_exittail)
794
795 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
796 cmp $12, %r11
797 jbe LABEL(ashr_4_exittail)
798 #endif
799
800 pxor %xmm0, %xmm0
801 sub $0x1000, %r10
802 jmp LABEL(gobble_ashr_4)
803
804 .p2align 4
805 LABEL(ashr_4_exittail):
806 movdqa (%rsi, %rcx), %xmm1
807 psrldq $4, %xmm0
808 psrldq $4, %xmm3
809 jmp LABEL(aftertail)
810
811 /*
812 * The following cases will be handled by ashr_5
813 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
814 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
815 */
816 .p2align 4
817 LABEL(ashr_5):
818 pxor %xmm0, %xmm0
819 movdqa (%rdi), %xmm2
820 movdqa (%rsi), %xmm1
821 pcmpeqb %xmm1, %xmm0
822 pslldq $11, %xmm2
823 TOLOWER (%xmm1, %xmm2)
824 pcmpeqb %xmm1, %xmm2
825 psubb %xmm0, %xmm2
826 pmovmskb %xmm2, %r9d
827 shr %cl, %edx
828 shr %cl, %r9d
829 sub %r9d, %edx
830 jnz LABEL(less32bytes)
831 movdqa (%rdi), %xmm3
832
833 UPDATE_STRNCMP_COUNTER
834
835 pxor %xmm0, %xmm0
836 mov $16, %rcx /* index for loads */
837 mov $5, %r9d /* byte position left over from less32bytes case */
838 /*
839 * Setup %r10 value allows us to detect crossing a page boundary.
840 * When %r10 goes positive we have crossed a page boundary and
841 * need to do a nibble.
842 */
843 lea 5(%rdi), %r10
844 and $0xfff, %r10 /* offset into 4K page */
845 sub $0x1000, %r10 /* subtract 4K pagesize */
846
847 .p2align 4
848 LABEL(loop_ashr_5):
849 add $16, %r10
850 jg LABEL(nibble_ashr_5)
851
852 LABEL(gobble_ashr_5):
853 movdqa (%rsi, %rcx), %xmm1
854 movdqa (%rdi, %rcx), %xmm2
855 movdqa %xmm2, %xmm4
856
857 #ifndef USE_SSSE3
858 psrldq $5, %xmm3
859 pslldq $11, %xmm2
860 por %xmm3, %xmm2 /* merge into one 16byte value */
861 #else
862 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
863 #endif
864 TOLOWER (%xmm1, %xmm2)
865
866 pcmpeqb %xmm1, %xmm0
867 pcmpeqb %xmm2, %xmm1
868 psubb %xmm0, %xmm1
869 pmovmskb %xmm1, %edx
870 sub $0xffff, %edx
871 jnz LABEL(exit)
872
873 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
874 sub $16, %r11
875 jbe LABEL(strcmp_exitz)
876 #endif
877
878 add $16, %rcx
879 movdqa %xmm4, %xmm3
880
881 add $16, %r10
882 jg LABEL(nibble_ashr_5) /* cross page boundary */
883
884 movdqa (%rsi, %rcx), %xmm1
885 movdqa (%rdi, %rcx), %xmm2
886 movdqa %xmm2, %xmm4
887
888 #ifndef USE_SSSE3
889 psrldq $5, %xmm3
890 pslldq $11, %xmm2
891 por %xmm3, %xmm2 /* merge into one 16byte value */
892 #else
893 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
894 #endif
895 TOLOWER (%xmm1, %xmm2)
896
897 pcmpeqb %xmm1, %xmm0
898 pcmpeqb %xmm2, %xmm1
899 psubb %xmm0, %xmm1
900 pmovmskb %xmm1, %edx
901 sub $0xffff, %edx
902 jnz LABEL(exit)
903
904 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
905 sub $16, %r11
906 jbe LABEL(strcmp_exitz)
907 #endif
908
909 add $16, %rcx
910 movdqa %xmm4, %xmm3
911 jmp LABEL(loop_ashr_5)
912
913 .p2align 4
914 LABEL(nibble_ashr_5):
915 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
916 pmovmskb %xmm0, %edx
917 test $0xffe0, %edx
918 jnz LABEL(ashr_5_exittail)
919
920 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
921 cmp $11, %r11
922 jbe LABEL(ashr_5_exittail)
923 #endif
924
925 pxor %xmm0, %xmm0
926 sub $0x1000, %r10
927 jmp LABEL(gobble_ashr_5)
928
929 .p2align 4
930 LABEL(ashr_5_exittail):
931 movdqa (%rsi, %rcx), %xmm1
932 psrldq $5, %xmm0
933 psrldq $5, %xmm3
934 jmp LABEL(aftertail)
935
936 /*
937 * The following cases will be handled by ashr_6
938 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
939 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
940 */
941 .p2align 4
942 LABEL(ashr_6):
943 pxor %xmm0, %xmm0
944 movdqa (%rdi), %xmm2
945 movdqa (%rsi), %xmm1
946 pcmpeqb %xmm1, %xmm0
947 pslldq $10, %xmm2
948 TOLOWER (%xmm1, %xmm2)
949 pcmpeqb %xmm1, %xmm2
950 psubb %xmm0, %xmm2
951 pmovmskb %xmm2, %r9d
952 shr %cl, %edx
953 shr %cl, %r9d
954 sub %r9d, %edx
955 jnz LABEL(less32bytes)
956 movdqa (%rdi), %xmm3
957
958 UPDATE_STRNCMP_COUNTER
959
960 pxor %xmm0, %xmm0
961 mov $16, %rcx /* index for loads */
962 mov $6, %r9d /* byte position left over from less32bytes case */
963 /*
964 * Setup %r10 value allows us to detect crossing a page boundary.
965 * When %r10 goes positive we have crossed a page boundary and
966 * need to do a nibble.
967 */
968 lea 6(%rdi), %r10
969 and $0xfff, %r10 /* offset into 4K page */
970 sub $0x1000, %r10 /* subtract 4K pagesize */
971
972 .p2align 4
973 LABEL(loop_ashr_6):
974 add $16, %r10
975 jg LABEL(nibble_ashr_6)
976
977 LABEL(gobble_ashr_6):
978 movdqa (%rsi, %rcx), %xmm1
979 movdqa (%rdi, %rcx), %xmm2
980 movdqa %xmm2, %xmm4
981
982 #ifndef USE_SSSE3
983 psrldq $6, %xmm3
984 pslldq $10, %xmm2
985 por %xmm3, %xmm2 /* merge into one 16byte value */
986 #else
987 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
988 #endif
989 TOLOWER (%xmm1, %xmm2)
990
991 pcmpeqb %xmm1, %xmm0
992 pcmpeqb %xmm2, %xmm1
993 psubb %xmm0, %xmm1
994 pmovmskb %xmm1, %edx
995 sub $0xffff, %edx
996 jnz LABEL(exit)
997
998 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
999 sub $16, %r11
1000 jbe LABEL(strcmp_exitz)
1001 #endif
1002
1003 add $16, %rcx
1004 movdqa %xmm4, %xmm3
1005
1006 add $16, %r10
1007 jg LABEL(nibble_ashr_6) /* cross page boundary */
1008
1009 movdqa (%rsi, %rcx), %xmm1
1010 movdqa (%rdi, %rcx), %xmm2
1011 movdqa %xmm2, %xmm4
1012
1013 #ifndef USE_SSSE3
1014 psrldq $6, %xmm3
1015 pslldq $10, %xmm2
1016 por %xmm3, %xmm2 /* merge into one 16byte value */
1017 #else
1018 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
1019 #endif
1020 TOLOWER (%xmm1, %xmm2)
1021
1022 pcmpeqb %xmm1, %xmm0
1023 pcmpeqb %xmm2, %xmm1
1024 psubb %xmm0, %xmm1
1025 pmovmskb %xmm1, %edx
1026 sub $0xffff, %edx
1027 jnz LABEL(exit)
1028
1029 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1030 sub $16, %r11
1031 jbe LABEL(strcmp_exitz)
1032 #endif
1033
1034 add $16, %rcx
1035 movdqa %xmm4, %xmm3
1036 jmp LABEL(loop_ashr_6)
1037
1038 .p2align 4
1039 LABEL(nibble_ashr_6):
1040 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1041 pmovmskb %xmm0, %edx
1042 test $0xffc0, %edx
1043 jnz LABEL(ashr_6_exittail)
1044
1045 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1046 cmp $10, %r11
1047 jbe LABEL(ashr_6_exittail)
1048 #endif
1049
1050 pxor %xmm0, %xmm0
1051 sub $0x1000, %r10
1052 jmp LABEL(gobble_ashr_6)
1053
1054 .p2align 4
1055 LABEL(ashr_6_exittail):
1056 movdqa (%rsi, %rcx), %xmm1
1057 psrldq $6, %xmm0
1058 psrldq $6, %xmm3
1059 jmp LABEL(aftertail)
1060
1061 /*
1062 * The following cases will be handled by ashr_7
1063 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1064 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
1065 */
1066 .p2align 4
1067 LABEL(ashr_7):
1068 pxor %xmm0, %xmm0
1069 movdqa (%rdi), %xmm2
1070 movdqa (%rsi), %xmm1
1071 pcmpeqb %xmm1, %xmm0
1072 pslldq $9, %xmm2
1073 TOLOWER (%xmm1, %xmm2)
1074 pcmpeqb %xmm1, %xmm2
1075 psubb %xmm0, %xmm2
1076 pmovmskb %xmm2, %r9d
1077 shr %cl, %edx
1078 shr %cl, %r9d
1079 sub %r9d, %edx
1080 jnz LABEL(less32bytes)
1081 movdqa (%rdi), %xmm3
1082
1083 UPDATE_STRNCMP_COUNTER
1084
1085 pxor %xmm0, %xmm0
1086 mov $16, %rcx /* index for loads */
1087 mov $7, %r9d /* byte position left over from less32bytes case */
1088 /*
1089 * Setup %r10 value allows us to detect crossing a page boundary.
1090 * When %r10 goes positive we have crossed a page boundary and
1091 * need to do a nibble.
1092 */
1093 lea 7(%rdi), %r10
1094 and $0xfff, %r10 /* offset into 4K page */
1095 sub $0x1000, %r10 /* subtract 4K pagesize */
1096
1097 .p2align 4
1098 LABEL(loop_ashr_7):
1099 add $16, %r10
1100 jg LABEL(nibble_ashr_7)
1101
1102 LABEL(gobble_ashr_7):
1103 movdqa (%rsi, %rcx), %xmm1
1104 movdqa (%rdi, %rcx), %xmm2
1105 movdqa %xmm2, %xmm4
1106
1107 #ifndef USE_SSSE3
1108 psrldq $7, %xmm3
1109 pslldq $9, %xmm2
1110 por %xmm3, %xmm2 /* merge into one 16byte value */
1111 #else
1112 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
1113 #endif
1114 TOLOWER (%xmm1, %xmm2)
1115
1116 pcmpeqb %xmm1, %xmm0
1117 pcmpeqb %xmm2, %xmm1
1118 psubb %xmm0, %xmm1
1119 pmovmskb %xmm1, %edx
1120 sub $0xffff, %edx
1121 jnz LABEL(exit)
1122
1123 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1124 sub $16, %r11
1125 jbe LABEL(strcmp_exitz)
1126 #endif
1127
1128 add $16, %rcx
1129 movdqa %xmm4, %xmm3
1130
1131 add $16, %r10
1132 jg LABEL(nibble_ashr_7) /* cross page boundary */
1133
1134 movdqa (%rsi, %rcx), %xmm1
1135 movdqa (%rdi, %rcx), %xmm2
1136 movdqa %xmm2, %xmm4
1137
1138 #ifndef USE_SSSE3
1139 psrldq $7, %xmm3
1140 pslldq $9, %xmm2
1141 por %xmm3, %xmm2 /* merge into one 16byte value */
1142 #else
1143 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
1144 #endif
1145 TOLOWER (%xmm1, %xmm2)
1146
1147 pcmpeqb %xmm1, %xmm0
1148 pcmpeqb %xmm2, %xmm1
1149 psubb %xmm0, %xmm1
1150 pmovmskb %xmm1, %edx
1151 sub $0xffff, %edx
1152 jnz LABEL(exit)
1153
1154 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1155 sub $16, %r11
1156 jbe LABEL(strcmp_exitz)
1157 #endif
1158
1159 add $16, %rcx
1160 movdqa %xmm4, %xmm3
1161 jmp LABEL(loop_ashr_7)
1162
1163 .p2align 4
1164 LABEL(nibble_ashr_7):
1165 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1166 pmovmskb %xmm0, %edx
1167 test $0xff80, %edx
1168 jnz LABEL(ashr_7_exittail)
1169
1170 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1171 cmp $9, %r11
1172 jbe LABEL(ashr_7_exittail)
1173 #endif
1174
1175 pxor %xmm0, %xmm0
1176 sub $0x1000, %r10
1177 jmp LABEL(gobble_ashr_7)
1178
1179 .p2align 4
1180 LABEL(ashr_7_exittail):
1181 movdqa (%rsi, %rcx), %xmm1
1182 psrldq $7, %xmm0
1183 psrldq $7, %xmm3
1184 jmp LABEL(aftertail)
1185
/* ashr_8: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 8 bytes are joined with the low 8
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1186 /*
1187 * The following cases will be handled by ashr_8
1188 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1189 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1190 */
1191 .p2align 4
1192 LABEL(ashr_8):
1193 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1194 movdqa (%rdi), %xmm2
1195 movdqa (%rsi), %xmm1
1196 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1197 pslldq $8, %xmm2 /* move the %rdi bytes up by 8 positions */
1198 TOLOWER (%xmm1, %xmm2)
1199 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1200 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1201 pmovmskb %xmm2, %r9d
1202 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1203 shr %cl, %r9d
1204 sub %r9d, %edx /* any difference is resolved in less32bytes */
1205 jnz LABEL(less32bytes)
1206 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1207
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1208 UPDATE_STRNCMP_COUNTER
1209
1210 pxor %xmm0, %xmm0
1211 mov $16, %rcx /* index for loads */
1212 mov $8, %r9d /* byte position left over from less32bytes case */
1213 /*
1214 * Setup %r10 value allows us to detect crossing a page boundary.
1215 * When %r10 goes positive we have crossed a page boundary and
1216 * need to do a nibble.
1217 */
1218 lea 8(%rdi), %r10
1219 and $0xfff, %r10 /* offset into 4K page */
1220 sub $0x1000, %r10 /* subtract 4K pagesize */
1221
1222 .p2align 4
1223 LABEL(loop_ashr_8):
1224 add $16, %r10
1225 jg LABEL(nibble_ashr_8) /* counter went positive: take the nibble path */
1226
1227 LABEL(gobble_ashr_8):
1228 movdqa (%rsi, %rcx), %xmm1
1229 movdqa (%rdi, %rcx), %xmm2
1230 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1231
1232 #ifndef USE_SSSE3
1233 psrldq $8, %xmm3
1234 pslldq $8, %xmm2
1235 por %xmm3, %xmm2 /* merge into one 16byte value */
1236 #else
1237 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1238 #endif
1239 TOLOWER (%xmm1, %xmm2)
1240
1241 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1242 pcmpeqb %xmm2, %xmm1
1243 psubb %xmm0, %xmm1
1244 pmovmskb %xmm1, %edx
1245 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1246 jnz LABEL(exit)
1247
1248 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1249 sub $16, %r11 /* 16 more bytes accounted for */
1250 jbe LABEL(strcmp_exitz)
1251 #endif
1252
1253 add $16, %rcx
1254 movdqa %xmm4, %xmm3
1255
/* Second, unrolled copy of the loop body.  */
1256 add $16, %r10
1257 jg LABEL(nibble_ashr_8) /* cross page boundary */
1258
1259 movdqa (%rsi, %rcx), %xmm1
1260 movdqa (%rdi, %rcx), %xmm2
1261 movdqa %xmm2, %xmm4
1262
1263 #ifndef USE_SSSE3
1264 psrldq $8, %xmm3
1265 pslldq $8, %xmm2
1266 por %xmm3, %xmm2 /* merge into one 16byte value */
1267 #else
1268 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1269 #endif
1270 TOLOWER (%xmm1, %xmm2)
1271
1272 pcmpeqb %xmm1, %xmm0
1273 pcmpeqb %xmm2, %xmm1
1274 psubb %xmm0, %xmm1
1275 pmovmskb %xmm1, %edx
1276 sub $0xffff, %edx
1277 jnz LABEL(exit)
1278
1279 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1280 sub $16, %r11
1281 jbe LABEL(strcmp_exitz)
1282 #endif
1283
1284 add $16, %rcx
1285 movdqa %xmm4, %xmm3
1286 jmp LABEL(loop_ashr_8)
1287
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 8..15) for NUL.  */
1288 .p2align 4
1289 LABEL(nibble_ashr_8):
1290 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1291 pmovmskb %xmm0, %edx
1292 test $0xff00, %edx
1293 jnz LABEL(ashr_8_exittail)
1294
1295 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1296 cmp $8, %r11 /* no more than the 8 pending bytes left to compare */
1297 jbe LABEL(ashr_8_exittail)
1298 #endif
1299
1300 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1301 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1302 jmp LABEL(gobble_ashr_8)
1303
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1304 .p2align 4
1305 LABEL(ashr_8_exittail):
1306 movdqa (%rsi, %rcx), %xmm1
1307 psrldq $8, %xmm0
1308 psrldq $8, %xmm3
1309 jmp LABEL(aftertail)
1310
/* ashr_9: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 7 bytes are joined with the low 9
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1311 /*
1312 * The following cases will be handled by ashr_9
1313 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1314 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1315 */
1316 .p2align 4
1317 LABEL(ashr_9):
1318 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1319 movdqa (%rdi), %xmm2
1320 movdqa (%rsi), %xmm1
1321 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1322 pslldq $7, %xmm2 /* move the %rdi bytes up by 7 positions */
1323 TOLOWER (%xmm1, %xmm2)
1324 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1325 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1326 pmovmskb %xmm2, %r9d
1327 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1328 shr %cl, %r9d
1329 sub %r9d, %edx /* any difference is resolved in less32bytes */
1330 jnz LABEL(less32bytes)
1331 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1332
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1333 UPDATE_STRNCMP_COUNTER
1334
1335 pxor %xmm0, %xmm0
1336 mov $16, %rcx /* index for loads */
1337 mov $9, %r9d /* byte position left over from less32bytes case */
1338 /*
1339 * Setup %r10 value allows us to detect crossing a page boundary.
1340 * When %r10 goes positive we have crossed a page boundary and
1341 * need to do a nibble.
1342 */
1343 lea 9(%rdi), %r10
1344 and $0xfff, %r10 /* offset into 4K page */
1345 sub $0x1000, %r10 /* subtract 4K pagesize */
1346
1347 .p2align 4
1348 LABEL(loop_ashr_9):
1349 add $16, %r10
1350 jg LABEL(nibble_ashr_9) /* counter went positive: take the nibble path */
1351
1352 LABEL(gobble_ashr_9):
1353 movdqa (%rsi, %rcx), %xmm1
1354 movdqa (%rdi, %rcx), %xmm2
1355 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1356
1357 #ifndef USE_SSSE3
1358 psrldq $9, %xmm3
1359 pslldq $7, %xmm2
1360 por %xmm3, %xmm2 /* merge into one 16byte value */
1361 #else
1362 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1363 #endif
1364 TOLOWER (%xmm1, %xmm2)
1365
1366 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1367 pcmpeqb %xmm2, %xmm1
1368 psubb %xmm0, %xmm1
1369 pmovmskb %xmm1, %edx
1370 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1371 jnz LABEL(exit)
1372
1373 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1374 sub $16, %r11 /* 16 more bytes accounted for */
1375 jbe LABEL(strcmp_exitz)
1376 #endif
1377
1378 add $16, %rcx
1379 movdqa %xmm4, %xmm3
1380
/* Second, unrolled copy of the loop body.  */
1381 add $16, %r10
1382 jg LABEL(nibble_ashr_9) /* cross page boundary */
1383
1384 movdqa (%rsi, %rcx), %xmm1
1385 movdqa (%rdi, %rcx), %xmm2
1386 movdqa %xmm2, %xmm4
1387
1388 #ifndef USE_SSSE3
1389 psrldq $9, %xmm3
1390 pslldq $7, %xmm2
1391 por %xmm3, %xmm2 /* merge into one 16byte value */
1392 #else
1393 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1394 #endif
1395 TOLOWER (%xmm1, %xmm2)
1396
1397 pcmpeqb %xmm1, %xmm0
1398 pcmpeqb %xmm2, %xmm1
1399 psubb %xmm0, %xmm1
1400 pmovmskb %xmm1, %edx
1401 sub $0xffff, %edx
1402 jnz LABEL(exit)
1403
1404 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1405 sub $16, %r11
1406 jbe LABEL(strcmp_exitz)
1407 #endif
1408
1409 add $16, %rcx
1410 movdqa %xmm4, %xmm3 /* store for next cycle */
1411 jmp LABEL(loop_ashr_9)
1412
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 9..15) for NUL.  */
1413 .p2align 4
1414 LABEL(nibble_ashr_9):
1415 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1416 pmovmskb %xmm0, %edx
1417 test $0xfe00, %edx
1418 jnz LABEL(ashr_9_exittail)
1419
1420 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1421 cmp $7, %r11 /* no more than the 7 pending bytes left to compare */
1422 jbe LABEL(ashr_9_exittail)
1423 #endif
1424
1425 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1426 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1427 jmp LABEL(gobble_ashr_9)
1428
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1429 .p2align 4
1430 LABEL(ashr_9_exittail):
1431 movdqa (%rsi, %rcx), %xmm1
1432 psrldq $9, %xmm0
1433 psrldq $9, %xmm3
1434 jmp LABEL(aftertail)
1435
/* ashr_10: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 6 bytes are joined with the low 10
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1436 /*
1437 * The following cases will be handled by ashr_10
1438 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1439 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1440 */
1441 .p2align 4
1442 LABEL(ashr_10):
1443 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1444 movdqa (%rdi), %xmm2
1445 movdqa (%rsi), %xmm1
1446 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1447 pslldq $6, %xmm2 /* move the %rdi bytes up by 6 positions */
1448 TOLOWER (%xmm1, %xmm2)
1449 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1450 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1451 pmovmskb %xmm2, %r9d
1452 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1453 shr %cl, %r9d
1454 sub %r9d, %edx /* any difference is resolved in less32bytes */
1455 jnz LABEL(less32bytes)
1456 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1457
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1458 UPDATE_STRNCMP_COUNTER
1459
1460 pxor %xmm0, %xmm0
1461 mov $16, %rcx /* index for loads */
1462 mov $10, %r9d /* byte position left over from less32bytes case */
1463 /*
1464 * Setup %r10 value allows us to detect crossing a page boundary.
1465 * When %r10 goes positive we have crossed a page boundary and
1466 * need to do a nibble.
1467 */
1468 lea 10(%rdi), %r10
1469 and $0xfff, %r10 /* offset into 4K page */
1470 sub $0x1000, %r10 /* subtract 4K pagesize */
1471
1472 .p2align 4
1473 LABEL(loop_ashr_10):
1474 add $16, %r10
1475 jg LABEL(nibble_ashr_10) /* counter went positive: take the nibble path */
1476
1477 LABEL(gobble_ashr_10):
1478 movdqa (%rsi, %rcx), %xmm1
1479 movdqa (%rdi, %rcx), %xmm2
1480 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1481
1482 #ifndef USE_SSSE3
1483 psrldq $10, %xmm3
1484 pslldq $6, %xmm2
1485 por %xmm3, %xmm2 /* merge into one 16byte value */
1486 #else
1487 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1488 #endif
1489 TOLOWER (%xmm1, %xmm2)
1490
1491 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1492 pcmpeqb %xmm2, %xmm1
1493 psubb %xmm0, %xmm1
1494 pmovmskb %xmm1, %edx
1495 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1496 jnz LABEL(exit)
1497
1498 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1499 sub $16, %r11 /* 16 more bytes accounted for */
1500 jbe LABEL(strcmp_exitz)
1501 #endif
1502
1503 add $16, %rcx
1504 movdqa %xmm4, %xmm3
1505
/* Second, unrolled copy of the loop body.  */
1506 add $16, %r10
1507 jg LABEL(nibble_ashr_10) /* cross page boundary */
1508
1509 movdqa (%rsi, %rcx), %xmm1
1510 movdqa (%rdi, %rcx), %xmm2
1511 movdqa %xmm2, %xmm4
1512
1513 #ifndef USE_SSSE3
1514 psrldq $10, %xmm3
1515 pslldq $6, %xmm2
1516 por %xmm3, %xmm2 /* merge into one 16byte value */
1517 #else
1518 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1519 #endif
1520 TOLOWER (%xmm1, %xmm2)
1521
1522 pcmpeqb %xmm1, %xmm0
1523 pcmpeqb %xmm2, %xmm1
1524 psubb %xmm0, %xmm1
1525 pmovmskb %xmm1, %edx
1526 sub $0xffff, %edx
1527 jnz LABEL(exit)
1528
1529 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1530 sub $16, %r11
1531 jbe LABEL(strcmp_exitz)
1532 #endif
1533
1534 add $16, %rcx
1535 movdqa %xmm4, %xmm3
1536 jmp LABEL(loop_ashr_10)
1537
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 10..15) for NUL.  */
1538 .p2align 4
1539 LABEL(nibble_ashr_10):
1540 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1541 pmovmskb %xmm0, %edx
1542 test $0xfc00, %edx
1543 jnz LABEL(ashr_10_exittail)
1544
1545 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1546 cmp $6, %r11 /* no more than the 6 pending bytes left to compare */
1547 jbe LABEL(ashr_10_exittail)
1548 #endif
1549
1550 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1551 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1552 jmp LABEL(gobble_ashr_10)
1553
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1554 .p2align 4
1555 LABEL(ashr_10_exittail):
1556 movdqa (%rsi, %rcx), %xmm1
1557 psrldq $10, %xmm0
1558 psrldq $10, %xmm3
1559 jmp LABEL(aftertail)
1560
/* ashr_11: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 5 bytes are joined with the low 11
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1561 /*
1562 * The following cases will be handled by ashr_11
1563 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1564 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1565 */
1566 .p2align 4
1567 LABEL(ashr_11):
1568 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1569 movdqa (%rdi), %xmm2
1570 movdqa (%rsi), %xmm1
1571 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1572 pslldq $5, %xmm2 /* move the %rdi bytes up by 5 positions */
1573 TOLOWER (%xmm1, %xmm2)
1574 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1575 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1576 pmovmskb %xmm2, %r9d
1577 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1578 shr %cl, %r9d
1579 sub %r9d, %edx /* any difference is resolved in less32bytes */
1580 jnz LABEL(less32bytes)
1581 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1582
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1583 UPDATE_STRNCMP_COUNTER
1584
1585 pxor %xmm0, %xmm0
1586 mov $16, %rcx /* index for loads */
1587 mov $11, %r9d /* byte position left over from less32bytes case */
1588 /*
1589 * Setup %r10 value allows us to detect crossing a page boundary.
1590 * When %r10 goes positive we have crossed a page boundary and
1591 * need to do a nibble.
1592 */
1593 lea 11(%rdi), %r10
1594 and $0xfff, %r10 /* offset into 4K page */
1595 sub $0x1000, %r10 /* subtract 4K pagesize */
1596
1597 .p2align 4
1598 LABEL(loop_ashr_11):
1599 add $16, %r10
1600 jg LABEL(nibble_ashr_11) /* counter went positive: take the nibble path */
1601
1602 LABEL(gobble_ashr_11):
1603 movdqa (%rsi, %rcx), %xmm1
1604 movdqa (%rdi, %rcx), %xmm2
1605 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1606
1607 #ifndef USE_SSSE3
1608 psrldq $11, %xmm3
1609 pslldq $5, %xmm2
1610 por %xmm3, %xmm2 /* merge into one 16byte value */
1611 #else
1612 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1613 #endif
1614 TOLOWER (%xmm1, %xmm2)
1615
1616 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1617 pcmpeqb %xmm2, %xmm1
1618 psubb %xmm0, %xmm1
1619 pmovmskb %xmm1, %edx
1620 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1621 jnz LABEL(exit)
1622
1623 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1624 sub $16, %r11 /* 16 more bytes accounted for */
1625 jbe LABEL(strcmp_exitz)
1626 #endif
1627
1628 add $16, %rcx
1629 movdqa %xmm4, %xmm3
1630
/* Second, unrolled copy of the loop body.  */
1631 add $16, %r10
1632 jg LABEL(nibble_ashr_11) /* cross page boundary */
1633
1634 movdqa (%rsi, %rcx), %xmm1
1635 movdqa (%rdi, %rcx), %xmm2
1636 movdqa %xmm2, %xmm4
1637
1638 #ifndef USE_SSSE3
1639 psrldq $11, %xmm3
1640 pslldq $5, %xmm2
1641 por %xmm3, %xmm2 /* merge into one 16byte value */
1642 #else
1643 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1644 #endif
1645 TOLOWER (%xmm1, %xmm2)
1646
1647 pcmpeqb %xmm1, %xmm0
1648 pcmpeqb %xmm2, %xmm1
1649 psubb %xmm0, %xmm1
1650 pmovmskb %xmm1, %edx
1651 sub $0xffff, %edx
1652 jnz LABEL(exit)
1653
1654 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1655 sub $16, %r11
1656 jbe LABEL(strcmp_exitz)
1657 #endif
1658
1659 add $16, %rcx
1660 movdqa %xmm4, %xmm3
1661 jmp LABEL(loop_ashr_11)
1662
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 11..15) for NUL.  */
1663 .p2align 4
1664 LABEL(nibble_ashr_11):
1665 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1666 pmovmskb %xmm0, %edx
1667 test $0xf800, %edx
1668 jnz LABEL(ashr_11_exittail)
1669
1670 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1671 cmp $5, %r11 /* no more than the 5 pending bytes left to compare */
1672 jbe LABEL(ashr_11_exittail)
1673 #endif
1674
1675 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1676 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1677 jmp LABEL(gobble_ashr_11)
1678
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1679 .p2align 4
1680 LABEL(ashr_11_exittail):
1681 movdqa (%rsi, %rcx), %xmm1
1682 psrldq $11, %xmm0
1683 psrldq $11, %xmm3
1684 jmp LABEL(aftertail)
1685
/* ashr_12: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 4 bytes are joined with the low 12
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1686 /*
1687 * The following cases will be handled by ashr_12
1688 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1689 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1690 */
1691 .p2align 4
1692 LABEL(ashr_12):
1693 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1694 movdqa (%rdi), %xmm2
1695 movdqa (%rsi), %xmm1
1696 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1697 pslldq $4, %xmm2 /* move the %rdi bytes up by 4 positions */
1698 TOLOWER (%xmm1, %xmm2)
1699 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1700 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1701 pmovmskb %xmm2, %r9d
1702 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1703 shr %cl, %r9d
1704 sub %r9d, %edx /* any difference is resolved in less32bytes */
1705 jnz LABEL(less32bytes)
1706 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1707
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1708 UPDATE_STRNCMP_COUNTER
1709
1710 pxor %xmm0, %xmm0
1711 mov $16, %rcx /* index for loads */
1712 mov $12, %r9d /* byte position left over from less32bytes case */
1713 /*
1714 * Setup %r10 value allows us to detect crossing a page boundary.
1715 * When %r10 goes positive we have crossed a page boundary and
1716 * need to do a nibble.
1717 */
1718 lea 12(%rdi), %r10
1719 and $0xfff, %r10 /* offset into 4K page */
1720 sub $0x1000, %r10 /* subtract 4K pagesize */
1721
1722 .p2align 4
1723 LABEL(loop_ashr_12):
1724 add $16, %r10
1725 jg LABEL(nibble_ashr_12) /* counter went positive: take the nibble path */
1726
1727 LABEL(gobble_ashr_12):
1728 movdqa (%rsi, %rcx), %xmm1
1729 movdqa (%rdi, %rcx), %xmm2
1730 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1731
1732 #ifndef USE_SSSE3
1733 psrldq $12, %xmm3
1734 pslldq $4, %xmm2
1735 por %xmm3, %xmm2 /* merge into one 16byte value */
1736 #else
1737 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1738 #endif
1739 TOLOWER (%xmm1, %xmm2)
1740
1741 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1742 pcmpeqb %xmm2, %xmm1
1743 psubb %xmm0, %xmm1
1744 pmovmskb %xmm1, %edx
1745 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1746 jnz LABEL(exit)
1747
1748 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1749 sub $16, %r11 /* 16 more bytes accounted for */
1750 jbe LABEL(strcmp_exitz)
1751 #endif
1752
1753 add $16, %rcx
1754 movdqa %xmm4, %xmm3
1755
/* Second, unrolled copy of the loop body.  */
1756 add $16, %r10
1757 jg LABEL(nibble_ashr_12) /* cross page boundary */
1758
1759 movdqa (%rsi, %rcx), %xmm1
1760 movdqa (%rdi, %rcx), %xmm2
1761 movdqa %xmm2, %xmm4
1762
1763 #ifndef USE_SSSE3
1764 psrldq $12, %xmm3
1765 pslldq $4, %xmm2
1766 por %xmm3, %xmm2 /* merge into one 16byte value */
1767 #else
1768 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1769 #endif
1770 TOLOWER (%xmm1, %xmm2)
1771
1772 pcmpeqb %xmm1, %xmm0
1773 pcmpeqb %xmm2, %xmm1
1774 psubb %xmm0, %xmm1
1775 pmovmskb %xmm1, %edx
1776 sub $0xffff, %edx
1777 jnz LABEL(exit)
1778
1779 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1780 sub $16, %r11
1781 jbe LABEL(strcmp_exitz)
1782 #endif
1783
1784 add $16, %rcx
1785 movdqa %xmm4, %xmm3
1786 jmp LABEL(loop_ashr_12)
1787
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 12..15) for NUL.  */
1788 .p2align 4
1789 LABEL(nibble_ashr_12):
1790 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1791 pmovmskb %xmm0, %edx
1792 test $0xf000, %edx
1793 jnz LABEL(ashr_12_exittail)
1794
1795 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1796 cmp $4, %r11 /* no more than the 4 pending bytes left to compare */
1797 jbe LABEL(ashr_12_exittail)
1798 #endif
1799
1800 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1801 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1802 jmp LABEL(gobble_ashr_12)
1803
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1804 .p2align 4
1805 LABEL(ashr_12_exittail):
1806 movdqa (%rsi, %rcx), %xmm1
1807 psrldq $12, %xmm0
1808 psrldq $12, %xmm3
1809 jmp LABEL(aftertail)
1810
/* ashr_13: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 3 bytes are joined with the low 13
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1811 /*
1812 * The following cases will be handled by ashr_13
1813 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1814 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1815 */
1816 .p2align 4
1817 LABEL(ashr_13):
1818 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1819 movdqa (%rdi), %xmm2
1820 movdqa (%rsi), %xmm1
1821 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1822 pslldq $3, %xmm2 /* move the %rdi bytes up by 3 positions */
1823 TOLOWER (%xmm1, %xmm2)
1824 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1825 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1826 pmovmskb %xmm2, %r9d
1827 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1828 shr %cl, %r9d
1829 sub %r9d, %edx /* any difference is resolved in less32bytes */
1830 jnz LABEL(less32bytes)
1831 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1832
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1833 UPDATE_STRNCMP_COUNTER
1834
1835 pxor %xmm0, %xmm0
1836 mov $16, %rcx /* index for loads */
1837 mov $13, %r9d /* byte position left over from less32bytes case */
1838 /*
1839 * Setup %r10 value allows us to detect crossing a page boundary.
1840 * When %r10 goes positive we have crossed a page boundary and
1841 * need to do a nibble.
1842 */
1843 lea 13(%rdi), %r10
1844 and $0xfff, %r10 /* offset into 4K page */
1845 sub $0x1000, %r10 /* subtract 4K pagesize */
1846
1847 .p2align 4
1848 LABEL(loop_ashr_13):
1849 add $16, %r10
1850 jg LABEL(nibble_ashr_13) /* counter went positive: take the nibble path */
1851
1852 LABEL(gobble_ashr_13):
1853 movdqa (%rsi, %rcx), %xmm1
1854 movdqa (%rdi, %rcx), %xmm2
1855 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1856
1857 #ifndef USE_SSSE3
1858 psrldq $13, %xmm3
1859 pslldq $3, %xmm2
1860 por %xmm3, %xmm2 /* merge into one 16byte value */
1861 #else
1862 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1863 #endif
1864 TOLOWER (%xmm1, %xmm2)
1865
1866 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1867 pcmpeqb %xmm2, %xmm1
1868 psubb %xmm0, %xmm1
1869 pmovmskb %xmm1, %edx
1870 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1871 jnz LABEL(exit)
1872
1873 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1874 sub $16, %r11 /* 16 more bytes accounted for */
1875 jbe LABEL(strcmp_exitz)
1876 #endif
1877
1878 add $16, %rcx
1879 movdqa %xmm4, %xmm3
1880
/* Second, unrolled copy of the loop body.  */
1881 add $16, %r10
1882 jg LABEL(nibble_ashr_13) /* cross page boundary */
1883
1884 movdqa (%rsi, %rcx), %xmm1
1885 movdqa (%rdi, %rcx), %xmm2
1886 movdqa %xmm2, %xmm4
1887
1888 #ifndef USE_SSSE3
1889 psrldq $13, %xmm3
1890 pslldq $3, %xmm2
1891 por %xmm3, %xmm2 /* merge into one 16byte value */
1892 #else
1893 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1894 #endif
1895 TOLOWER (%xmm1, %xmm2)
1896
1897 pcmpeqb %xmm1, %xmm0
1898 pcmpeqb %xmm2, %xmm1
1899 psubb %xmm0, %xmm1
1900 pmovmskb %xmm1, %edx
1901 sub $0xffff, %edx
1902 jnz LABEL(exit)
1903
1904 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1905 sub $16, %r11
1906 jbe LABEL(strcmp_exitz)
1907 #endif
1908
1909 add $16, %rcx
1910 movdqa %xmm4, %xmm3
1911 jmp LABEL(loop_ashr_13)
1912
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 13..15) for NUL.  */
1913 .p2align 4
1914 LABEL(nibble_ashr_13):
1915 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1916 pmovmskb %xmm0, %edx
1917 test $0xe000, %edx
1918 jnz LABEL(ashr_13_exittail)
1919
1920 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1921 cmp $3, %r11 /* no more than the 3 pending bytes left to compare */
1922 jbe LABEL(ashr_13_exittail)
1923 #endif
1924
1925 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
1926 sub $0x1000, %r10 /* pull the counter back by one 4K page */
1927 jmp LABEL(gobble_ashr_13)
1928
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
1929 .p2align 4
1930 LABEL(ashr_13_exittail):
1931 movdqa (%rsi, %rcx), %xmm1
1932 psrldq $13, %xmm0
1933 psrldq $13, %xmm3
1934 jmp LABEL(aftertail)
1935
/* ashr_14: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its upper 2 bytes are joined with the low 14
   bytes of the next block and compared against 16 bytes from %rsi.
   %rcx indexes both strings, %r10 counts up towards a 4K page edge and
   %r11 is the remaining length in the strncmp-style builds.  TOLOWER is
   a macro defined outside this excerpt (presumably a no-op unless the
   build is case-insensitive).  */
1936 /*
1937 * The following cases will be handled by ashr_14
1938 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1939 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1940 */
1941 .p2align 4
1942 LABEL(ashr_14):
1943 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
1944 movdqa (%rdi), %xmm2
1945 movdqa (%rsi), %xmm1
1946 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
1947 pslldq $2, %xmm2 /* move the %rdi bytes up by 2 positions */
1948 TOLOWER (%xmm1, %xmm2)
1949 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
1950 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
1951 pmovmskb %xmm2, %r9d
1952 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
1953 shr %cl, %r9d
1954 sub %r9d, %edx /* any difference is resolved in less32bytes */
1955 jnz LABEL(less32bytes)
1956 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
1957
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
1958 UPDATE_STRNCMP_COUNTER
1959
1960 pxor %xmm0, %xmm0
1961 mov $16, %rcx /* index for loads */
1962 mov $14, %r9d /* byte position left over from less32bytes case */
1963 /*
1964 * Setup %r10 value allows us to detect crossing a page boundary.
1965 * When %r10 goes positive we have crossed a page boundary and
1966 * need to do a nibble.
1967 */
1968 lea 14(%rdi), %r10
1969 and $0xfff, %r10 /* offset into 4K page */
1970 sub $0x1000, %r10 /* subtract 4K pagesize */
1971
1972 .p2align 4
1973 LABEL(loop_ashr_14):
1974 add $16, %r10
1975 jg LABEL(nibble_ashr_14) /* counter went positive: take the nibble path */
1976
1977 LABEL(gobble_ashr_14):
1978 movdqa (%rsi, %rcx), %xmm1
1979 movdqa (%rdi, %rcx), %xmm2
1980 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
1981
1982 #ifndef USE_SSSE3
1983 psrldq $14, %xmm3
1984 pslldq $2, %xmm2
1985 por %xmm3, %xmm2 /* merge into one 16byte value */
1986 #else
1987 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
1988 #endif
1989 TOLOWER (%xmm1, %xmm2)
1990
1991 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
1992 pcmpeqb %xmm2, %xmm1
1993 psubb %xmm0, %xmm1
1994 pmovmskb %xmm1, %edx
1995 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
1996 jnz LABEL(exit)
1997
1998 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1999 sub $16, %r11 /* 16 more bytes accounted for */
2000 jbe LABEL(strcmp_exitz)
2001 #endif
2002
2003 add $16, %rcx
2004 movdqa %xmm4, %xmm3
2005
/* Second, unrolled copy of the loop body.  */
2006 add $16, %r10
2007 jg LABEL(nibble_ashr_14) /* cross page boundary */
2008
2009 movdqa (%rsi, %rcx), %xmm1
2010 movdqa (%rdi, %rcx), %xmm2
2011 movdqa %xmm2, %xmm4
2012
2013 #ifndef USE_SSSE3
2014 psrldq $14, %xmm3
2015 pslldq $2, %xmm2
2016 por %xmm3, %xmm2 /* merge into one 16byte value */
2017 #else
2018 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
2019 #endif
2020 TOLOWER (%xmm1, %xmm2)
2021
2022 pcmpeqb %xmm1, %xmm0
2023 pcmpeqb %xmm2, %xmm1
2024 psubb %xmm0, %xmm1
2025 pmovmskb %xmm1, %edx
2026 sub $0xffff, %edx
2027 jnz LABEL(exit)
2028
2029 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2030 sub $16, %r11
2031 jbe LABEL(strcmp_exitz)
2032 #endif
2033
2034 add $16, %rcx
2035 movdqa %xmm4, %xmm3
2036 jmp LABEL(loop_ashr_14)
2037
/* Page-boundary path: before another block is loaded, check the bytes of
   %xmm3 that have not been compared yet (mask bits 14..15) for NUL.  */
2038 .p2align 4
2039 LABEL(nibble_ashr_14):
2040 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2041 pmovmskb %xmm0, %edx
2042 test $0xc000, %edx
2043 jnz LABEL(ashr_14_exittail)
2044
2045 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2046 cmp $2, %r11 /* no more than the 2 pending bytes left to compare */
2047 jbe LABEL(ashr_14_exittail)
2048 #endif
2049
2050 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
2051 sub $0x1000, %r10 /* pull the counter back by one 4K page */
2052 jmp LABEL(gobble_ashr_14)
2053
/* Tail: shift the pending bytes and their NUL mask down to byte 0 and
   finish in aftertail.  */
2054 .p2align 4
2055 LABEL(ashr_14_exittail):
2056 movdqa (%rsi, %rcx), %xmm1
2057 psrldq $14, %xmm0
2058 psrldq $14, %xmm3
2059 jmp LABEL(aftertail)
2060
/* ashr_15: first-block check plus main loop for one relative shift (see
   the table below).  Inside the loop %xmm3 holds the previous 16-byte
   block read from %rdi; its top byte is joined with the low 15 bytes of
   the next block and compared against 16 bytes from %rsi.  %rcx indexes
   both strings, %r10 counts up towards a 4K page edge and %r11 is the
   remaining length in the strncmp-style builds.  TOLOWER is a macro
   defined outside this excerpt (presumably a no-op unless the build is
   case-insensitive).  Unlike the other cases, the exit tail at the end
   of this block falls straight through into aftertail.  */
2061 /*
2062 * The following cases will be handled by ashr_15
2063 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
2064 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
2065 */
2066 .p2align 4
2067 LABEL(ashr_15):
2068 pxor %xmm0, %xmm0 /* %xmm0 = 0 for the NUL test */
2069 movdqa (%rdi), %xmm2
2070 movdqa (%rsi), %xmm1
2071 pcmpeqb %xmm1, %xmm0 /* 0xff where the %rsi block holds NUL */
2072 pslldq $1, %xmm2 /* move the %rdi bytes up by 1 position */
2073 TOLOWER (%xmm1, %xmm2)
2074 pcmpeqb %xmm1, %xmm2 /* 0xff where the bytes match */
2075 psubb %xmm0, %xmm2 /* top bit stays set only for matching, non-NUL bytes */
2076 pmovmskb %xmm2, %r9d
2077 shr %cl, %edx /* %edx is set up before entry (outside this excerpt) */
2078 shr %cl, %r9d
2079 sub %r9d, %edx /* any difference is resolved in less32bytes */
2080 jnz LABEL(less32bytes)
2081
2082 movdqa (%rdi), %xmm3 /* unshifted %rdi block, carried into the loop */
2083
/* strncmp-style builds: %r11 += %rcx - 16, leaving through strcmp_exitz
   if that wraps or reaches zero; expands to nothing otherwise.  */
2084 UPDATE_STRNCMP_COUNTER
2085
2086 pxor %xmm0, %xmm0
2087 mov $16, %rcx /* index for loads */
2088 mov $15, %r9d /* byte position left over from less32bytes case */
2089 /*
2090 * Setup %r10 value allows us to detect crossing a page boundary.
2091 * When %r10 goes positive we have crossed a page boundary and
2092 * need to do a nibble.
2093 */
2094 lea 15(%rdi), %r10
2095 and $0xfff, %r10 /* offset into 4K page */
2096
2097 sub $0x1000, %r10 /* subtract 4K pagesize */
2098
2099 .p2align 4
2100 LABEL(loop_ashr_15):
2101 add $16, %r10
2102 jg LABEL(nibble_ashr_15) /* counter went positive: take the nibble path */
2103
2104 LABEL(gobble_ashr_15):
2105 movdqa (%rsi, %rcx), %xmm1
2106 movdqa (%rdi, %rcx), %xmm2
2107 movdqa %xmm2, %xmm4 /* keep the raw %rdi block for the next round */
2108
2109 #ifndef USE_SSSE3
2110 psrldq $15, %xmm3
2111 pslldq $1, %xmm2
2112 por %xmm3, %xmm2 /* merge into one 16byte value */
2113 #else
2114 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2115 #endif
2116 TOLOWER (%xmm1, %xmm2)
2117
2118 pcmpeqb %xmm1, %xmm0 /* NUL mask; %xmm0 is all zero on entry to this step */
2119 pcmpeqb %xmm2, %xmm1
2120 psubb %xmm0, %xmm1
2121 pmovmskb %xmm1, %edx
2122 sub $0xffff, %edx /* zero only if all 16 bytes matched and none was NUL */
2123 jnz LABEL(exit)
2124
2125 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2126 sub $16, %r11 /* 16 more bytes accounted for */
2127 jbe LABEL(strcmp_exitz)
2128 #endif
2129
2130 add $16, %rcx
2131 movdqa %xmm4, %xmm3
2132
/* Second, unrolled copy of the loop body.  */
2133 add $16, %r10
2134 jg LABEL(nibble_ashr_15) /* cross page boundary */
2135
2136 movdqa (%rsi, %rcx), %xmm1
2137 movdqa (%rdi, %rcx), %xmm2
2138 movdqa %xmm2, %xmm4
2139
2140 #ifndef USE_SSSE3
2141 psrldq $15, %xmm3
2142 pslldq $1, %xmm2
2143 por %xmm3, %xmm2 /* merge into one 16byte value */
2144 #else
2145 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2146 #endif
2147 TOLOWER (%xmm1, %xmm2)
2148
2149 pcmpeqb %xmm1, %xmm0
2150 pcmpeqb %xmm2, %xmm1
2151 psubb %xmm0, %xmm1
2152 pmovmskb %xmm1, %edx
2153 sub $0xffff, %edx
2154 jnz LABEL(exit)
2155
2156 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2157 sub $16, %r11
2158 jbe LABEL(strcmp_exitz)
2159 #endif
2160
2161 add $16, %rcx
2162 movdqa %xmm4, %xmm3
2163 jmp LABEL(loop_ashr_15)
2164
/* Page-boundary path: before another block is loaded, check the one byte
   of %xmm3 that has not been compared yet (mask bit 15) for NUL.  */
2165 .p2align 4
2166 LABEL(nibble_ashr_15):
2167 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2168 pmovmskb %xmm0, %edx
2169 test $0x8000, %edx
2170 jnz LABEL(ashr_15_exittail)
2171
2172 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2173 cmpq $1, %r11 /* no more than the 1 pending byte left to compare */
2174 jbe LABEL(ashr_15_exittail)
2175 #endif
2176
2177 pxor %xmm0, %xmm0 /* clear the NUL mask before rejoining the loop */
2178 sub $0x1000, %r10 /* pull the counter back by one 4K page */
2179 jmp LABEL(gobble_ashr_15)
2180
/* Tail: shift the pending byte and its NUL mask down to byte 0; control
   then falls through into aftertail (no jump needed).  */
2181 .p2align 4
2182 LABEL(ashr_15_exittail):
2183 movdqa (%rsi, %rcx), %xmm1
2184 psrldq $15, %xmm3
2185 psrldq $15, %xmm0
2186
/* aftertail: common finish for the ashr_N_exittail paths.  On entry
   %xmm1 is the current %rsi block, %xmm3 the pending %rdi bytes shifted
   down to byte 0 and %xmm0 the equally shifted NUL mask.  Produces in
   %edx a mask whose set bits mark positions that are unequal or NUL
   (the inverse of the "equal and non-NUL" mask), then falls into exit.  */
2187 .p2align 4
2188 LABEL(aftertail):
2189 TOLOWER (%xmm1, %xmm3)
2190 pcmpeqb %xmm3, %xmm1 /* 0xff where the bytes match */
2191 psubb %xmm0, %xmm1 /* top bit stays set only for matching, non-NUL bytes */
2192 pmovmskb %xmm1, %edx
2193 not %edx /* set bits now flag the stopping positions */
2194
/* exit / less32bytes: turn the block index into real addresses.
   exit computes %rax = %r9 + %rcx - 16 (the %rdi-side offset) and falls
   into less32bytes, which is also entered directly from the ashr_N
   first-block checks with %rax already set by code outside this
   excerpt.  A non-zero %r8d means the operands are swapped back before
   the final byte compare.  */
2195 .p2align 4
2196 LABEL(exit):
2197 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
2198 LABEL(less32bytes):
2199 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
2200 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
2201 test %r8d, %r8d
2202 jz LABEL(ret)
2203 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
2204
/* ret / less16bytes: final byte compare.  %rdx holds a bit mask whose
   lowest set bit is the index of the deciding byte; %rdi and %rsi point
   at the two blocks.  Returns in %eax the byte from (%rdi) minus the
   byte from (%rsi) at that index, or 0 via strcmp_exitz when the
   length limit of the strncmp-style builds is reached first.  */
2205 .p2align 4
2206 LABEL(ret):
2207 LABEL(less16bytes):
2208 bsf %rdx, %rdx /* find and store bit index in %rdx */
2209
2210 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2211 sub %rdx, %r11 /* deciding byte at or beyond the remaining count? */
2212 jbe LABEL(strcmp_exitz)
2213 #endif
2214 movzbl (%rsi, %rdx), %ecx
2215 movzbl (%rdi, %rdx), %eax
2216
2217 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Map both bytes through the 4-byte-per-entry table at
   _nl_C_LC_CTYPE_tolower + 128*4 before subtracting.  */
2218 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2219 movl (%rdx,%rcx,4), %ecx
2220 movl (%rdx,%rax,4), %eax
2221 #endif
2222
2223 sub %ecx, %eax
2224 ret
2225
/* strcmp_exitz: return 0 in %eax.  Reached from the length-limit checks
   of the strncmp-style builds.  */
2226 LABEL(strcmp_exitz):
2227 xor %eax, %eax
2228 ret
2229
/* Byte0: return the difference of the first bytes, (%rdi)[0] minus
   (%rsi)[0], zero-extended; in the case-insensitive builds both bytes
   are first mapped through the _nl_C_LC_CTYPE_tolower table.  The jump
   that reaches this label is outside this excerpt.  */
2230 .p2align 4
2231 LABEL(Byte0):
2232 movzx (%rsi), %ecx
2233 movzx (%rdi), %eax
2234
2235 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
2236 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2237 movl (%rdx,%rcx,4), %ecx
2238 movl (%rdx,%rax,4), %eax
2239 #endif
2240
2241 sub %ecx, %eax
2242 ret
2243 END (STRCMP)
2244
/* unaligned_table: read-only table of sixteen 32-bit offsets, each
   relative to the start of the table itself.  Entries 0..14 point at
   ashr_1 .. ashr_15 and the last entry points at ashr_0.  The code that
   indexes the table is outside this excerpt.  */
2245 .section .rodata,"a",@progbits
2246 .p2align 3
2247 LABEL(unaligned_table):
2248 .int LABEL(ashr_1) - LABEL(unaligned_table)
2249 .int LABEL(ashr_2) - LABEL(unaligned_table)
2250 .int LABEL(ashr_3) - LABEL(unaligned_table)
2251 .int LABEL(ashr_4) - LABEL(unaligned_table)
2252 .int LABEL(ashr_5) - LABEL(unaligned_table)
2253 .int LABEL(ashr_6) - LABEL(unaligned_table)
2254 .int LABEL(ashr_7) - LABEL(unaligned_table)
2255 .int LABEL(ashr_8) - LABEL(unaligned_table)
2256 .int LABEL(ashr_9) - LABEL(unaligned_table)
2257 .int LABEL(ashr_10) - LABEL(unaligned_table)
2258 .int LABEL(ashr_11) - LABEL(unaligned_table)
2259 .int LABEL(ashr_12) - LABEL(unaligned_table)
2260 .int LABEL(ashr_13) - LABEL(unaligned_table)
2261 .int LABEL(ashr_14) - LABEL(unaligned_table)
2262 .int LABEL(ashr_15) - LABEL(unaligned_table)
2263 .int LABEL(ashr_0) - LABEL(unaligned_table)
2264 libc_hidden_builtin_def (STRCMP)