]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/strcmp.S
Update copyright notices with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / x86_64 / strcmp.S
1 /* Highly optimized version for x86-64.
2 Copyright (C) 1999-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Based on i686 version contributed by Ulrich Drepper
5 <drepper@cygnus.com>, 1999.
6 Updated with SSE2 support contributed by Intel Corporation.
7
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the GNU C Library; if not, see
20 <http://www.gnu.org/licenses/>. */
21
22 #include <sysdep.h>
23 #include "asm-syntax.h"
24
25 #undef UPDATE_STRNCMP_COUNTER
26
27 #ifndef LABEL
28 #define LABEL(l) L(l)
29 #endif
30
31 #ifdef USE_AS_STRNCMP
32 /* The simplified code below is not set up to handle strncmp() so far.
33 Should this become necessary it has to be implemented. For now
34 just report the problem. */
35 # ifdef NOT_IN_libc
36 # error "strncmp not implemented so far"
37 # endif
38
39 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
40 if the new counter > the old one (the subtraction wrapped) or is 0. */
41 # define UPDATE_STRNCMP_COUNTER \
42 /* compute the number of bytes left to compare */ \
43 lea -16(%rcx, %r11), %r9; /* %r9 = %r11 + %rcx - 16; lea leaves flags untouched */ \
44 cmp %r9, %r11; /* flags from old count (%r11) - new count (%r9) */ \
45 jb LABEL(strcmp_exitz); /* unsigned old < new: the count wrapped below zero */ \
46 test %r9, %r9; \
47 je LABEL(strcmp_exitz); /* new count is exactly 0: nothing left to compare */ \
48 mov %r9, %r11 /* commit the new count; clobbers %r9 and flags */
49
50 #elif defined USE_AS_STRCASECMP_L
51 # include "locale-defines.h"
52
53 /* No support for strcasecmp outside libc so far since it is not needed. */
54 # ifdef NOT_IN_libc
55 # error "strcasecmp_l not implemented so far"
56 # endif
57
58 # define UPDATE_STRNCMP_COUNTER
59 #elif defined USE_AS_STRNCASECMP_L
60 # include "locale-defines.h"
61
62 /* No support for strncasecmp outside libc so far since it is not needed. */
63 # ifdef NOT_IN_libc
64 # error "strncasecmp_l not implemented so far"
65 # endif
66
67 # define UPDATE_STRNCMP_COUNTER \
68 /* compute the number of bytes left to compare */ \
69 lea -16(%rcx, %r11), %r9; /* %r9 = %r11 + %rcx - 16; lea leaves flags untouched */ \
70 cmp %r9, %r11; /* flags from old count (%r11) - new count (%r9) */ \
71 jb LABEL(strcmp_exitz); /* unsigned old < new: the count wrapped below zero */ \
72 test %r9, %r9; \
73 je LABEL(strcmp_exitz); /* new count is exactly 0: nothing left to compare */ \
74 mov %r9, %r11 /* commit the new count; clobbers %r9 and flags */
75 #else
76 # define UPDATE_STRNCMP_COUNTER
77 # ifndef STRCMP
78 # define STRCMP strcmp
79 # endif
80 #endif
81
82 #ifndef USE_SSSE3
83 .text
84 #else
85 .section .text.ssse3,"ax",@progbits
86 #endif
87
88 #ifdef USE_AS_STRCASECMP_L
89 # ifndef ENTRY2
90 # define ENTRY2(name) ENTRY (name)
91 # define END2(name) END (name)
92 # endif
93
94 ENTRY2 (__strcasecmp) /* locale-less entry: load the TLS locale, then run into the code that follows END2 */
95 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */
96 mov %fs:(%rax),%RDX_LP /* load the thread-local __libc_tsd_LOCALE value into the third-argument register */
97
98 // XXX these 5 bytes should be before the function
99 /* 5-byte NOP (no ret: execution continues past END2). */
100 .byte 0x0f,0x1f,0x44,0x00,0x00
101 END2 (__strcasecmp)
102 # ifndef NO_NOLOCALE_ALIAS
103 weak_alias (__strcasecmp, strcasecmp)
104 libc_hidden_def (__strcasecmp)
105 # endif
106 /* FALLTHROUGH to strcasecmp_l. */
107 #elif defined USE_AS_STRNCASECMP_L
108 # ifndef ENTRY2
109 # define ENTRY2(name) ENTRY (name)
110 # define END2(name) END (name)
111 # endif
112
113 ENTRY2 (__strncasecmp) /* locale-less entry: load the TLS locale, then run into the code that follows END2 */
114 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax /* %rax = thread-pointer-relative offset of __libc_tsd_LOCALE, read from the GOT */
115 mov %fs:(%rax),%RCX_LP /* load the thread-local __libc_tsd_LOCALE value into the fourth-argument register */
116
117 // XXX these 5 bytes should be before the function
118 /* 5-byte NOP (no ret: execution continues past END2). */
119 .byte 0x0f,0x1f,0x44,0x00,0x00
120 END2 (__strncasecmp)
121 # ifndef NO_NOLOCALE_ALIAS
122 weak_alias (__strncasecmp, strncasecmp)
123 libc_hidden_def (__strncasecmp)
124 # endif
125 /* FALLTHROUGH to strncasecmp_l. */
126 #endif
127
128 ENTRY (STRCMP)
129 #ifdef NOT_IN_libc
130 /* Simple byte-at-a-time version since we can't use SSE registers in ld.so. Compares the strings at %rdi and %rsi and returns 0, 1 or -1 in %eax; clobbers %ecx and flags. */
131 L(oop): movb (%rdi), %al /* %al = current byte of the first string */
132 cmpb (%rsi), %al /* compare with the current byte of the second string */
133 jne L(neq) /* bytes differ: go work out the sign */
134 incq %rdi /* advance both pointers */
135 incq %rsi
136 testb %al, %al /* the bytes were equal; was it the terminating NUL? */
137 jnz L(oop) /* no: keep scanning */
138
139 xorl %eax, %eax /* both strings ended at the same place: return 0 */
140 ret
141
142 L(neq): movl $1, %eax /* default result: first string is greater */
143 movl $-1, %ecx
144 cmovbl %ecx, %eax /* mov does not touch flags, so CF is still from cmpb: unsigned below -> -1 */
145 ret
146 END (STRCMP)
147 #else /* NOT_IN_libc */
148 # ifdef USE_AS_STRCASECMP_L
149 /* We have to fall back on the C implementation for locales
150 with encodings not matching ASCII for single bytes. */
151 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
152 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
153 # else
154 mov (%rdx), %RAX_LP
155 # endif
156 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
157 jne __strcasecmp_l_nonascii
158 # elif defined USE_AS_STRNCASECMP_L
159 /* We have to fall back on the C implementation for locales
160 with encodings not matching ASCII for single bytes. */
161 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
162 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
163 # else
164 mov (%rcx), %RAX_LP
165 # endif
166 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
167 jne __strncasecmp_l_nonascii
168 # endif
169
170 /*
171 * This implementation uses SSE to compare up to 16 bytes at a time.
172 */
173 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
174 test %rdx, %rdx
175 je LABEL(strcmp_exitz)
176 cmp $1, %rdx
177 je LABEL(Byte0)
178 mov %rdx, %r11
179 # endif
180 mov %esi, %ecx
181 mov %edi, %eax
182 /* Use 64bit AND here to avoid long NOP padding. */
183 and $0x3f, %rcx /* rsi alignment in cache line */
184 and $0x3f, %rax /* rdi alignment in cache line */
185 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
186 .section .rodata.cst16,"aM",@progbits,16
187 .align 16
188 .Lbelowupper:
189 .quad 0x4040404040404040
190 .quad 0x4040404040404040
191 .Ltopupper:
192 .quad 0x5b5b5b5b5b5b5b5b
193 .quad 0x5b5b5b5b5b5b5b5b
194 .Ltouppermask:
195 .quad 0x2020202020202020
196 .quad 0x2020202020202020
197 .previous
198 movdqa .Lbelowupper(%rip), %xmm5
199 # define UCLOW_reg %xmm5
200 movdqa .Ltopupper(%rip), %xmm6
201 # define UCHIGH_reg %xmm6
202 movdqa .Ltouppermask(%rip), %xmm7
203 # define LCQWORD_reg %xmm7
204 # endif
205 cmp $0x30, %ecx
206 ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
207 cmp $0x30, %eax
208 ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
209 movlpd (%rdi), %xmm1
210 movlpd (%rsi), %xmm2
211 movhpd 8(%rdi), %xmm1
212 movhpd 8(%rsi), %xmm2
213 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
214 # define TOLOWER(reg1, reg2) \
215 movdqa reg1, %xmm8; \
216 movdqa UCHIGH_reg, %xmm9; \
217 movdqa reg2, %xmm10; \
218 movdqa UCHIGH_reg, %xmm11; \
219 pcmpgtb UCLOW_reg, %xmm8; \
220 pcmpgtb reg1, %xmm9; \
221 pcmpgtb UCLOW_reg, %xmm10; \
222 pcmpgtb reg2, %xmm11; \
223 pand %xmm9, %xmm8; \
224 pand %xmm11, %xmm10; \
225 pand LCQWORD_reg, %xmm8; \
226 pand LCQWORD_reg, %xmm10; \
227 por %xmm8, reg1; \
228 por %xmm10, reg2
229 TOLOWER (%xmm1, %xmm2)
230 # else
231 # define TOLOWER(reg1, reg2)
232 # endif
233 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
234 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
235 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
236 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
237 pmovmskb %xmm1, %edx
238 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
239 jnz LABEL(less16bytes) /* If not, find different value or null char */
240 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
241 sub $16, %r11
242 jbe LABEL(strcmp_exitz) /* finish comparison */
243 # endif
244 add $16, %rsi /* prepare to search next 16 bytes */
245 add $16, %rdi /* prepare to search next 16 bytes */
246
247 /*
248 * Determine source and destination string offsets from 16-byte alignment.
249 * Use relative offset difference between the two to determine which case
250 * below to use.
251 */
252 .p2align 4
253 LABEL(crosscache):
254 and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
255 and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
256 mov $0xffff, %edx /* for equivalent offset */
257 xor %r8d, %r8d
258 and $0xf, %ecx /* offset of rsi */
259 and $0xf, %eax /* offset of rdi */
260 cmp %eax, %ecx
261 je LABEL(ashr_0) /* rsi and rdi relative offset same */
262 ja LABEL(bigger)
263 mov %edx, %r8d /* r8d is offset flag for exit tail */
264 xchg %ecx, %eax
265 xchg %rsi, %rdi
266 LABEL(bigger):
267 lea 15(%rax), %r9
268 sub %rcx, %r9
269 lea LABEL(unaligned_table)(%rip), %r10
270 movslq (%r10, %r9,4), %r9
271 lea (%r10, %r9), %r10
272 jmp *%r10 /* jump to corresponding case */
273
274 /*
275 * The following cases will be handled by ashr_0
276 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
277 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
278 */
279 .p2align 4
280 LABEL(ashr_0):
281
282 movdqa (%rsi), %xmm1
283 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
284 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
285 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
286 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
287 # else
288 movdqa (%rdi), %xmm2
289 TOLOWER (%xmm1, %xmm2)
290 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
291 # endif
292 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
293 pmovmskb %xmm1, %r9d
294 shr %cl, %edx /* adjust 0xffff for offset */
295 shr %cl, %r9d /* adjust for 16-byte offset */
296 sub %r9d, %edx
297 /*
298 * edx equals r9d only if the last (16-rcx) bytes of the two 16-byte
299 * blocks are identical and contain no null char.
300 */
301 jne LABEL(less32bytes) /* mismatch or null char */
302 UPDATE_STRNCMP_COUNTER
303 mov $16, %rcx
304 mov $16, %r9
305 pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
306
307 /*
308 * Now both strings are aligned at 16-byte boundary. Loop over strings
309 * checking 32-bytes per iteration.
310 */
311 .p2align 4
312 LABEL(loop_ashr_0):
313 movdqa (%rsi, %rcx), %xmm1
314 movdqa (%rdi, %rcx), %xmm2
315 TOLOWER (%xmm1, %xmm2)
316
317 pcmpeqb %xmm1, %xmm0
318 pcmpeqb %xmm2, %xmm1
319 psubb %xmm0, %xmm1
320 pmovmskb %xmm1, %edx
321 sub $0xffff, %edx
322 jnz LABEL(exit) /* mismatch or null char seen */
323
324 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
325 sub $16, %r11
326 jbe LABEL(strcmp_exitz)
327 # endif
328 add $16, %rcx
329 movdqa (%rsi, %rcx), %xmm1
330 movdqa (%rdi, %rcx), %xmm2
331 TOLOWER (%xmm1, %xmm2)
332
333 pcmpeqb %xmm1, %xmm0
334 pcmpeqb %xmm2, %xmm1
335 psubb %xmm0, %xmm1
336 pmovmskb %xmm1, %edx
337 sub $0xffff, %edx
338 jnz LABEL(exit)
339 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
340 sub $16, %r11
341 jbe LABEL(strcmp_exitz)
342 # endif
343 add $16, %rcx
344 jmp LABEL(loop_ashr_0)
345
346 /*
347 * The following cases will be handled by ashr_1
348 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
349 * n(15) n -15 0(15 +(n-15) - n) ashr_1
350 */
351 .p2align 4
352 LABEL(ashr_1):
353 pxor %xmm0, %xmm0
354 movdqa (%rdi), %xmm2
355 movdqa (%rsi), %xmm1
356 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
357 pslldq $15, %xmm2 /* shift first string to align with second */
358 TOLOWER (%xmm1, %xmm2)
359 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
360 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
361 pmovmskb %xmm2, %r9d
362 shr %cl, %edx /* adjust 0xffff for offset */
363 shr %cl, %r9d /* adjust for 16-byte offset */
364 sub %r9d, %edx
365 jnz LABEL(less32bytes) /* mismatch or null char seen */
366 movdqa (%rdi), %xmm3
367 UPDATE_STRNCMP_COUNTER
368
369 pxor %xmm0, %xmm0
370 mov $16, %rcx /* index for loads*/
371 mov $1, %r9d /* byte position left over from less32bytes case */
372 /*
373 * Set up %r10 so that we can detect crossing a page boundary.
374 * When %r10 goes positive we have crossed a page boundary and
375 * need to do a nibble.
376 */
377 lea 1(%rdi), %r10
378 and $0xfff, %r10 /* offset into 4K page */
379 sub $0x1000, %r10 /* subtract 4K pagesize */
380
381 .p2align 4
382 LABEL(loop_ashr_1):
383 add $16, %r10
384 jg LABEL(nibble_ashr_1) /* cross page boundary */
385
386 LABEL(gobble_ashr_1):
387 movdqa (%rsi, %rcx), %xmm1
388 movdqa (%rdi, %rcx), %xmm2
389 movdqa %xmm2, %xmm4 /* store for next cycle */
390
391 # ifndef USE_SSSE3
392 psrldq $1, %xmm3
393 pslldq $15, %xmm2
394 por %xmm3, %xmm2 /* merge into one 16byte value */
395 # else
396 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
397 # endif
398 TOLOWER (%xmm1, %xmm2)
399
400 pcmpeqb %xmm1, %xmm0
401 pcmpeqb %xmm2, %xmm1
402 psubb %xmm0, %xmm1
403 pmovmskb %xmm1, %edx
404 sub $0xffff, %edx
405 jnz LABEL(exit)
406
407 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
408 sub $16, %r11
409 jbe LABEL(strcmp_exitz)
410 # endif
411 add $16, %rcx
412 movdqa %xmm4, %xmm3
413
414 add $16, %r10
415 jg LABEL(nibble_ashr_1) /* cross page boundary */
416
417 movdqa (%rsi, %rcx), %xmm1
418 movdqa (%rdi, %rcx), %xmm2
419 movdqa %xmm2, %xmm4 /* store for next cycle */
420
421 # ifndef USE_SSSE3
422 psrldq $1, %xmm3
423 pslldq $15, %xmm2
424 por %xmm3, %xmm2 /* merge into one 16byte value */
425 # else
426 palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
427 # endif
428 TOLOWER (%xmm1, %xmm2)
429
430 pcmpeqb %xmm1, %xmm0
431 pcmpeqb %xmm2, %xmm1
432 psubb %xmm0, %xmm1
433 pmovmskb %xmm1, %edx
434 sub $0xffff, %edx
435 jnz LABEL(exit)
436
437 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
438 sub $16, %r11
439 jbe LABEL(strcmp_exitz)
440 # endif
441 add $16, %rcx
442 movdqa %xmm4, %xmm3
443 jmp LABEL(loop_ashr_1)
444
445 /*
446 * Nibble avoids loads across page boundary. This is to avoid a potential
447 * access into unmapped memory.
448 */
449 .p2align 4
450 LABEL(nibble_ashr_1):
451 pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
452 pmovmskb %xmm0, %edx
453 test $0xfffe, %edx
454 jnz LABEL(ashr_1_exittail) /* find null char*/
455
456 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
457 cmp $15, %r11
458 jbe LABEL(ashr_1_exittail)
459 # endif
460
461 pxor %xmm0, %xmm0
462 sub $0x1000, %r10 /* subtract 4K from %r10 */
463 jmp LABEL(gobble_ashr_1)
464
465 /*
466 * Once a null char is found, determine if there is a string mismatch
467 * before the null char.
468 */
469 .p2align 4
470 LABEL(ashr_1_exittail):
471 movdqa (%rsi, %rcx), %xmm1
472 psrldq $1, %xmm0
473 psrldq $1, %xmm3
474 jmp LABEL(aftertail)
475
476 /*
477 * The following cases will be handled by ashr_2
478 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
479 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
480 */
481 .p2align 4
482 LABEL(ashr_2):
483 pxor %xmm0, %xmm0
484 movdqa (%rdi), %xmm2
485 movdqa (%rsi), %xmm1
486 pcmpeqb %xmm1, %xmm0
487 pslldq $14, %xmm2
488 TOLOWER (%xmm1, %xmm2)
489 pcmpeqb %xmm1, %xmm2
490 psubb %xmm0, %xmm2
491 pmovmskb %xmm2, %r9d
492 shr %cl, %edx
493 shr %cl, %r9d
494 sub %r9d, %edx
495 jnz LABEL(less32bytes)
496 movdqa (%rdi), %xmm3
497 UPDATE_STRNCMP_COUNTER
498
499 pxor %xmm0, %xmm0
500 mov $16, %rcx /* index for loads */
501 mov $2, %r9d /* byte position left over from less32bytes case */
502 /*
503 * Set up %r10 so that we can detect crossing a page boundary.
504 * When %r10 goes positive we have crossed a page boundary and
505 * need to do a nibble.
506 */
507 lea 2(%rdi), %r10
508 and $0xfff, %r10 /* offset into 4K page */
509 sub $0x1000, %r10 /* subtract 4K pagesize */
510
511 .p2align 4
512 LABEL(loop_ashr_2):
513 add $16, %r10
514 jg LABEL(nibble_ashr_2)
515
516 LABEL(gobble_ashr_2):
517 movdqa (%rsi, %rcx), %xmm1
518 movdqa (%rdi, %rcx), %xmm2
519 movdqa %xmm2, %xmm4
520
521 # ifndef USE_SSSE3
522 psrldq $2, %xmm3
523 pslldq $14, %xmm2
524 por %xmm3, %xmm2 /* merge into one 16byte value */
525 # else
526 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
527 # endif
528 TOLOWER (%xmm1, %xmm2)
529
530 pcmpeqb %xmm1, %xmm0
531 pcmpeqb %xmm2, %xmm1
532 psubb %xmm0, %xmm1
533 pmovmskb %xmm1, %edx
534 sub $0xffff, %edx
535 jnz LABEL(exit)
536
537 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
538 sub $16, %r11
539 jbe LABEL(strcmp_exitz)
540 # endif
541
542 add $16, %rcx
543 movdqa %xmm4, %xmm3
544
545 add $16, %r10
546 jg LABEL(nibble_ashr_2) /* cross page boundary */
547
548 movdqa (%rsi, %rcx), %xmm1
549 movdqa (%rdi, %rcx), %xmm2
550 movdqa %xmm2, %xmm4
551
552 # ifndef USE_SSSE3
553 psrldq $2, %xmm3
554 pslldq $14, %xmm2
555 por %xmm3, %xmm2 /* merge into one 16byte value */
556 # else
557 palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
558 # endif
559 TOLOWER (%xmm1, %xmm2)
560
561 pcmpeqb %xmm1, %xmm0
562 pcmpeqb %xmm2, %xmm1
563 psubb %xmm0, %xmm1
564 pmovmskb %xmm1, %edx
565 sub $0xffff, %edx
566 jnz LABEL(exit)
567
568 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
569 sub $16, %r11
570 jbe LABEL(strcmp_exitz)
571 # endif
572
573 add $16, %rcx
574 movdqa %xmm4, %xmm3
575 jmp LABEL(loop_ashr_2)
576
577 .p2align 4
578 LABEL(nibble_ashr_2):
579 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
580 pmovmskb %xmm0, %edx
581 test $0xfffc, %edx
582 jnz LABEL(ashr_2_exittail)
583
584 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
585 cmp $14, %r11
586 jbe LABEL(ashr_2_exittail)
587 # endif
588
589 pxor %xmm0, %xmm0
590 sub $0x1000, %r10
591 jmp LABEL(gobble_ashr_2)
592
593 .p2align 4
594 LABEL(ashr_2_exittail):
595 movdqa (%rsi, %rcx), %xmm1
596 psrldq $2, %xmm0
597 psrldq $2, %xmm3
598 jmp LABEL(aftertail)
599
600 /*
601 * The following cases will be handled by ashr_3
602 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
603 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
604 */
605 .p2align 4
606 LABEL(ashr_3):
607 pxor %xmm0, %xmm0
608 movdqa (%rdi), %xmm2
609 movdqa (%rsi), %xmm1
610 pcmpeqb %xmm1, %xmm0
611 pslldq $13, %xmm2
612 TOLOWER (%xmm1, %xmm2)
613 pcmpeqb %xmm1, %xmm2
614 psubb %xmm0, %xmm2
615 pmovmskb %xmm2, %r9d
616 shr %cl, %edx
617 shr %cl, %r9d
618 sub %r9d, %edx
619 jnz LABEL(less32bytes)
620 movdqa (%rdi), %xmm3
621
622 UPDATE_STRNCMP_COUNTER
623
624 pxor %xmm0, %xmm0
625 mov $16, %rcx /* index for loads */
626 mov $3, %r9d /* byte position left over from less32bytes case */
627 /*
628 * Set up %r10 so that we can detect crossing a page boundary.
629 * When %r10 goes positive we have crossed a page boundary and
630 * need to do a nibble.
631 */
632 lea 3(%rdi), %r10
633 and $0xfff, %r10 /* offset into 4K page */
634 sub $0x1000, %r10 /* subtract 4K pagesize */
635
636 .p2align 4
637 LABEL(loop_ashr_3):
638 add $16, %r10
639 jg LABEL(nibble_ashr_3)
640
641 LABEL(gobble_ashr_3):
642 movdqa (%rsi, %rcx), %xmm1
643 movdqa (%rdi, %rcx), %xmm2
644 movdqa %xmm2, %xmm4
645
646 # ifndef USE_SSSE3
647 psrldq $3, %xmm3
648 pslldq $13, %xmm2
649 por %xmm3, %xmm2 /* merge into one 16byte value */
650 # else
651 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
652 # endif
653 TOLOWER (%xmm1, %xmm2)
654
655 pcmpeqb %xmm1, %xmm0
656 pcmpeqb %xmm2, %xmm1
657 psubb %xmm0, %xmm1
658 pmovmskb %xmm1, %edx
659 sub $0xffff, %edx
660 jnz LABEL(exit)
661
662 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
663 sub $16, %r11
664 jbe LABEL(strcmp_exitz)
665 # endif
666
667 add $16, %rcx
668 movdqa %xmm4, %xmm3
669
670 add $16, %r10
671 jg LABEL(nibble_ashr_3) /* cross page boundary */
672
673 movdqa (%rsi, %rcx), %xmm1
674 movdqa (%rdi, %rcx), %xmm2
675 movdqa %xmm2, %xmm4
676
677 # ifndef USE_SSSE3
678 psrldq $3, %xmm3
679 pslldq $13, %xmm2
680 por %xmm3, %xmm2 /* merge into one 16byte value */
681 # else
682 palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
683 # endif
684 TOLOWER (%xmm1, %xmm2)
685
686 pcmpeqb %xmm1, %xmm0
687 pcmpeqb %xmm2, %xmm1
688 psubb %xmm0, %xmm1
689 pmovmskb %xmm1, %edx
690 sub $0xffff, %edx
691 jnz LABEL(exit)
692
693 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
694 sub $16, %r11
695 jbe LABEL(strcmp_exitz)
696 # endif
697
698 add $16, %rcx
699 movdqa %xmm4, %xmm3
700 jmp LABEL(loop_ashr_3)
701
702 .p2align 4
703 LABEL(nibble_ashr_3):
704 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
705 pmovmskb %xmm0, %edx
706 test $0xfff8, %edx
707 jnz LABEL(ashr_3_exittail)
708
709 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
710 cmp $13, %r11
711 jbe LABEL(ashr_3_exittail)
712 # endif
713
714 pxor %xmm0, %xmm0
715 sub $0x1000, %r10
716 jmp LABEL(gobble_ashr_3)
717
718 .p2align 4
719 LABEL(ashr_3_exittail):
720 movdqa (%rsi, %rcx), %xmm1
721 psrldq $3, %xmm0
722 psrldq $3, %xmm3
723 jmp LABEL(aftertail)
724
725 /*
726 * The following cases will be handled by ashr_4
727 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
728 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
729 */
730 .p2align 4
731 LABEL(ashr_4):
732 pxor %xmm0, %xmm0
733 movdqa (%rdi), %xmm2
734 movdqa (%rsi), %xmm1
735 pcmpeqb %xmm1, %xmm0
736 pslldq $12, %xmm2
737 TOLOWER (%xmm1, %xmm2)
738 pcmpeqb %xmm1, %xmm2
739 psubb %xmm0, %xmm2
740 pmovmskb %xmm2, %r9d
741 shr %cl, %edx
742 shr %cl, %r9d
743 sub %r9d, %edx
744 jnz LABEL(less32bytes)
745 movdqa (%rdi), %xmm3
746
747 UPDATE_STRNCMP_COUNTER
748
749 pxor %xmm0, %xmm0
750 mov $16, %rcx /* index for loads */
751 mov $4, %r9d /* byte position left over from less32bytes case */
752 /*
753 * Set up %r10 so that we can detect crossing a page boundary.
754 * When %r10 goes positive we have crossed a page boundary and
755 * need to do a nibble.
756 */
757 lea 4(%rdi), %r10
758 and $0xfff, %r10 /* offset into 4K page */
759 sub $0x1000, %r10 /* subtract 4K pagesize */
760
761 .p2align 4
762 LABEL(loop_ashr_4):
763 add $16, %r10
764 jg LABEL(nibble_ashr_4)
765
766 LABEL(gobble_ashr_4):
767 movdqa (%rsi, %rcx), %xmm1
768 movdqa (%rdi, %rcx), %xmm2
769 movdqa %xmm2, %xmm4
770
771 # ifndef USE_SSSE3
772 psrldq $4, %xmm3
773 pslldq $12, %xmm2
774 por %xmm3, %xmm2 /* merge into one 16byte value */
775 # else
776 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
777 # endif
778 TOLOWER (%xmm1, %xmm2)
779
780 pcmpeqb %xmm1, %xmm0
781 pcmpeqb %xmm2, %xmm1
782 psubb %xmm0, %xmm1
783 pmovmskb %xmm1, %edx
784 sub $0xffff, %edx
785 jnz LABEL(exit)
786
787 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
788 sub $16, %r11
789 jbe LABEL(strcmp_exitz)
790 # endif
791
792 add $16, %rcx
793 movdqa %xmm4, %xmm3
794
795 add $16, %r10
796 jg LABEL(nibble_ashr_4) /* cross page boundary */
797
798 movdqa (%rsi, %rcx), %xmm1
799 movdqa (%rdi, %rcx), %xmm2
800 movdqa %xmm2, %xmm4
801
802 # ifndef USE_SSSE3
803 psrldq $4, %xmm3
804 pslldq $12, %xmm2
805 por %xmm3, %xmm2 /* merge into one 16byte value */
806 # else
807 palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
808 # endif
809 TOLOWER (%xmm1, %xmm2)
810
811 pcmpeqb %xmm1, %xmm0
812 pcmpeqb %xmm2, %xmm1
813 psubb %xmm0, %xmm1
814 pmovmskb %xmm1, %edx
815 sub $0xffff, %edx
816 jnz LABEL(exit)
817
818 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
819 sub $16, %r11
820 jbe LABEL(strcmp_exitz)
821 # endif
822
823 add $16, %rcx
824 movdqa %xmm4, %xmm3
825 jmp LABEL(loop_ashr_4)
826
827 .p2align 4
828 LABEL(nibble_ashr_4):
829 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
830 pmovmskb %xmm0, %edx
831 test $0xfff0, %edx
832 jnz LABEL(ashr_4_exittail)
833
834 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
835 cmp $12, %r11
836 jbe LABEL(ashr_4_exittail)
837 # endif
838
839 pxor %xmm0, %xmm0
840 sub $0x1000, %r10
841 jmp LABEL(gobble_ashr_4)
842
843 .p2align 4
844 LABEL(ashr_4_exittail):
845 movdqa (%rsi, %rcx), %xmm1
846 psrldq $4, %xmm0
847 psrldq $4, %xmm3
848 jmp LABEL(aftertail)
849
850 /*
851 * The following cases will be handled by ashr_5
852 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
853 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
854 */
855 .p2align 4
856 LABEL(ashr_5):
857 pxor %xmm0, %xmm0
858 movdqa (%rdi), %xmm2
859 movdqa (%rsi), %xmm1
860 pcmpeqb %xmm1, %xmm0
861 pslldq $11, %xmm2
862 TOLOWER (%xmm1, %xmm2)
863 pcmpeqb %xmm1, %xmm2
864 psubb %xmm0, %xmm2
865 pmovmskb %xmm2, %r9d
866 shr %cl, %edx
867 shr %cl, %r9d
868 sub %r9d, %edx
869 jnz LABEL(less32bytes)
870 movdqa (%rdi), %xmm3
871
872 UPDATE_STRNCMP_COUNTER
873
874 pxor %xmm0, %xmm0
875 mov $16, %rcx /* index for loads */
876 mov $5, %r9d /* byte position left over from less32bytes case */
877 /*
878 * Set up %r10 so that we can detect crossing a page boundary.
879 * When %r10 goes positive we have crossed a page boundary and
880 * need to do a nibble.
881 */
882 lea 5(%rdi), %r10
883 and $0xfff, %r10 /* offset into 4K page */
884 sub $0x1000, %r10 /* subtract 4K pagesize */
885
886 .p2align 4
887 LABEL(loop_ashr_5):
888 add $16, %r10
889 jg LABEL(nibble_ashr_5)
890
891 LABEL(gobble_ashr_5):
892 movdqa (%rsi, %rcx), %xmm1
893 movdqa (%rdi, %rcx), %xmm2
894 movdqa %xmm2, %xmm4
895
896 # ifndef USE_SSSE3
897 psrldq $5, %xmm3
898 pslldq $11, %xmm2
899 por %xmm3, %xmm2 /* merge into one 16byte value */
900 # else
901 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
902 # endif
903 TOLOWER (%xmm1, %xmm2)
904
905 pcmpeqb %xmm1, %xmm0
906 pcmpeqb %xmm2, %xmm1
907 psubb %xmm0, %xmm1
908 pmovmskb %xmm1, %edx
909 sub $0xffff, %edx
910 jnz LABEL(exit)
911
912 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
913 sub $16, %r11
914 jbe LABEL(strcmp_exitz)
915 # endif
916
917 add $16, %rcx
918 movdqa %xmm4, %xmm3
919
920 add $16, %r10
921 jg LABEL(nibble_ashr_5) /* cross page boundary */
922
923 movdqa (%rsi, %rcx), %xmm1
924 movdqa (%rdi, %rcx), %xmm2
925 movdqa %xmm2, %xmm4
926
927 # ifndef USE_SSSE3
928 psrldq $5, %xmm3
929 pslldq $11, %xmm2
930 por %xmm3, %xmm2 /* merge into one 16byte value */
931 # else
932 palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
933 # endif
934 TOLOWER (%xmm1, %xmm2)
935
936 pcmpeqb %xmm1, %xmm0
937 pcmpeqb %xmm2, %xmm1
938 psubb %xmm0, %xmm1
939 pmovmskb %xmm1, %edx
940 sub $0xffff, %edx
941 jnz LABEL(exit)
942
943 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
944 sub $16, %r11
945 jbe LABEL(strcmp_exitz)
946 # endif
947
948 add $16, %rcx
949 movdqa %xmm4, %xmm3
950 jmp LABEL(loop_ashr_5)
951
952 .p2align 4
953 LABEL(nibble_ashr_5):
954 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
955 pmovmskb %xmm0, %edx
956 test $0xffe0, %edx
957 jnz LABEL(ashr_5_exittail)
958
959 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
960 cmp $11, %r11
961 jbe LABEL(ashr_5_exittail)
962 # endif
963
964 pxor %xmm0, %xmm0
965 sub $0x1000, %r10
966 jmp LABEL(gobble_ashr_5)
967
968 .p2align 4
969 LABEL(ashr_5_exittail):
970 movdqa (%rsi, %rcx), %xmm1
971 psrldq $5, %xmm0
972 psrldq $5, %xmm3
973 jmp LABEL(aftertail)
974
975 /*
976 * The following cases will be handled by ashr_6
977 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
978 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
979 */
980 .p2align 4
981 LABEL(ashr_6):
982 pxor %xmm0, %xmm0
983 movdqa (%rdi), %xmm2
984 movdqa (%rsi), %xmm1
985 pcmpeqb %xmm1, %xmm0
986 pslldq $10, %xmm2
987 TOLOWER (%xmm1, %xmm2)
988 pcmpeqb %xmm1, %xmm2
989 psubb %xmm0, %xmm2
990 pmovmskb %xmm2, %r9d
991 shr %cl, %edx
992 shr %cl, %r9d
993 sub %r9d, %edx
994 jnz LABEL(less32bytes)
995 movdqa (%rdi), %xmm3
996
997 UPDATE_STRNCMP_COUNTER
998
999 pxor %xmm0, %xmm0
1000 mov $16, %rcx /* index for loads */
1001 mov $6, %r9d /* byte position left over from less32bytes case */
1002 /*
1003 * Set up %r10 so that we can detect crossing a page boundary.
1004 * When %r10 goes positive we have crossed a page boundary and
1005 * need to do a nibble.
1006 */
1007 lea 6(%rdi), %r10
1008 and $0xfff, %r10 /* offset into 4K page */
1009 sub $0x1000, %r10 /* subtract 4K pagesize */
1010
1011 .p2align 4
1012 LABEL(loop_ashr_6):
1013 add $16, %r10
1014 jg LABEL(nibble_ashr_6)
1015
1016 LABEL(gobble_ashr_6):
1017 movdqa (%rsi, %rcx), %xmm1
1018 movdqa (%rdi, %rcx), %xmm2
1019 movdqa %xmm2, %xmm4
1020
1021 # ifndef USE_SSSE3
1022 psrldq $6, %xmm3
1023 pslldq $10, %xmm2
1024 por %xmm3, %xmm2 /* merge into one 16byte value */
1025 # else
1026 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
1027 # endif
1028 TOLOWER (%xmm1, %xmm2)
1029
1030 pcmpeqb %xmm1, %xmm0
1031 pcmpeqb %xmm2, %xmm1
1032 psubb %xmm0, %xmm1
1033 pmovmskb %xmm1, %edx
1034 sub $0xffff, %edx
1035 jnz LABEL(exit)
1036
1037 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1038 sub $16, %r11
1039 jbe LABEL(strcmp_exitz)
1040 # endif
1041
1042 add $16, %rcx
1043 movdqa %xmm4, %xmm3
1044
1045 add $16, %r10
1046 jg LABEL(nibble_ashr_6) /* cross page boundary */
1047
1048 movdqa (%rsi, %rcx), %xmm1
1049 movdqa (%rdi, %rcx), %xmm2
1050 movdqa %xmm2, %xmm4
1051
1052 # ifndef USE_SSSE3
1053 psrldq $6, %xmm3
1054 pslldq $10, %xmm2
1055 por %xmm3, %xmm2 /* merge into one 16byte value */
1056 # else
1057 palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
1058 # endif
1059 TOLOWER (%xmm1, %xmm2)
1060
1061 pcmpeqb %xmm1, %xmm0
1062 pcmpeqb %xmm2, %xmm1
1063 psubb %xmm0, %xmm1
1064 pmovmskb %xmm1, %edx
1065 sub $0xffff, %edx
1066 jnz LABEL(exit)
1067
1068 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1069 sub $16, %r11
1070 jbe LABEL(strcmp_exitz)
1071 # endif
1072
1073 add $16, %rcx
1074 movdqa %xmm4, %xmm3
1075 jmp LABEL(loop_ashr_6)
1076
1077 .p2align 4
1078 LABEL(nibble_ashr_6):
1079 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1080 pmovmskb %xmm0, %edx
1081 test $0xffc0, %edx
1082 jnz LABEL(ashr_6_exittail)
1083
1084 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1085 cmp $10, %r11
1086 jbe LABEL(ashr_6_exittail)
1087 # endif
1088
1089 pxor %xmm0, %xmm0
1090 sub $0x1000, %r10
1091 jmp LABEL(gobble_ashr_6)
1092
1093 .p2align 4
1094 LABEL(ashr_6_exittail):
1095 movdqa (%rsi, %rcx), %xmm1
1096 psrldq $6, %xmm0
1097 psrldq $6, %xmm3
1098 jmp LABEL(aftertail)
1099
1100 /*
1101 * The following cases will be handled by ashr_7
1102 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1103 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
1104 */
1105 .p2align 4
1106 LABEL(ashr_7):
1107 pxor %xmm0, %xmm0
1108 movdqa (%rdi), %xmm2
1109 movdqa (%rsi), %xmm1
1110 pcmpeqb %xmm1, %xmm0
1111 pslldq $9, %xmm2
1112 TOLOWER (%xmm1, %xmm2)
1113 pcmpeqb %xmm1, %xmm2
1114 psubb %xmm0, %xmm2
1115 pmovmskb %xmm2, %r9d
1116 shr %cl, %edx
1117 shr %cl, %r9d
1118 sub %r9d, %edx
1119 jnz LABEL(less32bytes)
1120 movdqa (%rdi), %xmm3
1121
1122 UPDATE_STRNCMP_COUNTER
1123
1124 pxor %xmm0, %xmm0
1125 mov $16, %rcx /* index for loads */
1126 mov $7, %r9d /* byte position left over from less32bytes case */
1127 /*
1128 * Set up %r10 so that we can detect crossing a page boundary.
1129 * When %r10 goes positive we have crossed a page boundary and
1130 * need to do a nibble.
1131 */
1132 lea 7(%rdi), %r10
1133 and $0xfff, %r10 /* offset into 4K page */
1134 sub $0x1000, %r10 /* subtract 4K pagesize */
1135
1136 .p2align 4
1137 LABEL(loop_ashr_7):
1138 add $16, %r10
1139 jg LABEL(nibble_ashr_7)
1140
1141 LABEL(gobble_ashr_7):
1142 movdqa (%rsi, %rcx), %xmm1
1143 movdqa (%rdi, %rcx), %xmm2
1144 movdqa %xmm2, %xmm4
1145
1146 # ifndef USE_SSSE3
1147 psrldq $7, %xmm3
1148 pslldq $9, %xmm2
1149 por %xmm3, %xmm2 /* merge into one 16byte value */
1150 # else
1151 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
1152 # endif
1153 TOLOWER (%xmm1, %xmm2)
1154
1155 pcmpeqb %xmm1, %xmm0
1156 pcmpeqb %xmm2, %xmm1
1157 psubb %xmm0, %xmm1
1158 pmovmskb %xmm1, %edx
1159 sub $0xffff, %edx
1160 jnz LABEL(exit)
1161
1162 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1163 sub $16, %r11
1164 jbe LABEL(strcmp_exitz)
1165 # endif
1166
1167 add $16, %rcx
1168 movdqa %xmm4, %xmm3
1169
1170 add $16, %r10
1171 jg LABEL(nibble_ashr_7) /* cross page boundary */
1172
1173 movdqa (%rsi, %rcx), %xmm1
1174 movdqa (%rdi, %rcx), %xmm2
1175 movdqa %xmm2, %xmm4
1176
1177 # ifndef USE_SSSE3
1178 psrldq $7, %xmm3
1179 pslldq $9, %xmm2
1180 por %xmm3, %xmm2 /* merge into one 16byte value */
1181 # else
1182 palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
1183 # endif
1184 TOLOWER (%xmm1, %xmm2)
1185
1186 pcmpeqb %xmm1, %xmm0
1187 pcmpeqb %xmm2, %xmm1
1188 psubb %xmm0, %xmm1
1189 pmovmskb %xmm1, %edx
1190 sub $0xffff, %edx
1191 jnz LABEL(exit)
1192
1193 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1194 sub $16, %r11
1195 jbe LABEL(strcmp_exitz)
1196 # endif
1197
1198 add $16, %rcx
1199 movdqa %xmm4, %xmm3
1200 jmp LABEL(loop_ashr_7)
1201
1202 .p2align 4
1203 LABEL(nibble_ashr_7):
1204 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1205 pmovmskb %xmm0, %edx
1206 test $0xff80, %edx
1207 jnz LABEL(ashr_7_exittail)
1208
1209 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1210 cmp $9, %r11
1211 jbe LABEL(ashr_7_exittail)
1212 # endif
1213
1214 pxor %xmm0, %xmm0
1215 sub $0x1000, %r10
1216 jmp LABEL(gobble_ashr_7)
1217
1218 .p2align 4
1219 LABEL(ashr_7_exittail):
1220 movdqa (%rsi, %rcx), %xmm1
1221 psrldq $7, %xmm0
1222 psrldq $7, %xmm3
1223 jmp LABEL(aftertail)
1224
1225 /*
1226 * The following cases will be handled by ashr_8
1227 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1228 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1229 */
1230 .p2align 4
1231 LABEL(ashr_8):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 8 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1232 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1233 movdqa (%rdi), %xmm2
1234 movdqa (%rsi), %xmm1
1235 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1236 pslldq $8, %xmm2 /* move the (%rdi) block up by 8 (= 16 - 8) bytes */
1237 TOLOWER (%xmm1, %xmm2)
1238 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1239 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1240 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1241 shr %cl, %edx
1242 shr %cl, %r9d
1243 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1244 jnz LABEL(less32bytes)
1245 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1246
1247 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1248
1249 pxor %xmm0, %xmm0
1250 mov $16, %rcx /* index for loads */
1251 mov $8, %r9d /* byte position left over from less32bytes case */
1252 /*
1253 * Setup %r10 value allows us to detect crossing a page boundary.
1254 * When %r10 goes positive we have crossed a page boundary and
1255 * need to do a nibble.
1256 */
1257 lea 8(%rdi), %r10
1258 and $0xfff, %r10 /* offset into 4K page */
1259 sub $0x1000, %r10 /* subtract 4K pagesize */
1260
1261 .p2align 4
1262 LABEL(loop_ashr_8):
1263 add $16, %r10
1264 jg LABEL(nibble_ashr_8) /* %r10 went positive: take the page-boundary path */
1265
1266 LABEL(gobble_ashr_8):
1267 movdqa (%rsi, %rcx), %xmm1
1268 movdqa (%rdi, %rcx), %xmm2
1269 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1270
1271 # ifndef USE_SSSE3
1272 psrldq $8, %xmm3
1273 pslldq $8, %xmm2
1274 por %xmm3, %xmm2 /* merge into one 16byte value */
1275 # else
1276 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1277 # endif
1278 TOLOWER (%xmm1, %xmm2)
1279
1280 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1281 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1282 psubb %xmm0, %xmm1
1283 pmovmskb %xmm1, %edx
1284 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1285 jnz LABEL(exit)
1286
1287 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1288 sub $16, %r11
1289 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1290 # endif
1291
1292 add $16, %rcx
1293 movdqa %xmm4, %xmm3
1294
1295 add $16, %r10
1296 jg LABEL(nibble_ashr_8) /* cross page boundary */
1297
1298 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1299 movdqa (%rdi, %rcx), %xmm2
1300 movdqa %xmm2, %xmm4
1301
1302 # ifndef USE_SSSE3
1303 psrldq $8, %xmm3
1304 pslldq $8, %xmm2
1305 por %xmm3, %xmm2 /* merge into one 16byte value */
1306 # else
1307 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1308 # endif
1309 TOLOWER (%xmm1, %xmm2)
1310
1311 pcmpeqb %xmm1, %xmm0
1312 pcmpeqb %xmm2, %xmm1
1313 psubb %xmm0, %xmm1
1314 pmovmskb %xmm1, %edx
1315 sub $0xffff, %edx
1316 jnz LABEL(exit)
1317
1318 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1319 sub $16, %r11
1320 jbe LABEL(strcmp_exitz)
1321 # endif
1322
1323 add $16, %rcx
1324 movdqa %xmm4, %xmm3
1325 jmp LABEL(loop_ashr_8)
1326
1327 .p2align 4
1328 LABEL(nibble_ashr_8):
1329 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1330 pmovmskb %xmm0, %edx
1331 test $0xff00, %edx /* only lanes 8..15 of %xmm3 matter: they feed the next merge */
1332 jnz LABEL(ashr_8_exittail)
1333
1334 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1335 cmp $8, %r11 /* at most 8 bytes left to compare: finish in the tail */
1336 jbe LABEL(ashr_8_exittail)
1337 # endif
1338
1339 pxor %xmm0, %xmm0
1340 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1341 jmp LABEL(gobble_ashr_8)
1342
1343 .p2align 4
1344 LABEL(ashr_8_exittail):
1345 movdqa (%rsi, %rcx), %xmm1
1346 psrldq $8, %xmm0 /* keep the NUL mask aligned with the shifted data */
1347 psrldq $8, %xmm3 /* bring lanes 8..15 of the saved block down to lane 0 */
1348 jmp LABEL(aftertail)
1349
1350 /*
1351 * The following cases will be handled by ashr_9
1352 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1353 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1354 */
1355 .p2align 4
1356 LABEL(ashr_9):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 7 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1357 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1358 movdqa (%rdi), %xmm2
1359 movdqa (%rsi), %xmm1
1360 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1361 pslldq $7, %xmm2 /* move the (%rdi) block up by 7 (= 16 - 9) bytes */
1362 TOLOWER (%xmm1, %xmm2)
1363 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1364 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1365 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1366 shr %cl, %edx
1367 shr %cl, %r9d
1368 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1369 jnz LABEL(less32bytes)
1370 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1371
1372 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1373
1374 pxor %xmm0, %xmm0
1375 mov $16, %rcx /* index for loads */
1376 mov $9, %r9d /* byte position left over from less32bytes case */
1377 /*
1378 * Setup %r10 value allows us to detect crossing a page boundary.
1379 * When %r10 goes positive we have crossed a page boundary and
1380 * need to do a nibble.
1381 */
1382 lea 9(%rdi), %r10
1383 and $0xfff, %r10 /* offset into 4K page */
1384 sub $0x1000, %r10 /* subtract 4K pagesize */
1385
1386 .p2align 4
1387 LABEL(loop_ashr_9):
1388 add $16, %r10
1389 jg LABEL(nibble_ashr_9) /* %r10 went positive: take the page-boundary path */
1390
1391 LABEL(gobble_ashr_9):
1392 movdqa (%rsi, %rcx), %xmm1
1393 movdqa (%rdi, %rcx), %xmm2
1394 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1395
1396 # ifndef USE_SSSE3
1397 psrldq $9, %xmm3
1398 pslldq $7, %xmm2
1399 por %xmm3, %xmm2 /* merge into one 16byte value */
1400 # else
1401 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1402 # endif
1403 TOLOWER (%xmm1, %xmm2)
1404
1405 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1406 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1407 psubb %xmm0, %xmm1
1408 pmovmskb %xmm1, %edx
1409 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1410 jnz LABEL(exit)
1411
1412 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1413 sub $16, %r11
1414 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1415 # endif
1416
1417 add $16, %rcx
1418 movdqa %xmm4, %xmm3
1419
1420 add $16, %r10
1421 jg LABEL(nibble_ashr_9) /* cross page boundary */
1422
1423 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1424 movdqa (%rdi, %rcx), %xmm2
1425 movdqa %xmm2, %xmm4
1426
1427 # ifndef USE_SSSE3
1428 psrldq $9, %xmm3
1429 pslldq $7, %xmm2
1430 por %xmm3, %xmm2 /* merge into one 16byte value */
1431 # else
1432 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1433 # endif
1434 TOLOWER (%xmm1, %xmm2)
1435
1436 pcmpeqb %xmm1, %xmm0
1437 pcmpeqb %xmm2, %xmm1
1438 psubb %xmm0, %xmm1
1439 pmovmskb %xmm1, %edx
1440 sub $0xffff, %edx
1441 jnz LABEL(exit)
1442
1443 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1444 sub $16, %r11
1445 jbe LABEL(strcmp_exitz)
1446 # endif
1447
1448 add $16, %rcx
1449 movdqa %xmm4, %xmm3 /* store for next cycle */
1450 jmp LABEL(loop_ashr_9)
1451
1452 .p2align 4
1453 LABEL(nibble_ashr_9):
1454 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1455 pmovmskb %xmm0, %edx
1456 test $0xfe00, %edx /* only lanes 9..15 of %xmm3 matter: they feed the next merge */
1457 jnz LABEL(ashr_9_exittail)
1458
1459 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1460 cmp $7, %r11 /* at most 7 bytes left to compare: finish in the tail */
1461 jbe LABEL(ashr_9_exittail)
1462 # endif
1463
1464 pxor %xmm0, %xmm0
1465 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1466 jmp LABEL(gobble_ashr_9)
1467
1468 .p2align 4
1469 LABEL(ashr_9_exittail):
1470 movdqa (%rsi, %rcx), %xmm1
1471 psrldq $9, %xmm0 /* keep the NUL mask aligned with the shifted data */
1472 psrldq $9, %xmm3 /* bring lanes 9..15 of the saved block down to lane 0 */
1473 jmp LABEL(aftertail)
1474
1475 /*
1476 * The following cases will be handled by ashr_10
1477 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1478 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1479 */
1480 .p2align 4
1481 LABEL(ashr_10):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 6 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1482 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1483 movdqa (%rdi), %xmm2
1484 movdqa (%rsi), %xmm1
1485 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1486 pslldq $6, %xmm2 /* move the (%rdi) block up by 6 (= 16 - 10) bytes */
1487 TOLOWER (%xmm1, %xmm2)
1488 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1489 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1490 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1491 shr %cl, %edx
1492 shr %cl, %r9d
1493 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1494 jnz LABEL(less32bytes)
1495 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1496
1497 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1498
1499 pxor %xmm0, %xmm0
1500 mov $16, %rcx /* index for loads */
1501 mov $10, %r9d /* byte position left over from less32bytes case */
1502 /*
1503 * Setup %r10 value allows us to detect crossing a page boundary.
1504 * When %r10 goes positive we have crossed a page boundary and
1505 * need to do a nibble.
1506 */
1507 lea 10(%rdi), %r10
1508 and $0xfff, %r10 /* offset into 4K page */
1509 sub $0x1000, %r10 /* subtract 4K pagesize */
1510
1511 .p2align 4
1512 LABEL(loop_ashr_10):
1513 add $16, %r10
1514 jg LABEL(nibble_ashr_10) /* %r10 went positive: take the page-boundary path */
1515
1516 LABEL(gobble_ashr_10):
1517 movdqa (%rsi, %rcx), %xmm1
1518 movdqa (%rdi, %rcx), %xmm2
1519 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1520
1521 # ifndef USE_SSSE3
1522 psrldq $10, %xmm3
1523 pslldq $6, %xmm2
1524 por %xmm3, %xmm2 /* merge into one 16byte value */
1525 # else
1526 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1527 # endif
1528 TOLOWER (%xmm1, %xmm2)
1529
1530 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1531 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1532 psubb %xmm0, %xmm1
1533 pmovmskb %xmm1, %edx
1534 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1535 jnz LABEL(exit)
1536
1537 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1538 sub $16, %r11
1539 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1540 # endif
1541
1542 add $16, %rcx
1543 movdqa %xmm4, %xmm3
1544
1545 add $16, %r10
1546 jg LABEL(nibble_ashr_10) /* cross page boundary */
1547
1548 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1549 movdqa (%rdi, %rcx), %xmm2
1550 movdqa %xmm2, %xmm4
1551
1552 # ifndef USE_SSSE3
1553 psrldq $10, %xmm3
1554 pslldq $6, %xmm2
1555 por %xmm3, %xmm2 /* merge into one 16byte value */
1556 # else
1557 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1558 # endif
1559 TOLOWER (%xmm1, %xmm2)
1560
1561 pcmpeqb %xmm1, %xmm0
1562 pcmpeqb %xmm2, %xmm1
1563 psubb %xmm0, %xmm1
1564 pmovmskb %xmm1, %edx
1565 sub $0xffff, %edx
1566 jnz LABEL(exit)
1567
1568 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1569 sub $16, %r11
1570 jbe LABEL(strcmp_exitz)
1571 # endif
1572
1573 add $16, %rcx
1574 movdqa %xmm4, %xmm3
1575 jmp LABEL(loop_ashr_10)
1576
1577 .p2align 4
1578 LABEL(nibble_ashr_10):
1579 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1580 pmovmskb %xmm0, %edx
1581 test $0xfc00, %edx /* only lanes 10..15 of %xmm3 matter: they feed the next merge */
1582 jnz LABEL(ashr_10_exittail)
1583
1584 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1585 cmp $6, %r11 /* at most 6 bytes left to compare: finish in the tail */
1586 jbe LABEL(ashr_10_exittail)
1587 # endif
1588
1589 pxor %xmm0, %xmm0
1590 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1591 jmp LABEL(gobble_ashr_10)
1592
1593 .p2align 4
1594 LABEL(ashr_10_exittail):
1595 movdqa (%rsi, %rcx), %xmm1
1596 psrldq $10, %xmm0 /* keep the NUL mask aligned with the shifted data */
1597 psrldq $10, %xmm3 /* bring lanes 10..15 of the saved block down to lane 0 */
1598 jmp LABEL(aftertail)
1599
1600 /*
1601 * The following cases will be handled by ashr_11
1602 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1603 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1604 */
1605 .p2align 4
1606 LABEL(ashr_11):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 5 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1607 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1608 movdqa (%rdi), %xmm2
1609 movdqa (%rsi), %xmm1
1610 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1611 pslldq $5, %xmm2 /* move the (%rdi) block up by 5 (= 16 - 11) bytes */
1612 TOLOWER (%xmm1, %xmm2)
1613 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1614 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1615 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1616 shr %cl, %edx
1617 shr %cl, %r9d
1618 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1619 jnz LABEL(less32bytes)
1620 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1621
1622 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1623
1624 pxor %xmm0, %xmm0
1625 mov $16, %rcx /* index for loads */
1626 mov $11, %r9d /* byte position left over from less32bytes case */
1627 /*
1628 * Setup %r10 value allows us to detect crossing a page boundary.
1629 * When %r10 goes positive we have crossed a page boundary and
1630 * need to do a nibble.
1631 */
1632 lea 11(%rdi), %r10
1633 and $0xfff, %r10 /* offset into 4K page */
1634 sub $0x1000, %r10 /* subtract 4K pagesize */
1635
1636 .p2align 4
1637 LABEL(loop_ashr_11):
1638 add $16, %r10
1639 jg LABEL(nibble_ashr_11) /* %r10 went positive: take the page-boundary path */
1640
1641 LABEL(gobble_ashr_11):
1642 movdqa (%rsi, %rcx), %xmm1
1643 movdqa (%rdi, %rcx), %xmm2
1644 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1645
1646 # ifndef USE_SSSE3
1647 psrldq $11, %xmm3
1648 pslldq $5, %xmm2
1649 por %xmm3, %xmm2 /* merge into one 16byte value */
1650 # else
1651 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1652 # endif
1653 TOLOWER (%xmm1, %xmm2)
1654
1655 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1656 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1657 psubb %xmm0, %xmm1
1658 pmovmskb %xmm1, %edx
1659 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1660 jnz LABEL(exit)
1661
1662 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1663 sub $16, %r11
1664 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1665 # endif
1666
1667 add $16, %rcx
1668 movdqa %xmm4, %xmm3
1669
1670 add $16, %r10
1671 jg LABEL(nibble_ashr_11) /* cross page boundary */
1672
1673 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1674 movdqa (%rdi, %rcx), %xmm2
1675 movdqa %xmm2, %xmm4
1676
1677 # ifndef USE_SSSE3
1678 psrldq $11, %xmm3
1679 pslldq $5, %xmm2
1680 por %xmm3, %xmm2 /* merge into one 16byte value */
1681 # else
1682 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1683 # endif
1684 TOLOWER (%xmm1, %xmm2)
1685
1686 pcmpeqb %xmm1, %xmm0
1687 pcmpeqb %xmm2, %xmm1
1688 psubb %xmm0, %xmm1
1689 pmovmskb %xmm1, %edx
1690 sub $0xffff, %edx
1691 jnz LABEL(exit)
1692
1693 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1694 sub $16, %r11
1695 jbe LABEL(strcmp_exitz)
1696 # endif
1697
1698 add $16, %rcx
1699 movdqa %xmm4, %xmm3
1700 jmp LABEL(loop_ashr_11)
1701
1702 .p2align 4
1703 LABEL(nibble_ashr_11):
1704 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1705 pmovmskb %xmm0, %edx
1706 test $0xf800, %edx /* only lanes 11..15 of %xmm3 matter: they feed the next merge */
1707 jnz LABEL(ashr_11_exittail)
1708
1709 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1710 cmp $5, %r11 /* at most 5 bytes left to compare: finish in the tail */
1711 jbe LABEL(ashr_11_exittail)
1712 # endif
1713
1714 pxor %xmm0, %xmm0
1715 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1716 jmp LABEL(gobble_ashr_11)
1717
1718 .p2align 4
1719 LABEL(ashr_11_exittail):
1720 movdqa (%rsi, %rcx), %xmm1
1721 psrldq $11, %xmm0 /* keep the NUL mask aligned with the shifted data */
1722 psrldq $11, %xmm3 /* bring lanes 11..15 of the saved block down to lane 0 */
1723 jmp LABEL(aftertail)
1724
1725 /*
1726 * The following cases will be handled by ashr_12
1727 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1728 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1729 */
1730 .p2align 4
1731 LABEL(ashr_12):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 4 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1732 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1733 movdqa (%rdi), %xmm2
1734 movdqa (%rsi), %xmm1
1735 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1736 pslldq $4, %xmm2 /* move the (%rdi) block up by 4 (= 16 - 12) bytes */
1737 TOLOWER (%xmm1, %xmm2)
1738 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1739 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1740 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1741 shr %cl, %edx
1742 shr %cl, %r9d
1743 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1744 jnz LABEL(less32bytes)
1745 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1746
1747 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1748
1749 pxor %xmm0, %xmm0
1750 mov $16, %rcx /* index for loads */
1751 mov $12, %r9d /* byte position left over from less32bytes case */
1752 /*
1753 * Setup %r10 value allows us to detect crossing a page boundary.
1754 * When %r10 goes positive we have crossed a page boundary and
1755 * need to do a nibble.
1756 */
1757 lea 12(%rdi), %r10
1758 and $0xfff, %r10 /* offset into 4K page */
1759 sub $0x1000, %r10 /* subtract 4K pagesize */
1760
1761 .p2align 4
1762 LABEL(loop_ashr_12):
1763 add $16, %r10
1764 jg LABEL(nibble_ashr_12) /* %r10 went positive: take the page-boundary path */
1765
1766 LABEL(gobble_ashr_12):
1767 movdqa (%rsi, %rcx), %xmm1
1768 movdqa (%rdi, %rcx), %xmm2
1769 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1770
1771 # ifndef USE_SSSE3
1772 psrldq $12, %xmm3
1773 pslldq $4, %xmm2
1774 por %xmm3, %xmm2 /* merge into one 16byte value */
1775 # else
1776 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1777 # endif
1778 TOLOWER (%xmm1, %xmm2)
1779
1780 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1781 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1782 psubb %xmm0, %xmm1
1783 pmovmskb %xmm1, %edx
1784 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1785 jnz LABEL(exit)
1786
1787 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1788 sub $16, %r11
1789 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1790 # endif
1791
1792 add $16, %rcx
1793 movdqa %xmm4, %xmm3
1794
1795 add $16, %r10
1796 jg LABEL(nibble_ashr_12) /* cross page boundary */
1797
1798 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1799 movdqa (%rdi, %rcx), %xmm2
1800 movdqa %xmm2, %xmm4
1801
1802 # ifndef USE_SSSE3
1803 psrldq $12, %xmm3
1804 pslldq $4, %xmm2
1805 por %xmm3, %xmm2 /* merge into one 16byte value */
1806 # else
1807 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1808 # endif
1809 TOLOWER (%xmm1, %xmm2)
1810
1811 pcmpeqb %xmm1, %xmm0
1812 pcmpeqb %xmm2, %xmm1
1813 psubb %xmm0, %xmm1
1814 pmovmskb %xmm1, %edx
1815 sub $0xffff, %edx
1816 jnz LABEL(exit)
1817
1818 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1819 sub $16, %r11
1820 jbe LABEL(strcmp_exitz)
1821 # endif
1822
1823 add $16, %rcx
1824 movdqa %xmm4, %xmm3
1825 jmp LABEL(loop_ashr_12)
1826
1827 .p2align 4
1828 LABEL(nibble_ashr_12):
1829 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1830 pmovmskb %xmm0, %edx
1831 test $0xf000, %edx /* only lanes 12..15 of %xmm3 matter: they feed the next merge */
1832 jnz LABEL(ashr_12_exittail)
1833
1834 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1835 cmp $4, %r11 /* at most 4 bytes left to compare: finish in the tail */
1836 jbe LABEL(ashr_12_exittail)
1837 # endif
1838
1839 pxor %xmm0, %xmm0
1840 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1841 jmp LABEL(gobble_ashr_12)
1842
1843 .p2align 4
1844 LABEL(ashr_12_exittail):
1845 movdqa (%rsi, %rcx), %xmm1
1846 psrldq $12, %xmm0 /* keep the NUL mask aligned with the shifted data */
1847 psrldq $12, %xmm3 /* bring lanes 12..15 of the saved block down to lane 0 */
1848 jmp LABEL(aftertail)
1849
1850 /*
1851 * The following cases will be handled by ashr_13
1852 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1853 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1854 */
1855 .p2align 4
1856 LABEL(ashr_13):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 3 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1857 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1858 movdqa (%rdi), %xmm2
1859 movdqa (%rsi), %xmm1
1860 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1861 pslldq $3, %xmm2 /* move the (%rdi) block up by 3 (= 16 - 13) bytes */
1862 TOLOWER (%xmm1, %xmm2)
1863 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1864 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1865 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1866 shr %cl, %edx
1867 shr %cl, %r9d
1868 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1869 jnz LABEL(less32bytes)
1870 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1871
1872 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1873
1874 pxor %xmm0, %xmm0
1875 mov $16, %rcx /* index for loads */
1876 mov $13, %r9d /* byte position left over from less32bytes case */
1877 /*
1878 * Setup %r10 value allows us to detect crossing a page boundary.
1879 * When %r10 goes positive we have crossed a page boundary and
1880 * need to do a nibble.
1881 */
1882 lea 13(%rdi), %r10
1883 and $0xfff, %r10 /* offset into 4K page */
1884 sub $0x1000, %r10 /* subtract 4K pagesize */
1885
1886 .p2align 4
1887 LABEL(loop_ashr_13):
1888 add $16, %r10
1889 jg LABEL(nibble_ashr_13) /* %r10 went positive: take the page-boundary path */
1890
1891 LABEL(gobble_ashr_13):
1892 movdqa (%rsi, %rcx), %xmm1
1893 movdqa (%rdi, %rcx), %xmm2
1894 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
1895
1896 # ifndef USE_SSSE3
1897 psrldq $13, %xmm3
1898 pslldq $3, %xmm2
1899 por %xmm3, %xmm2 /* merge into one 16byte value */
1900 # else
1901 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1902 # endif
1903 TOLOWER (%xmm1, %xmm2)
1904
1905 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
1906 pcmpeqb %xmm2, %xmm1 /* equal lanes */
1907 psubb %xmm0, %xmm1
1908 pmovmskb %xmm1, %edx
1909 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
1910 jnz LABEL(exit)
1911
1912 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1913 sub $16, %r11
1914 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
1915 # endif
1916
1917 add $16, %rcx
1918 movdqa %xmm4, %xmm3
1919
1920 add $16, %r10
1921 jg LABEL(nibble_ashr_13) /* cross page boundary */
1922
1923 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
1924 movdqa (%rdi, %rcx), %xmm2
1925 movdqa %xmm2, %xmm4
1926
1927 # ifndef USE_SSSE3
1928 psrldq $13, %xmm3
1929 pslldq $3, %xmm2
1930 por %xmm3, %xmm2 /* merge into one 16byte value */
1931 # else
1932 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1933 # endif
1934 TOLOWER (%xmm1, %xmm2)
1935
1936 pcmpeqb %xmm1, %xmm0
1937 pcmpeqb %xmm2, %xmm1
1938 psubb %xmm0, %xmm1
1939 pmovmskb %xmm1, %edx
1940 sub $0xffff, %edx
1941 jnz LABEL(exit)
1942
1943 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1944 sub $16, %r11
1945 jbe LABEL(strcmp_exitz)
1946 # endif
1947
1948 add $16, %rcx
1949 movdqa %xmm4, %xmm3
1950 jmp LABEL(loop_ashr_13)
1951
1952 .p2align 4
1953 LABEL(nibble_ashr_13):
1954 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1955 pmovmskb %xmm0, %edx
1956 test $0xe000, %edx /* only lanes 13..15 of %xmm3 matter: they feed the next merge */
1957 jnz LABEL(ashr_13_exittail)
1958
1959 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1960 cmp $3, %r11 /* at most 3 bytes left to compare: finish in the tail */
1961 jbe LABEL(ashr_13_exittail)
1962 # endif
1963
1964 pxor %xmm0, %xmm0
1965 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
1966 jmp LABEL(gobble_ashr_13)
1967
1968 .p2align 4
1969 LABEL(ashr_13_exittail):
1970 movdqa (%rsi, %rcx), %xmm1
1971 psrldq $13, %xmm0 /* keep the NUL mask aligned with the shifted data */
1972 psrldq $13, %xmm3 /* bring lanes 13..15 of the saved block down to lane 0 */
1973 jmp LABEL(aftertail)
1974
1975 /*
1976 * The following cases will be handled by ashr_14
1977 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1978 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1979 */
1980 .p2align 4
1981 LABEL(ashr_14):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 2 bytes.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
1982 pxor %xmm0, %xmm0 /* xmm0 = 0 */
1983 movdqa (%rdi), %xmm2
1984 movdqa (%rsi), %xmm1
1985 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
1986 pslldq $2, %xmm2 /* move the (%rdi) block up by 2 (= 16 - 14) bytes */
1987 TOLOWER (%xmm1, %xmm2)
1988 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
1989 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
1990 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
1991 shr %cl, %edx
1992 shr %cl, %r9d
1993 sub %r9d, %edx /* non-zero when the two shifted masks differ */
1994 jnz LABEL(less32bytes)
1995 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
1996
1997 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
1998
1999 pxor %xmm0, %xmm0
2000 mov $16, %rcx /* index for loads */
2001 mov $14, %r9d /* byte position left over from less32bytes case */
2002 /*
2003 * Setup %r10 value allows us to detect crossing a page boundary.
2004 * When %r10 goes positive we have crossed a page boundary and
2005 * need to do a nibble.
2006 */
2007 lea 14(%rdi), %r10
2008 and $0xfff, %r10 /* offset into 4K page */
2009 sub $0x1000, %r10 /* subtract 4K pagesize */
2010
2011 .p2align 4
2012 LABEL(loop_ashr_14):
2013 add $16, %r10
2014 jg LABEL(nibble_ashr_14) /* %r10 went positive: take the page-boundary path */
2015
2016 LABEL(gobble_ashr_14):
2017 movdqa (%rsi, %rcx), %xmm1
2018 movdqa (%rdi, %rcx), %xmm2
2019 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
2020
2021 # ifndef USE_SSSE3
2022 psrldq $14, %xmm3
2023 pslldq $2, %xmm2
2024 por %xmm3, %xmm2 /* merge into one 16byte value */
2025 # else
2026 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
2027 # endif
2028 TOLOWER (%xmm1, %xmm2)
2029
2030 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
2031 pcmpeqb %xmm2, %xmm1 /* equal lanes */
2032 psubb %xmm0, %xmm1
2033 pmovmskb %xmm1, %edx
2034 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
2035 jnz LABEL(exit)
2036
2037 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2038 sub $16, %r11
2039 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
2040 # endif
2041
2042 add $16, %rcx
2043 movdqa %xmm4, %xmm3
2044
2045 add $16, %r10
2046 jg LABEL(nibble_ashr_14) /* cross page boundary */
2047
2048 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
2049 movdqa (%rdi, %rcx), %xmm2
2050 movdqa %xmm2, %xmm4
2051
2052 # ifndef USE_SSSE3
2053 psrldq $14, %xmm3
2054 pslldq $2, %xmm2
2055 por %xmm3, %xmm2 /* merge into one 16byte value */
2056 # else
2057 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
2058 # endif
2059 TOLOWER (%xmm1, %xmm2)
2060
2061 pcmpeqb %xmm1, %xmm0
2062 pcmpeqb %xmm2, %xmm1
2063 psubb %xmm0, %xmm1
2064 pmovmskb %xmm1, %edx
2065 sub $0xffff, %edx
2066 jnz LABEL(exit)
2067
2068 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2069 sub $16, %r11
2070 jbe LABEL(strcmp_exitz)
2071 # endif
2072
2073 add $16, %rcx
2074 movdqa %xmm4, %xmm3
2075 jmp LABEL(loop_ashr_14)
2076
2077 .p2align 4
2078 LABEL(nibble_ashr_14):
2079 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2080 pmovmskb %xmm0, %edx
2081 test $0xc000, %edx /* only lanes 14..15 of %xmm3 matter: they feed the next merge */
2082 jnz LABEL(ashr_14_exittail)
2083
2084 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2085 cmp $2, %r11 /* at most 2 bytes left to compare: finish in the tail */
2086 jbe LABEL(ashr_14_exittail)
2087 # endif
2088
2089 pxor %xmm0, %xmm0
2090 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
2091 jmp LABEL(gobble_ashr_14)
2092
2093 .p2align 4
2094 LABEL(ashr_14_exittail):
2095 movdqa (%rsi, %rcx), %xmm1
2096 psrldq $14, %xmm0 /* keep the NUL mask aligned with the shifted data */
2097 psrldq $14, %xmm3 /* bring lanes 14..15 of the saved block down to lane 0 */
2098 jmp LABEL(aftertail)
2099
2100 /*
2101 * The following cases will be handled by ashr_15
2102 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
2103 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
2104 */
2105 .p2align 4
2106 LABEL(ashr_15):
/*
 * Entry code compares the first aligned 16-byte block of (%rsi) with the
 * first block of (%rdi) moved up by 1 byte.  If nothing differs, the loop
 * below (unrolled twice) compares 16 bytes per step, rebuilding the (%rdi)
 * side from the previous block (%xmm3) and the current block (%xmm2).
 * Unlike the other ashr_N handlers, the exit tail of this one falls
 * straight through into aftertail instead of jumping to it.
 * NOTE(review): %edx, %cl, %rax, %r8d and %r11 are prepared before this
 * section; their exact contents are assumed, not visible here.
 */
2107 pxor %xmm0, %xmm0 /* xmm0 = 0 */
2108 movdqa (%rdi), %xmm2
2109 movdqa (%rsi), %xmm1
2110 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane where the (%rsi) block holds NUL */
2111 pslldq $1, %xmm2 /* move the (%rdi) block up by 1 (= 16 - 15) byte */
2112 TOLOWER (%xmm1, %xmm2)
2113 pcmpeqb %xmm1, %xmm2 /* 0xff in every lane where the two blocks match */
2114 psubb %xmm0, %xmm2 /* a matching NUL lane becomes 0, dropping its mask bit */
2115 pmovmskb %xmm2, %r9d /* one bit per lane: set = equal and not NUL */
2116 shr %cl, %edx
2117 shr %cl, %r9d
2118 sub %r9d, %edx /* non-zero when the two shifted masks differ */
2119 jnz LABEL(less32bytes)
2120
2121 movdqa (%rdi), %xmm3 /* keep this (%rdi) block for the merge in the loop */
2122
2123 UPDATE_STRNCMP_COUNTER /* strncmp builds: macro from the top of the file updates %r11 */
2124
2125 pxor %xmm0, %xmm0
2126 mov $16, %rcx /* index for loads */
2127 mov $15, %r9d /* byte position left over from less32bytes case */
2128 /*
2129 * Setup %r10 value allows us to detect crossing a page boundary.
2130 * When %r10 goes positive we have crossed a page boundary and
2131 * need to do a nibble.
2132 */
2133 lea 15(%rdi), %r10
2134 and $0xfff, %r10 /* offset into 4K page */
2135
2136 sub $0x1000, %r10 /* subtract 4K pagesize */
2137
2138 .p2align 4
2139 LABEL(loop_ashr_15):
2140 add $16, %r10
2141 jg LABEL(nibble_ashr_15) /* %r10 went positive: take the page-boundary path */
2142
2143 LABEL(gobble_ashr_15):
2144 movdqa (%rsi, %rcx), %xmm1
2145 movdqa (%rdi, %rcx), %xmm2
2146 movdqa %xmm2, %xmm4 /* save the unshifted block; it becomes %xmm3 for the next step */
2147
2148 # ifndef USE_SSSE3
2149 psrldq $15, %xmm3
2150 pslldq $1, %xmm2
2151 por %xmm3, %xmm2 /* merge into one 16byte value */
2152 # else
2153 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2154 # endif
2155 TOLOWER (%xmm1, %xmm2)
2156
2157 pcmpeqb %xmm1, %xmm0 /* %xmm0 is all-zero here, so this marks NUL lanes of (%rsi) */
2158 pcmpeqb %xmm2, %xmm1 /* equal lanes */
2159 psubb %xmm0, %xmm1
2160 pmovmskb %xmm1, %edx
2161 sub $0xffff, %edx /* zero only when all 16 lanes are equal and non-NUL */
2162 jnz LABEL(exit)
2163
2164 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2165 sub $16, %r11
2166 jbe LABEL(strcmp_exitz) /* length counter used up: return 0 */
2167 # endif
2168
2169 add $16, %rcx
2170 movdqa %xmm4, %xmm3
2171
2172 add $16, %r10
2173 jg LABEL(nibble_ashr_15) /* cross page boundary */
2174
2175 movdqa (%rsi, %rcx), %xmm1 /* second unrolled step, same sequence as above */
2176 movdqa (%rdi, %rcx), %xmm2
2177 movdqa %xmm2, %xmm4
2178
2179 # ifndef USE_SSSE3
2180 psrldq $15, %xmm3
2181 pslldq $1, %xmm2
2182 por %xmm3, %xmm2 /* merge into one 16byte value */
2183 # else
2184 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2185 # endif
2186 TOLOWER (%xmm1, %xmm2)
2187
2188 pcmpeqb %xmm1, %xmm0
2189 pcmpeqb %xmm2, %xmm1
2190 psubb %xmm0, %xmm1
2191 pmovmskb %xmm1, %edx
2192 sub $0xffff, %edx
2193 jnz LABEL(exit)
2194
2195 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2196 sub $16, %r11
2197 jbe LABEL(strcmp_exitz)
2198 # endif
2199
2200 add $16, %rcx
2201 movdqa %xmm4, %xmm3
2202 jmp LABEL(loop_ashr_15)
2203
2204 .p2align 4
2205 LABEL(nibble_ashr_15):
2206 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2207 pmovmskb %xmm0, %edx
2208 test $0x8000, %edx /* only lane 15 of %xmm3 matters: it feeds the next merge */
2209 jnz LABEL(ashr_15_exittail)
2210
2211 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2212 cmpq $1, %r11 /* at most 1 byte left to compare: finish in the tail */
2213 jbe LABEL(ashr_15_exittail)
2214 # endif
2215
2216 pxor %xmm0, %xmm0
2217 sub $0x1000, %r10 /* subtract the 4K page size again for the next page */
2218 jmp LABEL(gobble_ashr_15)
2219
2220 .p2align 4
2221 LABEL(ashr_15_exittail):
2222 movdqa (%rsi, %rcx), %xmm1
2223 psrldq $15, %xmm3 /* bring lane 15 of the saved block down to lane 0 */
2224 psrldq $15, %xmm0 /* keep the NUL mask aligned with the shifted data */
2225
2226 .p2align 4
2227 LABEL(aftertail):
/*
 * Shared tail for the ashr_N_exittail paths: %xmm1 holds the current
 * (%rsi) block, %xmm3 the shifted leftover (%rdi) bytes and %xmm0 the
 * shifted NUL mask.  Control then falls through exit, less32bytes and
 * ret, which turn the mask in %edx into the byte difference returned
 * in %eax.
 */
2228 TOLOWER (%xmm1, %xmm3)
2229 pcmpeqb %xmm3, %xmm1 /* 0xff in every lane where the bytes match */
2230 psubb %xmm0, %xmm1 /* drop the mask bit of lanes flagged in %xmm0 */
2231 pmovmskb %xmm1, %edx
2232 not %edx /* set bits now mark the lanes that differ or hold NUL */
2233
2234 .p2align 4
2235 LABEL(exit):
2236 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
2237 LABEL(less32bytes):
2238 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
2239 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
2240 test %r8d, %r8d /* NOTE(review): %r8d is set outside this section; presumably a "pointers were swapped" flag */
2241 jz LABEL(ret)
2242 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
2243
2244 .p2align 4
2245 LABEL(ret):
2246 LABEL(less16bytes):
2247 bsf %rdx, %rdx /* find and store bit index in %rdx */
2248
2249 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2250 sub %rdx, %r11 /* index at or beyond the remaining length: return 0 */
2251 jbe LABEL(strcmp_exitz)
2252 # endif
2253 movzbl (%rsi, %rdx), %ecx /* byte at that index in each string, zero-extended */
2254 movzbl (%rdi, %rdx), %eax
2255
2256 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
2257 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries; presumably the C-locale tolower map */
2258 movl (%rdx,%rcx,4), %ecx /* replace each byte by its table entry */
2259 movl (%rdx,%rax,4), %eax
2260 # endif
2261
2262 sub %ecx, %eax /* return value: difference of the two bytes */
2263 ret
2264
2265 LABEL(strcmp_exitz):
/* Return 0 in %eax; the length-limited paths branch here once their counter is used up. */
2266 xor %eax, %eax
2267 ret
2268
2269 .p2align 4
2270 LABEL(Byte0):
/*
 * Return in %eax the difference between the byte at (%rdi) and the byte
 * at (%rsi).  In the case-insensitive builds both bytes are first
 * replaced by their entry in the _nl_C_LC_CTYPE_tolower table.
 * The loads use an explicit byte-sized mnemonic (movzbl), matching the
 * loads in the ret path; a suffix-less movzx from memory leaves the
 * source size to the assembler.
 */
2271 movzbl (%rsi), %ecx
2272 movzbl (%rdi), %eax
2273
2274 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
2275 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* table of 4-byte entries */
2276 movl (%rdx,%rcx,4), %ecx
2277 movl (%rdx,%rax,4), %eax
2278 # endif
2279
2280 sub %ecx, %eax
2281 ret
2282 END (STRCMP)
2283
2284 .section .rodata,"a",@progbits
2285 .p2align 3
2286 LABEL(unaligned_table):
/*
 * Read-only table of 32-bit offsets, each measured from unaligned_table
 * itself to one of the ashr_N handlers.  Entries run ashr_1 .. ashr_15
 * with ashr_0 last.
 * NOTE(review): the code that indexes this table is outside this section.
 */
2287 .int LABEL(ashr_1) - LABEL(unaligned_table)
2288 .int LABEL(ashr_2) - LABEL(unaligned_table)
2289 .int LABEL(ashr_3) - LABEL(unaligned_table)
2290 .int LABEL(ashr_4) - LABEL(unaligned_table)
2291 .int LABEL(ashr_5) - LABEL(unaligned_table)
2292 .int LABEL(ashr_6) - LABEL(unaligned_table)
2293 .int LABEL(ashr_7) - LABEL(unaligned_table)
2294 .int LABEL(ashr_8) - LABEL(unaligned_table)
2295 .int LABEL(ashr_9) - LABEL(unaligned_table)
2296 .int LABEL(ashr_10) - LABEL(unaligned_table)
2297 .int LABEL(ashr_11) - LABEL(unaligned_table)
2298 .int LABEL(ashr_12) - LABEL(unaligned_table)
2299 .int LABEL(ashr_13) - LABEL(unaligned_table)
2300 .int LABEL(ashr_14) - LABEL(unaligned_table)
2301 .int LABEL(ashr_15) - LABEL(unaligned_table)
2302 .int LABEL(ashr_0) - LABEL(unaligned_table)
2303 #endif /* NOT_IN_libc */
2304 libc_hidden_builtin_def (STRCMP)