]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/strrchr-sse2.S
bcb59bdb888e7c632b3e218dfa5492aac5ee6f0d
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / strrchr-sse2.S
1 /* strrchr SSE2 without bsf and bsr
2 Copyright (C) 2011-2018 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23
/* Unwind annotation for a 4-byte push of REG: grow the CFA offset by 4
   and record REG as saved at offset 0 from the current stack pointer.
   (cfi_adjust_cfa_offset / cfi_rel_offset come from <sysdep.h>;
   presumably thin wrappers around the .cfi_* directives.)  */
24 # define CFI_PUSH(REG) \
25 cfi_adjust_cfa_offset (4); \
26 cfi_rel_offset (REG, 0)
27
/* Unwind annotation for a 4-byte pop of REG: shrink the CFA offset by 4
   and mark REG as holding its original value again.  */
28 # define CFI_POP(REG) \
29 cfi_adjust_cfa_offset (-4); \
30 cfi_restore (REG)
31
/* A real pushl/popl together with its unwind annotation.  */
32 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
33 # define POP(REG) popl REG; CFI_POP (REG)
34
/* Stack offset of the first argument after ENTRANCE has pushed %edi:
   4 bytes of return address plus 4 bytes of saved %edi.  */
35 # define PARMS 8
/* Function prologue: save %edi, which the routine uses as its cursor.  */
36 # define ENTRANCE PUSH(%edi);
/* Function epilogue: restore %edi and return.  The trailing CFI_PUSH
   puts the unwind state back to "%edi is on the stack" for whatever
   code follows the ret in the file.  */
37 # define RETURN POP(%edi); ret; CFI_PUSH(%edi);
38
/* Stack offsets of the two arguments (first and second, 4 bytes apart).  */
39 # define STR1 PARMS
40 # define STR2 STR1+4
41
42 atom_text_section
43 ENTRY (__strrchr_sse2)
44
/* Arguments are on the stack: STR1 is the string pointer, STR2 the
   character to look for.  The result in %eax is the address of the last
   occurrence of that byte at or before the terminating NUL, or 0 when
   there is none.

   Register roles used by the whole routine:
     %edi   address just past the 16-byte chunk most recently examined
     %xmm1  the search byte replicated into all 16 byte lanes
     %xmm2  all zero for as long as no NUL byte has been seen; it is both
            the zero operand and the destination of each NUL comparison
     %ebx   match mask of the latest earlier chunk that contained a match
            (0 while no match has been recorded)
     %esi   the %edi value that belongs to the mask held in %ebx
   %edi is saved by ENTRANCE; %esi and %ebx are pushed only on the paths
   that continue into L(loop).  */
45 ENTRANCE
46 mov STR1(%esp), %ecx
47 movd STR2(%esp), %xmm1
48
49 pxor %xmm2, %xmm2
50 mov %ecx, %edi
/* The two unpacks and the pshufd below broadcast the low byte of %xmm1
   to all 16 lanes.  */
51 punpcklbw %xmm1, %xmm1
52 punpcklbw %xmm1, %xmm1
53 /* ECX has OFFSET: the string address modulo 64.  An offset above 48
   means an unaligned 16-byte load would run past the next 64-byte
   boundary, so that case is handled with an aligned load instead.  */
54 and $63, %ecx
55 cmp $48, %ecx
56 pshufd $0, %xmm1, %xmm1
57 ja L(crosscache)
58
59 /* Unaligned string: examine the first 16 bytes with an unaligned
   load.  */
60 movdqu (%edi), %xmm0
61 pcmpeqb %xmm0, %xmm2
62 pcmpeqb %xmm1, %xmm0
63 /* Find where NULL is: bit N of ECX is set when byte N is NUL. */
64 pmovmskb %xmm2, %ecx
65 /* Check if there is a match: bit N of EAX is set when byte N equals
   the search byte. */
66 pmovmskb %xmm0, %eax
67 add $16, %edi
68
69 test %eax, %eax
70 jnz L(unaligned_match1)
71
/* No match in the first chunk; a NUL here means the answer is NULL.  */
72 test %ecx, %ecx
73 jnz L(return_null)
74
/* Round the cursor down to a 16-byte boundary for the aligned loop.
   Bytes of the first chunk above that boundary are simply read again.  */
75 and $-16, %edi
76
77 PUSH (%esi)
78 PUSH (%ebx)
79
/* No earlier match recorded yet.  */
80 xor %ebx, %ebx
81 jmp L(loop)
82
/* Annotation only: the code below is reached with just %edi pushed.  */
83 CFI_POP (%esi)
84 CFI_POP (%ebx)
85
86 .p2align 4
/* The first (unaligned) chunk contains a match: EAX = match mask,
   ECX = NUL mask, EDI = string + 16.  */
87 L(unaligned_match1):
88 test %ecx, %ecx
89 jnz L(prolog_find_zero_1)
90
91 PUSH (%esi)
92 PUSH (%ebx)
93
/* Remember this chunk as the latest match, then continue with the
   aligned loop.  */
94 mov %eax, %ebx
95 mov %edi, %esi
96 and $-16, %edi
97 jmp L(loop)
98
/* Annotation only: the code below is reached with just %edi pushed.  */
99 CFI_POP (%esi)
100 CFI_POP (%ebx)
101
102 .p2align 4
103 L(crosscache):
104 /* Handle unaligned string whose first 16 bytes would cross a 64-byte
   boundary: load the enclosing aligned 16-byte chunk and discard the
   bytes that precede the string.  On entry ECX is the address modulo
   64 and EDI is the string address.  */
105 and $15, %ecx
106 and $-16, %edi
/* The NUL comparison uses %xmm3 here, so %xmm2 is still all zero when
   L(loop) is entered.  */
107 pxor %xmm3, %xmm3
108 movdqa (%edi), %xmm0
109 pcmpeqb %xmm0, %xmm3
110 pcmpeqb %xmm1, %xmm0
111 /* Find where NULL is. */
112 pmovmskb %xmm3, %edx
113 /* Check if there is a match. */
114 pmovmskb %xmm0, %eax
115 /* Remove the leading bytes: after the shifts bit 0 of each mask
   corresponds to the first byte of the string. */
116 shr %cl, %edx
117 shr %cl, %eax
118 add $16, %edi
119
120 test %eax, %eax
121 jnz L(unaligned_match)
122
/* No match in the first chunk; a NUL here means the answer is NULL.  */
123 test %edx, %edx
124 jnz L(return_null)
125
126 PUSH (%esi)
127 PUSH (%ebx)
128
/* No earlier match recorded yet.  */
129 xor %ebx, %ebx
130 jmp L(loop)
131
/* Annotation only: the code below is reached with just %edi pushed.  */
132 CFI_POP (%esi)
133 CFI_POP (%ebx)
134
135 .p2align 4
/* The first chunk contains a match: EAX = shifted match mask,
   EDX = shifted NUL mask, ECX = number of bytes skipped.  */
136 L(unaligned_match):
137 test %edx, %edx
138 jnz L(prolog_find_zero)
139
140 PUSH (%esi)
141 PUSH (%ebx)
142
/* Record the match.  Because the mask was shifted, the address that
   goes with it is string + 16 (aligned cursor plus the skipped bytes),
   which keeps the "bit N is at cursor - 16 + N" rule of L(match_exit).
   Execution then falls through into L(loop).  */
143 mov %eax, %ebx
144 lea (%edi, %ecx), %esi
145
146 /* Loop start on aligned string.  The body is unrolled four times;
   each step loads one aligned 16-byte chunk, advances EDI past it, and
   leaves
     EAX = match mask of the chunk,
     ECX = NUL mask | match mask.
   A non-zero ECX leaves the loop with EDI pointing just past the chunk
   of interest.  %xmm2 is all zero on entry to every step, because a
   step is only reached when the previous NUL comparison found nothing. */
147 .p2align 4
148 L(loop):
149 movdqa (%edi), %xmm0
150 pcmpeqb %xmm0, %xmm2
151 add $16, %edi
152 pcmpeqb %xmm1, %xmm0
153 pmovmskb %xmm2, %ecx
154 pmovmskb %xmm0, %eax
155 or %eax, %ecx
156 jnz L(matches)
157
158 movdqa (%edi), %xmm0
159 pcmpeqb %xmm0, %xmm2
160 add $16, %edi
161 pcmpeqb %xmm1, %xmm0
162 pmovmskb %xmm2, %ecx
163 pmovmskb %xmm0, %eax
164 or %eax, %ecx
165 jnz L(matches)
166
167 movdqa (%edi), %xmm0
168 pcmpeqb %xmm0, %xmm2
169 add $16, %edi
170 pcmpeqb %xmm1, %xmm0
171 pmovmskb %xmm2, %ecx
172 pmovmskb %xmm0, %eax
173 or %eax, %ecx
174 jnz L(matches)
175
176 movdqa (%edi), %xmm0
177 pcmpeqb %xmm0, %xmm2
178 add $16, %edi
179 pcmpeqb %xmm1, %xmm0
180 pmovmskb %xmm2, %ecx
181 pmovmskb %xmm0, %eax
182 or %eax, %ecx
/* Nothing of interest in four chunks: go round again.  Otherwise fall
   through into L(matches).  */
183 jz L(loop)
184
/* The current chunk holds a match and/or a NUL.  EAX = match mask,
   %xmm2 = NUL comparison result, EDI = address just past the chunk.  */
185 L(matches):
186 test %eax, %eax
187 jnz L(match)
/* A NUL was found and the current chunk contributes no usable match:
   fall back to the most recent earlier match (EBX/ESI), or return NULL
   when there has been none.  */
188 L(return_value):
189 test %ebx, %ebx
190 jz L(return_null_1)
191 mov %ebx, %eax
192 mov %esi, %edi
193
194 POP (%ebx)
195 POP (%esi)
196
197 jmp L(match_exit)
198
/* Annotation only: the code below runs with %esi and %ebx still on the
   stack.  They were pushed %esi first, %ebx second, so the annotations
   are replayed in that same order; otherwise the unwind info records
   the two save slots swapped.  */
199 CFI_PUSH (%esi)
200 CFI_PUSH (%ebx)
201
202 .p2align 4
/* End of string reached without any match: return NULL.  */
203 L(return_null_1):
204 POP (%ebx)
205 POP (%esi)
206
207 xor %eax, %eax
208 RETURN
209
/* Annotation only, same push order as the real PUSHes (see above).  */
210 CFI_PUSH (%esi)
211 CFI_PUSH (%ebx)
212
213 .p2align 4
/* The current chunk holds at least one match.  If it also holds a NUL
   the matches have to be trimmed (L(find_zero)); otherwise record the
   chunk as the latest match and keep scanning.  */
214 L(match):
215 pmovmskb %xmm2, %ecx
216 test %ecx, %ecx
217 jnz L(find_zero)
218 mov %eax, %ebx
219 mov %edi, %esi
220 jmp L(loop)
221
222 .p2align 4
/* The current chunk holds both a match and a NUL.  ECX = NUL mask
   (non-zero), EAX = match mask.  Locate the lowest set bit of ECX by
   testing a nibble at a time (no bsf), then keep only the match bits at
   or below that position.  If nothing is left, L(return_value) falls
   back to the earlier match; otherwise L(match_exit) picks the highest
   remaining bit.

   In the mask constants gas gives `<<' higher precedence than `-', so
   `1 << N - 1' means (1 << N) - 1, i.e. the low N bits.  */
223 L(find_zero):
224 test %cl, %cl
225 jz L(find_zero_high)
226 mov %cl, %dl
227 and $15, %dl
228 jz L(find_zero_8)
229 test $0x01, %cl
230 jnz L(FindZeroExit1)
231 test $0x02, %cl
232 jnz L(FindZeroExit2)
233 test $0x04, %cl
234 jnz L(FindZeroExit3)
/* First NUL is byte 3: keep match bits 0-3.  */
235 and $1 << 4 - 1, %eax
236 jz L(return_value)
237
238 POP (%ebx)
239 POP (%esi)
240 jmp L(match_exit)
241
/* Annotation only: the code below runs with %esi and %ebx still on the
   stack.  They were pushed %esi first, %ebx second, so the annotations
   are replayed in that same order; otherwise the unwind info records
   the two save slots swapped.  */
242 CFI_PUSH (%esi)
243 CFI_PUSH (%ebx)
244
245 .p2align 4
/* First NUL is in bytes 4-7.  */
246 L(find_zero_8):
247 test $0x10, %cl
248 jnz L(FindZeroExit5)
249 test $0x20, %cl
250 jnz L(FindZeroExit6)
251 test $0x40, %cl
252 jnz L(FindZeroExit7)
/* First NUL is byte 7: keep match bits 0-7.  */
253 and $1 << 8 - 1, %eax
254 jz L(return_value)
255
256 POP (%ebx)
257 POP (%esi)
258 jmp L(match_exit)
259
/* Annotation only, same push order as the real PUSHes (see above).  */
260 CFI_PUSH (%esi)
261 CFI_PUSH (%ebx)
262
263 .p2align 4
/* First NUL is in bytes 8-15 (low byte of the mask was zero).  */
264 L(find_zero_high):
265 mov %ch, %dh
266 and $15, %dh
267 jz L(find_zero_high_8)
268 test $0x01, %ch
269 jnz L(FindZeroExit9)
270 test $0x02, %ch
271 jnz L(FindZeroExit10)
272 test $0x04, %ch
273 jnz L(FindZeroExit11)
/* First NUL is byte 11: keep match bits 0-11.  */
274 and $1 << 12 - 1, %eax
275 jz L(return_value)
276
277 POP (%ebx)
278 POP (%esi)
279 jmp L(match_exit)
280
/* Annotation only, same push order as the real PUSHes (see above).  */
281 CFI_PUSH (%esi)
282 CFI_PUSH (%ebx)
283
284 .p2align 4
/* First NUL is in bytes 12-15.  */
285 L(find_zero_high_8):
286 test $0x10, %ch
287 jnz L(FindZeroExit13)
288 test $0x20, %ch
289 jnz L(FindZeroExit14)
290 test $0x40, %ch
291 jnz L(FindZeroExit15)
/* First NUL is byte 15: keep match bits 0-15.  */
292 and $1 << 16 - 1, %eax
293 jz L(return_value)
294
295 POP (%ebx)
296 POP (%esi)
297 jmp L(match_exit)
298
/* Annotation only, same push order as the real PUSHes (see above).  */
299 CFI_PUSH (%esi)
300 CFI_PUSH (%ebx)
301
/* L(FindZeroExitN): the first NUL of the current chunk is its N-th byte
   (bit N-1 of the NUL mask).  Keep the low N bits of the match mask in
   EAX.  If none is left, L(return_value) falls back to the earlier
   match; otherwise restore %ebx/%esi and let L(match_exit) pick the
   highest remaining bit.  In the constants gas evaluates `1 << N - 1'
   as (1 << N) - 1.

   Every CFI_PUSH pair below is annotation only.  It re-creates the
   "%esi and %ebx are on the stack" unwind state for the next stanza and
   is written in the order of the real PUSHes (%esi first, %ebx second)
   so that the recorded save slots match the stack layout.  */
302 .p2align 4
303 L(FindZeroExit1):
304 and $1, %eax
305 jz L(return_value)
306
307 POP (%ebx)
308 POP (%esi)
309 jmp L(match_exit)
310
311 CFI_PUSH (%esi)
312 CFI_PUSH (%ebx)
313
314 .p2align 4
315 L(FindZeroExit2):
316 and $1 << 2 - 1, %eax
317 jz L(return_value)
318
319 POP (%ebx)
320 POP (%esi)
321 jmp L(match_exit)
322
323 CFI_PUSH (%esi)
324 CFI_PUSH (%ebx)
325
326 .p2align 4
327 L(FindZeroExit3):
328 and $1 << 3 - 1, %eax
329 jz L(return_value)
330
331 POP (%ebx)
332 POP (%esi)
333 jmp L(match_exit)
334
335 CFI_PUSH (%esi)
336 CFI_PUSH (%ebx)
337
338 .p2align 4
339 L(FindZeroExit5):
340 and $1 << 5 - 1, %eax
341 jz L(return_value)
342
343 POP (%ebx)
344 POP (%esi)
345 jmp L(match_exit)
346
347 CFI_PUSH (%esi)
348 CFI_PUSH (%ebx)
349
350 .p2align 4
351 L(FindZeroExit6):
352 and $1 << 6 - 1, %eax
353 jz L(return_value)
354
355 POP (%ebx)
356 POP (%esi)
357 jmp L(match_exit)
358
359 CFI_PUSH (%esi)
360 CFI_PUSH (%ebx)
361
362 .p2align 4
363 L(FindZeroExit7):
364 and $1 << 7 - 1, %eax
365 jz L(return_value)
366
367 POP (%ebx)
368 POP (%esi)
369 jmp L(match_exit)
370
371 CFI_PUSH (%esi)
372 CFI_PUSH (%ebx)
373
374 .p2align 4
375 L(FindZeroExit9):
376 and $1 << 9 - 1, %eax
377 jz L(return_value)
378
379 POP (%ebx)
380 POP (%esi)
381 jmp L(match_exit)
382
383 CFI_PUSH (%esi)
384 CFI_PUSH (%ebx)
385
386 .p2align 4
387 L(FindZeroExit10):
388 and $1 << 10 - 1, %eax
389 jz L(return_value)
390
391 POP (%ebx)
392 POP (%esi)
393 jmp L(match_exit)
394
395 CFI_PUSH (%esi)
396 CFI_PUSH (%ebx)
397
398 .p2align 4
399 L(FindZeroExit11):
400 and $1 << 11 - 1, %eax
401 jz L(return_value)
402
403 POP (%ebx)
404 POP (%esi)
405 jmp L(match_exit)
406
407 CFI_PUSH (%esi)
408 CFI_PUSH (%ebx)
409
410 .p2align 4
411 L(FindZeroExit13):
412 and $1 << 13 - 1, %eax
413 jz L(return_value)
414
415 POP (%ebx)
416 POP (%esi)
417 jmp L(match_exit)
418
419 CFI_PUSH (%esi)
420 CFI_PUSH (%ebx)
421
422 .p2align 4
423 L(FindZeroExit14):
424 and $1 << 14 - 1, %eax
425 jz L(return_value)
426
427 POP (%ebx)
428 POP (%esi)
429 jmp L(match_exit)
430
431 CFI_PUSH (%esi)
432 CFI_PUSH (%ebx)
433
434 .p2align 4
435 L(FindZeroExit15):
436 and $1 << 15 - 1, %eax
437 jz L(return_value)
438
/* Last stanza: no jump and no CFI_PUSH pair, execution falls through
   into L(match_exit) with only %edi left on the stack.  */
439 POP (%ebx)
440 POP (%esi)
441
442 .p2align 4
/* Compute the result from a non-empty 16-bit match mask.
   On entry EAX = match mask and EDI = address just past the chunk the
   mask describes; only %edi is still on the stack.  The highest set bit
   N is found by testing a nibble at a time (no bsr) and the result is
   EDI - 16 + N.  In each nibble the lowest bit needs no test: it is the
   only one left when the three higher bits are clear.
   `15 << 4' is 0xf0, the high nibble of the byte being examined.  */
443 L(match_exit):
444 test %ah, %ah
445 jnz L(match_exit_high)
446 mov %al, %dl
447 and $15 << 4, %dl
448 jnz L(match_exit_8)
449 test $0x08, %al
450 jnz L(Exit4)
451 test $0x04, %al
452 jnz L(Exit3)
453 test $0x02, %al
454 jnz L(Exit2)
/* Only bit 0 is set: the match is the first byte of the chunk.  */
455 lea -16(%edi), %eax
456 RETURN
457
458 .p2align 4
/* Highest match is in bytes 4-7.  */
459 L(match_exit_8):
460 test $0x80, %al
461 jnz L(Exit8)
462 test $0x40, %al
463 jnz L(Exit7)
464 test $0x20, %al
465 jnz L(Exit6)
/* Bit 4: fifth byte of the chunk.  */
466 lea -12(%edi), %eax
467 RETURN
468
469 .p2align 4
/* Highest match is in bytes 8-15 (AH is non-zero).  */
470 L(match_exit_high):
471 mov %ah, %dh
472 and $15 << 4, %dh
473 jnz L(match_exit_high_8)
474 test $0x08, %ah
475 jnz L(Exit12)
476 test $0x04, %ah
477 jnz L(Exit11)
478 test $0x02, %ah
479 jnz L(Exit10)
/* Bit 8: ninth byte of the chunk.  */
480 lea -8(%edi), %eax
481 RETURN
482
483 .p2align 4
/* Highest match is in bytes 12-15.  */
484 L(match_exit_high_8):
485 test $0x80, %ah
486 jnz L(Exit16)
487 test $0x40, %ah
488 jnz L(Exit15)
489 test $0x20, %ah
490 jnz L(Exit14)
/* Bit 12: thirteenth byte of the chunk.  */
491 lea -4(%edi), %eax
492 RETURN
493
/* L(ExitN): the last match is the N-th byte of the chunk that ends at
   EDI, so the result is EDI - 16 + (N - 1).  The cases N = 1, 5, 9 and
   13 are handled inline in the L(match_exit) family.  */
494 .p2align 4
495 L(Exit2):
496 lea -15(%edi), %eax
497 RETURN
498
499 .p2align 4
500 L(Exit3):
501 lea -14(%edi), %eax
502 RETURN
503
504 .p2align 4
505 L(Exit4):
506 lea -13(%edi), %eax
507 RETURN
508
509 .p2align 4
510 L(Exit6):
511 lea -11(%edi), %eax
512 RETURN
513
514 .p2align 4
515 L(Exit7):
516 lea -10(%edi), %eax
517 RETURN
518
519 .p2align 4
520 L(Exit8):
521 lea -9(%edi), %eax
522 RETURN
523
524 .p2align 4
525 L(Exit10):
526 lea -7(%edi), %eax
527 RETURN
528
529 .p2align 4
530 L(Exit11):
531 lea -6(%edi), %eax
532 RETURN
533
534 .p2align 4
535 L(Exit12):
536 lea -5(%edi), %eax
537 RETURN
538
539 .p2align 4
540 L(Exit14):
541 lea -3(%edi), %eax
542 RETURN
543
544 .p2align 4
545 L(Exit15):
546 lea -2(%edi), %eax
547 RETURN
548
549 .p2align 4
550 L(Exit16):
551 lea -1(%edi), %eax
552 RETURN
553
554 /* Return NULL.  Reached from the first-chunk code when the chunk
   holds a NUL but no match; only %edi is on the stack here. */
555 .p2align 4
556 L(return_null):
557 xor %eax, %eax
558 RETURN
559
560 .p2align 4
/* First-chunk counterpart of L(find_zero): the very first chunk holds
   both a match and a NUL.  Only %edi is on the stack and there is no
   earlier match to fall back on, so an empty trimmed mask returns NULL
   directly.

   L(prolog_find_zero) is entered from the L(crosscache) path with
   EDX = shifted NUL mask, ECX = number of leading bytes skipped and
   EDI = aligned chunk address + 16.  Adding ECX makes EDI the string
   address + 16, which is what L(match_exit) needs for the shifted match
   mask in EAX.  */
561 L(prolog_find_zero):
562 add %ecx, %edi
563 mov %edx, %ecx
/* Entered directly from L(unaligned_match1) with ECX = NUL mask,
   EAX = match mask and EDI = string address + 16.  From here the lowest
   set bit of ECX is located a nibble at a time and EAX is cut down to
   the bits at or below it.  In the constants gas evaluates `1 << N - 1'
   as (1 << N) - 1.  */
564 L(prolog_find_zero_1):
565 test %cl, %cl
566 jz L(prolog_find_zero_high)
567 mov %cl, %dl
568 and $15, %dl
569 jz L(prolog_find_zero_8)
570 test $0x01, %cl
571 jnz L(PrologFindZeroExit1)
572 test $0x02, %cl
573 jnz L(PrologFindZeroExit2)
574 test $0x04, %cl
575 jnz L(PrologFindZeroExit3)
/* First NUL is byte 3: keep match bits 0-3.  */
576 and $1 << 4 - 1, %eax
577 jnz L(match_exit)
578 xor %eax, %eax
579 RETURN
580
581 .p2align 4
/* First NUL is in bytes 4-7.  */
582 L(prolog_find_zero_8):
583 test $0x10, %cl
584 jnz L(PrologFindZeroExit5)
585 test $0x20, %cl
586 jnz L(PrologFindZeroExit6)
587 test $0x40, %cl
588 jnz L(PrologFindZeroExit7)
/* First NUL is byte 7: keep match bits 0-7.  */
589 and $1 << 8 - 1, %eax
590 jnz L(match_exit)
591 xor %eax, %eax
592 RETURN
593
594 .p2align 4
/* First NUL is in bytes 8-15 (low byte of the mask was zero).  */
595 L(prolog_find_zero_high):
596 mov %ch, %dh
597 and $15, %dh
598 jz L(prolog_find_zero_high_8)
599 test $0x01, %ch
600 jnz L(PrologFindZeroExit9)
601 test $0x02, %ch
602 jnz L(PrologFindZeroExit10)
603 test $0x04, %ch
604 jnz L(PrologFindZeroExit11)
/* First NUL is byte 11: keep match bits 0-11.  */
605 and $1 << 12 - 1, %eax
606 jnz L(match_exit)
607 xor %eax, %eax
608 RETURN
609
610 .p2align 4
/* First NUL is in bytes 12-15.  */
611 L(prolog_find_zero_high_8):
612 test $0x10, %ch
613 jnz L(PrologFindZeroExit13)
614 test $0x20, %ch
615 jnz L(PrologFindZeroExit14)
616 test $0x40, %ch
617 jnz L(PrologFindZeroExit15)
/* First NUL is byte 15: keep match bits 0-15.  */
618 and $1 << 16 - 1, %eax
619 jnz L(match_exit)
620 xor %eax, %eax
621 RETURN
622
/* L(PrologFindZeroExitN): the first NUL of the first chunk is its N-th
   byte.  Keep the low N bits of the match mask in EAX (gas evaluates
   `1 << N - 1' as (1 << N) - 1).  If any bit is left, L(match_exit)
   turns the highest one into the result; otherwise return NULL.  Only
   %edi is on the stack on these paths.  */
623 .p2align 4
624 L(PrologFindZeroExit1):
625 and $1, %eax
626 jnz L(match_exit)
627 xor %eax, %eax
628 RETURN
629
630 .p2align 4
631 L(PrologFindZeroExit2):
632 and $1 << 2 - 1, %eax
633 jnz L(match_exit)
634 xor %eax, %eax
635 RETURN
636
637 .p2align 4
638 L(PrologFindZeroExit3):
639 and $1 << 3 - 1, %eax
640 jnz L(match_exit)
641 xor %eax, %eax
642 RETURN
643
644 .p2align 4
645 L(PrologFindZeroExit5):
646 and $1 << 5 - 1, %eax
647 jnz L(match_exit)
648 xor %eax, %eax
649 RETURN
650
651 .p2align 4
652 L(PrologFindZeroExit6):
653 and $1 << 6 - 1, %eax
654 jnz L(match_exit)
655 xor %eax, %eax
656 RETURN
657
658 .p2align 4
659 L(PrologFindZeroExit7):
660 and $1 << 7 - 1, %eax
661 jnz L(match_exit)
662 xor %eax, %eax
663 RETURN
664
665 .p2align 4
666 L(PrologFindZeroExit9):
667 and $1 << 9 - 1, %eax
668 jnz L(match_exit)
669 xor %eax, %eax
670 RETURN
671
672 .p2align 4
673 L(PrologFindZeroExit10):
674 and $1 << 10 - 1, %eax
675 jnz L(match_exit)
676 xor %eax, %eax
677 RETURN
678
679 .p2align 4
680 L(PrologFindZeroExit11):
681 and $1 << 11 - 1, %eax
682 jnz L(match_exit)
683 xor %eax, %eax
684 RETURN
685
686 .p2align 4
687 L(PrologFindZeroExit13):
688 and $1 << 13 - 1, %eax
689 jnz L(match_exit)
690 xor %eax, %eax
691 RETURN
692
693 .p2align 4
694 L(PrologFindZeroExit14):
695 and $1 << 14 - 1, %eax
696 jnz L(match_exit)
697 xor %eax, %eax
698 RETURN
699
700 .p2align 4
701 L(PrologFindZeroExit15):
702 and $1 << 15 - 1, %eax
703 jnz L(match_exit)
704 xor %eax, %eax
705 RETURN
706
707 END (__strrchr_sse2)
708 #endif