git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memrchr-sse2.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memrchr-sse2.S
1 /* Optimized memrchr with sse2 without bsf
2 Copyright (C) 2011-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23 # define CFI_PUSH(REG) \
24 cfi_adjust_cfa_offset (4); \
25 cfi_rel_offset (REG, 0) /* CFI_PUSH: unwind annotation that follows a 4-byte push of REG: CFA offset +4, REG recorded at offset 0.  The cfi_* helpers are not defined in this file.  */
26 /* CFI_POP: the inverse annotation after popping REG: CFA offset -4 and REG marked as restored.  */
27 # define CFI_POP(REG) \
28 cfi_adjust_cfa_offset (-4); \
29 cfi_restore (REG)
30 /* PUSH/POP: pushl/popl of REG followed by the matching CFI annotation.  */
31 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
32 # define POP(REG) popl REG; CFI_POP (REG)
33 /* Displacements of the stack arguments from %esp at function entry; used as STR1(%esp), STR2(%esp) and LEN(%esp).  */
34 # define PARMS 4 /* first argument is 4 bytes above %esp; the slot at 0 holds the return address */
35 # define STR1 PARMS /* 4: pointer to the buffer */
36 # define STR2 STR1+4 /* 8: byte value to search for */
37 # define LEN STR2+4 /* 12: buffer length in bytes */
38 /* __memrchr_sse2: scan a buffer backwards for a byte value (memrchr).
   Stack arguments: STR1 = buffer pointer, STR2 = byte value, LEN = length.
   Result in %eax: address of the highest-addressed matching byte, or 0
   when no byte inside the buffer matches.
   Register roles on the main path: %ecx = address of the 16/64-byte group
   being examined, %edx = byte-count bookkeeping relative to the buffer
   start, %xmm1 = search byte replicated into all 16 lanes, %eax = match
   mask from pmovmskb (bit i set when byte i of the block matched).  */
39 atom_text_section
40 ENTRY (__memrchr_sse2)
41 mov STR1(%esp), %ecx /* ecx = buffer start */
42 movd STR2(%esp), %xmm1 /* low dword of xmm1 = byte to find */
43 mov LEN(%esp), %edx /* edx = length */
44 /* Lengths of 16 or less (including 0) take the short path.  */
45 sub $16, %edx /* edx = length - 16 */
46 jbe L(length_less16)
47 /* Replicate the byte across xmm1 (two punpcklbw plus pshufd) while ecx is moved to the last 16 bytes of the buffer.  */
48 punpcklbw %xmm1, %xmm1
49 add %edx, %ecx /* ecx = start + length - 16 */
50 punpcklbw %xmm1, %xmm1
51 /* Unaligned compare of the final 16 bytes.  */
52 movdqu (%ecx), %xmm0
53 pshufd $0, %xmm1, %xmm1 /* byte now fills all 16 lanes */
54 pcmpeqb %xmm1, %xmm0
55
56 pmovmskb %xmm0, %eax
57 test %eax, %eax
58 jnz L(exit_dispatch) /* match in the tail block; ecx points at it */
59 /* Step back 64 bytes.  From here edx = (ecx + 64) - buffer start.  */
60 sub $64, %ecx
61 mov %ecx, %eax
62 and $15, %eax /* eax = ecx mod 16 */
63 jz L(loop_prolog)
64 /* Not 16-byte aligned: round ecx up to the next 16-byte boundary and raise edx by the same amount (16 - eax), so the aligned blocks overlap part of the tail already checked.  */
65 lea 16(%ecx), %ecx
66 lea 16(%edx), %edx
67 sub %eax, %edx
68 and $-16, %ecx
69 /* 16-byte-at-a-time prolog: up to two 64-byte groups are checked, highest block first, before switching to the 64-byte-aligned loop.  */
70 .p2align 4
71 /* Loop start on aligned string: ecx is 16-byte aligned here and edx = (ecx + 64) - buffer start.  */
72 L(loop_prolog):
73 sub $64, %edx /* edx = ecx - buffer start */
74 jbe L(exit_loop) /* 64 bytes or fewer left: this group reaches the buffer start */
75 /* Blocks at +48, +32, +16, +0; the first non-zero mask contains the last match.  */
76 movdqa 48(%ecx), %xmm0
77 pcmpeqb %xmm1, %xmm0
78 pmovmskb %xmm0, %eax
79 test %eax, %eax
80 jnz L(matches48)
81
82 movdqa 32(%ecx), %xmm2
83 pcmpeqb %xmm1, %xmm2
84 pmovmskb %xmm2, %eax
85 test %eax, %eax
86 jnz L(matches32)
87
88 movdqa 16(%ecx), %xmm3
89 pcmpeqb %xmm1, %xmm3
90 pmovmskb %xmm3, %eax
91 test %eax, %eax
92 jnz L(matches16)
93
94 movdqa (%ecx), %xmm4
95 pcmpeqb %xmm1, %xmm4
96 pmovmskb %xmm4, %eax
97 test %eax, %eax
98 jnz L(exit_dispatch) /* match in block +0: ecx already points at it */
99 /* Second 64-byte group, same pattern.  */
100 sub $64, %ecx
101 sub $64, %edx
102 jbe L(exit_loop)
103
104 movdqa 48(%ecx), %xmm0
105 pcmpeqb %xmm1, %xmm0
106 pmovmskb %xmm0, %eax
107 test %eax, %eax
108 jnz L(matches48)
109
110 movdqa 32(%ecx), %xmm2
111 pcmpeqb %xmm1, %xmm2
112 pmovmskb %xmm2, %eax
113 test %eax, %eax
114 jnz L(matches32)
115
116 movdqa 16(%ecx), %xmm3
117 pcmpeqb %xmm1, %xmm3
118 pmovmskb %xmm3, %eax
119 test %eax, %eax
120 jnz L(matches16)
121
122 movdqa (%ecx), %xmm3
123 pcmpeqb %xmm1, %xmm3
124 pmovmskb %xmm3, %eax
125 test %eax, %eax
126 jnz L(exit_dispatch)
127 /* No match so far: make ecx 64-byte aligned for the main loop.  */
128 mov %ecx, %eax
129 and $63, %eax /* eax = ecx mod 64 */
130 test %eax, %eax
131 jz L(align64_loop)
132 /* Round ecx up to a 64-byte boundary and raise edx by the same amount (64 - eax); the loop's first sub $64 then lands on the boundary below the old ecx, so some bytes are scanned twice.  */
133 lea 64(%ecx), %ecx
134 lea 64(%edx), %edx
135 and $-64, %ecx
136 sub %eax, %edx
137 /* Main loop: one 64-byte-aligned group per iteration, moving towards lower addresses.  */
138 .p2align 4
139 L(align64_loop):
140 sub $64, %ecx
141 sub $64, %edx /* edx = ecx - buffer start */
142 jbe L(exit_loop) /* group reaches or passes the buffer start */
143
144 movdqa (%ecx), %xmm0
145 movdqa 16(%ecx), %xmm2
146 movdqa 32(%ecx), %xmm3
147 movdqa 48(%ecx), %xmm4
148
149 pcmpeqb %xmm1, %xmm0
150 pcmpeqb %xmm1, %xmm2
151 pcmpeqb %xmm1, %xmm3
152 pcmpeqb %xmm1, %xmm4
153 /* Compare results are 0x00/0xFF per byte, so pmaxub merges the four results into xmm2 (xmm0 and xmm2 are overwritten; xmm3 and xmm4 stay intact).  */
154 pmaxub %xmm3, %xmm0
155 pmaxub %xmm4, %xmm2
156 pmaxub %xmm0, %xmm2
157 pmovmskb %xmm2, %eax
158
159 test %eax, %eax
160 jz L(align64_loop) /* no match anywhere in the 64 bytes */
161 /* A match exists: find the highest block that holds one.  */
162 pmovmskb %xmm4, %eax
163 test %eax, %eax
164 jnz L(matches48)
165
166 pmovmskb %xmm3, %eax
167 test %eax, %eax
168 jnz L(matches32)
169 /* Results for blocks +16 and +0 were clobbered by the merge; recompute them.  */
170 movdqa 16(%ecx), %xmm2
171
172 pcmpeqb %xmm1, %xmm2
173 pcmpeqb (%ecx), %xmm1 /* overwrites the replicated byte; every path from here returns */
174
175 pmovmskb %xmm2, %eax
176 test %eax, %eax
177 jnz L(matches16)
178 /* The match is in block +0: decode the highest set bit of the mask.  */
179 pmovmskb %xmm1, %eax
180 test %ah, %ah /* any of bytes 8..15 */
181 jnz L(exit_dispatch_high)
182 mov %al, %dl /* dl is scratch here */
183 and $15 << 4, %dl /* any of bytes 4..7 */
184 jnz L(exit_dispatch_8)
185 test $0x08, %al /* byte 3 */
186 jnz L(exit_4)
187 test $0x04, %al /* byte 2 */
188 jnz L(exit_3)
189 test $0x02, %al /* byte 1 */
190 jnz L(exit_2)
191 mov %ecx, %eax /* only bit 0 left: byte 0 */
192 ret
193 /* Final group [ecx, ecx+64): only its upper edx bytes belong to the buffer.  Matches in blocks that may straddle the buffer start go to the bounds-checked matchesN_1 targets.  */
194 .p2align 4
195 L(exit_loop):
196 add $64, %edx /* undo the last sub $64: edx = (ecx + 64) - buffer start, at most 64 */
197 cmp $32, %edx
198 jbe L(exit_loop_32)
199 /* More than 32 bytes remain: blocks +48 and +32 lie fully inside the buffer.  */
200 movdqa 48(%ecx), %xmm0
201 pcmpeqb %xmm1, %xmm0
202 pmovmskb %xmm0, %eax
203 test %eax, %eax
204 jnz L(matches48)
205
206 movdqa 32(%ecx), %xmm2
207 pcmpeqb %xmm1, %xmm2
208 pmovmskb %xmm2, %eax
209 test %eax, %eax
210 jnz L(matches32)
211
212 movdqa 16(%ecx), %xmm3
213 pcmpeqb %xmm1, %xmm3
214 pmovmskb %xmm3, %eax
215 test %eax, %eax
216 jnz L(matches16_1) /* block +16 may begin before the buffer: checked exit */
217 cmp $48, %edx
218 jbe L(return_null) /* buffer starts inside block +16: nothing below to scan */
219
220 pcmpeqb (%ecx), %xmm1 /* clobbers the replicated byte; every path from here returns */
221 pmovmskb %xmm1, %eax
222 test %eax, %eax
223 jnz L(matches0_1)
224 xor %eax, %eax /* not found */
225 ret
226 /* At most 32 bytes remain: only blocks +48 and +32 can hold buffer bytes.  */
227 .p2align 4
228 L(exit_loop_32):
229 movdqa 48(%ecx), %xmm0
230 pcmpeqb %xmm1, %xmm0
231 pmovmskb %xmm0, %eax
232 test %eax, %eax
233 jnz L(matches48_1)
234 cmp $16, %edx
235 jbe L(return_null) /* buffer starts inside block +48 */
236
237 pcmpeqb 32(%ecx), %xmm1 /* clobbers the replicated byte; every path from here returns */
238 pmovmskb %xmm1, %eax
239 test %eax, %eax
240 jnz L(matches32_1)
241 xor %eax, %eax /* not found */
242 ret
243 /* matches16/32/48: eax holds a non-zero match mask for the block at ecx+16/32/48, which lies fully inside the buffer.  Each entry points ecx at that block and decodes the highest set bit; the decode sequence is repeated in place in matches16 and matches32, while matches48 falls through to exit_dispatch.  */
244 .p2align 4
245 L(matches16):
246 lea 16(%ecx), %ecx /* ecx = address of the matching block */
247 test %ah, %ah /* any of bytes 8..15 */
248 jnz L(exit_dispatch_high)
249 mov %al, %dl /* dl is scratch */
250 and $15 << 4, %dl /* any of bytes 4..7 */
251 jnz L(exit_dispatch_8)
252 test $0x08, %al /* byte 3 */
253 jnz L(exit_4)
254 test $0x04, %al /* byte 2 */
255 jnz L(exit_3)
256 test $0x02, %al /* byte 1 */
257 jnz L(exit_2)
258 mov %ecx, %eax /* byte 0 */
259 ret
260
261 .p2align 4
262 L(matches32):
263 lea 32(%ecx), %ecx /* ecx = address of the matching block */
264 test %ah, %ah
265 jnz L(exit_dispatch_high)
266 mov %al, %dl
267 and $15 << 4, %dl
268 jnz L(exit_dispatch_8)
269 test $0x08, %al
270 jnz L(exit_4)
271 test $0x04, %al
272 jnz L(exit_3)
273 test $0x02, %al
274 jnz L(exit_2)
275 mov %ecx, %eax
276 ret
277
278 .p2align 4
279 L(matches48):
280 lea 48(%ecx), %ecx /* ecx = address of the matching block; falls through */
281 /* exit_dispatch: ecx = block address, eax = non-zero 16-bit match mask.  Returns ecx + index of the highest set bit.  */
282 .p2align 4
283 L(exit_dispatch):
284 test %ah, %ah /* any of bytes 8..15 */
285 jnz L(exit_dispatch_high)
286 mov %al, %dl /* dl is scratch */
287 and $15 << 4, %dl /* any of bytes 4..7 */
288 jnz L(exit_dispatch_8)
289 test $0x08, %al /* byte 3 */
290 jnz L(exit_4)
291 test $0x04, %al /* byte 2 */
292 jnz L(exit_3)
293 test $0x02, %al /* byte 1 */
294 jnz L(exit_2)
295 mov %ecx, %eax /* byte 0 */
296 ret
297 /* Unchecked return stubs.  ecx = block address, eax = match mask; exit_N returns the address of the N-th byte of the block (ecx + N - 1).  */
298 .p2align 4
299 L(exit_dispatch_8):
300 test $0x80, %al /* reached when one of mask bits 4..7 is set; pick the highest */
301 jnz L(exit_8)
302 test $0x40, %al
303 jnz L(exit_7)
304 test $0x20, %al
305 jnz L(exit_6)
306 lea 4(%ecx), %eax /* only bit 4 remains */
307 ret
308 /* Reached when ah != 0, i.e. a match among bytes 8..15.  */
309 .p2align 4
310 L(exit_dispatch_high):
311 mov %ah, %dh /* dh is scratch */
312 and $15 << 4, %dh /* any of bytes 12..15 */
313 jnz L(exit_dispatch_high_8)
314 test $0x08, %ah /* byte 11 */
315 jnz L(exit_12)
316 test $0x04, %ah /* byte 10 */
317 jnz L(exit_11)
318 test $0x02, %ah /* byte 9 */
319 jnz L(exit_10)
320 lea 8(%ecx), %eax /* only bit 8 remains */
321 ret
322 /* Reached when one of mask bits 12..15 is set.  */
323 .p2align 4
324 L(exit_dispatch_high_8):
325 test $0x80, %ah /* byte 15 */
326 jnz L(exit_16)
327 test $0x40, %ah /* byte 14 */
328 jnz L(exit_15)
329 test $0x20, %ah /* byte 13 */
330 jnz L(exit_14)
331 lea 12(%ecx), %eax /* only bit 12 remains */
332 ret
333 /* One stub per remaining byte position, each on its own 16-byte boundary.  */
334 .p2align 4
335 L(exit_2):
336 lea 1(%ecx), %eax
337 ret
338
339 .p2align 4
340 L(exit_3):
341 lea 2(%ecx), %eax
342 ret
343
344 .p2align 4
345 L(exit_4):
346 lea 3(%ecx), %eax
347 ret
348
349 .p2align 4
350 L(exit_6):
351 lea 5(%ecx), %eax
352 ret
353
354 .p2align 4
355 L(exit_7):
356 lea 6(%ecx), %eax
357 ret
358
359 .p2align 4
360 L(exit_8):
361 lea 7(%ecx), %eax
362 ret
363
364 .p2align 4
365 L(exit_10):
366 lea 9(%ecx), %eax
367 ret
368
369 .p2align 4
370 L(exit_11):
371 lea 10(%ecx), %eax
372 ret
373
374 .p2align 4
375 L(exit_12):
376 lea 11(%ecx), %eax
377 ret
378
379 .p2align 4
380 L(exit_14):
381 lea 13(%ecx), %eax
382 ret
383
384 .p2align 4
385 L(exit_15):
386 lea 14(%ecx), %eax
387 ret
388
389 .p2align 4
390 L(exit_16):
391 lea 15(%ecx), %eax
392 ret
393 /* Bounds-checked variants for the final group.  On entry edx = (ecx + 64) - buffer start and eax = non-zero mask of the block at ecx+0/16/32/48.  Each entry rebases edx to (block address - buffer start), which may be negative, then decodes the highest set bit; a match whose offset plus edx is negative lies before the buffer and yields 0.  */
394 .p2align 4
395 L(matches0_1):
396 lea -64(%edx), %edx /* edx = ecx - buffer start */
397
398 test %ah, %ah /* any of bytes 8..15 */
399 jnz L(exit_dispatch_1_high)
400 mov %al, %ah /* ah is zero here, so it serves as scratch (edx is live) */
401 and $15 << 4, %ah /* any of bytes 4..7 */
402 jnz L(exit_dispatch_1_8)
403 test $0x08, %al /* byte 3 */
404 jnz L(exit_1_4)
405 test $0x04, %al /* byte 2 */
406 jnz L(exit_1_3)
407 test $0x02, %al /* byte 1 */
408 jnz L(exit_1_2)
409 add $0, %edx /* byte 0: sets the flags from edx without changing it */
410 jl L(return_null) /* block starts before the buffer */
411 mov %ecx, %eax
412 ret
413
414 .p2align 4
415 L(matches16_1):
416 lea -48(%edx), %edx /* edx = (ecx + 16) - buffer start */
417 lea 16(%ecx), %ecx /* ecx = address of the matching block */
418
419 test %ah, %ah
420 jnz L(exit_dispatch_1_high)
421 mov %al, %ah
422 and $15 << 4, %ah
423 jnz L(exit_dispatch_1_8)
424 test $0x08, %al
425 jnz L(exit_1_4)
426 test $0x04, %al
427 jnz L(exit_1_3)
428 test $0x02, %al
429 jnz L(exit_1_2)
430 add $0, %edx
431 jl L(return_null)
432 mov %ecx, %eax
433 ret
434
435 .p2align 4
436 L(matches32_1):
437 lea -32(%edx), %edx /* edx = (ecx + 32) - buffer start */
438 lea 32(%ecx), %ecx /* ecx = address of the matching block */
439
440 test %ah, %ah
441 jnz L(exit_dispatch_1_high)
442 mov %al, %ah
443 and $15 << 4, %ah
444 jnz L(exit_dispatch_1_8)
445 test $0x08, %al
446 jnz L(exit_1_4)
447 test $0x04, %al
448 jnz L(exit_1_3)
449 test $0x02, %al
450 jnz L(exit_1_2)
451 add $0, %edx
452 jl L(return_null)
453 mov %ecx, %eax
454 ret
455
456 .p2align 4
457 L(matches48_1):
458 lea -16(%edx), %edx /* edx = (ecx + 48) - buffer start */
459 lea 48(%ecx), %ecx /* ecx = address of the matching block; falls through */
460 /* exit_dispatch_1: ecx = block address, edx = block address - buffer start, eax = non-zero mask.  No jump to this label is visible in this file; it is reached by fall-through.  */
461 .p2align 4
462 L(exit_dispatch_1):
463 test %ah, %ah /* any of bytes 8..15 */
464 jnz L(exit_dispatch_1_high)
465 mov %al, %ah /* ah is zero here, so it serves as scratch */
466 and $15 << 4, %ah /* any of bytes 4..7 */
467 jnz L(exit_dispatch_1_8)
468 test $0x08, %al /* byte 3 */
469 jnz L(exit_1_4)
470 test $0x04, %al /* byte 2 */
471 jnz L(exit_1_3)
472 test $0x02, %al /* byte 1 */
473 jnz L(exit_1_2)
474 add $0, %edx /* byte 0: flags from edx */
475 jl L(return_null)
476 mov %ecx, %eax
477 ret
478 /* Bounds-checked return stubs.  ecx = block address, edx = block address - buffer start (possibly negative), eax = match mask.  exit_1_N adds the byte offset N - 1 to edx; a negative sum means the match precedes the buffer and 0 is returned, otherwise ecx + N - 1.  */
479 .p2align 4
480 L(exit_dispatch_1_8):
481 test $0x80, %al /* reached when one of mask bits 4..7 is set; pick the highest */
482 jnz L(exit_1_8)
483 test $0x40, %al
484 jnz L(exit_1_7)
485 test $0x20, %al
486 jnz L(exit_1_6)
487 add $4, %edx /* only bit 4 remains */
488 jl L(return_null)
489 lea 4(%ecx), %eax
490 ret
491 /* Reached when ah != 0, i.e. a match among bytes 8..15.  */
492 .p2align 4
493 L(exit_dispatch_1_high):
494 mov %ah, %al /* al becomes scratch; the low mask byte is no longer needed */
495 and $15 << 4, %al /* any of bytes 12..15 */
496 jnz L(exit_dispatch_1_high_8)
497 test $0x08, %ah /* byte 11 */
498 jnz L(exit_1_12)
499 test $0x04, %ah /* byte 10 */
500 jnz L(exit_1_11)
501 test $0x02, %ah /* byte 9 */
502 jnz L(exit_1_10)
503 add $8, %edx /* only bit 8 remains */
504 jl L(return_null)
505 lea 8(%ecx), %eax
506 ret
507 /* Reached when one of mask bits 12..15 is set.  */
508 .p2align 4
509 L(exit_dispatch_1_high_8):
510 test $0x80, %ah /* byte 15 */
511 jnz L(exit_1_16)
512 test $0x40, %ah /* byte 14 */
513 jnz L(exit_1_15)
514 test $0x20, %ah /* byte 13 */
515 jnz L(exit_1_14)
516 add $12, %edx /* only bit 12 remains */
517 jl L(return_null)
518 lea 12(%ecx), %eax
519 ret
520 /* One checked stub per remaining byte position.  */
521 .p2align 4
522 L(exit_1_2):
523 add $1, %edx
524 jl L(return_null)
525 lea 1(%ecx), %eax
526 ret
527
528 .p2align 4
529 L(exit_1_3):
530 add $2, %edx
531 jl L(return_null)
532 lea 2(%ecx), %eax
533 ret
534
535 .p2align 4
536 L(exit_1_4):
537 add $3, %edx
538 jl L(return_null)
539 lea 3(%ecx), %eax
540 ret
541
542 .p2align 4
543 L(exit_1_6):
544 add $5, %edx
545 jl L(return_null)
546 lea 5(%ecx), %eax
547 ret
548
549 .p2align 4
550 L(exit_1_7):
551 add $6, %edx
552 jl L(return_null)
553 lea 6(%ecx), %eax
554 ret
555
556 .p2align 4
557 L(exit_1_8):
558 add $7, %edx
559 jl L(return_null)
560 lea 7(%ecx), %eax
561 ret
562
563 .p2align 4
564 L(exit_1_10):
565 add $9, %edx
566 jl L(return_null)
567 lea 9(%ecx), %eax
568 ret
569
570 .p2align 4
571 L(exit_1_11):
572 add $10, %edx
573 jl L(return_null)
574 lea 10(%ecx), %eax
575 ret
576
577 .p2align 4
578 L(exit_1_12):
579 add $11, %edx
580 jl L(return_null)
581 lea 11(%ecx), %eax
582 ret
583
584 .p2align 4
585 L(exit_1_14):
586 add $13, %edx
587 jl L(return_null)
588 lea 13(%ecx), %eax
589 ret
590
591 .p2align 4
592 L(exit_1_15):
593 add $14, %edx
594 jl L(return_null)
595 lea 14(%ecx), %eax
596 ret
597
598 .p2align 4
599 L(exit_1_16):
600 add $15, %edx
601 jl L(return_null)
602 lea 15(%ecx), %eax
603 ret
604 /* Not-found return for paths that have nothing pushed on the stack.  */
605 .p2align 4
606 L(return_null):
607 xor %eax, %eax
608 ret
609 /* Short buffer (length 1..16) that starts on a 16-byte boundary.  On entry eax = buffer start, dl = length, xmm1 = replicated search byte.  */
610 .p2align 4
611 L(length_less16_offset0):
612 mov %dl, %cl /* cl = length, used as the shift count below */
613 pcmpeqb (%eax), %xmm1 /* compare the aligned 16 bytes at the buffer start */
614
615 mov $1, %edx
616 sal %cl, %edx
617 sub $1, %edx /* edx = (1 << length) - 1: mask of the bytes inside the buffer */
618
619 mov %eax, %ecx /* ecx = block address, as exit_dispatch expects */
620 pmovmskb %xmm1, %eax
621
622 and %edx, %eax /* drop matches past the end of the buffer */
623 test %eax, %eax
624 jnz L(exit_dispatch)
625
626 xor %eax, %eax /* not found */
627 ret
628 /* Short path, taken when length <= 16.  On entry ecx = buffer start, edx = length - 16, low dword of xmm1 = search byte (not yet replicated).  */
629 .p2align 4
630 L(length_less16):
631 punpcklbw %xmm1, %xmm1
632 add $16, %edx /* edx = length */
633 je L(return_null) /* zero length: nothing to search */
634 punpcklbw %xmm1, %xmm1
635
636 mov %ecx, %eax /* eax = buffer start */
637 pshufd $0, %xmm1, %xmm1 /* search byte now fills all 16 lanes */
638
639 and $15, %ecx /* ecx = misalignment of the start within its 16-byte block */
640 jz L(length_less16_offset0)
641 /* Misaligned start: edi is used as scratch, so it is saved here and popped before every return below.  */
642 PUSH (%edi)
643
644 mov %cl, %dh
645 add %dl, %dh /* dh = misalignment + length */
646 and $-16, %eax /* eax = aligned block containing the buffer start */
647
648 sub $16, %dh /* dh = number of buffer bytes that spill into the next aligned block */
649 ja L(length_less16_part2) /* positive: the buffer spans two aligned blocks */
650 /* The whole buffer lies inside one aligned block.  */
651 pcmpeqb (%eax), %xmm1
652 pmovmskb %xmm1, %edi
653
654 sar %cl, %edi /* shift out the bytes before the buffer: bit 0 = first buffer byte */
655 add %ecx, %eax /* eax = buffer start again */
656 mov %dl, %cl /* cl = length */
657
658 mov $1, %edx
659 sal %cl, %edx
660 sub $1, %edx /* edx = (1 << length) - 1 */
661
662 and %edx, %edi /* drop matches past the end of the buffer */
663 test %edi, %edi
664 jz L(ret_null)
665
666 bsr %edi, %edi /* index of the highest match */
667 add %edi, %eax
668 POP (%edi)
669 ret
670 /* The POP above changed the CFI state; restore the edi-pushed state for the code that follows.  */
671 CFI_PUSH (%edi)
672 /* Buffer spans two aligned blocks.  eax = first aligned block, cl = misalignment, dh = buffer bytes in the second block.  */
673 .p2align 4
674 L(length_less16_part2):
675 movdqa 16(%eax), %xmm2 /* second (higher) block is checked first */
676 pcmpeqb %xmm1, %xmm2
677 pmovmskb %xmm2, %edi
678
679 mov %cl, %ch /* keep the misalignment in ch while cl serves as shift count */
680
681 mov %dh, %cl
682 mov $1, %edx
683 sal %cl, %edx
684 sub $1, %edx /* edx = mask of the buffer bytes in the second block */
685
686 and %edx, %edi
687
688 test %edi, %edi
689 jnz L(length_less16_part2_return)
690 /* Nothing in the second block: check the first one, ignoring bytes before the buffer start.  */
691 pcmpeqb (%eax), %xmm1
692 pmovmskb %xmm1, %edi
693
694 mov %ch, %cl /* cl = misalignment */
695 sar %cl, %edi /* bit 0 = first buffer byte */
696 test %edi, %edi
697 jz L(ret_null)
698
699 bsr %edi, %edi /* index of the highest match relative to the buffer start */
700 add %edi, %eax
701 xor %ch, %ch /* ecx = misalignment only */
702 add %ecx, %eax /* eax = aligned block + misalignment + index */
703 POP (%edi)
704 ret
705 /* Restore the edi-pushed CFI state after the POP above.  */
706 CFI_PUSH (%edi)
707 /* Match found in the second block: edi = masked match bits, eax = first aligned block.  */
708 .p2align 4
709 L(length_less16_part2_return):
710 bsr %edi, %edi /* index of the highest match in the second block */
711 lea 16(%eax, %edi), %eax
712 POP (%edi)
713 ret
714 /* Restore the edi-pushed CFI state after the POP above.  */
715 CFI_PUSH (%edi)
716 /* Not-found return for the paths that pushed edi.  */
717 .p2align 4
718 L(ret_null):
719 xor %eax, %eax
720 POP (%edi)
721 ret
722
723 END (__memrchr_sse2)
724 #endif