]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memchr-sse2.S
81d218d4308cd59ffcf31f72fdde34c5e2df3dd1
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memchr-sse2.S
1 /* Optimized memchr with sse2 without bsf
2 Copyright (C) 2011-2019 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23
/* Call-frame (CFI) annotations that go with a 4-byte push of REG:
   adjust the CFA offset by +4 and record REG at relative offset 0.
   The cfi_* helpers are not defined in this file (presumably they come
   from <sysdep.h>).  */
24 # define CFI_PUSH(REG) \
25 cfi_adjust_cfa_offset (4); \
26 cfi_rel_offset (REG, 0)
27
/* Call-frame (CFI) annotations that go with a 4-byte pop of REG:
   adjust the CFA offset by -4 and mark REG as restored.  */
28 # define CFI_POP(REG) \
29 cfi_adjust_cfa_offset (-4); \
30 cfi_restore (REG)
31
/* Push / pop REG and immediately emit the matching CFI annotation.  */
32 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
33 # define POP(REG) popl REG; CFI_POP (REG)
34
/* Build-variant configuration.
   memchr build (USE_AS_RAWMEMCHR undefined): %edi is saved on entry and
   restored by RETURN, so the first stack argument is found at PARMS = 8
   bytes above %esp (4 bytes pushed by ENTRANCE plus, presumably, the
   4-byte return address).
   rawmemchr build: nothing is pushed, the first argument is at PARMS = 4,
   and no RETURN macro is defined (that variant uses a plain "ret").  */
35 # ifndef USE_AS_RAWMEMCHR
36 # define ENTRANCE PUSH(%edi);
37 # define PARMS 8
/* RETURN restores %edi and returns.  The CFI_PUSH placed after "ret"
   re-emits the "%edi is pushed" annotation, presumably so the unwind
   description stays correct for the code that textually follows.  */
38 # define RETURN POP(%edi); ret; CFI_PUSH(%edi);
39 # else
40 # define ENTRANCE
41 # define PARMS 4
42 # endif
43
/* %esp-relative offsets of the first and second stack arguments.  */
44 # define STR1 PARMS
45 # define STR2 STR1+4
46
/* %esp-relative offset of the third argument (length); memchr build only.  */
47 # ifndef USE_AS_RAWMEMCHR
48 # define LEN STR2+4
49 # endif
50
/* Symbol name of the function; can be overridden before this point.  */
51 # ifndef MEMCHR
52 # define MEMCHR __memchr_sse2
53 # endif
54
/* MEMCHR: locate a byte using SSE2 16-byte compares.
   Stack arguments (offsets STR1 / STR2 / LEN): start pointer, byte value,
   and - in the memchr build only - the length.
   Result in %eax: address of the matching byte, or 0 via L(return_null).
   Register roles after the setup below:
     %xmm1  search byte replicated into all 16 byte lanes
     memchr build:    %edi = current pointer, %edx = bytes left to search
     rawmemchr build: %edx = current pointer (there is no length)
   %ecx and %eax are scratch.  */
55 atom_text_section
56 ENTRY (MEMCHR)
57 ENTRANCE
58 mov STR1(%esp), %ecx /* ecx = start pointer */
59 movd STR2(%esp), %xmm1 /* low dword of xmm1 = byte to find */
60 # ifndef USE_AS_RAWMEMCHR
61 mov LEN(%esp), %edx /* edx = length */
62 test %edx, %edx
63 jz L(return_null) /* zero length: nothing to search */
64 # endif
65
/* Two punpcklbw steps plus pshufd $0 (below) replicate the low byte of
   xmm1 into all 16 lanes; the pointer copies are interleaved with them.  */
66 punpcklbw %xmm1, %xmm1
67 # ifndef USE_AS_RAWMEMCHR
68 mov %ecx, %edi
69 # else
70 mov %ecx, %edx
71 # endif
72 punpcklbw %xmm1, %xmm1
73
/* ecx = offset of the start pointer inside its 64-byte block.  If it is
   above 48, a 16-byte load from the start pointer would extend past that
   block, so take the aligned-load path in L(crosscache) instead.  */
74 and $63, %ecx
75 pshufd $0, %xmm1, %xmm1
76 cmp $48, %ecx
77 ja L(crosscache)
78
/* Unaligned probe of the first 16 bytes.  eax receives one mask bit per
   byte lane that equals the search byte.  */
79 # ifndef USE_AS_RAWMEMCHR
80 movdqu (%edi), %xmm0
81 # else
82 movdqu (%edx), %xmm0
83 # endif
84 pcmpeqb %xmm1, %xmm0
85 pmovmskb %xmm0, %eax
86 test %eax, %eax
87 # ifndef USE_AS_RAWMEMCHR
/* A hit here still has to be validated against the length, hence the
   length-checked decoder.  */
88 jnz L(match_case2_prolog)
89
/* No hit in the first 16 bytes: give up if the length was <= 16,
   otherwise move edi to the next 16-byte boundary and add the start
   misalignment (ecx & 15) back so edx counts bytes left from the new edi.  */
90 sub $16, %edx
91 jbe L(return_null)
92 lea 16(%edi), %edi
93 and $15, %ecx
94 and $-16, %edi
95 add %ecx, %edx
96 # else
97 jnz L(match_case1_prolog)
/* No hit: continue from the next 16-byte boundary.  */
98 lea 16(%edx), %edx
99 and $-16, %edx
100 # endif
101 jmp L(loop_prolog)
102
/* L(crosscache): the start pointer is in the last 15 bytes of its 64-byte
   block.  Load the enclosing aligned 16-byte block instead and shift the
   match mask right by the misalignment so that bit 0 corresponds to the
   start pointer and bytes in front of it are discarded.  */
103 .p2align 4
104 L(crosscache):
105 and $15, %ecx /* ecx = start pointer misalignment within 16 bytes */
106 # ifndef USE_AS_RAWMEMCHR
107 and $-16, %edi
108 movdqa (%edi), %xmm0
109 # else
110 and $-16, %edx
111 movdqa (%edx), %xmm0
112 # endif
113 pcmpeqb %xmm1, %xmm0
114 pmovmskb %xmm0, %eax
115 sar %cl, %eax /* drop mask bits for bytes before the start pointer */
116 test %eax, %eax
117
118 # ifndef USE_AS_RAWMEMCHR
/* On a hit, L(match_case2_prolog1) adds ecx back to edi (giving the
   original start pointer) and checks the hit against the length in edx.  */
119 jnz L(match_case2_prolog1)
120 /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using
121 "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
122 possible addition overflow. */
123 neg %ecx
124 add $16, %ecx
125 sub %ecx, %edx
126 jbe L(return_null) /* length ended inside this block */
127 lea 16(%edi), %edi
128 # else
/* On a hit, L(match_case1_prolog1) adds ecx back to edx first.  */
129 jnz L(match_case1_prolog1)
130 lea 16(%edx), %edx
131 # endif
132
/* L(loop_prolog): the pointer is 16-byte aligned here.  Up to two groups
   of four 16-byte blocks are tested one block at a time, with ecx holding
   the byte offset (0/16/32/48) of the block being tested so that
   L(match_case1) can add it to the pointer.
   memchr build: 64 is subtracted from edx before each group; if that
   leaves nothing (64 bytes or fewer remained) control goes to
   L(exit_loop), which performs length-checked tests instead.  */
133 .p2align 4
134 L(loop_prolog):
135 # ifndef USE_AS_RAWMEMCHR
136 sub $64, %edx
137 jbe L(exit_loop)
138 movdqa (%edi), %xmm0
139 # else
140 movdqa (%edx), %xmm0
141 # endif
142 pcmpeqb %xmm1, %xmm0
143 xor %ecx, %ecx /* block offset 0 */
144 pmovmskb %xmm0, %eax
145 test %eax, %eax
146 jnz L(match_case1)
147
148 # ifndef USE_AS_RAWMEMCHR
149 movdqa 16(%edi), %xmm2
150 # else
151 movdqa 16(%edx), %xmm2
152 # endif
153 pcmpeqb %xmm1, %xmm2
154 lea 16(%ecx), %ecx /* block offset 16 */
155 pmovmskb %xmm2, %eax
156 test %eax, %eax
157 jnz L(match_case1)
158
159 # ifndef USE_AS_RAWMEMCHR
160 movdqa 32(%edi), %xmm3
161 # else
162 movdqa 32(%edx), %xmm3
163 # endif
164 pcmpeqb %xmm1, %xmm3
165 lea 16(%ecx), %ecx /* block offset 32 */
166 pmovmskb %xmm3, %eax
167 test %eax, %eax
168 jnz L(match_case1)
169
170 # ifndef USE_AS_RAWMEMCHR
171 movdqa 48(%edi), %xmm4
172 # else
173 movdqa 48(%edx), %xmm4
174 # endif
175 pcmpeqb %xmm1, %xmm4
176 lea 16(%ecx), %ecx /* block offset 48 */
177 pmovmskb %xmm4, %eax
178 test %eax, %eax
179 jnz L(match_case1)
180
/* Second group of four blocks: advance the pointer by 64 and repeat.  */
181 # ifndef USE_AS_RAWMEMCHR
182 lea 64(%edi), %edi
183 sub $64, %edx
184 jbe L(exit_loop)
185
186 movdqa (%edi), %xmm0
187 # else
188 lea 64(%edx), %edx
189 movdqa (%edx), %xmm0
190 # endif
191 pcmpeqb %xmm1, %xmm0
192 xor %ecx, %ecx /* block offset 0 */
193 pmovmskb %xmm0, %eax
194 test %eax, %eax
195 jnz L(match_case1)
196
197 # ifndef USE_AS_RAWMEMCHR
198 movdqa 16(%edi), %xmm2
199 # else
200 movdqa 16(%edx), %xmm2
201 # endif
202 pcmpeqb %xmm1, %xmm2
203 lea 16(%ecx), %ecx /* block offset 16 */
204 pmovmskb %xmm2, %eax
205 test %eax, %eax
206 jnz L(match_case1)
207
208 # ifndef USE_AS_RAWMEMCHR
209 movdqa 32(%edi), %xmm3
210 # else
211 movdqa 32(%edx), %xmm3
212 # endif
213 pcmpeqb %xmm1, %xmm3
214 lea 16(%ecx), %ecx /* block offset 32 */
215 pmovmskb %xmm3, %eax
216 test %eax, %eax
217 jnz L(match_case1)
218
219 # ifndef USE_AS_RAWMEMCHR
220 movdqa 48(%edi), %xmm4
221 # else
222 movdqa 48(%edx), %xmm4
223 # endif
224 pcmpeqb %xmm1, %xmm4
225 lea 16(%ecx), %ecx /* block offset 48 */
226 pmovmskb %xmm4, %eax
227 test %eax, %eax
228 jnz L(match_case1)
229
/* Nothing found in either group.  Advance past the second group and
   round the pointer down to a 64-byte boundary for the main loop.
   memchr build: the bytes stepped back over by the rounding (ecx) are
   added to edx so it still counts bytes left from the new pointer.  */
230 # ifndef USE_AS_RAWMEMCHR
231 lea 64(%edi), %edi
232 mov %edi, %ecx
233 and $-64, %edi
234 and $63, %ecx
235 add %ecx, %edx
236 # else
237 lea 64(%edx), %edx
238 and $-64, %edx
239 # endif
240
/* L(align64_loop): main loop, one 64-byte-aligned chunk per iteration.
   The four 16-byte compare results are merged with pmaxub so that a
   single pmovmskb/test decides whether the chunk contains any hit.
   memchr build: leaves for L(exit_loop) once 64 bytes or fewer remain.  */
241 .p2align 4
242 L(align64_loop):
243
244 # ifndef USE_AS_RAWMEMCHR
245 sub $64, %edx
246 jbe L(exit_loop)
247 movdqa (%edi), %xmm0
248 movdqa 16(%edi), %xmm2
249 movdqa 32(%edi), %xmm3
250 movdqa 48(%edi), %xmm4
251 # else
252 movdqa (%edx), %xmm0
253 movdqa 16(%edx), %xmm2
254 movdqa 32(%edx), %xmm3
255 movdqa 48(%edx), %xmm4
256 # endif
257 pcmpeqb %xmm1, %xmm0
258 pcmpeqb %xmm1, %xmm2
259 pcmpeqb %xmm1, %xmm3
260 pcmpeqb %xmm1, %xmm4
261
/* xmm4 = byte-wise maximum of all four compare results; xmm3 and xmm4
   are overwritten here, xmm0 and xmm2 keep their individual results.  */
262 pmaxub %xmm0, %xmm3
263 pmaxub %xmm2, %xmm4
264 pmaxub %xmm3, %xmm4
265 # ifndef USE_AS_RAWMEMCHR
266 add $64, %edi
267 # else
268 add $64, %edx
269 # endif
270 pmovmskb %xmm4, %eax
271
272 test %eax, %eax
273 jz L(align64_loop)
274
/* A hit is somewhere in the chunk just tested: step the pointer back to
   that chunk and find which 16-byte block holds it, ecx = block offset.  */
275 # ifndef USE_AS_RAWMEMCHR
276 sub $64, %edi
277 # else
278 sub $64, %edx
279 # endif
280
281 pmovmskb %xmm0, %eax
282 xor %ecx, %ecx /* block offset 0 */
283 test %eax, %eax
284 jnz L(match_case1)
285
286 pmovmskb %xmm2, %eax
287 lea 16(%ecx), %ecx /* block offset 16 */
288 test %eax, %eax
289 jnz L(match_case1)
290
/* xmm3 was overwritten by pmaxub above, so block 2 is loaded and
   compared again.  */
291 # ifndef USE_AS_RAWMEMCHR
292 movdqa 32(%edi), %xmm3
293 # else
294 movdqa 32(%edx), %xmm3
295 # endif
296 pcmpeqb %xmm1, %xmm3
297 pmovmskb %xmm3, %eax
298 lea 16(%ecx), %ecx /* block offset 32 */
299 test %eax, %eax
300 jnz L(match_case1)
301
/* Last block: compare straight from memory into xmm1.  This destroys the
   replicated search byte, which is not used again on this path.  No test
   follows; execution falls through into L(match_case1).  */
302 # ifndef USE_AS_RAWMEMCHR
303 pcmpeqb 48(%edi), %xmm1
304 # else
305 pcmpeqb 48(%edx), %xmm1
306 # endif
307 pmovmskb %xmm1, %eax
308 lea 16(%ecx), %ecx /* block offset 48 */
309
/* L(match_case1): turn a non-zero 16-bit match mask in eax into the
   address of the lowest matching byte, without a length check.
   In: eax = mask, ecx = byte offset of the matching 16-byte block,
   pointer register (edi for memchr, edx for rawmemchr) = group base.
   The lowest set bit is located with a branch tree (no bsf): low byte vs
   high byte of the mask, then low nibble vs high nibble, then one test
   per bit; the last bit of each nibble is handled by fall-through.  */
310 .p2align 4
311 L(match_case1):
312 # ifndef USE_AS_RAWMEMCHR
313 add %ecx, %edi
314 # else
/* rawmemchr entry points: _prolog1 still needs ecx added to the pointer,
   _prolog arrives with the pointer already at the block.  */
315 L(match_case1_prolog1):
316 add %ecx, %edx
317 L(match_case1_prolog):
318 # endif
319 test %al, %al
320 jz L(match_case1_high) /* no hit in bytes 0-7 */
321 mov %al, %cl
322 and $15, %cl
323 jz L(match_case1_8) /* no hit in bytes 0-3 */
324 test $0x01, %al
325 jnz L(ExitCase1_1)
326 test $0x02, %al
327 jnz L(ExitCase1_2)
328 test $0x04, %al
329 jnz L(ExitCase1_3)
/* Only bit 3 can be set: the hit is at byte 3.  */
330 # ifndef USE_AS_RAWMEMCHR
331 lea 3(%edi), %eax
332 RETURN
333 # else
334 lea 3(%edx), %eax
335 ret
336 # endif
337
/* Hit is in bytes 4-7.  */
338 .p2align 4
339 L(match_case1_8):
340 test $0x10, %al
341 jnz L(ExitCase1_5)
342 test $0x20, %al
343 jnz L(ExitCase1_6)
344 test $0x40, %al
345 jnz L(ExitCase1_7)
346 # ifndef USE_AS_RAWMEMCHR
347 lea 7(%edi), %eax
348 RETURN
349 # else
350 lea 7(%edx), %eax
351 ret
352 # endif
353
/* Hit is in bytes 8-15: same decoding applied to the high mask byte.  */
354 .p2align 4
355 L(match_case1_high):
356 mov %ah, %ch
357 and $15, %ch
358 jz L(match_case1_high_8) /* no hit in bytes 8-11 */
359 test $0x01, %ah
360 jnz L(ExitCase1_9)
361 test $0x02, %ah
362 jnz L(ExitCase1_10)
363 test $0x04, %ah
364 jnz L(ExitCase1_11)
365 # ifndef USE_AS_RAWMEMCHR
366 lea 11(%edi), %eax
367 RETURN
368 # else
369 lea 11(%edx), %eax
370 ret
371 # endif
372
/* Hit is in bytes 12-15.  */
373 .p2align 4
374 L(match_case1_high_8):
375 test $0x10, %ah
376 jnz L(ExitCase1_13)
377 test $0x20, %ah
378 jnz L(ExitCase1_14)
379 test $0x40, %ah
380 jnz L(ExitCase1_15)
381 # ifndef USE_AS_RAWMEMCHR
382 lea 15(%edi), %eax
383 RETURN
384 # else
385 lea 15(%edx), %eax
386 ret
387 # endif
388
/* L(exit_loop) (memchr build only): reached when a "sub $64, %edx" found
   that 64 bytes or fewer remain.  The subtraction is undone so edx again
   holds the bytes left from edi, then up to four 16-byte blocks are
   tested.  A hit goes to the length-checked decoder L(match_case2) with
   ecx = block offset; after each miss the search stops with NULL if the
   remaining length does not reach into the next block.  */
389 # ifndef USE_AS_RAWMEMCHR
390 .p2align 4
391 L(exit_loop):
392 add $64, %edx
393
394 movdqa (%edi), %xmm0
395 pcmpeqb %xmm1, %xmm0
396 xor %ecx, %ecx /* block offset 0 */
397 pmovmskb %xmm0, %eax
398 test %eax, %eax
399 jnz L(match_case2)
400 cmp $16, %edx
401 jbe L(return_null)
402
403 movdqa 16(%edi), %xmm2
404 pcmpeqb %xmm1, %xmm2
405 lea 16(%ecx), %ecx /* block offset 16 */
406 pmovmskb %xmm2, %eax
407 test %eax, %eax
408 jnz L(match_case2)
409 cmp $32, %edx
410 jbe L(return_null)
411
412 movdqa 32(%edi), %xmm3
413 pcmpeqb %xmm1, %xmm3
414 lea 16(%ecx), %ecx /* block offset 32 */
415 pmovmskb %xmm3, %eax
416 test %eax, %eax
417 jnz L(match_case2)
418 cmp $48, %edx
419 jbe L(return_null)
420
/* Last block: compare from memory into xmm1 (the replicated search byte
   is not needed afterwards).  */
421 pcmpeqb 48(%edi), %xmm1
422 lea 16(%ecx), %ecx /* block offset 48 */
423 pmovmskb %xmm1, %eax
424 test %eax, %eax
425 jnz L(match_case2)
426
/* No hit in the remaining bytes: return NULL.  */
427 xor %eax, %eax
428 RETURN
429 # endif
430
/* L(ExitCase1_N): exits of the unchecked decoder L(match_case1).  Each
   returns the block pointer (edi for memchr, edx for rawmemchr) plus N-1,
   i.e. the address of the N-th byte of the 16-byte block.  N = 4, 8, 12
   and 16 have no label here; they are the fall-through returns inside
   L(match_case1).  */
431 .p2align 4
432 L(ExitCase1_1):
433 # ifndef USE_AS_RAWMEMCHR
434 mov %edi, %eax
435 RETURN
436 # else
437 mov %edx, %eax
438 ret
439 # endif
440
441 .p2align 4
442 L(ExitCase1_2):
443 # ifndef USE_AS_RAWMEMCHR
444 lea 1(%edi), %eax
445 RETURN
446 # else
447 lea 1(%edx), %eax
448 ret
449 # endif
450
451 .p2align 4
452 L(ExitCase1_3):
453 # ifndef USE_AS_RAWMEMCHR
454 lea 2(%edi), %eax
455 RETURN
456 # else
457 lea 2(%edx), %eax
458 ret
459 # endif
460
461 .p2align 4
462 L(ExitCase1_5):
463 # ifndef USE_AS_RAWMEMCHR
464 lea 4(%edi), %eax
465 RETURN
466 # else
467 lea 4(%edx), %eax
468 ret
469 # endif
470
471 .p2align 4
472 L(ExitCase1_6):
473 # ifndef USE_AS_RAWMEMCHR
474 lea 5(%edi), %eax
475 RETURN
476 # else
477 lea 5(%edx), %eax
478 ret
479 # endif
480
481 .p2align 4
482 L(ExitCase1_7):
483 # ifndef USE_AS_RAWMEMCHR
484 lea 6(%edi), %eax
485 RETURN
486 # else
487 lea 6(%edx), %eax
488 ret
489 # endif
490
491 .p2align 4
492 L(ExitCase1_9):
493 # ifndef USE_AS_RAWMEMCHR
494 lea 8(%edi), %eax
495 RETURN
496 # else
497 lea 8(%edx), %eax
498 ret
499 # endif
500
501 .p2align 4
502 L(ExitCase1_10):
503 # ifndef USE_AS_RAWMEMCHR
504 lea 9(%edi), %eax
505 RETURN
506 # else
507 lea 9(%edx), %eax
508 ret
509 # endif
510
511 .p2align 4
512 L(ExitCase1_11):
513 # ifndef USE_AS_RAWMEMCHR
514 lea 10(%edi), %eax
515 RETURN
516 # else
517 lea 10(%edx), %eax
518 ret
519 # endif
520
521 .p2align 4
522 L(ExitCase1_13):
523 # ifndef USE_AS_RAWMEMCHR
524 lea 12(%edi), %eax
525 RETURN
526 # else
527 lea 12(%edx), %eax
528 ret
529 # endif
530
531 .p2align 4
532 L(ExitCase1_14):
533 # ifndef USE_AS_RAWMEMCHR
534 lea 13(%edi), %eax
535 RETURN
536 # else
537 lea 13(%edx), %eax
538 ret
539 # endif
540
541 .p2align 4
542 L(ExitCase1_15):
543 # ifndef USE_AS_RAWMEMCHR
544 lea 14(%edi), %eax
545 RETURN
546 # else
547 lea 14(%edx), %eax
548 ret
549 # endif
550
/* L(match_case2) (memchr build only): length-checked counterpart of
   L(match_case1).  Decodes the 16-bit match mask in eax with the same
   branch tree, but every exit for byte index k first subtracts k+1 from
   the remaining length in edx and returns NULL if that borrows, i.e. if
   the hit lies beyond the end of the searched range.
   Entry points:
     L(match_case2)          ecx = block offset; edx -= ecx, edi += ecx
     L(match_case2_prolog1)  edi += ecx only (edx already relative to
                             the position that mask bit 0 refers to)
     L(match_case2_prolog)   edi and edx already refer to mask bit 0  */
551 # ifndef USE_AS_RAWMEMCHR
552 .p2align 4
553 L(match_case2):
554 sub %ecx, %edx
555 L(match_case2_prolog1):
556 add %ecx, %edi
557 L(match_case2_prolog):
558 test %al, %al
559 jz L(match_case2_high) /* no hit in bytes 0-7 */
560 mov %al, %cl
561 and $15, %cl
562 jz L(match_case2_8) /* no hit in bytes 0-3 */
563 test $0x01, %al
564 jnz L(ExitCase2_1)
565 test $0x02, %al
566 jnz L(ExitCase2_2)
567 test $0x04, %al
568 jnz L(ExitCase2_3)
/* Only bit 3 can be set: hit at byte 3, valid if at least 4 bytes remain.  */
569 sub $4, %edx
570 jb L(return_null)
571 lea 3(%edi), %eax
572 RETURN
573
/* Hit is in bytes 4-7.  */
574 .p2align 4
575 L(match_case2_8):
576 test $0x10, %al
577 jnz L(ExitCase2_5)
578 test $0x20, %al
579 jnz L(ExitCase2_6)
580 test $0x40, %al
581 jnz L(ExitCase2_7)
582 sub $8, %edx
583 jb L(return_null)
584 lea 7(%edi), %eax
585 RETURN
586
/* Hit is in bytes 8-15: same decoding applied to the high mask byte.  */
587 .p2align 4
588 L(match_case2_high):
589 mov %ah, %ch
590 and $15, %ch
591 jz L(match_case2_high_8) /* no hit in bytes 8-11 */
592 test $0x01, %ah
593 jnz L(ExitCase2_9)
594 test $0x02, %ah
595 jnz L(ExitCase2_10)
596 test $0x04, %ah
597 jnz L(ExitCase2_11)
598 sub $12, %edx
599 jb L(return_null)
600 lea 11(%edi), %eax
601 RETURN
602
/* Hit is in bytes 12-15.  */
603 .p2align 4
604 L(match_case2_high_8):
605 test $0x10, %ah
606 jnz L(ExitCase2_13)
607 test $0x20, %ah
608 jnz L(ExitCase2_14)
609 test $0x40, %ah
610 jnz L(ExitCase2_15)
611 sub $16, %edx
612 jb L(return_null)
613 lea 15(%edi), %eax
614 RETURN
615
/* L(ExitCase2_N): return edi + N-1 if at least N bytes remain in edx,
   otherwise NULL.  ExitCase2_1 has no length test.
   NOTE(review): that relies on every path into this decoder arriving
   with a non-zero remaining length - confirm if the callers change.  */
616 .p2align 4
617 L(ExitCase2_1):
618 mov %edi, %eax
619 RETURN
620
621 .p2align 4
622 L(ExitCase2_2):
623 sub $2, %edx
624 jb L(return_null)
625 lea 1(%edi), %eax
626 RETURN
627
628 .p2align 4
629 L(ExitCase2_3):
630 sub $3, %edx
631 jb L(return_null)
632 lea 2(%edi), %eax
633 RETURN
634
635 .p2align 4
636 L(ExitCase2_5):
637 sub $5, %edx
638 jb L(return_null)
639 lea 4(%edi), %eax
640 RETURN
641
642 .p2align 4
643 L(ExitCase2_6):
644 sub $6, %edx
645 jb L(return_null)
646 lea 5(%edi), %eax
647 RETURN
648
649 .p2align 4
650 L(ExitCase2_7):
651 sub $7, %edx
652 jb L(return_null)
653 lea 6(%edi), %eax
654 RETURN
655
656 .p2align 4
657 L(ExitCase2_9):
658 sub $9, %edx
659 jb L(return_null)
660 lea 8(%edi), %eax
661 RETURN
662
663 .p2align 4
664 L(ExitCase2_10):
665 sub $10, %edx
666 jb L(return_null)
667 lea 9(%edi), %eax
668 RETURN
669
670 .p2align 4
671 L(ExitCase2_11):
672 sub $11, %edx
673 jb L(return_null)
674 lea 10(%edi), %eax
675 RETURN
676
677 .p2align 4
678 L(ExitCase2_13):
679 sub $13, %edx
680 jb L(return_null)
681 lea 12(%edi), %eax
682 RETURN
683
684 .p2align 4
685 L(ExitCase2_14):
686 sub $14, %edx
687 jb L(return_null)
688 lea 13(%edi), %eax
689 RETURN
690
691 .p2align 4
692 L(ExitCase2_15):
693 sub $15, %edx
694 jb L(return_null)
695 lea 14(%edi), %eax
696 RETURN
697 # endif
698
/* L(return_null): common "not found" exit; returns 0 in eax.  The memchr
   build goes through RETURN (which also restores %edi); the rawmemchr
   build pushed nothing and uses a plain ret.  */
699 .p2align 4
700 L(return_null):
701 xor %eax, %eax
702 # ifndef USE_AS_RAWMEMCHR
703 RETURN
704 # else
705 ret
706 # endif
707
708 END (MEMCHR)
709 #endif