]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/strlen-sse2.S
d5adf2810a14d564ed49bded6732f16e8f18ce31
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / strlen-sse2.S
1 /* strlen with SSE2
2 Copyright (C) 2010-2018 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 /* For strlen, only the SHARED version is optimized; for strcat, strncat and strnlen, both the STATIC and SHARED versions are optimized. */
21
22 #if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
23
24 # ifndef USE_AS_STRCAT
25
/* Stack-argument offsets and the return sequence used by the
   stand-alone (non-USE_AS_STRCAT) builds of this routine.  */
26 # include <sysdep.h>
/* Offset from %esp of the first stack argument at function entry;
   STR is read at that offset by the first instruction after ENTRY.  */
27 # define PARMS 4
28 # define STR PARMS
29 # define RETURN ret
30
31 # ifdef USE_AS_STRNLEN
/* LEN is only dereferenced after PUSH (%edi) has run, so the offset
   already includes the 4 bytes that the push adds to every
   %esp-relative displacement.  */
32 # define LEN PARMS + 8
/* Unwind annotations matching a 4-byte register push.  */
33 # define CFI_PUSH(REG) \
34 cfi_adjust_cfa_offset (4); \
35 cfi_rel_offset (REG, 0)
36
/* Unwind annotations matching a 4-byte register pop.  */
37 # define CFI_POP(REG) \
38 cfi_adjust_cfa_offset (-4); \
39 cfi_restore (REG)
40
41 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
42 # define POP(REG) popl REG; CFI_POP (REG)
/* The strnlen build keeps its remaining-length counter in %edi, which is
   pushed at entry, so every return has to pop it first.  The CFI_PUSH
   that follows the pop emits no instruction; it re-applies the
   "%edi is pushed" unwind annotations after POP has cancelled them,
   presumably so that the code placed after this return is still
   described correctly.  */
43 # undef RETURN
44 # define RETURN POP (%edi); CFI_PUSH(%edi); ret
45 # endif
46
/* Default symbol name; a file that includes this one may define STRLEN
   beforehand to choose a different name.  */
47 # ifndef STRLEN
48 # define STRLEN __strlen_sse2
49 # endif
50
/* Entry point, byte-wise check of the first 16 bytes, then up to sixteen
   aligned 16-byte probes.

   Register roles:
     %edx  string start during the byte-wise part, NUL byte mask afterwards
     %eax  0 during the byte-wise part (the exit_tailN labels add N to it),
           then the current aligned scan address
     %ecx  string start + 16, consumed by L(exit) to turn %eax back into
           an offset from the string start
     %edi  (USE_AS_STRNLEN only) remaining length budget

   When USE_AS_STRCAT is defined the ENTRY sequence below is compiled
   out and the code starts at the xor; %edx is then presumably set up
   by the including file.  */
51 atom_text_section
52 ENTRY (STRLEN)
53 mov STR(%esp), %edx /* %edx = string pointer */
54 # ifdef USE_AS_STRNLEN
55 PUSH (%edi)
56 movl LEN(%esp), %edi /* %edi = maxlen */
57 sub $4, %edi
58 jbe L(len_less4_prolog) /* maxlen <= 4: take the bounded byte-wise path */
59 # endif
60 # endif
61 xor %eax, %eax
/* Bytes 0..3.  */
62 cmpb $0, (%edx)
63 jz L(exit_tail0)
64 cmpb $0, 1(%edx)
65 jz L(exit_tail1)
66 cmpb $0, 2(%edx)
67 jz L(exit_tail2)
68 cmpb $0, 3(%edx)
69 jz L(exit_tail3)
70
71 # ifdef USE_AS_STRNLEN
72 sub $4, %edi
73 jbe L(len_less8_prolog) /* maxlen <= 8 */
74 # endif
75
/* Bytes 4..7.  */
76 cmpb $0, 4(%edx)
77 jz L(exit_tail4)
78 cmpb $0, 5(%edx)
79 jz L(exit_tail5)
80 cmpb $0, 6(%edx)
81 jz L(exit_tail6)
82 cmpb $0, 7(%edx)
83 jz L(exit_tail7)
84
85 # ifdef USE_AS_STRNLEN
86 sub $4, %edi
87 jbe L(len_less12_prolog) /* maxlen <= 12 */
88 # endif
89
/* Bytes 8..11.  */
90 cmpb $0, 8(%edx)
91 jz L(exit_tail8)
92 cmpb $0, 9(%edx)
93 jz L(exit_tail9)
94 cmpb $0, 10(%edx)
95 jz L(exit_tail10)
96 cmpb $0, 11(%edx)
97 jz L(exit_tail11)
98
99 # ifdef USE_AS_STRNLEN
100 sub $4, %edi
101 jbe L(len_less16_prolog) /* maxlen <= 16 */
102 # endif
103
/* Bytes 12..15.  */
104 cmpb $0, 12(%edx)
105 jz L(exit_tail12)
106 cmpb $0, 13(%edx)
107 jz L(exit_tail13)
108 cmpb $0, 14(%edx)
109 jz L(exit_tail14)
110 cmpb $0, 15(%edx)
111 jz L(exit_tail15)
112
/* The first 16 bytes hold no NUL.  Switch to aligned 16-byte probes:
   %eax = (str + 16) & -16 is an aligned address in (str, str + 16],
   so at most 15 bytes that were already checked get looked at again.  */
113 pxor %xmm0, %xmm0
114 lea 16(%edx), %eax
115 mov %eax, %ecx
116 and $-16, %eax
117
118 # ifdef USE_AS_STRNLEN
/* %edi held maxlen - 16; adding the misalignment (str & 15) makes it
   the number of bytes still allowed counting from %eax.  From here on
   the budget is consumed 64 bytes (four probes) at a time.  */
119 and $15, %edx
120 add %edx, %edi
121 sub $64, %edi
122 jbe L(len_less64)
123 # endif
124
/* Probe pattern: pcmpeqb against a zero register marks NUL bytes,
   pmovmskb collects the marks into a 16-bit mask in %edx.  lea does not
   modify flags, so the jnz still acts on the test result while %eax
   has already been advanced past the block just examined (L(exit)
   relies on that).  A probe that finds nothing leaves its xmm register
   all-zero, so later probes reuse %xmm0-%xmm3 without clearing them
   again.  */
125 pcmpeqb (%eax), %xmm0
126 pmovmskb %xmm0, %edx
127 pxor %xmm1, %xmm1
128 test %edx, %edx
129 lea 16(%eax), %eax
130 jnz L(exit)
131
132 pcmpeqb (%eax), %xmm1
133 pmovmskb %xmm1, %edx
134 pxor %xmm2, %xmm2
135 test %edx, %edx
136 lea 16(%eax), %eax
137 jnz L(exit)
138
139 pcmpeqb (%eax), %xmm2
140 pmovmskb %xmm2, %edx
141 pxor %xmm3, %xmm3
142 test %edx, %edx
143 lea 16(%eax), %eax
144 jnz L(exit)
145
146 pcmpeqb (%eax), %xmm3
147 pmovmskb %xmm3, %edx
148 test %edx, %edx
149 lea 16(%eax), %eax
150 jnz L(exit)
151
/* Second group of four probes.  */
152 # ifdef USE_AS_STRNLEN
153 sub $64, %edi
154 jbe L(len_less64)
155 # endif
156
157 pcmpeqb (%eax), %xmm0
158 pmovmskb %xmm0, %edx
159 test %edx, %edx
160 lea 16(%eax), %eax
161 jnz L(exit)
162
163 pcmpeqb (%eax), %xmm1
164 pmovmskb %xmm1, %edx
165 test %edx, %edx
166 lea 16(%eax), %eax
167 jnz L(exit)
168
169 pcmpeqb (%eax), %xmm2
170 pmovmskb %xmm2, %edx
171 test %edx, %edx
172 lea 16(%eax), %eax
173 jnz L(exit)
174
175 pcmpeqb (%eax), %xmm3
176 pmovmskb %xmm3, %edx
177 test %edx, %edx
178 lea 16(%eax), %eax
179 jnz L(exit)
180
/* Third group of four probes.  */
181 # ifdef USE_AS_STRNLEN
182 sub $64, %edi
183 jbe L(len_less64)
184 # endif
185
186 pcmpeqb (%eax), %xmm0
187 pmovmskb %xmm0, %edx
188 test %edx, %edx
189 lea 16(%eax), %eax
190 jnz L(exit)
191
192 pcmpeqb (%eax), %xmm1
193 pmovmskb %xmm1, %edx
194 test %edx, %edx
195 lea 16(%eax), %eax
196 jnz L(exit)
197
198 pcmpeqb (%eax), %xmm2
199 pmovmskb %xmm2, %edx
200 test %edx, %edx
201 lea 16(%eax), %eax
202 jnz L(exit)
203
204 pcmpeqb (%eax), %xmm3
205 pmovmskb %xmm3, %edx
206 test %edx, %edx
207 lea 16(%eax), %eax
208 jnz L(exit)
209
/* Fourth group of four probes.  */
210 # ifdef USE_AS_STRNLEN
211 sub $64, %edi
212 jbe L(len_less64)
213 # endif
214
215 pcmpeqb (%eax), %xmm0
216 pmovmskb %xmm0, %edx
217 test %edx, %edx
218 lea 16(%eax), %eax
219 jnz L(exit)
220
221 pcmpeqb (%eax), %xmm1
222 pmovmskb %xmm1, %edx
223 test %edx, %edx
224 lea 16(%eax), %eax
225 jnz L(exit)
226
227 pcmpeqb (%eax), %xmm2
228 pmovmskb %xmm2, %edx
229 test %edx, %edx
230 lea 16(%eax), %eax
231 jnz L(exit)
232
233 pcmpeqb (%eax), %xmm3
234 pmovmskb %xmm3, %edx
235 test %edx, %edx
236 lea 16(%eax), %eax
237 jnz L(exit)
238
/* Main loop: scan 64 bytes per iteration from a 64-byte-aligned address.
   Reached only after every 16-byte probe above found no NUL, which
   also means %xmm3 is all-zero here; the loop uses it as the zero
   vector and does not modify it.  */
239 # ifdef USE_AS_STRNLEN
/* %eax is about to be rounded down to a multiple of 64; give the
   budget back the bytes that the rounding steps over.  */
240 mov %eax, %edx
241 and $63, %edx
242 add %edx, %edi
243 # endif
244
245 and $-0x40, %eax
246
247 .p2align 4
248 L(aligned_64_loop):
249 # ifdef USE_AS_STRNLEN
250 sub $64, %edi
251 jbe L(len_less64) /* 64 bytes or fewer left in the budget */
252 # endif
253 movaps (%eax), %xmm0
254 movaps 16(%eax), %xmm1
255 movaps 32(%eax), %xmm2
256 movaps 48(%eax), %xmm6
/* Fold the four vectors with unsigned byte-wise minimum: a lane of the
   result is zero iff at least one of the four inputs has a zero in
   that lane.  %xmm1 and %xmm6 are only read here, so they still hold
   the raw second and fourth quarters after the loop.  */
257 pminub %xmm1, %xmm0
258 pminub %xmm6, %xmm2
259 pminub %xmm0, %xmm2
260 pcmpeqb %xmm3, %xmm2
261 pmovmskb %xmm2, %edx
262 test %edx, %edx
263 lea 64(%eax), %eax /* flags from test survive the lea */
264 jz L(aligned_64_loop)
265
/* A NUL is somewhere in the 64 bytes ending at %eax.  Test the four
   quarters in order.  L(exit) computes %eax - %ecx as the offset of the
   matching 16-byte block from the string start, so %ecx is biased to
   compensate for %eax pointing past the whole 64-byte chunk:
   +48 for the first quarter, then 16 less for each following one.
   The first and third quarters are re-read from memory because
   %xmm0 and %xmm2 were overwritten by pminub.  */
266 pcmpeqb -64(%eax), %xmm3
267 pmovmskb %xmm3, %edx
268 test %edx, %edx
269 lea 48(%ecx), %ecx
270 jnz L(exit)
271
272 pcmpeqb %xmm1, %xmm3
273 pmovmskb %xmm3, %edx
274 test %edx, %edx
275 lea -16(%ecx), %ecx
276 jnz L(exit)
277
278 pcmpeqb -32(%eax), %xmm3
279 pmovmskb %xmm3, %edx
280 test %edx, %edx
281 lea -16(%ecx), %ecx
282 jnz L(exit)
283
/* The first three quarters were clean, so the NUL is in the fourth:
   no test or branch, control falls through into L(exit).  */
284 pcmpeqb %xmm6, %xmm3
285 pmovmskb %xmm3, %edx
286 lea -16(%ecx), %ecx
/* Turn a non-zero NUL mask into the final length.
   In:  %edx = 16-bit mask, bit i set when byte i of the matching
               16-byte block is NUL
        %eax, %ecx = arranged by the caller so that %eax - %ecx is the
               offset of that block from the string start
   Out: %eax = block offset + index of the lowest set mask bit.
   The lowest set bit is located by splitting the mask in halves
   (%dl vs %dh, then low nibble vs high nibble) followed by three bit
   tests; the fourth bit of a nibble is implied when the other three are
   clear.  */
287 L(exit):
288 sub %ecx, %eax
289 test %dl, %dl
290 jz L(exit_high) /* no NUL in bytes 0..7 */
291
/* %ecx is no longer needed once the subtraction above is done, so %cl
   (and %ch further down) serves as scratch.  */
292 mov %dl, %cl
293 and $15, %cl
294 jz L(exit_8) /* no NUL in bytes 0..3 */
295 test $0x01, %dl
296 jnz L(exit_tail0)
297 test $0x02, %dl
298 jnz L(exit_tail1)
299 test $0x04, %dl
300 jnz L(exit_tail2)
301 add $3, %eax
302 RETURN
303
304 .p2align 4
/* NUL is in bytes 4..7.  */
305 L(exit_8):
306 test $0x10, %dl
307 jnz L(exit_tail4)
308 test $0x20, %dl
309 jnz L(exit_tail5)
310 test $0x40, %dl
311 jnz L(exit_tail6)
312 add $7, %eax
313 RETURN
314
315 .p2align 4
/* NUL is in bytes 8..15; same decode applied to %dh.  */
316 L(exit_high):
317 mov %dh, %ch
318 and $15, %ch
319 jz L(exit_high_8) /* no NUL in bytes 8..11 */
320 test $0x01, %dh
321 jnz L(exit_tail8)
322 test $0x02, %dh
323 jnz L(exit_tail9)
324 test $0x04, %dh
325 jnz L(exit_tail10)
326 add $11, %eax
327 RETURN
328
329 .p2align 4
/* NUL is in bytes 12..15.  */
330 L(exit_high_8):
331 test $0x10, %dh
332 jnz L(exit_tail12)
333 test $0x20, %dh
334 jnz L(exit_tail13)
335 test $0x40, %dh
336 jnz L(exit_tail14)
337 add $15, %eax
/* exit_tail0 adds nothing: %eax already holds the result.  The
   index-15 case above falls through into it to share the return.  */
338 L(exit_tail0):
339 RETURN
340
341 # ifdef USE_AS_STRNLEN
342
/* strnlen only: at most 64 bytes of budget remain, counted from %eax.
   Every jump here comes straight after "sub $64, %edi", so adding 64
   back gives %edi = bytes still allowed from %eax (1..64).
   Up to four aligned 16-byte blocks are probed.  A whole block is read
   even when the limit falls inside it; L(strnlen_exit) checks the NUL
   position against %edi before trusting it.  On a hit %edi still counts
   from the start of the matching block, because the 16 is subtracted
   only after a block turned out clean.  */
343 .p2align 4
344 L(len_less64):
345 pxor %xmm0, %xmm0
346 add $64, %edi
347
348 pcmpeqb (%eax), %xmm0
349 pmovmskb %xmm0, %edx
350 pxor %xmm1, %xmm1
351 lea 16(%eax), %eax
352 test %edx, %edx
353 jnz L(strnlen_exit)
354
355 sub $16, %edi
356 jbe L(return_start_len) /* budget used up without a NUL */
357
358 pcmpeqb (%eax), %xmm1
359 pmovmskb %xmm1, %edx
360 lea 16(%eax), %eax
361 test %edx, %edx
362 jnz L(strnlen_exit)
363
364 sub $16, %edi
365 jbe L(return_start_len)
366
/* %xmm0 and %xmm1 are all-zero again after their clean probes, so they
   are reused for the third and fourth block.  */
367 pcmpeqb (%eax), %xmm0
368 pmovmskb %xmm0, %edx
369 lea 16(%eax), %eax
370 test %edx, %edx
371 jnz L(strnlen_exit)
372
373 sub $16, %edi
374 jbe L(return_start_len)
375
376 pcmpeqb (%eax), %xmm1
377 pmovmskb %xmm1, %edx
378 lea 16(%eax), %eax
379 test %edx, %edx
380 jnz L(strnlen_exit)
381
/* Four clean blocks cover the whole remaining budget (at most 64
   bytes): return maxlen, reloaded from the stack.  */
382 movl LEN(%esp), %eax
383 RETURN
384
/* strnlen only: bounded version of the L(exit) mask decode.
   In:  %edx = NUL mask of the matching 16-byte block
        %eax - %ecx = offset of that block from the string start
        %edi = bytes still allowed, counted from the start of that block
               (at least 1 on every path from L(len_less64))
   For a NUL at index N within the block the result is block offset + N
   only if N lies inside the budget, i.e. %edi >= N + 1.  Each case
   therefore does "sub $(N + 1), %edi" and returns maxlen through
   L(return_start_len) when that borrows.  Index 0 needs no check
   because %edi >= 1, so it goes straight to the shared exit_tail0.  */
385 .p2align 4
386 L(strnlen_exit):
387 sub %ecx, %eax
388
389 test %dl, %dl
390 jz L(strnlen_exit_high) /* no NUL in bytes 0..7 */
391 mov %dl, %cl
392 and $15, %cl
393 jz L(strnlen_exit_8) /* no NUL in bytes 0..3 */
394 test $0x01, %dl
395 jnz L(exit_tail0)
396 test $0x02, %dl
397 jnz L(strnlen_exit_tail1)
398 test $0x04, %dl
399 jnz L(strnlen_exit_tail2)
/* Index 3.  */
400 sub $4, %edi
401 jb L(return_start_len)
402 lea 3(%eax), %eax
403 RETURN
404
405 .p2align 4
/* NUL is in bytes 4..7.  */
406 L(strnlen_exit_8):
407 test $0x10, %dl
408 jnz L(strnlen_exit_tail4)
409 test $0x20, %dl
410 jnz L(strnlen_exit_tail5)
411 test $0x40, %dl
412 jnz L(strnlen_exit_tail6)
/* Index 7.  */
413 sub $8, %edi
414 jb L(return_start_len)
415 lea 7(%eax), %eax
416 RETURN
417
418 .p2align 4
/* NUL is in bytes 8..15; decode %dh.  */
419 L(strnlen_exit_high):
420 mov %dh, %ch
421 and $15, %ch
422 jz L(strnlen_exit_high_8) /* no NUL in bytes 8..11 */
423 test $0x01, %dh
424 jnz L(strnlen_exit_tail8)
425 test $0x02, %dh
426 jnz L(strnlen_exit_tail9)
427 test $0x04, %dh
428 jnz L(strnlen_exit_tail10)
/* Index 11.  */
429 sub $12, %edi
430 jb L(return_start_len)
431 lea 11(%eax), %eax
432 RETURN
433
434 .p2align 4
/* NUL is in bytes 12..15.  */
435 L(strnlen_exit_high_8):
436 test $0x10, %dh
437 jnz L(strnlen_exit_tail12)
438 test $0x20, %dh
439 jnz L(strnlen_exit_tail13)
440 test $0x40, %dh
441 jnz L(strnlen_exit_tail14)
/* Index 15.  */
442 sub $16, %edi
443 jb L(return_start_len)
444 lea 15(%eax), %eax
445 RETURN
446
/* strnlen_exit_tailN: NUL at index N of the block.  Return
   block offset + N when N + 1 bytes fit in the budget, maxlen
   otherwise.  */
447 .p2align 4
448 L(strnlen_exit_tail1):
449 sub $2, %edi
450 jb L(return_start_len)
451 lea 1(%eax), %eax
452 RETURN
453
454 .p2align 4
455 L(strnlen_exit_tail2):
456 sub $3, %edi
457 jb L(return_start_len)
458 lea 2(%eax), %eax
459 RETURN
460
461 .p2align 4
462 L(strnlen_exit_tail4):
463 sub $5, %edi
464 jb L(return_start_len)
465 lea 4(%eax), %eax
466 RETURN
467
468 .p2align 4
469 L(strnlen_exit_tail5):
470 sub $6, %edi
471 jb L(return_start_len)
472 lea 5(%eax), %eax
473 RETURN
474
475 .p2align 4
476 L(strnlen_exit_tail6):
477 sub $7, %edi
478 jb L(return_start_len)
479 lea 6(%eax), %eax
480 RETURN
481
482 .p2align 4
483 L(strnlen_exit_tail8):
484 sub $9, %edi
485 jb L(return_start_len)
486 lea 8(%eax), %eax
487 RETURN
488
489 .p2align 4
490 L(strnlen_exit_tail9):
491 sub $10, %edi
492 jb L(return_start_len)
493 lea 9(%eax), %eax
494 RETURN
495
496 .p2align 4
497 L(strnlen_exit_tail10):
498 sub $11, %edi
499 jb L(return_start_len)
500 lea 10(%eax), %eax
501 RETURN
502
503 .p2align 4
504 L(strnlen_exit_tail12):
505 sub $13, %edi
506 jb L(return_start_len)
507 lea 12(%eax), %eax
508 RETURN
509
510 .p2align 4
511 L(strnlen_exit_tail13):
512 sub $14, %edi
513 jb L(return_start_len)
514 lea 13(%eax), %eax
515 RETURN
516
517 .p2align 4
518 L(strnlen_exit_tail14):
519 sub $15, %edi
520 jb L(return_start_len)
521 lea 14(%eax), %eax
522 RETURN
523
/* No NUL inside the budget: the result is maxlen itself, reloaded from
   the stack argument (%edi is still pushed, hence the LEN offset).  */
524 .p2align 4
525 L(return_start_len):
526 movl LEN(%esp), %eax
527 RETURN
528
529 /* for prolog only */
/* strnlen only: byte-wise handlers for maxlen <= 16, entered from the
   "sub $4, %edi; jbe" checks in the entry code.  Each handler adds the
   4 back, so %edi is the number of bytes of its own 4-byte group that
   may still be examined, then alternates between testing one byte and
   testing whether the limit has been reached.  Both outcomes use the
   shared exit_tailN labels, which return N when %eax is 0.  */
530
531 .p2align 4
/* maxlen <= 4.  The jump here is taken before the entry code zeroes
   %eax, so it is cleared first.  After the add, %edi = maxlen.  */
532 L(len_less4_prolog):
533 xor %eax, %eax
534
535 add $4, %edi
536 jz L(exit_tail0) /* maxlen == 0: return 0 without reading memory */
537
538 cmpb $0, (%edx)
539 jz L(exit_tail0)
540 cmp $1, %edi
541 je L(exit_tail1) /* limit reached: return maxlen == 1 */
542
543 cmpb $0, 1(%edx)
544 jz L(exit_tail1)
545 cmp $2, %edi
546 je L(exit_tail2)
547
548 cmpb $0, 2(%edx)
549 jz L(exit_tail2)
550 cmp $3, %edi
551 je L(exit_tail3)
552
553 cmpb $0, 3(%edx)
554 jz L(exit_tail3)
555 mov $4, %eax
556 RETURN
557
558 .p2align 4
/* 4 < maxlen <= 8; bytes 0..3 are known non-zero and %eax is 0 from
   the entry code.  After the add, %edi = maxlen - 4 (1..4), so byte 4
   is always inside the limit.  */
559 L(len_less8_prolog):
560 add $4, %edi
561
562 cmpb $0, 4(%edx)
563 jz L(exit_tail4)
564 cmp $1, %edi
565 je L(exit_tail5)
566
567 cmpb $0, 5(%edx)
568 jz L(exit_tail5)
569 cmp $2, %edi
570 je L(exit_tail6)
571
572 cmpb $0, 6(%edx)
573 jz L(exit_tail6)
574 cmp $3, %edi
575 je L(exit_tail7)
576
577 cmpb $0, 7(%edx)
578 jz L(exit_tail7)
579 mov $8, %eax
580 RETURN
581
582
583 .p2align 4
/* 8 < maxlen <= 12; bytes 0..7 are known non-zero.
   After the add, %edi = maxlen - 8 (1..4).  */
584 L(len_less12_prolog):
585 add $4, %edi
586
587 cmpb $0, 8(%edx)
588 jz L(exit_tail8)
589 cmp $1, %edi
590 je L(exit_tail9)
591
592 cmpb $0, 9(%edx)
593 jz L(exit_tail9)
594 cmp $2, %edi
595 je L(exit_tail10)
596
597 cmpb $0, 10(%edx)
598 jz L(exit_tail10)
599 cmp $3, %edi
600 je L(exit_tail11)
601
602 cmpb $0, 11(%edx)
603 jz L(exit_tail11)
604 mov $12, %eax
605 RETURN
606
607 .p2align 4
/* 12 < maxlen <= 16; bytes 0..11 are known non-zero.
   After the add, %edi = maxlen - 12 (1..4).  */
608 L(len_less16_prolog):
609 add $4, %edi
610
611 cmpb $0, 12(%edx)
612 jz L(exit_tail12)
613 cmp $1, %edi
614 je L(exit_tail13)
615
616 cmpb $0, 13(%edx)
617 jz L(exit_tail13)
618 cmp $2, %edi
619 je L(exit_tail14)
620
621 cmpb $0, 14(%edx)
622 jz L(exit_tail14)
623 cmp $3, %edi
624 je L(exit_tail15)
625
626 cmpb $0, 15(%edx)
627 jz L(exit_tail15)
628 mov $16, %eax
629 RETURN
630 # endif
631
/* Shared return stubs: exit_tailN adds N to %eax and returns.
   They are reached with %eax = 0 from the byte-wise checks of the first
   16 bytes (result N), and with %eax = offset of the matching 16-byte
   block from L(exit) and L(strnlen_exit) (result offset + N).
   exit_tail0 sits with the L(exit) decode code, where it is just the
   return.  */
632 .p2align 4
633 L(exit_tail1):
634 add $1, %eax
635 RETURN
636
637 L(exit_tail2):
638 add $2, %eax
639 RETURN
640
641 L(exit_tail3):
642 add $3, %eax
643 RETURN
644
645 L(exit_tail4):
646 add $4, %eax
647 RETURN
648
649 L(exit_tail5):
650 add $5, %eax
651 RETURN
652
653 L(exit_tail6):
654 add $6, %eax
655 RETURN
656
657 L(exit_tail7):
658 add $7, %eax
659 RETURN
660
661 L(exit_tail8):
662 add $8, %eax
663 RETURN
664
665 L(exit_tail9):
666 add $9, %eax
667 RETURN
668
669 L(exit_tail10):
670 add $10, %eax
671 RETURN
672
673 L(exit_tail11):
674 add $11, %eax
675 RETURN
676
677 L(exit_tail12):
678 add $12, %eax
679 RETURN
680
681 L(exit_tail13):
682 add $13, %eax
683 RETURN
684
685 L(exit_tail14):
686 add $14, %eax
687 RETURN
688
689 L(exit_tail15):
690 add $15, %eax
/* With USE_AS_STRCAT defined neither RETURN nor END is emitted here:
   control falls out of the bottom of this file with the length in %eax,
   presumably into code supplied by the including file.  */
691 # ifndef USE_AS_STRCAT
692 RETURN
693 END (STRLEN)
694 # endif
695 #endif