]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/i386/i686/multiarch/strlen-sse2.S
Use <> for include of kernel-features.h.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / strlen-sse2.S
CommitLineData
e73015f2 1/* strlen with SSE2
5fa16e9b 2 Copyright (C) 2010, 2011 Free Software Foundation, Inc.
e73015f2
L
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
fc2ee42a
LD
21/* For strlen, only the SHARED version is optimized; for strcat, strncat and strnlen, both the STATIC and SHARED versions are optimized. */
22
23#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && !defined NOT_IN_libc
24
5fa16e9b 25# ifndef USE_AS_STRCAT
e73015f2 26
5fa16e9b 27# include <sysdep.h>
fc2ee42a
LD
28# define PARMS 4
29# define STR PARMS
30# define RETURN ret
31
32# ifdef USE_AS_STRNLEN
/* strnlen only: %edi is used as the remaining-length counter, so it is
   pushed on entry and popped by RETURN.  LEN is the stack offset of the
   length argument as it is read after PUSH (%edi).  */
33# define LEN PARMS + 8
/* Unwind annotation matching a 4-byte push of REG.  */
34# define CFI_PUSH(REG) \
5fa16e9b
LD
35 cfi_adjust_cfa_offset (4); \
36 cfi_rel_offset (REG, 0)
e73015f2 37
/* Unwind annotation matching a 4-byte pop of REG.  */
fc2ee42a 38# define CFI_POP(REG) \
5fa16e9b
LD
39 cfi_adjust_cfa_offset (-4); \
40 cfi_restore (REG)
e73015f2 41
fc2ee42a
LD
/* Push/pop REG together with the corresponding unwind annotation.  */
42# define PUSH(REG) pushl REG; CFI_PUSH (REG)
43# define POP(REG) popl REG; CFI_POP (REG)
44# undef RETURN
/* Restore %edi and return.  CFI_PUSH must come after the `ret': a CFI
   directive takes effect at the next instruction, so issuing it before the
   `ret' would describe %edi as still being on the stack while the `ret'
   executes (CFA off by 4).  Placed after the `ret', it only re-establishes
   the pushed-%edi unwind state for the code that follows this RETURN.  */
45# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
46# endif
47
48# ifndef STRLEN
49# define STRLEN __strlen_sse2
50# endif
e73015f2 51
/* Function entry.  %edx receives the string pointer from the stack.  */
48882a1a 52 atom_text_section
fc2ee42a 53ENTRY (STRLEN)
e73015f2 54 mov STR(%esp), %edx
fc2ee42a
LD
55# ifdef USE_AS_STRNLEN
/* strnlen: save %edi and load the length limit into it.  The limit is
   pre-decremented by 4; if it was 4 or less (unsigned), the first bytes are
   checked by the bounded helper instead of the unbounded checks below.  */
56 PUSH (%edi)
57 movl LEN(%esp), %edi
58 sub $4, %edi
59 jbe L(len_less4_prolog)
60# endif
5fa16e9b 61# endif
e73015f2
L
/* Check the first 16 bytes one at a time.  %eax is zeroed so that each
   L(exit_tailN) target, which adds N to %eax, yields the length N.  */
62 xor %eax, %eax
63 cmpb $0, (%edx)
64 jz L(exit_tail0)
65 cmpb $0, 1(%edx)
66 jz L(exit_tail1)
67 cmpb $0, 2(%edx)
68 jz L(exit_tail2)
69 cmpb $0, 3(%edx)
70 jz L(exit_tail3)
fc2ee42a
LD
71
72# ifdef USE_AS_STRNLEN
/* strnlen: consume 4 more bytes of the limit; if 4 or fewer were left,
   finish bytes 4..7 in the bounded helper.  */
73 sub $4, %edi
74 jbe L(len_less8_prolog)
75# endif
76
e73015f2
L
77 cmpb $0, 4(%edx)
78 jz L(exit_tail4)
79 cmpb $0, 5(%edx)
80 jz L(exit_tail5)
81 cmpb $0, 6(%edx)
82 jz L(exit_tail6)
83 cmpb $0, 7(%edx)
84 jz L(exit_tail7)
fc2ee42a
LD
85
86# ifdef USE_AS_STRNLEN
/* strnlen: same limit check before bytes 8..11.  */
87 sub $4, %edi
88 jbe L(len_less12_prolog)
89# endif
90
e73015f2
L
91 cmpb $0, 8(%edx)
92 jz L(exit_tail8)
93 cmpb $0, 9(%edx)
94 jz L(exit_tail9)
95 cmpb $0, 10(%edx)
96 jz L(exit_tail10)
97 cmpb $0, 11(%edx)
98 jz L(exit_tail11)
fc2ee42a
LD
99
100# ifdef USE_AS_STRNLEN
/* strnlen: same limit check before bytes 12..15.  */
101 sub $4, %edi
102 jbe L(len_less16_prolog)
103# endif
104
e73015f2
L
105 cmpb $0, 12(%edx)
106 jz L(exit_tail12)
107 cmpb $0, 13(%edx)
108 jz L(exit_tail13)
109 cmpb $0, 14(%edx)
110 jz L(exit_tail14)
111 cmpb $0, 15(%edx)
112 jz L(exit_tail15)
fc2ee42a 113
/* No terminator in the first 16 bytes.  Set up for 16-byte SSE2 probes:
     %xmm0 = 0 (comparison operand),
     %ecx  = string start + 16 (unaligned),
     %eax  = (string start + 16) rounded down to a 16-byte boundary.
   L(exit) later computes %eax - %ecx to turn a block address back into an
   offset from the start of the string.  */
e73015f2 114 pxor %xmm0, %xmm0
fc2ee42a
LD
115 lea 16(%edx), %eax
116 mov %eax, %ecx
e73015f2 117 and $-16, %eax
fc2ee42a
LD
118
119# ifdef USE_AS_STRNLEN
/* strnlen: %edi has had 16 subtracted for the prolog bytes, but the aligned
   pointer is only 16 - (start & 15) bytes past the start, so the misalignment
   is added back to make %edi count from %eax.  Then reserve 64 bytes for
   the next four probes, or take the bounded path if 64 or fewer remain.  */
120 and $15, %edx
121 add %edx, %edi
122 sub $64, %edi
123 jbe L(len_less64)
124# endif
e73015f2
L
125

/* Sixteen unrolled probes of 16 bytes each.  Each probe compares the block
   at (%eax) byte-wise against an all-zero register, extracts the per-byte
   result into a 16-bit mask in %edx, and advances %eax by 16 with `lea'
   (which leaves the flags from `test' intact) before branching.  At L(exit)
   %eax therefore points 16 bytes past the block that matched.
   %xmm1-%xmm3 are cleared once in the first group; afterwards the registers
   are reused without clearing because a compare that found no zero byte
   leaves its destination all-zero.  */
126 pcmpeqb (%eax), %xmm0
127 pmovmskb %xmm0, %edx
128 pxor %xmm1, %xmm1
129 test %edx, %edx
130 lea 16(%eax), %eax
131 jnz L(exit)
132
133 pcmpeqb (%eax), %xmm1
134 pmovmskb %xmm1, %edx
135 pxor %xmm2, %xmm2
136 test %edx, %edx
137 lea 16(%eax), %eax
138 jnz L(exit)
139
e73015f2
L
140 pcmpeqb (%eax), %xmm2
141 pmovmskb %xmm2, %edx
142 pxor %xmm3, %xmm3
143 test %edx, %edx
144 lea 16(%eax), %eax
145 jnz L(exit)
146
147 pcmpeqb (%eax), %xmm3
148 pmovmskb %xmm3, %edx
149 test %edx, %edx
150 lea 16(%eax), %eax
151 jnz L(exit)
152
fc2ee42a
LD
153# ifdef USE_AS_STRNLEN
/* strnlen: reserve the next 64 bytes or switch to the bounded path.  */
154 sub $64, %edi
155 jbe L(len_less64)
156# endif
157
e73015f2
L
158 pcmpeqb (%eax), %xmm0
159 pmovmskb %xmm0, %edx
160 test %edx, %edx
161 lea 16(%eax), %eax
162 jnz L(exit)
163
164 pcmpeqb (%eax), %xmm1
165 pmovmskb %xmm1, %edx
166 test %edx, %edx
167 lea 16(%eax), %eax
168 jnz L(exit)
169
170 pcmpeqb (%eax), %xmm2
171 pmovmskb %xmm2, %edx
172 test %edx, %edx
173 lea 16(%eax), %eax
174 jnz L(exit)
175
176 pcmpeqb (%eax), %xmm3
177 pmovmskb %xmm3, %edx
178 test %edx, %edx
179 lea 16(%eax), %eax
180 jnz L(exit)
181
fc2ee42a
LD
182# ifdef USE_AS_STRNLEN
/* strnlen: reserve the next 64 bytes or switch to the bounded path.  */
183 sub $64, %edi
184 jbe L(len_less64)
185# endif
186
e73015f2
L
187 pcmpeqb (%eax), %xmm0
188 pmovmskb %xmm0, %edx
189 test %edx, %edx
190 lea 16(%eax), %eax
191 jnz L(exit)
192
193 pcmpeqb (%eax), %xmm1
194 pmovmskb %xmm1, %edx
195 test %edx, %edx
196 lea 16(%eax), %eax
197 jnz L(exit)
198
199 pcmpeqb (%eax), %xmm2
200 pmovmskb %xmm2, %edx
201 test %edx, %edx
202 lea 16(%eax), %eax
203 jnz L(exit)
204
205 pcmpeqb (%eax), %xmm3
206 pmovmskb %xmm3, %edx
207 test %edx, %edx
208 lea 16(%eax), %eax
209 jnz L(exit)
210
fc2ee42a
LD
211# ifdef USE_AS_STRNLEN
/* strnlen: reserve the next 64 bytes or switch to the bounded path.  */
212 sub $64, %edi
213 jbe L(len_less64)
214# endif
215
e73015f2
L
216 pcmpeqb (%eax), %xmm0
217 pmovmskb %xmm0, %edx
218 test %edx, %edx
219 lea 16(%eax), %eax
220 jnz L(exit)
221
222 pcmpeqb (%eax), %xmm1
223 pmovmskb %xmm1, %edx
224 test %edx, %edx
225 lea 16(%eax), %eax
226 jnz L(exit)
227
228 pcmpeqb (%eax), %xmm2
229 pmovmskb %xmm2, %edx
230 test %edx, %edx
231 lea 16(%eax), %eax
232 jnz L(exit)
233
234 pcmpeqb (%eax), %xmm3
235 pmovmskb %xmm3, %edx
236 test %edx, %edx
237 lea 16(%eax), %eax
238 jnz L(exit)
239
fc2ee42a
LD
/* Main loop: scan 64 bytes per iteration from a 64-byte-aligned address.
   %eax is rounded down to a 64-byte boundary first; for strnlen the bytes
   stepped back over (%eax & 63) are credited back to the remaining length
   in %edi.  */
240# ifdef USE_AS_STRNLEN
241 mov %eax, %edx
242 and $63, %edx
243 add %edx, %edi
244# endif
245
e73015f2 246 and $-0x40, %eax
fc2ee42a
LD
247
248 .p2align 4
249L(aligned_64_loop):
250# ifdef USE_AS_STRNLEN
/* strnlen: leave the loop for the bounded path when 64 or fewer bytes of
   the limit remain.  */
251 sub $64, %edi
252 jbe L(len_less64)
253# endif
5fa16e9b
LD
/* Load four aligned 16-byte vectors and reduce them with unsigned byte-wise
   minimum into %xmm2: a zero byte in any of the four yields a zero byte in
   the minimum.  The minimum is compared against %xmm3, which is relied on
   to be all-zero here (left so by the preceding non-matching compares).
   %eax is advanced by 64 before the branch; `lea' keeps the flags.  */
254 movaps (%eax), %xmm0
255 movaps 16(%eax), %xmm1
256 movaps 32(%eax), %xmm2
257 movaps 48(%eax), %xmm6
258 pminub %xmm1, %xmm0
259 pminub %xmm6, %xmm2
260 pminub %xmm0, %xmm2
261 pcmpeqb %xmm3, %xmm2
262 pmovmskb %xmm2, %edx
263 test %edx, %edx
e73015f2 264 lea 64(%eax), %eax
fc2ee42a 265 jz L(aligned_64_loop)
5fa16e9b
LD
266

/* A zero byte is somewhere in the 64 bytes ending at %eax.  Probe the four
   vectors in order to find which one.  %xmm0 and %xmm2 were overwritten by
   pminub, so the first and third vectors are re-read from memory; %xmm1 and
   %xmm6 still hold the second and fourth.  %ecx is stepped (+48, then -16
   per vector) so that `%eax - %ecx' at L(exit) is the offset of the matching
   vector from the start of the string.  The last probe needs no test: the
   zero byte must be in the fourth vector, so control falls through into
   L(exit).  */
267 pcmpeqb -64(%eax), %xmm3
268 pmovmskb %xmm3, %edx
269 test %edx, %edx
270 lea 48(%ecx), %ecx
271 jnz L(exit)
272
273 pcmpeqb %xmm1, %xmm3
274 pmovmskb %xmm3, %edx
e73015f2 275 test %edx, %edx
5fa16e9b
LD
276 lea -16(%ecx), %ecx
277 jnz L(exit)
278
279 pcmpeqb -32(%eax), %xmm3
280 pmovmskb %xmm3, %edx
281 test %edx, %edx
282 lea -16(%ecx), %ecx
283 jnz L(exit)
284
285 pcmpeqb %xmm6, %xmm3
286 pmovmskb %xmm3, %edx
287 lea -16(%ecx), %ecx
e73015f2
L
/* Decode a non-zero 16-bit match mask in %edx into the final length.
   On entry %eax and %ecx are set up so that `%eax - %ecx' is the offset of
   the matching 16-byte block from the start of the string; the index of the
   lowest set mask bit is then added.  Bits 0-7 are in %dl, bits 8-15 in %dh.
   Each byte is split into nibbles; within a nibble bits 0..2 are tested
   explicitly and the fourth bit is implied when none of them is set.
   %cl/%ch are used as scratch, which is safe because %ecx is no longer
   needed after the subtraction.  */
288L(exit):
289 sub %ecx, %eax
290 test %dl, %dl
291 jz L(exit_high)
fc2ee42a
LD
292
293 mov %dl, %cl
294 and $15, %cl
295 jz L(exit_8)
e73015f2
L
296 test $0x01, %dl
297 jnz L(exit_tail0)
e73015f2
L
298 test $0x02, %dl
299 jnz L(exit_tail1)
e73015f2
L
300 test $0x04, %dl
301 jnz L(exit_tail2)
fc2ee42a
LD
302 add $3, %eax
303 RETURN
e73015f2 304
fc2ee42a
LD
305 .p2align 4
/* Lowest set bit is in bits 4..7 of the mask.  */
306L(exit_8):
e73015f2
L
307 test $0x10, %dl
308 jnz L(exit_tail4)
e73015f2
L
309 test $0x20, %dl
310 jnz L(exit_tail5)
e73015f2
L
311 test $0x40, %dl
312 jnz L(exit_tail6)
313 add $7, %eax
e73015f2
L
314 RETURN
315
fc2ee42a 316 .p2align 4
/* Low mask byte is zero: the lowest set bit is in bits 8..15 (%dh).  */
e73015f2 317L(exit_high):
fc2ee42a
LD
318 mov %dh, %ch
319 and $15, %ch
320 jz L(exit_high_8)
e73015f2 321 test $0x01, %dh
fc2ee42a
LD
322 jnz L(exit_tail8)
323 test $0x02, %dh
324 jnz L(exit_tail9)
325 test $0x04, %dh
326 jnz L(exit_tail10)
327 add $11, %eax
328 RETURN
329
330 .p2align 4
/* Lowest set bit is in bits 12..15 of the mask.  After the final `add $15'
   control falls through into L(exit_tail0), which just returns %eax.  */
331L(exit_high_8):
332 test $0x10, %dh
333 jnz L(exit_tail12)
334 test $0x20, %dh
335 jnz L(exit_tail13)
336 test $0x40, %dh
337 jnz L(exit_tail14)
338 add $15, %eax
339L(exit_tail0):
340 RETURN
341
342# ifdef USE_AS_STRNLEN
343
/* strnlen bounded path, entered right after a `sub $64, %edi' that left 64
   or fewer bytes of the limit.  The `add $64' undoes that subtraction so
   %edi is the remaining byte budget again.  Up to four 16-byte blocks are
   probed; between probes the budget is reduced by 16 and, if it is used up,
   the limit itself is returned via L(return_start_len).  A match goes to
   L(strnlen_exit), which checks the match position against the budget.  */
344 .p2align 4
345L(len_less64):
346 pxor %xmm0, %xmm0
347 add $64, %edi
348
349 pcmpeqb (%eax), %xmm0
350 pmovmskb %xmm0, %edx
351 pxor %xmm1, %xmm1
352 lea 16(%eax), %eax
353 test %edx, %edx
354 jnz L(strnlen_exit)
355
356 sub $16, %edi
357 jbe L(return_start_len)
358
359 pcmpeqb (%eax), %xmm1
360 pmovmskb %xmm1, %edx
361 lea 16(%eax), %eax
362 test %edx, %edx
363 jnz L(strnlen_exit)
364
365 sub $16, %edi
366 jbe L(return_start_len)
367
368 pcmpeqb (%eax), %xmm0
369 pmovmskb %xmm0, %edx
370 lea 16(%eax), %eax
371 test %edx, %edx
372 jnz L(strnlen_exit)
373
374 sub $16, %edi
375 jbe L(return_start_len)
376
377 pcmpeqb (%eax), %xmm1
378 pmovmskb %xmm1, %edx
379 lea 16(%eax), %eax
380 test %edx, %edx
381 jnz L(strnlen_exit)
382
/* No terminator within the limit: return the limit itself.  */
383 movl LEN(%esp), %eax
384 RETURN
385
/* strnlen counterpart of L(exit): decode the match mask in %edx, but check
   the match position against the remaining budget in %edi before using it.
   For a match at index N inside the block the code subtracts N + 1 from
   %edi; a borrow means the terminator lies beyond the limit, so the limit
   is returned via L(return_start_len).  Index 0 needs no check and reuses
   L(exit_tail0).  */
386 .p2align 4
387L(strnlen_exit):
388 sub %ecx, %eax
389
390 test %dl, %dl
391 jz L(strnlen_exit_high)
392 mov %dl, %cl
393 and $15, %cl
394 jz L(strnlen_exit_8)
395 test $0x01, %dl
e73015f2 396 jnz L(exit_tail0)
fc2ee42a
LD
397 test $0x02, %dl
398 jnz L(strnlen_exit_tail1)
399 test $0x04, %dl
400 jnz L(strnlen_exit_tail2)
/* Bit 3 is implied: match at index 3.  */
401 sub $4, %edi
402 jb L(return_start_len)
403 lea 3(%eax), %eax
404 RETURN
e73015f2 405
fc2ee42a
LD
406 .p2align 4
/* Lowest set bit is in bits 4..7; bit 7 is implied if 4..6 are clear.  */
407L(strnlen_exit_8):
408 test $0x10, %dl
409 jnz L(strnlen_exit_tail4)
410 test $0x20, %dl
411 jnz L(strnlen_exit_tail5)
412 test $0x40, %dl
413 jnz L(strnlen_exit_tail6)
414 sub $8, %edi
415 jb L(return_start_len)
416 lea 7(%eax), %eax
417 RETURN
e73015f2 418
fc2ee42a
LD
419 .p2align 4
/* Low mask byte is zero: decode bits 8..11 from %dh.  */
420L(strnlen_exit_high):
421 mov %dh, %ch
422 and $15, %ch
423 jz L(strnlen_exit_high_8)
424 test $0x01, %dh
425 jnz L(strnlen_exit_tail8)
426 test $0x02, %dh
427 jnz L(strnlen_exit_tail9)
e73015f2 428 test $0x04, %dh
fc2ee42a
LD
429 jnz L(strnlen_exit_tail10)
430 sub $12, %edi
431 jb L(return_start_len)
432 lea 11(%eax), %eax
433 RETURN
e73015f2 434
fc2ee42a
LD
435 .p2align 4
/* Lowest set bit is in bits 12..15; bit 15 is implied if 12..14 are clear.  */
436L(strnlen_exit_high_8):
e73015f2 437 test $0x10, %dh
fc2ee42a 438 jnz L(strnlen_exit_tail12)
e73015f2 439 test $0x20, %dh
fc2ee42a
LD
442 jnz L(strnlen_exit_tail14)
443 sub $16, %edi
444 jb L(return_start_len)
445 lea 15(%eax), %eax
446 RETURN
447
/* strnlen exit stubs.  L(strnlen_exit_tailN) handles a terminator at index N
   of the matching block: it subtracts N + 1 from the remaining budget in
   %edi and, on borrow, returns the limit through L(return_start_len);
   otherwise it returns %eax + N.  */
448 .p2align 4
449L(strnlen_exit_tail1):
450 sub $2, %edi
451 jb L(return_start_len)
452 lea 1(%eax), %eax
453 RETURN
454
455 .p2align 4
456L(strnlen_exit_tail2):
457 sub $3, %edi
458 jb L(return_start_len)
459 lea 2(%eax), %eax
460 RETURN
461
462 .p2align 4
463L(strnlen_exit_tail4):
464 sub $5, %edi
465 jb L(return_start_len)
466 lea 4(%eax), %eax
467 RETURN
468
469 .p2align 4
470L(strnlen_exit_tail5):
471 sub $6, %edi
472 jb L(return_start_len)
473 lea 5(%eax), %eax
474 RETURN
475
476 .p2align 4
477L(strnlen_exit_tail6):
478 sub $7, %edi
479 jb L(return_start_len)
480 lea 6(%eax), %eax
481 RETURN
482
483 .p2align 4
484L(strnlen_exit_tail8):
485 sub $9, %edi
486 jb L(return_start_len)
487 lea 8(%eax), %eax
488 RETURN
489
490 .p2align 4
491L(strnlen_exit_tail9):
492 sub $10, %edi
493 jb L(return_start_len)
494 lea 9(%eax), %eax
495 RETURN
496
497 .p2align 4
498L(strnlen_exit_tail10):
499 sub $11, %edi
500 jb L(return_start_len)
501 lea 10(%eax), %eax
502 RETURN
503
504 .p2align 4
505L(strnlen_exit_tail12):
506 sub $13, %edi
507 jb L(return_start_len)
508 lea 12(%eax), %eax
509 RETURN
510
511 .p2align 4
512L(strnlen_exit_tail13):
513 sub $14, %edi
514 jb L(return_start_len)
515 lea 13(%eax), %eax
e73015f2
L
516 RETURN
517
fc2ee42a
LD
518 .p2align 4
519L(strnlen_exit_tail14):
520 sub $15, %edi
521 jb L(return_start_len)
522 lea 14(%eax), %eax
523 RETURN
524
525 .p2align 4
/* The terminator (if any) is past the limit: reload the original length
   argument from the stack and return it.  */
526L(return_start_len):
527 movl LEN(%esp), %eax
528 RETURN
529
530/* The blocks below are reached only from the strnlen prolog checks. */
531
/* strnlen bounded prolog helpers.  Each is entered after a `sub $4, %edi'
   that left 4 or fewer bytes of the limit; `add $4' restores the number of
   bytes still allowed in this 4-byte group.  The bytes of the group are then
   checked one by one, stopping with the limit as the result as soon as the
   allowed count is reached (`cmp $k, %edi' / `je L(exit_tailN)', where N is
   the limit expressed as an offset from the string start).  */
532 .p2align 4
/* Limit is 0..4.  %eax is cleared here because this label is reached before
   the `xor %eax, %eax' of the main prolog; a limit of 0 returns 0 without
   reading the string.  */
533L(len_less4_prolog):
534 xor %eax, %eax
535
536 add $4, %edi
537 jz L(exit_tail0)
538
539 cmpb $0, (%edx)
540 jz L(exit_tail0)
541 cmp $1, %edi
542 je L(exit_tail1)
543
544 cmpb $0, 1(%edx)
545 jz L(exit_tail1)
546 cmp $2, %edi
547 je L(exit_tail2)
548
549 cmpb $0, 2(%edx)
550 jz L(exit_tail2)
551 cmp $3, %edi
552 je L(exit_tail3)
553
554 cmpb $0, 3(%edx)
555 jz L(exit_tail3)
556 mov $4, %eax
557 RETURN
558
559 .p2align 4
/* Bytes 4..7; 1..4 of them are within the limit.  */
560L(len_less8_prolog):
561 add $4, %edi
562
563 cmpb $0, 4(%edx)
564 jz L(exit_tail4)
565 cmp $1, %edi
566 je L(exit_tail5)
567
568 cmpb $0, 5(%edx)
569 jz L(exit_tail5)
570 cmp $2, %edi
571 je L(exit_tail6)
572
573 cmpb $0, 6(%edx)
574 jz L(exit_tail6)
575 cmp $3, %edi
576 je L(exit_tail7)
577
578 cmpb $0, 7(%edx)
579 jz L(exit_tail7)
580 mov $8, %eax
581 RETURN
582
583
584 .p2align 4
/* Bytes 8..11; 1..4 of them are within the limit.  */
585L(len_less12_prolog):
586 add $4, %edi
587
588 cmpb $0, 8(%edx)
589 jz L(exit_tail8)
590 cmp $1, %edi
591 je L(exit_tail9)
592
593 cmpb $0, 9(%edx)
594 jz L(exit_tail9)
595 cmp $2, %edi
596 je L(exit_tail10)
597
598 cmpb $0, 10(%edx)
599 jz L(exit_tail10)
600 cmp $3, %edi
601 je L(exit_tail11)
602
603 cmpb $0, 11(%edx)
604 jz L(exit_tail11)
605 mov $12, %eax
606 RETURN
607
608 .p2align 4
/* Bytes 12..15; 1..4 of them are within the limit.  */
609L(len_less16_prolog):
610 add $4, %edi
611
612 cmpb $0, 12(%edx)
613 jz L(exit_tail12)
614 cmp $1, %edi
615 je L(exit_tail13)
616
617 cmpb $0, 13(%edx)
618 jz L(exit_tail13)
619 cmp $2, %edi
620 je L(exit_tail14)
621
622 cmpb $0, 14(%edx)
623 jz L(exit_tail14)
624 cmp $3, %edi
625 je L(exit_tail15)
626
627 cmpb $0, 15(%edx)
628 jz L(exit_tail15)
629 mov $16, %eax
630 RETURN
631# endif
632
e73015f2
L
/* Shared exit stubs: L(exit_tailN) adds N to %eax and returns.  %eax is 0
   when reached from the byte-wise prolog checks, and the offset of the
   matching 16-byte block when reached from the mask decoding code, so the
   sum is the string length in both cases.  */
633 .p2align 4
634L(exit_tail1):
635 add $1, %eax
636 RETURN
637
638L(exit_tail2):
639 add $2, %eax
640 RETURN
641
642L(exit_tail3):
643 add $3, %eax
644 RETURN
645
646L(exit_tail4):
647 add $4, %eax
648 RETURN
649
650L(exit_tail5):
651 add $5, %eax
652 RETURN
653
654L(exit_tail6):
655 add $6, %eax
656 RETURN
657
658L(exit_tail7):
659 add $7, %eax
660 RETURN
661
662L(exit_tail8):
663 add $8, %eax
664 RETURN
665
666L(exit_tail9):
667 add $9, %eax
668 RETURN
669
670L(exit_tail10):
671 add $10, %eax
672 RETURN
673
674L(exit_tail11):
675 add $11, %eax
676 RETURN
677
678L(exit_tail12):
679 add $12, %eax
680 RETURN
681
682L(exit_tail13):
683 add $13, %eax
684 RETURN
685
686L(exit_tail14):
687 add $14, %eax
688 RETURN
689
690L(exit_tail15):
691 add $15, %eax
5fa16e9b 692# ifndef USE_AS_STRCAT
fc2ee42a
LD
693 RETURN
694END (STRLEN)
5fa16e9b 695# endif
e73015f2 696#endif