]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/i386/i686/multiarch/strlen-sse2.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / strlen-sse2.S
CommitLineData
e73015f2 1/* strlen with SSE2
b168057a 2 Copyright (C) 2010-2015 Free Software Foundation, Inc.
e73015f2
L
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
59ba27a6
PE
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
e73015f2 19
fc2ee42a
LD
20/* For strlen, only the SHARED version is optimized; for strcat, strncat and strnlen, both the STATIC and SHARED versions are optimized.  */
21
4f41c682 22#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)
fc2ee42a 23
5fa16e9b 24# ifndef USE_AS_STRCAT
e73015f2 25
5fa16e9b 26# include <sysdep.h>
fc2ee42a
LD
27# define PARMS 4
28# define STR PARMS
29# define RETURN ret
30
# ifdef USE_AS_STRNLEN
/* The strnlen build saves %edi on entry and uses it as the remaining
   length counter.  LEN is only used after that push, so it is 8 bytes
   above STR's pre-push offset: 4 for the pushed %edi plus 4 to step
   past the string pointer.  */
# define LEN PARMS + 8

/* Unwind bookkeeping for a 4-byte push of REG: the CFA is now 4 bytes
   further from %esp and REG is saved at the new top of stack.  */
# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching pop: the CFA offset shrinks by 4
   and REG holds its caller value again.  */
# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
# undef RETURN
/* Restore %edi and return.  RETURN is used in the middle of the
   function, so the code that follows it is still reached with %edi
   pushed.  The CFI_PUSH that re-establishes that state must therefore
   come AFTER the ret: placed before it, the unwind information at the
   ret instruction itself would wrongly describe %edi as still being on
   the stack and the CFA as 4 bytes off.  CFI directives emit no
   instructions, so the generated machine code is unchanged.  */
# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
# endif
46
47# ifndef STRLEN
48# define STRLEN __strlen_sse2
49# endif
e73015f2 50
/* Entry point, assembled under the name STRLEN.  The string pointer is
   read from the stack slot STR(%esp) into %edx.  In the
   USE_AS_STRNLEN build %edi is saved and then holds the length limit,
   counted down as bytes are examined.  */
48882a1a 51 atom_text_section
fc2ee42a 52ENTRY (STRLEN)
e73015f2 53 mov STR(%esp), %edx
fc2ee42a
LD
54# ifdef USE_AS_STRNLEN
/* LEN is PARMS + 8, which already accounts for the 4 bytes this push
   adds below the arguments.  */
55 PUSH (%edi)
56 movl LEN(%esp), %edi
/* %edi = limit - 4.  jbe is the unsigned "<=" test, so a limit of 4 or
   less (including 0) goes to the bounded prolog.  */
57 sub $4, %edi
58 jbe L(len_less4_prolog)
59# endif
5fa16e9b 60# endif
/* Check the first 16 bytes at (%edx) one at a time.  %eax is zeroed
   first, so jumping to L(exit_tailN) (which adds N to %eax) returns N,
   the index of the first NUL byte.  In the USE_AS_STRNLEN build %edi is
   reduced by 4 after each group of four bytes; when that leaves no
   budget (jbe) the matching L(len_lessN_prolog) finishes the job with
   explicit limit checks.  */
e73015f2
L
61 xor %eax, %eax
62 cmpb $0, (%edx)
63 jz L(exit_tail0)
64 cmpb $0, 1(%edx)
65 jz L(exit_tail1)
66 cmpb $0, 2(%edx)
67 jz L(exit_tail2)
68 cmpb $0, 3(%edx)
69 jz L(exit_tail3)
fc2ee42a
LD
70
71# ifdef USE_AS_STRNLEN
/* %edi = limit - 8; taken when the limit is 8 or less.  */
72 sub $4, %edi
73 jbe L(len_less8_prolog)
74# endif
75
e73015f2
L
76 cmpb $0, 4(%edx)
77 jz L(exit_tail4)
78 cmpb $0, 5(%edx)
79 jz L(exit_tail5)
80 cmpb $0, 6(%edx)
81 jz L(exit_tail6)
82 cmpb $0, 7(%edx)
83 jz L(exit_tail7)
fc2ee42a
LD
84
85# ifdef USE_AS_STRNLEN
/* %edi = limit - 12; taken when the limit is 12 or less.  */
86 sub $4, %edi
87 jbe L(len_less12_prolog)
88# endif
89
e73015f2
L
90 cmpb $0, 8(%edx)
91 jz L(exit_tail8)
92 cmpb $0, 9(%edx)
93 jz L(exit_tail9)
94 cmpb $0, 10(%edx)
95 jz L(exit_tail10)
96 cmpb $0, 11(%edx)
97 jz L(exit_tail11)
fc2ee42a
LD
98
99# ifdef USE_AS_STRNLEN
/* %edi = limit - 16; taken when the limit is 16 or less.  */
100 sub $4, %edi
101 jbe L(len_less16_prolog)
102# endif
103
e73015f2
L
104 cmpb $0, 12(%edx)
105 jz L(exit_tail12)
106 cmpb $0, 13(%edx)
107 jz L(exit_tail13)
108 cmpb $0, 14(%edx)
109 jz L(exit_tail14)
110 cmpb $0, 15(%edx)
111 jz L(exit_tail15)
fc2ee42a 112
/* Switch to 16-byte SSE2 compares.  %ecx = s + 16 is kept as the
   reference point: every block below advances %eax by 16 before it
   branches, so L(exit) can compute %eax - %ecx to get the offset of the
   block that contained the NUL.  %eax is rounded down to a 16-byte
   boundary so the memory operands of pcmpeqb are aligned; this may
   re-examine a few bytes that the byte-wise checks already covered.  */
e73015f2 113 pxor %xmm0, %xmm0
fc2ee42a
LD
114 lea 16(%edx), %eax
115 mov %eax, %ecx
e73015f2 116 and $-16, %eax
fc2ee42a
LD
117
118# ifdef USE_AS_STRNLEN
/* %edi is limit - 16 here.  The aligned scan starts (s & 15) bytes
   before s + 16, so that many bytes are added back to the budget.
   Then 64 bytes are reserved for the four compares below; if the
   budget does not exceed 64, L(len_less64) scans with per-block limit
   checks instead.  */
119 and $15, %edx
120 add %edx, %edi
121 sub $64, %edi
122 jbe L(len_less64)
123# endif
e73015f2
L
124
/* Each step: compare 16 bytes against zero, collect the per-byte
   results into a 16-bit mask in %edx, and leave via L(exit) if any bit
   is set.  The lea sits between test and jnz because it does not
   modify the flags.  The pxor of the next register is interleaved
   here.  */
125 pcmpeqb (%eax), %xmm0
126 pmovmskb %xmm0, %edx
127 pxor %xmm1, %xmm1
128 test %edx, %edx
129 lea 16(%eax), %eax
130 jnz L(exit)
131
132 pcmpeqb (%eax), %xmm1
133 pmovmskb %xmm1, %edx
134 pxor %xmm2, %xmm2
135 test %edx, %edx
136 lea 16(%eax), %eax
137 jnz L(exit)
138
e73015f2
L
139 pcmpeqb (%eax), %xmm2
140 pmovmskb %xmm2, %edx
141 pxor %xmm3, %xmm3
142 test %edx, %edx
143 lea 16(%eax), %eax
144 jnz L(exit)
145
146 pcmpeqb (%eax), %xmm3
147 pmovmskb %xmm3, %edx
148 test %edx, %edx
149 lea 16(%eax), %eax
150 jnz L(exit)
151
fc2ee42a
LD
152# ifdef USE_AS_STRNLEN
/* Reserve budget for the next 64 bytes.  */
153 sub $64, %edi
154 jbe L(len_less64)
155# endif
156
e73015f2
L
/* From here on %xmm0-%xmm3 are reused without being cleared again:
   falling through a compare means its mask was zero, so the register
   still holds all-zero bytes.  */
157 pcmpeqb (%eax), %xmm0
158 pmovmskb %xmm0, %edx
159 test %edx, %edx
160 lea 16(%eax), %eax
161 jnz L(exit)
162
163 pcmpeqb (%eax), %xmm1
164 pmovmskb %xmm1, %edx
165 test %edx, %edx
166 lea 16(%eax), %eax
167 jnz L(exit)
168
169 pcmpeqb (%eax), %xmm2
170 pmovmskb %xmm2, %edx
171 test %edx, %edx
172 lea 16(%eax), %eax
173 jnz L(exit)
174
175 pcmpeqb (%eax), %xmm3
176 pmovmskb %xmm3, %edx
177 test %edx, %edx
178 lea 16(%eax), %eax
179 jnz L(exit)
180
fc2ee42a
LD
181# ifdef USE_AS_STRNLEN
/* Reserve budget for the next 64 bytes.  */
182 sub $64, %edi
183 jbe L(len_less64)
184# endif
185
e73015f2
L
186 pcmpeqb (%eax), %xmm0
187 pmovmskb %xmm0, %edx
188 test %edx, %edx
189 lea 16(%eax), %eax
190 jnz L(exit)
191
192 pcmpeqb (%eax), %xmm1
193 pmovmskb %xmm1, %edx
194 test %edx, %edx
195 lea 16(%eax), %eax
196 jnz L(exit)
197
198 pcmpeqb (%eax), %xmm2
199 pmovmskb %xmm2, %edx
200 test %edx, %edx
201 lea 16(%eax), %eax
202 jnz L(exit)
203
204 pcmpeqb (%eax), %xmm3
205 pmovmskb %xmm3, %edx
206 test %edx, %edx
207 lea 16(%eax), %eax
208 jnz L(exit)
209
fc2ee42a
LD
210# ifdef USE_AS_STRNLEN
/* Reserve budget for the next 64 bytes.  */
211 sub $64, %edi
212 jbe L(len_less64)
213# endif
214
e73015f2
L
215 pcmpeqb (%eax), %xmm0
216 pmovmskb %xmm0, %edx
217 test %edx, %edx
218 lea 16(%eax), %eax
219 jnz L(exit)
220
221 pcmpeqb (%eax), %xmm1
222 pmovmskb %xmm1, %edx
223 test %edx, %edx
224 lea 16(%eax), %eax
225 jnz L(exit)
226
227 pcmpeqb (%eax), %xmm2
228 pmovmskb %xmm2, %edx
229 test %edx, %edx
230 lea 16(%eax), %eax
231 jnz L(exit)
232
233 pcmpeqb (%eax), %xmm3
234 pmovmskb %xmm3, %edx
235 test %edx, %edx
236 lea 16(%eax), %eax
237 jnz L(exit)
238
fc2ee42a
LD
239# ifdef USE_AS_STRNLEN
/* %eax is about to be rounded down to a 64-byte boundary; give the
   budget back the (%eax & 63) bytes that will be scanned again.  */
240 mov %eax, %edx
241 and $63, %edx
242 add %edx, %edi
243# endif
244
/* Align %eax down to 64 bytes for the main loop.  */
e73015f2 245 and $-0x40, %eax
fc2ee42a
LD
246
/* Main loop: examine 64 aligned bytes per iteration.  The four 16-byte
   vectors are folded with pminub (unsigned byte minimum), so the folded
   vector has a zero byte exactly when one of the 64 input bytes is
   zero.  %xmm3 holds all-zero bytes on entry (its last compare produced
   an empty mask) and serves as the zero reference.  */
247 .p2align 4
248L(aligned_64_loop):
249# ifdef USE_AS_STRNLEN
/* Stop looping once the remaining budget is 64 bytes or less.  */
250 sub $64, %edi
251 jbe L(len_less64)
252# endif
5fa16e9b
LD
253 movaps (%eax), %xmm0
254 movaps 16(%eax), %xmm1
255 movaps 32(%eax), %xmm2
256 movaps 48(%eax), %xmm6
/* %xmm0 and %xmm2 are overwritten by the folding; %xmm1 and %xmm6 keep
   the loaded data.  */
257 pminub %xmm1, %xmm0
258 pminub %xmm6, %xmm2
259 pminub %xmm0, %xmm2
260 pcmpeqb %xmm3, %xmm2
261 pmovmskb %xmm2, %edx
262 test %edx, %edx
e73015f2 263 lea 64(%eax), %eax
fc2ee42a 264 jz L(aligned_64_loop)
5fa16e9b
LD
265
/* A NUL is somewhere in the 64 bytes just examined; %eax already points
   64 bytes past their start.  Test the four 16-byte pieces in order.
   L(exit) computes %eax - %ecx, so %ecx is moved from s + 16 to s + 64
   for the first piece and then stepped back by 16 for each later one,
   which makes the difference equal the offset of the piece tested.
   Pieces 0 and 2 are re-read from memory because their registers were
   overwritten above.  */
266 pcmpeqb -64(%eax), %xmm3
267 pmovmskb %xmm3, %edx
268 test %edx, %edx
269 lea 48(%ecx), %ecx
270 jnz L(exit)
271
272 pcmpeqb %xmm1, %xmm3
273 pmovmskb %xmm3, %edx
e73015f2 274 test %edx, %edx
5fa16e9b
LD
275 lea -16(%ecx), %ecx
276 jnz L(exit)
277
278 pcmpeqb -32(%eax), %xmm3
279 pmovmskb %xmm3, %edx
280 test %edx, %edx
281 lea -16(%ecx), %ecx
282 jnz L(exit)
283
/* Last piece: no test is made; control falls through into L(exit)
   with this mask.  */
284 pcmpeqb %xmm6, %xmm3
285 pmovmskb %xmm3, %edx
286 lea -16(%ecx), %ecx
/* Turn a non-empty 16-bit NUL mask in %edx into the final length.
   %eax - %ecx is the offset of the 16-byte block the mask describes;
   the index of the lowest set mask bit is then added.  The mask is
   narrowed by byte (%dl / %dh) and then by nibble before individual
   bits are tested.  %cl and %ch are used as scratch, which is safe
   because %ecx is no longer needed after the subtraction.  */
e73015f2
L
287L(exit):
288 sub %ecx, %eax
/* No bit in the low byte: the NUL is at index 8..15.  */
289 test %dl, %dl
290 jz L(exit_high)
fc2ee42a
LD
291
/* No bit in the low nibble: the NUL is at index 4..7.  */
292 mov %dl, %cl
293 and $15, %cl
294 jz L(exit_8)
e73015f2
L
295 test $0x01, %dl
296 jnz L(exit_tail0)
e73015f2
L
297 test $0x02, %dl
298 jnz L(exit_tail1)
e73015f2
L
299 test $0x04, %dl
300 jnz L(exit_tail2)
fc2ee42a
LD
/* Bits 0-2 are clear but the nibble is not zero, so it is bit 3.  */
301 add $3, %eax
302 RETURN
e73015f2 303
fc2ee42a
LD
304 .p2align 4
305L(exit_8):
e73015f2
L
306 test $0x10, %dl
307 jnz L(exit_tail4)
e73015f2
L
308 test $0x20, %dl
309 jnz L(exit_tail5)
e73015f2
L
310 test $0x40, %dl
311 jnz L(exit_tail6)
/* Only bit 7 is left.  */
312 add $7, %eax
e73015f2
L
313 RETURN
314
fc2ee42a 315 .p2align 4
/* Same decoding for the high mask byte (indices 8..15).  */
e73015f2 316L(exit_high):
fc2ee42a
LD
317 mov %dh, %ch
318 and $15, %ch
319 jz L(exit_high_8)
e73015f2 320 test $0x01, %dh
fc2ee42a
LD
321 jnz L(exit_tail8)
322 test $0x02, %dh
323 jnz L(exit_tail9)
324 test $0x04, %dh
325 jnz L(exit_tail10)
326 add $11, %eax
327 RETURN
328
329 .p2align 4
330L(exit_high_8):
331 test $0x10, %dh
332 jnz L(exit_tail12)
333 test $0x20, %dh
334 jnz L(exit_tail13)
335 test $0x40, %dh
336 jnz L(exit_tail14)
/* Index 15: add it and fall through into the shared return below.  */
337 add $15, %eax
/* L(exit_tail0) adds nothing: %eax already holds the result.  */
338L(exit_tail0):
339 RETURN
340
341# ifdef USE_AS_STRNLEN
342
/* strnlen only: reached when 64 bytes or fewer of the limit remain at
   %eax.  The 64 that the caller subtracted is added back so %edi is the
   number of bytes still allowed from %eax.  Up to four 16-byte blocks
   are compared; after each block without a NUL, 16 is taken off the
   budget and the limit is returned once the budget is used up.  A
   non-empty mask goes to L(strnlen_exit), which checks that the NUL
   found is within the budget.  */
343 .p2align 4
344L(len_less64):
345 pxor %xmm0, %xmm0
346 add $64, %edi
347
348 pcmpeqb (%eax), %xmm0
349 pmovmskb %xmm0, %edx
350 pxor %xmm1, %xmm1
351 lea 16(%eax), %eax
352 test %edx, %edx
353 jnz L(strnlen_exit)
354
355 sub $16, %edi
356 jbe L(return_start_len)
357
358 pcmpeqb (%eax), %xmm1
359 pmovmskb %xmm1, %edx
360 lea 16(%eax), %eax
361 test %edx, %edx
362 jnz L(strnlen_exit)
363
364 sub $16, %edi
365 jbe L(return_start_len)
366
/* %xmm0 and %xmm1 are reused as zero: their earlier compares produced
   empty masks, so they still hold all-zero bytes.  */
367 pcmpeqb (%eax), %xmm0
368 pmovmskb %xmm0, %edx
369 lea 16(%eax), %eax
370 test %edx, %edx
371 jnz L(strnlen_exit)
372
373 sub $16, %edi
374 jbe L(return_start_len)
375
376 pcmpeqb (%eax), %xmm1
377 pmovmskb %xmm1, %edx
378 lea 16(%eax), %eax
379 test %edx, %edx
380 jnz L(strnlen_exit)
381
/* No NUL in any of the four blocks: the result is the limit itself,
   reloaded from the stack.  */
382 movl LEN(%esp), %eax
383 RETURN
384
/* strnlen counterpart of L(exit).  %edx holds a non-empty NUL mask for
   the 16-byte block just compared, %eax - %ecx is that block's offset
   from the start of the string, and %edi is the number of bytes the
   limit still allows counted from the start of that block.  For a NUL
   at index k the code subtracts k + 1 from %edi; a borrow (jb) means
   the NUL lies at or beyond the limit and the limit is returned
   instead.  Index 0 goes straight to L(exit_tail0) without a check,
   since the paths into L(len_less64) and within it leave via jbe
   whenever the budget is not positive.  */
385 .p2align 4
386L(strnlen_exit):
387 sub %ecx, %eax
388
389 test %dl, %dl
390 jz L(strnlen_exit_high)
391 mov %dl, %cl
392 and $15, %cl
393 jz L(strnlen_exit_8)
394 test $0x01, %dl
e73015f2 395 jnz L(exit_tail0)
fc2ee42a
LD
396 test $0x02, %dl
397 jnz L(strnlen_exit_tail1)
398 test $0x04, %dl
399 jnz L(strnlen_exit_tail2)
/* NUL at index 3.  */
400 sub $4, %edi
401 jb L(return_start_len)
402 lea 3(%eax), %eax
403 RETURN
e73015f2 404
fc2ee42a
LD
405 .p2align 4
406L(strnlen_exit_8):
407 test $0x10, %dl
408 jnz L(strnlen_exit_tail4)
409 test $0x20, %dl
410 jnz L(strnlen_exit_tail5)
411 test $0x40, %dl
412 jnz L(strnlen_exit_tail6)
/* NUL at index 7.  */
413 sub $8, %edi
414 jb L(return_start_len)
415 lea 7(%eax), %eax
416 RETURN
e73015f2 417
fc2ee42a
LD
418 .p2align 4
/* High mask byte: NUL at index 8..15.  */
419L(strnlen_exit_high):
420 mov %dh, %ch
421 and $15, %ch
422 jz L(strnlen_exit_high_8)
423 test $0x01, %dh
424 jnz L(strnlen_exit_tail8)
425 test $0x02, %dh
426 jnz L(strnlen_exit_tail9)
e73015f2 427 test $0x04, %dh
fc2ee42a
LD
428 jnz L(strnlen_exit_tail10)
/* NUL at index 11.  */
429 sub $12, %edi
430 jb L(return_start_len)
431 lea 11(%eax), %eax
432 RETURN
e73015f2 433
fc2ee42a
LD
434 .p2align 4
435L(strnlen_exit_high_8):
e73015f2 436 test $0x10, %dh
fc2ee42a 437 jnz L(strnlen_exit_tail12)
e73015f2 438 test $0x20, %dh
fc2ee42a 439 jnz L(strnlen_exit_tail13)
e73015f2 440 test $0x40, %dh
fc2ee42a
LD
441 jnz L(strnlen_exit_tail14)
/* NUL at index 15.  */
442 sub $16, %edi
443 jb L(return_start_len)
444 lea 15(%eax), %eax
445 RETURN
446
/* L(strnlen_exit_tailK): NUL at index K of the block.  Return the block
   offset plus K if at least K + 1 bytes of budget remain, otherwise the
   limit.  */
447 .p2align 4
448L(strnlen_exit_tail1):
449 sub $2, %edi
450 jb L(return_start_len)
451 lea 1(%eax), %eax
452 RETURN
453
454 .p2align 4
455L(strnlen_exit_tail2):
456 sub $3, %edi
457 jb L(return_start_len)
458 lea 2(%eax), %eax
459 RETURN
460
461 .p2align 4
462L(strnlen_exit_tail4):
463 sub $5, %edi
464 jb L(return_start_len)
465 lea 4(%eax), %eax
466 RETURN
467
468 .p2align 4
469L(strnlen_exit_tail5):
470 sub $6, %edi
471 jb L(return_start_len)
472 lea 5(%eax), %eax
473 RETURN
474
475 .p2align 4
476L(strnlen_exit_tail6):
477 sub $7, %edi
478 jb L(return_start_len)
479 lea 6(%eax), %eax
480 RETURN
481
482 .p2align 4
483L(strnlen_exit_tail8):
484 sub $9, %edi
485 jb L(return_start_len)
486 lea 8(%eax), %eax
487 RETURN
488
489 .p2align 4
490L(strnlen_exit_tail9):
491 sub $10, %edi
492 jb L(return_start_len)
493 lea 9(%eax), %eax
494 RETURN
495
496 .p2align 4
497L(strnlen_exit_tail10):
498 sub $11, %edi
499 jb L(return_start_len)
500 lea 10(%eax), %eax
501 RETURN
502
503 .p2align 4
504L(strnlen_exit_tail12):
505 sub $13, %edi
506 jb L(return_start_len)
507 lea 12(%eax), %eax
508 RETURN
509
510 .p2align 4
511L(strnlen_exit_tail13):
512 sub $14, %edi
513 jb L(return_start_len)
514 lea 13(%eax), %eax
e73015f2
L
515 RETURN
516
fc2ee42a
LD
517 .p2align 4
518L(strnlen_exit_tail14):
519 sub $15, %edi
520 jb L(return_start_len)
521 lea 14(%eax), %eax
522 RETURN
523
/* The limit was reached before any NUL: return the limit, reloaded
   from its stack slot (still valid because %edi has not been popped
   yet).  */
524 .p2align 4
525L(return_start_len):
526 movl LEN(%esp), %eax
527 RETURN
528
529/* Bounded prologs, reached only from the byte-wise checks at the start
   of the function (strnlen build).  Each one handles the case where the
   limit ends inside one group of four bytes: it undoes the "sub $4" so
   that %edi is the number of bytes allowed in this group, then
   alternates NUL checks with limit checks.  */
530
531 .p2align 4
/* Limit is 0..4.  The branch here is taken before %eax is cleared, so
   it is cleared now.  */
532L(len_less4_prolog):
533 xor %eax, %eax
534
/* %edi = limit; a limit of 0 returns 0 without reading any byte.  */
535 add $4, %edi
536 jz L(exit_tail0)
537
538 cmpb $0, (%edx)
539 jz L(exit_tail0)
/* Limit of 1 reached with byte 0 non-NUL: return 1.  */
540 cmp $1, %edi
541 je L(exit_tail1)
542
543 cmpb $0, 1(%edx)
544 jz L(exit_tail1)
545 cmp $2, %edi
546 je L(exit_tail2)
547
548 cmpb $0, 2(%edx)
549 jz L(exit_tail2)
550 cmp $3, %edi
551 je L(exit_tail3)
552
553 cmpb $0, 3(%edx)
554 jz L(exit_tail3)
/* Limit is 4 and none of the four bytes is NUL.  */
555 mov $4, %eax
556 RETURN
557
558 .p2align 4
/* Limit is 5..8; bytes 0-3 are known to be non-NUL and %eax is 0.
   %edi becomes limit - 4, i.e. 1..4 bytes allowed in this group.  */
559L(len_less8_prolog):
560 add $4, %edi
561
562 cmpb $0, 4(%edx)
563 jz L(exit_tail4)
564 cmp $1, %edi
565 je L(exit_tail5)
566
567 cmpb $0, 5(%edx)
568 jz L(exit_tail5)
569 cmp $2, %edi
570 je L(exit_tail6)
571
572 cmpb $0, 6(%edx)
573 jz L(exit_tail6)
574 cmp $3, %edi
575 je L(exit_tail7)
576
577 cmpb $0, 7(%edx)
578 jz L(exit_tail7)
579 mov $8, %eax
580 RETURN
581
582
583 .p2align 4
/* Limit is 9..12; bytes 0-7 are known to be non-NUL.  */
584L(len_less12_prolog):
585 add $4, %edi
586
587 cmpb $0, 8(%edx)
588 jz L(exit_tail8)
589 cmp $1, %edi
590 je L(exit_tail9)
591
592 cmpb $0, 9(%edx)
593 jz L(exit_tail9)
594 cmp $2, %edi
595 je L(exit_tail10)
596
597 cmpb $0, 10(%edx)
598 jz L(exit_tail10)
599 cmp $3, %edi
600 je L(exit_tail11)
601
602 cmpb $0, 11(%edx)
603 jz L(exit_tail11)
604 mov $12, %eax
605 RETURN
606
607 .p2align 4
/* Limit is 13..16; bytes 0-11 are known to be non-NUL.  */
608L(len_less16_prolog):
609 add $4, %edi
610
611 cmpb $0, 12(%edx)
612 jz L(exit_tail12)
613 cmp $1, %edi
614 je L(exit_tail13)
615
616 cmpb $0, 13(%edx)
617 jz L(exit_tail13)
618 cmp $2, %edi
619 je L(exit_tail14)
620
621 cmpb $0, 14(%edx)
622 jz L(exit_tail14)
623 cmp $3, %edi
624 je L(exit_tail15)
625
626 cmpb $0, 15(%edx)
627 jz L(exit_tail15)
628 mov $16, %eax
629 RETURN
630# endif
631
/* Shared return stubs.  L(exit_tailN) adds N to %eax and returns.
   Callers arrive with %eax holding either 0 (byte-wise checks and
   bounded prologs) or the offset of a 16-byte block (L(exit) and
   L(strnlen_exit)), so the result is the index of the NUL byte or, for
   the strnlen limit checks, the limit.  */
e73015f2
L
632 .p2align 4
633L(exit_tail1):
634 add $1, %eax
635 RETURN
636
637L(exit_tail2):
638 add $2, %eax
639 RETURN
640
641L(exit_tail3):
642 add $3, %eax
643 RETURN
644
645L(exit_tail4):
646 add $4, %eax
647 RETURN
648
649L(exit_tail5):
650 add $5, %eax
651 RETURN
652
653L(exit_tail6):
654 add $6, %eax
655 RETURN
656
657L(exit_tail7):
658 add $7, %eax
659 RETURN
660
661L(exit_tail8):
662 add $8, %eax
663 RETURN
664
665L(exit_tail9):
666 add $9, %eax
667 RETURN
668
669L(exit_tail10):
670 add $10, %eax
671 RETURN
672
673L(exit_tail11):
674 add $11, %eax
675 RETURN
676
677L(exit_tail12):
678 add $12, %eax
679 RETURN
680
681L(exit_tail13):
682 add $13, %eax
683 RETURN
684
685L(exit_tail14):
686 add $14, %eax
687 RETURN
688
689L(exit_tail15):
690 add $15, %eax
/* With USE_AS_STRCAT defined, the final RETURN and END are left out, so
   this stub ends without returning; presumably the file that includes
   this one continues with its own code here -- TODO confirm against
   the strcat sources.  */
5fa16e9b 691# ifndef USE_AS_STRCAT
fc2ee42a
LD
692 RETURN
693END (STRLEN)
5fa16e9b 694# endif
e73015f2 695#endif