git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/strchr.S
Replace FSF snail mail address with URLs.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / strchr.S
1 /* strchr with SSE4.2
2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20 #include <init-arch.h>
21
22
23 /* Define multiple versions only for the definition in libc. */
24 #ifndef NOT_IN_libc
25 .text
/* IFUNC resolver for strchr.  It does not search a string itself; it
   returns in %rax the address of the implementation to bind to.
   Selection order, exactly as coded below:
     1. __strchr_sse42        if bit_SSE4_2 is set in __cpu_features;
     2. __strchr_sse2_no_bsf  else if bit_Slow_BSF is set;
     3. __strchr_sse2         otherwise.  */
26 ENTRY(strchr)
27 .type strchr, @gnu_indirect_function
/* If the word at KIND_OFFSET is zero, call __init_cpu_features first.
   Presumably zero means "__cpu_features not filled in yet"; confirm
   against init-arch.h.
   NOTE(review): no stack adjustment is made before this call, so
   unless ENTRY adjusts %rsp the callee is entered with a stack that
   is not aligned the way the SysV AMD64 ABI expects -- confirm this
   is acceptable for __init_cpu_features.  */
28 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
29 jne 1f
30 call __init_cpu_features
/* Start with the SSE2 version as the default answer.  */
31 1: leaq __strchr_sse2(%rip), %rax
32 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
33 jz 2f
/* SSE4.2 bit set: pick the pcmpistri based version and stop here;
   the Slow_BSF check below is not reached in this case.  */
34 leaq __strchr_sse42(%rip), %rax
35 ret
/* No SSE4.2: keep the SSE2 default unless the Slow_BSF bit is set,
   in which case pick __strchr_sse2_no_bsf instead.  */
36 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
37 jz 3f
38 leaq __strchr_sse2_no_bsf(%rip), %rax
39 3: ret
40 END(strchr)
41
42
43 /*
44 This implementation uses SSE4 instructions to compare up to 16 bytes
45 at a time looking for the first occurrence of the character c in the
46 string s:
47
48 char *strchr (const char *s, int c);
49
50 We use 0x2:
51 _SIDD_SBYTE_OPS
52 | _SIDD_CMP_EQUAL_ANY
53 | _SIDD_LEAST_SIGNIFICANT
54 on pcmpistri to compare xmm/mem128
55
56 0 1 2 3 4 5 6 7 8 9 A B C D E F
57 X X X X X X X X X X X X X X X X
58
59 against xmm
60
61 0 1 2 3 4 5 6 7 8 9 A B C D E F
62 C C C C C C C C C C C C C C C C
63
64 to find out if the first 16byte data element has a byte C and the
65 offset of the first byte. There are 3 cases:
66
67 1. The first 16byte data element has the byte C at the offset X.
68 2. The first 16byte data element has EOS and doesn't have the byte C.
69 3. The first 16byte data element is valid and doesn't have the byte C.
70
71 Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
72
73 case ECX CFlag ZFlag SFlag
74 1 X 1 0/1 0
75 2 16 0 1 0
76 3 16 0 0 0
77
78 We exit from the loop for cases 1 and 2 with jbe which branches
79 when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
80 X for case 1. */
81
/* char *__strchr_sse42 (const char *s, int c)

   ABI:  SysV AMD64.
   In:   %rdi = s, %esi = c (only the low byte, %sil, is used).
   Out:  %rax = address of the first byte equal to (char) c, or 0
         (NULL) if the terminating NUL is reached first.
   Writes: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm2, flags.
   Stack:  not used by the instructions below.

   c == 0 is not handled here: control is transferred with a jump to
   __strend_sse4, which is defined elsewhere (presumably it returns
   the address of the terminating NUL -- not visible in this file).

   Register roles in the body:
     %xmm1  c replicated into all 16 bytes
     %r8    address of the 16-byte aligned chunk being examined
     %ecx   first s & 15, later the match offset written by pcmpistri  */
82 .section .text.sse4.2,"ax",@progbits
83 .align 16
84 .type __strchr_sse42, @function
85 __strchr_sse42:
86 cfi_startproc
87 CALL_MCOUNT
/* Searching for NUL itself: hand over to __strend_sse4.  */
88 testb %sil, %sil
89 je __strend_sse4
/* %xmm2 = 0.  It serves twice: as the all-zero pshufb index vector
   that copies byte 0 of %xmm1 (c) into every byte lane, and later as
   the NUL pattern for pcmpeqb.  */
90 pxor %xmm2, %xmm2
91 movd %esi, %xmm1
92 movl %edi, %ecx
93 pshufb %xmm2, %xmm1
/* %ecx = s & 15, the misalignment of s.  The movq below does not
   change flags, so the je still tests the result of this andl.  */
94 andl $15, %ecx
95 movq %rdi, %r8
96 je L(aligned_start)
97
98 /* Handle unaligned string. */
/* Round %r8 down to a 16-byte boundary and load that whole aligned
   chunk.  It may contain bytes that precede s; those are discarded
   below by shifting the masks.  */
99 andq $-16, %r8
100 movdqa (%r8), %xmm0
/* %xmm2 = 0xff in lanes holding NUL, %xmm0 = 0xff in lanes holding c.  */
101 pcmpeqb %xmm0, %xmm2
102 pcmpeqb %xmm1, %xmm0
103 /* Find where NULL is. */
104 pmovmskb %xmm2, %edx
105 /* Check if there is a match. */
106 pmovmskb %xmm0, %esi
107 /* Remove the leading bytes. */
/* After the shift bit i of each mask refers to s[i].  pmovmskb only
   sets the low 16 bits, so the sign bit is clear and the arithmetic
   shift behaves like a logical one.  */
108 sarl %cl, %edx
109 sarl %cl, %esi
110 testl %esi, %esi
111 je L(unaligned_no_match)
112 /* Check which byte is a match. */
113 bsfl %esi, %eax
114 /* Is there a NULL? */
115 testl %edx, %edx
116 je L(unaligned_match)
/* Both a match and a NUL are in this chunk: %eax = offset of the
   first match, %esi = offset of the first NUL.  */
117 bsfl %edx, %esi
118 cmpl %esi, %eax
119 /* Return NULL if NULL comes first. */
120 ja L(return_null)
121 L(unaligned_match):
/* %rax holds the match offset relative to s; turn it into a pointer.  */
122 addq %rdi, %rax
123 ret
124
125 .p2align 4
126 L(unaligned_no_match):
/* No match in the first, partial chunk.  If it contained the
   terminating NUL the search is over; otherwise fall through into
   the loop, which first advances %r8 to the next aligned chunk.  */
127 testl %edx, %edx
128 jne L(return_null)
129
130 /* Loop start on aligned string. */
/* The loop body is unrolled four times.  After each pcmpistri, jbe
   leaves the loop when CF or ZF is set, i.e. when the chunk contains
   c (CF) or the end of the string (ZF); see the flag table in the
   comment preceding this function.  */
131 L(loop):
132 addq $16, %r8
133 L(aligned_start):
134 pcmpistri $0x2, (%r8), %xmm1
135 jbe L(wrap)
136 addq $16, %r8
137 pcmpistri $0x2, (%r8), %xmm1
138 jbe L(wrap)
139 addq $16, %r8
140 pcmpistri $0x2, (%r8), %xmm1
141 jbe L(wrap)
142 addq $16, %r8
143 pcmpistri $0x2, (%r8), %xmm1
144 jbe L(wrap)
145 jmp L(loop)
146 L(wrap):
/* CF set: c was found and %ecx holds its offset within the chunk.
   Otherwise only ZF was set: end of string without a match, so fall
   through and return NULL.  */
147 jc L(loop_exit)
148
149 /* Return NULL. */
150 L(return_null):
151 xorl %eax, %eax
152 ret
153
154 /* Loop exit. */
155 .p2align 4
156 L(loop_exit):
/* Result = chunk base (%r8) + match offset written by pcmpistri.  */
157 leaq (%r8,%rcx), %rax
158 ret
159 cfi_endproc
160 .size __strchr_sse42, .-__strchr_sse42
161
162
/* Redefine ENTRY, END and libc_hidden_builtin_def before the
   #include "../strchr.S" at the end of this file.  The new ENTRY and
   END ignore their NAME argument and always emit the symbol
   __strchr_sse2, so code wrapped in ENTRY (...) / END (...) after
   this point is assembled under that name (presumably this is how
   the generic implementation in ../strchr.S becomes the SSE2 variant
   returned by the resolver -- that file is not visible here).  */
163 # undef ENTRY
164 # define ENTRY(name) \
165 .type __strchr_sse2, @function; \
166 .align 16; \
167 __strchr_sse2: cfi_startproc; \
168 CALL_MCOUNT
169 # undef END
170 # define END(name) \
171 cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
172 # undef libc_hidden_builtin_def
173 /* It doesn't make sense to send libc-internal strchr calls through a PLT.
174 The speedup we get from using SSE4.2 instruction is likely eaten away
175 by the indirect call in the PLT. */
/* Hence __GI_strchr is made a global alias of __strchr_sse2.  */
176 # define libc_hidden_builtin_def(name) \
177 .globl __GI_strchr; __GI_strchr = __strchr_sse2
178 #endif
179
180 #include "../strchr.S"