]>
Commit | Line | Data |
---|---|---|
1d3e4b61 UD |
1 | /* wcsrchr with SSE2 |
2 | Copyright (C) 2011 Free Software Foundation, Inc. | |
3 | Contributed by Intel Corporation. | |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
17 | License along with the GNU C Library; if not, write to the Free | |
18 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
19 | 02111-1307 USA. */ | |
20 | ||
21 | #include <sysdep.h> | |
22 | ||
23 | .text | |
24 | ENTRY (wcsrchr) /* In: %rdi = wide string, %rsi = wide char to find. Out: %rax = address of the last match, or 0 if there is none. */ | |
25 | ||
26 | movd %rsi, %xmm1 /* low lanes of xmm1 = search char */ | |
27 | mov %rdi, %rcx /* rcx = copy of the string pointer for the offset check */ | |
28 | punpckldq %xmm1, %xmm1 /* first of two unpacks that replicate the low dword */ | |
29 | pxor %xmm2, %xmm2 /* xmm2 = 0, compared against data to find the terminator */ | |
30 | punpckldq %xmm1, %xmm1 /* xmm1 = search char in all four dword lanes */ | |
31 | and $63, %rcx /* rcx = offset of the string within a 64-byte block */ | |
32 | cmp $48, %rcx | |
33 | ja L(crosscache) /* offset > 48: a 16-byte load would run past the 64-byte block */ | |
34 | ||
35 | movdqu (%rdi), %xmm0 /* unaligned load of the first four wchars */ | |
36 | pcmpeqd %xmm0, %xmm2 /* xmm2 lanes = all-ones where the wchar is 0 */ | |
37 | pcmpeqd %xmm1, %xmm0 /* xmm0 lanes = all-ones where the wchar equals the search char */ | |
38 | pmovmskb %xmm2, %rcx /* rcx = terminator mask, 4 bits per wchar */ | |
39 | pmovmskb %xmm0, %rax /* rax = match mask, 4 bits per wchar */ | |
40 | add $16, %rdi /* rdi = end of the chunk just examined */ | |
41 | ||
42 | test %rax, %rax | |
43 | jnz L(unaligned_match1) /* at least one match in the first chunk */ | |
44 | ||
45 | test %rcx, %rcx | |
46 | jnz L(return_null) /* terminator but no match: nothing to return */ | |
47 | ||
48 | and $-16, %rdi /* round down to a 16-byte boundary for the movdqa loop */ | |
49 | xor %r8, %r8 /* r8 = 0: no match saved yet */ | |
50 | jmp L(loop) | |
51 | ||
52 | .p2align 4 | |
53 | L(unaligned_match1): | |
54 | test %rcx, %rcx | |
55 | jnz L(prolog_find_zero_1) /* match and terminator both in the first chunk */ | |
56 | ||
57 | mov %rax, %r8 /* r8 = saved match mask */ | |
58 | mov %rdi, %rsi /* rsi = end address of the chunk that mask describes */ | |
59 | and $-16, %rdi /* round down to a 16-byte boundary for the movdqa loop */ | |
60 | jmp L(loop) | |
61 | ||
62 | .p2align 4 | |
63 | L(crosscache): | |
64 | and $15, %rcx /* rcx = offset of the string within its 16-byte block */ | |
65 | and $-16, %rdi /* rdi = string pointer rounded down to 16 bytes */ | |
66 | pxor %xmm3, %xmm3 /* xmm3 = 0 for the terminator compare; xmm2 stays zero */ | |
67 | movdqa (%rdi), %xmm0 /* aligned load of the block containing the string start */ | |
68 | pcmpeqd %xmm0, %xmm3 /* xmm3 lanes = all-ones where the wchar is 0 */ | |
69 | pcmpeqd %xmm1, %xmm0 /* xmm0 lanes = all-ones where the wchar equals the search char */ | |
70 | pmovmskb %xmm3, %rdx /* rdx = terminator mask */ | |
71 | pmovmskb %xmm0, %rax /* rax = match mask */ | |
72 | shr %cl, %rdx /* discard mask bits for bytes before the string start */ | |
73 | shr %cl, %rax /* same shift, so bit 0 now corresponds to the string start */ | |
74 | add $16, %rdi /* rdi = next aligned chunk */ | |
75 | ||
76 | test %rax, %rax | |
77 | jnz L(unaligned_match) /* at least one match in the first chunk */ | |
78 | ||
79 | test %rdx, %rdx | |
80 | jnz L(return_null) /* terminator but no match: nothing to return */ | |
81 | ||
82 | xor %r8, %r8 /* r8 = 0: no match saved yet */ | |
83 | jmp L(loop) | |
84 | ||
85 | .p2align 4 | |
86 | L(unaligned_match): | |
87 | test %rdx, %rdx | |
88 | jnz L(prolog_find_zero) /* match and terminator both in the first chunk */ | |
89 | ||
90 | mov %rax, %r8 /* r8 = saved (shifted) match mask */ | |
91 | lea (%rdi, %rcx), %rsi /* rsi = string start + 16, so mask bit 0 maps to rsi - 16 */ | |
92 | ||
93 | /* Loop start on aligned string. */ | |
94 | .p2align 4 | |
95 | L(loop): | |
96 | movdqa (%rdi), %xmm0 /* unrolled step 1 of 4: next 16 aligned bytes */ | |
97 | pcmpeqd %xmm0, %xmm2 /* xmm2 is zero here (every path in saw an empty terminator mask); becomes the terminator lanes */ | |
98 | add $16, %rdi /* rdi = end of the chunk just loaded */ | |
99 | pcmpeqd %xmm1, %xmm0 /* match lanes */ | |
100 | pmovmskb %xmm2, %rcx /* rcx = terminator mask */ | |
101 | pmovmskb %xmm0, %rax /* rax = match mask */ | |
102 | or %rax, %rcx /* rcx = terminator mask OR match mask */ | |
103 | jnz L(matches) /* leave the loop on any match or terminator */ | |
104 | ||
105 | movdqa (%rdi), %xmm3 /* unrolled step 2 of 4 */ | |
106 | pcmpeqd %xmm3, %xmm2 /* terminator lanes */ | |
107 | add $16, %rdi | |
108 | pcmpeqd %xmm1, %xmm3 /* match lanes */ | |
109 | pmovmskb %xmm2, %rcx | |
110 | pmovmskb %xmm3, %rax | |
111 | or %rax, %rcx | |
112 | jnz L(matches) | |
113 | ||
114 | movdqa (%rdi), %xmm4 /* unrolled step 3 of 4 */ | |
115 | pcmpeqd %xmm4, %xmm2 /* terminator lanes */ | |
116 | add $16, %rdi | |
117 | pcmpeqd %xmm1, %xmm4 /* match lanes */ | |
118 | pmovmskb %xmm2, %rcx | |
119 | pmovmskb %xmm4, %rax | |
120 | or %rax, %rcx | |
121 | jnz L(matches) | |
122 | ||
123 | movdqa (%rdi), %xmm5 /* unrolled step 4 of 4 */ | |
124 | pcmpeqd %xmm5, %xmm2 /* terminator lanes */ | |
125 | add $16, %rdi | |
126 | pcmpeqd %xmm1, %xmm5 /* match lanes */ | |
127 | pmovmskb %xmm2, %rcx | |
128 | pmovmskb %xmm5, %rax | |
129 | or %rax, %rcx | |
130 | jz L(loop) /* nothing found in 64 bytes: keep scanning */ | |
131 | ||
132 | .p2align 4 | |
133 | L(matches): | |
134 | test %rax, %rax | |
135 | jnz L(match) /* current chunk has a match */ | |
136 | L(return_value): /* terminator reached with no usable match in the current chunk: use the saved one */ | |
137 | test %r8, %r8 | |
138 | jz L(return_null) /* no match was ever saved */ | |
139 | mov %r8, %rax /* rax = saved match mask */ | |
140 | mov %rsi, %rdi /* rdi = end address of the saved chunk */ | |
141 | ||
142 | test $15 << 4, %ah /* mask bits 12-15: fourth wchar */ | |
143 | jnz L(match_fourth_wchar) | |
144 | test %ah, %ah /* mask bits 8-11 (12-15 known clear): third wchar */ | |
145 | jnz L(match_third_wchar) | |
146 | test $15 << 4, %al /* mask bits 4-7: second wchar */ | |
147 | jnz L(match_second_wchar) | |
148 | lea -16(%rdi), %rax /* otherwise the match is the first wchar of the chunk */ | |
149 | ret | |
150 | ||
151 | .p2align 4 | |
152 | L(match): | |
153 | pmovmskb %xmm2, %rcx /* reload the terminator mask; rcx was overwritten by the OR in the loop */ | |
154 | test %rcx, %rcx | |
155 | jnz L(find_zero) /* match and terminator in the same chunk */ | |
156 | mov %rax, %r8 /* save this chunk's match mask, replacing any earlier one */ | |
157 | mov %rdi, %rsi /* save the end address of this chunk */ | |
158 | jmp L(loop) | |
159 | ||
160 | .p2align 4 | |
161 | L(find_zero): /* locate the wchar that holds the terminator */ | |
162 | test $15, %cl /* terminator mask bits 0-3: first wchar */ | |
163 | jnz L(find_zero_in_first_wchar) | |
164 | test %cl, %cl /* bits 4-7 (0-3 known clear): second wchar */ | |
165 | jnz L(find_zero_in_second_wchar) | |
166 | test $15, %ch /* bits 8-11: third wchar */ | |
167 | jnz L(find_zero_in_third_wchar) | |
168 | ||
169 | and $1 << 13 - 1, %rax /* terminator in fourth wchar; keep match bits up to it. Presumably 0x1fff, assuming the shift binds tighter than minus - TODO confirm */ | |
170 | jz L(return_value) /* no match left in this chunk: fall back to the saved match */ | |
171 | ||
172 | test $15 << 4, %ah /* remaining bit in 12-15: fourth wchar */ | |
173 | jnz L(match_fourth_wchar) | |
174 | test %ah, %ah /* bits 8-11: third wchar */ | |
175 | jnz L(match_third_wchar) | |
176 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
177 | jnz L(match_second_wchar) | |
178 | lea -16(%rdi), %rax /* otherwise the first wchar of the chunk */ | |
179 | ret | |
180 | ||
181 | .p2align 4 | |
182 | L(find_zero_in_first_wchar): | |
183 | test $1, %rax /* only a match on the first wchar itself can count */ | |
184 | jz L(return_value) /* otherwise fall back to the saved match */ | |
185 | lea -16(%rdi), %rax /* address of the first wchar of the chunk */ | |
186 | ret | |
187 | ||
188 | .p2align 4 | |
189 | L(find_zero_in_second_wchar): | |
190 | and $1 << 5 - 1, %rax /* keep match bits up to the second wchar. Presumably 0x1f under the same precedence assumption - TODO confirm */ | |
191 | jz L(return_value) /* otherwise fall back to the saved match */ | |
192 | ||
193 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
194 | jnz L(match_second_wchar) | |
195 | lea -16(%rdi), %rax /* otherwise the first wchar of the chunk */ | |
196 | ret | |
197 | ||
198 | .p2align 4 | |
199 | L(find_zero_in_third_wchar): | |
200 | and $1 << 9 - 1, %rax /* keep match bits up to the third wchar. Presumably 0x1ff under the same precedence assumption - TODO confirm */ | |
201 | jz L(return_value) /* otherwise fall back to the saved match */ | |
202 | ||
203 | test %ah, %ah /* remaining bit in 8-15: third wchar */ | |
204 | jnz L(match_third_wchar) | |
205 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
206 | jnz L(match_second_wchar) | |
207 | lea -16(%rdi), %rax /* otherwise the first wchar of the chunk */ | |
208 | ret | |
209 | ||
210 | .p2align 4 | |
211 | L(prolog_find_zero): /* first chunk, aligned-load path: match and terminator together */ | |
212 | add %rcx, %rdi /* rdi = string start + 16, so the return offsets below are relative to the string start */ | |
213 | mov %rdx, %rcx /* rcx = shifted terminator mask, as the shared code below expects */ | |
214 | L(prolog_find_zero_1): /* same search as find_zero, but returns 0 on failure since no match has been saved yet */ | |
215 | test $15, %cl /* terminator mask bits 0-3: first wchar */ | |
216 | jnz L(prolog_find_zero_in_first_wchar) | |
217 | test %cl, %cl /* bits 4-7 (0-3 known clear): second wchar */ | |
218 | jnz L(prolog_find_zero_in_second_wchar) | |
219 | test $15, %ch /* bits 8-11: third wchar */ | |
220 | jnz L(prolog_find_zero_in_third_wchar) | |
221 | ||
222 | and $1 << 13 - 1, %rax /* terminator in fourth wchar; keep match bits up to it. Presumably 0x1fff, assuming the shift binds tighter than minus - TODO confirm */ | |
223 | jz L(return_null) | |
224 | ||
225 | test $15 << 4, %ah /* remaining bit in 12-15: fourth wchar */ | |
226 | jnz L(match_fourth_wchar) | |
227 | test %ah, %ah /* bits 8-11: third wchar */ | |
228 | jnz L(match_third_wchar) | |
229 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
230 | jnz L(match_second_wchar) | |
231 | lea -16(%rdi), %rax /* otherwise the first wchar */ | |
232 | ret | |
233 | ||
234 | .p2align 4 | |
235 | L(prolog_find_zero_in_first_wchar): | |
236 | test $1, %rax /* only a match on the first wchar itself can count */ | |
237 | jz L(return_null) | |
238 | lea -16(%rdi), %rax /* address of the first wchar */ | |
239 | ret | |
240 | ||
241 | .p2align 4 | |
242 | L(prolog_find_zero_in_second_wchar): | |
243 | and $1 << 5 - 1, %rax /* keep match bits up to the second wchar. Presumably 0x1f under the same precedence assumption - TODO confirm */ | |
244 | jz L(return_null) | |
245 | ||
246 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
247 | jnz L(match_second_wchar) | |
248 | lea -16(%rdi), %rax /* otherwise the first wchar */ | |
249 | ret | |
250 | ||
251 | .p2align 4 | |
252 | L(prolog_find_zero_in_third_wchar): | |
253 | and $1 << 9 - 1, %rax /* keep match bits up to the third wchar. Presumably 0x1ff under the same precedence assumption - TODO confirm */ | |
254 | jz L(return_null) | |
255 | ||
256 | test %ah, %ah /* remaining bit in 8-15: third wchar */ | |
257 | jnz L(match_third_wchar) | |
258 | test $15 << 4, %al /* bits 4-7: second wchar */ | |
259 | jnz L(match_second_wchar) | |
260 | lea -16(%rdi), %rax /* otherwise the first wchar */ | |
261 | ret | |
262 | ||
263 | .p2align 4 | |
264 | L(match_second_wchar): | |
265 | lea -12(%rdi), %rax /* rdi is the chunk end; the second wchar is 12 bytes before it */ | |
266 | ret | |
267 | ||
268 | .p2align 4 | |
269 | L(match_third_wchar): | |
270 | lea -8(%rdi), %rax /* third wchar is 8 bytes before the chunk end */ | |
271 | ret | |
272 | ||
273 | .p2align 4 | |
274 | L(match_fourth_wchar): | |
275 | lea -4(%rdi), %rax /* fourth wchar is 4 bytes before the chunk end */ | |
276 | ret | |
277 | ||
278 | .p2align 4 | |
279 | L(return_null): | |
280 | xor %rax, %rax /* return 0: no match found */ | |
281 | ret | |
282 | ||
283 | END (wcsrchr) |