]>
Commit | Line | Data |
---|---|---|
1d3e4b61 | 1 | /* wcsrchr with SSE2, without using bsf instructions. |
04277e02 | 2 | Copyright (C) 2011-2019 Free Software Foundation, Inc. |
1d3e4b61 UD |
3 | Contributed by Intel Corporation. |
4 | This file is part of the GNU C Library. | |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
17 | License along with the GNU C Library; if not, see |
18 | <http://www.gnu.org/licenses/>. */ | |
1d3e4b61 | 19 | |
4f41c682 | 20 | #if IS_IN (libc) |
1d3e4b61 UD |
21 | # include <sysdep.h> |
22 | # define CFI_PUSH(REG) /* CFI for a 4-byte push: CFA grows by 4, REG is saved at 0(%esp). */ \ | |
23 | cfi_adjust_cfa_offset (4); \ | |
24 | cfi_rel_offset (REG, 0) | |
25 | ||
26 | # define CFI_POP(REG) /* CFI for a 4-byte pop: CFA shrinks by 4, REG is back in its register. */ \ | |
27 | cfi_adjust_cfa_offset (-4); \ | |
28 | cfi_restore (REG) | |
29 | ||
30 | # define PUSH(REG) pushl REG; CFI_PUSH (REG) /* Push REG and record it in the unwind info. */ | |
31 | # define POP(REG) popl REG; CFI_POP (REG) /* Pop REG and record it in the unwind info. */ | |
32 | ||
33 | # define PARMS 8 /* Offset of the first argument from %esp once ENTRANCE has pushed %edi. */ | |
34 | # define ENTRANCE PUSH (%edi); /* Function prologue: only %edi is saved here. */ | |
35 | # define RETURN POP (%edi); ret; CFI_PUSH (%edi); /* Epilogue; the CFI_PUSH after ret re-declares %edi as pushed for the code that follows. */ | |
36 | # define STR1 PARMS /* First argument: the wide string pointer. */ | |
37 | # define STR2 STR1+4 /* Second argument: the wide character to search for. */ | |
38 | ||
39 | atom_text_section | |
40 | ENTRY (__wcsrchr_sse2) /* Returns in %eax a pointer to the last wide char equal to the second argument, or 0 if none is found before the terminator. */ | |
41 | ||
42 | ENTRANCE | |
43 | mov STR1(%esp), %ecx /* ecx = string pointer. */ | |
44 | movd STR2(%esp), %xmm1 /* Low dword of xmm1 = wide char to find. */ | |
45 | ||
46 | mov %ecx, %edi /* edi is the running pointer; ecx is reused for the offset. */ | |
47 | punpckldq %xmm1, %xmm1 | |
48 | pxor %xmm2, %xmm2 /* xmm2 = 0, used to detect the terminating null wchar. */ | |
49 | punpckldq %xmm1, %xmm1 /* Two unpacks broadcast the search char to all four dword lanes. */ | |
50 | ||
51 | /* ECX has OFFSET: the pointer's offset inside its 64-byte block.  An offset above 48 means a 16-byte load would cross the 64-byte boundary, so take the aligned path instead. */ | |
52 | and $63, %ecx | |
53 | cmp $48, %ecx | |
54 | ja L(crosscache) | |
55 | ||
56 | /* Possibly unaligned string: read the first 16 bytes with an unaligned load. */ | |
57 | movdqu (%edi), %xmm0 | |
58 | pcmpeqd %xmm0, %xmm2 /* xmm2 lanes become all-ones where a wchar is null. */ | |
59 | pcmpeqd %xmm1, %xmm0 /* xmm0 lanes become all-ones where a wchar matches. */ | |
60 | /* Find where NULL is (4 mask bits per wchar). */ | |
61 | pmovmskb %xmm2, %ecx | |
62 | /* Check if there is a match (4 mask bits per wchar). */ | |
63 | pmovmskb %xmm0, %eax | |
64 | add $16, %edi /* edi now points just past the chunk that was examined. */ | |
65 | ||
66 | test %eax, %eax | |
67 | jnz L(unaligned_match1) | |
68 | ||
69 | test %ecx, %ecx | |
70 | jnz L(return_null) /* Terminator seen and no match at all: result is 0. */ | |
71 | ||
72 | and $-16, %edi /* Round down to a 16-byte boundary; the loop may re-scan wchars already checked. */ | |
73 | ||
74 | PUSH (%esi) /* esi will hold the pointer belonging to the saved match mask. */ | |
75 | ||
76 | xor %edx, %edx /* edx = 0: no match saved so far. */ | |
77 | jmp L(loop) | |
78 | ||
79 | CFI_POP (%esi) /* Unwind info only: the code below is entered before esi was pushed. */ | |
80 | ||
81 | .p2align 4 | |
82 | L(unaligned_match1): /* First (unaligned) chunk has a match: eax = match mask, ecx = null mask, edi = start + 16. */ | |
83 | test %ecx, %ecx | |
84 | jnz L(prolog_find_zero_1) /* The string also ends in this chunk: resolve it there. */ | |
85 | ||
86 | PUSH (%esi) | |
87 | ||
88 | /* Save current match: edx = match mask, esi = pointer just past the chunk it belongs to. */ | |
89 | mov %eax, %edx | |
90 | mov %edi, %esi | |
91 | and $-16, %edi /* Continue from the 16-byte boundary at or below edi. */ | |
92 | jmp L(loop) | |
93 | ||
94 | CFI_POP (%esi) /* Unwind info only: the code below is entered before esi was pushed. */ | |
95 | ||
96 | .p2align 4 | |
97 | L(crosscache): | |
98 | /* Handle unaligned string: load the aligned 16-byte chunk containing the start and discard the mask bits of the bytes before it. */ | |
99 | and $15, %ecx /* ecx = misalignment of the start inside its 16-byte chunk. */ | |
100 | and $-16, %edi | |
101 | pxor %xmm3, %xmm3 /* A separate zero register, so xmm2 stays zero for the loop. */ | |
102 | movdqa (%edi), %xmm0 | |
103 | pcmpeqd %xmm0, %xmm3 | |
104 | pcmpeqd %xmm1, %xmm0 | |
105 | /* Find where NULL is. */ | |
106 | pmovmskb %xmm3, %edx | |
107 | /* Check if there is a match. */ | |
108 | pmovmskb %xmm0, %eax | |
109 | /* Remove the leading bytes, so bit 0 of each mask corresponds to the first byte of the string. */ | |
110 | shr %cl, %edx | |
111 | shr %cl, %eax | |
112 | add $16, %edi /* edi = start of the next aligned chunk. */ | |
113 | ||
114 | test %eax, %eax | |
115 | jnz L(unaligned_match) | |
116 | ||
117 | test %edx, %edx | |
118 | jnz L(return_null) /* Terminator seen and no match at all: result is 0. */ | |
119 | ||
120 | PUSH (%esi) | |
121 | ||
122 | xor %edx, %edx /* edx = 0: no match saved so far. */ | |
123 | jmp L(loop) | |
124 | ||
125 | CFI_POP (%esi) /* Unwind info only: the code below is entered before esi was pushed. */ | |
126 | ||
127 | .p2align 4 | |
128 | L(unaligned_match): /* From L(crosscache): eax = shifted match mask, edx = shifted null mask, ecx = misalignment. */ | |
129 | test %edx, %edx | |
130 | jnz L(prolog_find_zero) /* The string also ends in this chunk: resolve it there. */ | |
131 | ||
132 | PUSH (%esi) | |
133 | ||
134 | mov %eax, %edx /* Save the match mask. */ | |
135 | lea (%edi, %ecx), %esi /* esi - 16 is the address that bit 0 of the shifted mask refers to. */ | |
136 | ||
137 | /* Loop start on aligned string.  Four 16-byte chunks are examined per iteration.  xmm2 is still zero whenever the next chunk is compared, because each compare result is all-zero unless a null wchar was found, and that leaves the loop.  edx/esi hold the most recent chunk that contained a match (edx == 0 if none). */ | |
138 | .p2align 4 | |
139 | L(loop): | |
140 | movdqa (%edi), %xmm0 | |
141 | pcmpeqd %xmm0, %xmm2 | |
142 | add $16, %edi | |
143 | pcmpeqd %xmm1, %xmm0 | |
144 | pmovmskb %xmm2, %ecx /* ecx = null mask. */ | |
145 | pmovmskb %xmm0, %eax /* eax = match mask. */ | |
146 | or %eax, %ecx | |
147 | jnz L(matches) /* Leave on either a match or a terminator. */ | |
148 | ||
149 | movdqa (%edi), %xmm3 | |
150 | pcmpeqd %xmm3, %xmm2 | |
151 | add $16, %edi | |
152 | pcmpeqd %xmm1, %xmm3 | |
153 | pmovmskb %xmm2, %ecx | |
154 | pmovmskb %xmm3, %eax | |
155 | or %eax, %ecx | |
156 | jnz L(matches) | |
157 | ||
158 | movdqa (%edi), %xmm4 | |
159 | pcmpeqd %xmm4, %xmm2 | |
160 | add $16, %edi | |
161 | pcmpeqd %xmm1, %xmm4 | |
162 | pmovmskb %xmm2, %ecx | |
163 | pmovmskb %xmm4, %eax | |
164 | or %eax, %ecx | |
165 | jnz L(matches) | |
166 | ||
167 | movdqa (%edi), %xmm5 | |
168 | pcmpeqd %xmm5, %xmm2 | |
169 | add $16, %edi | |
170 | pcmpeqd %xmm1, %xmm5 | |
171 | pmovmskb %xmm2, %ecx | |
172 | pmovmskb %xmm5, %eax | |
173 | or %eax, %ecx | |
174 | jz L(loop) /* Otherwise fall through into L(matches). */ | |
175 | ||
176 | .p2align 4 | |
177 | L(matches): /* The last chunk had a match and/or a terminator; eax = its match mask, edi points just past it. */ | |
178 | test %eax, %eax | |
179 | jnz L(match) | |
180 | L(return_value): /* No usable match in the final chunk: answer comes from the saved match (edx/esi), if any. */ | |
181 | test %edx, %edx | |
182 | jz L(return_null_1) | |
183 | mov %edx, %eax /* eax = saved match mask. */ | |
184 | mov %esi, %edi /* edi = pointer just past the chunk of the saved match. */ | |
185 | ||
186 | POP (%esi) | |
187 | ||
188 | test %ah, %ah /* Mask bits 8-15: third or fourth wchar. */ | |
189 | jnz L(match_third_or_fourth_wchar) | |
190 | test $15 << 4, %al /* Mask bits 4-7: second wchar. */ | |
191 | jnz L(match_second_wchar) | |
192 | lea -16(%edi), %eax /* Otherwise the match is the first wchar of the chunk. */ | |
193 | RETURN | |
194 | ||
195 | CFI_PUSH (%esi) /* Unwind info only: the code below is entered with esi still pushed. */ | |
196 | ||
197 | .p2align 4 | |
198 | L(return_null_1): /* No match anywhere; reached with esi still pushed. */ | |
199 | POP (%esi) | |
200 | ||
201 | xor %eax, %eax /* Return 0. */ | |
202 | RETURN | |
203 | ||
204 | CFI_PUSH (%esi) /* Unwind info only: the code below is entered with esi still pushed. */ | |
205 | ||
206 | .p2align 4 | |
207 | L(match): /* The chunk just examined contains a match (eax != 0). */ | |
208 | pmovmskb %xmm2, %ecx /* Re-read the null mask; ecx was overwritten by the or in the loop. */ | |
209 | test %ecx, %ecx | |
210 | jnz L(find_zero) /* The string also ends in this chunk. */ | |
211 | /* save match info: this chunk becomes the latest known match; xmm2 is all-zero here, so the loop can resume. */ | |
212 | mov %eax, %edx | |
213 | mov %edi, %esi | |
214 | jmp L(loop) | |
215 | ||
216 | .p2align 4 | |
217 | L(find_zero): /* Final chunk holds both a match (eax) and the terminator (ecx): keep only matches at or before the first null wchar. */ | |
218 | test %cl, %cl /* No null among mask bits 0-7 (first two wchars)? */ | |
219 | jz L(find_zero_in_third_or_fourth_wchar) | |
220 | test $15, %cl /* No null among mask bits 0-3 (first wchar)? */ | |
221 | jz L(find_zero_in_second_wchar) | |
222 | and $1, %eax /* Null is the first wchar: only a match on that same wchar counts. */ | |
223 | jz L(return_value) /* None: fall back to the saved match. */ | |
224 | ||
225 | POP (%esi) | |
226 | ||
227 | lea -16(%edi), %eax /* Return the first wchar of this chunk. */ | |
228 | RETURN | |
229 | ||
230 | CFI_PUSH (%esi) /* Unwind info only: the code below is entered with esi still pushed. */ | |
231 | ||
232 | .p2align 4 | |
233 | L(find_zero_in_second_wchar): /* Terminator is the second wchar of the final chunk. */ | |
234 | and $1 << 5 - 1, %eax /* GNU as gives shifts higher precedence than minus, so this is (1 << 5) - 1 = 0x1f: first wchar plus the low bit of the second. */ | |
235 | jz L(return_value) /* No match up to the terminator: fall back to the saved match. */ | |
236 | ||
237 | POP (%esi) | |
238 | ||
239 | test $15 << 4, %al /* Match on the second wchar? */ | |
240 | jnz L(match_second_wchar) | |
241 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
242 | RETURN | |
243 | ||
244 | CFI_PUSH (%esi) /* Unwind info only: the code below is entered with esi still pushed. */ | |
245 | ||
246 | .p2align 4 | |
247 | L(find_zero_in_third_or_fourth_wchar): /* No null in the first two wchars of the final chunk. */ | |
248 | test $15, %ch /* No null among mask bits 8-11 (third wchar)? */ | |
249 | jz L(find_zero_in_fourth_wchar) | |
250 | and $1 << 9 - 1, %eax /* GNU as gives shifts higher precedence than minus, so this is (1 << 9) - 1 = 0x1ff: first two wchars plus the low bit of the third. */ | |
251 | jz L(return_value) /* No match up to the terminator: fall back to the saved match. */ | |
252 | ||
253 | POP (%esi) | |
254 | ||
255 | test %ah, %ah /* Only bit 8 can remain in ah: match on the third wchar. */ | |
256 | jnz L(match_third_wchar) | |
257 | test $15 << 4, %al /* Match on the second wchar? */ | |
258 | jnz L(match_second_wchar) | |
259 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
260 | RETURN | |
261 | ||
262 | CFI_PUSH (%esi) /* Unwind info only: the code below is entered with esi still pushed. */ | |
263 | ||
264 | .p2align 4 | |
265 | L(find_zero_in_fourth_wchar): /* Terminator is the fourth wchar of the final chunk, so every match bit in eax is valid; return the highest one. */ | |
266 | ||
267 | POP (%esi) | |
268 | ||
269 | test %ah, %ah /* Mask bits 8-15: third or fourth wchar. */ | |
270 | jnz L(match_third_or_fourth_wchar) | |
271 | test $15 << 4, %al /* Mask bits 4-7: second wchar. */ | |
272 | jnz L(match_second_wchar) | |
273 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
274 | RETURN | |
275 | ||
276 | /* No CFI_PUSH (%esi) here: every label from this point to END is entered with only %edi on the stack, so declaring %esi as pushed would make the unwind info wrong. */ | |
277 | ||
278 | .p2align 4 | |
279 | L(match_second_wchar): /* Shared tails: edi points just past the chunk holding the result; every jump here happens with only %edi on the stack. */ | |
280 | lea -12(%edi), %eax /* Second wchar of the chunk. */ | |
281 | RETURN | |
282 | ||
283 | .p2align 4 | |
284 | L(match_third_or_fourth_wchar): /* ah != 0: pick the higher of the third and fourth wchar. */ | |
285 | test $15 << 4, %ah /* Mask bits 12-15: fourth wchar. */ | |
286 | jnz L(match_fourth_wchar) | |
287 | lea -8(%edi), %eax /* Third wchar of the chunk. */ | |
288 | RETURN | |
289 | ||
290 | .p2align 4 | |
291 | L(match_third_wchar): | |
292 | lea -8(%edi), %eax /* Third wchar of the chunk. */ | |
293 | RETURN | |
294 | ||
295 | .p2align 4 | |
296 | L(match_fourth_wchar): | |
297 | lea -4(%edi), %eax /* Fourth wchar of the chunk. */ | |
298 | RETURN | |
299 | ||
300 | .p2align 4 | |
301 | L(return_null): /* No match found; used by the paths that never pushed %esi. */ | |
302 | xor %eax, %eax /* Return 0. */ | |
303 | RETURN | |
304 | ||
305 | .p2align 4 | |
306 | L(prolog_find_zero): /* From L(unaligned_match): the first chunk holds both a match and the terminator; %esi was never pushed. */ | |
307 | add %ecx, %edi /* Add the misalignment so edi - 16 is the address of mask bit 0. */ | |
308 | mov %edx, %ecx /* Move the null mask into ecx, as L(prolog_find_zero_1) expects. */ | |
309 | L(prolog_find_zero_1): /* eax = match mask, ecx = null mask, edi - 16 = address of mask bit 0.  Same selection as L(find_zero), but with no saved match to fall back on. */ | |
310 | test %cl, %cl /* No null among mask bits 0-7 (first two wchars)? */ | |
311 | jz L(prolog_find_zero_in_third_or_fourth_wchar) | |
312 | test $15, %cl /* No null among mask bits 0-3 (first wchar)? */ | |
313 | jz L(prolog_find_zero_in_second_wchar) | |
314 | and $1, %eax /* Null is the first wchar: only a match on that same wchar counts. */ | |
315 | jz L(return_null) | |
316 | ||
317 | lea -16(%edi), %eax /* Return the first wchar. */ | |
318 | RETURN | |
319 | ||
320 | .p2align 4 | |
321 | L(prolog_find_zero_in_second_wchar): /* Terminator is the second wchar. */ | |
322 | and $1 << 5 - 1, %eax /* GNU as gives shifts higher precedence than minus, so this is (1 << 5) - 1 = 0x1f. */ | |
323 | jz L(return_null) | |
324 | ||
325 | test $15 << 4, %al /* Match on the second wchar? */ | |
326 | jnz L(match_second_wchar) | |
327 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
328 | RETURN | |
329 | ||
330 | .p2align 4 | |
331 | L(prolog_find_zero_in_third_or_fourth_wchar): /* No null in the first two wchars. */ | |
332 | test $15, %ch /* No null among mask bits 8-11 (third wchar)? */ | |
333 | jz L(prolog_find_zero_in_fourth_wchar) | |
334 | and $1 << 9 - 1, %eax /* GNU as gives shifts higher precedence than minus, so this is (1 << 9) - 1 = 0x1ff. */ | |
335 | jz L(return_null) | |
336 | ||
337 | test %ah, %ah /* Only bit 8 can remain in ah: match on the third wchar. */ | |
338 | jnz L(match_third_wchar) | |
339 | test $15 << 4, %al /* Match on the second wchar? */ | |
340 | jnz L(match_second_wchar) | |
341 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
342 | RETURN | |
343 | ||
344 | .p2align 4 | |
345 | L(prolog_find_zero_in_fourth_wchar): /* Terminator is the fourth wchar, so every match bit is valid; return the highest one. */ | |
346 | test %ah, %ah /* Mask bits 8-15: third or fourth wchar. */ | |
347 | jnz L(match_third_or_fourth_wchar) | |
348 | test $15 << 4, %al /* Mask bits 4-7: second wchar. */ | |
349 | jnz L(match_second_wchar) | |
350 | lea -16(%edi), %eax /* Otherwise the match is the first wchar. */ | |
351 | RETURN | |
352 | ||
353 | END (__wcsrchr_sse2) | |
354 | #endif |