]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / wcsrchr-sse2.S
/* wcsrchr with SSE2, without using bsf instructions.
   Copyright (C) 2011-2015 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
19
20 #if IS_IN (libc)
21 # include <sysdep.h>
/* Argument offsets, valid once ENTRANCE has pushed %edi:
   0(%esp) holds the saved %edi, 4(%esp) the return address, so the
   first stack argument is PARMS bytes above %esp.  */
# define PARMS		8
# define STR1		PARMS
# define STR2		STR1+4

/* Unwind-info bookkeeping for a 4-byte push/pop of REG.  These only
   emit CFI directives; they are also used on their own after an
   unconditional jump or ret to reset the CFI state for the code that
   follows in the file.  */
# define CFI_PUSH(REG)	\
  cfi_adjust_cfa_offset (4);	\
  cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
  cfi_adjust_cfa_offset (-4);	\
  cfi_restore (REG)

/* Push/pop REG and record the stack change in the unwind info.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Function entry saves %edi.  RETURN restores it and returns; the
   trailing CFI_PUSH re-establishes "%edi is saved" for whatever code
   is placed after the ret.  */
# define ENTRANCE	PUSH (%edi);
# define RETURN		POP (%edi); ret; CFI_PUSH (%edi);
38
/* wchar_t *__wcsrchr_sse2 (const wchar_t *s, wchar_t c)

   Return a pointer to the last occurrence of C in the wide string S
   (the terminating null wide character counts as part of the string),
   or NULL if C does not occur.

   Arguments are read from the stack: STR1(%esp) is S, STR2(%esp) is C.
   The result is returned in %eax.

   Register roles:
     %xmm1  C replicated into all four 32-bit lanes.
     %xmm2  all zero while scanning; receives the null-compare result.
     %edi   scan pointer; after each chunk is examined it points 16
	    bytes past the start of that chunk.
     %eax   pmovmskb mask of lanes equal to C in the current chunk.
     %ecx   pmovmskb mask of null lanes in the current chunk.
     %edx   saved match mask of the most recent chunk that contained a
	    match and no null (0 if there is none yet).
     %esi   value of %edi that goes with %edx (callee-saved, so it is
	    pushed before the main loop and popped before returning).

   pcmpeqd sets a whole 32-bit lane, so each wide character contributes
   four identical bits to a mask: bits 0-3 are the first wchar, 4-7 the
   second, 8-11 the third and 12-15 the fourth.  The mask tests below
   pick the highest matching wchar that is not past the first null
   without needing bsf/bsr.  */

	atom_text_section
ENTRY (__wcsrchr_sse2)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	mov	%ecx, %edi
	/* Two interleaves broadcast the low dword of %xmm1 to all lanes.  */
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1

/* ECX has OFFSET.  If the offset within a 64-byte line is above 48, a
   16-byte load from S would run past the end of that line, so take the
   aligned-load path instead.  */
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(crosscache)

/* Unaligned string: examine the first 16 bytes with an unaligned load.  */
	movdqu	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm2, %ecx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	test	%ecx, %ecx
	jnz	L(return_null)

	/* No match, no null: continue from the 16-byte boundary at or
	   below S + 16 (this may re-scan some bytes already seen).  */
	and	$-16, %edi

	PUSH	(%esi)

	/* No match saved yet.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match1):
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	PUSH	(%esi)

/* Save current match.  */
	mov	%eax, %edx
	mov	%edi, %esi
	and	$-16, %edi
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(crosscache):
/* Handle unaligned string: load the enclosing aligned 16 bytes and
   shift the masks right by the misalignment (%cl) so that bit 0
   corresponds to the first byte of S.  %xmm3 is used for the null
   compare so that %xmm2 stays zero even if the discarded leading
   bytes contain a null.  */
	and	$15, %ecx
	and	$-16, %edi
	pxor	%xmm3, %xmm3
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
/* Find where NULL is.  */
	pmovmskb %xmm3, %edx
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	PUSH	(%esi)

	/* No match saved yet.  */
	xor	%edx, %edx
	jmp	L(loop)

	CFI_POP	(%esi)

	.p2align 4
L(unaligned_match):
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	PUSH	(%esi)

	/* Save the match.  The mask was shifted by %ecx bytes, so bias
	   the saved pointer by the same amount.  */
	mov	%eax, %edx
	lea	(%edi, %ecx), %esi

/* Loop start on aligned string.  Unrolled four times; each step leaves
   the match mask in %eax and (null mask | match mask) in %ecx.  */
	.p2align 4
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm4, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %edi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm5, %eax
	or	%eax, %ecx
	jz	L(loop)

	.p2align 4
L(matches):
	test	%eax, %eax
	jnz	L(match)
/* A null was found and the current chunk has no usable match: report
   the highest match of the saved chunk, if any.  */
L(return_value):
	test	%edx, %edx
	jz	L(return_null_1)
	mov	%edx, %eax
	mov	%esi, %edi

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(return_null_1):
	POP	(%esi)

	xor	%eax, %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(match):
	/* %ecx was merged with the match mask in the loop; reload the
	   pure null mask.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
/* Save match info.  */
	mov	%eax, %edx
	mov	%edi, %esi
	jmp	L(loop)

/* The current chunk has both a match and a null.  Keep only the match
   bits at or before the first null; if none remain, fall back to the
   saved match.  */
	.p2align 4
L(find_zero):
	test	%cl, %cl
	jz	L(find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(find_zero_in_second_wchar)
	/* Null is the first wchar: only a match on it counts.  */
	and	$1, %eax
	jz	L(return_value)

	POP	(%esi)

	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_second_wchar):
	/* Keep bits 0-4: the first wchar and the low bit of the second.  */
	and	$((1 << 5) - 1), %eax
	jz	L(return_value)

	POP	(%esi)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(find_zero_in_fourth_wchar)
	/* Keep bits 0-8: the first two wchars and the low bit of the
	   third.  */
	and	$((1 << 9) - 1), %eax
	jz	L(return_value)

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	CFI_PUSH	(%esi)

	.p2align 4
L(find_zero_in_fourth_wchar):
	/* Null is the last wchar, so every match in this chunk is valid
	   and %eax (known to be nonzero here) needs no masking.  */

	POP	(%esi)

	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

/* No CFI_PUSH (%esi) here: every path that reaches the labels below
   either has already popped %esi or never pushed it, so the state
   left by RETURN (only %edi saved) is the correct unwind state for
   the rest of the function.  */

	.p2align 4
L(match_second_wchar):
	lea	-12(%edi), %eax
	RETURN

	.p2align 4
L(match_third_or_fourth_wchar):
	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_third_wchar):
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%edi), %eax
	RETURN

	.p2align 4
L(return_null):
	xor	%eax, %eax
	RETURN

/* The first chunk examined has both a match and a null, so there is
   no saved match and %esi was never pushed.  Same selection as
   L(find_zero), but with no match the result is NULL.  */
	.p2align 4
L(prolog_find_zero):
	/* Entered from the L(crosscache) path: bias %edi by the
	   misalignment and move the null mask into %ecx.  */
	add	%ecx, %edi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	test	%cl, %cl
	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
	test	$15, %cl
	jz	L(prolog_find_zero_in_second_wchar)
	and	$1, %eax
	jz	L(return_null)

	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	and	$((1 << 5) - 1), %eax
	jz	L(return_null)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_third_or_fourth_wchar):
	test	$15, %ch
	jz	L(prolog_find_zero_in_fourth_wchar)
	and	$((1 << 9) - 1), %eax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(prolog_find_zero_in_fourth_wchar):
	test	%ah, %ah
	jnz	L(match_third_or_fourth_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%edi), %eax
	RETURN

END (__wcsrchr_sse2)
354 #endif