]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
bc6277508c158eb8702b054a17eeb266f4a3ae90
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memrchr-sse2-bsf.S
1 /* Optimized memrchr with sse2
2 Copyright (C) 2011-2014 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23
/* Unwind-info helpers.  CFI_PUSH records that the CFA moved by one
   4-byte slot and that REG is now saved at the top of the stack;
   CFI_POP records the reverse.  They emit no machine code and are also
   used on their own after a `ret' to re-establish the "REG is pushed"
   state for code that is only reached by a jump.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a register and keep the unwind info in sync.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the arguments, valid only while nothing has been
   pushed since function entry.  PARMS skips the 4-byte return address.
   STR1 is the buffer pointer, STR2 the byte to search for (only its low
   byte is used) and LEN the byte count.  */
# define PARMS	4
# define STR1	PARMS
# define STR2	STR1+4
# define LEN	STR2+4

/* Symbol name of the routine defined in this file.  */
# define MEMCHR	__memrchr_sse2_bsf
41
/* void *__memrchr_sse2_bsf (const void *s, int c, size_t n)

   Return in eax a pointer to the last byte among the N bytes starting
   at S that equals the low byte of C, or 0 if there is none.

   The three arguments are read from the stack at STR1/STR2/LEN(%esp).
   Uses eax, ecx, edx, xmm0-xmm4 and the flags.  edi is used only on the
   short, unaligned path and is saved and restored there.

   Register roles on the main (n > 16) path:
     xmm1  search byte replicated into all 16 lanes
     ecx   base address of the 64-byte window being scanned
     edx   byte-count bookkeeping relative to S (see the per-section
	   comments)
     eax   pcmpeqb/pmovmskb match mask, then the result

   The buffer is scanned from its end towards its start.  Once ecx has
   been aligned, whole aligned 16-byte blocks are loaded even if they
   begin before S; hits located before S are rejected by the L(*_1)
   stubs.  */
	.text
ENTRY (MEMCHR)
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx

	/* edx = n - 16.  n <= 16 (including n == 0) takes the short
	   path.  */
	sub	$16, %edx
	jbe	L(length_less16)

	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx		/* ecx = s + n - 16.  */
	punpcklbw %xmm1, %xmm1

	/* Unaligned check of the last 16 bytes of the buffer.  */
	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1	/* xmm1 = search byte x 16.  */
	pcmpeqb	%xmm1, %xmm0

	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Step back one 64-byte window.  If ecx is not 16-byte aligned,
	   round it up and adjust edx by the same amount, so that
	   edx == (ecx + 64) - s holds at L(loop_prolog).  */
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax
	jz	L(loop_prolog)

	add	$16, %ecx
	add	$16, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

	.p2align 4
	/* Loop start on aligned string.  */
L(loop_prolog):
	/* At most 64 bytes left: they lie inside [ecx, ecx + 64).  */
	sub	$64, %edx
	jbe	L(exit_loop)

	/* The whole window belongs to the buffer; test its four blocks
	   from the highest address down.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Second window, same scheme.  */
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Round ecx up to a 64-byte boundary for the main loop, moving
	   edx by the same amount.  Some bytes that were already tested
	   get scanned again.  `and' sets ZF itself.  */
	mov	%ecx, %eax
	and	$63, %eax
	jz	L(align64_loop)

	add	$64, %ecx
	add	$64, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

	.p2align 4
L(align64_loop):
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Merge the four compare results so that a single mask tells
	   whether the window contains any match.  This overwrites xmm0
	   and xmm2; xmm3 and xmm4 stay intact.  */
	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* There is a match somewhere in the window: find the highest
	   block that holds one.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* The results for blocks 16 and 0 were merged away above, so
	   compare those blocks again.  xmm1 is overwritten here; it is
	   not needed afterwards because every path below returns.  */
	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The match must be in block 0, so the mask is nonzero.  */
	pmovmskb %xmm1, %eax
	bsr	%eax, %eax

	add	%ecx, %eax
	ret

	.p2align 4
L(exit_loop):
	/* Undo the last subtraction: edx = (ecx + 64) - s, the number
	   of buffer bytes left.  They are the top edx bytes of the
	   window.  */
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	/* More than 32 bytes left: blocks 48 and 32 are entirely inside
	   the buffer.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Block 16 may start before s; the stub checks the bound.  */
	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes left: only blocks 48 and 32 can hold buffer
	   bytes, and either may start before s.  */
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

	/* Return stubs for blocks known to lie entirely inside the
	   buffer.  eax holds a nonzero 16-bit match mask; bsr picks the
	   highest set bit, i.e. the match at the highest address.  */
	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%eax, %ecx), %eax
	ret

	/* Bounds-checked return stubs for the final window.  edx is the
	   number of buffer bytes left (see L(exit_loop)).  A match at
	   window offset K + index is inside the buffer only if
	   edx - (64 - K) + index >= 0.  */
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %edx
	add	%eax, %edx
	jl	L(return_null)
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	16(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	32(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	48(%ecx, %eax), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

	/* Short path, s is 16-byte aligned: eax = s, edx = n (1..16).
	   Compare the block at s and keep only the low n mask bits.  */
	.p2align 4
L(length_less16_offset0):
	mov	%dl, %cl
	pcmpeqb	(%eax), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* edx = (1 << n) - 1.  */
	mov	%edx, %ecx

	pmovmskb %xmm1, %edx

	and	%ecx, %edx		/* Sets ZF if no match is left.  */
	jz	L(return_null)

	bsr	%edx, %ecx
	add	%ecx, %eax
	ret

	/* Short path entry: n <= 16.  On entry edx = n - 16 and
	   ecx = s.  */
	.p2align 4
L(length_less16):
	punpcklbw %xmm1, %xmm1
	mov	%ecx, %eax
	punpcklbw %xmm1, %xmm1
	add	$16, %edx		/* edx = n.  */
	jz	L(return_null)		/* n == 0.  */

	pshufd	$0, %xmm1, %xmm1	/* xmm1 = search byte x 16.  */
	and	$15, %ecx		/* ecx = s mod 16.  */
	jz	L(length_less16_offset0)

	/* s is not aligned.  Work on the aligned block(s) around it:
	   eax = s rounded down to 16, cl = offset of s in that block,
	   dl = n, dh = offset + n - 16.  dh > 0 means the buffer
	   continues into the next aligned block.  */
	PUSH	(%edi)
	mov	%cl, %dh
	add	%dl, %dh
	and	$-16, %eax

	sub	$16, %dh
	ja	L(length_less16_part2)

	/* The buffer lies within one aligned block.  Shift the match
	   mask so that bit 0 corresponds to s, then keep n bits.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi
	add	%ecx, %eax		/* eax = s again.  */
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* edx = (1 << n) - 1.  */

	and	%edx, %edi		/* Sets ZF if no match is left.  */
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	POP	(%edi)
	ret

	/* The code below is reached only by jumps taken while edi is
	   still pushed; re-establish the unwind state that the POP
	   above reverted.  */
	CFI_PUSH (%edi)

	.p2align 4
L(length_less16_part2):
	/* The buffer spans two aligned blocks.  Check the upper block
	   first; only its low dh bytes belong to the buffer.  */
	movdqa	16(%eax), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov	%cl, %ch		/* Keep the offset of s in ch.  */

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* edx = (1 << dh) - 1.  */

	and	%edx, %edi		/* Sets ZF if no match is left.  */
	jnz	L(length_less16_part2_return)

	/* Lower block: drop the mask bits that lie before s.  */
	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl
	sar	%cl, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	xor	%ch, %ch		/* ecx = offset of s.  */
	add	%ecx, %eax		/* eax = s + match index.  */
	POP	(%edi)
	ret

	/* Reached by a jump with edi still pushed (see above).  */
	CFI_PUSH (%edi)

	.p2align 4
L(length_less16_part2_return):
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP	(%edi)
	ret

	/* Reached by a jump with edi still pushed (see above).  */
	CFI_PUSH (%edi)

	/* NULL return for the paths that pushed edi.  */
	.p2align 4
L(ret_null):
	xor	%eax, %eax
	POP	(%edi)
	ret

END (MEMCHR)
417 #endif