]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S
Optimized memchr, memrchr, rawmemchr for x86-32
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memrchr-sse2-bsf.S
1 /* Optimized memrchr with sse2
2 Copyright (C) 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21 #ifndef NOT_IN_libc
22
23 # include <sysdep.h>
24
/* CFI_PUSH (REG): unwind annotations for a 4-byte push of REG.  Expands to
   cfi_adjust_cfa_offset (4) followed by cfi_rel_offset (REG, 0).  Those two
   helpers are not defined in this file -- presumably they come from
   <sysdep.h>; confirm there.  */
25 # define CFI_PUSH(REG) \
26 cfi_adjust_cfa_offset (4); \
27 cfi_rel_offset (REG, 0)
28
/* CFI_POP (REG): the inverse annotations for a 4-byte pop of REG.  Expands
   to cfi_adjust_cfa_offset (-4) followed by cfi_restore (REG).  */
29 # define CFI_POP(REG) \
30 cfi_adjust_cfa_offset (-4); \
31 cfi_restore (REG)
32
/* PUSH / POP: the pushl / popl instruction paired with the matching
   CFI_PUSH / CFI_POP annotations above.  */
33 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
34 # define POP(REG) popl REG; CFI_POP (REG)
35
/* Displacements from %esp, as used at function entry, of the three stack
   arguments.  They expand textually to 4, 4+4 and 4+4+4; the expansions
   are not parenthesized and are only used here as NAME(%esp).
   NOTE(review): the leading 4 presumably skips the return address pushed
   by the caller's call instruction -- confirm against the i386 ABI.  */
36 # define PARMS 4
37 # define STR1 PARMS
38 # define STR2 STR1+4
39 # define LEN STR2+4
40
/* Symbol name under which the routine below is emitted.  */
41 # define MEMCHR __memrchr_sse2_bsf
42
43 .text
44 ENTRY (MEMCHR)
/* __memrchr_sse2_bsf: search a byte range backwards for a byte value.

   Stack arguments read at entry:
     STR1(%esp)  start address of the range
     STR2(%esp)  value searched for (only its low byte is used)
     LEN(%esp)   number of bytes in the range
   Result in %eax: address of the highest-addressed byte inside the range
   that equals the value, or 0 when no byte matches.

   Register roles on the main (count > 16) path:
     %ecx   base address of the 64-byte window currently examined
     %edx   running count of range bytes not yet ruled out (its exact bias
            is noted at each label)
     %xmm1  the search byte replicated into all 16 byte lanes
     %eax   scratch and match bitmask, finally the return value
   %edi is used only on the short unaligned path, where it is pushed and
   popped around its use.  */
45 mov STR1(%esp), %ecx /* ecx = start address of the range */
46 movd STR2(%esp), %xmm1 /* low dword of xmm1 = value argument */
47 mov LEN(%esp), %edx /* edx = byte count */
48
49 sub $16, %edx /* edx = count - 16 */
50 jbe L(length_less16) /* count <= 16 (0 included): short path */
51
52 punpcklbw %xmm1, %xmm1 /* replicate the search byte: byte -> word */
53 add %edx, %ecx /* ecx = start + count - 16: the last 16 bytes */
54 punpcklbw %xmm1, %xmm1 /* word -> dword */
55
56 movdqu (%ecx), %xmm0 /* unaligned load of the last 16 bytes */
57 pshufd $0, %xmm1, %xmm1 /* dword 0 -> every dword: 16 copies of the byte */
58 pcmpeqb %xmm1, %xmm0 /* 0xff in every lane equal to the search byte */
59
60 /* Check if there is a match. */
61 pmovmskb %xmm0, %eax /* eax bit i set <=> byte i of the block matched */
62 test %eax, %eax
63 jnz L(matches0)
64 /* No match in the last 16 bytes: the unexamined bytes now end at ecx. */
65 sub $64, %ecx /* window [ecx, ecx+64) ends where they end */
66 mov %ecx, %eax
67 and $15, %eax /* eax = misalignment of ecx within 16 bytes */
68 jz L(loop_prolog) /* already 16-byte aligned */
69 /* Round ecx up to a 16-byte boundary.  edx grows by the same 16 - eax
   bytes; those bytes were already compared by the unaligned load above. */
70 add $16, %ecx
71 add $16, %edx
72 sub %eax, %ecx
73 sub %eax, %edx
74
75 .p2align 4
76 /* Loop start on aligned string. */
77 L(loop_prolog):
/* Here edx = number of range bytes below ecx + 64 still to examine. */
78 sub $64, %edx /* edx = bytes that remain below ecx */
79 jbe L(exit_loop) /* at most 64 were left: bounds-checked tail */
80 /* The whole window is inside the range.  Test its four 16-byte blocks
   from the highest address down so the first hit is the last match. */
81 movdqa 48(%ecx), %xmm0
82 pcmpeqb %xmm1, %xmm0
83 pmovmskb %xmm0, %eax
84 test %eax, %eax
85 jnz L(matches48)
86
87 movdqa 32(%ecx), %xmm2
88 pcmpeqb %xmm1, %xmm2
89 pmovmskb %xmm2, %eax
90 test %eax, %eax
91 jnz L(matches32)
92
93 movdqa 16(%ecx), %xmm3
94 pcmpeqb %xmm1, %xmm3
95 pmovmskb %xmm3, %eax
96 test %eax, %eax
97 jnz L(matches16)
98
99 movdqa (%ecx), %xmm4
100 pcmpeqb %xmm1, %xmm4
101 pmovmskb %xmm4, %eax
102 test %eax, %eax
103 jnz L(matches0)
104 /* Second 64-byte window, handled the same way as the first. */
105 sub $64, %ecx
106 sub $64, %edx /* edx = bytes that remain below the new ecx */
107 jbe L(exit_loop)
108
109 movdqa 48(%ecx), %xmm0
110 pcmpeqb %xmm1, %xmm0
111 pmovmskb %xmm0, %eax
112 test %eax, %eax
113 jnz L(matches48)
114
115 movdqa 32(%ecx), %xmm2
116 pcmpeqb %xmm1, %xmm2
117 pmovmskb %xmm2, %eax
118 test %eax, %eax
119 jnz L(matches32)
120
121 movdqa 16(%ecx), %xmm3
122 pcmpeqb %xmm1, %xmm3
123 pmovmskb %xmm3, %eax
124 test %eax, %eax
125 jnz L(matches16)
126
127 movdqa (%ecx), %xmm3
128 pcmpeqb %xmm1, %xmm3
129 pmovmskb %xmm3, %eax
130 test %eax, %eax
131 jnz L(matches0)
132 /* Before entering the main loop, make ecx a multiple of 64. */
133 mov %ecx, %eax
134 and $63, %eax /* eax = misalignment of ecx within 64 bytes */
135 test %eax, %eax /* NOTE(review): the and above already set ZF from this value */
136 jz L(align64_loop)
137 /* Bias ecx and edx by 64 - eax.  After the "sub $64" pair at the top of
   the loop, ecx is rounded down to a 64-byte boundary and edx is reduced
   by the same eax bytes.  The part of the next window at or above the old
   ecx has already been compared. */
138 add $64, %ecx
139 add $64, %edx
140 sub %eax, %ecx
141 sub %eax, %edx
142
143 .p2align 4
144 L(align64_loop):
145 sub $64, %ecx /* move down to the next 64-byte window */
146 sub $64, %edx /* edx = bytes that remain below ecx */
147 jbe L(exit_loop) /* window not entirely inside the range: tail code */
148
149 movdqa (%ecx), %xmm0 /* load all four 16-byte blocks of the window */
150 movdqa 16(%ecx), %xmm2
151 movdqa 32(%ecx), %xmm3
152 movdqa 48(%ecx), %xmm4
153
154 pcmpeqb %xmm1, %xmm0 /* per-block compare results, 0x00 / 0xff lanes */
155 pcmpeqb %xmm1, %xmm2
156 pcmpeqb %xmm1, %xmm3
157 pcmpeqb %xmm1, %xmm4
158 /* Lanes are 0x00 or 0xff, so unsigned max merges the results like OR. */
159 pmaxub %xmm3, %xmm0 /* xmm0 = block 0 | block 32 */
160 pmaxub %xmm4, %xmm2 /* xmm2 = block 16 | block 48 */
161 pmaxub %xmm0, %xmm2 /* xmm2 = all four blocks merged */
162 pmovmskb %xmm2, %eax
163
164 test %eax, %eax
165 jz L(align64_loop) /* no match anywhere in these 64 bytes */
166 /* A match exists; find the highest block holding one.  xmm4 and xmm3
   still hold the unmerged results for the blocks at 48 and 32. */
167 pmovmskb %xmm4, %eax
168 test %eax, %eax
169 jnz L(matches48)
170
171 pmovmskb %xmm3, %eax
172 test %eax, %eax
173 jnz L(matches32)
174 /* xmm0 and xmm2 were overwritten by pmaxub: redo the compares at 16, 0. */
175 movdqa 16(%ecx), %xmm2
176
177 pcmpeqb %xmm1, %xmm2
178 pcmpeqb (%ecx), %xmm1 /* overwrites the replicated byte; all paths from here return */
179
180 pmovmskb %xmm2, %eax
181 test %eax, %eax
182 jnz L(matches16)
183 /* Only the block at offset 0 is left, so its mask is non-zero here. */
184 pmovmskb %xmm1, %eax
185 bsr %eax, %eax /* index of the highest matching byte */
186
187 add %ecx, %eax /* eax = address of that byte */
188 ret
189
190 .p2align 4
191 L(exit_loop):
/* Tail: only the top edx bytes of the window [ecx, ecx + 64) belong to the
   range, i.e. the addresses from ecx + 64 - edx upwards. */
192 add $64, %edx /* undo the last "sub $64": edx = that byte count */
193 cmp $32, %edx
194 jbe L(exit_loop_32) /* range does not reach below ecx + 32 */
195 /* edx > 32: the blocks at 48 and 32 lie entirely inside the range. */
196 movdqa 48(%ecx), %xmm0
197 pcmpeqb %xmm1, %xmm0
198 pmovmskb %xmm0, %eax
199 test %eax, %eax
200 jnz L(matches48)
201
202 movdqa 32(%ecx), %xmm2
203 pcmpeqb %xmm1, %xmm2
204 pmovmskb %xmm2, %eax
205 test %eax, %eax
206 jnz L(matches32)
207
208 movdqa 16(%ecx), %xmm3
209 pcmpeqb %xmm1, %xmm3
210 pmovmskb %xmm3, %eax
211 test %eax, %eax
212 jnz L(matches16_1) /* block may straddle the range start: checked there */
213 cmp $48, %edx
214 jbe L(return_null) /* range does not reach below ecx + 16 */
215
216 pcmpeqb (%ecx), %xmm1 /* overwrites xmm1; every path from here returns */
217 pmovmskb %xmm1, %eax
218 test %eax, %eax
219 jnz L(matches0_1) /* bounds-checked against the range start */
220 xor %eax, %eax /* no match: return 0 */
221 ret
222
223 .p2align 4
224 L(exit_loop_32):
/* Tail with edx <= 32: only the blocks at 48 and 32 can hold range bytes. */
225 movdqa 48(%ecx), %xmm0
226 pcmpeqb %xmm1, %xmm0
227 pmovmskb %xmm0, %eax
228 test %eax, %eax
229 jnz L(matches48_1) /* bounds-checked against the range start */
230 cmp $16, %edx
231 jbe L(return_null) /* range does not reach below ecx + 48 */
232
233 pcmpeqb 32(%ecx), %xmm1 /* overwrites xmm1; every path from here returns */
234 pmovmskb %xmm1, %eax
235 test %eax, %eax
236 jnz L(matches32_1) /* bounds-checked against the range start */
237 xor %eax, %eax /* no match: return 0 */
238 ret
239
240 .p2align 4
241 L(matches0):
/* matches0/16/32/48: eax holds a non-zero match mask for the block at
   ecx + 0/16/32/48, which is known to lie inside the range.  Return the
   address of its highest matching byte. */
242 bsr %eax, %eax /* index of the highest set bit */
243 add %ecx, %eax
244 ret
245
246 .p2align 4
247 L(matches16):
248 bsr %eax, %eax
249 lea 16(%eax, %ecx), %eax /* eax = ecx + 16 + index */
250 ret
251
252 .p2align 4
253 L(matches32):
254 bsr %eax, %eax
255 lea 32(%eax, %ecx), %eax /* eax = ecx + 32 + index */
256 ret
257
258 .p2align 4
259 L(matches48):
260 bsr %eax, %eax
261 lea 48(%eax, %ecx), %eax /* eax = ecx + 48 + index */
262 ret
263
264 .p2align 4
265 L(matches0_1):
/* matches0_1/16_1/32_1/48_1: like the labels above, but the block may
   start below the range.  The range starts at ecx + 64 - edx, so the byte
   at block offset N + index is inside it iff edx - (64 - N) + index >= 0. */
266 bsr %eax, %eax
267 sub $64, %edx
268 add %eax, %edx /* edx = edx - 64 + index */
269 jl L(return_null) /* highest match lies below the range start */
270 add %ecx, %eax
271 ret
272
273 .p2align 4
274 L(matches16_1):
275 bsr %eax, %eax
276 sub $48, %edx
277 add %eax, %edx /* edx = edx - 48 + index */
278 jl L(return_null)
279 lea 16(%ecx, %eax), %eax
280 ret
281
282 .p2align 4
283 L(matches32_1):
284 bsr %eax, %eax
285 sub $32, %edx
286 add %eax, %edx /* edx = edx - 32 + index */
287 jl L(return_null)
288 lea 32(%ecx, %eax), %eax
289 ret
290
291 .p2align 4
292 L(matches48_1):
293 bsr %eax, %eax
294 sub $16, %edx
295 add %eax, %edx /* edx = edx - 16 + index */
296 jl L(return_null)
297 lea 48(%ecx, %eax), %eax
298 ret
299
300 .p2align 4
301 L(return_null):
/* No match, on paths where %edi has not been pushed: return 0. */
302 xor %eax, %eax
303 ret
304
305 .p2align 4
306 L(length_less16_offset0):
/* Short path with a 16-byte aligned start.  On entry: eax = start address,
   edx = byte count (non-zero, at most 16), xmm1 = replicated search byte. */
307 mov %dl, %cl /* cl = count, used as the shift amount below */
308 pcmpeqb (%eax), %xmm1 /* compare the aligned 16-byte block at start */
309
310 mov $1, %edx
311 sal %cl, %edx
312 sub $1, %edx /* edx = (1 << count) - 1: bits of the in-range bytes */
313 mov %edx, %ecx
314
315 pmovmskb %xmm1, %edx
316
317 and %ecx, %edx /* drop matches beyond the end of the range */
318 test %edx, %edx
319 jz L(return_null)
320
321 bsr %edx, %ecx /* index of the highest in-range match */
322 add %ecx, %eax
323 ret
324
325 .p2align 4
326 L(length_less16):
/* Short path, count <= 16.  On entry: ecx = start address, edx = count - 16,
   xmm1 low dword = value argument (not yet replicated). */
327 punpcklbw %xmm1, %xmm1 /* replicate the search byte: byte -> word */
328 mov %ecx, %eax /* eax = start address */
329 punpcklbw %xmm1, %xmm1 /* word -> dword */
330 add $16, %edx /* edx = count again */
331 jz L(return_null) /* count == 0: nothing to search */
332
333 pshufd $0, %xmm1, %xmm1 /* 16 copies of the search byte */
334 and $15, %ecx /* ecx = offset of start within its 16-byte block */
335 jz L(length_less16_offset0) /* start is 16-byte aligned */
336
337 PUSH (%edi) /* edi is used as a scratch mask register below */
338 mov %cl, %dh
339 add %dl, %dh /* dh = offset + count */
340 and $-16, %eax /* eax = aligned block containing start */
341
342 sub $16, %dh /* dh = offset + count - 16 */
343 ja L(length_less16_part2) /* the range spills into the next block */
344 /* The whole range lies inside the single aligned block at eax. */
345 pcmpeqb (%eax), %xmm1
346 pmovmskb %xmm1, %edi
347
348 sar %cl, %edi /* drop the bytes before start: bit 0 = byte at start */
349 add %ecx, %eax /* eax = start address again */
350 mov %dl, %cl /* cl = count */
351
352 mov $1, %edx
353 sal %cl, %edx
354 sub $1, %edx /* edx = (1 << count) - 1 */
355
356 and %edx, %edi /* drop matches beyond the end of the range */
357 test %edi, %edi
358 jz L(ret_null)
359
360 bsr %edi, %edi /* index of the highest in-range match */
361 add %edi, %eax
362 POP (%edi)
363 ret
364
365 CFI_PUSH (%edi) /* the code below is reached by jumps taken while edi is still pushed */
366
367 .p2align 4
368 L(length_less16_part2):
/* The range straddles two aligned blocks.  On entry: eax = lower aligned
   block, cl = offset of start within it, dh = number of range bytes in the
   upper block at eax + 16.  edi is pushed. */
369 movdqa 16(%eax), %xmm2
370 pcmpeqb %xmm1, %xmm2
371 pmovmskb %xmm2, %edi /* match mask of the upper block */
372
373 mov %cl, %ch /* keep the offset in ch while cl serves as shift count */
374
375 mov %dh, %cl /* cl = range bytes in the upper block */
376 mov $1, %edx
377 sal %cl, %edx
378 sub $1, %edx /* edx = mask of the in-range bytes of the upper block */
379
380 and %edx, %edi
381
382 test %edi, %edi
383 jnz L(length_less16_part2_return) /* a match in the upper block wins */
384
385 pcmpeqb (%eax), %xmm1 /* lower block */
386 pmovmskb %xmm1, %edi
387
388 mov %ch, %cl /* cl = offset again */
389 sar %cl, %edi /* drop the bytes before start */
390 test %edi, %edi
391 jz L(ret_null)
392
393 bsr %edi, %edi /* index relative to start */
394 add %edi, %eax
395 xor %ch, %ch /* clear the saved copy so that ecx = offset */
396 add %ecx, %eax /* eax = aligned block + offset + index */
397 POP (%edi)
398 ret
399
400 CFI_PUSH (%edi) /* the code below is reached by jumps taken while edi is still pushed */
401
402 .p2align 4
403 L(length_less16_part2_return):
/* Match in the upper block: edi = masked match bits, eax = lower block. */
404 bsr %edi, %edi
405 lea 16(%eax, %edi), %eax /* eax = lower block + 16 + index */
406 POP (%edi)
407 ret
408
409 CFI_PUSH (%edi) /* the code below is reached by jumps taken while edi is still pushed */
410
411 .p2align 4
412 L(ret_null):
/* No match, on paths where %edi has been pushed: return 0 and restore it. */
413 xor %eax, %eax
414 POP (%edi)
415 ret
416
417 END (MEMCHR)
418 #endif