]>
Commit | Line | Data |
---|---|---|
093ecf92 LD |
1 | /* Fast SSE2 memrchr using a 64-byte loop and the pmaxub instruction. |
2 | ||
2b778ceb | 3 | Copyright (C) 2011-2021 Free Software Foundation, Inc. |
093ecf92 LD |
4 | This file is part of the GNU C Library. |
5 | ||
6 | The GNU C Library is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Lesser General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2.1 of the License, or (at your option) any later version. | |
10 | ||
11 | The GNU C Library is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | Lesser General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 17 | License along with the GNU C Library; if not, see |
5a82c748 | 18 | <https://www.gnu.org/licenses/>. */ |
093ecf92 LD |
19 | |
20 | #include <sysdep.h> | |
21 | ||
22 | .text | |
380292ba | 23 | ENTRY (__memrchr) |
7395928b | 24 | movd %esi, %xmm1 |
093ecf92 | 25 | |
ecd8b842 | 26 | sub $16, %RDX_LP |
093ecf92 LD |
27 | jbe L(length_less16) |
28 | ||
29 | punpcklbw %xmm1, %xmm1 | |
30 | punpcklbw %xmm1, %xmm1 | |
31 | ||
ecd8b842 | 32 | add %RDX_LP, %RDI_LP |
093ecf92 LD |
33 | pshufd $0, %xmm1, %xmm1 |
34 | ||
35 | movdqu (%rdi), %xmm0 | |
36 | pcmpeqb %xmm1, %xmm0 | |
37 | ||
38 | /* Check if there is a match. */ | |
39 | pmovmskb %xmm0, %eax | |
40 | test %eax, %eax | |
41 | jnz L(matches0) | |
42 | ||
43 | sub $64, %rdi | |
7395928b L |
44 | mov %edi, %ecx |
45 | and $15, %ecx | |
093ecf92 LD |
46 | jz L(loop_prolog) |
47 | ||
48 | add $16, %rdi | |
49 | add $16, %rdx | |
50 | and $-16, %rdi | |
51 | sub %rcx, %rdx | |
52 | ||
/* L(loop_prolog): check up to two 64-byte windows one 16-byte block
   at a time, highest address first, before entering the 64-byte
   aligned main loop.  On entry rdi is 16-byte aligned (movdqa is
   used) and xmm1 holds the broadcast search byte.  After each
   `sub $64, %rdx' a result of zero or below means the current window
   reaches the start of the buffer, which L(exit_loop) handles.  */
53 | .p2align 4 | |
54 | L(loop_prolog): | |
55 | sub $64, %rdx | |
56 | jbe L(exit_loop) | |
57 | ||
/* First window: blocks at offsets 48, 32, 16, 0 from rdi.  */
58 | movdqa 48(%rdi), %xmm0 | |
59 | pcmpeqb %xmm1, %xmm0 | |
60 | pmovmskb %xmm0, %eax | |
61 | test %eax, %eax | |
62 | jnz L(matches48) | |
63 | ||
64 | movdqa 32(%rdi), %xmm2 | |
65 | pcmpeqb %xmm1, %xmm2 | |
66 | pmovmskb %xmm2, %eax | |
67 | test %eax, %eax | |
68 | jnz L(matches32) | |
69 | ||
70 | movdqa 16(%rdi), %xmm3 | |
71 | pcmpeqb %xmm1, %xmm3 | |
72 | pmovmskb %xmm3, %eax | |
73 | test %eax, %eax | |
74 | jnz L(matches16) | |
75 | ||
76 | movdqa (%rdi), %xmm4 | |
77 | pcmpeqb %xmm1, %xmm4 | |
78 | pmovmskb %xmm4, %eax | |
79 | test %eax, %eax | |
80 | jnz L(matches0) | |
81 | ||
/* Second window: move down 64 bytes and repeat the same four
   checks.  */
82 | sub $64, %rdi | |
83 | sub $64, %rdx | |
84 | jbe L(exit_loop) | |
85 | ||
86 | movdqa 48(%rdi), %xmm0 | |
87 | pcmpeqb %xmm1, %xmm0 | |
88 | pmovmskb %xmm0, %eax | |
89 | test %eax, %eax | |
90 | jnz L(matches48) | |
91 | ||
92 | movdqa 32(%rdi), %xmm2 | |
93 | pcmpeqb %xmm1, %xmm2 | |
94 | pmovmskb %xmm2, %eax | |
95 | test %eax, %eax | |
96 | jnz L(matches32) | |
97 | ||
98 | movdqa 16(%rdi), %xmm3 | |
99 | pcmpeqb %xmm1, %xmm3 | |
100 | pmovmskb %xmm3, %eax | |
101 | test %eax, %eax | |
102 | jnz L(matches16) | |
103 | ||
104 | movdqa (%rdi), %xmm3 | |
105 | pcmpeqb %xmm1, %xmm3 | |
106 | pmovmskb %xmm3, %eax | |
107 | test %eax, %eax | |
108 | jnz L(matches0) | |
109 | ||
/* Align for the main loop.  If rdi is already a multiple of 64 fall
   straight into it.  */
7395928b L |
110 | mov %edi, %ecx |
111 | and $63, %ecx | |
66fb11b1 | 112 | jz L(align64_loop) |
093ecf92 LD |
113 | |
/* Otherwise round rdi up to the next 64-byte boundary; the loop
   subtracts 64 before its first load, so its first window is the
   64-byte aligned one containing the old rdi.  That window overlaps
   64 - rcx bytes that were already checked, so rdx is increased by
   the same amount.  */
114 | add $64, %rdi | |
115 | add $64, %rdx | |
116 | and $-64, %rdi | |
117 | sub %rcx, %rdx | |
118 | ||
/* L(align64_loop): main loop.  Each iteration steps rdi down by 64
   and tests a whole 64-byte window with a single branch.  rdx is
   decremented in step; when it reaches zero or below the window
   touches the start of the buffer and L(exit_loop) finishes the
   search.  */
119 | .p2align 4 | |
120 | L(align64_loop): | |
121 | sub $64, %rdi | |
122 | sub $64, %rdx | |
123 | jbe L(exit_loop) | |
124 | ||
125 | movdqa (%rdi), %xmm0 | |
126 | movdqa 16(%rdi), %xmm2 | |
127 | movdqa 32(%rdi), %xmm3 | |
128 | movdqa 48(%rdi), %xmm4 | |
129 | ||
/* Each compare leaves 0xff in matching byte lanes and 0 elsewhere.  */
130 | pcmpeqb %xmm1, %xmm0 | |
131 | pcmpeqb %xmm1, %xmm2 | |
132 | pcmpeqb %xmm1, %xmm3 | |
133 | pcmpeqb %xmm1, %xmm4 | |
134 | ||
/* Unsigned byte max combines the four results into xmm2, so one mask
   tells whether any block matched.  This overwrites xmm0 and xmm2;
   xmm3 and xmm4 still hold the results for offsets 32 and 48.  */
135 | pmaxub %xmm3, %xmm0 | |
136 | pmaxub %xmm4, %xmm2 | |
137 | pmaxub %xmm0, %xmm2 | |
138 | pmovmskb %xmm2, %eax | |
139 | ||
140 | test %eax, %eax | |
141 | jz L(align64_loop) | |
142 | ||
/* A match exists somewhere in the window; find the highest block
   that contains one.  */
143 | pmovmskb %xmm4, %eax | |
144 | test %eax, %eax | |
145 | jnz L(matches48) | |
146 | ||
147 | pmovmskb %xmm3, %eax | |
148 | test %eax, %eax | |
149 | jnz L(matches32) | |
150 | ||
/* The results for offsets 16 and 0 were destroyed by pmaxub, so redo
   those compares.  The second compare overwrites the broadcast byte
   in xmm1, which is not needed again because every path from here
   returns.  */
151 | movdqa 16(%rdi), %xmm2 | |
152 | ||
153 | pcmpeqb %xmm1, %xmm2 | |
154 | pcmpeqb (%rdi), %xmm1 | |
155 | ||
156 | pmovmskb %xmm2, %eax | |
157 | test %eax, %eax | |
158 | jnz L(matches16) | |
159 | ||
/* The combined mask was nonzero and the three upper blocks had no
   match, so the match is in the block at offset 0.  bsr picks the
   highest set bit, i.e. the last matching byte.  */
160 | pmovmskb %xmm1, %eax | |
161 | bsr %eax, %eax | |
162 | ||
163 | add %rdi, %rax | |
164 | ret | |
165 | ||
/* L(exit_loop): final 64-byte window, which contains the start of the
   buffer.  Every jump here follows `sub $64, %rdx', so adding 64 back
   gives in edx the number of bytes still to examine; they are the
   highest edx bytes of the window at rdi.  Blocks that may extend
   below the buffer start are resolved by the L(matchesN_1) handlers,
   which range-check the hit.  */
166 | .p2align 4 | |
167 | L(exit_loop): | |
7395928b L |
168 | add $64, %edx |
169 | cmp $32, %edx | |
093ecf92 LD |
170 | jbe L(exit_loop_32) |
171 | ||
/* More than 32 bytes remain, so the blocks at offsets 48 and 32 lie
   entirely inside the buffer.  */
172 | movdqa 48(%rdi), %xmm0 | |
173 | pcmpeqb %xmm1, %xmm0 | |
174 | pmovmskb %xmm0, %eax | |
175 | test %eax, %eax | |
176 | jnz L(matches48) | |
177 | ||
178 | movdqa 32(%rdi), %xmm2 | |
179 | pcmpeqb %xmm1, %xmm2 | |
180 | pmovmskb %xmm2, %eax | |
181 | test %eax, %eax | |
182 | jnz L(matches32) | |
183 | ||
/* The block at offset 16 may be only partly inside the buffer.  */
184 | movdqa 16(%rdi), %xmm3 | |
185 | pcmpeqb %xmm1, %xmm3 | |
186 | pmovmskb %xmm3, %eax | |
187 | test %eax, %eax | |
188 | jnz L(matches16_1) | |
/* With 48 bytes or fewer remaining, the block at offset 0 is wholly
   outside the buffer.  */
7395928b | 189 | cmp $48, %edx |
093ecf92 LD |
190 | jbe L(return_null) |
191 | ||
/* Last block; xmm1 is overwritten because it is not needed again.  */
192 | pcmpeqb (%rdi), %xmm1 | |
193 | pmovmskb %xmm1, %eax | |
194 | test %eax, %eax | |
195 | jnz L(matches0_1) | |
196 | xor %eax, %eax | |
197 | ret | |
198 | ||
/* L(exit_loop_32): as L(exit_loop), for the case where at most 32
   bytes remain (edx <= 32).  Only the blocks at offsets 48 and 32 of
   the window at rdi can contain buffer bytes, and either may be only
   partly inside the buffer, so hits go to the range-checking
   handlers.  */
199 | .p2align 4 | |
200 | L(exit_loop_32): | |
201 | movdqa 48(%rdi), %xmm0 | |
202 | pcmpeqb %xmm1, %xmm0 | |
203 | pmovmskb %xmm0, %eax | |
204 | test %eax, %eax | |
205 | jnz L(matches48_1) | |
/* With 16 bytes or fewer remaining, the block at offset 32 is wholly
   outside the buffer.  */
7395928b | 206 | cmp $16, %edx |
093ecf92 LD |
207 | jbe L(return_null) |
208 | ||
209 | pcmpeqb 32(%rdi), %xmm1 | |
210 | pmovmskb %xmm1, %eax | |
211 | test %eax, %eax | |
212 | jnz L(matches32_1) | |
213 | xor %eax, %eax | |
214 | ret | |
215 | ||
/* L(matchesN): return paths for a hit in the 16-byte block at offset
   N from rdi when the whole block is known to be inside the buffer.
   On entry eax is the nonzero pmovmskb mask for that block.  bsr
   yields the index of the highest set bit, i.e. the last matching
   byte, and the result is rdi + N + index.  */
216 | .p2align 4 | |
217 | L(matches0): | |
218 | bsr %eax, %eax | |
219 | add %rdi, %rax | |
220 | ret | |
221 | ||
222 | .p2align 4 | |
223 | L(matches16): | |
224 | bsr %eax, %eax | |
225 | lea 16(%rax, %rdi), %rax | |
226 | ret | |
227 | ||
228 | .p2align 4 | |
229 | L(matches32): | |
230 | bsr %eax, %eax | |
231 | lea 32(%rax, %rdi), %rax | |
232 | ret | |
233 | ||
234 | .p2align 4 | |
235 | L(matches48): | |
236 | bsr %eax, %eax | |
237 | lea 48(%rax, %rdi), %rax | |
238 | ret | |
239 | ||
/* L(matchesN_1): range-checked return paths used from L(exit_loop)
   and L(exit_loop_32), where the block at offset N may start below
   the beginning of the buffer.  On entry eax is the nonzero mask for
   the block and rdx is the number of buffer bytes left at the top of
   the 64-byte window at rdi.  The hit at rdi + N + index is inside
   the buffer only if rdx - (64 - N) + index >= 0; otherwise the
   signed `jl' returns NULL.  */
240 | .p2align 4 | |
241 | L(matches0_1): | |
242 | bsr %eax, %eax | |
243 | sub $64, %rdx | |
244 | add %rax, %rdx | |
245 | jl L(return_null) | |
246 | add %rdi, %rax | |
247 | ret | |
248 | ||
249 | .p2align 4 | |
250 | L(matches16_1): | |
251 | bsr %eax, %eax | |
252 | sub $48, %rdx | |
253 | add %rax, %rdx | |
254 | jl L(return_null) | |
255 | lea 16(%rdi, %rax), %rax | |
256 | ret | |
257 | ||
258 | .p2align 4 | |
259 | L(matches32_1): | |
260 | bsr %eax, %eax | |
261 | sub $32, %rdx | |
262 | add %rax, %rdx | |
263 | jl L(return_null) | |
264 | lea 32(%rdi, %rax), %rax | |
265 | ret | |
266 | ||
267 | .p2align 4 | |
268 | L(matches48_1): | |
269 | bsr %eax, %eax | |
270 | sub $16, %rdx | |
271 | add %rax, %rdx | |
272 | jl L(return_null) | |
273 | lea 48(%rdi, %rax), %rax | |
274 | ret | |
275 | ||
/* L(return_null): shared "not found" exit.  Writing eax with xor
   clears all of rax, so the routine returns 0.  */
276 | .p2align 4 | |
277 | L(return_null): | |
7395928b | 278 | xor %eax, %eax |
093ecf92 LD |
279 | ret |
280 | ||
/* L(length_less16_offset0): short buffer (edx = length, 0..16) whose
   start is 16-byte aligned.  Reached only from L(length_less16), with
   the search byte already broadcast in xmm1.  One aligned 16-byte
   block at rdi covers the whole buffer; mask bits at or above the
   length belong to bytes past the end and are discarded.  */
281 | .p2align 4 | |
282 | L(length_less16_offset0): | |
81dcc7fb AS |
283 | test %edx, %edx |
284 | jz L(return_null) | |
285 | ||
/* Keep the length in cl for the shift below; edx is reused for the
   bit mask.  */
093ecf92 LD |
286 | mov %dl, %cl |
287 | pcmpeqb (%rdi), %xmm1 | |
288 | ||
/* edx = (1 << length) - 1: one bit per byte inside the buffer.  */
289 | mov $1, %edx | |
290 | sal %cl, %edx | |
291 | sub $1, %edx | |
292 | ||
293 | pmovmskb %xmm1, %eax | |
294 | ||
295 | and %edx, %eax | |
296 | test %eax, %eax | |
297 | jz L(return_null) | |
298 | ||
/* Highest remaining bit is the last match.  */
299 | bsr %eax, %eax | |
300 | add %rdi, %rax | |
301 | ret | |
302 | ||
/* L(length_less16): buffers of 16 bytes or fewer.  The entry code
   jumps here straight after `sub $16' on the length, before the
   search byte has been broadcast, so both steps are completed here.
   Only aligned 16-byte blocks are read; bytes outside the buffer are
   removed from the compare mask by shifting and masking.  */
303 | .p2align 4 | |
304 | L(length_less16): | |
305 | punpcklbw %xmm1, %xmm1 | |
306 | punpcklbw %xmm1, %xmm1 | |
307 | ||
/* Undo the earlier `sub $16': edx = length (0..16).  */
7395928b | 308 | add $16, %edx |
093ecf92 LD |
309 | |
310 | pshufd $0, %xmm1, %xmm1 | |
311 | ||
/* ecx = offset of the buffer start within its aligned 16-byte
   block.  */
7395928b L |
312 | mov %edi, %ecx |
313 | and $15, %ecx | |
093ecf92 LD |
314 | jz L(length_less16_offset0) |
315 | ||
/* dh = offset + length - 16, computed in a byte register so dl keeps
   the length.  esi saves the offset for the final address
   computation, and rdi is rounded down to the aligned block.  */
093ecf92 | 316 | mov %cl, %dh |
7395928b | 317 | mov %ecx, %esi |
093ecf92 LD |
318 | add %dl, %dh |
319 | and $-16, %rdi | |
320 | ||
/* Above zero (unsigned, no borrow) means the buffer runs into the
   next aligned block; dh is then the number of buffer bytes in that
   block.  */
321 | sub $16, %dh | |
322 | ja L(length_less16_part2) | |
323 | ||
/* The buffer lies within this single aligned block.  */
324 | pcmpeqb (%rdi), %xmm1 | |
325 | pmovmskb %xmm1, %eax | |
326 | ||
/* Shift out the bytes before the buffer start (cl = offset), then
   load the length into cl for the mask.  The mask only occupies the
   low 16 bits of eax, so the arithmetic shift brings in zeros.  */
327 | sar %cl, %eax | |
328 | mov %dl, %cl | |
329 | ||
/* edx = (1 << length) - 1; drops bytes past the end of the buffer.  */
330 | mov $1, %edx | |
331 | sal %cl, %edx | |
332 | sub $1, %edx | |
333 | ||
334 | and %edx, %eax | |
335 | test %eax, %eax | |
336 | jz L(return_null) | |
337 | ||
/* The bit index is relative to the buffer start, which is the
   aligned base in rdi plus the offset in rsi.  */
338 | bsr %eax, %eax | |
339 | add %rdi, %rax | |
7395928b | 340 | add %rsi, %rax |
093ecf92 LD |
341 | ret |
342 | ||
/* L(length_less16_part2): short buffer that spans two aligned 16-byte
   blocks.  On entry rdi is the aligned base of the lower block, esi
   the offset of the buffer start within it, dh the number of buffer
   bytes in the upper block, and xmm1 the broadcast search byte.  The
   upper block is checked first because the last match is wanted.  */
343 | .p2align 4 | |
344 | L(length_less16_part2): | |
345 | movdqa 16(%rdi), %xmm2 | |
346 | pcmpeqb %xmm1, %xmm2 | |
347 | pmovmskb %xmm2, %eax | |
348 | ||
/* Keep only the low dh bits: the upper-block bytes that belong to
   the buffer.  dh is read before edx is overwritten.  */
349 | mov %dh, %cl | |
350 | mov $1, %edx | |
351 | sal %cl, %edx | |
352 | sub $1, %edx | |
353 | ||
354 | and %edx, %eax | |
355 | ||
356 | test %eax, %eax | |
357 | jnz L(length_less16_part2_return) | |
358 | ||
/* No hit in the upper block; check the lower one.  Every byte from
   the offset to the end of this block is inside the buffer, so only
   the bytes before the start have to be shifted out.  */
359 | pcmpeqb (%rdi), %xmm1 | |
360 | pmovmskb %xmm1, %eax | |
361 | ||
7395928b | 362 | mov %esi, %ecx |
093ecf92 LD |
363 | sar %cl, %eax |
364 | test %eax, %eax | |
365 | jz L(return_null) | |
366 | ||
/* Result = aligned base + offset + index of the highest set bit.  */
367 | bsr %eax, %eax | |
368 | add %rdi, %rax | |
7395928b | 369 | add %rsi, %rax |
093ecf92 LD |
370 | ret |
371 | ||
/* Hit in the upper block: result = rdi + 16 + index of the highest
   set bit.  */
372 | .p2align 4 | |
373 | L(length_less16_part2_return): | |
374 | bsr %eax, %eax | |
375 | lea 16(%rax, %rdi), %rax | |
376 | ret | |
377 | ||
/* Close the routine opened with ENTRY (__memrchr).
   NOTE(review): END and weak_alias are macros defined outside this
   file; presumably weak_alias exports memrchr as a weak symbol that
   refers to __memrchr.  Confirm against the headers.  */
380292ba JM |
378 | END (__memrchr) |
379 | weak_alias (__memrchr, memrchr) |