]>
Commit | Line | Data |
---|---|---|
d614a753 | 1 | /* Copyright (C) 2011-2020 Free Software Foundation, Inc. |
093ecf92 | 2 | Contributed by Intel Corporation. |
322e23db UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 | 16 | License along with the GNU C Library; if not, see |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
322e23db UD |
18 | |
19 | #include <sysdep.h> | |
20 | ||
2f5d20ac L |
/* Select the symbol name and the element compare for this build.  With
   USE_AS_WMEMCHR defined the routine is emitted as wmemchr and compares
   4-byte elements (pcmpeqd), otherwise it is emitted as memchr and
   compares single bytes (pcmpeqb).  */
21 | #ifdef USE_AS_WMEMCHR |
22 | # define MEMCHR wmemchr | |
23 | # define PCMPEQ pcmpeqd | |
24 | #else | |
25 | # define MEMCHR memchr | |
26 | # define PCMPEQ pcmpeqb | |
27 | #endif | |
28 | ||
093ecf92 | 29 | /* Fast SSE2 version using pmaxub and a 64-byte loop. */ |
322e23db UD |
30 | |
31 | .text | |
/* MEMCHR: scan the buffer at rdi for the value in esi, examining at
   most rdx elements (bytes, or 4-byte elements in the wide build).
   The result in rax is the address of the first match, or 0 when no
   match lies within the length.  Throughout the routine xmm1 holds the
   search value replicated across all lanes and rdx tracks the number
   of bytes still to be examined.  */
2f5d20ac | 32 | ENTRY(MEMCHR) |
4f26ef1b L |
/* Put the search value in the low lane of xmm1 and keep the low 32
   bits of the pointer in ecx for the alignment tests below.  */
33 | movd %esi, %xmm1 |
34 | mov %edi, %ecx | |
093ecf92 | 35 | |
2f5d20ac | 36 | #ifdef USE_AS_WMEMCHR |
/* Wide build: a zero count returns 0 at once, otherwise the element
   count is multiplied by 4 to give a byte count.
   NOTE(review): the shift drops the top two bits of the count, so a
   count of 2^62 or more would wrap -- confirm callers cannot pass one.
   RDX_LP is defined outside this file (presumably the pointer-width
   name of rdx -- verify in sysdep.h).  */
97700a34 | 37 | test %RDX_LP, %RDX_LP |
2f5d20ac | 38 | jz L(return_null) |
97700a34 | 39 | shl $2, %RDX_LP |
2f5d20ac | 40 | #else |
97700a34 L |
41 | # ifdef __ILP32__ |
42 | /* Clear the upper 32 bits. */ | |
43 | movl %edx, %edx | |
44 | # endif | |
/* Byte build: the two punpcklbw steps copy the low byte of xmm1 into
   all four bytes of its low dword (the pshufd further down fills the
   remaining lanes).  A zero length returns 0.  */
322e23db | 45 | punpcklbw %xmm1, %xmm1 |
97700a34 | 46 | test %RDX_LP, %RDX_LP |
093ecf92 | 47 | jz L(return_null) |
322e23db | 48 | punpcklbw %xmm1, %xmm1 |
2f5d20ac | 49 | #endif |
093ecf92 | 50 | |
/* ecx = offset of the pointer inside its 64-byte block.  pshufd with
   selector 0 broadcasts the low dword of xmm1 to all four dwords.  */
4f26ef1b | 51 | and $63, %ecx |
322e23db | 52 | pshufd $0, %xmm1, %xmm1 |
093ecf92 | 53 | |
/* With an offset above 48 a 16-byte load from rdi would run past the
   end of the 64-byte block, so use the aligned-load path instead.  */
4f26ef1b | 54 | cmp $48, %ecx |
093ecf92 LD |
55 | ja L(crosscache) |
56 | ||
/* Unaligned compare of the first 16 bytes.  pmovmskb gathers the top
   bit of every result byte into eax, so a nonzero eax means a hit.
   The hit is handled by matches_1, which rejects a match located at
   or beyond the length.  */
57 | movdqu (%rdi), %xmm0 | |
2f5d20ac | 58 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
59 | pmovmskb %xmm0, %eax |
60 | test %eax, %eax | |
61 | ||
62 | jnz L(matches_1) | |
/* No hit in the first 16 bytes.  A length of 16 or less is now fully
   examined.  */
63 | sub $16, %rdx | |
64 | jbe L(return_null) | |
/* Round rdi + 16 down to a 16-byte boundary.  The low four bits of the
   original pointer (kept in ecx) are the number of already examined
   bytes that the next aligned load reads again, so they are added back
   to the remaining count.  */
65 | add $16, %rdi | |
4f26ef1b | 66 | and $15, %ecx |
093ecf92 LD |
67 | and $-16, %rdi |
68 | add %rcx, %rdx | |
/* Bias rdx by -64.  If 64 bytes or fewer remain go to the tail code,
   otherwise start scanning 64 bytes at a time.  */
69 | sub $64, %rdx | |
70 | jbe L(exit_loop) | |
71 | jmp L(loop_prolog) | |
72 | ||
/* crosscache: entry path used when a 16-byte load from the original
   pointer would cross the end of its 64-byte block.  The load is done
   from the enclosing 16-byte aligned address and the mask bits that
   belong to bytes before the original pointer are shifted away.
   ecx becomes the offset of the pointer inside that aligned chunk.  */
73 | .p2align 4 | |
74 | L(crosscache): | |
4f26ef1b | 75 | and $15, %ecx |
093ecf92 LD |
76 | and $-16, %rdi |
77 | movdqa (%rdi), %xmm0 | |
78 | ||
2f5d20ac | 79 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
80 | /* Check if there is a match. */ |
81 | pmovmskb %xmm0, %eax | |
82 | /* Remove the leading bytes. */ | |
83 | sar %cl, %eax | |
84 | test %eax, %eax | |
85 | je L(unaligned_no_match) | |
86 | /* Check which byte is a match. */ | |
87 | bsf %eax, %eax | |
88 | ||
/* After the shift, rax is the match index counted from the original
   pointer.  An index at or beyond the length gives 0.  Otherwise the
   result is aligned base (rdi) + offset (rcx) + index (rax).  */
89 | sub %rax, %rdx | |
90 | jbe L(return_null) | |
91 | add %rdi, %rax | |
92 | add %rcx, %rax | |
93 | ret | |
94 | ||
/* unaligned_no_match: the aligned chunk containing the original
   pointer held no match.  rdi is the aligned chunk address and rcx the
   offset of the original pointer inside it.  */
95 | .p2align 4 | |
96 | L(unaligned_no_match): | |
402bf069 L |
97 | /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using |
98 | "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid | |
99 | possible addition overflow. */ | |
100 | neg %rcx | |
101 | add $16, %rcx | |
102 | sub %rcx, %rdx | |
093ecf92 LD |
103 | jbe L(return_null) |
/* Step to the next aligned chunk and bias rdx by -64.  With 64 bytes
   or fewer left go to the tail code, otherwise execution falls
   through into loop_prolog.  */
104 | add $16, %rdi | |
105 | sub $64, %rdx | |
106 | jbe L(exit_loop) | |
107 | ||
/* loop_prolog: rdi is 16-byte aligned and rdx holds the remaining byte
   count minus 64, which is positive here, so the next 64 bytes are all
   inside the length and matches need no length check.  The four
   16-byte chunks are compared one at a time.  */
108 | .p2align 4 | |
109 | L(loop_prolog): | |
110 | movdqa (%rdi), %xmm0 | |
2f5d20ac | 111 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
112 | pmovmskb %xmm0, %eax |
113 | test %eax, %eax | |
114 | jnz L(matches) | |
115 | ||
116 | movdqa 16(%rdi), %xmm2 | |
2f5d20ac | 117 | PCMPEQ %xmm1, %xmm2 |
093ecf92 LD |
118 | pmovmskb %xmm2, %eax |
119 | test %eax, %eax | |
120 | jnz L(matches16) | |
121 | ||
122 | movdqa 32(%rdi), %xmm3 | |
2f5d20ac | 123 | PCMPEQ %xmm1, %xmm3 |
093ecf92 LD |
124 | pmovmskb %xmm3, %eax |
125 | test %eax, %eax | |
126 | jnz L(matches32) | |
127 | ||
/* rdi is advanced past the 64 bytes before the fourth mask is tested,
   which is why matches0 addresses that chunk at rdi - 16.  */
128 | movdqa 48(%rdi), %xmm4 | |
2f5d20ac | 129 | PCMPEQ %xmm1, %xmm4 |
093ecf92 LD |
130 | add $64, %rdi |
131 | pmovmskb %xmm4, %eax | |
132 | test %eax, %eax | |
133 | jnz L(matches0) | |
134 | ||
/* If rdi is now 64-byte aligned the main loop can start right away.  */
135 | test $0x3f, %rdi | |
136 | jz L(align64_loop) | |
137 | ||
/* Otherwise check one more 64-byte group chunk by chunk, provided
   more than 64 bytes remain.  */
138 | sub $64, %rdx | |
139 | jbe L(exit_loop) | |
140 | ||
141 | movdqa (%rdi), %xmm0 | |
2f5d20ac | 142 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
143 | pmovmskb %xmm0, %eax |
144 | test %eax, %eax | |
145 | jnz L(matches) | |
146 | ||
147 | movdqa 16(%rdi), %xmm2 | |
2f5d20ac | 148 | PCMPEQ %xmm1, %xmm2 |
093ecf92 LD |
149 | pmovmskb %xmm2, %eax |
150 | test %eax, %eax | |
151 | jnz L(matches16) | |
152 | ||
153 | movdqa 32(%rdi), %xmm3 | |
2f5d20ac | 154 | PCMPEQ %xmm1, %xmm3 |
093ecf92 LD |
155 | pmovmskb %xmm3, %eax |
156 | test %eax, %eax | |
157 | jnz L(matches32) | |
158 | ||
159 | movdqa 48(%rdi), %xmm3 | |
2f5d20ac | 160 | PCMPEQ %xmm1, %xmm3 |
093ecf92 LD |
161 | pmovmskb %xmm3, %eax |
162 | ||
163 | add $64, %rdi | |
164 | test %eax, %eax | |
165 | jnz L(matches0) | |
166 | ||
/* Round rdi down to a 64-byte boundary.  rcx = rdi mod 64 is the
   number of already examined bytes the main loop will read again, so
   it is added back to the remaining count.  Execution then falls
   through into align64_loop.  */
167 | mov %rdi, %rcx | |
168 | and $-64, %rdi | |
4f26ef1b | 169 | and $63, %ecx |
093ecf92 LD |
170 | add %rcx, %rdx |
171 | ||
/* align64_loop: main loop, rdi is 64-byte aligned.  Each pass first
   subtracts 64 from rdx and leaves for the tail code when 64 bytes or
   fewer remain, so the 64 bytes examined in the loop body always lie
   inside the length.  */
172 | .p2align 4 | |
173 | L(align64_loop): | |
174 | sub $64, %rdx | |
175 | jbe L(exit_loop) | |
176 | movdqa (%rdi), %xmm0 | |
177 | movdqa 16(%rdi), %xmm2 | |
178 | movdqa 32(%rdi), %xmm3 | |
179 | movdqa 48(%rdi), %xmm4 | |
180 | ||
2f5d20ac L |
181 | PCMPEQ %xmm1, %xmm0 |
182 | PCMPEQ %xmm1, %xmm2 | |
183 | PCMPEQ %xmm1, %xmm3 | |
184 | PCMPEQ %xmm1, %xmm4 | |
093ecf92 LD |
185 | |
/* Merge the four compare results with unsigned byte max: the merged
   value in xmm4 has a byte set wherever any of the four chunks
   matched, so one pmovmskb tests all 64 bytes.  Note that xmm3 and
   xmm4 no longer hold their own compare results afterwards.  */
186 | pmaxub %xmm0, %xmm3 | |
187 | pmaxub %xmm2, %xmm4 | |
188 | pmaxub %xmm3, %xmm4 | |
189 | pmovmskb %xmm4, %eax | |
190 | ||
191 | add $64, %rdi | |
192 | ||
193 | test %eax, %eax | |
194 | jz L(align64_loop) | |
195 | ||
/* Some chunk matched.  Step rdi back to the start of the group and
   find the first matching chunk.  xmm0 and xmm2 still hold the
   results for chunks 0 and 16.  */
196 | sub $64, %rdi | |
197 | ||
198 | pmovmskb %xmm0, %eax | |
199 | test %eax, %eax | |
200 | jnz L(matches) | |
201 | ||
202 | pmovmskb %xmm2, %eax | |
203 | test %eax, %eax | |
204 | jnz L(matches16) | |
205 | ||
/* Chunk 32 is loaded and compared again because xmm3 was overwritten
   by the merge.  Chunk 48 is compared straight from memory into xmm1,
   which destroys the search pattern.  Every path from here returns,
   so the pattern is not needed again.  */
206 | movdqa 32(%rdi), %xmm3 | |
2f5d20ac | 207 | PCMPEQ %xmm1, %xmm3 |
093ecf92 | 208 | |
2f5d20ac | 209 | PCMPEQ 48(%rdi), %xmm1 |
093ecf92 LD |
210 | pmovmskb %xmm3, %eax |
211 | test %eax, %eax | |
212 | jnz L(matches32) | |
213 | ||
/* The first three chunks had no match, so the match is in chunk 48
   and its mask needs no zero test before bsf.  */
214 | pmovmskb %xmm1, %eax | |
215 | bsf %eax, %eax | |
216 | lea 48(%rdi, %rax), %rax | |
217 | ret | |
218 | ||
/* exit_loop: tail code, reached only after a sub of 64 from rdx gave
   zero or a borrow, so between 1 and 64 bytes remain at rdi and rdx
   holds that count minus 64.  Adding 32 to edx gives the count minus
   32.  Zero or negative means 32 bytes or fewer remain, handled by
   exit_loop_32.  Writing edx also clears the upper half of rdx, so
   the 64-bit length checks in the matchesNN_1 exits see the small
   positive value.  */
219 | .p2align 4 | |
220 | L(exit_loop): | |
4f26ef1b | 221 | add $32, %edx |
093ecf92 LD |
222 | jle L(exit_loop_32) |
223 | ||
/* More than 32 bytes remain, so the first two chunks are entirely
   inside the length and their matches need no length check.  */
224 | movdqa (%rdi), %xmm0 | |
2f5d20ac | 225 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
226 | pmovmskb %xmm0, %eax |
227 | test %eax, %eax | |
228 | jnz L(matches) | |
229 | ||
230 | movdqa 16(%rdi), %xmm2 | |
2f5d20ac | 231 | PCMPEQ %xmm1, %xmm2 |
093ecf92 LD |
232 | pmovmskb %xmm2, %eax |
233 | test %eax, %eax | |
234 | jnz L(matches16) | |
235 | ||
/* The third chunk may extend past the length.  matches32_1 compares
   the match index with rdx, the number of bytes left from offset 32.  */
236 | movdqa 32(%rdi), %xmm3 | |
2f5d20ac | 237 | PCMPEQ %xmm1, %xmm3 |
093ecf92 LD |
238 | pmovmskb %xmm3, %eax |
239 | test %eax, %eax | |
240 | jnz L(matches32_1) | |
/* 48 bytes or fewer remaining means the search is over.  */
4f26ef1b | 241 | sub $16, %edx |
093ecf92 LD |
242 | jle L(return_null) |
243 | ||
/* Fourth chunk, compared from memory into xmm1 (the pattern is not
   needed afterwards).  matches48_1 checks the index against rdx, the
   number of bytes left from offset 48.  No match returns 0.  */
2f5d20ac | 244 | PCMPEQ 48(%rdi), %xmm1 |
093ecf92 LD |
245 | pmovmskb %xmm1, %eax |
246 | test %eax, %eax | |
247 | jnz L(matches48_1) | |
4f26ef1b | 248 | xor %eax, %eax |
093ecf92 LD |
249 | ret |
250 | ||
/* exit_loop_32: tail code for 32 bytes or fewer.  On entry edx holds
   the remaining count minus 32.  The add restores the plain remaining
   count, which matches_1 uses to reject a match beyond the length.  */
251 | .p2align 4 | |
252 | L(exit_loop_32): | |
4f26ef1b | 253 | add $32, %edx |
093ecf92 | 254 | movdqa (%rdi), %xmm0 |
2f5d20ac | 255 | PCMPEQ %xmm1, %xmm0 |
093ecf92 LD |
256 | pmovmskb %xmm0, %eax |
257 | test %eax, %eax | |
258 | jnz L(matches_1) | |
/* 16 bytes or fewer remaining means the search is over.  */
4f26ef1b | 259 | sub $16, %edx |
093ecf92 | 260 | jbe L(return_null) |
322e23db | 261 | |
/* Second chunk, compared from memory into xmm1.  matches16_1 checks
   the index against rdx, the number of bytes left from offset 16.
   No match returns 0.  */
2f5d20ac | 262 | PCMPEQ 16(%rdi), %xmm1 |
093ecf92 LD |
263 | pmovmskb %xmm1, %eax |
264 | test %eax, %eax | |
265 | jnz L(matches16_1) | |
4f26ef1b | 266 | xor %eax, %eax |
322e23db UD |
267 | ret |
268 | ||
093ecf92 LD |
/* Exits for a match known to lie inside the length.  eax holds the
   nonzero compare mask of one 16-byte chunk.  bsf turns it into the
   index of the first matching byte, which is added to the chunk
   address to form the result in rax.
   matches0 is used after rdi has already been advanced by 64 past the
   group, so the matching chunk (offset 48 of the group) is at
   rdi - 16.  */
269 | .p2align 4 |
270 | L(matches0): | |
271 | bsf %eax, %eax | |
272 | lea -16(%rax, %rdi), %rax | |
322e23db | 273 | ret |
093ecf92 LD |
274 | |
/* Match in the chunk at rdi.  */
275 | .p2align 4 | |
276 | L(matches): | |
277 | bsf %eax, %eax | |
278 | add %rdi, %rax | |
279 | ret | |
280 | ||
/* Match in the chunk at rdi + 16.  */
281 | .p2align 4 | |
282 | L(matches16): | |
283 | bsf %eax, %eax | |
284 | lea 16(%rax, %rdi), %rax | |
285 | ret | |
286 | ||
/* Match in the chunk at rdi + 32.  */
287 | .p2align 4 | |
288 | L(matches32): | |
289 | bsf %eax, %eax | |
290 | lea 32(%rax, %rdi), %rax | |
291 | ret | |
292 | ||
/* Length-checked exits.  eax holds the nonzero compare mask of one
   16-byte chunk and rdx the number of bytes of the buffer left from
   the start of that chunk.  bsf gives the index of the first match.
   If the index is not below rdx the match lies beyond the length and
   0 is returned, otherwise the result is chunk address + index.
   matches_1 handles the chunk at rdi.  */
293 | .p2align 4 | |
294 | L(matches_1): | |
295 | bsf %eax, %eax | |
296 | sub %rax, %rdx | |
297 | jbe L(return_null) | |
298 | add %rdi, %rax | |
299 | ret | |
300 | ||
/* Same check for the chunk at rdi + 16.  */
301 | .p2align 4 | |
302 | L(matches16_1): | |
303 | bsf %eax, %eax | |
304 | sub %rax, %rdx | |
305 | jbe L(return_null) | |
306 | lea 16(%rdi, %rax), %rax | |
307 | ret | |
308 | ||
/* Same check for the chunk at rdi + 32.  */
309 | .p2align 4 | |
310 | L(matches32_1): | |
311 | bsf %eax, %eax | |
312 | sub %rax, %rdx | |
313 | jbe L(return_null) | |
314 | lea 32(%rdi, %rax), %rax | |
315 | ret | |
316 | ||
/* Same check for the chunk at rdi + 48.  */
317 | .p2align 4 | |
318 | L(matches48_1): | |
319 | bsf %eax, %eax | |
320 | sub %rax, %rdx | |
321 | jbe L(return_null) | |
322 | lea 48(%rdi, %rax), %rax | |
323 | ret | |
324 | ||
/* return_null: common exit for no match within the length.  The
   32-bit xor clears all of rax, so the result is 0.  */
325 | .p2align 4 | |
326 | L(return_null): | |
4f26ef1b | 327 | xor %eax, %eax |
093ecf92 | 328 | ret |
2f5d20ac | 329 | END(MEMCHR) |
322e23db | 330 | |
/* Only the byte build (USE_AS_WMEMCHR not defined) emits the two
   symbol declarations below.  Both macros are defined outside this
   file: presumably strong_alias makes __memchr a second name for
   memchr and libc_hidden_builtin_def marks memchr for internal
   linkage -- verify against their definitions.  */
2f5d20ac | 331 | #ifndef USE_AS_WMEMCHR |
322e23db | 332 | strong_alias (memchr, __memchr) |
093ecf92 | 333 | libc_hidden_builtin_def(memchr) |
2f5d20ac | 334 | #endif |