/* Copyright (C) 2011-2020 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
#endif

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

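/* The contract this file implements corresponds roughly to the following
   C (an illustrative, non-vectorized sketch only, not the algorithm used
   below; wmemchr is the analogous routine over 32-bit wide characters):

	void *memchr (const void *s, int c, size_t n)
	{
	  const unsigned char *p = s;
	  while (n--)
	    if (*p++ == (unsigned char) c)
	      return (void *) (p - 1);
	  return NULL;
	}

   The vector code below broadcasts C into an XMM register, compares 16
   bytes at a time with PCMPEQ, turns each comparison into a bit mask
   with pmovmskb, and converts the first set bit into an offset with bsf.
   The main loop handles 64 bytes per iteration.  */
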
	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx
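	/* xmm1 holds the character to search for and ecx the low bits of
	   the source pointer, used for the alignment checks below.  For
	   wmemchr the length is counted in 32-bit wide characters and is
	   converted to bytes; for memchr the byte is widened with
	   punpcklbw so that the pshufd below broadcasts it to all lanes.  */
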
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	shl	$2, %RDX_LP
#else
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif
093ecf92 50
4f26ef1b 51 and $63, %ecx
322e23db 52 pshufd $0, %xmm1, %xmm1
093ecf92 53
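	/* If the pointer is more than 48 bytes into its 64-byte block, the
	   unaligned 16-byte load below could cross a cache-line boundary;
	   take the aligned path at L(crosscache) instead.  */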
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$16, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
	add	%rcx, %rdx
	sub	$64, %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
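	/* Align rdi down to 16 bytes and compare the whole aligned block;
	   mask bits belonging to bytes before the real start are shifted
	   out below.  */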
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
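	/* For example, with rcx == 4 the aligned block above already
	   covered 16 - 4 == 12 bytes of the buffer; rdx - 12 cannot wrap
	   around, whereas rdx + 4 could when rdx is close to SIZE_MAX.  */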
	neg	%rcx
	add	$16, %rcx
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
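	/* Check the next 64 bytes 16 at a time, then make sure rdi is
	   aligned to 64 bytes before entering the main loop.  */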
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

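	/* rdi is not yet 64-byte aligned: check one more round of up to 64
	   bytes, then round rdi down to a 64-byte boundary and compensate
	   rdx for the bytes that will be rescanned.  */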
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
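	/* Main loop: compare 64 bytes per iteration.  A match sets all
	   bits of its element, so pmaxub of the four comparison results is
	   nonzero iff any element matched, and a single pmovmskb/test
	   covers all 64 bytes.  */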
	sub	$64, %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
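	/* At most 64 bytes remain (edx holds the remaining length minus
	   64).  Check the tail 16 bytes at a time; blocks that may extend
	   past the end report matches through the L(matches*_1) exits,
	   which verify the offset against the remaining length.  */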
	add	$32, %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$16, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
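	/* At most 32 bytes remain; restore the true remaining count in
	   edx and check the last one or two 16-byte blocks.  */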
	add	$32, %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$16, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

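	/* Match epilogues.  The number in each label is the offset of the
	   matching 16-byte block relative to rdi; L(matches0) is used after
	   rdi has already been advanced past the block, hence the -16.  The
	   _1 variants additionally check that the match lies within the
	   requested length before returning.  */
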
	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif