/* fast SSE2 memrchr with 64 byte loop and pmaxub instruction using

   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
380292ba 23ENTRY (__memrchr)
7395928b 24 movd %esi, %xmm1
093ecf92 25
ecd8b842 26 sub $16, %RDX_LP
093ecf92
LD
27 jbe L(length_less16)
28
29 punpcklbw %xmm1, %xmm1
30 punpcklbw %xmm1, %xmm1
31
ecd8b842 32 add %RDX_LP, %RDI_LP
093ecf92
LD
33 pshufd $0, %xmm1, %xmm1
34
35 movdqu (%rdi), %xmm0
36 pcmpeqb %xmm1, %xmm0
37
38/* Check if there is a match. */
39 pmovmskb %xmm0, %eax
40 test %eax, %eax
41 jnz L(matches0)
42
43 sub $64, %rdi
7395928b
L
44 mov %edi, %ecx
45 and $15, %ecx
093ecf92
LD
46 jz L(loop_prolog)
47
48 add $16, %rdi
49 add $16, %rdx
50 and $-16, %rdi
51 sub %rcx, %rdx
52
53 .p2align 4
54L(loop_prolog):
55 sub $64, %rdx
56 jbe L(exit_loop)
57
58 movdqa 48(%rdi), %xmm0
59 pcmpeqb %xmm1, %xmm0
60 pmovmskb %xmm0, %eax
61 test %eax, %eax
62 jnz L(matches48)
63
64 movdqa 32(%rdi), %xmm2
65 pcmpeqb %xmm1, %xmm2
66 pmovmskb %xmm2, %eax
67 test %eax, %eax
68 jnz L(matches32)
69
70 movdqa 16(%rdi), %xmm3
71 pcmpeqb %xmm1, %xmm3
72 pmovmskb %xmm3, %eax
73 test %eax, %eax
74 jnz L(matches16)
75
76 movdqa (%rdi), %xmm4
77 pcmpeqb %xmm1, %xmm4
78 pmovmskb %xmm4, %eax
79 test %eax, %eax
80 jnz L(matches0)
81
82 sub $64, %rdi
83 sub $64, %rdx
84 jbe L(exit_loop)
85
86 movdqa 48(%rdi), %xmm0
87 pcmpeqb %xmm1, %xmm0
88 pmovmskb %xmm0, %eax
89 test %eax, %eax
90 jnz L(matches48)
91
92 movdqa 32(%rdi), %xmm2
93 pcmpeqb %xmm1, %xmm2
94 pmovmskb %xmm2, %eax
95 test %eax, %eax
96 jnz L(matches32)
97
98 movdqa 16(%rdi), %xmm3
99 pcmpeqb %xmm1, %xmm3
100 pmovmskb %xmm3, %eax
101 test %eax, %eax
102 jnz L(matches16)
103
104 movdqa (%rdi), %xmm3
105 pcmpeqb %xmm1, %xmm3
106 pmovmskb %xmm3, %eax
107 test %eax, %eax
108 jnz L(matches0)
109
7395928b
L
110 mov %edi, %ecx
111 and $63, %ecx
66fb11b1 112 jz L(align64_loop)
093ecf92
LD
113
114 add $64, %rdi
115 add $64, %rdx
116 and $-64, %rdi
117 sub %rcx, %rdx
118
119 .p2align 4
120L(align64_loop):
121 sub $64, %rdi
122 sub $64, %rdx
123 jbe L(exit_loop)
124
125 movdqa (%rdi), %xmm0
126 movdqa 16(%rdi), %xmm2
127 movdqa 32(%rdi), %xmm3
128 movdqa 48(%rdi), %xmm4
129
130 pcmpeqb %xmm1, %xmm0
131 pcmpeqb %xmm1, %xmm2
132 pcmpeqb %xmm1, %xmm3
133 pcmpeqb %xmm1, %xmm4
134
135 pmaxub %xmm3, %xmm0
136 pmaxub %xmm4, %xmm2
137 pmaxub %xmm0, %xmm2
138 pmovmskb %xmm2, %eax
139
140 test %eax, %eax
141 jz L(align64_loop)
142
143 pmovmskb %xmm4, %eax
144 test %eax, %eax
145 jnz L(matches48)
146
147 pmovmskb %xmm3, %eax
148 test %eax, %eax
149 jnz L(matches32)
150
151 movdqa 16(%rdi), %xmm2
152
153 pcmpeqb %xmm1, %xmm2
154 pcmpeqb (%rdi), %xmm1
155
156 pmovmskb %xmm2, %eax
157 test %eax, %eax
158 jnz L(matches16)
159
160 pmovmskb %xmm1, %eax
161 bsr %eax, %eax
162
163 add %rdi, %rax
164 ret
165
166 .p2align 4
167L(exit_loop):
7395928b
L
168 add $64, %edx
169 cmp $32, %edx
093ecf92
LD
170 jbe L(exit_loop_32)
171
172 movdqa 48(%rdi), %xmm0
173 pcmpeqb %xmm1, %xmm0
174 pmovmskb %xmm0, %eax
175 test %eax, %eax
176 jnz L(matches48)
177
178 movdqa 32(%rdi), %xmm2
179 pcmpeqb %xmm1, %xmm2
180 pmovmskb %xmm2, %eax
181 test %eax, %eax
182 jnz L(matches32)
183
184 movdqa 16(%rdi), %xmm3
185 pcmpeqb %xmm1, %xmm3
186 pmovmskb %xmm3, %eax
187 test %eax, %eax
188 jnz L(matches16_1)
7395928b 189 cmp $48, %edx
093ecf92
LD
190 jbe L(return_null)
191
192 pcmpeqb (%rdi), %xmm1
193 pmovmskb %xmm1, %eax
194 test %eax, %eax
195 jnz L(matches0_1)
196 xor %eax, %eax
197 ret
198
199 .p2align 4
200L(exit_loop_32):
201 movdqa 48(%rdi), %xmm0
202 pcmpeqb %xmm1, %xmm0
203 pmovmskb %xmm0, %eax
204 test %eax, %eax
205 jnz L(matches48_1)
7395928b 206 cmp $16, %edx
093ecf92
LD
207 jbe L(return_null)
208
209 pcmpeqb 32(%rdi), %xmm1
210 pmovmskb %xmm1, %eax
211 test %eax, %eax
212 jnz L(matches32_1)
213 xor %eax, %eax
214 ret
215
216 .p2align 4
217L(matches0):
218 bsr %eax, %eax
219 add %rdi, %rax
220 ret
221
222 .p2align 4
223L(matches16):
224 bsr %eax, %eax
225 lea 16(%rax, %rdi), %rax
226 ret
227
228 .p2align 4
229L(matches32):
230 bsr %eax, %eax
231 lea 32(%rax, %rdi), %rax
232 ret
233
234 .p2align 4
235L(matches48):
236 bsr %eax, %eax
237 lea 48(%rax, %rdi), %rax
238 ret
239
240 .p2align 4
241L(matches0_1):
242 bsr %eax, %eax
243 sub $64, %rdx
244 add %rax, %rdx
245 jl L(return_null)
246 add %rdi, %rax
247 ret
248
249 .p2align 4
250L(matches16_1):
251 bsr %eax, %eax
252 sub $48, %rdx
253 add %rax, %rdx
254 jl L(return_null)
255 lea 16(%rdi, %rax), %rax
256 ret
257
258 .p2align 4
259L(matches32_1):
260 bsr %eax, %eax
261 sub $32, %rdx
262 add %rax, %rdx
263 jl L(return_null)
264 lea 32(%rdi, %rax), %rax
265 ret
266
267 .p2align 4
268L(matches48_1):
269 bsr %eax, %eax
270 sub $16, %rdx
271 add %rax, %rdx
272 jl L(return_null)
273 lea 48(%rdi, %rax), %rax
274 ret
275
276 .p2align 4
277L(return_null):
7395928b 278 xor %eax, %eax
093ecf92
LD
279 ret
280
281 .p2align 4
282L(length_less16_offset0):
81dcc7fb
AS
283 test %edx, %edx
284 jz L(return_null)
285
093ecf92
LD
286 mov %dl, %cl
287 pcmpeqb (%rdi), %xmm1
288
289 mov $1, %edx
290 sal %cl, %edx
291 sub $1, %edx
292
293 pmovmskb %xmm1, %eax
294
295 and %edx, %eax
296 test %eax, %eax
297 jz L(return_null)
298
299 bsr %eax, %eax
300 add %rdi, %rax
301 ret
302
303 .p2align 4
304L(length_less16):
305 punpcklbw %xmm1, %xmm1
306 punpcklbw %xmm1, %xmm1
307
7395928b 308 add $16, %edx
093ecf92
LD
309
310 pshufd $0, %xmm1, %xmm1
311
7395928b
L
312 mov %edi, %ecx
313 and $15, %ecx
093ecf92
LD
314 jz L(length_less16_offset0)
315
093ecf92 316 mov %cl, %dh
7395928b 317 mov %ecx, %esi
093ecf92
LD
318 add %dl, %dh
319 and $-16, %rdi
320
321 sub $16, %dh
322 ja L(length_less16_part2)
323
324 pcmpeqb (%rdi), %xmm1
325 pmovmskb %xmm1, %eax
326
327 sar %cl, %eax
328 mov %dl, %cl
329
330 mov $1, %edx
331 sal %cl, %edx
332 sub $1, %edx
333
334 and %edx, %eax
335 test %eax, %eax
336 jz L(return_null)
337
338 bsr %eax, %eax
339 add %rdi, %rax
7395928b 340 add %rsi, %rax
093ecf92
LD
341 ret
342
343 .p2align 4
344L(length_less16_part2):
345 movdqa 16(%rdi), %xmm2
346 pcmpeqb %xmm1, %xmm2
347 pmovmskb %xmm2, %eax
348
349 mov %dh, %cl
350 mov $1, %edx
351 sal %cl, %edx
352 sub $1, %edx
353
354 and %edx, %eax
355
356 test %eax, %eax
357 jnz L(length_less16_part2_return)
358
359 pcmpeqb (%rdi), %xmm1
360 pmovmskb %xmm1, %eax
361
7395928b 362 mov %esi, %ecx
093ecf92
LD
363 sar %cl, %eax
364 test %eax, %eax
365 jz L(return_null)
366
367 bsr %eax, %eax
368 add %rdi, %rax
7395928b 369 add %rsi, %rax
093ecf92
LD
370 ret
371
372 .p2align 4
373L(length_less16_part2_return):
374 bsr %eax, %eax
375 lea 16(%rax, %rdi), %rax
376 ret
377
380292ba
JM
378END (__memrchr)
379weak_alias (__memrchr, memrchr)