/* memrchr optimized with AVX2.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE	32

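/* void *__memrchr_avx2 (const void *s, int c, size_t n)

   Scan the N bytes at S backward for the last occurrence of the byte
   C and return a pointer to it, or NULL if it does not occur.  In the
   System V x86-64 ABI, S arrives in RDI, C in ESI and N in RDX.  The
   strategy is the one shared by the AVX2 string routines: compare 32
   bytes at a time against a vector holding 32 copies of C, turn each
   compare into a bit mask, and walk the buffer from its end toward
   its start so that the first hit found is the last occurrence.  */
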
	.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

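	/* Every compare below is VPCMPEQB against the 32 copies of CHAR
	   in YMM0, followed by VPMOVMSKB, which packs the byte-wise
	   result into a 32-bit mask where bit I is set iff byte I
	   matched.  BSR on such a mask yields the highest set bit,
	   i.e. the last matching byte in that vector.  */
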
	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  The vectors are
	   checked from the highest address down so that the first hit
	   is the last occurrence.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

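	/* Main loop: step backward 4 * VEC_SIZE per iteration over
	   4 * VEC_SIZE-aligned memory.  The four compare results are
	   OR-ed together so a single VPMOVMSKB/TEST decides whether
	   any of the 128 bytes matched; which vector and byte it was
	   is only sorted out after the loop exits.  */
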
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match: check the vectors from the highest address
	   (YMM4) down so that the last occurrence is found first.  */
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

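	/* Fewer than 4 * VEC_SIZE bytes remain below the current
	   window.  Restore RDX to the remaining byte count and check
	   up to four vectors, verifying that a match in a partially
	   covered vector still lies within the caller's buffer.  */
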
	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
	/* A match in the lowest vector is valid only if it is not
	   before the start of the buffer.  */
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

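	/* The L(last_vec_xN) tails turn a match mask for vector N into
	   the return value: BSR finds the highest set bit, i.e. the
	   last matching byte, then N * VEC_SIZE and the window base in
	   RDI are added to form the pointer.  */
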
	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	VZEROUPPER
L(null):
	xorl	%eax, %eax
	ret

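	/* The paths below handle an initial length of at most VEC_SIZE
	   bytes.  Loads are done from VEC_SIZE-aligned addresses so
	   that a full-vector load cannot cross a page boundary, and
	   the bytes outside the caller's buffer are masked out of the
	   compare result.  */
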
	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	/* Build the mask (1 << length) - 1.  The count in CL can be
	   VEC_SIZE (32), so shift in 64 bits: a 32-bit shift would
	   truncate the count to 0.  */
	movl	$1, %edx
	salq	%cl, %rdx
	subq	$1, %rdx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	/* Save the misalignment so the return value can be rebased
	   onto the original pointer.  */
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	/* If misalignment + length spills into a second vector, two
	   aligned vectors have to be checked.  */
	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret
END (__memrchr_avx2)
#endif