/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

19 | #include <isa-level.h> |
20 | ||
21 | #if ISA_SHOULD_BUILD (3) | |
d2538b91 L |
22 | |
23 | # include <sysdep.h> | |
24 | ||
25 | # ifndef STRRCHR | |
26 | # define STRRCHR __strrchr_avx2 | |
27 | # endif | |
28 | ||
29 | # ifdef USE_AS_WCSRCHR | |
30 | # define VPBROADCAST vpbroadcastd | |
31 | # define VPCMPEQ vpcmpeqd | |
df7e295d NG |
32 | # define VPMIN vpminud |
33 | # define CHAR_SIZE 4 | |
d2538b91 L |
34 | # else |
35 | # define VPBROADCAST vpbroadcastb | |
36 | # define VPCMPEQ vpcmpeqb | |
df7e295d NG |
37 | # define VPMIN vpminub |
38 | # define CHAR_SIZE 1 | |
d2538b91 L |
39 | # endif |
40 | ||
41 | # ifndef VZEROUPPER | |
42 | # define VZEROUPPER vzeroupper | |
43 | # endif | |
44 | ||
7ebba913 L |
45 | # ifndef SECTION |
46 | # define SECTION(p) p##.avx | |
47 | # endif | |
48 | ||
d2538b91 | 49 | # define VEC_SIZE 32 |
df7e295d | 50 | # define PAGE_SIZE 4096 |
d2538b91 | 51 | |
df7e295d NG |
52 | .section SECTION(.text), "ax", @progbits |
53 | ENTRY(STRRCHR) | |
3079f652 | 54 | vmovd %esi, %xmm7 |
df7e295d | 55 | movl %edi, %eax |
d2538b91 | 56 | /* Broadcast CHAR to YMM4. */ |
df7e295d | 57 | VPBROADCAST %xmm7, %ymm7 |
a35a5903 | 58 | vpxor %xmm0, %xmm0, %xmm0 |
d2538b91 | 59 | |
df7e295d NG |
60 | /* Shift here instead of `andl` to save code size (saves a fetch |
61 | block). */ | |
62 | sall $20, %eax | |
63 | cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax | |
64 | ja L(cross_page) | |
d2538b91 | 65 | |
df7e295d | 66 | L(page_cross_continue): |
d2538b91 | 67 | vmovdqu (%rdi), %ymm1 |
df7e295d NG |
68 | /* Check end of string match. */ |
69 | VPCMPEQ %ymm1, %ymm0, %ymm6 | |
70 | vpmovmskb %ymm6, %ecx | |
71 | testl %ecx, %ecx | |
72 | jz L(aligned_more) | |
73 | ||
74 | /* Only check match with search CHAR if needed. */ | |
75 | VPCMPEQ %ymm1, %ymm7, %ymm1 | |
76 | vpmovmskb %ymm1, %eax | |
77 | /* Check if match before first zero. */ | |
78 | blsmskl %ecx, %ecx | |
79 | andl %ecx, %eax | |
80 | jz L(ret0) | |
81 | bsrl %eax, %eax | |
82 | addq %rdi, %rax | |
83 | /* We are off by 3 for wcsrchr if search CHAR is non-zero. If | |
84 | search CHAR is zero we are correct. Either way `andq | |
85 | -CHAR_SIZE, %rax` gets the correct result. */ | |
86 | # ifdef USE_AS_WCSRCHR | |
87 | andq $-CHAR_SIZE, %rax | |
88 | # endif | |
89 | L(ret0): | |
90 | L(return_vzeroupper): | |
91 | ZERO_UPPER_VEC_REGISTERS_RETURN | |
92 | ||
93 | /* Returns for first vec x1/x2 have hard coded backward search | |
94 | path for earlier matches. */ | |
95 | .p2align 4,, 10 | |
96 | L(first_vec_x1): | |
97 | VPCMPEQ %ymm2, %ymm7, %ymm6 | |
98 | vpmovmskb %ymm6, %eax | |
99 | blsmskl %ecx, %ecx | |
100 | andl %ecx, %eax | |
101 | jnz L(first_vec_x1_return) | |
102 | ||
103 | .p2align 4,, 4 | |
104 | L(first_vec_x0_test): | |
105 | VPCMPEQ %ymm1, %ymm7, %ymm6 | |
106 | vpmovmskb %ymm6, %eax | |
107 | testl %eax, %eax | |
108 | jz L(ret1) | |
109 | bsrl %eax, %eax | |
110 | addq %r8, %rax | |
111 | # ifdef USE_AS_WCSRCHR | |
112 | andq $-CHAR_SIZE, %rax | |
113 | # endif | |
114 | L(ret1): | |
115 | VZEROUPPER_RETURN | |
d2538b91 | 116 | |
df7e295d NG |
117 | .p2align 4,, 10 |
118 | L(first_vec_x0_x1_test): | |
119 | VPCMPEQ %ymm2, %ymm7, %ymm6 | |
120 | vpmovmskb %ymm6, %eax | |
121 | /* Check ymm2 for search CHAR match. If no match then check ymm1 | |
122 | before returning. */ | |
d2538b91 | 123 | testl %eax, %eax |
df7e295d NG |
124 | jz L(first_vec_x0_test) |
125 | .p2align 4,, 4 | |
126 | L(first_vec_x1_return): | |
127 | bsrl %eax, %eax | |
128 | leaq 1(%rdi, %rax), %rax | |
129 | # ifdef USE_AS_WCSRCHR | |
130 | andq $-CHAR_SIZE, %rax | |
131 | # endif | |
132 | VZEROUPPER_RETURN | |
d2538b91 | 133 | |
d2538b91 | 134 | |
df7e295d NG |
135 | .p2align 4,, 10 |
136 | L(first_vec_x2): | |
137 | VPCMPEQ %ymm3, %ymm7, %ymm6 | |
138 | vpmovmskb %ymm6, %eax | |
139 | blsmskl %ecx, %ecx | |
140 | /* If no in-range search CHAR match in ymm3 then need to check | |
141 | ymm1/ymm2 for an earlier match (we delay checking search | |
142 | CHAR matches until needed). */ | |
143 | andl %ecx, %eax | |
144 | jz L(first_vec_x0_x1_test) | |
145 | bsrl %eax, %eax | |
146 | leaq (VEC_SIZE + 1)(%rdi, %rax), %rax | |
147 | # ifdef USE_AS_WCSRCHR | |
148 | andq $-CHAR_SIZE, %rax | |
149 | # endif | |
150 | VZEROUPPER_RETURN | |
151 | ||
d2538b91 L |
152 | |
153 | .p2align 4 | |
df7e295d NG |
154 | L(aligned_more): |
155 | /* Save original pointer if match was in VEC 0. */ | |
156 | movq %rdi, %r8 | |
157 | ||
158 | /* Align src. */ | |
159 | orq $(VEC_SIZE - 1), %rdi | |
160 | vmovdqu 1(%rdi), %ymm2 | |
161 | VPCMPEQ %ymm2, %ymm0, %ymm6 | |
162 | vpmovmskb %ymm6, %ecx | |
d2538b91 | 163 | testl %ecx, %ecx |
df7e295d | 164 | jnz L(first_vec_x1) |
d2538b91 | 165 | |
df7e295d NG |
166 | vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 |
167 | VPCMPEQ %ymm3, %ymm0, %ymm6 | |
168 | vpmovmskb %ymm6, %ecx | |
169 | testl %ecx, %ecx | |
170 | jnz L(first_vec_x2) | |
d2538b91 | 171 | |
df7e295d NG |
172 | /* Save pointer again before realigning. */ |
173 | movq %rdi, %rsi | |
174 | addq $(VEC_SIZE + 1), %rdi | |
175 | andq $-(VEC_SIZE * 2), %rdi | |
d2538b91 | 176 | .p2align 4 |
df7e295d NG |
177 | L(first_aligned_loop): |
178 | /* Do 2x VEC at a time. Any more and the cost of finding the | |
179 | match outweights loop benefit. */ | |
180 | vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 | |
181 | vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 | |
182 | ||
183 | VPCMPEQ %ymm4, %ymm7, %ymm6 | |
184 | VPMIN %ymm4, %ymm5, %ymm8 | |
185 | VPCMPEQ %ymm5, %ymm7, %ymm10 | |
186 | vpor %ymm6, %ymm10, %ymm5 | |
187 | VPCMPEQ %ymm8, %ymm0, %ymm8 | |
188 | vpor %ymm5, %ymm8, %ymm9 | |
189 | ||
190 | vpmovmskb %ymm9, %eax | |
191 | addq $(VEC_SIZE * 2), %rdi | |
192 | /* No zero or search CHAR. */ | |
d2538b91 | 193 | testl %eax, %eax |
df7e295d | 194 | jz L(first_aligned_loop) |
d2538b91 | 195 | |
df7e295d NG |
196 | /* If no zero CHAR then go to second loop (this allows us to |
197 | throw away all prior work). */ | |
198 | vpmovmskb %ymm8, %ecx | |
199 | testl %ecx, %ecx | |
200 | jz L(second_aligned_loop_prep) | |
d2538b91 | 201 | |
df7e295d NG |
202 | /* Search char could be zero so we need to get the true match. |
203 | */ | |
204 | vpmovmskb %ymm5, %eax | |
205 | testl %eax, %eax | |
206 | jnz L(first_aligned_loop_return) | |
d2538b91 | 207 | |
df7e295d NG |
208 | .p2align 4,, 4 |
209 | L(first_vec_x1_or_x2): | |
210 | VPCMPEQ %ymm3, %ymm7, %ymm3 | |
211 | VPCMPEQ %ymm2, %ymm7, %ymm2 | |
d2538b91 | 212 | vpmovmskb %ymm3, %eax |
df7e295d NG |
213 | vpmovmskb %ymm2, %edx |
214 | /* Use add for macro-fusion. */ | |
215 | addq %rax, %rdx | |
216 | jz L(first_vec_x0_test) | |
217 | /* NB: We could move this shift to before the branch and save a | |
218 | bit of code size / performance on the fall through. The | |
219 | branch leads to the null case which generally seems hotter | |
220 | than char in first 3x VEC. */ | |
221 | salq $32, %rax | |
222 | addq %rdx, %rax | |
223 | bsrq %rax, %rax | |
224 | leaq 1(%rsi, %rax), %rax | |
225 | # ifdef USE_AS_WCSRCHR | |
226 | andq $-CHAR_SIZE, %rax | |
227 | # endif | |
228 | VZEROUPPER_RETURN | |
d2538b91 | 229 | |
df7e295d NG |
230 | .p2align 4,, 8 |
231 | L(first_aligned_loop_return): | |
232 | VPCMPEQ %ymm4, %ymm0, %ymm4 | |
233 | vpmovmskb %ymm4, %edx | |
234 | salq $32, %rcx | |
235 | orq %rdx, %rcx | |
236 | ||
237 | vpmovmskb %ymm10, %eax | |
238 | vpmovmskb %ymm6, %edx | |
239 | salq $32, %rax | |
240 | orq %rdx, %rax | |
241 | blsmskq %rcx, %rcx | |
242 | andq %rcx, %rax | |
243 | jz L(first_vec_x1_or_x2) | |
244 | ||
245 | bsrq %rax, %rax | |
246 | leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax | |
d2538b91 | 247 | # ifdef USE_AS_WCSRCHR |
df7e295d | 248 | andq $-CHAR_SIZE, %rax |
d2538b91 | 249 | # endif |
df7e295d | 250 | VZEROUPPER_RETURN |
d2538b91 | 251 | |
df7e295d | 252 | /* Search char cannot be zero. */ |
d2538b91 | 253 | .p2align 4 |
df7e295d NG |
254 | L(second_aligned_loop_set_furthest_match): |
255 | /* Save VEC and pointer from most recent match. */ | |
256 | L(second_aligned_loop_prep): | |
d2538b91 | 257 | movq %rdi, %rsi |
df7e295d NG |
258 | vmovdqu %ymm6, %ymm2 |
259 | vmovdqu %ymm10, %ymm3 | |
d2538b91 L |
260 | |
261 | .p2align 4 | |
df7e295d NG |
262 | L(second_aligned_loop): |
263 | /* Search 2x at at time. */ | |
264 | vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 | |
265 | vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 | |
266 | ||
267 | VPCMPEQ %ymm4, %ymm7, %ymm6 | |
268 | VPMIN %ymm4, %ymm5, %ymm1 | |
269 | VPCMPEQ %ymm5, %ymm7, %ymm10 | |
270 | vpor %ymm6, %ymm10, %ymm5 | |
271 | VPCMPEQ %ymm1, %ymm0, %ymm1 | |
272 | vpor %ymm5, %ymm1, %ymm9 | |
273 | ||
274 | vpmovmskb %ymm9, %eax | |
275 | addq $(VEC_SIZE * 2), %rdi | |
d2538b91 | 276 | testl %eax, %eax |
df7e295d NG |
277 | jz L(second_aligned_loop) |
278 | vpmovmskb %ymm1, %ecx | |
279 | testl %ecx, %ecx | |
280 | jz L(second_aligned_loop_set_furthest_match) | |
281 | vpmovmskb %ymm5, %eax | |
d2538b91 | 282 | testl %eax, %eax |
df7e295d NG |
283 | jnz L(return_new_match) |
284 | ||
285 | /* This is the hot patch. We know CHAR is inbounds and that | |
286 | ymm3/ymm2 have latest match. */ | |
287 | .p2align 4,, 4 | |
288 | L(return_old_match): | |
289 | vpmovmskb %ymm3, %eax | |
290 | vpmovmskb %ymm2, %edx | |
291 | salq $32, %rax | |
292 | orq %rdx, %rax | |
293 | bsrq %rax, %rax | |
294 | /* Search char cannot be zero so safe to just use lea for | |
295 | wcsrchr. */ | |
296 | leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax | |
7ebba913 | 297 | VZEROUPPER_RETURN |
d2538b91 | 298 | |
df7e295d NG |
299 | /* Last iteration also potentially has a match. */ |
300 | .p2align 4,, 8 | |
301 | L(return_new_match): | |
302 | VPCMPEQ %ymm4, %ymm0, %ymm4 | |
303 | vpmovmskb %ymm4, %edx | |
304 | salq $32, %rcx | |
305 | orq %rdx, %rcx | |
306 | ||
307 | vpmovmskb %ymm10, %eax | |
308 | vpmovmskb %ymm6, %edx | |
309 | salq $32, %rax | |
310 | orq %rdx, %rax | |
311 | blsmskq %rcx, %rcx | |
312 | andq %rcx, %rax | |
313 | jz L(return_old_match) | |
314 | bsrq %rax, %rax | |
315 | /* Search char cannot be zero so safe to just use lea for | |
316 | wcsrchr. */ | |
317 | leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax | |
7ebba913 | 318 | VZEROUPPER_RETURN |
d2538b91 | 319 | |
df7e295d NG |
320 | .p2align 4,, 4 |
321 | L(cross_page): | |
322 | movq %rdi, %rsi | |
323 | andq $-VEC_SIZE, %rsi | |
324 | vmovdqu (%rsi), %ymm1 | |
325 | VPCMPEQ %ymm1, %ymm0, %ymm6 | |
326 | vpmovmskb %ymm6, %ecx | |
327 | /* Shift out zero CHAR matches that are before the begining of | |
328 | src (rdi). */ | |
329 | shrxl %edi, %ecx, %ecx | |
330 | testl %ecx, %ecx | |
331 | jz L(page_cross_continue) | |
332 | VPCMPEQ %ymm1, %ymm7, %ymm1 | |
333 | vpmovmskb %ymm1, %eax | |
334 | ||
335 | /* Shift out search CHAR matches that are before the begining of | |
336 | src (rdi). */ | |
337 | shrxl %edi, %eax, %eax | |
338 | blsmskl %ecx, %ecx | |
339 | /* Check if any search CHAR match in range. */ | |
340 | andl %ecx, %eax | |
341 | jz L(ret2) | |
342 | bsrl %eax, %eax | |
343 | addq %rdi, %rax | |
344 | # ifdef USE_AS_WCSRCHR | |
345 | andq $-CHAR_SIZE, %rax | |
346 | # endif | |
347 | L(ret2): | |
348 | VZEROUPPER_RETURN | |
349 | END(STRRCHR) | |
d2538b91 | 350 | #endif |