/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
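	/* Input: %rdi points to the string; for strnlen/wcsnlen %rsi
	   holds the maximum length (in characters).  The length is
	   returned in %rax.  */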
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
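	/* Save the start of the string in %rdx; it is needed later to
	   compute the return value.  %ymm0 is zeroed as the comparison
	   value used to find null terminators.  */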
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC, handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If the mask is empty, continue to aligned_more.  Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
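	/* NB: If the max-len bit lands at position VEC_SIZE (bit 32) it
	   falls outside %eax, but then tzcntl sees zero (when there is
	   no match) and returns 32, which is still the desired
	   result.  */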
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes
	   of code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
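	/* rdi now points to the last byte below a VEC_SIZE aligned
	   boundary, so the +1 displacements used on the loads below
	   yield VEC_SIZE aligned addresses.  */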
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

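	/* ymm5 is the minimum across all 4 VECs, so a single compare
	   against zero detects a null CHAR anywhere in the
	   4 * VEC_SIZE block.  */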
	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

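	/* Subtract a negative constant instead of adding so the
	   immediate fits in a sign-extended imm8, saving code size.  */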
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
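	/* Convert rdi from a pointer into the offset from the start of
	   the string (saved in rdx); the return paths below add the
	   bit position of the match plus a constant adjustment.  */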
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It is only used
	   if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on the entry point, adjust rdi / prepare the first
	   VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Branch if the remaining length is > VEC_SIZE * 2.  Testing
	   this bit works even if esi is off by VEC_SIZE * 4.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was reached from.  Masking fixes
	   that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):

	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
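	/* After the orq, -(VEC_SIZE - 1)(%rdi) is the VEC_SIZE aligned
	   block containing the first byte of the string, so the load
	   below cannot cross a page boundary.  */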
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [4:0] of
	   COUNT so there is no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif