1 /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
2 Copyright (C) 2017-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
24 # define STRLEN __strlen_avx2
28 # define VPCMPEQ vpcmpeqd
29 # define VPMINU vpminud
32 # define VPCMPEQ vpcmpeqb
33 # define VPMINU vpminub
38 # define VZEROUPPER vzeroupper
42 # define SECTION(p) p##.avx
46 # define PAGE_SIZE 4096
47 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
49 .section SECTION(.text),"ax",@progbits
51 # ifdef USE_AS_STRNLEN
52 /* Check zero length. */
54 /* Clear upper bits. */
60 /* Store max len in R8_LP before adjusting if using WCSLEN. */
65 vpxor %xmm0, %xmm0, %xmm0
66 /* Clear high bits from edi. Only keeping bits relevant to page
68 andl $(PAGE_SIZE - 1), %eax
69 /* Check if we may cross page boundary with one vector load. */
70 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
71 ja L(cross_page_boundary)
73 /* Check the first VEC_SIZE bytes. */
74 VPCMPEQ (%rdi), %ymm0, %ymm1
76 # ifdef USE_AS_STRNLEN
77 /* If length < VEC_SIZE handle special. */
78 cmpq $CHAR_PER_VEC, %rsi
81 /* If empty continue to aligned_more. Otherwise return bit
82 position of first match. */
87 /* NB: Divide bytes by 4 to get wchar_t count. */
92 # ifdef USE_AS_STRNLEN
99 /* Set bit for max len so that tzcnt will return min of max len
100 and position of first match. */
101 # ifdef USE_AS_WCSLEN
102 /* NB: Multiply length by 4 to get byte count. */
107 # ifdef USE_AS_WCSLEN
108 /* NB: Divide bytes by 4 to get wchar_t count. */
117 /* Safe to use 32 bit instructions as these are only called for
119 # ifdef USE_AS_STRNLEN
120 /* Use ecx which was computed earlier to compute correct value.
122 # ifdef USE_AS_WCSLEN
123 leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
125 subl $(VEC_SIZE * 4 + 1), %ecx
133 # ifdef USE_AS_WCSLEN
134 /* NB: Divide bytes by 4 to get wchar_t count. */
142 /* Safe to use 32 bit instructions as these are only called for
144 # ifdef USE_AS_STRNLEN
145 /* Use ecx which was computed earlier to compute correct value.
147 # ifdef USE_AS_WCSLEN
148 leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
150 subl $(VEC_SIZE * 3 + 1), %ecx
155 addl $(VEC_SIZE + 1), %edi
158 # ifdef USE_AS_WCSLEN
159 /* NB: Divide bytes by 4 to get wchar_t count. */
167 /* Safe to use 32 bit instructions as these are only called for
169 # ifdef USE_AS_STRNLEN
170 /* Use ecx which was computed earlier to compute correct value.
172 # ifdef USE_AS_WCSLEN
173 leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
175 subl $(VEC_SIZE * 2 + 1), %ecx
180 addl $(VEC_SIZE * 2 + 1), %edi
183 # ifdef USE_AS_WCSLEN
184 /* NB: Divide bytes by 4 to get wchar_t count. */
192 /* Safe to use 32 bit instructions as these are only called for
194 # ifdef USE_AS_STRNLEN
195 /* Use ecx which was computed earlier to compute correct value.
197 # ifdef USE_AS_WCSLEN
198 leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
200 subl $(VEC_SIZE + 1), %ecx
205 addl $(VEC_SIZE * 3 + 1), %edi
208 # ifdef USE_AS_WCSLEN
209 /* NB: Divide bytes by 4 to get wchar_t count. */
216 /* Align data to VEC_SIZE - 1. This is the same number of
217 instructions as using andq with -VEC_SIZE but saves 4 bytes of
218 code on the x4 check. */
219 orq $(VEC_SIZE - 1), %rdi
220 L(cross_page_continue):
221 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
222 since data is only aligned to VEC_SIZE. */
223 # ifdef USE_AS_STRNLEN
224 /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
225 because it simplifies the logic in last_4x_vec_or_less. */
226 leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
228 # ifdef USE_AS_WCSLEN
229 /* NB: Divide bytes by 4 to get the wchar_t count. */
233 /* Load first VEC regardless. */
234 VPCMPEQ 1(%rdi), %ymm0, %ymm1
235 # ifdef USE_AS_STRNLEN
236 /* Adjust length. If near end handle specially. */
238 jb L(last_4x_vec_or_less)
240 vpmovmskb %ymm1, %eax
244 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
245 vpmovmskb %ymm1, %eax
249 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
250 vpmovmskb %ymm1, %eax
254 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
255 vpmovmskb %ymm1, %eax
259 /* Align data to VEC_SIZE * 4 - 1. */
260 # ifdef USE_AS_STRNLEN
261 /* Before adjusting length check if at last VEC_SIZE * 4. */
262 cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
263 jbe L(last_4x_vec_or_less_load)
266 orq $(VEC_SIZE * 4 - 1), %rdi
267 andl $(VEC_SIZE * 4 - 1), %ecx
268 # ifdef USE_AS_WCSLEN
269 /* NB: Divide bytes by 4 to get the wchar_t count. */
272 /* Readjust length. */
276 orq $(VEC_SIZE * 4 - 1), %rdi
278 /* Compare 4 * VEC at a time forward. */
281 # ifdef USE_AS_STRNLEN
282 /* Break if at end of length. */
283 subq $(CHAR_PER_VEC * 4), %rsi
284 jb L(last_4x_vec_or_less_cmpeq)
286 /* Save some code size by microfusing VPMINU with the load.
287 Since the matches in ymm2/ymm4 can only be returned if there
288 were no matches in ymm1/ymm3 respectively there is no issue
290 vmovdqa 1(%rdi), %ymm1
291 VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
292 vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
293 VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
295 VPMINU %ymm2, %ymm4, %ymm5
296 VPCMPEQ %ymm5, %ymm0, %ymm5
297 vpmovmskb %ymm5, %ecx
299 subq $-(VEC_SIZE * 4), %rdi
304 VPCMPEQ %ymm1, %ymm0, %ymm1
305 vpmovmskb %ymm1, %eax
308 jnz L(last_vec_return_x0)
310 VPCMPEQ %ymm2, %ymm0, %ymm2
311 vpmovmskb %ymm2, %eax
313 jnz L(last_vec_return_x1)
315 /* Combine last 2 VEC. */
316 VPCMPEQ %ymm3, %ymm0, %ymm3
317 vpmovmskb %ymm3, %eax
318 /* rcx has combined result from all 4 VEC. It will only be used
319 if the first 3 other VEC all did not contain a match. */
323 subq $(VEC_SIZE * 2 - 1), %rdi
325 # ifdef USE_AS_WCSLEN
326 /* NB: Divide bytes by 4 to get wchar_t count. */
332 # ifdef USE_AS_STRNLEN
334 L(last_4x_vec_or_less_load):
335 /* Depending on entry adjust rdi / prepare first VEC in ymm1.
337 subq $-(VEC_SIZE * 4), %rdi
338 L(last_4x_vec_or_less_cmpeq):
339 VPCMPEQ 1(%rdi), %ymm0, %ymm1
340 L(last_4x_vec_or_less):
341 # ifdef USE_AS_WCSLEN
342 /* NB: Multiply length by 4 to get byte count. */
345 vpmovmskb %ymm1, %eax
346 /* If remaining length > VEC_SIZE * 2. This works if esi is off
348 testl $(VEC_SIZE * 2), %esi
351 /* length may have been negative or positive by an offset of
352 VEC_SIZE * 4 depending on where this was called from. This fixes
354 andl $(VEC_SIZE * 4 - 1), %esi
356 jnz L(last_vec_x1_check)
361 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
362 vpmovmskb %ymm1, %eax
364 /* Check the end of data. */
368 addl $(VEC_SIZE + 1), %eax
370 # ifdef USE_AS_WCSLEN
371 /* NB: Divide bytes by 4 to get wchar_t count. */
378 L(last_vec_return_x0):
380 subq $(VEC_SIZE * 4 - 1), %rdi
382 # ifdef USE_AS_WCSLEN
383 /* NB: Divide bytes by 4 to get wchar_t count. */
389 L(last_vec_return_x1):
391 subq $(VEC_SIZE * 3 - 1), %rdi
393 # ifdef USE_AS_WCSLEN
394 /* NB: Divide bytes by 4 to get wchar_t count. */
399 # ifdef USE_AS_STRNLEN
401 L(last_vec_x1_check):
404 /* Check the end of data. */
410 # ifdef USE_AS_WCSLEN
411 /* NB: Divide bytes by 4 to get wchar_t count. */
422 /* Test first 2x VEC normally. */
426 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
427 vpmovmskb %ymm1, %eax
431 /* Normalize length. */
432 andl $(VEC_SIZE * 4 - 1), %esi
433 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
434 vpmovmskb %ymm1, %eax
438 subl $(VEC_SIZE * 3), %esi
441 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
442 vpmovmskb %ymm1, %eax
444 /* Check the end of data. */
448 addl $(VEC_SIZE * 3 + 1), %eax
450 # ifdef USE_AS_WCSLEN
451 /* NB: Divide bytes by 4 to get wchar_t count. */
459 /* essentially duplicates of first_vec_x1 but use 64 bit
465 # ifdef USE_AS_WCSLEN
466 /* NB: Divide bytes by 4 to get wchar_t count. */
473 /* essentially duplicates of first_vec_x1 but use 64 bit
477 addl $(VEC_SIZE + 1), %eax
479 # ifdef USE_AS_WCSLEN
480 /* NB: Divide bytes by 4 to get wchar_t count. */
488 subl $(VEC_SIZE * 2), %esi
489 /* Check the end of data. */
493 addl $(VEC_SIZE * 2 + 1), %eax
495 # ifdef USE_AS_WCSLEN
496 /* NB: Divide bytes by 4 to get wchar_t count. */
505 /* Cold case for crossing page with first load. */
507 L(cross_page_boundary):
508 /* Align data to VEC_SIZE - 1. */
509 orq $(VEC_SIZE - 1), %rdi
510 VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
511 vpmovmskb %ymm1, %eax
512 /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
513 so no need to manually mod rdx. */
514 sarxl %edx, %eax, %eax
515 # ifdef USE_AS_STRNLEN
517 jnz L(cross_page_less_vec)
520 # ifdef USE_AS_WCSLEN
521 /* NB: Divide bytes by 4 to get wchar_t count. */
526 jb L(cross_page_continue)
530 jz L(cross_page_continue)
532 # ifdef USE_AS_WCSLEN
533 /* NB: Divide length by 4 to get wchar_t count. */
537 L(return_vzeroupper):
538 ZERO_UPPER_VEC_REGISTERS_RETURN
540 # ifdef USE_AS_STRNLEN
542 L(cross_page_less_vec):
544 # ifdef USE_AS_WCSLEN
545 /* NB: Multiply length by 4 to get byte count. */
550 # ifdef USE_AS_WCSLEN