1 /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
2 Copyright (C) 2017-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
24 # define STRLEN __strlen_avx2
28 # define VPCMPEQ vpcmpeqd
29 # define VPMINU vpminud
31 # define VPCMPEQ vpcmpeqb
32 # define VPMINU vpminub
36 # define VZEROUPPER vzeroupper
41 .section .text.avx,"ax",@progbits
43 # ifdef USE_AS_STRNLEN
44 /* Check for zero length. */
54 vpxor %xmm0, %xmm0, %xmm0
56 /* Check if we may cross page boundary with one vector load. */
57 andl $(2 * VEC_SIZE - 1), %ecx
59 ja L(cros_page_boundary)
61 /* Check the first VEC_SIZE bytes. */
62 VPCMPEQ (%rdi), %ymm0, %ymm1
66 # ifdef USE_AS_STRNLEN
67 jnz L(first_vec_x0_check)
68 /* Adjust length and check the end of data. */
75 /* Align data for aligned loads in the loop. */
77 andl $(VEC_SIZE - 1), %ecx
80 # ifdef USE_AS_STRNLEN
84 subq $(VEC_SIZE * 4), %rsi
85 jbe L(last_4x_vec_or_less)
90 L(cros_page_boundary):
91 andl $(VEC_SIZE - 1), %ecx
93 VPCMPEQ (%rdi), %ymm0, %ymm1
95 /* Remove the leading bytes. */
100 # ifdef USE_AS_STRNLEN
101 /* Check the end of data. */
108 # ifdef USE_AS_WCSLEN
116 # ifdef USE_AS_STRNLEN
117 /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
118 with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
119 to avoid possible addition overflow. */
123 /* Check the end of data. */
130 # ifdef USE_AS_STRNLEN
131 subq $(VEC_SIZE * 4), %rsi
132 jbe L(last_4x_vec_or_less)
136 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
137 since data is only aligned to VEC_SIZE. */
138 VPCMPEQ (%rdi), %ymm0, %ymm1
139 vpmovmskb %ymm1, %eax
143 VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
144 vpmovmskb %ymm1, %eax
148 VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
149 vpmovmskb %ymm1, %eax
153 VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
154 vpmovmskb %ymm1, %eax
158 addq $(VEC_SIZE * 4), %rdi
160 # ifdef USE_AS_STRNLEN
161 subq $(VEC_SIZE * 4), %rsi
162 jbe L(last_4x_vec_or_less)
165 /* Align data to 4 * VEC_SIZE. */
167 andl $(4 * VEC_SIZE - 1), %ecx
168 andq $-(4 * VEC_SIZE), %rdi
170 # ifdef USE_AS_STRNLEN
177 /* Compare 4 * VEC at a time forward. */
178 vmovdqa (%rdi), %ymm1
179 vmovdqa VEC_SIZE(%rdi), %ymm2
180 vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
181 vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
182 VPMINU %ymm1, %ymm2, %ymm5
183 VPMINU %ymm3, %ymm4, %ymm6
184 VPMINU %ymm5, %ymm6, %ymm5
186 VPCMPEQ %ymm5, %ymm0, %ymm5
187 vpmovmskb %ymm5, %eax
191 addq $(VEC_SIZE * 4), %rdi
193 # ifndef USE_AS_STRNLEN
196 subq $(VEC_SIZE * 4), %rsi
199 L(last_4x_vec_or_less):
200 /* Less than 4 * VEC and aligned to VEC_SIZE. */
201 addl $(VEC_SIZE * 2), %esi
204 VPCMPEQ (%rdi), %ymm0, %ymm1
205 vpmovmskb %ymm1, %eax
209 VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
210 vpmovmskb %ymm1, %eax
214 VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
215 vpmovmskb %ymm1, %eax
218 jnz L(first_vec_x2_check)
222 VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
223 vpmovmskb %ymm1, %eax
226 jnz L(first_vec_x3_check)
228 # ifdef USE_AS_WCSLEN
236 addl $(VEC_SIZE * 2), %esi
237 VPCMPEQ (%rdi), %ymm0, %ymm1
238 vpmovmskb %ymm1, %eax
241 jnz L(first_vec_x0_check)
245 VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
246 vpmovmskb %ymm1, %eax
248 jnz L(first_vec_x1_check)
250 # ifdef USE_AS_WCSLEN
257 L(first_vec_x0_check):
259 /* Check the end of data. */
264 # ifdef USE_AS_WCSLEN
271 L(first_vec_x1_check):
273 /* Check the end of data. */
279 # ifdef USE_AS_WCSLEN
286 L(first_vec_x2_check):
288 /* Check the end of data. */
291 addq $(VEC_SIZE * 2), %rax
294 # ifdef USE_AS_WCSLEN
301 L(first_vec_x3_check):
303 /* Check the end of data. */
306 addq $(VEC_SIZE * 3), %rax
309 # ifdef USE_AS_WCSLEN
318 # ifdef USE_AS_WCSLEN
335 # ifdef USE_AS_WCSLEN
347 # ifdef USE_AS_WCSLEN
356 addq $(VEC_SIZE * 2), %rax
359 # ifdef USE_AS_WCSLEN
367 VPCMPEQ %ymm1, %ymm0, %ymm1
368 vpmovmskb %ymm1, %eax
371 VPCMPEQ %ymm2, %ymm0, %ymm2
372 vpmovmskb %ymm2, %eax
375 VPCMPEQ %ymm3, %ymm0, %ymm3
376 vpmovmskb %ymm3, %eax
379 VPCMPEQ %ymm4, %ymm0, %ymm4
380 vpmovmskb %ymm4, %eax
383 addq $(VEC_SIZE * 3), %rax
386 # ifdef USE_AS_WCSLEN