/* Source: glibc sysdeps/x86_64/multiarch/strlen-avx2.S
   (revision of commit "x86: Optimize strlen-avx2.S"),
   as rendered by a git-blame web view.  */
/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#if IS_IN (libc)

# include <sysdep.h>

/* Default IFUNC symbol name; overridden by the strnlen/wcslen/wcsnlen
   wrapper files that #include this one.  */
# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
/* wchar_t variant: compare/minimum operate on 32-bit (dword) lanes.  */
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
/* char variant: compare/minimum operate on 8-bit (byte) lanes.  */
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

/* Text-section selector; the RTM variant overrides this elsewhere.  */
# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* One ymm register covers 32 bytes.  */
# define VEC_SIZE	32
/* Used only for the first-load page-cross check below.  */
# define PAGE_SIZE	4096

	.section SECTION(.text),"ax",@progbits
dc485ceb
L
/* size_t STRLEN (const char *s)                   -- strlen/wcslen
   size_t STRLEN (const char *s, size_t maxlen)    -- strnlen/wcsnlen
   SysV AMD64: rdi = s; rsi = maxlen (USE_AS_STRNLEN only).
   Register roles throughout the function:
     rdx  = original s (so `ptr - rdx` yields the length at return)
     ymm0 = all-zero vector used by VPCMPEQ to find null terminators
     r8   = saved maxlen (USE_AS_STRNLEN; returned when no null found)
   Result is returned in rax, in characters (byte count shifted right
   by 2 for the wide-character variants).  */
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
#  ifdef USE_AS_WCSLEN
	/* Convert maxlen from wchars to bytes.  NOTE(review): this shift
	   can overflow for maxlen near SIZE_MAX; upstream later fixed
	   wcsnlen for that case — confirm against current glibc.  */
	shl	$2, %RSI_LP
#  elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	/* ymm0 = 0: the needle for every null-terminator compare.  */
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length < VEC_SIZE handle special.  */
	cmpq	$VEC_SIZE, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If empty continue to aligned_more.  Otherwise return bit
	   position of first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* Bytes -> wchars.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	   (ecx holds `s + VEC_SIZE * 4 + CHAR_SIZE + 1 - rdx` from
	   cross_page_continue.)  */
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
# else
	/* len = (aligned_ptr + 1 - s) + match_offset.  */
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  (Subsequent loads use offset +1 so they
	   are VEC_SIZE-aligned.)  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(VEC_SIZE * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(VEC_SIZE * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.  Since
	   the matches in ymm2/ymm4 can only be returned if there where no
	   matches in ymm1/ymm3 respectively there is no issue with overlap.
	 */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	/* ymm5 == 0 in some lane iff one of the 4 VECs had a null.  */
	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx

	/* subq of a negative is used instead of addq: same effect,
	   smaller encoding for the large immediate.  */
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	/* A null was seen somewhere in the last 4 VECs; locate it.  */
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has combined result from all 4 VEC.  It will only be used
	   if the first 3 other VEC all did not contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
	vpmovmskb	%ymm1, %eax
	/* If remaining length > VEC_SIZE * 2.  This works if esi is off
	   by VEC_SIZE * 4.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* length may have been negative or positive by an offset of
	   VEC_SIZE * 4 depending on where this was called from.  This
	   fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	/* No null within maxlen: return the saved maxlen.  */
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* essentially duplicates of first_vec_x1 but use 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* essentially duplicates of first_vec_x1 but use 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  (The aligned load below cannot
	   fault: it stays within the same page as s.)  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	/* Match found within the first (partial) VEC; clamp to maxlen.  */
	tzcntl	%eax, %eax
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif