1 /* strlen/wcslen optimized with 256/512-bit EVEX instructions.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include <isa-level.h>
22 #if ISA_SHOULD_BUILD (4)
/* Character-width dependent instruction selection.  Two alternative
   sets of the same macro names follow.
   NOTE(review): the preprocessor conditional that picks one set
   (presumably USE_AS_WCSLEN, which is tested further down in this
   file) is not visible in this excerpt -- confirm.  */

/* Set 1: doubleword (32-bit element) forms of the equality compare,
   inequality compare, test-not-mask, test-mask and unsigned-minimum
   instructions.  */
27 # define VPCMPEQ vpcmpeqd
28 # define VPCMPNEQ vpcmpneqd
29 # define VPTESTN vptestnmd
30 # define VPTEST vptestmd
31 # define VPMINU vpminud
/* Arithmetic right shift of REG by 2, i.e. divide by 4; presumably
   turns a byte distance into a count of 4-byte characters.  */
33 # define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
/* Set 2: byte (8-bit element) forms of the same instructions.  */
35 # define VPCMPEQ vpcmpeqb
36 # define VPCMPNEQ vpcmpneqb
37 # define VPTESTN vptestnmb
38 # define VPTEST vptestmb
39 # define VPMINU vpminub
/* Expands to nothing in this set: no shift is emitted.  */
41 # define CHAR_SIZE_SHIFT_REG(reg)
/* REG_WIDTH is set to VEC_SIZE.  NOTE(review): its consumer is not
   visible in this excerpt (presumably reg-macros.h) -- confirm.  */
43 # define REG_WIDTH VEC_SIZE
/* Number of characters held by one vector.  VEC_SIZE and CHAR_SIZE are
   defined outside this excerpt.  */
46 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
48 # include "reg-macros.h"
/* Choose which of first_vec_x2 / first_vec_x3 is the "tail" return
   stub and which is the "fallthrough" return stub, together with the
   character offset each one adds to the result.  The two assignments
   below are mirror images of each other.
   NOTE(review): the `# else' / `# endif' separating them are not
   visible in this excerpt -- confirm.  */
50 # if CHAR_PER_VEC == 64
/* 64 characters per vector: tail = first_vec_x2 (offset 2 vectors),
   fallthrough = first_vec_x3 (offset 3 vectors).  */
52 # define TAIL_RETURN_LBL first_vec_x2
53 # define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
55 # define FALLTHROUGH_RETURN_LBL first_vec_x3
56 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
/* Alternative assignment: tail = first_vec_x3 (offset 3 vectors),
   fallthrough = first_vec_x2 (offset 2 vectors).  */
60 # define TAIL_RETURN_LBL first_vec_x3
61 # define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
63 # define FALLTHROUGH_RETURN_LBL first_vec_x2
64 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
/* XZERO names VMM_128(0); the function entry clears it with vpxorq.
   VMM_128 is defined outside this excerpt.  */
67 # define XZERO VMM_128(0)
/* Page size used by the entry check that detects whether the first
   VEC_SIZE load would cross a page boundary.  */
69 # define PAGE_SIZE 4096
71 .section SECTION(.text), "ax", @progbits
72 ENTRY_P2ALIGN(STRLEN, 6)
74 vpxorq %XZERO, %XZERO, %XZERO
75 andl $(PAGE_SIZE - 1), %eax
76 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
77 ja L(cross_page_boundary)
79 /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null character. */
81 VPCMPEQ (%rdi), %VZERO, %k0
92 CHAR_SIZE_SHIFT_REG (edi)
93 leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
98 /* Aligned-more path. For strnlen this compares the remaining
99 length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and
100 8 * CHAR_PER_VEC before going to the loop. */
104 andq $(VEC_SIZE * -1), %rdi
105 L(cross_page_continue):
106 /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
107 rechecking bounds. */
108 VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
113 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
118 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
123 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
128 subq $(VEC_SIZE * -1), %rdi
130 # if CHAR_PER_VEC == 64
131 /* The processors we use evex512 on have no partial register
132 stalls, and this saves code size. */
135 andq $-(VEC_SIZE * 4), %rdi
140 /* Compare 4 * VEC at a time forward. */
143 VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
144 VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
145 VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
146 VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
147 VPTESTN %VMM(2), %VMM(2), %k0
148 VPTESTN %VMM(4), %VMM(4), %k2
150 subq $-(VEC_SIZE * 4), %rdi
154 VPTESTN %VMM(1), %VMM(1), %k1
163 VPTESTN %VMM(3), %VMM(3), %k0
165 # if CHAR_PER_VEC == 64
171 /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. */
174 salq $CHAR_PER_VEC, %rdx
178 /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. */
180 L(FALLTHROUGH_RETURN_LBL):
183 CHAR_SIZE_SHIFT_REG (rdi)
184 leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
191 CHAR_SIZE_SHIFT_REG (rdi)
199 CHAR_SIZE_SHIFT_REG (rdi)
200 leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
204 /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. */
208 CHAR_SIZE_SHIFT_REG (VRDI)
209 lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
213 L(cross_page_boundary):
215 /* Align data to VEC_SIZE. */
216 andq $-VEC_SIZE, %rdi
218 VPCMPEQ (%rdi), %VZERO, %k0
221 # ifdef USE_AS_WCSLEN
224 andl $(CHAR_PER_VEC - 1), %edx
225 shrx %edx, %eax, %eax
230 jz L(cross_page_continue)