/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPCMPNEQ	vpcmpneqd
#  define VPTESTN	vptestnmd
#  define VPTEST	vptestmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPCMPNEQ	vpcmpneqb
#  define VPTESTN	vptestnmb
#  define VPTEST	vptestmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
#  define CHAR_SIZE_SHIFT_REG(reg)

#  define REG_WIDTH	VEC_SIZE
# endif

# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# include "reg-macros.h"

# if CHAR_PER_VEC == 64

#  define TAIL_RETURN_LBL	first_vec_x2
#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)

#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)

# else

#  define TAIL_RETURN_LBL	first_vec_x3
#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)

#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
# endif

# define XZERO	VMM_128(0)
# define VZERO	VMM(0)
# define PAGE_SIZE	4096

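/* Note: VMM(n), VMM_128(n), and the KMOV/KORTEST/VR* names used below
   are width-agnostic macros from reg-macros.h and the vecs header
   included above.  With x86-evex256-vecs.h they expand to 256-bit YMM
   operations; a wrapper that defines VEC_SIZE via an evex512 vecs
   header first presumably gets the 512-bit ZMM flavor of the same
   code.  */
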
	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (STRLEN, 6)
	movl	%edi, %eax
	vpxorq	%XZERO, %XZERO, %XZERO
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
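	/* The check above asks whether the unaligned VEC_SIZE load
	   below could touch the next page and fault.  In C terms,
	   roughly:
	       if (((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE)
	           goto cross_page_boundary;  */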

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents
	   a null CHAR.  */
	VPCMPEQ	(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jz	L(aligned_more)
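	/* A set bit in VRAX marks a null CHAR; bsf finds the lowest
	   set bit, i.e. the index of the first null, which within the
	   first vector is exactly the string length.  */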
	bsf	%VRAX, %VRAX
	ret

	.p2align 4,, 8
L(first_vec_x4):
	bsf	%VRAX, %VRAX
	subl	%ecx, %edi
	CHAR_SIZE_SHIFT_REG (edi)
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
	ret
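	/* Every L(first_vec_xN) return follows the same recipe: %rcx
	   still holds the pre-alignment pointer, so (%rdi - %rcx),
	   scaled to CHARs by CHAR_SIZE_SHIFT_REG, plus N * CHAR_PER_VEC,
	   plus the bit index from bsf yields the length in CHARs.  */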


	/* For strnlen, L(aligned_more) also compares the remaining
	   length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and
	   8 * CHAR_PER_VEC before going to the loop.  */
	.p2align 4,, 10
L(aligned_more):
	movq	%rdi, %rcx
	andq	$(VEC_SIZE * -1), %rdi
L(cross_page_continue):
	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
	   rechecking bounds.  */
	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x4)

	subq	$(VEC_SIZE * -1), %rdi

# if CHAR_PER_VEC == 64
	/* There are no partial-register stalls on the processors we
	   use evex512 on, and this saves code size.  */
	xorb	%dil, %dil
# else
	andq	$-(VEC_SIZE * 4), %rdi
# endif
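	/* With VEC_SIZE == 64, %rdi is already VEC_SIZE-aligned here,
	   so zeroing its low byte rounds it down to a VEC_SIZE * 4
	   boundary with a shorter encoding than the andq used for the
	   YMM build.  */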


	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
	VPTESTN	%VMM(2), %VMM(2), %k0
	VPTESTN	%VMM(4), %VMM(4), %k2

	subq	$-(VEC_SIZE * 4), %rdi
	KORTEST	%k0, %k2
	jz	L(loop_4x_vec)
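	/* VPMINU folds each pair of vectors into one: null is the
	   minimum unsigned value, so min(a, b) contains a zero CHAR
	   iff a or b does, and one VPTESTN covers two vectors.
	   KORTEST sets ZF only if both resulting masks are zero, i.e.
	   no null appeared in any of the four vectors.  */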

	VPTESTN	%VMM(1), %VMM(1), %k1
	KMOV	%k1, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x0)

	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x1)

	VPTESTN	%VMM(3), %VMM(3), %k0

# if CHAR_PER_VEC == 64
	KMOV	%k0, %VRAX
	test	%VRAX, %VRAX
	jnz	L(first_vec_x2)
	KMOV	%k2, %VRAX
# else
	/* We can only combine the last 2x VEC masks if CHAR_PER_VEC <=
	   32.  */
	kmovd	%k2, %edx
	kmovd	%k0, %eax
	salq	$CHAR_PER_VEC, %rdx
	orq	%rdx, %rax
# endif
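	/* With CHAR_PER_VEC <= 32 the VEC(3) and VEC(4) masks both fit
	   in one 64-bit register: shifting the VEC(4) mask up by
	   CHAR_PER_VEC and OR-ing in the VEC(3) mask lets the single
	   bsfq below find the first null across both vectors.  */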

	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
	 */
	.p2align 4,, 2
L(FALLTHROUGH_RETURN_LBL):
	bsfq	%rax, %rax
	subq	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
	ret

	.p2align 4,, 8
L(first_vec_x0):
	bsf	%VRAX, %VRAX
	sub	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	addq	%rdi, %rax
	ret

	.p2align 4,, 10
L(first_vec_x1):
	bsf	%VRAX, %VRAX
	sub	%rcx, %rdi
	CHAR_SIZE_SHIFT_REG (rdi)
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4,, 10
	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
	 */
L(TAIL_RETURN_LBL):
	bsf	%VRAX, %VRAX
	sub	%VRCX, %VRDI
	CHAR_SIZE_SHIFT_REG (VRDI)
	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
	ret

	.p2align 4,, 8
L(cross_page_boundary):
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi

	VPCMPEQ	(%rdi), %VZERO, %k0

	KMOV	%k0, %VRAX
# ifdef USE_AS_WCSLEN
	movl	%ecx, %edx
	shrl	$2, %edx
	andl	$(CHAR_PER_VEC - 1), %edx
	shrx	%edx, %eax, %eax
	testl	%eax, %eax
# else
	shr	%cl, %VRAX
# endif
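	/* The shift discards mask bits for CHARs that precede the
	   start of the string.  For wcslen the count must be in wide
	   chars ((byte misalignment >> 2) masked to the vector), hence
	   the shrx; for strlen the byte misalignment in %cl works
	   directly.  */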
	jz	L(cross_page_continue)
	bsf	%VRAX, %VRAX
	ret

END (STRLEN)
#endif