/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UNUSED.  Exists purely as reference implementation.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

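/* When built as wcslen/wcsnlen the element type is a 4-byte wchar_t,
   so the dword forms of the EVEX compare/test/min instructions are
   used; otherwise the byte forms operate on single characters.  */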
# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# define XMM0		xmm16
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# if VEC_SIZE == 64
#  define KMOV		kmovq
#  define KORTEST	kortestq
#  define RAX		rax
#  define RCX		rcx
#  define RDX		rdx
#  define SHR		shrq
#  define TEXTSUFFIX	evex512
#  define VMM0		zmm16
#  define VMM1		zmm17
#  define VMM2		zmm18
#  define VMM3		zmm19
#  define VMM4		zmm20
#  define VMOVA		vmovdqa64
# elif VEC_SIZE == 32
/* Currently Unused.  */
#  define KMOV		kmovd
#  define KORTEST	kortestd
#  define RAX		eax
#  define RCX		ecx
#  define RDX		edx
#  define SHR		shrl
#  define TEXTSUFFIX	evex256
#  define VMM0		ymm16
#  define VMM1		ymm17
#  define VMM2		ymm18
#  define VMM3		ymm19
#  define VMM4		ymm20
#  define VMOVA		vmovdqa32
# endif

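/* TEXTSUFFIX expands to evex512 or evex256, giving each vector-length
   variant its own text section.  */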
	.section .text.TEXTSUFFIX, "ax", @progbits
/* Aligning the entry point to 64 bytes provides better performance
   for strings of one vector length.  */
ENTRY_P2ALIGN (STRLEN, 6)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(ret_max)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
# endif

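	/* Zero out VMM0 (via its xmm alias) for the null comparisons and
	   check whether an unaligned VEC_SIZE load from rdi would cross a
	   page boundary.  */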
	movl	%edi, %eax
	vpxorq	%XMM0, %XMM0, %XMM0
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

	/* Compare each [w]char against null; the corresponding mask bit
	   is set for a match.  */
	VPCMP	$0, (%rdi), %VMM0, %k0
	KMOV	%k0, %RAX
	test	%RAX, %RAX
	jz	L(align_more)

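	/* rax holds the null-match mask; bsf gives the index of the first
	   null [w]char, which is the length.  */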
	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
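	/* Clamp the result to the maxlen argument.  */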
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

	/* At this point the maximum length has been reached without
	   finding a null [w]char.  */
# ifdef USE_AS_STRNLEN
	.p2align 4,,3
L(ret_max):
	movq	%rsi, %rax
	ret
# endif

L(align_more):
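	/* Advance rax to the next VEC_SIZE-aligned address above rdi.
	   All [w]chars before that address have already been checked on
	   either entry path.  */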
	leaq	VEC_SIZE(%rdi), %rax
	/* Align rax to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax
# ifdef USE_AS_STRNLEN
	movq	%rax, %rdx
	subq	%rdi, %rdx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RDX
#  endif
	/* At this point rdx contains the number of [w]chars already
	   compared.  */
	subq	%rsi, %rdx
	jae	L(ret_max)
	negq	%rdx
	/* At this point rdx contains the number of [w]chars still to be
	   checked.  From here on rdx is decremented with each compare.  */
# endif

	/* The next four vectors are checked individually (a 4x unrolled
	   sequence) before entering the 4 x VEC_SIZE aligned loop.  */
	VPCMP	$0, (%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

# ifdef USE_AS_STRNLEN
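	/* Account for the [w]chars covered by the vector just checked;
	   if the maxlen limit is exhausted, return maxlen.  */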
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x4)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
	/* Save pointer before 4 x VEC_SIZE alignment.  */
	movq	%rax, %rcx
# endif

	/* Align address to VEC_SIZE * 4 for loop.  */
	andq	$-(VEC_SIZE * 4), %rax

# ifdef USE_AS_STRNLEN
	subq	%rax, %rcx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RCX
#  endif
	/* rcx contains the number of [w]chars that will be re-compared
	   because of the alignment adjustment; rdx is incremented by rcx
	   to offset it.  */
	addq	%rcx, %rdx
	/* Jump past the rdx update since it must not be applied on the
	   first iteration of the 4 x VEC_SIZE aligned loop.  */
	jmp	L(loop_entry)
# endif

	.p2align 4,,11
L(loop):
# ifdef USE_AS_STRNLEN
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_max)
L(loop_entry):
# endif
	/* The VPMINU and VPTESTN combination provides better performance
	   than alternative instruction combinations.  */
	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
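	/* VMM2 is the element-wise unsigned minimum of the first pair of
	   vectors and VMM4 of the second pair, so a zero element in the
	   minimum means either vector of that pair contains a null.  */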

	VPTESTN	%VMM2, %VMM2, %k0
	VPTESTN	%VMM4, %VMM4, %k1

	subq	$-(VEC_SIZE * 4), %rax
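	/* rax is advanced before the branch, so on loop exit it points at
	   the first of the four vectors just examined.  */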
	KORTEST	%k0, %k1
	jz	L(loop)

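	/* At least one of the four vectors contains a null [w]char;
	   narrow down which one, checking them in order.  */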
	VPTESTN	%VMM1, %VMM1, %k2
	KMOV	%k2, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

	KMOV	%k0, %RCX
	/* At this point, if k0 is non-zero, the null [w]char must be in
	   the second vector.  */
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

	VPTESTN	%VMM3, %VMM3, %k3
	KMOV	%k3, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)
	/* At this point the null [w]char must be in the fourth vector, so
	   there is no need to check it.  */
	KMOV	%k1, %RCX

	/* The fourth, third and second vector exits are essentially the
	   same; they are laid out this way to avoid branching and to
	   reuse code from the pre-loop exit conditions.  */
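	/* In each exit path rcx holds the null mask of the terminating
	   vector; the returned length is that vector's offset from rdi,
	   converted to [w]chars, plus the bit index (bsf) of the first
	   set bit in rcx.  */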
L(ret_vec_x4):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 3), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x3):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 2), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

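	/* For the second vector, first advance rax by one VEC_SIZE so the
	   shared L(ret_vec_x1) code computes the correct offset.  */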
L(ret_vec_x2):
	subq	$-VEC_SIZE, %rax
L(ret_vec_x1):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	addq	%rcx, %rax
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(page_cross):
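	/* A full VEC_SIZE load from rdi would cross into the next page.
	   Load the last VEC_SIZE bytes of the current page instead and
	   shift out the [w]chars that precede the string start.  */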
	movl	%eax, %ecx
# ifdef USE_AS_WCSLEN
	andl	$(VEC_SIZE - 1), %ecx
	sarl	$2, %ecx
# endif
	/* ecx contains the number of [w]chars to be skipped because of
	   the address alignment.  */
	xorq	%rdi, %rax
	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
	KMOV	%k0, %RAX
	/* Shift out the mask bits for the [w]chars before the string
	   start (the alignment adjustment).  */
	SHR	%cl, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

END (STRLEN)
#endif