/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN __strlen_evex
# endif

# define VMOVA vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP vpcmpd
#  define VPMINU vpminud
#  define SHIFT_REG ecx
#  define CHAR_SIZE 4
# else
#  define VPCMP vpcmpb
#  define VPMINU vpminub
#  define SHIFT_REG edx
#  define CHAR_SIZE 1
# endif

# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMM1 ymm17
# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
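/* Each bit of the EVEX mask produced by VPCMP corresponds to one
   character, so a kmovd of the mask has CHAR_PER_VEC significant bits:
   32 for strlen/strnlen, 8 for wcslen/wcsnlen. */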

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length. */
	test %RSI_LP, %RSI_LP
	jz L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits. */
	movl %esi, %esi
#  endif
	mov %RSI_LP, %R8_LP
# endif
	movl %edi, %eax
	vpxorq %XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check. */
	andl $(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load. */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
	   null byte. */
	VPCMP $0, (%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length < CHAR_PER_VEC, handle specially. */
	cmpq $CHAR_PER_VEC, %rsi
	jbe L(first_vec_x0)
# endif
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl %eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match. */
	btsq %rsi, %rax
	tzcntl %eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value. */
	leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl %edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %edi
#  endif
	leal CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value. */
	leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl %edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %edi
#  endif
	leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value. */
	leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl %edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %edi
#  endif
	leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   value. */
	leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl %edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %edi
#  endif
	leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	movq %rdi, %rdx
	/* Align data to VEC_SIZE. */
	andq $-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less. */
	leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq %rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %ecx
#  endif
# endif
	/* Load first VEC regardless. */
	VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially. */
	subq %rcx, %rsi
	jb L(last_4x_vec_or_less)
# endif
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	test %eax, %eax
	jnz L(first_vec_x2)

	VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(first_vec_x4)

	addq $VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if at last VEC_SIZE * 4 length. */
	cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
	jbe L(last_4x_vec_or_less_load)
	movl %edi, %ecx
	andl $(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %ecx
#  endif
	/* Readjust length. */
	addq %rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4. */
	andq $-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward. */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless. */
	VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length. */
	subq $(CHAR_PER_VEC * 4), %rsi
	jb L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load. Since
	   the matches in ymm2/ymm4 can only be returned if there were no
	   matches in ymm1/ymm3 respectively, there is no issue with
	   overlap. */
	VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

	VPCMP $0, %YMM2, %YMMZERO, %k0
	VPCMP $0, %YMM4, %YMMZERO, %k1
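	/* subq with a negative immediate rather than addq: -(VEC_SIZE * 4)
	   = -128 fits in a sign-extended imm8, whereas +128 would need an
	   imm32, so this encoding is 3 bytes shorter. */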
	subq $-(VEC_SIZE * 4), %rdi
	kortestd %k0, %k1
	jz L(loop_4x_vec)

	/* Check if end was in first half. */
	kmovd %k0, %eax
	subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq $2, %rdi
# endif
	testl %eax, %eax
	jz L(second_vec_return)

	VPCMP $0, %YMM1, %YMMZERO, %k2
	kmovd %k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax). */
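	/* For wcslen each mask has only CHAR_PER_VEC (8) significant bits,
	   so the combined value fits in 32 bits; for strlen each mask is 32
	   bits, so the combination needs 64-bit shift/or/tzcnt. */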
# ifdef USE_AS_WCSLEN
	sall $CHAR_PER_VEC, %eax
	orl %edx, %eax
	tzcntl %eax, %eax
# else
	salq $CHAR_PER_VEC, %rax
	orq %rdx, %rax
	tzcntq %rax, %rax
# endif
	addq %rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1. */
	VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP $0, %YMM1, %YMMZERO, %k0
	addq $(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd %k0, %eax
	/* If the remaining length is > CHAR_PER_VEC * 2, handle it in
	   L(last_4x_vec). This works even if esi is off by
	   CHAR_PER_VEC * 4. */
	testl $(CHAR_PER_VEC * 2), %esi
	jnz L(last_4x_vec)

	/* Depending on where this was called from, the length may be off
	   by CHAR_PER_VEC * 4 in either direction. This fixes that. */
	andl $(CHAR_PER_VEC * 4 - 1), %esi
	testl %eax, %eax
	jnz L(last_vec_x1_check)

	/* Check the end of data. */
	subl $CHAR_PER_VEC, %esi
	jb L(max)

	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max)

	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq %r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding. */
	.p2align 4
L(second_vec_return):
	VPCMP $0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1). */
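	/* kunpck concatenates the two masks: the YMM3 matches end up in the
	   low bits and the YMM4 matches in the high bits, so a single tzcnt
	   finds the first null across both vectors. */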
# ifdef USE_AS_WCSLEN
	kunpckbw %k0, %k1, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
# else
	kunpckdq %k0, %k1, %k0
	kmovq %k0, %rax
	tzcntq %rax, %rax
# endif
	leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max)
	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally. */
	testl %eax, %eax
	jnz L(last_vec_x1)

	VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(last_vec_x2)

	/* Normalize length. */
	andl $(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	testl %eax, %eax
	jnz L(last_vec_x3)

	/* Check the end of data. */
	subl $(CHAR_PER_VEC * 3), %esi
	jb L(max)

	VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max_end)

	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl %eax, %eax
	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl %eax, %eax
	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl %eax, %eax
	subl $(CHAR_PER_VEC * 2), %esi
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max_end)
	subq %rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarq $2, %rdi
#  endif
	leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq %r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load. */
	.p2align 4
L(cross_page_boundary):
	movq %rdi, %rdx
	/* Align data to VEC_SIZE. */
	andq $-VEC_SIZE, %rdi
	VPCMP $0, (%rdi), %YMMZERO, %k0
	kmovd %k0, %eax
	/* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes. */
	movl %edx, %ecx
	shrl $2, %ecx
	andl $(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
	sarxl %SHIFT_REG, %eax, %eax
	testl %eax, %eax
# ifndef USE_AS_STRNLEN
	jz L(cross_page_continue)
	tzcntl %eax, %eax
	ret
# else
	jnz L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl %edx, %ecx
	andl $(CHAR_PER_VEC - 1), %ecx
#  endif
	movl $CHAR_PER_VEC, %eax
	subl %ecx, %eax
	/* Check the end of data. */
	cmpq %rax, %rsi
	ja L(cross_page_continue)
	movl %esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl %eax, %eax
	/* Select min of length and position of first null. */
	cmpq %rax, %rsi
	cmovb %esi, %eax
	ret
# endif

END (STRLEN)
#endif