1 /* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* Macro configuration for the EVEX (256-bit) strlen/strnlen/wcslen/
   wcsnlen implementation.  NOTE(review): the #ifdef/#else scaffolding
   is elided in this chunk (original line numbers skip), so the paired
   alternative definitions below are presumably selected by
   USE_AS_WCSLEN conditionals not visible here -- confirm against the
   full file.  */
24 # define STRLEN __strlen_evex
27 # define VMOVA vmovdqa64
/* wchar_t variant: null-compare granularity is 4-byte dwords and the
   page-cross shift count must be scaled, hence kept in ecx.  */
31 # define VPMINU vpminud
32 # define SHIFT_REG ecx
/* byte (char) variant: byte-granular min; raw byte offset in edx.  */
36 # define VPMINU vpminub
37 # define SHIFT_REG edx
/* All-zero vector used as the comparand for terminator detection.
   xmm16/ymm16 are EVEX-only registers (outside xmm0-15), which avoids
   any need for vzeroupper on return.  */
41 # define XMMZERO xmm16
42 # define YMMZERO ymm16
51 # define PAGE_SIZE 4096
/* Characters (not bytes) covered by one vector load.  */
52 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
54 .section .text.evex,"ax",@progbits
/* Entry: rdi = string pointer; for the strnlen variants rsi = maximum
   length in characters (it is compared against CHAR_PER_VEC below).
   NOTE(review): the ENTRY macro and several instructions are elided
   from this chunk.  */
56 # ifdef USE_AS_STRNLEN
57 /* Check zero length. */
61 /* Clear the upper 32 bits. */
/* Materialize the all-zero comparand once for the whole routine.  */
67 vpxorq %XMMZERO, %XMMZERO, %XMMZERO
68 /* Clear high bits from edi. Only keeping bits relevant to page
70 andl $(PAGE_SIZE - 1), %eax
71 /* Check if we may cross page boundary with one vector load. */
72 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
73 ja L(cross_page_boundary)
75 /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
77 VPCMP $0, (%rdi), %YMMZERO, %k0
79 # ifdef USE_AS_STRNLEN
80 /* If length < CHAR_PER_VEC handle special. */
81 cmpq $CHAR_PER_VEC, %rsi
88 # ifdef USE_AS_STRNLEN
95 /* Set bit for max len so that tzcnt will return min of max len
96 and position of first match. */
/* The four return stubs below handle a terminator found in VEC 1..4 of
   the aligned 4x check.  Each computes: position of first set mask bit
   (the null) plus the number of whole vectors already passed.  For
   strnlen, rcx (precomputed bound) is folded in instead so the result
   is clamped by the constant offsets below.  */
105 /* Safe to use 32 bit instructions as these are only called for
107 # ifdef USE_AS_STRNLEN
108 /* Use ecx which was computed earlier to compute correct value.
110 leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
113 # ifdef USE_AS_WCSLEN
114 /* NB: Divide bytes by 4 to get the wchar_t count. */
117 leal CHAR_PER_VEC(%rdi, %rax), %eax
124 /* Safe to use 32 bit instructions as these are only called for
126 # ifdef USE_AS_STRNLEN
127 /* Use ecx which was computed earlier to compute correct value.
129 leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
132 # ifdef USE_AS_WCSLEN
133 /* NB: Divide bytes by 4 to get the wchar_t count. */
136 leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
143 /* Safe to use 32 bit instructions as these are only called for
145 # ifdef USE_AS_STRNLEN
146 /* Use ecx which was computed earlier to compute correct value.
148 leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
151 # ifdef USE_AS_WCSLEN
152 /* NB: Divide bytes by 4 to get the wchar_t count. */
155 leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
162 /* Safe to use 32 bit instructions as these are only called for
164 # ifdef USE_AS_STRNLEN
165 /* Use ecx which was computed earlier to compute correct value.
167 leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
170 # ifdef USE_AS_WCSLEN
171 /* NB: Divide bytes by 4 to get the wchar_t count. */
174 leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
/* Main path: round rdi down to VEC_SIZE, probe four vectors one at a
   time, then enter the 4x-VEC-per-iteration loop aligned to
   VEC_SIZE * 4.  NOTE(review): several instructions between the
   visible lines are elided in this chunk.  */
181 /* Align data to VEC_SIZE. */
182 andq $-(VEC_SIZE), %rdi
183 L(cross_page_continue):
184 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
185 since data is only aligned to VEC_SIZE. */
186 # ifdef USE_AS_STRNLEN
187 /* + CHAR_SIZE because it simplies the logic in
188 last_4x_vec_or_less. */
/* rcx = precomputed end-adjustment used by the tail code to recover
   the clamped length after rdi has been advanced.  */
189 leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
191 # ifdef USE_AS_WCSLEN
192 /* NB: Divide bytes by 4 to get the wchar_t count. */
196 /* Load first VEC regardless. */
197 VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
198 # ifdef USE_AS_STRNLEN
199 /* Adjust length. If near end handle specially. */
201 jb L(last_4x_vec_or_less)
/* Probe vectors 2..4; a nonzero compare mask (checked by elided
   instructions) branches to the first_vec_x* return stubs.  */
207 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
212 VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
217 VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
223 # ifdef USE_AS_STRNLEN
224 /* Check if at last VEC_SIZE * 4 length. */
225 cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
226 jbe L(last_4x_vec_or_less_load)
/* ecx = chars skipped by the VEC_SIZE*4 realignment below, used to
   re-adjust the remaining length.  */
228 andl $(VEC_SIZE * 4 - 1), %ecx
229 # ifdef USE_AS_WCSLEN
230 /* NB: Divide bytes by 4 to get the wchar_t count. */
233 /* Readjust length. */
236 /* Align data to VEC_SIZE * 4. */
237 andq $-(VEC_SIZE * 4), %rdi
239 /* Compare 4 * VEC at a time forward. */
242 /* Load first VEC regardless. */
243 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
244 # ifdef USE_AS_STRNLEN
245 /* Break if at end of length. */
/* Borrow (CF) here means fewer than 4 vectors of budget remain.  */
246 subq $(CHAR_PER_VEC * 4), %rsi
247 jb L(last_4x_vec_or_less_cmpeq)
249 /* Save some code size by microfusing VPMINU with the load. Since
250 the matches in ymm2/ymm4 can only be returned if there where no
251 matches in ymm1/ymm3 respectively there is no issue with overlap.
/* YMM2 = min(VEC5, YMM1), YMM4 = min(VEC7, YMM3): a zero lane in
   either source survives the min, so one compare covers two vectors. */
253 VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
254 VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
255 VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
257 VPCMP $0, %YMM2, %YMMZERO, %k0
258 VPCMP $0, %YMM4, %YMMZERO, %k1
/* subq of a negative constant == addq $(VEC_SIZE * 4); the sub form
   presumably gives a shorter encoding -- confirm.  */
259 subq $-(VEC_SIZE * 4), %rdi
263 /* Check if end was in first half. */
266 # ifdef USE_AS_WCSLEN
270 jz L(second_vec_return)
/* Terminator was in VEC1/VEC2: re-test YMM1 alone to tell which.  */
272 VPCMP $0, %YMM1, %YMMZERO, %k2
274 /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
275 # ifdef USE_AS_WCSLEN
276 sall $CHAR_PER_VEC, %eax
280 salq $CHAR_PER_VEC, %rax
/* strnlen tail: fewer than 4 vectors of length budget remain.  The two
   entry labels differ only in whether YMM1 still needs to be loaded /
   compared; both converge with rdi advanced by VEC_SIZE * 3 so the
   same offsets work for every caller.  NOTE(review): the branch and
   kmov instructions between the visible lines are elided.  */
288 # ifdef USE_AS_STRNLEN
290 L(last_4x_vec_or_less_load):
291 /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
292 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
293 L(last_4x_vec_or_less_cmpeq):
294 VPCMP $0, %YMM1, %YMMZERO, %k0
295 addq $(VEC_SIZE * 3), %rdi
296 L(last_4x_vec_or_less):
298 /* If remaining length > VEC_SIZE * 2. This works if esi is off by
300 testl $(CHAR_PER_VEC * 2), %esi
303 /* length may have been negative or positive by an offset of
304 CHAR_PER_VEC * 4 depending on where this was called from. This
306 andl $(CHAR_PER_VEC * 4 - 1), %esi
308 jnz L(last_vec_x1_check)
310 /* Check the end of data. */
311 subl $CHAR_PER_VEC, %esi
314 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
317 /* Check the end of data. */
322 # ifdef USE_AS_WCSLEN
323 /* NB: Divide bytes by 4 to get the wchar_t count. */
/* Length = (vec base - start) + 2 vectors + offset of null in vec.  */
326 leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
333 /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
334 in the 4x VEC loop can use 2 byte encoding. */
336 L(second_vec_return):
/* Null was in the second half (VEC3/VEC4) of the 4x iteration.  */
337 VPCMP $0, %YMM3, %YMMZERO, %k0
338 /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
339 # ifdef USE_AS_WCSLEN
/* Concatenate the two narrow masks into one so a single tzcnt (in
   elided code) finds the first null across both vectors.  */
340 kunpckbw %k0, %k1, %k0
344 kunpckdq %k0, %k1, %k0
348 leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
352 # ifdef USE_AS_STRNLEN
353 L(last_vec_x1_check):
355 /* Check the end of data. */
359 # ifdef USE_AS_WCSLEN
360 /* NB: Divide bytes by 4 to get the wchar_t count. */
363 leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
368 /* Test first 2x VEC normally. */
372 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
377 /* Normalize length. */
378 andl $(CHAR_PER_VEC * 4 - 1), %esi
379 VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
384 /* Check the end of data. */
385 subl $(CHAR_PER_VEC * 3), %esi
388 VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
391 /* Check the end of data. */
396 # ifdef USE_AS_WCSLEN
397 /* NB: Divide bytes by 4 to get the wchar_t count. */
400 leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
/* last_vec_x1/x2/x3 return stubs: offset of null within the matched
   vector plus the number of whole vectors before it.  */
407 # ifdef USE_AS_WCSLEN
408 /* NB: Divide bytes by 4 to get the wchar_t count. */
411 leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
418 # ifdef USE_AS_WCSLEN
419 /* NB: Divide bytes by 4 to get the wchar_t count. */
422 leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
428 subl $(CHAR_PER_VEC * 2), %esi
429 /* Check the end of data. */
433 # ifdef USE_AS_WCSLEN
434 /* NB: Divide bytes by 4 to get the wchar_t count. */
437 leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
/* Cold path: the first (unaligned) vector load would straddle a page
   boundary, so a full-width load might fault past the terminator.
   Strategy: round rdi DOWN to VEC_SIZE (always safe -- stays within
   the page), compare the whole aligned vector, then shift away the
   mask bits that precede the real string start.  */
444 /* Cold case for crossing page with first load. */
446 L(cross_page_boundary):
448 /* Align data to VEC_SIZE. */
449 andq $-VEC_SIZE, %rdi
450 VPCMP $0, (%rdi), %YMMZERO, %k0
452 /* Remove the leading bytes. */
453 # ifdef USE_AS_WCSLEN
454 /* NB: Divide shift count by 4 since each bit in K0 represent 4
458 andl $(CHAR_PER_VEC - 1), %ecx
460 /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
/* BMI2 sarx: shift without clobbering flags, count from SHIFT_REG.
   After this, bit 0 of eax corresponds to the first real character.  */
461 sarxl %SHIFT_REG, %eax, %eax
463 # ifndef USE_AS_STRNLEN
/* strlen: no null in this partial vector -> continue on the aligned
   fast path.  */
464 jz L(cross_page_continue)
468 jnz L(cross_page_less_vec)
469 # ifndef USE_AS_WCSLEN
471 andl $(CHAR_PER_VEC - 1), %ecx
473 movl $CHAR_PER_VEC, %eax
475 /* Check the end of data. */
477 ja L(cross_page_continue)
480 L(cross_page_less_vec):
482 /* Select min of length and position of first null. */