1 /* memchr/wmemchr optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* MEMCHR expands to __memchr_evex.  */
24 # define MEMCHR __memchr_evex
/* Element-width selection.  With USE_AS_WMEMCHR defined, CHAR is
   broadcast with vpbroadcastd and the shift count lives in r8d.
   NOTE(review): the "# else" / "# endif" lines separating the two
   alternatives are not visible in this excerpt; presumably the
   vpbroadcastb / ecx pair below is the non-wmemchr variant -- confirm
   against the full file.  */
27 # ifdef USE_AS_WMEMCHR
28 # define VPBROADCAST vpbroadcastd
30 # define SHIFT_REG r8d
32 # define VPBROADCAST vpbroadcastb
34 # define SHIFT_REG ecx
/* Register names for the broadcast CHAR: YMMMATCH is the destination
   of VPBROADCAST and the second source of every VPCMP below.  Both
   names refer to register 16 (xmm16 / ymm16).  */
37 # define XMMMATCH xmm16
38 # define YMMMATCH ymm16
48 .section .text.evex,"ax",@progbits
50 # ifndef USE_AS_RAWMEMCHR
51 /* Check for zero length. */
56 # ifdef USE_AS_WMEMCHR
60 /* Clear the upper 32 bits. */
64 /* Broadcast CHAR to YMMMATCH. */
65 VPBROADCAST %esi, %YMMMATCH
66 /* Check if we may cross page boundary with one vector load. */
67 andl $(2 * VEC_SIZE - 1), %ecx
69 ja L(cros_page_boundary)
71 /* Check the first VEC_SIZE bytes. */
72 VPCMP $0, (%rdi), %YMMMATCH, %k1
76 # ifndef USE_AS_RAWMEMCHR
77 jnz L(first_vec_x0_check)
78 /* Adjust length and check the end of data. */
85 /* Align data for aligned loads in the loop. */
87 andl $(VEC_SIZE - 1), %ecx
90 # ifndef USE_AS_RAWMEMCHR
94 subq $(VEC_SIZE * 4), %rdx
95 jbe L(last_4x_vec_or_less)
100 L(cros_page_boundary):
101 andl $(VEC_SIZE - 1), %ecx
102 # ifdef USE_AS_WMEMCHR
103 /* NB: Divide shift count by 4 since each bit in K1 represents 4 bytes. */
105 movl %ecx, %SHIFT_REG
108 andq $-VEC_SIZE, %rdi
109 VPCMP $0, (%rdi), %YMMMATCH, %k1
111 /* Remove the leading bytes. */
112 sarxl %SHIFT_REG, %eax, %eax
116 # ifdef USE_AS_WMEMCHR
117 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
120 # ifndef USE_AS_RAWMEMCHR
121 /* Check the end of data. */
131 # ifndef USE_AS_RAWMEMCHR
132 /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
133 instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition overflow. */
138 /* Check the end of data. */
145 # ifndef USE_AS_RAWMEMCHR
146 subq $(VEC_SIZE * 4), %rdx
147 jbe L(last_4x_vec_or_less)
151 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
152 since data is only aligned to VEC_SIZE. */
153 VPCMP $0, (%rdi), %YMMMATCH, %k1
158 VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
163 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
168 VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
173 addq $(VEC_SIZE * 4), %rdi
175 # ifndef USE_AS_RAWMEMCHR
176 subq $(VEC_SIZE * 4), %rdx
177 jbe L(last_4x_vec_or_less)
180 /* Align data to 4 * VEC_SIZE. */
182 andl $(4 * VEC_SIZE - 1), %ecx
183 andq $-(4 * VEC_SIZE), %rdi
185 # ifndef USE_AS_RAWMEMCHR
192 /* Compare 4 * VEC at a time forward. */
193 VPCMP $0, (%rdi), %YMMMATCH, %k1
194 VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
196 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
197 VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
203 addq $(VEC_SIZE * 4), %rdi
205 # ifdef USE_AS_RAWMEMCHR
208 subq $(VEC_SIZE * 4), %rdx
211 L(last_4x_vec_or_less):
212 /* Less than 4 * VEC and aligned to VEC_SIZE. */
213 addl $(VEC_SIZE * 2), %edx
216 VPCMP $0, (%rdi), %YMMMATCH, %k1
221 VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
226 VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
230 jnz L(first_vec_x2_check)
234 VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
238 jnz L(first_vec_x3_check)
244 addl $(VEC_SIZE * 2), %edx
245 VPCMP $0, (%rdi), %YMMMATCH, %k1
249 jnz L(first_vec_x0_check)
253 VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
256 jnz L(first_vec_x1_check)
261 L(first_vec_x0_check):
263 # ifdef USE_AS_WMEMCHR
264 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
267 /* Check the end of data. */
274 L(first_vec_x1_check):
276 # ifdef USE_AS_WMEMCHR
277 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
280 /* Check the end of data. */
288 L(first_vec_x2_check):
290 # ifdef USE_AS_WMEMCHR
291 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
294 /* Check the end of data. */
297 addq $(VEC_SIZE * 2), %rax
302 L(first_vec_x3_check):
304 # ifdef USE_AS_WMEMCHR
305 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
308 /* Check the end of data. */
311 addq $(VEC_SIZE * 3), %rax
324 # ifdef USE_AS_WMEMCHR
325 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
326 leaq (%rdi, %rax, 4), %rax
335 # ifdef USE_AS_WMEMCHR
336 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
337 leaq VEC_SIZE(%rdi, %rax, 4), %rax
347 # ifdef USE_AS_WMEMCHR
348 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
349 leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
351 addq $(VEC_SIZE * 2), %rax
371 # ifdef USE_AS_WMEMCHR
372 /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
373 leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
375 addq $(VEC_SIZE * 3), %rax