/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR __memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST vpbroadcastd
#  define VPCMP vpcmpd
#  define SHIFT_REG r8d
# else
#  define VPBROADCAST vpbroadcastb
#  define VPCMP vpcmpb
#  define SHIFT_REG ecx
# endif

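/* Only ymm16-ymm22 are used.  These registers can only be encoded
   with EVEX, and since the legacy xmm0-ymm15 registers are never
   touched, no VZEROUPPER is needed before returning.  */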
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
# define YMM2 ymm18
# define YMM3 ymm19
# define YMM4 ymm20
# define YMM5 ymm21
# define YMM6 ymm22

# define VEC_SIZE 32

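/* MEMCHR (const void *s [%rdi], int c [%esi], size_t n [%rdx]).
   For wmemchr the length is counted in wchar_t; for rawmemchr the
   length argument is ignored.  Returns a pointer to the first CHAR
   equal to c, or NULL if there is no match within the length.  */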
	.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)
# endif
	movl	%edi, %ecx
# ifdef USE_AS_WMEMCHR
	shl	$2, %RDX_LP
# else
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
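	/* The page size is a multiple of 2 * VEC_SIZE, so a VEC_SIZE load
	   can cross a page only when the offset within the current
	   2 * VEC_SIZE block is greater than VEC_SIZE.  */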
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rdx
	jbe	L(zero)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
	addq	%rcx, %rdx

	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WMEMCHR
	/* NB: Divide shift count by 4 since each bit in K1 represents 4
	   bytes.  */
	movl	%ecx, %SHIFT_REG
	sarl	$2, %SHIFT_REG
# endif
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	/* Remove the leading bytes.  */
	sarxl	%SHIFT_REG, %eax, %eax
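	/* After the shift, bit 0 of the mask corresponds to the CHAR at
	   the original (unaligned) pointer.  */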
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	ret

	.p2align 4
L(aligned_more):
# ifndef USE_AS_RAWMEMCHR
	/* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
	   overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rdx
	jbe	L(zero)
# endif

	addq	$VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  */
	addq	%rcx, %rdx
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
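	/* The four match masks are OR-ed pairwise into %k5 and %k6;
	   kortestd sets ZF only when all four vectors are free of
	   matches, so one branch covers the whole iteration.  */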
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
	kord	%k1, %k2, %k5
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

	kord	%k3, %k4, %k6
	kortestd %k5, %k6
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_RAWMEMCHR
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
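	/* Here %rdx holds the remaining length minus 4 * VEC_SIZE; adding
	   2 * VEC_SIZE back leaves it non-positive exactly when at most
	   2 * VEC_SIZE bytes remain.  */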
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)

	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x2_check)
	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x3_check)
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_2x_vec):
	addl	$(VEC_SIZE * 2), %edx
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x0_check)
	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0_check):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %eax
# endif
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	%rdi, %rax
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %eax
# endif
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %eax
# endif
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	sall	$2, %eax
# endif
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x0):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	(%rdi, %rax, 4), %rax
# else
	addq	%rdi, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
# else
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
# else
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
# endif
	ret

	.p2align 4
L(4x_vec_end):
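	/* At least one of the four vectors compared in the loop contains
	   a match.  %k1-%k4 still hold the per-vector masks; test them in
	   order and fall through into L(first_vec_x3) for the last one.  */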
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	kmovd	%k4, %eax
	testl	%eax, %eax
L(first_vec_x3):
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
# else
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
# endif
	ret

END (MEMCHR)
#endif