/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

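/* Overview: the string is scanned one VEC_SIZE-byte YMM vector at a time.
   VPCMPEQ against an all-zero register marks terminator bytes, vpmovmskb
   turns the vector of compare results into a bitmask in a general-purpose
   register, and tzcnt on that mask yields the offset of the first
   terminator.  After the head is handled, the main loop consumes four
   vectors (128 bytes) per iteration, folding them with VPMINU so one
   compare covers the whole block.  Every return path issues VZEROUPPER to
   avoid AVX-to-SSE transition penalties in the caller.  The core idea in C,
   as a rough sketch only (real AVX2 intrinsics, but no page-boundary or
   maxlen handling, so unlike the code below it may read past the
   terminator):

	#include <stddef.h>
	#include <immintrin.h>

	static size_t
	strlen_avx2_sketch (const char *s)
	{
	  const __m256i zero = _mm256_setzero_si256 ();
	  size_t len = 0;
	  for (;;)
	    {
	      __m256i v = _mm256_loadu_si256 ((const __m256i *) (s + len));
	      unsigned int mask
		= _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
	      if (mask != 0)
		return len + __builtin_ctz (mask);
	      len += 32;
	    }
	}
   */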
	.section .text.avx,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length.  */
	testq	%rsi, %rsi
	jz	L(zero)
#  ifdef USE_AS_WCSLEN
	shl	$2, %rsi
#  endif
	movq	%rsi, %r8
# endif
	movl	%edi, %ecx
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0

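	/* Register usage from here on: %rdi walks the string, %rdx keeps the
	   original start so each exit computes the length as end minus
	   start, %ymm0 stays all-zero for the terminator compares, and, for
	   strnlen/wcsnlen, %rsi is the remaining limit in bytes (the shl
	   above converts a wide-character count) with the full byte limit
	   saved in %r8.  */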
	/* Check if we may cross page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)
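	/* Rationale for the check above: the page size is a multiple of
	   2 * VEC_SIZE, so when the start address modulo 2 * VEC_SIZE is at
	   most VEC_SIZE, the unaligned VEC_SIZE-byte load below ends inside
	   the same 2 * VEC_SIZE block and cannot fault on the following
	   page.  Larger offsets take the cross-page path instead.  */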

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
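	/* %eax now has bit i set exactly where byte i of the vector compared
	   equal to zero; tzcnt of this mask is therefore the byte offset of
	   the first terminator in the vector (a zero wide character sets
	   four consecutive bits when built as wcslen/wcsnlen).  */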
	testl	%eax, %eax

# ifdef USE_AS_STRNLEN
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rsi
	jbe	L(max)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
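	/* The first VEC_SIZE bytes have been checked, so step %rdi past them
	   and round it down to a VEC_SIZE boundary for the aligned loads
	   below; %ecx keeps the original misalignment so the strnlen limit
	   in %rsi can be credited back for the bytes the round-down makes us
	   look at again.  */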

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi

	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
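	/* The aligned load above started %cl bytes before the requested
	   address, so the low %cl bits of the mask belong to bytes outside
	   the string; shifting them out leaves bit 0 corresponding to the
	   first requested byte.  */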
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_STRNLEN
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
# ifdef USE_AS_STRNLEN
	/* "rcx" is less than VEC_SIZE.  Calculate "rsi + rcx - VEC_SIZE"
	   with "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
	   to avoid possible addition overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rsi
	jbe	L(max)
# endif

	addq	$VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi
# endif

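	/* Main loop: four aligned VEC_SIZE loads per iteration.  VPMINU
	   folds them pairwise and then together; since the minimum is
	   unsigned, a lane of the folded vector is zero if and only if at
	   least one of the four inputs is zero in that lane, so a single
	   compare and movemask test all 4 * VEC_SIZE bytes at once.  */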
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6
	VPMINU	%ymm5, %ymm6, %ymm5

	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_STRNLEN
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rsi
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
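	/* On entry %rsi holds the bytes still allowed from the current %rdi
	   minus 4 * VEC_SIZE, i.e. a value in (-4 * VEC_SIZE, 0].  Adding
	   2 * VEC_SIZE back separates the at-most-two-vector tail from the
	   two-to-four-vector tail.  */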
	addl	$(VEC_SIZE * 2), %esi
	jle	L(last_2x_vec)

	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x2_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x3_check)
	movq	%r8, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	addl	$(VEC_SIZE * 2), %esi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x0_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	movq	%r8, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

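	/* The *_check variants below handle the case where a terminator was
	   seen but the strnlen/wcsnlen limit might end before it: callers
	   arrange %rsi to hold the limit measured from the start of the
	   vector in question, so one compare against the tzcnt result
	   decides between the real length and the saved maximum.  */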
	.p2align 4
L(first_vec_x0_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

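	/* No terminator was found within the strnlen/wcsnlen limit: return
	   the saved maximum, converted back from bytes to wide characters
	   for wcsnlen.  */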
	.p2align 4
L(max):
	movq	%r8, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret
# endif

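	/* first_vec_xN: a terminator bit was found in vector N and no limit
	   needs checking.  The length is its offset within that vector plus
	   N * VEC_SIZE plus the distance %rdi has already advanced from the
	   saved start in %rdx; for wcslen the byte count is divided by the
	   wide-character size.  */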
	.p2align 4
L(first_vec_x0):
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

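	/* The folded compare in the main loop only shows that one of the
	   four cached vectors contains a terminator; re-test %ymm1..%ymm4 in
	   order to find which one, then fall through into the usual length
	   computation for that vector.  */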
	.p2align 4
L(4x_vec_end):
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	VPCMPEQ %ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	VPCMPEQ %ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	VPCMPEQ %ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

END (STRLEN)
#endif