/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Compare packed dwords and store minimum.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Compare packed bytes and store minimum.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Warning!
           wcscmp/wcsncmp have to use SIGNED comparison for elements.
           strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The comparison
   operates on either packed bytes or dwords depending on USE_AS_WCSCMP.
   In order to check the null char, the algorithm keeps the matched
   bytes/dwords, requiring two more AVX2 instructions (VPMINU and
   VPCMPEQ).  In general, the cost of comparing VEC_SIZE bytes (32
   bytes) is two VPCMPEQ and one VPMINU instructions, together with
   vmovdqu and testl instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp routines (enabled by defining USE_AS_STRNCMP)
   use the same logic as strcmp, except that a maximum offset is
   tracked.  If the maximum offset is reached before a difference is
   found, zero is returned.  */

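/* Illustrative only -- not part of the implementation.  A C-level
   sketch (AVX2 intrinsics, byte elements, i.e. the strcmp case) of the
   VPCMPEQ/VPMINU/VPCMPEQ null-or-mismatch check described above; the
   helper name is made up for the example:

     #include <immintrin.h>
     #include <stdint.h>

     // Index of the first byte in [s1, s1 + 32) that differs from s2
     // or is a null terminator; 32 if the chunk matches with no null.
     static inline unsigned int
     first_diff_or_null_index (const char *s1, const char *s2)
     {
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
       // eq: 0xff where the bytes are equal, 0x00 where they differ.
       __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
       // min (v1, eq) is zero exactly where the bytes differ or where
       // both strings hold a null byte.
       __m256i mn = _mm256_min_epu8 (v1, eq);
       __m256i zr = _mm256_cmpeq_epi8 (mn, _mm256_setzero_si256 ());
       uint32_t mask = (uint32_t) _mm256_movemask_epi8 (zr);
       return mask == 0 ? 32 : (unsigned int) __builtin_ctz (mask);
     }  */
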
	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).   */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
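	/* The flags from the cmpl above (or from the jne sites that jump
	   here) still hold the signed dword comparison; the next three
	   instructions turn it into -1 if the wchar from %rdi is less
	   than the one from %rsi and +1 otherwise (the two are known to
	   differ on this path).  */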
L(wcscmp_return):
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount as the base pointers
	   are moved forward.  Return 0 when:
	     1) On match: the offset is <= the matched vector index.
	     2) On mismatch: the offset is before the mismatched index.
	 */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
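	/* For USE_AS_STRNCMP, e.g. (hypothetical numbers): with
	   %r11 == 200 and an alignment advance of %rdx == 72, the limit
	   relative to the new base becomes 128; if the advance already
	   covers the whole limit, the first %r11 bytes are known to
	   match and 0 is returned above.  */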
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0

	/* Test each mask (32 bits) individually because for VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, unlike SSE2 strcmp where
	   ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
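	/* The combined minimum in %ymm0 has a zero lane, so somewhere in
	   these 4 * VEC_SIZE bytes there is a mismatch or a null
	   terminator.  Re-derive the per-vector masks (%ymm4, %ymm1,
	   %ymm5, %ymm6 in order) to locate the first offending vector.  */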
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  This label
	   checks whether the strncmp maximum offset has been reached
	   or not.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing on the page boundary region requires special
	   treatment: it must be done one vector at a time, starting
	   with the wider ymm vector if possible, and if not, with xmm.
	   If fetching 16 bytes (xmm) still crosses the boundary, byte
	   comparison must be done.  */
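	/* Illustrative note: %eax starts as ((%rdi | %rsi) &
	   (PAGE_SIZE - 1)), an upper bound on the page offset of both
	   strings, and is advanced roughly in step with the compare
	   offset; a width-N probe of both strings is known not to cross
	   the page when %eax <= PAGE_SIZE - N, which is the guard used
	   before each load below.  The code thus falls back from 32-byte
	   (ymm) to 16-byte (xmm), then 8-byte and 4-byte probes (strcmp
	   only), and finally single byte/dword checks.  */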
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try 4 byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only last 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif