/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP __strcmp_evex
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

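/* Illustrative note (not part of the original source): with
   VEC_SIZE == 32, a 4-vector block is 32 * 4 == 128 == 1 << 7 bytes,
   so shifting a byte count right by DIVIDE_BY_VEC_4_SHIFT divides it
   by 128; e.g. PAGE_SIZE >> 7 == 4096 / 128 == 32 blocks per page.  */
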
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMP vpcmpd
#  define SHIFT_REG32 r8d
#  define SHIFT_REG64 r8
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR 4
# else
/* Compare packed bytes.  */
#  define VPCMP vpcmpb
#  define SHIFT_REG32 ecx
#  define SHIFT_REG64 rcx
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR 1
# endif

# define XMMZERO xmm16
# define XMM0 xmm17
# define XMM1 xmm18

# define YMMZERO ymm16
# define YMM0 ymm17
# define YMM1 ymm18
# define YMM2 ymm19
# define YMM3 ymm20
# define YMM4 ymm21
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */

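/* Illustrative sketch (not part of the original source): the signedness
   requirement above follows from the C standard, which defines the sign
   of the strcmp result via the first differing pair interpreted as
   unsigned char, while wcscmp compares wchar_t values, a signed 32-bit
   int on x86-64.  A scalar model of one element comparison:

       #include <wchar.h>

       static int
       strcmp_elem (unsigned char c1, unsigned char c2)
       {
         return (int) c1 - (int) c2;       // unsigned elements
       }

       static int
       wcscmp_elem (wchar_t c1, wchar_t c2)
       {
         return c1 < c2 ? -1 : c1 > c2;    // signed elements
       }

   e.g. strcmp_elem (0x80, 'A') > 0, whereas
   wcscmp_elem ((wchar_t) 0x80000000, L'A') < 0.  */
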
/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null char, the algorithm keeps
   track of the matched bytes/dwords, requiring 5 EVEX instructions
   (3 VPCMP and 2 KORD).  In general, the costs of comparing VEC_SIZE
   bytes (32 bytes) are 3 VPCMP and 2 KORD instructions, together with
   VMOVU and ktestd instructions.  The main loop (away from the page
   boundary) compares 4 vectors at a time, effectively comparing
   4 x VEC_SIZE bytes (128 bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is also tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */

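/* Illustrative sketch (not part of the original source): a scalar C
   model of one VEC_SIZE step for the byte case (USE_AS_WCSCMP
   undefined).  Bit i of the result is set when byte i mismatches or
   either byte is NULL, mirroring the 3 VPCMP + 2 KORD sequence:

       #include <stdint.h>
       #include <stddef.h>

       static uint32_t
       vec_cmp_mask (const unsigned char *s1, const unsigned char *s2)
       {
         uint32_t k0 = 0, k1 = 0, k2 = 0;
         for (size_t i = 0; i < 32; i++)
           {
             k0 |= (uint32_t) (s1[i] != s2[i]) << i;  // VPCMP $4
             k1 |= (uint32_t) (s1[i] == 0) << i;      // VPCMP $0
             k2 |= (uint32_t) (s2[i] == 0) << i;      // VPCMP $0
           }
         return (k1 | k2) | k0;                       // the two KORDs
       }

   A zero mask means all 32 bytes match and contain no NULL; otherwise
   the tzcnt of the mask is the index of the first difference or
   NULL.  */
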
        .section .text.evex,"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
        /* Check for simple cases (0 or 1) in offset.  */
        cmp $1, %RDX_LP
        je L(char0)
        jb L(zero)
#  ifdef USE_AS_WCSCMP
        /* Convert units: from wide to byte char.  */
        shl $2, %RDX_LP
#  endif
        /* Register %r11 tracks the maximum offset.  */
        mov %RDX_LP, %R11_LP
# endif
        movl %edi, %eax
        xorl %edx, %edx
        /* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
        vpxorq %XMMZERO, %XMMZERO, %XMMZERO
        orl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
        jg L(cross_page)
        /* Start comparing 4 vectors.  */
        VMOVU (%rdi), %YMM0
        VMOVU (%rsi), %YMM1

        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
        VPCMP $4, %YMM0, %YMM1, %k0

        /* Check for NULL in YMM0.  */
        VPCMP $0, %YMMZERO, %YMM0, %k1
        /* Check for NULL in YMM1.  */
        VPCMP $0, %YMMZERO, %YMM1, %k2
        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
        kord %k1, %k2, %k1

        /* Each bit in K1 represents:
           1. A mismatch in YMM0 and YMM1.  Or
           2. A NULL in YMM0 or YMM1.
         */
        kord %k0, %k1, %k1

        ktestd %k1, %k1
        je L(next_3_vectors)
        kmovd %k1, %ecx
        tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edx
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx) is after the maximum
           offset (%r11).  */
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        je L(return)
L(wcscmp_return):
        setl %al
        negl %eax
        orl $1, %eax
L(return):
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
        ret

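/* Illustrative note (not part of the original source): the
   L(wcscmp_return) sequence above turns the signed compare into -1/1
   without branches.  SETL leaves 1 in AL when the first dword is
   signed-less than the second, NEGL turns that into 0 or -1, and
   ORL $1 maps those to 1 or -1.  A scalar model:

       static int
       wcscmp_result (int c1, int c2)
       {
         int r = -(c1 < c2);   // setl + negl: 0 or -1
         return r | 1;         // orl $1: 1 or -1
       }
 */
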
        .p2align 4
L(return_vec_size):
        kmovd %k1, %ecx
        tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edx
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
           the maximum offset (%r11).  */
        addq $VEC_SIZE, %rdx
        cmpq %r11, %rdx
        jae L(zero)
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl VEC_SIZE(%rdi, %rdx), %ecx
        cmpl VEC_SIZE(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl VEC_SIZE(%rdi, %rdx), %eax
        movzbl VEC_SIZE(%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(return_2_vec_size):
        kmovd %k1, %ecx
        tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edx
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
           after the maximum offset (%r11).  */
        addq $(VEC_SIZE * 2), %rdx
        cmpq %r11, %rdx
        jae L(zero)
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
        cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(return_3_vec_size):
        kmovd %k1, %ecx
        tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edx
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
           after the maximum offset (%r11).  */
        addq $(VEC_SIZE * 3), %rdx
        cmpq %r11, %rdx
        jae L(zero)
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
        cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(next_3_vectors):
        VMOVU VEC_SIZE(%rdi), %YMM0
        VMOVU VEC_SIZE(%rsi), %YMM1
        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
        VPCMP $4, %YMM0, %YMM1, %k0
        VPCMP $0, %YMMZERO, %YMM0, %k1
        VPCMP $0, %YMMZERO, %YMM1, %k2
        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        ktestd %k1, %k1
        jne L(return_vec_size)

        VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
        VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
        VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
        VMOVU (VEC_SIZE * 3)(%rsi), %YMM5

        /* Each bit in K0 represents a mismatch in YMM2 and YMM4.  */
        VPCMP $4, %YMM2, %YMM4, %k0
        VPCMP $0, %YMMZERO, %YMM2, %k1
        VPCMP $0, %YMMZERO, %YMM4, %k2
        /* Each bit in K1 represents a NULL in YMM2 or YMM4.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        ktestd %k1, %k1
        jne L(return_2_vec_size)

        /* Each bit in K0 represents a mismatch in YMM3 and YMM5.  */
        VPCMP $4, %YMM3, %YMM5, %k0
        VPCMP $0, %YMMZERO, %YMM3, %k1
        VPCMP $0, %YMMZERO, %YMM5, %k2
        /* Each bit in K1 represents a NULL in YMM3 or YMM5.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        ktestd %k1, %k1
        jne L(return_3_vec_size)
L(main_loop_header):
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        movl $PAGE_SIZE, %ecx
        /* Align load via RAX.  */
        andq $-(VEC_SIZE * 4), %rdx
        subq %rdi, %rdx
        leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
        /* Starting from this point, the maximum offset, or simply the
           'offset', DECREASES by the same amount when base pointers are
           moved forward.  Return 0 when:
             1) On match: offset <= the matched vector index.
             2) On mismatch: offset is before the mismatched index.
         */
        subq %rdx, %r11
        jbe L(zero)
# endif
        addq %rsi, %rdx
        movq %rdx, %rsi
        andl $(PAGE_SIZE - 1), %esi
        /* Number of bytes before page crossing.  */
        subq %rsi, %rcx
        /* Number of VEC_SIZE * 4 blocks before page crossing.  */
        shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
        /* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
        movl %ecx, %esi
        jmp L(loop_start)

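/* Illustrative sketch (not part of the original source): the block
   count just computed, modeled in C with the constants from this file
   (PAGE_SIZE == 4096, VEC_SIZE == 32):

       static unsigned int
       blocks_before_page_cross (unsigned long src)
       {
         unsigned int in_page = src & (4096 - 1);  // andl
         return (4096 - in_page) >> 7;             // subq; shrq $7
       }

   e.g. an aligned source offset of 0xf80 within its page leaves 0x80
   bytes, i.e. exactly one 128-byte block, before the page boundary.  */
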
        .p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
        /* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
           the maximum offset (%r11) by the same amount.  */
        subq $(VEC_SIZE * 4), %r11
        jbe L(zero)
# endif
        addq $(VEC_SIZE * 4), %rax
        addq $(VEC_SIZE * 4), %rdx
L(loop_start):
        testl %esi, %esi
        leal -1(%esi), %esi
        je L(loop_cross_page)
L(back_to_loop):
        /* Main loop, comparing 4 vectors at a time.  */
        VMOVA (%rax), %YMM0
        VMOVA VEC_SIZE(%rax), %YMM2
        VMOVA (VEC_SIZE * 2)(%rax), %YMM4
        VMOVA (VEC_SIZE * 3)(%rax), %YMM6
        VMOVU (%rdx), %YMM1
        VMOVU VEC_SIZE(%rdx), %YMM3
        VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
        VMOVU (VEC_SIZE * 3)(%rdx), %YMM7

        VPCMP $4, %YMM0, %YMM1, %k0
        VPCMP $0, %YMMZERO, %YMM0, %k1
        VPCMP $0, %YMMZERO, %YMM1, %k2
        kord %k1, %k2, %k1
        /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
           YMM1.  */
        kord %k0, %k1, %k4

        VPCMP $4, %YMM2, %YMM3, %k0
        VPCMP $0, %YMMZERO, %YMM2, %k1
        VPCMP $0, %YMMZERO, %YMM3, %k2
        kord %k1, %k2, %k1
        /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
           YMM3.  */
        kord %k0, %k1, %k5

        VPCMP $4, %YMM4, %YMM5, %k0
        VPCMP $0, %YMMZERO, %YMM4, %k1
        VPCMP $0, %YMMZERO, %YMM5, %k2
        kord %k1, %k2, %k1
        /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
           YMM5.  */
        kord %k0, %k1, %k6

        VPCMP $4, %YMM6, %YMM7, %k0
        VPCMP $0, %YMMZERO, %YMM6, %k1
        VPCMP $0, %YMMZERO, %YMM7, %k2
        kord %k1, %k2, %k1
        /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
           YMM7.  */
        kord %k0, %k1, %k7

        kord %k4, %k5, %k0
        kord %k6, %k7, %k1

        /* Test each mask (32 bits) individually because with
           VEC_SIZE == 32 it is not possible to OR all four masks and
           still keep every bit in a 64-bit integer register, unlike
           SSE2 strcmp where ORing is possible.  */
        kortestd %k0, %k1
        je L(loop)
        ktestd %k4, %k4
        je L(test_vec)
        kmovd %k4, %edi
        tzcntl %edi, %ecx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %ecx
# endif
# ifdef USE_AS_STRNCMP
        cmpq %rcx, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
        /* The first vector matched.  Return 0 if the maximum offset
           (%r11) <= VEC_SIZE.  */
        cmpq $VEC_SIZE, %r11
        jbe L(zero)
# endif
        ktestd %k5, %k5
        je L(test_2_vec)
        kmovd %k5, %ecx
        tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edi
# endif
# ifdef USE_AS_STRNCMP
        addq $VEC_SIZE, %rdi
        cmpq %rdi, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rdi), %ecx
        cmpl (%rdx, %rdi), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rdi), %eax
        movzbl (%rdx, %rdi), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl VEC_SIZE(%rsi, %rdi), %ecx
        cmpl VEC_SIZE(%rdx, %rdi), %ecx
        jne L(wcscmp_return)
#  else
        movzbl VEC_SIZE(%rax, %rdi), %eax
        movzbl VEC_SIZE(%rdx, %rdi), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
        /* The first 2 vectors matched.  Return 0 if the maximum offset
           (%r11) <= 2 * VEC_SIZE.  */
        cmpq $(VEC_SIZE * 2), %r11
        jbe L(zero)
# endif
        ktestd %k6, %k6
        je L(test_3_vec)
        kmovd %k6, %ecx
        tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edi
# endif
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 2), %rdi
        cmpq %rdi, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rdi), %ecx
        cmpl (%rdx, %rdi), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rdi), %eax
        movzbl (%rdx, %rdi), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
        cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
        movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
        /* The first 3 vectors matched.  Return 0 if the maximum offset
           (%r11) <= 3 * VEC_SIZE.  */
        cmpq $(VEC_SIZE * 3), %r11
        jbe L(zero)
# endif
        kmovd %k7, %esi
        tzcntl %esi, %ecx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %ecx
# endif
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 3), %rcx
        cmpq %rcx, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %esi
        cmpl (%rdx, %rcx), %esi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
        cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
        jne L(wcscmp_return)
#  else
        movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(loop_cross_page):
        xorl %r10d, %r10d
        movq %rdx, %rcx
        /* Align load via RDX.  We load the extra ECX bytes, which
           should be ignored.  */
        andl $((VEC_SIZE * 4) - 1), %ecx
        /* R10 is -RCX.  */
        subq %rcx, %r10

        /* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

        /* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
        cmpl $(VEC_SIZE * 2), %ecx
        jge L(loop_cross_page_2_vec)

        VMOVU (%rax, %r10), %YMM2
        VMOVU VEC_SIZE(%rax, %r10), %YMM3
        VMOVU (%rdx, %r10), %YMM4
        VMOVU VEC_SIZE(%rdx, %r10), %YMM5

        VPCMP $4, %YMM4, %YMM2, %k0
        VPCMP $0, %YMMZERO, %YMM2, %k1
        VPCMP $0, %YMMZERO, %YMM4, %k2
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
           YMM4.  */
        kord %k0, %k1, %k1

        VPCMP $4, %YMM5, %YMM3, %k3
        VPCMP $0, %YMMZERO, %YMM3, %k4
        VPCMP $0, %YMMZERO, %YMM5, %k5
        kord %k4, %k5, %k4
        /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
           YMM5.  */
        kord %k3, %k4, %k3

# ifdef USE_AS_WCSCMP
        /* NB: Each bit in K1/K3 represents a 4-byte element.  */
        kshiftlw $8, %k3, %k2
        /* NB: Divide the shift count by 4 since each bit in K1
           represents 4 bytes.  */
        movl %ecx, %SHIFT_REG32
        sarl $2, %SHIFT_REG32
# else
        kshiftlq $32, %k3, %k2
# endif

        /* Each bit in K1 represents a NULL or a mismatch.  */
        korq %k1, %k2, %k1
        kmovq %k1, %rdi

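/* Illustrative sketch (not part of the original source): the mask merge
   above and the ECX-byte skip just below, modeled in C for the byte
   case.  The two 32-bit vector masks are packed into one 64-bit value
   and the bits for the ignored leading bytes are shifted out:

       #include <stdint.h>

       static uint64_t
       merged_mask (uint32_t k1, uint32_t k3, unsigned int skip)
       {
         uint64_t m = (uint64_t) k1 | ((uint64_t) k3 << 32); // kshiftlq; korq
         return m >> skip;                                   // shrxq
       }
 */
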
        /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
        shrxq %SHIFT_REG64, %rdi, %rdi
        testq %rdi, %rdi
        je L(loop_cross_page_2_vec)
        tzcntq %rdi, %rcx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %ecx
# endif
# ifdef USE_AS_STRNCMP
        cmpq %rcx, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

        .p2align 4
L(loop_cross_page_2_vec):
        /* The first VEC_SIZE * 2 bytes match or are ignored.  */
        VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
        VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
        VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
        VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3

        VPCMP $4, %YMM0, %YMM2, %k0
        VPCMP $0, %YMMZERO, %YMM0, %k1
        VPCMP $0, %YMMZERO, %YMM2, %k2
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
           YMM2.  */
        kord %k0, %k1, %k1

        VPCMP $4, %YMM1, %YMM3, %k3
        VPCMP $0, %YMMZERO, %YMM1, %k4
        VPCMP $0, %YMMZERO, %YMM3, %k5
        kord %k4, %k5, %k4
        /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
           YMM3.  */
        kord %k3, %k4, %k3

# ifdef USE_AS_WCSCMP
        /* NB: Each bit in K1/K3 represents a 4-byte element.  */
        kshiftlw $8, %k3, %k2
# else
        kshiftlq $32, %k3, %k2
# endif

        /* Each bit in K1 represents a NULL or a mismatch.  */
        korq %k1, %k2, %k1
        kmovq %k1, %rdi

        xorl %r8d, %r8d
        /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
        subl $(VEC_SIZE * 2), %ecx
        jle 1f
        /* R8 has the number of bytes skipped.  */
        movl %ecx, %r8d
# ifdef USE_AS_WCSCMP
        /* NB: Divide the shift count by 4 since each bit in K1
           represents 4 bytes.  */
        sarl $2, %ecx
# endif
        /* Skip ECX bytes.  */
        shrq %cl, %rdi
1:
        /* Before jumping back to the loop, set ESI to the number of
           VEC_SIZE * 4 blocks before page crossing.  */
        movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

        testq %rdi, %rdi
# ifdef USE_AS_STRNCMP
        /* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
           starting from %rax have already been tested.  This label
           checks whether the strncmp maximum offset has been
           reached.  */
        je L(string_nbyte_offset_check)
# else
        je L(back_to_loop)
# endif
        tzcntq %rdi, %rcx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %ecx
# endif
        addq %r10, %rcx
        /* Adjust for the number of bytes skipped.  */
        addq %r8, %rcx
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 2), %rcx
        subq %rcx, %r11
        jbe L(zero)
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
        cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
        jne L(wcscmp_return)
#  else
        movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
        subl %edx, %eax
#  endif
# endif
        ret

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
        leaq (VEC_SIZE * 4)(%r10), %r10
        cmpq %r10, %r11
        jbe L(zero)
        jmp L(back_to_loop)
# endif

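/* Illustrative note (not part of the original source): in the check
   above, %r10 was -ECX (the ignored leading bytes), so after the leaq
   it holds VEC_SIZE * 4 - ECX, i.e. the number of bytes just verified
   to match starting at %rax.  If the remaining maximum offset %r11
   does not exceed that, the strncmp limit falls inside matching bytes
   and 0 is returned.  */
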
        .p2align 4
L(cross_page_loop):
        /* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
        cmpl %ecx, %eax
# else
        subl %ecx, %eax
# endif
        jne L(different)
        addl $SIZE_OF_CHAR, %edx
        cmpl $(VEC_SIZE * 4), %edx
        je L(main_loop_header)
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rdx), %eax
        movl (%rsi, %rdx), %ecx
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %ecx
# endif
        /* Check null char.  */
        testl %eax, %eax
        jne L(cross_page_loop)
        /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
           comparisons.  */
        subl %ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
        ret

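/* Illustrative sketch (not part of the original source): a scalar C
   model of L(cross_page_loop) above for the byte case.  Reaching the
   NULL test with equal elements means both are NULL, so returning 0
   is correct for signed and unsigned alike:

       static int
       cross_page_tail (const unsigned char *s1, const unsigned char *s2)
       {
         for (;;)
           {
             int c1 = *s1++, c2 = *s2++;
             if (c1 != c2)
               return c1 - c2;
             if (c1 == 0)
               return 0;
             // The real loop re-enters the main loop after
             // 4 * VEC_SIZE elements.
           }
       }
 */
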
# ifdef USE_AS_WCSCMP
        .p2align 4
L(different):
        /* Use movl to avoid modifying EFLAGS.  */
        movl $0, %eax
        setl %al
        negl %eax
        orl $1, %eax
        ret
# endif

# ifdef USE_AS_STRNCMP
        .p2align 4
L(zero):
        xorl %eax, %eax
        ret

        .p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi), %ecx
        cmpl (%rsi), %ecx
        jne L(wcscmp_return)
#  else
        movzbl (%rsi), %ecx
        movzbl (%rdi), %eax
        subl %ecx, %eax
#  endif
        ret
# endif

        .p2align 4
L(last_vector):
        addq %rdx, %rdi
        addq %rdx, %rsi
# ifdef USE_AS_STRNCMP
        subq %rdx, %r11
# endif
        tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
        /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
        sall $2, %edx
# endif
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
        ret

/* Comparing on a page boundary region requires special treatment: it
   must be done one vector at a time, starting with the wider ymm
   vector if possible, and if not, with xmm.  If fetching 16 bytes
   (xmm) would still cross the boundary, the comparison falls back to
   one byte/dword at a time.  */
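/* Illustrative sketch (not part of the original source): the entry
   check that routes here, modeled in C.  ORing both page offsets is
   conservative: if the OR is small enough, neither pointer can be
   within 4 * VEC_SIZE bytes of a page end:

       static int
       may_cross_page (unsigned long s1, unsigned long s2)
       {
         unsigned int off = (s1 | s2) & (4096 - 1);  // orl; andl
         return off > 4096 - (4 * 32);               // cmpl; jg
       }
 */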
        .p2align 4
L(cross_page):
        /* Try one ymm vector at a time.  */
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jg L(cross_page_1_vector)
L(loop_1_vector):
        VMOVU (%rdi, %rdx), %YMM0
        VMOVU (%rsi, %rdx), %YMM1

        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
        VPCMP $4, %YMM0, %YMM1, %k0
        VPCMP $0, %YMMZERO, %YMM0, %k1
        VPCMP $0, %YMMZERO, %YMM1, %k2
        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        kmovd %k1, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $VEC_SIZE, %edx

        addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11).  */
        cmpq %r11, %rdx
        jae L(zero)
# endif
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jle L(loop_1_vector)
L(cross_page_1_vector):
        /* Less than 32 bytes to check, try one xmm vector.  */
        cmpl $(PAGE_SIZE - 16), %eax
        jg L(cross_page_1_xmm)
        VMOVU (%rdi, %rdx), %XMM0
        VMOVU (%rsi, %rdx), %XMM1

        /* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
        VPCMP $4, %XMM0, %XMM1, %k0
        VPCMP $0, %XMMZERO, %XMM0, %k1
        VPCMP $0, %XMMZERO, %XMM1, %k2
        /* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
        korw %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        korw %k0, %k1, %k1
        kmovw %k1, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $16, %edx
# ifndef USE_AS_WCSCMP
        addl $16, %eax
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11).  */
        cmpq %r11, %rdx
        jae L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
        /* Less than 16 bytes to check, try an 8-byte vector.  NB: No
           need for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
        cmpl $(PAGE_SIZE - 8), %eax
        jg L(cross_page_8bytes)
        vmovq (%rdi, %rdx), %XMM0
        vmovq (%rsi, %rdx), %XMM1

        /* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
        VPCMP $4, %XMM0, %XMM1, %k0
        VPCMP $0, %XMMZERO, %XMM0, %k1
        VPCMP $0, %XMMZERO, %XMM1, %k2
        /* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        kmovd %k1, %ecx

#  ifdef USE_AS_WCSCMP
        /* Only the last 2 bits are valid.  */
        andl $0x3, %ecx
#  else
        /* Only the last 8 bits are valid.  */
        andl $0xff, %ecx
#  endif

        testl %ecx, %ecx
        jne L(last_vector)

        addl $8, %edx
        addl $8, %eax
#  ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11).  */
        cmpq %r11, %rdx
        jae L(zero)
#  endif

L(cross_page_8bytes):
        /* Less than 8 bytes to check, try a 4-byte vector.  */
        cmpl $(PAGE_SIZE - 4), %eax
        jg L(cross_page_4bytes)
        vmovd (%rdi, %rdx), %XMM0
        vmovd (%rsi, %rdx), %XMM1

        /* Each bit in K0 represents a mismatch in XMM0 and XMM1.  */
        VPCMP $4, %XMM0, %XMM1, %k0
        VPCMP $0, %XMMZERO, %XMM0, %k1
        VPCMP $0, %XMMZERO, %XMM1, %k2
        /* Each bit in K1 represents a NULL in XMM0 or XMM1.  */
        kord %k1, %k2, %k1
        /* Each bit in K1 represents a NULL or a mismatch.  */
        kord %k0, %k1, %k1
        kmovd %k1, %ecx

#  ifdef USE_AS_WCSCMP
        /* Only the last bit is valid.  */
        andl $0x1, %ecx
#  else
        /* Only the last 4 bits are valid.  */
        andl $0xf, %ecx
#  endif

        testl %ecx, %ecx
        jne L(last_vector)

        addl $4, %edx
#  ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11).  */
        cmpq %r11, %rdx
        jae L(zero)
#  endif

L(cross_page_4bytes):
# endif
        /* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rdx), %eax
        movl (%rsi, %rdx), %ecx
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %ecx
# endif
        testl %eax, %eax
        jne L(cross_page_loop)
        subl %ecx, %eax
        ret
END (STRCMP)
#endif