/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# ifndef VEC_SIZE
#  include "x86-evex256-vecs.h"
# endif

# define STRCMP_ISA _evex
# include "strcmp-naming.h"

# include <sysdep.h>
# if defined USE_AS_STRCASECMP_L
#  include "locale-defines.h"
# endif

# ifndef STRCMP
#  define STRCMP __strcmp_evex
# endif

# define PAGE_SIZE 4096

        /* VEC_SIZE = Number of bytes in a vector register.  */
# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
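        /* Editor's note (not in the original source): e.g. with
           VEC_SIZE == 32, CHAR_PER_VEC is 32 for strcmp/strncmp
           (1-byte chars) and 8 for wcscmp/wcsncmp (4-byte chars);
           VEC_SIZE == 64 doubles both.  */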

# ifdef USE_AS_WCSCMP
        /* Compare packed dwords.  */
#  define VPCMP vpcmpd
#  define VPCMPEQ vpcmpeqd
#  define VPMINU vpminud
#  define VPTESTM vptestmd
#  define VPTESTNM vptestnmd
        /* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR 4

#  define TESTEQ sub $((1 << CHAR_PER_VEC) - 1),

#  define USE_WIDE_CHAR
# else
        /* Compare packed bytes.  */
#  define VPCMP vpcmpb
#  define VPCMPEQ vpcmpeqb
#  define VPMINU vpminub
#  define VPTESTM vptestmb
#  define VPTESTNM vptestnmb
        /* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR 1

#  define TESTEQ inc
# endif

# include "reg-macros.h"

# if VEC_SIZE == 64
#  define RODATA_SECTION rodata.cst64
# else
#  define RODATA_SECTION rodata.cst32
# endif

# if CHAR_PER_VEC == 64
#  define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 3)
# else
#  define FALLTHROUGH_RETURN_OFFSET (VEC_SIZE * 2)
# endif

# ifdef USE_AS_STRNCMP
#  define LOOP_REG VR9
#  define LOOP_REG64 r9

#  define OFFSET_REG8 r9b
#  define OFFSET_REG r9d
#  define OFFSET_REG64 r9
# else
#  define LOOP_REG VRDX
#  define LOOP_REG64 rdx

#  define OFFSET_REG8 dl
#  define OFFSET_REG edx
#  define OFFSET_REG64 rdx
# endif

# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
#  define VEC_OFFSET 0
# else
#  define VEC_OFFSET (-VEC_SIZE)
# endif

# ifdef USE_AS_STRCASECMP_L
#  define BYTE_LOOP_REG OFFSET_REG
# else
#  define BYTE_LOOP_REG ecx
# endif

# ifdef USE_AS_STRCASECMP_L
#  ifdef USE_AS_STRNCMP
#   define LOCALE_REG rcx
#   define LOCALE_REG_LP RCX_LP
#  else
#   define LOCALE_REG rdx
#   define LOCALE_REG_LP RDX_LP
#  endif
# endif

# define LCASE_MIN_V VMM(12)
# define LCASE_MAX_V VMM(13)
# define CASE_ADD_V VMM(14)

# if VEC_SIZE == 64
#  define LCASE_MIN_YMM VMM_256(12)
#  define LCASE_MAX_YMM VMM_256(13)
#  define CASE_ADD_YMM VMM_256(14)
# endif

# define LCASE_MIN_XMM VMM_128(12)
# define LCASE_MAX_XMM VMM_128(13)
# define CASE_ADD_XMM VMM_128(14)

        /* NB: wcsncmp uses r11 but strcasecmp is never used in
           conjunction with wcscmp.  */
# define TOLOWER_BASE %r11

# ifdef USE_AS_STRCASECMP_L
#  define _REG(x, y) x ## y
#  define REG(x, y) _REG(x, y)
#  define TOLOWER(reg1, reg2, ext, vec_macro) \
        vpsubb %REG(LCASE_MIN_, ext), reg1, %vec_macro(10); \
        vpsubb %REG(LCASE_MIN_, ext), reg2, %vec_macro(11); \
        vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
        vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
        vpaddb reg1, %REG(CASE_ADD_, ext), reg1{%k5}; \
        vpaddb reg2, %REG(CASE_ADD_, ext), reg2{%k6}
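        /* Editor's note: TOLOWER is a branchless vector tolower built on
           the unsigned range-shift trick, using the constants loaded at
           L(lcase_min)/L(lcase_max)/L(case_add) below (0x41 == 'A',
           0x1a == 26, 0x20 == 'a' - 'A').  Per byte:

                tmp = ch - 0x41;                   (vpsubb)
                in_range = (uint8_t) tmp < 0x1a;   (vpcmpub $1, i.e. lt)
                if (in_range) ch += 0x20;          (vpaddb masked by k5/k6)

           so only bytes in ['A', 'Z'] are lowercased.  */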

#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
#  define TOLOWER_VMM(...) TOLOWER(__VA_ARGS__, V, VMM)
#  define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM, VMM_256)
#  define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM, VMM_128)

#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro) \
        TOLOWER (s1_reg, s2_reg, ext, vec_macro); \
        VPCMPEQ s1_reg, s2_reg, reg_out

#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro) \
        VMOVU s2_mem, s2_reg; \
        CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)

#  define CMP_R1_R2_VMM(...) CMP_R1_R2(__VA_ARGS__, V, VMM)
#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)

#  define CMP_R1_S2_VMM(...) CMP_R1_S2(__VA_ARGS__, V, VMM)
#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)

# else
#  define TOLOWER_gpr(...)
#  define TOLOWER_VMM(...)
#  define TOLOWER_YMM(...)
#  define TOLOWER_XMM(...)

#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out) \
        VPCMPEQ s2_reg, s1_reg, reg_out

#  define CMP_R1_R2_YMM(...) CMP_R1_R2_VMM(__VA_ARGS__)
#  define CMP_R1_R2_XMM(...) CMP_R1_R2_VMM(__VA_ARGS__)

#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out) \
        VPCMPEQ s2_mem, s1_reg, reg_out
#  define CMP_R1_S2_YMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
#  define CMP_R1_S2_XMM(...) CMP_R1_S2_VMM(__VA_ARGS__)
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/

/* The main idea of the string comparison (byte or dword) using 256-bit
   EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
   latter can be on either packed bytes or dwords depending on
   USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
   the matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
   KORD).  In general, the costs of comparing VEC_SIZE bytes (32-bytes)
   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
   instructions.  The main loop (away from the page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
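
/* Editor's sketch (not part of the original source): the value computed
   by the various L(ret*) blocks below is equivalent to:

        int ret_char (CHAR c1, CHAR c2)
        {
          if (c1 == c2)
            return 0;
        #ifdef USE_AS_WCSCMP
          return ((int) c1 < (int) c2) ? -1 : 1;   <- signed: setl; negl; orl $1
        #else
          return (int) c1 - (int) c2;              <- unsigned: movzbl loads + subl
        #endif
        }

   where CHAR is a byte for strcmp and a dword for wcscmp.  */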

        .section SECTION(.text), "ax", @progbits
        .align 16
        .type STRCMP, @function
        .globl STRCMP
# ifdef USE_AS_STRCASECMP_L
ENTRY (STRCASECMP)
        movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
        mov %fs:(%rax), %LOCALE_REG_LP

        /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
        .p2align 4
END (STRCASECMP)
        /* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
# endif

        .p2align 4
STRCMP:
        cfi_startproc
        _CET_ENDBR
        CALL_MCOUNT

# if defined USE_AS_STRCASECMP_L
        /* We have to fall back on the C implementation for locales with
           encodings not matching ASCII for single bytes.  */
#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
        mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
#  else
        mov (%LOCALE_REG), %RAX_LP
#  endif
        testb $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
        jne STRCASECMP_L_NONASCII
        leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
# endif

# ifdef USE_AS_STRNCMP
        /* Don't overwrite LOCALE_REG (rcx) until we have passed
           L(one_or_less).  Otherwise we might use the wrong locale in
           the OVERFLOW_STRCMP (strcasecmp_l).  */
#  ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
#  endif
        cmp $1, %RDX_LP
        /* Signed comparison intentional.  We use this branch to also
           test cases where length >= 2^63.  These very large sizes can be
           handled with strcmp as there is no way for that length to
           actually bound the buffer.  */
        jle L(one_or_less)
# endif

# if defined USE_AS_STRCASECMP_L
        .section RODATA_SECTION, "aM", @progbits, VEC_SIZE
        .align VEC_SIZE
L(lcase_min):
        .quad 0x4141414141414141
        .quad 0x4141414141414141
        .quad 0x4141414141414141
        .quad 0x4141414141414141
#  if VEC_SIZE == 64
        .quad 0x4141414141414141
        .quad 0x4141414141414141
        .quad 0x4141414141414141
        .quad 0x4141414141414141
#  endif
L(lcase_max):
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
#  if VEC_SIZE == 64
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
        .quad 0x1a1a1a1a1a1a1a1a
#  endif
L(case_add):
        .quad 0x2020202020202020
        .quad 0x2020202020202020
        .quad 0x2020202020202020
        .quad 0x2020202020202020
#  if VEC_SIZE == 64
        .quad 0x2020202020202020
        .quad 0x2020202020202020
        .quad 0x2020202020202020
        .quad 0x2020202020202020
#  endif
        .previous

        VMOVA L(lcase_min)(%rip), %LCASE_MIN_V
        VMOVA L(lcase_max)(%rip), %LCASE_MAX_V
        VMOVA L(case_add)(%rip), %CASE_ADD_V
# endif

        movl %edi, %eax
        orl %esi, %eax
        /* Shift out the bits irrelevant to the page boundary ([63:12]).  */
        sall $20, %eax
        /* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
        cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
        ja L(page_cross)
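        /* Editor's note: `sall $20` moves the page offset (the low 12
           bits of the or'd addresses) into the top bits, so the unsigned
           compare above triggers when either string is within
           4x VEC_SIZE of a page end.  E.g. for VEC_SIZE == 32 an offset
           above 0xf80 (4096 - 128) takes the L(page_cross) path.  Oring
           the two addresses can produce false positives, which the page
           cross logic re-checks.  */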

L(no_page_cross):
        /* Safe to compare 4x vectors.  */
        VMOVU (%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        /* Each bit cleared in K1 represents a mismatch or a null CHAR
           in VMM(0) and VEC_SIZE bytes at (%rsi).  */
        CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
# ifdef USE_AS_STRNCMP
        cmpq $CHAR_PER_VEC, %rdx
        jbe L(vec_0_test_len)
# endif

        /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
           wcscmp/wcsncmp.  */

        /* All 1s represents all equals.  TESTEQ will overflow to zero in
           all equals case.  Otherwise 1s will carry until position of
           first mismatch.  */
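        /* Editor's example: for strcmp with CHAR_PER_VEC == 32 an
           all-equal vector gives VRCX == 0xffffffff and `inc` wraps it
           to zero; for wcscmp with CHAR_PER_VEC == 8 the mask is 0xff
           and `sub $0xff` does the same.  Any mismatch/null leaves a
           cleared bit, so the result is non-zero.  */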
        TESTEQ %VRCX
        jz L(more_3x_vec)

        .p2align 4,, 4
L(return_vec_0):
        bsf %VRCX, %VRCX
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret0)
        setl %al
        negl %eax
        orl $1, %eax
# else
        movzbl (%rdi, %rcx), %eax
        /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
           and keep logic for len <= VEC_SIZE (common) in just the
           first cache line.  NB: No evex512 processor has partial-
           register stalls.  If that changes this ifdef can be disabled
           without affecting correctness.  */
#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
        movb (%rsi, %rcx), %cl
#  else
        movzbl (%rsi, %rcx), %ecx
#  endif
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
# endif
L(ret0):
        ret

# ifdef USE_AS_STRNCMP
        .p2align 4,, 4
L(vec_0_test_len):
        not %VRCX
        bzhi %VRDX, %VRCX, %VRAX
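        /* Editor's note: after the `not`, set bits in VRCX mark
           mismatch/null positions; `bzhi` zeros all bits at index >= rdx
           (the length), so VRAX is non-zero only for a difference within
           the first rdx characters.  */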
        jnz L(return_vec_0)
        /* Align if it will cross a fetch block.  */
        .p2align 4,, 2
L(ret_zero):
        xorl %eax, %eax
        ret

        .p2align 4,, 5
L(one_or_less):
#  ifdef USE_AS_STRCASECMP_L
        /* Set locale argument for strcasecmp.  */
        movq %LOCALE_REG, %rdx
#  endif
        jb L(ret_zero)
        /* 'nbe' covers the case where length is negative (large
           unsigned).  */
        jnbe OVERFLOW_STRCMP
#  ifdef USE_AS_WCSCMP
        movl (%rdi), %edx
        xorl %eax, %eax
        cmpl (%rsi), %edx
        je L(ret1)
        setl %al
        negl %eax
        orl $1, %eax
#  else
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
#  endif
L(ret1):
        ret
# endif

        .p2align 4,, 10
L(return_vec_1):
        bsf %VRCX, %VRCX
# ifdef USE_AS_STRNCMP
        /* rdx must be > CHAR_PER_VEC so it's safe to subtract without
           worrying about underflow.  */
        addq $-CHAR_PER_VEC, %rdx
        cmpq %rcx, %rdx
        jbe L(ret_zero)
# endif
# ifdef USE_AS_WCSCMP
        movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret2)
        setl %al
        negl %eax
        orl $1, %eax
# else
        movzbl VEC_SIZE(%rdi, %rcx), %eax
        movzbl VEC_SIZE(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
# endif
L(ret2):
        ret

        .p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_3):
#  if CHAR_PER_VEC <= 32
        /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
           additional branches by adjusting the bit positions from
           VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
#   if CHAR_PER_VEC <= 16
        sall $CHAR_PER_VEC, %ecx
#   else
        salq $CHAR_PER_VEC, %rcx
#   endif
#  else
        /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
           check it.  */
        bsf %VRCX, %VRCX
        addl $(CHAR_PER_VEC), %ecx
        cmpq %rcx, %rdx
        ja L(ret_vec_3_finish)
        xorl %eax, %eax
        ret
#  endif
# endif

        /* If CHAR_PER_VEC == 64 we can't combine matches from the last
           2x VEC so we need a separate return label.  */
L(return_vec_2):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
        bsf %VRCX, %VRCX
# else
        bsfq %rcx, %rcx
# endif
# ifdef USE_AS_STRNCMP
        cmpq %rcx, %rdx
        jbe L(ret_zero)
# endif

L(ret_vec_3_finish):
# ifdef USE_AS_WCSCMP
        movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret3)
        setl %al
        negl %eax
        orl $1, %eax
# else
        movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
# endif
L(ret3):
        ret

# ifndef USE_AS_STRNCMP
        .p2align 4,, 10
L(return_vec_3):
        bsf %VRCX, %VRCX
#  ifdef USE_AS_WCSCMP
        movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret4)
        setl %al
        negl %eax
        orl $1, %eax
#  else
        movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
#  endif
L(ret4):
        ret
# endif

        /* 32 byte align here ensures the main loop is ideally aligned
           for DSB.  */
        .p2align 5
L(more_3x_vec):
        /* Safe to compare 4x vectors.  */
        VMOVU (VEC_SIZE)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_1)

# ifdef USE_AS_STRNCMP
        subq $(CHAR_PER_VEC * 2), %rdx
        jbe L(ret_zero)
# endif

        VMOVU (VEC_SIZE * 2)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_2)

        VMOVU (VEC_SIZE * 3)(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_3)

# ifdef USE_AS_STRNCMP
        cmpq $(CHAR_PER_VEC * 2), %rdx
        jbe L(ret_zero)
# endif


# ifdef USE_AS_WCSCMP
        /* Use any non-zero positive value that doesn't interfere with
           0x1.  */
        movl $2, %r8d

# else
        xorl %r8d, %r8d
# endif
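        /* Editor's note: %r8 records whether the page cross logic
           swapped rdi/rsi so the L(*_end) returns can fix the sign.  For
           strcmp-style returns (subl result) r8d is 0 (no-op) or -1,
           where `xorl %r8d, %eax; subl %r8d, %eax` computes
           (eax ^ -1) + 1 == -eax.  For wcscmp only the sign matters:
           xoring the -1/0 result of setl/negl with 2 keeps its sign,
           xoring with -4 flips it, and neither touches bit 0x1.  */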

        /* The prepare labels are various entry points from the page
           cross logic.  */
L(prepare_loop):

# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
L(prepare_loop_no_len):
        movl %edi, %ecx
        andl $(VEC_SIZE * 4 - 1), %ecx
        shrl $2, %ecx
        leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
#  else
        /* Store N + (VEC_SIZE * 4) and place the check at the beginning
           of the loop.  */
        leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
L(prepare_loop_no_len):
#  endif
# else
L(prepare_loop_no_len):
# endif

        /* Align s1 and adjust s2 accordingly.  */
        subq %rdi, %rsi
        andq $-(VEC_SIZE * 4), %rdi
L(prepare_loop_readj):
        addq %rdi, %rsi
# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
        subq %rdi, %rdx
# endif

L(prepare_loop_aligned):
        /* eax stores distance from rsi to next page cross.  These cases
           need to be handled specially as the 4x loop could potentially
           read memory past the length of s1 or s2 and across a page
           boundary.  */
        movl $-(VEC_SIZE * 4), %eax
        subl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax


        /* Loop 4x comparisons at a time.  */
        .p2align 4
L(loop):

        /* End condition for strncmp.  */
# ifdef USE_AS_STRNCMP
        subq $(CHAR_PER_VEC * 4), %rdx
        jbe L(ret_zero)
# endif

        subq $-(VEC_SIZE * 4), %rdi
        subq $-(VEC_SIZE * 4), %rsi

        /* Check if rsi loads will cross a page boundary.  */
        addl $-(VEC_SIZE * 4), %eax
        jnb L(page_cross_during_loop)

        /* Loop entry after handling page cross during loop.  */
L(loop_skip_page_cross_check):
        VMOVA (VEC_SIZE * 0)(%rdi), %VMM(0)
        VMOVA (VEC_SIZE * 1)(%rdi), %VMM(2)
        VMOVA (VEC_SIZE * 2)(%rdi), %VMM(4)
        VMOVA (VEC_SIZE * 3)(%rdi), %VMM(6)

        VPMINU %VMM(0), %VMM(2), %VMM(8)
        VPMINU %VMM(4), %VMM(6), %VMM(9)

        /* A zero CHAR in YMM9 means that there is a null CHAR.  */
        VPMINU %VMM(8), %VMM(9), %VMM(9)

        /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
        VPTESTM %VMM(9), %VMM(9), %k1
# ifndef USE_AS_STRCASECMP_L
        vpxorq (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
        vpxorq (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
        vpxorq (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
        /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
           oring with YMM1.  Result is stored in YMM6.  */
        vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
# else
        VMOVU (VEC_SIZE * 0)(%rsi), %VMM(1)
        TOLOWER_VMM (%VMM(0), %VMM(1))
        VMOVU (VEC_SIZE * 1)(%rsi), %VMM(3)
        TOLOWER_VMM (%VMM(2), %VMM(3))
        VMOVU (VEC_SIZE * 2)(%rsi), %VMM(5)
        TOLOWER_VMM (%VMM(4), %VMM(5))
        VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
        TOLOWER_VMM (%VMM(6), %VMM(7))
        vpxorq %VMM(0), %VMM(1), %VMM(1)
        vpxorq %VMM(2), %VMM(3), %VMM(3)
        vpxorq %VMM(4), %VMM(5), %VMM(5)
        vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
# endif
        /* Or together YMM3, YMM5, and YMM6.  */
        vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
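        /* Editor's note: the vpternlogd imm8 is a 3-input truth table
           indexed by (dst, src1, src2): 0xde computes (dst ^ src2) |
           src1 and 0xfe computes dst | src1 | src2, so VMM(6)
           accumulates all mismatch bits from the four vectors in two
           instructions.  */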


        /* A non-zero CHAR in YMM6 represents a mismatch.  */
        VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
        KMOV %k0, %LOOP_REG

        TESTEQ %LOOP_REG
        jz L(loop)


        /* Find which VEC has the mismatch or end of string.  */
        VPTESTM %VMM(0), %VMM(0), %k1
        VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
        KMOV %k0, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_0_end)

        VPTESTM %VMM(2), %VMM(2), %k1
        VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
        KMOV %k0, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_1_end)


        /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.  */
L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
        subq $(CHAR_PER_VEC * 2), %rdx
        jbe L(ret_zero_end)
# endif

        VPTESTM %VMM(4), %VMM(4), %k1
        VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
        KMOV %k0, %VRCX
        TESTEQ %VRCX
# if CHAR_PER_VEC <= 16
        sall $CHAR_PER_VEC, %LOOP_REG
        orl %ecx, %LOOP_REG
# elif CHAR_PER_VEC <= 32
        salq $CHAR_PER_VEC, %LOOP_REG64
        orq %rcx, %LOOP_REG64
# else
        /* We aren't combining the last 2x VEC so branch on the second
           to last.  */
        jnz L(return_vec_2_end)
# endif

        /* LOOP_REG contains matches for null/mismatch from the loop.  If
           VEC 0, 1, and 2 all have no null and no mismatches then the
           mismatch must entirely be from VEC 3 which is fully
           represented by LOOP_REG.  */
# if CHAR_PER_VEC <= 16
        bsf %LOOP_REG, %LOOP_REG
# else
        bsfq %LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP

        /* If CHAR_PER_VEC == 64 we can't combine the last 2x VEC so we
           need to adjust the length before the last comparison.  */
#  if CHAR_PER_VEC == 64
        subq $CHAR_PER_VEC, %rdx
        jbe L(ret_zero_end)
#  endif

        cmpq %LOOP_REG64, %rdx
        jbe L(ret_zero_end)
# endif

# ifdef USE_AS_WCSCMP
        movl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
        xorl %eax, %eax
        cmpl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
        je L(ret5)
        setl %al
        negl %eax
        xorl %r8d, %eax
# else
        movzbl (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
        movzbl (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        xorl %r8d, %eax
        subl %r8d, %eax
# endif
L(ret5):
        ret

# ifdef USE_AS_STRNCMP
        .p2align 4,, 2
L(ret_zero_end):
        xorl %eax, %eax
        ret
# endif



        /* The L(return_vec_N_end) differ from L(return_vec_N) in that
           they use the value of `r8` to negate the return value.  This
           is because the page cross logic can swap `rdi` and `rsi`.  */
        .p2align 4,, 10
# ifdef USE_AS_STRNCMP
L(return_vec_1_end):
#  if CHAR_PER_VEC <= 32
        /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
           without additional branches by adjusting the bit positions
           from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
#   if CHAR_PER_VEC <= 16
        sall $CHAR_PER_VEC, %ecx
#   else
        salq $CHAR_PER_VEC, %rcx
#   endif
#  else
        /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
           check it.  */
        bsf %VRCX, %VRCX
        addl $(CHAR_PER_VEC), %ecx
        cmpq %rcx, %rdx
        ja L(ret_vec_0_end_finish)
        xorl %eax, %eax
        ret
#  endif
# endif
L(return_vec_0_end):
# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
        bsf %VRCX, %VRCX
# else
        bsfq %rcx, %rcx
# endif

# ifdef USE_AS_STRNCMP
        cmpq %rcx, %rdx
        jbe L(ret_zero_end)
# endif

L(ret_vec_0_end_finish):
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret6)
        setl %al
        negl %eax
        /* This is the non-zero case for `eax` so just xorl with `r8d`
           to flip the sign if `rdi` and `rsi` were swapped.  */
        xorl %r8d, %eax
# else
        movzbl (%rdi, %rcx), %eax
        movzbl (%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        /* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
           logic.  Subtract `r8d` after the xor for the zero case.  */
        xorl %r8d, %eax
        subl %r8d, %eax
# endif
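        /* Editor's note: with %r8d == -1 the xor/sub pair above is a
           branchless negation, (eax ^ -1) - (-1) == -eax; with
           %r8d == 0 it is a no-op, so a zero (equal) result passes
           through unchanged.  */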
L(ret6):
        ret

# ifndef USE_AS_STRNCMP
        .p2align 4,, 10
L(return_vec_1_end):
        bsf %VRCX, %VRCX
#  ifdef USE_AS_WCSCMP
        movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret7)
        setl %al
        negl %eax
        xorl %r8d, %eax
#  else
        movzbl VEC_SIZE(%rdi, %rcx), %eax
        movzbl VEC_SIZE(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        xorl %r8d, %eax
        subl %r8d, %eax
#  endif
L(ret7):
        ret
# endif


        /* If CHAR_PER_VEC == 64 we can't combine matches from the last
           2x VEC so we need a separate return label.  */
# if CHAR_PER_VEC == 64
L(return_vec_2_end):
        bsf %VRCX, %VRCX
#  ifdef USE_AS_STRNCMP
        cmpq %rcx, %rdx
        jbe L(ret_zero_end)
#  endif
#  ifdef USE_AS_WCSCMP
        movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret13)
        setl %al
        negl %eax
        /* This is the non-zero case for `eax` so just xorl with `r8d`
           to flip the sign if `rdi` and `rsi` were swapped.  */
        xorl %r8d, %eax
#  else
        movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        /* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
           logic.  Subtract `r8d` after the xor for the zero case.  */
        xorl %r8d, %eax
        subl %r8d, %eax
#  endif
L(ret13):
        ret
# endif


        /* Page cross in rsi in next 4x VEC.  */

        /* TODO: Improve logic here.  */
        .p2align 4,, 10
L(page_cross_during_loop):
        /* eax contains [distance_from_page - (VEC_SIZE * 4)].  */

        /* Optimistically rsi and rdi are both aligned, in which case we
           don't need any logic here.  */
        cmpl $-(VEC_SIZE * 4), %eax
        /* Don't adjust eax before jumping back to the loop, so we will
           never hit the page cross case again.  */
        je L(loop_skip_page_cross_check)

        /* Check if we can safely load a VEC.  */
        cmpl $-(VEC_SIZE * 3), %eax
        jle L(less_1x_vec_till_page_cross)

        VMOVA (%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_0_end)

        /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
        cmpl $-(VEC_SIZE * 2), %eax
        jg L(more_2x_vec_till_page_cross)

        .p2align 4,, 4
L(less_1x_vec_till_page_cross):
        subl $-(VEC_SIZE * 4), %eax
        /* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
           concerning case is first iteration if incoming s1 was near start
           of a page and s2 near end.  If s1 was near the start of the page
           we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
           to read back -VEC_SIZE.  If rdi is truly at the start of a page
           here, it means the previous page (rdi - VEC_SIZE) has already
           been loaded earlier so must be valid.  */
        VMOVU -VEC_SIZE(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
        /* Mask of potentially valid bits.  The lower bits can be out of
           range comparisons (but safe regarding page crosses).  */

# ifdef USE_AS_WCSCMP
        movl $-1, %r10d
        movl %esi, %ecx
        andl $(VEC_SIZE - 1), %ecx
        shrl $2, %ecx
        shlxl %ecx, %r10d, %ecx
        /* Depending on CHAR_PER_VEC extract mask for possible in-bound
           matches.  */
#  if CHAR_PER_VEC == 16
        movzwl %cx, %r10d
#  elif CHAR_PER_VEC == 8
        movzbl %cl, %r10d
#  else
#   error "Invalid CHAR_SIZE or VEC_SIZE"
#  endif
# else
        mov $-1, %VRCX
        shlx %VRSI, %VRCX, %VR10
# endif

        KMOV %k1, %VRCX
        not %VRCX


# ifdef USE_AS_STRNCMP
#  ifdef USE_AS_WCSCMP
        /* NB: strcasecmp not used with WCSCMP so this access to r11 is
           safe.  */
        movl %eax, %r11d
        shrl $2, %r11d
        cmpq %r11, %rdx
#  else
        cmpq %rax, %rdx
#  endif
        jbe L(return_page_cross_end_check)
# endif
        movl %eax, %OFFSET_REG

        /* Readjust eax before potentially returning to the loop.  */
        addl $(PAGE_SIZE - VEC_SIZE * 4), %eax

        and %VR10, %VRCX
        jz L(loop_skip_page_cross_check)

        bsf %VRCX, %VRCX

# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
        leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
L(return_page_cross_cmp_mem):
# else
        addl %OFFSET_REG, %ecx
# endif
# ifdef USE_AS_WCSCMP
        movl VEC_OFFSET(%rdi, %rcx), %edx
        xorl %eax, %eax
        cmpl VEC_OFFSET(%rsi, %rcx), %edx
        je L(ret8)
        setl %al
        negl %eax
        xorl %r8d, %eax
# else
        movzbl VEC_OFFSET(%rdi, %rcx), %eax
        movzbl VEC_OFFSET(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        xorl %r8d, %eax
        subl %r8d, %eax
# endif
L(ret8):
        ret

# ifdef USE_AS_STRNCMP
        .p2align 4,, 10
L(return_page_cross_end_check):
        and %VR10, %VRCX
        /* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
           tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length (edx)
           is guaranteed to be <= CHAR_PER_VEC so we will only use the
           return idx if VRCX was non-zero.  */
        tzcnt %VRCX, %VRCX
        leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_WCSCMP
        sall $2, %edx
#  endif
        cmpl %ecx, %edx
        ja L(return_page_cross_cmp_mem)
        xorl %eax, %eax
        ret
# endif


        .p2align 4,, 10
L(more_2x_vec_till_page_cross):
        /* If more than 2x VEC until the page cross we will complete a
           full loop iteration here.  */

        VMOVA VEC_SIZE(%rdi), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_1_end)

# ifdef USE_AS_STRNCMP
        cmpq $(CHAR_PER_VEC * 2), %rdx
        jbe L(ret_zero_in_loop_page_cross)
# endif

        subl $-(VEC_SIZE * 4), %eax

        /* Safe to include comparisons from lower bytes.  */
        VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_page_cross_0)

        VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(return_vec_page_cross_1)

# ifdef USE_AS_STRNCMP
        /* Must check length here as the length might preclude reading
           the next page.  */
#  ifdef USE_AS_WCSCMP
        /* NB: strcasecmp not used with WCSCMP so this access to r11 is
           safe.  */
        movl %eax, %r11d
        shrl $2, %r11d
        cmpq %r11, %rdx
#  else
        cmpq %rax, %rdx
#  endif
        jbe L(ret_zero_in_loop_page_cross)
# endif

        /* Finish the loop.  */
        VMOVA (VEC_SIZE * 2)(%rdi), %VMM(4)
        VMOVA (VEC_SIZE * 3)(%rdi), %VMM(6)
        VPMINU %VMM(4), %VMM(6), %VMM(9)
        VPTESTM %VMM(9), %VMM(9), %k1
# ifndef USE_AS_STRCASECMP_L
        vpxorq (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
        /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
        vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
# else
        VMOVU (VEC_SIZE * 2)(%rsi), %VMM(5)
        TOLOWER_VMM (%VMM(4), %VMM(5))
        VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7)
        TOLOWER_VMM (%VMM(6), %VMM(7))
        vpxorq %VMM(4), %VMM(5), %VMM(5)
        vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
# endif
        VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
        KMOV %k0, %LOOP_REG
        TESTEQ %LOOP_REG
        jnz L(return_vec_2_3_end)

        /* Best for code size to include ucond-jmp here.  Would be faster
           if this case is hot to duplicate the L(return_vec_2_3_end)
           code as fall-through and have jump back to loop on mismatch
           comparison.  */
        subq $-(VEC_SIZE * 4), %rdi
        subq $-(VEC_SIZE * 4), %rsi
        addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
# ifdef USE_AS_STRNCMP
        subq $(CHAR_PER_VEC * 4), %rdx
        ja L(loop_skip_page_cross_check)
L(ret_zero_in_loop_page_cross):
        xorl %eax, %eax
        ret
# else
        jmp L(loop_skip_page_cross_check)
# endif


        .p2align 4,, 10
L(return_vec_page_cross_0):
        addl $-VEC_SIZE, %eax
L(return_vec_page_cross_1):
        bsf %VRCX, %VRCX
# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
        leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
#  ifdef USE_AS_STRNCMP
#   ifdef USE_AS_WCSCMP
        /* Must divide ecx instead of multiply rdx due to overflow.  */
        movl %ecx, %eax
        shrl $2, %eax
        cmpq %rax, %rdx
#   else
        cmpq %rcx, %rdx
#   endif
        jbe L(ret_zero_in_loop_page_cross)
#  endif
# else
        addl %eax, %ecx
# endif

# ifdef USE_AS_WCSCMP
        movl VEC_OFFSET(%rdi, %rcx), %edx
        xorl %eax, %eax
        cmpl VEC_OFFSET(%rsi, %rcx), %edx
        je L(ret9)
        setl %al
        negl %eax
        xorl %r8d, %eax
# else
        movzbl VEC_OFFSET(%rdi, %rcx), %eax
        movzbl VEC_OFFSET(%rsi, %rcx), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        xorl %r8d, %eax
        subl %r8d, %eax
# endif
L(ret9):
        ret


        .p2align 4,, 10
L(page_cross):
# ifndef USE_AS_STRNCMP
        /* If both are VEC aligned we don't need any special logic here.
           Only valid for strcmp where the stop condition is guaranteed
           to be reachable by just reading memory.  */
8418eb3f
NG
1152 testl $((VEC_SIZE - 1) << 20), %eax
1153 jz L(no_page_cross)
1fd8c163
L
1154# endif
1155
8418eb3f
NG
1156 movl %edi, %eax
1157 movl %esi, %ecx
1158 andl $(PAGE_SIZE - 1), %eax
1159 andl $(PAGE_SIZE - 1), %ecx
1160
1161 xorl %OFFSET_REG, %OFFSET_REG
1162
1163 /* Check which is closer to page cross, s1 or s2. */
1164 cmpl %eax, %ecx
1165 jg L(page_cross_s2)
1166
1167 /* The previous page cross check has false positives. Check for
1168 true positive as page cross logic is very expensive. */
1169 subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
1170 jbe L(no_page_cross)
1171
1172
1173 /* Set r8 to not interfere with normal return value (rdi and rsi
1174 did not swap). */
1fd8c163 1175# ifdef USE_AS_WCSCMP
8418eb3f
NG
        /* Use any non-zero positive value that doesn't interfere with
           0x1.  */
        movl $2, %r8d
# else
        xorl %r8d, %r8d
# endif

        /* Check if less than 1x VEC till page cross.  */
        subl $(VEC_SIZE * 3), %eax
        jg L(less_1x_vec_till_page)


        /* If more than 1x VEC till page cross, loop through safely
           loadable memory until within 1x VEC of page cross.  */
        .p2align 4,, 8
L(page_cross_loop):
        VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
        KMOV %k1, %VRCX
        TESTEQ %VRCX
        jnz L(check_ret_vec_page_cross)
        addl $CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
        cmpq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross)
# endif
        addl $VEC_SIZE, %eax
        jl L(page_cross_loop)

# ifdef USE_AS_WCSCMP
        shrl $2, %eax
# endif


        subl %eax, %OFFSET_REG
        /* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
           to not cross page so is safe to load.  Since we have already
           loaded at least 1 VEC from rsi it is also guaranteed to be
           safe.  */
        VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
        VPTESTM %VMM(0), %VMM(0), %k2
        CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}

        KMOV %k1, %VRCX
# ifdef USE_AS_STRNCMP
        leal CHAR_PER_VEC(%OFFSET_REG64), %eax
        cmpq %rax, %rdx
        jbe L(check_ret_vec_page_cross2)
#  ifdef USE_AS_WCSCMP
        addq $-(CHAR_PER_VEC * 2), %rdx
#  else
        addq %rdi, %rdx
#  endif
# endif
        TESTEQ %VRCX
        jz L(prepare_loop_no_len)

        .p2align 4,, 4
L(ret_vec_page_cross):
# ifndef USE_AS_STRNCMP
L(check_ret_vec_page_cross):
# endif
        tzcnt %VRCX, %VRCX
        addl %OFFSET_REG, %ecx
L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
        xorl %eax, %eax
        cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
        je L(ret12)
        setl %al
        negl %eax
        xorl %r8d, %eax
# else
        movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
        movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %ecx)
        subl %ecx, %eax
        xorl %r8d, %eax
        subl %r8d, %eax
# endif
L(ret12):
        ret


# ifdef USE_AS_STRNCMP
        .p2align 4,, 10
L(check_ret_vec_page_cross2):
        TESTEQ %VRCX
L(check_ret_vec_page_cross):
        tzcnt %VRCX, %VRCX
        addl %OFFSET_REG, %ecx
        cmpq %rcx, %rdx
        ja L(ret_vec_page_cross_cont)
        .p2align 4,, 2
L(ret_zero_page_cross):
        xorl %eax, %eax
        ret
# endif

        .p2align 4,, 4
L(page_cross_s2):
        /* Ensure this is a true page cross.  */
        subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
        jbe L(no_page_cross)


        movl %ecx, %eax
        movq %rdi, %rcx
        movq %rsi, %rdi
        movq %rcx, %rsi

        /* Set r8 to negate the return value, as rdi and rsi are
           swapped.  */
# ifdef USE_AS_WCSCMP
        movl $-4, %r8d
# else
        movl $-1, %r8d
# endif
        xorl %OFFSET_REG, %OFFSET_REG

        /* Check if more than 1x VEC till page cross.  */
        subl $(VEC_SIZE * 3), %eax
        jle L(page_cross_loop)

        .p2align 4,, 6
L(less_1x_vec_till_page):
# ifdef USE_AS_WCSCMP
        shrl $2, %eax
# endif

        /* Find the largest load size we can use.  For VEC_SIZE == 64
           only check whether we can do a full ymm load.  */
# if VEC_SIZE == 64

        cmpl $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
        ja L(less_32_till_page)


        /* Use 32 byte (ymm) comparison.  */
        VMOVU (%rdi), %VMM_256(0)
        VPTESTM %VMM_256(0), %VMM_256(0), %k2
        CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
        kmovd %k1, %ecx
#  ifdef USE_AS_WCSCMP
        subl $0xff, %ecx
#  else
        incl %ecx
#  endif
        jnz L(check_ret_vec_page_cross)
        movl $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
#  ifdef USE_AS_STRNCMP
        cmpq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross_slow_case64)
        subl %eax, %OFFSET_REG
#  else
        /* Explicit check for 32 byte alignment.  */
        subl %eax, %OFFSET_REG
        jz L(prepare_loop)
#  endif
        VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
        VPTESTM %VMM_256(0), %VMM_256(0), %k2
        CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
        kmovd %k1, %ecx
#  ifdef USE_AS_WCSCMP
        subl $0xff, %ecx
#  else
        incl %ecx
#  endif
        jnz L(check_ret_vec_page_cross)
#  ifdef USE_AS_STRNCMP
        addl $(32 / SIZE_OF_CHAR), %OFFSET_REG
        subq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross_slow_case64)
        subq $-(CHAR_PER_VEC * 4), %rdx

        leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  else
        leaq (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  endif
        jmp L(prepare_loop_aligned)

#  ifdef USE_AS_STRNCMP
        .p2align 4,, 2
L(ret_zero_page_cross_slow_case64):
        xorl %eax, %eax
        ret
#  endif
L(less_32_till_page):
# endif

        /* Find the largest load size we can use.  */
        cmpl $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
        ja L(less_16_till_page)

        /* Use 16 byte comparison.  */
        vmovdqu (%rdi), %xmm0
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
        kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
        subl $0xf, %ecx
# else
        incw %cx
# endif
        jnz L(check_ret_vec_page_cross)

        movl $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
#  if VEC_SIZE == 32
        cmpq %OFFSET_REG64, %rdx
#  else
        cmpq $(16 / SIZE_OF_CHAR), %rdx
#  endif
        jbe L(ret_zero_page_cross_slow_case0)
        subl %eax, %OFFSET_REG
# else
        /* Explicit check for 16 byte alignment.  */
        subl %eax, %OFFSET_REG
        jz L(prepare_loop)
# endif
        vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
        kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
        subl $0xf, %ecx
# else
        incw %cx
# endif
        jnz L(check_ret_vec_page_cross)
# ifdef USE_AS_STRNCMP
        addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
        subq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross_slow_case0)
        subq $-(CHAR_PER_VEC * 4), %rdx

        leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
        leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
        jmp L(prepare_loop_aligned)

# ifdef USE_AS_STRNCMP
        .p2align 4,, 2
L(ret_zero_page_cross_slow_case0):
        xorl %eax, %eax
        ret
# endif


        .p2align 4,, 10
L(less_16_till_page):
        cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
        ja L(less_8_till_page)

        /* Use 8 byte comparison.  */
        vmovq (%rdi), %xmm0
        vmovq (%rsi), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
        kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
        subl $0x3, %ecx
# else
        incb %cl
# endif
        jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
        cmpq $(8 / SIZE_OF_CHAR), %rdx
        jbe L(ret_zero_page_cross_slow_case0)
# endif
        movl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
        subl %eax, %OFFSET_REG

        vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
        kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
        subl $0x3, %ecx
# else
        incb %cl
# endif
        jnz L(check_ret_vec_page_cross)


# ifdef USE_AS_STRNCMP
        addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
        subq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross_slow_case0)
        subq $-(CHAR_PER_VEC * 4), %rdx

        leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
        leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
        jmp L(prepare_loop_aligned)




        .p2align 4,, 10
L(less_8_till_page):
# ifdef USE_AS_WCSCMP
        /* If using wchar then this is the only check before we reach
           the page boundary.  */
        movl (%rdi), %eax
        movl (%rsi), %ecx
        cmpl %ecx, %eax
        jnz L(ret_less_8_wcs)
#  ifdef USE_AS_STRNCMP
        addq $-(CHAR_PER_VEC * 2), %rdx
        /* We already checked for len <= 1 so cannot hit that case
           here.  */
#  endif
        testl %eax, %eax
        jnz L(prepare_loop)
        ret

        .p2align 4,, 8
L(ret_less_8_wcs):
        setl %OFFSET_REG8
        negl %OFFSET_REG
        movl %OFFSET_REG, %eax
        xorl %r8d, %eax
        ret

# else
        cmpl $(VEC_SIZE - 4), %eax
        ja L(less_4_till_page)

        vmovd (%rdi), %xmm0
        vmovd (%rsi), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
        kmovd %k1, %ecx
        subl $0xf, %ecx
        jnz L(check_ret_vec_page_cross)

#  ifdef USE_AS_STRNCMP
        cmpq $4, %rdx
        jbe L(ret_zero_page_cross_slow_case1)
#  endif
        movl $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
        subl %eax, %OFFSET_REG

        vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
        vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
        VPTESTM %xmm0, %xmm0, %k2
        CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
        kmovd %k1, %ecx
        subl $0xf, %ecx
        jnz L(check_ret_vec_page_cross)
#  ifdef USE_AS_STRNCMP
        addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
        subq %OFFSET_REG64, %rdx
        jbe L(ret_zero_page_cross_slow_case1)
        subq $-(CHAR_PER_VEC * 4), %rdx

        leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  else
        leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
        leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
#  endif
        jmp L(prepare_loop_aligned)


#  ifdef USE_AS_STRNCMP
        .p2align 4,, 2
L(ret_zero_page_cross_slow_case1):
        xorl %eax, %eax
        ret
#  endif

        .p2align 4,, 10
L(less_4_till_page):
        subq %rdi, %rsi
        /* Extremely slow byte comparison loop.  */
L(less_4_loop):
        movzbl (%rdi), %eax
        movzbl (%rsi, %rdi), %ecx
        TOLOWER_gpr (%rax, %eax)
        TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
        subl %BYTE_LOOP_REG, %eax
        jnz L(ret_less_4_loop)
        testl %ecx, %ecx
        jz L(ret_zero_4_loop)
#  ifdef USE_AS_STRNCMP
        decq %rdx
        jz L(ret_zero_4_loop)
#  endif
        incq %rdi
        /* The end condition is reaching the page boundary (rdi is
           aligned).  */
        testb $(VEC_SIZE - 1), %dil
        jnz L(less_4_loop)
        leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
        addq $-(VEC_SIZE * 4), %rdi
#  ifdef USE_AS_STRNCMP
        subq $-(CHAR_PER_VEC * 4), %rdx
#  endif
        jmp L(prepare_loop_aligned)

L(ret_zero_4_loop):
        xorl %eax, %eax
        ret
L(ret_less_4_loop):
        xorl %r8d, %eax
        subl %r8d, %eax
        ret
# endif
        cfi_endproc
        .size STRCMP, .-STRCMP
#endif