]>
Commit | Line | Data |
---|---|---|
14570163 | 1 | /* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. |
581c785b | 2 | Copyright (C) 2018-2022 Free Software Foundation, Inc. |
14570163 LS |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
5a82c748 | 17 | <https://www.gnu.org/licenses/>. */ |
14570163 | 18 | |
ceabdcd1 NG |
19 | #include <isa-level.h> |
20 | ||
21 | #if ISA_SHOULD_BUILD (3) | |
22 | ||
23 | # ifndef STRCMP_ISA | |
24 | # define STRCMP_ISA _avx2 | |
25 | # endif | |
26 | ||
27 | # include "strcmp-naming.h" | |
14570163 LS |
28 | |
29 | # include <sysdep.h> | |
30 | ||
bbf81222 NG |
31 | # if defined USE_AS_STRCASECMP_L |
32 | # include "locale-defines.h" | |
33 | # endif | |
34 | ||
14570163 LS |
35 | # ifndef STRCMP |
36 | # define STRCMP __strcmp_avx2 | |
37 | # endif | |
38 | ||
39 | # define PAGE_SIZE 4096 | |
40 | ||
b77b06e0 | 41 | /* VEC_SIZE = Number of bytes in a ymm register. */ |
14570163 LS |
42 | # define VEC_SIZE 32 |
43 | ||
b77b06e0 NG |
44 | # define VMOVU vmovdqu |
45 | # define VMOVA vmovdqa | |
14570163 LS |
46 | |
47 | # ifdef USE_AS_WCSCMP | |
b77b06e0 | 48 | /* Compare packed dwords. */ |
14570163 | 49 | # define VPCMPEQ vpcmpeqd |
b77b06e0 | 50 | /* Compare packed dwords and store minimum. */ |
14570163 | 51 | # define VPMINU vpminud |
b77b06e0 | 52 | /* 1 dword char == 4 bytes. */ |
14570163 LS |
53 | # define SIZE_OF_CHAR 4 |
54 | # else | |
b77b06e0 | 55 | /* Compare packed bytes. */ |
14570163 | 56 | # define VPCMPEQ vpcmpeqb |
b77b06e0 | 57 | /* Compare packed bytes and store minimum. */ |
14570163 | 58 | # define VPMINU vpminub |
b77b06e0 | 59 | /* 1 byte char == 1 byte. */ |
14570163 LS |
60 | # define SIZE_OF_CHAR 1 |
61 | # endif | |
62 | ||
b77b06e0 NG |
63 | # ifdef USE_AS_STRNCMP |
64 | # define LOOP_REG r9d | |
65 | # define LOOP_REG64 r9 | |
66 | ||
67 | # define OFFSET_REG8 r9b | |
68 | # define OFFSET_REG r9d | |
69 | # define OFFSET_REG64 r9 | |
70 | # else | |
71 | # define LOOP_REG edx | |
72 | # define LOOP_REG64 rdx | |
73 | ||
74 | # define OFFSET_REG8 dl | |
75 | # define OFFSET_REG edx | |
76 | # define OFFSET_REG64 rdx | |
77 | # endif | |
78 | ||
14570163 LS |
79 | # ifndef VZEROUPPER |
80 | # define VZEROUPPER vzeroupper | |
81 | # endif | |
82 | ||
b77b06e0 NG |
83 | # if defined USE_AS_STRNCMP |
84 | # define VEC_OFFSET 0 | |
85 | # else | |
86 | # define VEC_OFFSET (-VEC_SIZE) | |
87 | # endif | |
88 | ||
bbf81222 NG |
89 | # ifdef USE_AS_STRCASECMP_L |
90 | # define BYTE_LOOP_REG OFFSET_REG | |
91 | # else | |
92 | # define BYTE_LOOP_REG ecx | |
93 | # endif | |
94 | ||
95 | # ifdef USE_AS_STRCASECMP_L | |
96 | # ifdef USE_AS_STRNCMP | |
bbf81222 NG |
97 | # define LOCALE_REG rcx |
98 | # define LOCALE_REG_LP RCX_LP | |
bbf81222 | 99 | # else |
bbf81222 NG |
100 | # define LOCALE_REG rdx |
101 | # define LOCALE_REG_LP RDX_LP | |
bbf81222 NG |
102 | # endif |
103 | # endif | |
104 | ||
b77b06e0 NG |
105 | # define xmmZERO xmm15 |
106 | # define ymmZERO ymm15 | |
107 | ||
bbf81222 NG |
108 | # define LCASE_MIN_ymm %ymm10 |
109 | # define LCASE_MAX_ymm %ymm11 | |
110 | # define CASE_ADD_ymm %ymm12 | |
111 | ||
112 | # define LCASE_MIN_xmm %xmm10 | |
113 | # define LCASE_MAX_xmm %xmm11 | |
114 | # define CASE_ADD_xmm %xmm12 | |
115 | ||
116 | /* r11 is never used elsewhere so this is safe to maintain. */ | |
117 | # define TOLOWER_BASE %r11 | |
118 | ||
7ebba913 L |
119 | # ifndef SECTION |
120 | # define SECTION(p) p##.avx | |
121 | # endif | |
122 | ||
bbf81222 NG |
123 | # ifdef USE_AS_STRCASECMP_L |
124 | # define REG(x, y) x ## y | |
125 | # define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ | |
126 | vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ | |
127 | vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ | |
128 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ | |
129 | vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ | |
130 | vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ | |
131 | vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ | |
132 | vpaddb REG(%ext, 8), reg1_in, reg1_out; \ | |
133 | vpaddb REG(%ext, 9), reg2_in, reg2_out | |
134 | ||
135 | # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst | |
136 | # define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) | |
137 | # define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) | |
138 | ||
139 | # define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ | |
140 | TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ | |
141 | VPCMPEQ scratch_reg, s2_reg, reg_out | |
142 | ||
143 | # define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ | |
144 | VMOVU s2_mem, reg_out; \ | |
145 | CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) | |
146 | ||
147 | # define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) | |
148 | # define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) | |
149 | ||
150 | # define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) | |
151 | # define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) | |
152 | ||
153 | # else | |
154 | # define TOLOWER_gpr(...) | |
155 | # define TOLOWER_ymm(...) | |
156 | # define TOLOWER_xmm(...) | |
157 | ||
158 | # define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ | |
159 | VPCMPEQ s2_reg, s1_reg, reg_out | |
160 | ||
161 | # define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) | |
162 | ||
163 | # define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) | |
164 | # define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) | |
165 | # endif | |
166 | ||
14570163 LS |
167 | /* Warning! |
168 | wcscmp/wcsncmp have to use SIGNED comparison for elements. | |
169 | strcmp/strncmp have to use UNSIGNED comparison for elements. | |
170 | */ | |
171 | ||
172 | /* The main idea of the string comparison (byte or dword) using AVX2 | |
173 | consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on | |
174 | either packed bytes or dwords depending on USE_AS_WCSCMP. In order | |
175 | to check the null char, algorithm keeps the matched bytes/dwords, | |
176 | requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, | |
177 | the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and | |
178 | one VPMINU instructions, together with movdqu and testl instructions. | |
179 | Main loop (away from page boundary) compares 4 vectors at a time, |
180 | effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop. | |
181 | ||
182 | The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic | |
183 | is the same as strcmp, except that a maximum offset is tracked. If |
184 | the maximum offset is reached before a difference is found, zero is | |
185 | returned. */ | |
186 | ||
b77b06e0 | 187 | .section SECTION(.text), "ax", @progbits |
bbf81222 NG |
188 | .align 16 |
189 | .type STRCMP, @function | |
190 | .globl STRCMP | |
bbf81222 | 191 | |
bbf81222 | 192 | # ifdef USE_AS_STRCASECMP_L |
ceabdcd1 | 193 | ENTRY (STRCASECMP) |
bbf81222 NG |
194 | movq __libc_tsd_LOCALE@gottpoff(%rip), %rax |
195 | mov %fs:(%rax), %LOCALE_REG_LP | |
196 | ||
197 | /* Either 1 or 5 bytes (depending if CET is enabled). */ |
198 | .p2align 4 | |
ceabdcd1 | 199 | END (STRCASECMP) |
bbf81222 NG |
200 | /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ |
201 | # endif | |
202 | ||
203 | .p2align 4 | |
204 | STRCMP: | |
205 | cfi_startproc | |
206 | _CET_ENDBR | |
207 | CALL_MCOUNT | |
208 | ||
209 | # if defined USE_AS_STRCASECMP_L | |
210 | /* We have to fall back on the C implementation for locales with | |
211 | encodings not matching ASCII for single bytes. */ | |
212 | # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 | |
213 | mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP | |
214 | # else | |
215 | mov (%LOCALE_REG), %RAX_LP | |
216 | # endif | |
217 | testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) | |
30e57e0a | 218 | jne STRCASECMP_L_NONASCII |
bbf81222 NG |
219 | leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE |
220 | # endif | |
221 | ||
14570163 | 222 | # ifdef USE_AS_STRNCMP |
bbf81222 NG |
223 | /* Don't overwrite LOCALE_REG (rcx) until we have passed |
224 | L(one_or_less). Otherwise we might use the wrong locale in | |
225 | the OVERFLOW_STRCMP (strcasecmp_l). */ | |
b77b06e0 NG |
226 | # ifdef __ILP32__ |
227 | /* Clear the upper 32 bits. */ | |
c15efd01 | 228 | movl %edx, %edx |
b77b06e0 | 229 | # endif |
ee915088 | 230 | cmp $1, %RDX_LP |
b77b06e0 NG |
231 | /* Signed comparison intentional. We use this branch to also |
232 | test cases where length >= 2^63. These very large sizes can be | |
233 | handled with strcmp as there is no way for that length to | |
234 | actually bound the buffer. */ | |
235 | jle L(one_or_less) | |
14570163 | 236 | # ifdef USE_AS_WCSCMP |
ddf0992c | 237 | movq %rdx, %rcx |
b77b06e0 NG |
238 | |
239 | /* Multiplying length by sizeof(wchar_t) can result in overflow. | |
240 | Check if that is possible. All cases where overflow are possible | |
241 | are cases where length is large enough that it can never be a | |
242 | bound on valid memory so just use wcscmp. */ | |
ddf0992c | 243 | shrq $56, %rcx |
9fef7039 | 244 | jnz OVERFLOW_STRCMP |
b77b06e0 NG |
245 | |
246 | leaq (, %rdx, 4), %rdx | |
ddf0992c | 247 | # endif |
14570163 | 248 | # endif |
b77b06e0 | 249 | vpxor %xmmZERO, %xmmZERO, %xmmZERO |
bbf81222 NG |
250 | # if defined USE_AS_STRCASECMP_L |
251 | .section .rodata.cst32, "aM", @progbits, 32 | |
252 | .align 32 | |
253 | L(lcase_min): | |
254 | .quad 0x3f3f3f3f3f3f3f3f | |
255 | .quad 0x3f3f3f3f3f3f3f3f | |
256 | .quad 0x3f3f3f3f3f3f3f3f | |
257 | .quad 0x3f3f3f3f3f3f3f3f | |
258 | L(lcase_max): | |
259 | .quad 0x9999999999999999 | |
260 | .quad 0x9999999999999999 | |
261 | .quad 0x9999999999999999 | |
262 | .quad 0x9999999999999999 | |
263 | L(case_add): | |
264 | .quad 0x2020202020202020 | |
265 | .quad 0x2020202020202020 | |
266 | .quad 0x2020202020202020 | |
267 | .quad 0x2020202020202020 | |
268 | .previous | |
269 | ||
270 | vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm | |
271 | vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm | |
272 | vmovdqa L(case_add)(%rip), CASE_ADD_ymm | |
273 | # endif | |
14570163 | 274 | movl %edi, %eax |
14570163 | 275 | orl %esi, %eax |
b77b06e0 NG |
276 | sall $20, %eax |
277 | /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ | |
278 | cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax | |
279 | ja L(page_cross) | |
280 | ||
281 | L(no_page_cross): | |
282 | /* Safe to compare 4x vectors. */ | |
283 | VMOVU (%rdi), %ymm0 | |
bbf81222 NG |
284 | /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. |
285 | Otherwise converts ymm0 and load from rsi to lower. ymm2 is | |
286 | scratch and ymm1 is the return. */ | |
287 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) | |
b77b06e0 NG |
288 | /* 1s at null CHAR. */ |
289 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 | |
290 | /* 1s where s1 and s2 equal AND not null CHAR. */ | |
291 | vpandn %ymm1, %ymm2, %ymm1 | |
292 | ||
293 | /* All 1s -> keep going, any 0s -> return. */ | |
294 | vpmovmskb %ymm1, %ecx | |
14570163 | 295 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
296 | cmpq $VEC_SIZE, %rdx |
297 | jbe L(vec_0_test_len) | |
14570163 | 298 | # endif |
b77b06e0 NG |
299 | |
300 | /* All 1s represents all equals. incl will overflow to zero in | |
301 | all equals case. Otherwise 1s will carry until position of first | |
302 | mismatch. */ | |
303 | incl %ecx | |
304 | jz L(more_3x_vec) | |
305 | ||
306 | .p2align 4,, 4 | |
307 | L(return_vec_0): | |
308 | tzcntl %ecx, %ecx | |
14570163 | 309 | # ifdef USE_AS_WCSCMP |
b77b06e0 | 310 | movl (%rdi, %rcx), %edx |
14570163 | 311 | xorl %eax, %eax |
b77b06e0 NG |
312 | cmpl (%rsi, %rcx), %edx |
313 | je L(ret0) | |
14570163 LS |
314 | setl %al |
315 | negl %eax | |
316 | orl $1, %eax | |
14570163 | 317 | # else |
b77b06e0 NG |
318 | movzbl (%rdi, %rcx), %eax |
319 | movzbl (%rsi, %rcx), %ecx | |
bbf81222 NG |
320 | TOLOWER_gpr (%rax, %eax) |
321 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 | 322 | subl %ecx, %eax |
14570163 | 323 | # endif |
b77b06e0 | 324 | L(ret0): |
7ebba913 L |
325 | L(return_vzeroupper): |
326 | ZERO_UPPER_VEC_REGISTERS_RETURN | |
14570163 | 327 | |
14570163 | 328 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
329 | .p2align 4,, 8 |
330 | L(vec_0_test_len): | |
331 | notl %ecx | |
332 | bzhil %edx, %ecx, %eax | |
333 | jnz L(return_vec_0) | |
334 | /* Align if will cross fetch block. */ | |
335 | .p2align 4,, 2 | |
336 | L(ret_zero): | |
14570163 | 337 | xorl %eax, %eax |
b77b06e0 NG |
338 | VZEROUPPER_RETURN |
339 | ||
340 | .p2align 4,, 5 | |
341 | L(one_or_less): | |
bbf81222 NG |
342 | # ifdef USE_AS_STRCASECMP_L |
343 | /* Set locale argument for strcasecmp. */ | |
344 | movq %LOCALE_REG, %rdx | |
345 | # endif | |
b77b06e0 | 346 | jb L(ret_zero) |
b77b06e0 NG |
347 | /* 'nbe' covers the case where length is negative (large |
348 | unsigned). */ | |
c6272098 NG |
349 | jnbe OVERFLOW_STRCMP |
350 | # ifdef USE_AS_WCSCMP | |
b77b06e0 | 351 | movl (%rdi), %edx |
14570163 | 352 | xorl %eax, %eax |
b77b06e0 NG |
353 | cmpl (%rsi), %edx |
354 | je L(ret1) | |
355 | setl %al | |
356 | negl %eax | |
357 | orl $1, %eax | |
14570163 | 358 | # else |
b77b06e0 NG |
359 | movzbl (%rdi), %eax |
360 | movzbl (%rsi), %ecx | |
bbf81222 NG |
361 | TOLOWER_gpr (%rax, %eax) |
362 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 | 363 | subl %ecx, %eax |
14570163 | 364 | # endif |
b77b06e0 NG |
365 | L(ret1): |
366 | ret | |
14570163 | 367 | # endif |
14570163 | 368 | |
b77b06e0 NG |
369 | .p2align 4,, 10 |
370 | L(return_vec_1): | |
371 | tzcntl %ecx, %ecx | |
14570163 | 372 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
373 | /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of |
374 | overflow. */ | |
375 | addq $-VEC_SIZE, %rdx | |
376 | cmpq %rcx, %rdx | |
377 | jbe L(ret_zero) | |
378 | # endif | |
379 | # ifdef USE_AS_WCSCMP | |
380 | movl VEC_SIZE(%rdi, %rcx), %edx | |
14570163 | 381 | xorl %eax, %eax |
b77b06e0 NG |
382 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
383 | je L(ret2) | |
384 | setl %al | |
385 | negl %eax | |
386 | orl $1, %eax | |
14570163 | 387 | # else |
b77b06e0 NG |
388 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
389 | movzbl VEC_SIZE(%rsi, %rcx), %ecx | |
bbf81222 NG |
390 | TOLOWER_gpr (%rax, %eax) |
391 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 | 392 | subl %ecx, %eax |
14570163 | 393 | # endif |
b77b06e0 | 394 | L(ret2): |
7ebba913 | 395 | VZEROUPPER_RETURN |
14570163 | 396 | |
b77b06e0 | 397 | .p2align 4,, 10 |
14570163 | 398 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
399 | L(return_vec_3): |
400 | salq $32, %rcx | |
401 | # endif | |
402 | ||
403 | L(return_vec_2): | |
404 | # ifndef USE_AS_STRNCMP | |
405 | tzcntl %ecx, %ecx | |
406 | # else | |
407 | tzcntq %rcx, %rcx | |
408 | cmpq %rcx, %rdx | |
409 | jbe L(ret_zero) | |
410 | # endif | |
411 | ||
412 | # ifdef USE_AS_WCSCMP | |
413 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx | |
14570163 | 414 | xorl %eax, %eax |
b77b06e0 NG |
415 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
416 | je L(ret3) | |
417 | setl %al | |
418 | negl %eax | |
419 | orl $1, %eax | |
14570163 | 420 | # else |
b77b06e0 NG |
421 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
422 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx | |
bbf81222 NG |
423 | TOLOWER_gpr (%rax, %eax) |
424 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
425 | subl %ecx, %eax |
426 | # endif | |
427 | L(ret3): | |
428 | VZEROUPPER_RETURN | |
429 | ||
430 | # ifndef USE_AS_STRNCMP | |
431 | .p2align 4,, 10 | |
432 | L(return_vec_3): | |
433 | tzcntl %ecx, %ecx | |
14570163 | 434 | # ifdef USE_AS_WCSCMP |
b77b06e0 | 435 | movl (VEC_SIZE * 3)(%rdi, %rcx), %edx |
14570163 | 436 | xorl %eax, %eax |
b77b06e0 NG |
437 | cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx |
438 | je L(ret4) | |
439 | setl %al | |
440 | negl %eax | |
441 | orl $1, %eax | |
14570163 | 442 | # else |
b77b06e0 NG |
443 | movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax |
444 | movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx | |
bbf81222 NG |
445 | TOLOWER_gpr (%rax, %eax) |
446 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 | 447 | subl %ecx, %eax |
14570163 | 448 | # endif |
b77b06e0 | 449 | L(ret4): |
7ebba913 | 450 | VZEROUPPER_RETURN |
b77b06e0 NG |
451 | # endif |
452 | ||
453 | .p2align 4,, 10 | |
454 | L(more_3x_vec): | |
455 | /* Safe to compare 4x vectors. */ | |
456 | VMOVU VEC_SIZE(%rdi), %ymm0 | |
bbf81222 | 457 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
b77b06e0 NG |
458 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
459 | vpandn %ymm1, %ymm2, %ymm1 | |
460 | vpmovmskb %ymm1, %ecx | |
461 | incl %ecx | |
462 | jnz L(return_vec_1) | |
463 | ||
464 | # ifdef USE_AS_STRNCMP | |
465 | subq $(VEC_SIZE * 2), %rdx | |
466 | jbe L(ret_zero) | |
467 | # endif | |
468 | ||
469 | VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 | |
bbf81222 | 470 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) |
b77b06e0 NG |
471 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
472 | vpandn %ymm1, %ymm2, %ymm1 | |
473 | vpmovmskb %ymm1, %ecx | |
474 | incl %ecx | |
475 | jnz L(return_vec_2) | |
476 | ||
477 | VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 | |
bbf81222 | 478 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) |
b77b06e0 NG |
479 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
480 | vpandn %ymm1, %ymm2, %ymm1 | |
481 | vpmovmskb %ymm1, %ecx | |
482 | incl %ecx | |
483 | jnz L(return_vec_3) | |
14570163 | 484 | |
14570163 | 485 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
486 | cmpq $(VEC_SIZE * 2), %rdx |
487 | jbe L(ret_zero) | |
488 | # endif | |
489 | ||
490 | # ifdef USE_AS_WCSCMP | |
491 | /* any non-zero positive value that doesn't interfere with 0x1. |
14570163 | 492 | */ |
b77b06e0 | 493 | movl $2, %r8d |
14570163 | 494 | |
b77b06e0 NG |
495 | # else |
496 | xorl %r8d, %r8d | |
497 | # endif | |
498 | ||
499 | /* The prepare labels are various entry points from the page | |
500 | cross logic. */ | |
501 | L(prepare_loop): | |
502 | ||
503 | # ifdef USE_AS_STRNCMP | |
504 | /* Store N + (VEC_SIZE * 4) and place check at the beginning of |
505 | the loop. */ | |
506 | leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx | |
507 | # endif | |
508 | L(prepare_loop_no_len): | |
509 | ||
510 | /* Align s1 and adjust s2 accordingly. */ | |
511 | subq %rdi, %rsi | |
512 | andq $-(VEC_SIZE * 4), %rdi | |
513 | addq %rdi, %rsi | |
514 | ||
515 | # ifdef USE_AS_STRNCMP | |
516 | subq %rdi, %rdx | |
517 | # endif | |
518 | ||
519 | L(prepare_loop_aligned): | |
520 | /* eax stores distance from rsi to next page cross. These cases | |
521 | need to be handled specially as the 4x loop could potentially | |
522 | read memory past the length of s1 or s2 and across a page | |
523 | boundary. */ | |
524 | movl $-(VEC_SIZE * 4), %eax | |
525 | subl %esi, %eax | |
526 | andl $(PAGE_SIZE - 1), %eax | |
527 | ||
528 | /* Loop 4x comparisons at a time. */ | |
14570163 LS |
529 | .p2align 4 |
530 | L(loop): | |
b77b06e0 NG |
531 | |
532 | /* End condition for strncmp. */ | |
14570163 | 533 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
534 | subq $(VEC_SIZE * 4), %rdx |
535 | jbe L(ret_zero) | |
536 | # endif | |
537 | ||
538 | subq $-(VEC_SIZE * 4), %rdi | |
539 | subq $-(VEC_SIZE * 4), %rsi | |
540 | ||
541 | /* Check if rsi loads will cross a page boundary. */ | |
542 | addl $-(VEC_SIZE * 4), %eax | |
543 | jnb L(page_cross_during_loop) | |
544 | ||
545 | /* Loop entry after handling page cross during loop. */ | |
546 | L(loop_skip_page_cross_check): | |
547 | VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 | |
548 | VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 | |
549 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 | |
550 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 | |
551 | ||
552 | /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ | |
bbf81222 NG |
553 | CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) |
554 | CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) | |
555 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) | |
556 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) | |
b77b06e0 NG |
557 | |
558 | /* If any mismatches or null CHAR then 0 CHAR, otherwise non- | |
559 | zero. */ | |
560 | vpand %ymm0, %ymm1, %ymm1 | |
561 | ||
562 | ||
563 | vpand %ymm2, %ymm3, %ymm3 | |
564 | vpand %ymm4, %ymm5, %ymm5 | |
565 | vpand %ymm6, %ymm7, %ymm7 | |
566 | ||
567 | VPMINU %ymm1, %ymm3, %ymm3 | |
568 | VPMINU %ymm5, %ymm7, %ymm7 | |
569 | ||
570 | /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ | |
571 | VPMINU %ymm3, %ymm7, %ymm7 | |
572 | ||
573 | /* If any 0 CHAR then done. */ | |
574 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 | |
575 | vpmovmskb %ymm7, %LOOP_REG | |
576 | testl %LOOP_REG, %LOOP_REG | |
577 | jz L(loop) | |
578 | ||
579 | /* Find which VEC has the mismatch of end of string. */ | |
580 | VPCMPEQ %ymm1, %ymmZERO, %ymm1 | |
581 | vpmovmskb %ymm1, %ecx | |
14570163 | 582 | testl %ecx, %ecx |
b77b06e0 NG |
583 | jnz L(return_vec_0_end) |
584 | ||
585 | ||
586 | VPCMPEQ %ymm3, %ymmZERO, %ymm3 | |
587 | vpmovmskb %ymm3, %ecx | |
588 | testl %ecx, %ecx | |
589 | jnz L(return_vec_1_end) | |
590 | ||
591 | L(return_vec_2_3_end): | |
14570163 | 592 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
593 | subq $(VEC_SIZE * 2), %rdx |
594 | jbe L(ret_zero_end) | |
595 | # endif | |
596 | ||
597 | VPCMPEQ %ymm5, %ymmZERO, %ymm5 | |
598 | vpmovmskb %ymm5, %ecx | |
599 | testl %ecx, %ecx | |
600 | jnz L(return_vec_2_end) | |
601 | ||
602 | /* LOOP_REG contains matches for null/mismatch from the loop. If | |
603 | VEC 0,1,and 2 all have no null and no mismatches then mismatch | |
604 | must entirely be from VEC 3 which is fully represented by | |
605 | LOOP_REG. */ | |
606 | tzcntl %LOOP_REG, %LOOP_REG | |
607 | ||
608 | # ifdef USE_AS_STRNCMP | |
609 | subl $-(VEC_SIZE), %LOOP_REG | |
610 | cmpq %LOOP_REG64, %rdx | |
611 | jbe L(ret_zero_end) | |
612 | # endif | |
613 | ||
614 | # ifdef USE_AS_WCSCMP | |
615 | movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx | |
14570163 | 616 | xorl %eax, %eax |
b77b06e0 NG |
617 | cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx |
618 | je L(ret5) | |
619 | setl %al | |
620 | negl %eax | |
621 | xorl %r8d, %eax | |
14570163 | 622 | # else |
b77b06e0 NG |
623 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax |
624 | movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx | |
bbf81222 NG |
625 | TOLOWER_gpr (%rax, %eax) |
626 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
627 | subl %ecx, %eax |
628 | xorl %r8d, %eax | |
629 | subl %r8d, %eax | |
14570163 | 630 | # endif |
b77b06e0 | 631 | L(ret5): |
7ebba913 | 632 | VZEROUPPER_RETURN |
14570163 | 633 | |
14570163 | 634 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
635 | .p2align 4,, 2 |
636 | L(ret_zero_end): | |
637 | xorl %eax, %eax | |
638 | VZEROUPPER_RETURN | |
14570163 | 639 | # endif |
b77b06e0 NG |
640 | |
641 | ||
642 | /* The L(return_vec_N_end) differ from L(return_vec_N) in that | |
643 | they use the value of `r8` to negate the return value. This is | |
644 | because the page cross logic can swap `rdi` and `rsi`. */ | |
645 | .p2align 4,, 10 | |
14570163 | 646 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
647 | L(return_vec_1_end): |
648 | salq $32, %rcx | |
649 | # endif | |
650 | L(return_vec_0_end): | |
651 | # ifndef USE_AS_STRNCMP | |
652 | tzcntl %ecx, %ecx | |
653 | # else | |
654 | tzcntq %rcx, %rcx | |
655 | cmpq %rcx, %rdx | |
656 | jbe L(ret_zero_end) | |
657 | # endif | |
658 | ||
659 | # ifdef USE_AS_WCSCMP | |
660 | movl (%rdi, %rcx), %edx | |
14570163 | 661 | xorl %eax, %eax |
b77b06e0 NG |
662 | cmpl (%rsi, %rcx), %edx |
663 | je L(ret6) | |
664 | setl %al | |
665 | negl %eax | |
666 | xorl %r8d, %eax | |
14570163 | 667 | # else |
b77b06e0 NG |
668 | movzbl (%rdi, %rcx), %eax |
669 | movzbl (%rsi, %rcx), %ecx | |
bbf81222 NG |
670 | TOLOWER_gpr (%rax, %eax) |
671 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
672 | subl %ecx, %eax |
673 | xorl %r8d, %eax | |
674 | subl %r8d, %eax | |
675 | # endif | |
676 | L(ret6): | |
677 | VZEROUPPER_RETURN | |
678 | ||
679 | # ifndef USE_AS_STRNCMP | |
680 | .p2align 4,, 10 | |
681 | L(return_vec_1_end): | |
682 | tzcntl %ecx, %ecx | |
14570163 | 683 | # ifdef USE_AS_WCSCMP |
b77b06e0 | 684 | movl VEC_SIZE(%rdi, %rcx), %edx |
14570163 | 685 | xorl %eax, %eax |
b77b06e0 NG |
686 | cmpl VEC_SIZE(%rsi, %rcx), %edx |
687 | je L(ret7) | |
688 | setl %al | |
689 | negl %eax | |
690 | xorl %r8d, %eax | |
14570163 | 691 | # else |
b77b06e0 NG |
692 | movzbl VEC_SIZE(%rdi, %rcx), %eax |
693 | movzbl VEC_SIZE(%rsi, %rcx), %ecx | |
bbf81222 NG |
694 | TOLOWER_gpr (%rax, %eax) |
695 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
696 | subl %ecx, %eax |
697 | xorl %r8d, %eax | |
698 | subl %r8d, %eax | |
14570163 | 699 | # endif |
b77b06e0 | 700 | L(ret7): |
7ebba913 | 701 | VZEROUPPER_RETURN |
b77b06e0 | 702 | # endif |
14570163 | 703 | |
b77b06e0 NG |
704 | .p2align 4,, 10 |
705 | L(return_vec_2_end): | |
706 | tzcntl %ecx, %ecx | |
14570163 | 707 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
708 | cmpq %rcx, %rdx |
709 | jbe L(ret_zero_page_cross) | |
14570163 | 710 | # endif |
b77b06e0 NG |
711 | # ifdef USE_AS_WCSCMP |
712 | movl (VEC_SIZE * 2)(%rdi, %rcx), %edx | |
14570163 | 713 | xorl %eax, %eax |
b77b06e0 NG |
714 | cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx |
715 | je L(ret11) | |
716 | setl %al | |
717 | negl %eax | |
718 | xorl %r8d, %eax | |
14570163 | 719 | # else |
b77b06e0 NG |
720 | movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax |
721 | movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx | |
bbf81222 NG |
722 | TOLOWER_gpr (%rax, %eax) |
723 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
724 | subl %ecx, %eax |
725 | xorl %r8d, %eax | |
726 | subl %r8d, %eax | |
14570163 | 727 | # endif |
b77b06e0 | 728 | L(ret11): |
7ebba913 | 729 | VZEROUPPER_RETURN |
14570163 | 730 | |
b77b06e0 NG |
731 | |
732 | /* Page cross in rsi in next 4x VEC. */ | |
733 | ||
734 | /* TODO: Improve logic here. */ | |
735 | .p2align 4,, 10 | |
736 | L(page_cross_during_loop): | |
737 | /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ | |
738 | ||
739 | /* Optimistically rsi and rdi are both aligned, in which case we |
740 | don't need any logic here. */ | |
741 | cmpl $-(VEC_SIZE * 4), %eax | |
742 | /* Don't adjust eax before jumping back to loop and we will | |
743 | never hit page cross case again. */ | |
744 | je L(loop_skip_page_cross_check) | |
745 | ||
746 | /* Check if we can safely load a VEC. */ | |
747 | cmpl $-(VEC_SIZE * 3), %eax | |
748 | jle L(less_1x_vec_till_page_cross) | |
749 | ||
750 | VMOVA (%rdi), %ymm0 | |
bbf81222 | 751 | CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) |
b77b06e0 NG |
752 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
753 | vpandn %ymm1, %ymm2, %ymm1 | |
754 | vpmovmskb %ymm1, %ecx | |
755 | incl %ecx | |
756 | jnz L(return_vec_0_end) | |
757 | ||
758 | /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ | |
759 | cmpl $-(VEC_SIZE * 2), %eax | |
760 | jg L(more_2x_vec_till_page_cross) | |
761 | ||
762 | .p2align 4,, 4 | |
763 | L(less_1x_vec_till_page_cross): | |
764 | subl $-(VEC_SIZE * 4), %eax | |
765 | /* Guaranteed safe to read from rdi - VEC_SIZE here. The only |
766 | concerning case is first iteration if incoming s1 was near start | |
767 | of a page and s2 near end. If s1 was near the start of the page | |
768 | we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe |
769 | to read back -VEC_SIZE. If rdi is truly at the start of a page | |
770 | here, it means the previous page (rdi - VEC_SIZE) has already | |
771 | been loaded earlier so must be valid. */ | |
772 | VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 | |
bbf81222 | 773 | CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) |
b77b06e0 NG |
774 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
775 | vpandn %ymm1, %ymm2, %ymm1 | |
776 | vpmovmskb %ymm1, %ecx | |
777 | ||
778 | /* Mask of potentially valid bits. The lower bits can be out of | |
779 | range comparisons (but safe regarding page crosses). */ | |
780 | movl $-1, %r10d | |
781 | shlxl %esi, %r10d, %r10d | |
782 | notl %ecx | |
783 | ||
14570163 | 784 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
785 | cmpq %rax, %rdx |
786 | jbe L(return_page_cross_end_check) | |
787 | # endif | |
788 | movl %eax, %OFFSET_REG | |
789 | addl $(PAGE_SIZE - VEC_SIZE * 4), %eax | |
790 | ||
791 | andl %r10d, %ecx | |
792 | jz L(loop_skip_page_cross_check) | |
793 | ||
794 | .p2align 4,, 3 | |
795 | L(return_page_cross_end): | |
796 | tzcntl %ecx, %ecx | |
797 | ||
14570163 | 798 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
799 | leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx |
800 | L(return_page_cross_cmp_mem): | |
14570163 | 801 | # else |
b77b06e0 NG |
802 | addl %OFFSET_REG, %ecx |
803 | # endif | |
804 | # ifdef USE_AS_WCSCMP | |
805 | movl VEC_OFFSET(%rdi, %rcx), %edx | |
14570163 | 806 | xorl %eax, %eax |
b77b06e0 NG |
807 | cmpl VEC_OFFSET(%rsi, %rcx), %edx |
808 | je L(ret8) | |
809 | setl %al | |
810 | negl %eax | |
811 | xorl %r8d, %eax | |
812 | # else | |
813 | movzbl VEC_OFFSET(%rdi, %rcx), %eax | |
814 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx | |
bbf81222 NG |
815 | TOLOWER_gpr (%rax, %eax) |
816 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
817 | subl %ecx, %eax |
818 | xorl %r8d, %eax | |
819 | subl %r8d, %eax | |
14570163 | 820 | # endif |
b77b06e0 | 821 | L(ret8): |
7ebba913 | 822 | VZEROUPPER_RETURN |
14570163 | 823 | |
14570163 | 824 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
825 | .p2align 4,, 10 |
826 | L(return_page_cross_end_check): | |
e108c02a | 827 | andl %r10d, %ecx |
b77b06e0 NG |
828 | tzcntl %ecx, %ecx |
829 | leal -VEC_SIZE(%rax, %rcx), %ecx | |
830 | cmpl %ecx, %edx | |
831 | ja L(return_page_cross_cmp_mem) | |
14570163 | 832 | xorl %eax, %eax |
7ebba913 | 833 | VZEROUPPER_RETURN |
b77b06e0 | 834 | # endif |
14570163 | 835 | |
14570163 | 836 | |
b77b06e0 NG |
837 | .p2align 4,, 10 |
838 | L(more_2x_vec_till_page_cross): | |
839 | /* If more 2x vec till cross we will complete a full loop | |
840 | iteration here. */ | |
841 | ||
842 | VMOVU VEC_SIZE(%rdi), %ymm0 | |
bbf81222 | 843 | CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) |
b77b06e0 NG |
844 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
845 | vpandn %ymm1, %ymm2, %ymm1 | |
846 | vpmovmskb %ymm1, %ecx | |
847 | incl %ecx | |
848 | jnz L(return_vec_1_end) | |
849 | ||
75870237 | 850 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
851 | cmpq $(VEC_SIZE * 2), %rdx |
852 | jbe L(ret_zero_in_loop_page_cross) | |
75870237 | 853 | # endif |
b77b06e0 NG |
854 | |
855 | subl $-(VEC_SIZE * 4), %eax | |
856 | ||
857 | /* Safe to include comparisons from lower bytes. */ | |
858 | VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 | |
bbf81222 | 859 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) |
b77b06e0 NG |
860 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
861 | vpandn %ymm1, %ymm2, %ymm1 | |
862 | vpmovmskb %ymm1, %ecx | |
863 | incl %ecx | |
864 | jnz L(return_vec_page_cross_0) | |
865 | ||
866 | VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 | |
bbf81222 | 867 | CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) |
b77b06e0 NG |
868 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
869 | vpandn %ymm1, %ymm2, %ymm1 | |
870 | vpmovmskb %ymm1, %ecx | |
871 | incl %ecx | |
872 | jnz L(return_vec_page_cross_1) | |
873 | ||
14570163 | 874 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
875 | /* Must check length here as length might preclude reading next |
876 | page. */ | |
877 | cmpq %rax, %rdx | |
878 | jbe L(ret_zero_in_loop_page_cross) | |
879 | # endif | |
880 | ||
881 | /* Finish the loop. */ | |
882 | VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 | |
883 | VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 | |
884 | ||
bbf81222 NG |
885 | CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) |
886 | CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) | |
b77b06e0 NG |
887 | vpand %ymm4, %ymm5, %ymm5 |
888 | vpand %ymm6, %ymm7, %ymm7 | |
889 | VPMINU %ymm5, %ymm7, %ymm7 | |
890 | VPCMPEQ %ymm7, %ymmZERO, %ymm7 | |
891 | vpmovmskb %ymm7, %LOOP_REG | |
892 | testl %LOOP_REG, %LOOP_REG | |
893 | jnz L(return_vec_2_3_end) | |
894 | ||
895 | /* Best for code size to include ucond-jmp here. Would be faster | |
896 | if this case is hot to duplicate the L(return_vec_2_3_end) code | |
897 | as fall-through and have jump back to loop on mismatch | |
898 | comparison. */ | |
899 | subq $-(VEC_SIZE * 4), %rdi | |
900 | subq $-(VEC_SIZE * 4), %rsi | |
901 | addl $(PAGE_SIZE - VEC_SIZE * 8), %eax | |
902 | # ifdef USE_AS_STRNCMP | |
903 | subq $(VEC_SIZE * 4), %rdx | |
904 | ja L(loop_skip_page_cross_check) | |
905 | L(ret_zero_in_loop_page_cross): | |
14570163 | 906 | xorl %eax, %eax |
b77b06e0 | 907 | VZEROUPPER_RETURN |
14570163 | 908 | # else |
b77b06e0 | 909 | jmp L(loop_skip_page_cross_check) |
14570163 | 910 | # endif |
14570163 | 911 | |
b77b06e0 NG |
912 | |
913 | .p2align 4,, 10 | |
914 | L(return_vec_page_cross_0): | |
915 | addl $-VEC_SIZE, %eax | |
916 | L(return_vec_page_cross_1): | |
917 | tzcntl %ecx, %ecx | |
75870237 | 918 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
919 | leal -VEC_SIZE(%rax, %rcx), %ecx |
920 | cmpq %rcx, %rdx | |
921 | jbe L(ret_zero_in_loop_page_cross) | |
922 | # else | |
923 | addl %eax, %ecx | |
75870237 SP |
924 | # endif |
925 | ||
14570163 | 926 | # ifdef USE_AS_WCSCMP |
b77b06e0 NG |
927 | movl VEC_OFFSET(%rdi, %rcx), %edx |
928 | xorl %eax, %eax | |
929 | cmpl VEC_OFFSET(%rsi, %rcx), %edx | |
930 | je L(ret9) | |
931 | setl %al | |
932 | negl %eax | |
933 | xorl %r8d, %eax | |
14570163 | 934 | # else |
b77b06e0 NG |
935 | movzbl VEC_OFFSET(%rdi, %rcx), %eax |
936 | movzbl VEC_OFFSET(%rsi, %rcx), %ecx | |
bbf81222 NG |
937 | TOLOWER_gpr (%rax, %eax) |
938 | TOLOWER_gpr (%rcx, %ecx) | |
14570163 | 939 | subl %ecx, %eax |
b77b06e0 NG |
940 | xorl %r8d, %eax |
941 | subl %r8d, %eax | |
14570163 | 942 | # endif |
b77b06e0 NG |
943 | L(ret9): |
944 | VZEROUPPER_RETURN | |
945 | ||
946 | ||
947 | .p2align 4,, 10 | |
948 | L(page_cross): | |
949 | # ifndef USE_AS_STRNCMP | |
950 | /* If both are VEC aligned we don't need any special logic here. | |
951 | Only valid for strcmp where stop condition is guranteed to be | |
952 | reachable by just reading memory. */ | |
953 | testl $((VEC_SIZE - 1) << 20), %eax | |
954 | jz L(no_page_cross) | |
14570163 | 955 | # endif |
b77b06e0 NG |
956 | |
957 | movl %edi, %eax | |
958 | movl %esi, %ecx | |
959 | andl $(PAGE_SIZE - 1), %eax | |
960 | andl $(PAGE_SIZE - 1), %ecx | |
961 | ||
962 | xorl %OFFSET_REG, %OFFSET_REG | |
963 | ||
964 | /* Check which is closer to page cross, s1 or s2. */ | |
965 | cmpl %eax, %ecx | |
966 | jg L(page_cross_s2) | |
967 | ||
968 | /* The previous page cross check has false positives. Check for | |
969 | true positive as page cross logic is very expensive. */ | |
970 | subl $(PAGE_SIZE - VEC_SIZE * 4), %eax | |
971 | jbe L(no_page_cross) | |
972 | ||
973 | /* Set r8 to not interfere with normal return value (rdi and rsi | |
974 | did not swap). */ | |
14570163 | 975 | # ifdef USE_AS_WCSCMP |
b77b06e0 NG |
976 | /* any non-zero positive value that doesn't inference with 0x1. |
977 | */ | |
978 | movl $2, %r8d | |
14570163 | 979 | # else |
b77b06e0 | 980 | xorl %r8d, %r8d |
14570163 | 981 | # endif |
b77b06e0 NG |
982 | |
983 | /* Check if less than 1x VEC till page cross. */ | |
984 | subl $(VEC_SIZE * 3), %eax | |
985 | jg L(less_1x_vec_till_page) | |
986 | ||
987 | /* If more than 1x VEC till page cross, loop throuh safely | |
988 | loadable memory until within 1x VEC of page cross. */ | |
989 | ||
990 | .p2align 4,, 10 | |
991 | L(page_cross_loop): | |
992 | ||
993 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 | |
bbf81222 | 994 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
b77b06e0 NG |
995 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
996 | vpandn %ymm1, %ymm2, %ymm1 | |
997 | vpmovmskb %ymm1, %ecx | |
998 | incl %ecx | |
999 | ||
1000 | jnz L(check_ret_vec_page_cross) | |
1001 | addl $VEC_SIZE, %OFFSET_REG | |
1002 | # ifdef USE_AS_STRNCMP | |
1003 | cmpq %OFFSET_REG64, %rdx | |
1004 | jbe L(ret_zero_page_cross) | |
14570163 | 1005 | # endif |
b77b06e0 NG |
1006 | addl $VEC_SIZE, %eax |
1007 | jl L(page_cross_loop) | |
1008 | ||
1009 | subl %eax, %OFFSET_REG | |
1010 | /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed | |
1011 | to not cross page so is safe to load. Since we have already | |
bbf81222 NG |
1012 | loaded at least 1 VEC from rsi it is also guranteed to be |
1013 | safe. */ | |
b77b06e0 NG |
1014 | |
1015 | VMOVU (%rdi, %OFFSET_REG64), %ymm0 | |
bbf81222 | 1016 | CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) |
b77b06e0 NG |
1017 | VPCMPEQ %ymm0, %ymmZERO, %ymm2 |
1018 | vpandn %ymm1, %ymm2, %ymm1 | |
1019 | vpmovmskb %ymm1, %ecx | |
1020 | ||
1021 | # ifdef USE_AS_STRNCMP | |
1022 | leal VEC_SIZE(%OFFSET_REG64), %eax | |
1023 | cmpq %rax, %rdx | |
1024 | jbe L(check_ret_vec_page_cross2) | |
1025 | addq %rdi, %rdx | |
1026 | # endif | |
1027 | incl %ecx | |
1028 | jz L(prepare_loop_no_len) | |
14570163 | 1029 | |
b77b06e0 NG |
1030 | .p2align 4,, 4 |
1031 | L(ret_vec_page_cross): | |
1032 | # ifndef USE_AS_STRNCMP | |
1033 | L(check_ret_vec_page_cross): | |
1034 | # endif | |
1035 | tzcntl %ecx, %ecx | |
1036 | addl %OFFSET_REG, %ecx | |
1037 | L(ret_vec_page_cross_cont): | |
14570163 | 1038 | # ifdef USE_AS_WCSCMP |
b77b06e0 NG |
1039 | movl (%rdi, %rcx), %edx |
1040 | xorl %eax, %eax | |
1041 | cmpl (%rsi, %rcx), %edx | |
1042 | je L(ret12) | |
14570163 LS |
1043 | setl %al |
1044 | negl %eax | |
b77b06e0 NG |
1045 | xorl %r8d, %eax |
1046 | # else | |
1047 | movzbl (%rdi, %rcx), %eax | |
1048 | movzbl (%rsi, %rcx), %ecx | |
bbf81222 NG |
1049 | TOLOWER_gpr (%rax, %eax) |
1050 | TOLOWER_gpr (%rcx, %ecx) | |
b77b06e0 NG |
1051 | subl %ecx, %eax |
1052 | xorl %r8d, %eax | |
1053 | subl %r8d, %eax | |
14570163 | 1054 | # endif |
b77b06e0 NG |
1055 | L(ret12): |
1056 | VZEROUPPER_RETURN | |
14570163 LS |
1057 | |
1058 | # ifdef USE_AS_STRNCMP | |
b77b06e0 NG |
1059 | .p2align 4,, 10 |
1060 | L(check_ret_vec_page_cross2): | |
1061 | incl %ecx | |
1062 | L(check_ret_vec_page_cross): | |
1063 | tzcntl %ecx, %ecx | |
1064 | addl %OFFSET_REG, %ecx | |
1065 | cmpq %rcx, %rdx | |
1066 | ja L(ret_vec_page_cross_cont) | |
1067 | .p2align 4,, 2 | |
1068 | L(ret_zero_page_cross): | |
14570163 | 1069 | xorl %eax, %eax |
7ebba913 | 1070 | VZEROUPPER_RETURN |
b77b06e0 | 1071 | # endif |
14570163 | 1072 | |
b77b06e0 NG |
1073 | .p2align 4,, 4 |
1074 | L(page_cross_s2): | |
1075 | /* Ensure this is a true page cross. */ | |
1076 | subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx | |
1077 | jbe L(no_page_cross) | |
1078 | ||
1079 | ||
1080 | movl %ecx, %eax | |
1081 | movq %rdi, %rcx | |
1082 | movq %rsi, %rdi | |
1083 | movq %rcx, %rsi | |
1084 | ||
1085 | /* set r8 to negate return value as rdi and rsi swapped. */ | |
1086 | # ifdef USE_AS_WCSCMP | |
1087 | movl $-4, %r8d | |
1088 | # else | |
1089 | movl $-1, %r8d | |
14570163 | 1090 | # endif |
b77b06e0 | 1091 | xorl %OFFSET_REG, %OFFSET_REG |
14570163 | 1092 | |
b77b06e0 NG |
1093 | /* Check if more than 1x VEC till page cross. */ |
1094 | subl $(VEC_SIZE * 3), %eax | |
1095 | jle L(page_cross_loop) | |
1096 | ||
1097 | .p2align 4,, 6 | |
1098 | L(less_1x_vec_till_page): | |
1099 | /* Find largest load size we can use. */ | |
1100 | cmpl $16, %eax | |
1101 | ja L(less_16_till_page) | |
1102 | ||
1103 | VMOVU (%rdi), %xmm0 | |
bbf81222 | 1104 | CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) |
b77b06e0 NG |
1105 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1106 | vpandn %xmm1, %xmm2, %xmm1 | |
1107 | vpmovmskb %ymm1, %ecx | |
1108 | incw %cx | |
1109 | jnz L(check_ret_vec_page_cross) | |
1110 | movl $16, %OFFSET_REG | |
14570163 | 1111 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
1112 | cmpq %OFFSET_REG64, %rdx |
1113 | jbe L(ret_zero_page_cross_slow_case0) | |
1114 | subl %eax, %OFFSET_REG | |
1115 | # else | |
1116 | /* Explicit check for 16 byte alignment. */ | |
1117 | subl %eax, %OFFSET_REG | |
1118 | jz L(prepare_loop) | |
14570163 | 1119 | # endif |
b77b06e0 NG |
1120 | |
1121 | VMOVU (%rdi, %OFFSET_REG64), %xmm0 | |
bbf81222 | 1122 | CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) |
b77b06e0 NG |
1123 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 |
1124 | vpandn %xmm1, %xmm2, %xmm1 | |
1125 | vpmovmskb %ymm1, %ecx | |
1126 | incw %cx | |
1127 | jnz L(check_ret_vec_page_cross) | |
1128 | ||
14570163 | 1129 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
1130 | addl $16, %OFFSET_REG |
1131 | subq %OFFSET_REG64, %rdx | |
1132 | jbe L(ret_zero_page_cross_slow_case0) | |
1133 | subq $-(VEC_SIZE * 4), %rdx | |
1134 | ||
1135 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi | |
1136 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
1137 | # else | |
1138 | leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi | |
1139 | leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
14570163 | 1140 | # endif |
b77b06e0 NG |
1141 | jmp L(prepare_loop_aligned) |
1142 | ||
1143 | # ifdef USE_AS_STRNCMP | |
1144 | .p2align 4,, 2 | |
1145 | L(ret_zero_page_cross_slow_case0): | |
14570163 | 1146 | xorl %eax, %eax |
b77b06e0 | 1147 | ret |
14570163 | 1148 | # endif |
14570163 | 1149 | |
14570163 | 1150 | |
b77b06e0 NG |
1151 | .p2align 4,, 10 |
1152 | L(less_16_till_page): | |
1153 | /* Find largest load size we can use. */ | |
1154 | cmpl $24, %eax | |
1155 | ja L(less_8_till_page) | |
14570163 | 1156 | |
b77b06e0 NG |
1157 | vmovq (%rdi), %xmm0 |
1158 | vmovq (%rsi), %xmm1 | |
1159 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 | |
bbf81222 | 1160 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
b77b06e0 NG |
1161 | vpandn %xmm1, %xmm2, %xmm1 |
1162 | vpmovmskb %ymm1, %ecx | |
1163 | incb %cl | |
1164 | jnz L(check_ret_vec_page_cross) | |
14570163 | 1165 | |
b77b06e0 NG |
1166 | |
1167 | # ifdef USE_AS_STRNCMP | |
1168 | cmpq $8, %rdx | |
1169 | jbe L(ret_zero_page_cross_slow_case0) | |
14570163 | 1170 | # endif |
b77b06e0 NG |
1171 | movl $24, %OFFSET_REG |
1172 | /* Explicit check for 16 byte alignment. */ | |
1173 | subl %eax, %OFFSET_REG | |
1174 | ||
1175 | ||
1176 | ||
1177 | vmovq (%rdi, %OFFSET_REG64), %xmm0 | |
1178 | vmovq (%rsi, %OFFSET_REG64), %xmm1 | |
1179 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 | |
bbf81222 | 1180 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
b77b06e0 NG |
1181 | vpandn %xmm1, %xmm2, %xmm1 |
1182 | vpmovmskb %ymm1, %ecx | |
1183 | incb %cl | |
1184 | jnz L(check_ret_vec_page_cross) | |
1185 | ||
14570163 | 1186 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
1187 | addl $8, %OFFSET_REG |
1188 | subq %OFFSET_REG64, %rdx | |
1189 | jbe L(ret_zero_page_cross_slow_case0) | |
1190 | subq $-(VEC_SIZE * 4), %rdx | |
14570163 | 1191 | |
b77b06e0 NG |
1192 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi |
1193 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
1194 | # else | |
1195 | leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi | |
1196 | leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
1197 | # endif | |
1198 | jmp L(prepare_loop_aligned) | |
1199 | ||
1200 | ||
1201 | .p2align 4,, 10 | |
1202 | L(less_8_till_page): | |
1203 | # ifdef USE_AS_WCSCMP | |
1204 | /* If using wchar then this is the only check before we reach | |
1205 | the page boundary. */ | |
1206 | movl (%rdi), %eax | |
1207 | movl (%rsi), %ecx | |
1208 | cmpl %ecx, %eax | |
1209 | jnz L(ret_less_8_wcs) | |
14570163 | 1210 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
1211 | addq %rdi, %rdx |
1212 | /* We already checked for len <= 1 so cannot hit that case here. | |
1213 | */ | |
14570163 | 1214 | # endif |
b77b06e0 NG |
1215 | testl %eax, %eax |
1216 | jnz L(prepare_loop_no_len) | |
1217 | ret | |
14570163 | 1218 | |
b77b06e0 NG |
1219 | .p2align 4,, 8 |
1220 | L(ret_less_8_wcs): | |
1221 | setl %OFFSET_REG8 | |
1222 | negl %OFFSET_REG | |
1223 | movl %OFFSET_REG, %eax | |
1224 | xorl %r8d, %eax | |
1225 | ret | |
1226 | ||
1227 | # else | |
1228 | ||
1229 | /* Find largest load size we can use. */ | |
1230 | cmpl $28, %eax | |
1231 | ja L(less_4_till_page) | |
1232 | ||
1233 | vmovd (%rdi), %xmm0 | |
1234 | vmovd (%rsi), %xmm1 | |
1235 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 | |
bbf81222 | 1236 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
b77b06e0 NG |
1237 | vpandn %xmm1, %xmm2, %xmm1 |
1238 | vpmovmskb %ymm1, %ecx | |
1239 | subl $0xf, %ecx | |
1240 | jnz L(check_ret_vec_page_cross) | |
14570163 | 1241 | |
14570163 | 1242 | # ifdef USE_AS_STRNCMP |
b77b06e0 NG |
1243 | cmpq $4, %rdx |
1244 | jbe L(ret_zero_page_cross_slow_case1) | |
14570163 | 1245 | # endif |
b77b06e0 NG |
1246 | movl $28, %OFFSET_REG |
1247 | /* Explicit check for 16 byte alignment. */ | |
1248 | subl %eax, %OFFSET_REG | |
14570163 | 1249 | |
b77b06e0 NG |
1250 | |
1251 | ||
1252 | vmovd (%rdi, %OFFSET_REG64), %xmm0 | |
1253 | vmovd (%rsi, %OFFSET_REG64), %xmm1 | |
1254 | VPCMPEQ %xmm0, %xmmZERO, %xmm2 | |
bbf81222 | 1255 | CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) |
b77b06e0 NG |
1256 | vpandn %xmm1, %xmm2, %xmm1 |
1257 | vpmovmskb %ymm1, %ecx | |
1258 | subl $0xf, %ecx | |
1259 | jnz L(check_ret_vec_page_cross) | |
1260 | ||
1261 | # ifdef USE_AS_STRNCMP | |
1262 | addl $4, %OFFSET_REG | |
1263 | subq %OFFSET_REG64, %rdx | |
1264 | jbe L(ret_zero_page_cross_slow_case1) | |
1265 | subq $-(VEC_SIZE * 4), %rdx | |
1266 | ||
1267 | leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi | |
1268 | leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
1269 | # else | |
1270 | leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi | |
1271 | leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi | |
1272 | # endif | |
1273 | jmp L(prepare_loop_aligned) | |
1274 | ||
1275 | # ifdef USE_AS_STRNCMP | |
1276 | .p2align 4,, 2 | |
1277 | L(ret_zero_page_cross_slow_case1): | |
1278 | xorl %eax, %eax | |
1279 | ret | |
1280 | # endif | |
1281 | ||
1282 | .p2align 4,, 10 | |
1283 | L(less_4_till_page): | |
1284 | subq %rdi, %rsi | |
1285 | /* Extremely slow byte comparison loop. */ | |
1286 | L(less_4_loop): | |
1287 | movzbl (%rdi), %eax | |
1288 | movzbl (%rsi, %rdi), %ecx | |
bbf81222 NG |
1289 | TOLOWER_gpr (%rax, %eax) |
1290 | TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) | |
1291 | subl %BYTE_LOOP_REG, %eax | |
b77b06e0 NG |
1292 | jnz L(ret_less_4_loop) |
1293 | testl %ecx, %ecx | |
1294 | jz L(ret_zero_4_loop) | |
1295 | # ifdef USE_AS_STRNCMP | |
1296 | decq %rdx | |
1297 | jz L(ret_zero_4_loop) | |
1298 | # endif | |
1299 | incq %rdi | |
1300 | /* end condition is reach page boundary (rdi is aligned). */ | |
1301 | testl $31, %edi | |
1302 | jnz L(less_4_loop) | |
1303 | leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi | |
1304 | addq $-(VEC_SIZE * 4), %rdi | |
1305 | # ifdef USE_AS_STRNCMP | |
1306 | subq $-(VEC_SIZE * 4), %rdx | |
1307 | # endif | |
1308 | jmp L(prepare_loop_aligned) | |
1309 | ||
1310 | L(ret_zero_4_loop): | |
1311 | xorl %eax, %eax | |
1312 | ret | |
1313 | L(ret_less_4_loop): | |
1314 | xorl %r8d, %eax | |
1315 | subl %r8d, %eax | |
1316 | ret | |
1317 | # endif | |
bbf81222 NG |
1318 | cfi_endproc |
1319 | .size STRCMP, .-STRCMP | |
14570163 | 1320 | #endif |