/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
#  define STRCMP	__strcmp_avx2
# endif

# define PAGE_SIZE	4096

/* VEC_SIZE = Number of bytes in a ymm register.  */
# define VEC_SIZE	32

/* Shift for dividing by (VEC_SIZE * 4).  */
# define DIVIDE_BY_VEC_4_SHIFT	7
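/* For example, with VEC_SIZE == 32: VEC_SIZE * 4 == 128 == 1 << 7, so a
   right shift by DIVIDE_BY_VEC_4_SHIFT converts a byte count into a
   count of 4-vector blocks.  */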
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif

# ifdef USE_AS_WCSCMP
/* Compare packed dwords.  */
#  define VPCMPEQ	vpcmpeqd
/* Minimum of packed unsigned dwords.  */
#  define VPMINU	vpminud
/* 1 dword char == 4 bytes.  */
#  define SIZE_OF_CHAR	4
# else
/* Compare packed bytes.  */
#  define VPCMPEQ	vpcmpeqb
/* Minimum of packed unsigned bytes.  */
#  define VPMINU	vpminub
/* 1 byte char == 1 byte.  */
#  define SIZE_OF_CHAR	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.  */

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The latter can be on
   either packed bytes or dwords depending on USE_AS_WCSCMP.  In order
   to check the null char, the algorithm keeps the matched bytes/dwords,
   requiring two more AVX2 instructions (VPMINU and VPCMPEQ).  In
   general, the costs of comparing VEC_SIZE bytes (32 bytes) are two
   VPCMPEQ and one VPMINU instructions, together with vmovdqu and testl
   instructions.  The main loop (away from the page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned.  */
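
/* Illustrative only: a minimal C intrinsics sketch (assuming AVX2 and
   <immintrin.h>) of one VEC_SIZE comparison step for the byte case;
   first_diff_or_null is a hypothetical helper, not part of glibc:

     static inline int
     first_diff_or_null (const char *s1, const char *s2)
     {
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
       __m256i eq = _mm256_cmpeq_epi8 (v1, v2);   // 0xff where equal
       __m256i min = _mm256_min_epu8 (v1, eq);    // 0 where diff or null
       __m256i z = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
       unsigned int mask = _mm256_movemask_epi8 (z);
       return mask == 0 ? -1 : (int) __builtin_ctz (mask);
     }

   The wide-char case is analogous with the dword (epi32/epu32)
   variants.  */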

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
	/* Check for simple cases (0 or 1) in offset.  */
	cmp	$1, %RDX_LP
	je	L(char0)
	jb	L(zero)
#  ifdef USE_AS_WCSCMP
	/* Convert units: from wide to byte char.  */
	shl	$2, %RDX_LP
#  endif
	/* Register %r11 tracks the maximum offset.  */
	mov	%RDX_LP, %R11_LP
# endif
	movl	%edi, %eax
	xorl	%edx, %edx
	/* Make %xmm7 (%ymm7) all zeros in this function.  */
	vpxor	%xmm7, %xmm7, %xmm7
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
	jg	L(cross_page)
	/* Start comparing 4 vectors.  */
	vmovdqu	(%rdi), %ymm1
	VPCMPEQ	(%rsi), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(next_3_vectors)
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx) is after the maximum
	   offset (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	je	L(return)
L(wcscmp_return):
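	/* %eax is zero here.  SETL yields 1 when the first dword is
	   less (signed) than the second, else 0; NEG maps 1/0 to -1/0,
	   and OR $1 maps that to -1/+1, the final return value.  */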
	setl	%al
	negl	%eax
	orl	$1, %eax
L(return):
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(return_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
	   the maximum offset (%r11).  */
	addq	$VEC_SIZE, %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	VEC_SIZE(%rdi, %rdx), %ecx
	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rdi, %rdx), %eax
	movzbl	VEC_SIZE(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_2_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 2), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(return_3_vec_size):
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
	   after the maximum offset (%r11).  */
	addq	$(VEC_SIZE * 3), %rdx
	cmpq	%r11, %rdx
	jae	L(zero)
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(next_3_vectors):
	vmovdqu	VEC_SIZE(%rdi), %ymm6
	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
	VPMINU	%ymm6, %ymm3, %ymm3
	VPCMPEQ	%ymm7, %ymm3, %ymm3
	vpmovmskb %ymm3, %ecx
	testl	%ecx, %ecx
	jne	L(return_vec_size)
	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
	VPMINU	%ymm5, %ymm2, %ymm2
	VPCMPEQ	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm2, %ymm2
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jne	L(return_2_vec_size)
	VPMINU	%ymm4, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(return_3_vec_size)
L(main_loop_header):
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	movl	$PAGE_SIZE, %ecx
	/* Align load via RAX.  */
	andq	$-(VEC_SIZE * 4), %rdx
	subq	%rdi, %rdx
	leaq	(%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
	/* Starting from this point, the maximum offset, or simply the
	   'offset', DECREASES by the same amount when base pointers are
	   moved forward.  Return 0 when:
	     1) On match: the offset is <= the matched vector index.
	     2) On mismatch: the offset is before the mismatched index.  */
	subq	%rdx, %r11
	jbe	L(zero)
# endif
	addq	%rsi, %rdx
	movq	%rdx, %rsi
	andl	$(PAGE_SIZE - 1), %esi
	/* Number of bytes before page crossing.  */
	subq	%rsi, %rcx
	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.  */
	movl	%ecx, %esi
	jmp	L(loop_start)

	.p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
	   the maximum offset (%r11) by the same amount.  */
	subq	$(VEC_SIZE * 4), %r11
	jbe	L(zero)
# endif
	addq	$(VEC_SIZE * 4), %rax
	addq	$(VEC_SIZE * 4), %rdx
L(loop_start):
	testl	%esi, %esi
	leal	-1(%esi), %esi
	je	L(loop_cross_page)
L(back_to_loop):
	/* Main loop, comparing 4 vectors at a time.  */
	vmovdqa	(%rax), %ymm0
	vmovdqa	VEC_SIZE(%rax), %ymm3
	VPCMPEQ	(%rdx), %ymm0, %ymm4
	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
	VPMINU	%ymm0, %ymm4, %ymm4
	VPMINU	%ymm3, %ymm1, %ymm1
	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
	VPMINU	%ymm1, %ymm4, %ymm0
	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
	VPMINU	%ymm2, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPMINU	%ymm5, %ymm0, %ymm0
	VPMINU	%ymm6, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
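	/* VPMINU propagates zero lanes, so after the reduction above a
	   zero lane means a mismatch or null in any of the four
	   vectors; the final VPCMPEQ against the all-zeros %ymm7 turns
	   exactly those lanes into 0xff for vpmovmskb below.  */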

	/* Test each mask (32 bits) individually because for VEC_SIZE
	   == 32 it is not possible to OR the four masks and keep all
	   bits in a 64-bit integer register, unlike SSE2 strcmp where
	   ORing is possible.  */
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	VPCMPEQ	%ymm7, %ymm4, %ymm0
	vpmovmskb %ymm0, %edi
	testl	%edi, %edi
	je	L(test_vec)
	tzcntl	%edi, %ecx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
	/* The first vector matched.  Return 0 if the maximum offset
	   (%r11) <= VEC_SIZE.  */
	cmpq	$VEC_SIZE, %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm1, %ymm1
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	je	L(test_2_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$VEC_SIZE, %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	VEC_SIZE(%rsi, %rdi), %ecx
	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	VEC_SIZE(%rax, %rdi), %eax
	movzbl	VEC_SIZE(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
	/* The first 2 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 2 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 2), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	vpmovmskb %ymm5, %ecx
	testl	%ecx, %ecx
	je	L(test_3_vec)
	tzcntl	%ecx, %edi
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rdi
	cmpq	%rdi, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rdi), %ecx
	cmpl	(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rdi), %eax
	movzbl	(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
	/* The first 3 vectors matched.  Return 0 if the maximum offset
	   (%r11) <= 3 * VEC_SIZE.  */
	cmpq	$(VEC_SIZE * 3), %r11
	jbe	L(zero)
# endif
	VPCMPEQ	%ymm7, %ymm6, %ymm6
	vpmovmskb %ymm6, %esi
	tzcntl	%esi, %ecx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 3), %rcx
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %esi
	cmpl	(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page):
	xorl	%r10d, %r10d
	movq	%rdx, %rcx
	/* Align load via RDX.  We load the extra ECX bytes which should
	   be ignored.  */
	andl	$((VEC_SIZE * 4) - 1), %ecx
	/* R10 is -RCX.  */
	subq	%rcx, %r10

	/* This works only if VEC_SIZE * 2 == 64.  */
# if (VEC_SIZE * 2) != 64
#  error (VEC_SIZE * 2) != 64
# endif

	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
	cmpl	$(VEC_SIZE * 2), %ecx
	jge	L(loop_cross_page_2_vec)

	vmovdqu	(%rax, %r10), %ymm2
	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
	VPMINU	%ymm2, %ymm0, %ymm0
	VPMINU	%ymm3, %ymm1, %ymm1
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm1, %ymm1

	vpmovmskb %ymm0, %edi
	vpmovmskb %ymm1, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi
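	/* %rdi now holds one combined 64-bit mask: bit N set means
	   byte N of this 64-byte (2 * VEC_SIZE) window mismatched or
	   was null.  */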

	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
	shrq	%cl, %rdi

	testq	%rdi, %rdi
	je	L(loop_cross_page_2_vec)
	tzcntq	%rdi, %rcx
# ifdef USE_AS_STRNCMP
	cmpq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(loop_cross_page_2_vec):
	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
	VPMINU	%ymm2, %ymm5, %ymm5
	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
	VPCMPEQ	%ymm7, %ymm5, %ymm5
	VPMINU	%ymm3, %ymm6, %ymm6
	VPCMPEQ	%ymm7, %ymm6, %ymm6

	vpmovmskb %ymm5, %edi
	vpmovmskb %ymm6, %esi

	salq	$32, %rsi
	xorq	%rsi, %rdi

	xorl	%r8d, %r8d
	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
	subl	$(VEC_SIZE * 2), %ecx
	jle	1f
	/* Skip ECX bytes.  */
	shrq	%cl, %rdi
	/* R8 has number of bytes skipped.  */
	movl	%ecx, %r8d
1:
	/* Before jumping back to the loop, set ESI to the number of
	   VEC_SIZE * 4 blocks before page crossing.  */
	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

	testq	%rdi, %rdi
# ifdef USE_AS_STRNCMP
	/* At this point, if %rdi is 0, the VEC_SIZE * 4 + %r10 bytes
	   starting from %rax have already been tested.  The target
	   label checks whether the strncmp maximum offset has been
	   reached.  */
	je	L(string_nbyte_offset_check)
# else
	je	L(back_to_loop)
# endif
	tzcntq	%rdi, %rcx
	addq	%r10, %rcx
	/* Adjust for number of bytes skipped.  */
	addq	%r8, %rcx
# ifdef USE_AS_STRNCMP
	addq	$(VEC_SIZE * 2), %rcx
	subq	%rcx, %r11
	jbe	L(zero)
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(%rsi, %rcx), %edi
	cmpl	(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# else
#  ifdef USE_AS_WCSCMP
	movq	%rax, %rsi
	xorl	%eax, %eax
	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
	jne	L(wcscmp_return)
#  else
	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
	subl	%edx, %eax
#  endif
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCMP
L(string_nbyte_offset_check):
	leaq	(VEC_SIZE * 4)(%r10), %r10
	cmpq	%r10, %r11
	jbe	L(zero)
	jmp	L(back_to_loop)
# endif

	.p2align 4
L(cross_page_loop):
	/* Check one byte/dword at a time.  */
# ifdef USE_AS_WCSCMP
	cmpl	%ecx, %eax
# else
	subl	%ecx, %eax
# endif
	jne	L(different)
	addl	$SIZE_OF_CHAR, %edx
	cmpl	$(VEC_SIZE * 4), %edx
	je	L(main_loop_header)
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	/* Check null char.  */
	testl	%eax, %eax
	jne	L(cross_page_loop)
	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
	   comparisons.  */
	subl	%ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_WCSCMP
	.p2align 4
L(different):
	/* Use movl to avoid modifying EFLAGS.  */
	movl	$0, %eax
	setl	%al
	negl	%eax
	orl	$1, %eax
	VZEROUPPER_RETURN
# endif

# ifdef USE_AS_STRNCMP
	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(char0):
#  ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi), %ecx
	cmpl	(%rsi), %ecx
	jne	L(wcscmp_return)
#  else
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vector):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
# ifdef USE_AS_STRNCMP
	subq	%rdx, %r11
# endif
	tzcntl	%ecx, %edx
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	xorl	%eax, %eax
	movl	(%rdi, %rdx), %ecx
	cmpl	(%rsi, %rdx), %ecx
	jne	L(wcscmp_return)
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax
# endif
	VZEROUPPER_RETURN

	/* Comparing on the page boundary region requires special
	   treatment: it must be done one vector at a time, starting
	   with the wider ymm vector if possible and falling back to
	   xmm.  If fetching 16 bytes (xmm) would still cross the
	   boundary, byte-by-byte comparison has to be used.  */
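/* Illustrative only: in C-like pseudocode, the fall-through ladder
   below is roughly (byte case; compare_vec is a hypothetical stand-in
   for the load/VPCMPEQ/VPMINU/vpmovmskb sequence used above, and each
   call advances page_offset by its size):

     while (page_offset <= PAGE_SIZE - 32) compare_vec (32);  // ymm
     if (page_offset <= PAGE_SIZE - 16) compare_vec (16);     // xmm
     if (page_offset <= PAGE_SIZE - 8) compare_vec (8);       // vmovq
     if (page_offset <= PAGE_SIZE - 4) compare_vec (4);       // vmovd
     // then one byte/dword at a time until the page is crossed
   */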
	.p2align 4
L(cross_page):
	/* Try one ymm vector at a time.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_1_vector)
L(loop_1_vector):
	vmovdqu	(%rdi, %rdx), %ymm1
	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
	VPMINU	%ymm1, %ymm0, %ymm0
	VPCMPEQ	%ymm7, %ymm0, %ymm0
	vpmovmskb %ymm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$VEC_SIZE, %edx

	addl	$VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jle	L(loop_1_vector)
L(cross_page_1_vector):
	/* Less than 32 bytes to check, try one xmm vector.  */
	cmpl	$(PAGE_SIZE - 16), %eax
	jg	L(cross_page_1_xmm)
	vmovdqu	(%rdi, %rdx), %xmm1
	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$16, %edx
# ifndef USE_AS_WCSCMP
	addl	$16, %eax
# endif
# ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
	/* Less than 16 bytes to check, try an 8-byte vector.  NB: No
	   need for wcscmp nor wcsncmp since a wide char is 4 bytes.  */
	cmpl	$(PAGE_SIZE - 8), %eax
	jg	L(cross_page_8bytes)
	vmovq	(%rdi, %rdx), %xmm1
	vmovq	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 8 bits are valid.  */
	andl	$0xff, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$8, %edx
	addl	$8, %eax
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_8bytes):
	/* Less than 8 bytes to check, try a 4-byte vector.  */
	cmpl	$(PAGE_SIZE - 4), %eax
	jg	L(cross_page_4bytes)
	vmovd	(%rdi, %rdx), %xmm1
	vmovd	(%rsi, %rdx), %xmm0
	VPCMPEQ	%xmm0, %xmm1, %xmm0
	VPMINU	%xmm1, %xmm0, %xmm0
	VPCMPEQ	%xmm7, %xmm0, %xmm0
	vpmovmskb %xmm0, %ecx
	/* Only the last 4 bits are valid.  */
	andl	$0xf, %ecx
	testl	%ecx, %ecx
	jne	L(last_vector)

	addl	$4, %edx
#  ifdef USE_AS_STRNCMP
	/* Return 0 if the current offset (%rdx) >= the maximum offset
	   (%r11).  */
	cmpq	%r11, %rdx
	jae	L(zero)
#  endif

L(cross_page_4bytes):
# endif
	/* Less than 4 bytes to check, try one byte/dword at a time.  */
# ifdef USE_AS_STRNCMP
	cmpq	%r11, %rdx
	jae	L(zero)
# endif
# ifdef USE_AS_WCSCMP
	movl	(%rdi, %rdx), %eax
	movl	(%rsi, %rdx), %ecx
# else
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
# endif
	testl	%eax, %eax
	jne	L(cross_page_loop)
	subl	%ecx, %eax
	VZEROUPPER_RETURN
END (STRCMP)
#endif