/* memcmp with SSE2.
   Copyright (C) 2017-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_sse2
# endif

# ifdef USE_AS_WMEMCMP
#  define PCMPEQ pcmpeqd
#  define CHAR_SIZE 4
#  define SIZE_OFFSET (0)
# else
#  define PCMPEQ pcmpeqb
#  define CHAR_SIZE 1
# endif

# ifdef USE_AS_MEMCMPEQ
#  define SIZE_OFFSET (0)
#  define CHECK_CMP(x, y) subl x, y
# else
#  ifndef SIZE_OFFSET
#   define SIZE_OFFSET (CHAR_PER_VEC * 2)
#  endif
#  define CHECK_CMP(x, y) cmpl x, y
# endif
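/* For memcmpeq only zero/non-zero matters, so CHECK_CMP uses `subl` and the
   (possibly non-zero) difference doubles as the return value; for memcmp a
   flags-only `cmpl` is used so the mask in the compared register survives
   and can be used to compute the ordered return value.  */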

# define VEC_SIZE 16
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# ifndef MEMCMP
#  define MEMCMP memcmp
# endif

        .text
ENTRY(MEMCMP)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        movl %edx, %edx
# endif
# ifdef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
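        /* Throughout this file: PCMPEQ sets a lane to all-ones when the two
           inputs match and pmovmskb gathers the byte sign bits into a 16-bit
           mask, so the mask is exactly 0xffff when an entire vector matches.
           Subtracting 0xffff (kept in ecx) from the mask therefore yields
           zero on a full match and non-zero otherwise.  */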
        cmpq $CHAR_PER_VEC, %rdx
        ja L(more_1x_vec)

# ifdef USE_AS_WMEMCMP
        /* Saves a byte of code by keeping the fall-through path for
           n = [2, 4] in the initial cache line.  */
        decl %edx
        jle L(cmp_0_1)

        movq (%rsi), %xmm0
        movq (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)

        movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
        movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_end_0_adj)
# else
        cmpl $8, %edx
        ja L(cmp_9_16)

        cmpl $4, %edx
        jb L(cmp_0_3)

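        /* n is in [4, 8] here, so the 4 bytes at the start and the 4 bytes
           ending at offset n together cover the whole buffer (they may
           overlap).  */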
#  ifdef USE_AS_MEMCMPEQ
        movl (%rsi), %eax
        subl (%rdi), %eax

        movl -4(%rsi, %rdx), %esi
        subl -4(%rdi, %rdx), %esi

        orl %esi, %eax
        ret
#  else
        /* Combine comparisons for lo and hi 4-byte comparisons.  */
        movl -4(%rsi, %rdx), %ecx
        movl -4(%rdi, %rdx), %eax
        shlq $32, %rcx
        shlq $32, %rax
        movl (%rsi), %esi
        movl (%rdi), %edi
        orq %rsi, %rcx
        orq %rdi, %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
        ret
#  endif

        .p2align 4,, 10
L(cmp_9_16):
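        /* n is in [9, 16]: compare the first 8 bytes and the (possibly
           overlapping) 8 bytes ending at offset n.  */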
#  ifdef USE_AS_MEMCMPEQ
        movq (%rsi), %rax
        subq (%rdi), %rax

        movq -8(%rsi, %rdx), %rcx
        subq -8(%rdi, %rdx), %rcx
        orq %rcx, %rax
        /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
           return long).  */
        setnz %cl
        movzbl %cl, %eax
#  else
        movq (%rsi), %rcx
        movq (%rdi), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)

        movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
        movq -8(%rdi, %rdx, CHAR_SIZE), %rax
        /* Only compute proper return if not-equal.  */
        cmpq %rcx, %rax
        jnz L(ret_nonzero)
        xorl %eax, %eax
#  endif
# endif
        ret

        .p2align 4,, 8
L(cmp_0_1):
        /* Flag set by earlier comparison against 1.  */
        jne L(cmp_0_0)
# ifdef USE_AS_WMEMCMP
        movl (%rdi), %ecx
        xorl %edx, %edx
        cmpl (%rsi), %ecx
        je L(cmp_0_0)
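        /* setg leaves 1 in dl when the wide char from rdi compares (signed)
           greater and 0 otherwise; 2 * edx - 1 then gives the required
           +1 / -1 return value.  This idiom is used for every wmemcmp
           return below.  */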
        setg %dl
        leal -1(%rdx, %rdx), %eax
# else
        movzbl (%rdi), %eax
        movzbl (%rsi), %ecx
        subl %ecx, %eax
# endif
        ret

        /* Fits in aligning bytes.  */
L(cmp_0_0):
        xorl %eax, %eax
        ret

# ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movl (%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
        ret
# else

#  ifndef USE_AS_MEMCMPEQ
        .p2align 4,, 14
L(ret_nonzero):
        /* Need to bswap to get proper return without branch.  */
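        /* The quadwords were loaded little-endian, so the byte at the lowest
           address sits in the least significant position.  bswap puts memory
           order into numeric order, so an unsigned compare of the swapped
           values orders by the first differing byte; sub/sbb/or then turns
           that into -1 or +1.  */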
        bswapq %rcx
        bswapq %rax
        subq %rcx, %rax
        sbbl %eax, %eax
        orl $1, %eax
        ret
#  endif

        .p2align 4
L(cmp_0_3):
#  ifdef USE_AS_MEMCMPEQ
        /* No reason to add to the dependency chain on rdx.  Saving the
           bytes here doesn't change the number of fetch blocks.  */
        cmpl $1, %edx
        jbe L(cmp_0_1)
#  else
        /* We need the code size savings to prevent taking an extra fetch
           block.  */
        decl %edx
        jle L(cmp_0_1)
#  endif
        movzwl (%rsi), %ecx
        movzwl (%rdi), %eax

#  ifdef USE_AS_MEMCMPEQ
        subl %ecx, %eax

        movzbl -1(%rsi, %rdx), %esi
        movzbl -1(%rdi, %rdx), %edi
        subl %edi, %esi
        orl %esi, %eax
#  else
        bswapl %ecx
        bswapl %eax

        /* Implicit right shift by one.  We just need to displace the
           sign bits.  */
        shrl %ecx
        shrl %eax
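        /* After the shift each value holds byte 0 in bits 23..30 and byte 1
           in bits 15..22, with bit 31 clear, so once the last byte is OR'd
           into bits 0..7 below the 32-bit subtraction cannot overflow and
           its sign matches the lexicographic comparison.  */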

        /* Eat a partial register stall here.  Saves code size by stopping
           L(cmp_0_3) from bleeding into the next fetch block and saves an
           ALU op.  */
        movb (%rsi, %rdx), %cl
        movzbl (%rdi, %rdx), %edi
        orl %edi, %eax
        subl %ecx, %eax
#  endif
        ret
# endif


        .p2align 5
L(more_1x_vec):
# ifndef USE_AS_WMEMCMP
        /* Use 0xffff to test for mismatches on pmovmskb bitmask.  Store
           in ecx for code size.  This is preferable to using `incw` as
           it avoids partial register stalls on older hardware (pre
           SnB).  */
        movl $0xffff, %ecx
# endif
        movups (%rsi), %xmm0
        movups (%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_0)
# if SIZE_OFFSET == 0
        cmpq $(CHAR_PER_VEC * 2), %rdx
# else
        /* Offset rdx.  Saves just enough code size to keep the
           L(last_2x_vec) case and the non-zero return in a single
           cache line.  */
        subq $(CHAR_PER_VEC * 2), %rdx
# endif
        ja L(more_2x_vec)

        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
# ifndef USE_AS_MEMCMPEQ
        /* Don't use `incw %ax` as the machines this code runs on are liable
           to have partial register stalls.  */
        jnz L(ret_nonzero_vec_end_0)
# else
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_1):
L(ret_nonzero_vec_start_0):
L(ret_nonzero_vec_end_0):
# endif
        ret


# ifndef USE_AS_MEMCMPEQ
#  ifdef USE_AS_WMEMCMP
        .p2align 4
L(ret_nonzero_vec_end_0_adj):
        addl $3, %edx
#  else
        .p2align 4,, 8
#  endif
L(ret_nonzero_vec_end_0):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        /* Use `addq` instead of `addl` here so that even if `rax` + `rdx`
           is negative, the value of the sum will be usable as a 64-bit
           offset (negative 32-bit numbers zero-extend to large and often
           out-of-bounds 64-bit offsets).  Note that `rax` + `rdx` >= 0 is
           an invariant when `memcmp` is used correctly, but if the input
           strings `rsi`/`rdi` are concurrently modified as the function
           runs (there is a data race) it is possible for `rax` + `rdx` to
           be negative.  Given that there is virtually no extra cost to
           using `addq` instead of `addl`, we may as well protect the
           data-race case.  */
        addq %rdx, %rax
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
#  ifndef USE_AS_WMEMCMP
        .p2align 4,, 10
L(ret_nonzero_vec_start_0):
        bsfl %eax, %eax
        movzbl (%rsi, %rax), %ecx
        movzbl (%rdi, %rax), %eax
        subl %ecx, %eax
        ret
#  endif
# else
# endif

        .p2align 5
L(more_2x_vec):
        movups (VEC_SIZE * 1)(%rsi), %xmm0
        movups (VEC_SIZE * 1)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        pmovmskb %xmm1, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_vec_start_1)

        cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
        jbe L(last_2x_vec)

        cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
        ja L(more_8x_vec)


        /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
           This can hurt performance if there is a non-zero return in
           [65, 80] or [97, 112], but it helps performance otherwise.
           Generally the zero-return case is hotter.  */
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jnz L(ret_nonzero_vec_start_2_3)

        cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 4)(%rsi), %xmm0
        movups (VEC_SIZE * 4)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 5)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
# ifdef USE_AS_MEMCMPEQ
        jz L(last_2x_vec)
        ret
# else
        jnz L(ret_nonzero_vec_start_4_5)
# endif
        .p2align 4
L(last_2x_vec):
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
        movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
        movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        subl %ecx, %eax
# ifdef USE_AS_MEMCMPEQ
        /* Various return targets for memcmpeq.  Will always be hot in
           Icache and get short encoding.  */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
        ret
# else
        jnz L(ret_nonzero_vec_end_1)
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_end_1):
        pmovmskb %xmm1, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them in
           so we can do `or + not` with just `xor`.  */
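        /* (eax holds mask - 0xffff with mask != 0xffff, so it is negative
           and bits 16..31 are all set; xor against 0xffff is the same as
           not.)  */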
        rorl $16, %eax
        xorl %ecx, %eax
        /* Partial register stall.  */

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        leal (%rax, %rdx, CHAR_SIZE), %eax
        movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        addl %edx, %eax
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_vec_start_4_5):
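        /* eax holds the combined (ANDed) mask, edx the first vector's mask.
           (eax << 16) + edx + 1: if the first vector mismatched, the +1
           exposes its lowest zero bit in the low half; otherwise it carries
           into the high half and exposes the lowest zero bit of the
           combined mask there, so bsf finds the first mismatching byte.  */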
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4,, 8
L(ret_nonzero_vec_start_1):
        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif

        .p2align 4
L(more_8x_vec):
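        /* Align rdi down to VEC_SIZE and adjust rsi so that rsi - rdi is
           unchanged; rdx becomes a pointer VEC_SIZE * 6 before the end of
           the buffer and serves as the loop bound.  */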
        subq %rdi, %rsi
        leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
        andq $(VEC_SIZE * -1), %rdi
        addq %rdi, %rsi
        .p2align 4
L(loop_4x):
        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 3)(%rsi), %xmm1

        PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
        PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1

        movups (VEC_SIZE * 4)(%rsi), %xmm2
        movups (VEC_SIZE * 5)(%rsi), %xmm3

        PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
        PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3

        pand %xmm0, %xmm1
        pand %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        subl %ecx, %eax
        jnz L(ret_nonzero_loop)

        addq $(VEC_SIZE * 4), %rdi
        addq $(VEC_SIZE * 4), %rsi
        cmpq %rdi, %rdx
        ja L(loop_4x)
        /* Get remaining length in edx.  */
        subl %edi, %edx
        /* Restore offset so we can reuse L(last_2x_vec).  */
        addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
# ifdef USE_AS_WMEMCMP
        shrl $2, %edx
# endif
        cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
        jbe L(last_2x_vec)

        movups (VEC_SIZE * 2)(%rsi), %xmm0
        movups (VEC_SIZE * 2)(%rdi), %xmm1
        PCMPEQ %xmm0, %xmm1
        movups (VEC_SIZE * 3)(%rsi), %xmm2
        movups (VEC_SIZE * 3)(%rdi), %xmm3
        PCMPEQ %xmm2, %xmm3
        pand %xmm1, %xmm3

        pmovmskb %xmm3, %eax
        CHECK_CMP (%ecx, %eax)
        jz L(last_2x_vec)
# ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
        ret
# else

        .p2align 4
L(ret_nonzero_vec_start_2_3):
        pmovmskb %xmm1, %edx
        sall $16, %eax
        leal 1(%rax, %rdx), %eax

        bsfl %eax, %eax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret

        .p2align 4
L(ret_nonzero_loop):
        pmovmskb %xmm0, %ecx
        pmovmskb %xmm1, %edx
        sall $(VEC_SIZE * 1), %edx
        leal 1(%rcx, %rdx), %edx
        pmovmskb %xmm2, %ecx
        /* High 16 bits of eax guaranteed to be all ones.  Rotate them in
           so we can do `or + not` with just `xor`.  */
        rorl $16, %eax
        xorl %ecx, %eax

        salq $32, %rax
        orq %rdx, %rax

        bsfq %rax, %rax
#  ifdef USE_AS_WMEMCMP
        movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
        xorl %edx, %edx
        cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        /* NB: no partial register stall here because xorl zero idiom
           above.  */
        setg %dl
        leal -1(%rdx, %rdx), %eax
#  else
        movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
        movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
        subl %ecx, %eax
#  endif
        ret
# endif
END(MEMCMP)
#endif