]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/x86_64/multiarch/memcmp-ssse3.S
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / x86_64 / multiarch / memcmp-ssse3.S
1 /* memcmp with SSSE3, wmemcmp with SSSE3
2 Copyright (C) 2011-2015 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #if IS_IN (libc)
21
22 # include <sysdep.h>
23
24 # ifndef MEMCMP
25 # define MEMCMP __memcmp_ssse3
26 # endif
27
28 /* Warning!
29 wmemcmp has to use SIGNED comparison for elements.
30 memcmp has to use UNSIGNED comparison for elements.
31 */
32
33 atom_text_section
34 ENTRY (MEMCMP) /* In: rdi = first buffer, rsi = second buffer, rdx = length (4-byte elements when USE_AS_WMEMCMP). */
35 # ifdef USE_AS_WMEMCMP
36 shl $2, %rdx /* element count to byte count */
37 test %rdx, %rdx
38 jz L(equal) /* zero length; L(equal) is outside this excerpt, presumably returns 0 */
39 # endif
40 mov %rdx, %rcx /* rcx = length in bytes from here on */
41 mov %rdi, %rdx /* rdx = copy of the first pointer */
42 cmp $48, %rcx;
43 jae L(48bytesormore) /* LEN >= 48 */
44
45 add %rcx, %rsi /* short input: point both past the end, the tail code uses negative offsets */
46 add %rcx, %rdi
47 jmp L(less48bytes)
48
49 .p2align 4
50 /* RCX >= 48. Compare the first 16 bytes unaligned, then align rdi and dispatch on the residual alignment of rsi. */
51 L(48bytesormore):
52 movdqu (%rdi), %xmm3
53 movdqu (%rsi), %xmm0
54 pcmpeqb %xmm0, %xmm3 /* 0xff in every byte position that matches */
55 pmovmskb %xmm3, %edx /* edx = 16-bit mask, one bit per matching byte */
56 lea 16(%rdi), %rdi
57 lea 16(%rsi), %rsi
58 sub $0xffff, %edx /* zero only when all 16 bytes matched */
59 jnz L(less16bytes) /* difference within the first 16 bytes */
60 mov %edi, %edx
61 and $0xf, %edx /* edx = offset of rdi within its 16-byte block */
62 xor %rdx, %rdi /* round rdi down to a 16-byte boundary */
63 sub %rdx, %rsi /* move rsi back by the same amount */
64 add %rdx, %rcx /* those bytes are compared again, so count them */
65 mov %esi, %edx
66 and $0xf, %edx /* edx = offset of rsi within its 16-byte block */
67 jz L(shr_0) /* both pointers aligned */
68 xor %rdx, %rsi /* round rsi down, edx keeps the shift for palignr */
69
70 # ifndef USE_AS_WMEMCMP
71 cmp $8, %edx
72 jae L(next_unaligned_table)
73 cmp $0, %edx /* never equal here: the jz above already took the edx == 0 case */
74 je L(shr_0)
75 cmp $1, %edx
76 je L(shr_1)
77 cmp $2, %edx
78 je L(shr_2)
79 cmp $3, %edx
80 je L(shr_3)
81 cmp $4, %edx
82 je L(shr_4)
83 cmp $5, %edx
84 je L(shr_5)
85 cmp $6, %edx
86 je L(shr_6)
87 jmp L(shr_7)
88
89 .p2align 2
90 L(next_unaligned_table): /* shifts 8..15 */
91 cmp $8, %edx
92 je L(shr_8)
93 cmp $9, %edx
94 je L(shr_9)
95 cmp $10, %edx
96 je L(shr_10)
97 cmp $11, %edx
98 je L(shr_11)
99 cmp $12, %edx
100 je L(shr_12)
101 cmp $13, %edx
102 je L(shr_13)
103 cmp $14, %edx
104 je L(shr_14)
105 jmp L(shr_15)
106 # else
107 cmp $0, %edx /* NOTE(review): only shifts 0, 4, 8 and 12 are handled, presumably because wide-char buffers are 4-byte aligned - confirm */
108 je L(shr_0)
109 cmp $4, %edx
110 je L(shr_4)
111 cmp $8, %edx
112 je L(shr_8)
113 jmp L(shr_12)
114 # endif
115
116 .p2align 4
117 L(shr_0): /* Both pointers are 16-byte aligned: compare with aligned loads, no palignr needed. */
118 cmp $80, %rcx
119 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
120 jae L(shr_0_gobble) /* rcx was >= 80 */
121 xor %eax, %eax /* rax = 0: no shift to add back to rsi at L(exit) */
122 movdqa (%rsi), %xmm1
123 pcmpeqb (%rdi), %xmm1
124 movdqa 16(%rsi), %xmm2
125 pcmpeqb 16(%rdi), %xmm2
126 pand %xmm1, %xmm2 /* merge both results; xmm1 keeps the first block for L(exit) */
127 pmovmskb %xmm2, %edx
128 lea 32(%rdi), %rdi
129 lea 32(%rsi), %rsi
130 sub $0xffff, %edx /* zero only when all 32 bytes matched */
131 jnz L(exit)
132 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
133 add %rcx, %rdi
134 jmp L(less48bytes)
135
136 .p2align 4
137 L(shr_0_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
138 movdqa (%rsi), %xmm0
139 xor %eax, %eax /* rax = 0: no shift to add back to rsi at L(exit) */
140 pcmpeqb (%rdi), %xmm0
141 sub $32, %rcx /* discount the look-ahead pair as well */
142 movdqa 16(%rsi), %xmm2
143 pcmpeqb 16(%rdi), %xmm2
144 L(shr_0_gobble_loop):
145 pand %xmm0, %xmm2
146 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
147 pmovmskb %xmm2, %edx
148 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
149 movdqa 32(%rsi), %xmm0
150 movdqa 48(%rsi), %xmm2
151 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
152 pcmpeqb 32(%rdi), %xmm0
153 pcmpeqb 48(%rdi), %xmm2
154 lea 32(%rdi), %rdi /* lea leaves the sbb flags for the jz */
155 lea 32(%rsi), %rsi
156 jz L(shr_0_gobble_loop)
157
158 pand %xmm0, %xmm2
159 cmp $0, %rcx
160 jge L(next)
161 inc %edx /* rcx went negative: cancel the borrow in edx */
162 add $32, %rcx /* and restore the remaining byte count */
163 L(next):
164 test %edx, %edx
165 jnz L(exit) /* mismatch in the pair tested inside the loop */
166
167 pmovmskb %xmm2, %edx /* now test the pair that was loaded ahead */
168 movdqa %xmm0, %xmm1
169 lea 32(%rdi), %rdi
170 lea 32(%rsi), %rsi
171 sub $0xffff, %edx
172 jnz L(exit)
173 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
174 add %rcx, %rdi
175 jmp L(less48bytes)
176
177 # ifndef USE_AS_WMEMCMP
178
179 .p2align 4
180 L(shr_1): /* Source data starts 1 byte above the aligned rsi; palignr $1 rebuilds each 16-byte source block. */
181 cmp $80, %rcx
182 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
183 mov %edx, %eax /* rax = 1, added back to rsi at L(exit) */
184 jae L(shr_1_gobble) /* rcx was >= 80 */
185
186 movdqa 16(%rsi), %xmm1
187 movdqa %xmm1, %xmm2
188 palignr $1, (%rsi), %xmm1 /* xmm1 = source bytes rsi+1 .. rsi+16 */
189 pcmpeqb (%rdi), %xmm1
190
191 movdqa 32(%rsi), %xmm3
192 palignr $1, %xmm2, %xmm3
193 pcmpeqb 16(%rdi), %xmm3
194
195 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
196 pmovmskb %xmm3, %edx
197 lea 32(%rdi), %rdi
198 lea 32(%rsi), %rsi
199 sub $0xffff, %edx /* zero only when all 32 bytes matched */
200 jnz L(exit)
201 add $1, %rsi /* back to the true source address */
202 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
203 add %rcx, %rdi
204 jmp L(less48bytes)
205
206 .p2align 4
207 L(shr_1_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
208 sub $32, %rcx /* discount the look-ahead pair as well */
209 movdqa 16(%rsi), %xmm0
210 palignr $1, (%rsi), %xmm0
211 pcmpeqb (%rdi), %xmm0
212
213 movdqa 32(%rsi), %xmm3
214 palignr $1, 16(%rsi), %xmm3
215 pcmpeqb 16(%rdi), %xmm3
216
217 L(shr_1_gobble_loop):
218 pand %xmm0, %xmm3
219 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
220 pmovmskb %xmm3, %edx
221 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
222
223 movdqa 64(%rsi), %xmm3
224 palignr $1, 48(%rsi), %xmm3
225 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
226 movdqa 48(%rsi), %xmm0
227 palignr $1, 32(%rsi), %xmm0
228 pcmpeqb 32(%rdi), %xmm0
229 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
230 pcmpeqb 48(%rdi), %xmm3
231
232 lea 32(%rdi), %rdi
233 jz L(shr_1_gobble_loop)
234 pand %xmm0, %xmm3
235
236 cmp $0, %rcx
237 jge L(shr_1_gobble_next)
238 inc %edx /* rcx went negative: cancel the borrow in edx */
239 add $32, %rcx /* and restore the remaining byte count */
240 L(shr_1_gobble_next):
241 test %edx, %edx
242 jnz L(exit) /* mismatch in the pair tested inside the loop */
243
244 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
245 movdqa %xmm0, %xmm1
246 lea 32(%rdi), %rdi
247 lea 32(%rsi), %rsi
248 sub $0xffff, %edx
249 jnz L(exit)
250
251 lea 1(%rsi), %rsi /* back to the true source address */
252 add %rcx, %rsi
253 add %rcx, %rdi
254 jmp L(less48bytes)
255
256
257 .p2align 4
258 L(shr_2): /* Source data starts 2 bytes above the aligned rsi; palignr $2 rebuilds each 16-byte source block. */
259 cmp $80, %rcx
260 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
261 mov %edx, %eax /* rax = 2, added back to rsi at L(exit) */
262 jae L(shr_2_gobble) /* rcx was >= 80 */
263
264 movdqa 16(%rsi), %xmm1
265 movdqa %xmm1, %xmm2
266 palignr $2, (%rsi), %xmm1 /* xmm1 = source bytes rsi+2 .. rsi+17 */
267 pcmpeqb (%rdi), %xmm1
268
269 movdqa 32(%rsi), %xmm3
270 palignr $2, %xmm2, %xmm3
271 pcmpeqb 16(%rdi), %xmm3
272
273 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
274 pmovmskb %xmm3, %edx
275 lea 32(%rdi), %rdi
276 lea 32(%rsi), %rsi
277 sub $0xffff, %edx /* zero only when all 32 bytes matched */
278 jnz L(exit)
279 add $2, %rsi /* back to the true source address */
280 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
281 add %rcx, %rdi
282 jmp L(less48bytes)
283
284 .p2align 4
285 L(shr_2_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
286 sub $32, %rcx /* discount the look-ahead pair as well */
287 movdqa 16(%rsi), %xmm0
288 palignr $2, (%rsi), %xmm0
289 pcmpeqb (%rdi), %xmm0
290
291 movdqa 32(%rsi), %xmm3
292 palignr $2, 16(%rsi), %xmm3
293 pcmpeqb 16(%rdi), %xmm3
294
295 L(shr_2_gobble_loop):
296 pand %xmm0, %xmm3
297 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
298 pmovmskb %xmm3, %edx
299 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
300
301 movdqa 64(%rsi), %xmm3
302 palignr $2, 48(%rsi), %xmm3
303 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
304 movdqa 48(%rsi), %xmm0
305 palignr $2, 32(%rsi), %xmm0
306 pcmpeqb 32(%rdi), %xmm0
307 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
308 pcmpeqb 48(%rdi), %xmm3
309
310 lea 32(%rdi), %rdi
311 jz L(shr_2_gobble_loop)
312 pand %xmm0, %xmm3
313
314 cmp $0, %rcx
315 jge L(shr_2_gobble_next)
316 inc %edx /* rcx went negative: cancel the borrow in edx */
317 add $32, %rcx /* and restore the remaining byte count */
318 L(shr_2_gobble_next):
319 test %edx, %edx
320 jnz L(exit) /* mismatch in the pair tested inside the loop */
321
322 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
323 movdqa %xmm0, %xmm1
324 lea 32(%rdi), %rdi
325 lea 32(%rsi), %rsi
326 sub $0xffff, %edx
327 jnz L(exit)
328
329 lea 2(%rsi), %rsi /* back to the true source address */
330 add %rcx, %rsi
331 add %rcx, %rdi
332 jmp L(less48bytes)
333
334 .p2align 4
335 L(shr_3): /* Source data starts 3 bytes above the aligned rsi; palignr $3 rebuilds each 16-byte source block. */
336 cmp $80, %rcx
337 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
338 mov %edx, %eax /* rax = 3, added back to rsi at L(exit) */
339 jae L(shr_3_gobble) /* rcx was >= 80 */
340
341 movdqa 16(%rsi), %xmm1
342 movdqa %xmm1, %xmm2
343 palignr $3, (%rsi), %xmm1 /* xmm1 = source bytes rsi+3 .. rsi+18 */
344 pcmpeqb (%rdi), %xmm1
345
346 movdqa 32(%rsi), %xmm3
347 palignr $3, %xmm2, %xmm3
348 pcmpeqb 16(%rdi), %xmm3
349
350 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
351 pmovmskb %xmm3, %edx
352 lea 32(%rdi), %rdi
353 lea 32(%rsi), %rsi
354 sub $0xffff, %edx /* zero only when all 32 bytes matched */
355 jnz L(exit)
356 add $3, %rsi /* back to the true source address */
357 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
358 add %rcx, %rdi
359 jmp L(less48bytes)
360
361 .p2align 4
362 L(shr_3_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
363 sub $32, %rcx /* discount the look-ahead pair as well */
364 movdqa 16(%rsi), %xmm0
365 palignr $3, (%rsi), %xmm0
366 pcmpeqb (%rdi), %xmm0
367
368 movdqa 32(%rsi), %xmm3
369 palignr $3, 16(%rsi), %xmm3
370 pcmpeqb 16(%rdi), %xmm3
371
372 L(shr_3_gobble_loop):
373 pand %xmm0, %xmm3
374 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
375 pmovmskb %xmm3, %edx
376 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
377
378 movdqa 64(%rsi), %xmm3
379 palignr $3, 48(%rsi), %xmm3
380 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
381 movdqa 48(%rsi), %xmm0
382 palignr $3, 32(%rsi), %xmm0
383 pcmpeqb 32(%rdi), %xmm0
384 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
385 pcmpeqb 48(%rdi), %xmm3
386
387 lea 32(%rdi), %rdi
388 jz L(shr_3_gobble_loop)
389 pand %xmm0, %xmm3
390
391 cmp $0, %rcx
392 jge L(shr_3_gobble_next)
393 inc %edx /* rcx went negative: cancel the borrow in edx */
394 add $32, %rcx /* and restore the remaining byte count */
395 L(shr_3_gobble_next):
396 test %edx, %edx
397 jnz L(exit) /* mismatch in the pair tested inside the loop */
398
399 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
400 movdqa %xmm0, %xmm1
401 lea 32(%rdi), %rdi
402 lea 32(%rsi), %rsi
403 sub $0xffff, %edx
404 jnz L(exit)
405
406 lea 3(%rsi), %rsi /* back to the true source address */
407 add %rcx, %rsi
408 add %rcx, %rdi
409 jmp L(less48bytes)
410
411 # endif
412
413 .p2align 4
414 L(shr_4): /* Source data starts 4 bytes above the aligned rsi; palignr $4 rebuilds each 16-byte source block. */
415 cmp $80, %rcx
416 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
417 mov %edx, %eax /* rax = 4, added back to rsi at L(exit) */
418 jae L(shr_4_gobble) /* rcx was >= 80 */
419
420 movdqa 16(%rsi), %xmm1
421 movdqa %xmm1, %xmm2
422 palignr $4, (%rsi), %xmm1 /* xmm1 = source bytes rsi+4 .. rsi+19 */
423 pcmpeqb (%rdi), %xmm1
424
425 movdqa 32(%rsi), %xmm3
426 palignr $4, %xmm2, %xmm3
427 pcmpeqb 16(%rdi), %xmm3
428
429 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
430 pmovmskb %xmm3, %edx
431 lea 32(%rdi), %rdi
432 lea 32(%rsi), %rsi
433 sub $0xffff, %edx /* zero only when all 32 bytes matched */
434 jnz L(exit)
435 add $4, %rsi /* back to the true source address */
436 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
437 add %rcx, %rdi
438 jmp L(less48bytes)
439
440 .p2align 4
441 L(shr_4_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
442 sub $32, %rcx /* discount the look-ahead pair as well */
443 movdqa 16(%rsi), %xmm0
444 palignr $4, (%rsi), %xmm0
445 pcmpeqb (%rdi), %xmm0
446
447 movdqa 32(%rsi), %xmm3
448 palignr $4, 16(%rsi), %xmm3
449 pcmpeqb 16(%rdi), %xmm3
450
451 L(shr_4_gobble_loop):
452 pand %xmm0, %xmm3
453 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
454 pmovmskb %xmm3, %edx
455 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
456
457 movdqa 64(%rsi), %xmm3
458 palignr $4, 48(%rsi), %xmm3
459 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
460 movdqa 48(%rsi), %xmm0
461 palignr $4, 32(%rsi), %xmm0
462 pcmpeqb 32(%rdi), %xmm0
463 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
464 pcmpeqb 48(%rdi), %xmm3
465
466 lea 32(%rdi), %rdi
467 jz L(shr_4_gobble_loop)
468 pand %xmm0, %xmm3
469
470 cmp $0, %rcx
471 jge L(shr_4_gobble_next)
472 inc %edx /* rcx went negative: cancel the borrow in edx */
473 add $32, %rcx /* and restore the remaining byte count */
474 L(shr_4_gobble_next):
475 test %edx, %edx
476 jnz L(exit) /* mismatch in the pair tested inside the loop */
477
478 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
479 movdqa %xmm0, %xmm1
480 lea 32(%rdi), %rdi
481 lea 32(%rsi), %rsi
482 sub $0xffff, %edx
483 jnz L(exit)
484
485 lea 4(%rsi), %rsi /* back to the true source address */
486 add %rcx, %rsi
487 add %rcx, %rdi
488 jmp L(less48bytes)
489
490 # ifndef USE_AS_WMEMCMP
491
492 .p2align 4
493 L(shr_5): /* Source data starts 5 bytes above the aligned rsi; palignr $5 rebuilds each 16-byte source block. */
494 cmp $80, %rcx
495 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
496 mov %edx, %eax /* rax = 5, added back to rsi at L(exit) */
497 jae L(shr_5_gobble) /* rcx was >= 80 */
498
499 movdqa 16(%rsi), %xmm1
500 movdqa %xmm1, %xmm2
501 palignr $5, (%rsi), %xmm1 /* xmm1 = source bytes rsi+5 .. rsi+20 */
502 pcmpeqb (%rdi), %xmm1
503
504 movdqa 32(%rsi), %xmm3
505 palignr $5, %xmm2, %xmm3
506 pcmpeqb 16(%rdi), %xmm3
507
508 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
509 pmovmskb %xmm3, %edx
510 lea 32(%rdi), %rdi
511 lea 32(%rsi), %rsi
512 sub $0xffff, %edx /* zero only when all 32 bytes matched */
513 jnz L(exit)
514 add $5, %rsi /* back to the true source address */
515 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
516 add %rcx, %rdi
517 jmp L(less48bytes)
518
519 .p2align 4
520 L(shr_5_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
521 sub $32, %rcx /* discount the look-ahead pair as well */
522 movdqa 16(%rsi), %xmm0
523 palignr $5, (%rsi), %xmm0
524 pcmpeqb (%rdi), %xmm0
525
526 movdqa 32(%rsi), %xmm3
527 palignr $5, 16(%rsi), %xmm3
528 pcmpeqb 16(%rdi), %xmm3
529
530 L(shr_5_gobble_loop):
531 pand %xmm0, %xmm3
532 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
533 pmovmskb %xmm3, %edx
534 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
535
536 movdqa 64(%rsi), %xmm3
537 palignr $5, 48(%rsi), %xmm3
538 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
539 movdqa 48(%rsi), %xmm0
540 palignr $5, 32(%rsi), %xmm0
541 pcmpeqb 32(%rdi), %xmm0
542 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
543 pcmpeqb 48(%rdi), %xmm3
544
545 lea 32(%rdi), %rdi
546 jz L(shr_5_gobble_loop)
547 pand %xmm0, %xmm3
548
549 cmp $0, %rcx
550 jge L(shr_5_gobble_next)
551 inc %edx /* rcx went negative: cancel the borrow in edx */
552 add $32, %rcx /* and restore the remaining byte count */
553 L(shr_5_gobble_next):
554 test %edx, %edx
555 jnz L(exit) /* mismatch in the pair tested inside the loop */
556
557 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
558 movdqa %xmm0, %xmm1
559 lea 32(%rdi), %rdi
560 lea 32(%rsi), %rsi
561 sub $0xffff, %edx
562 jnz L(exit)
563
564 lea 5(%rsi), %rsi /* back to the true source address */
565 add %rcx, %rsi
566 add %rcx, %rdi
567 jmp L(less48bytes)
568
569 .p2align 4
570 L(shr_6): /* Source data starts 6 bytes above the aligned rsi; palignr $6 rebuilds each 16-byte source block. */
571 cmp $80, %rcx
572 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
573 mov %edx, %eax /* rax = 6, added back to rsi at L(exit) */
574 jae L(shr_6_gobble) /* rcx was >= 80 */
575
576 movdqa 16(%rsi), %xmm1
577 movdqa %xmm1, %xmm2
578 palignr $6, (%rsi), %xmm1 /* xmm1 = source bytes rsi+6 .. rsi+21 */
579 pcmpeqb (%rdi), %xmm1
580
581 movdqa 32(%rsi), %xmm3
582 palignr $6, %xmm2, %xmm3
583 pcmpeqb 16(%rdi), %xmm3
584
585 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
586 pmovmskb %xmm3, %edx
587 lea 32(%rdi), %rdi
588 lea 32(%rsi), %rsi
589 sub $0xffff, %edx /* zero only when all 32 bytes matched */
590 jnz L(exit)
591 add $6, %rsi /* back to the true source address */
592 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
593 add %rcx, %rdi
594 jmp L(less48bytes)
595
596 .p2align 4
597 L(shr_6_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
598 sub $32, %rcx /* discount the look-ahead pair as well */
599 movdqa 16(%rsi), %xmm0
600 palignr $6, (%rsi), %xmm0
601 pcmpeqb (%rdi), %xmm0
602
603 movdqa 32(%rsi), %xmm3
604 palignr $6, 16(%rsi), %xmm3
605 pcmpeqb 16(%rdi), %xmm3
606
607 L(shr_6_gobble_loop):
608 pand %xmm0, %xmm3
609 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
610 pmovmskb %xmm3, %edx
611 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
612
613 movdqa 64(%rsi), %xmm3
614 palignr $6, 48(%rsi), %xmm3
615 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
616 movdqa 48(%rsi), %xmm0
617 palignr $6, 32(%rsi), %xmm0
618 pcmpeqb 32(%rdi), %xmm0
619 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
620 pcmpeqb 48(%rdi), %xmm3
621
622 lea 32(%rdi), %rdi
623 jz L(shr_6_gobble_loop)
624 pand %xmm0, %xmm3
625
626 cmp $0, %rcx
627 jge L(shr_6_gobble_next)
628 inc %edx /* rcx went negative: cancel the borrow in edx */
629 add $32, %rcx /* and restore the remaining byte count */
630 L(shr_6_gobble_next):
631 test %edx, %edx
632 jnz L(exit) /* mismatch in the pair tested inside the loop */
633
634 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
635 movdqa %xmm0, %xmm1
636 lea 32(%rdi), %rdi
637 lea 32(%rsi), %rsi
638 sub $0xffff, %edx
639 jnz L(exit)
640
641 lea 6(%rsi), %rsi /* back to the true source address */
642 add %rcx, %rsi
643 add %rcx, %rdi
644 jmp L(less48bytes)
645
646 .p2align 4
647 L(shr_7): /* Source data starts 7 bytes above the aligned rsi; palignr $7 rebuilds each 16-byte source block. */
648 cmp $80, %rcx
649 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
650 mov %edx, %eax /* rax = 7, added back to rsi at L(exit) */
651 jae L(shr_7_gobble) /* rcx was >= 80 */
652
653 movdqa 16(%rsi), %xmm1
654 movdqa %xmm1, %xmm2
655 palignr $7, (%rsi), %xmm1 /* xmm1 = source bytes rsi+7 .. rsi+22 */
656 pcmpeqb (%rdi), %xmm1
657
658 movdqa 32(%rsi), %xmm3
659 palignr $7, %xmm2, %xmm3
660 pcmpeqb 16(%rdi), %xmm3
661
662 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
663 pmovmskb %xmm3, %edx
664 lea 32(%rdi), %rdi
665 lea 32(%rsi), %rsi
666 sub $0xffff, %edx /* zero only when all 32 bytes matched */
667 jnz L(exit)
668 add $7, %rsi /* back to the true source address */
669 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
670 add %rcx, %rdi
671 jmp L(less48bytes)
672
673 .p2align 4
674 L(shr_7_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
675 sub $32, %rcx /* discount the look-ahead pair as well */
676 movdqa 16(%rsi), %xmm0
677 palignr $7, (%rsi), %xmm0
678 pcmpeqb (%rdi), %xmm0
679
680 movdqa 32(%rsi), %xmm3
681 palignr $7, 16(%rsi), %xmm3
682 pcmpeqb 16(%rdi), %xmm3
683
684 L(shr_7_gobble_loop):
685 pand %xmm0, %xmm3
686 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
687 pmovmskb %xmm3, %edx
688 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
689
690 movdqa 64(%rsi), %xmm3
691 palignr $7, 48(%rsi), %xmm3
692 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
693 movdqa 48(%rsi), %xmm0
694 palignr $7, 32(%rsi), %xmm0
695 pcmpeqb 32(%rdi), %xmm0
696 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
697 pcmpeqb 48(%rdi), %xmm3
698
699 lea 32(%rdi), %rdi
700 jz L(shr_7_gobble_loop)
701 pand %xmm0, %xmm3
702
703 cmp $0, %rcx
704 jge L(shr_7_gobble_next)
705 inc %edx /* rcx went negative: cancel the borrow in edx */
706 add $32, %rcx /* and restore the remaining byte count */
707 L(shr_7_gobble_next):
708 test %edx, %edx
709 jnz L(exit) /* mismatch in the pair tested inside the loop */
710
711 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
712 movdqa %xmm0, %xmm1
713 lea 32(%rdi), %rdi
714 lea 32(%rsi), %rsi
715 sub $0xffff, %edx
716 jnz L(exit)
717
718 lea 7(%rsi), %rsi /* back to the true source address */
719 add %rcx, %rsi
720 add %rcx, %rdi
721 jmp L(less48bytes)
722
723 # endif
724
725 .p2align 4
726 L(shr_8): /* Source data starts 8 bytes above the aligned rsi; palignr $8 rebuilds each 16-byte source block. */
727 cmp $80, %rcx
728 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
729 mov %edx, %eax /* rax = 8, added back to rsi at L(exit) */
730 jae L(shr_8_gobble) /* rcx was >= 80 */
731
732 movdqa 16(%rsi), %xmm1
733 movdqa %xmm1, %xmm2
734 palignr $8, (%rsi), %xmm1 /* xmm1 = source bytes rsi+8 .. rsi+23 */
735 pcmpeqb (%rdi), %xmm1
736
737 movdqa 32(%rsi), %xmm3
738 palignr $8, %xmm2, %xmm3
739 pcmpeqb 16(%rdi), %xmm3
740
741 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
742 pmovmskb %xmm3, %edx
743 lea 32(%rdi), %rdi
744 lea 32(%rsi), %rsi
745 sub $0xffff, %edx /* zero only when all 32 bytes matched */
746 jnz L(exit)
747 add $8, %rsi /* back to the true source address */
748 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
749 add %rcx, %rdi
750 jmp L(less48bytes)
751
752 .p2align 4
753 L(shr_8_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
754 sub $32, %rcx /* discount the look-ahead pair as well */
755 movdqa 16(%rsi), %xmm0
756 palignr $8, (%rsi), %xmm0
757 pcmpeqb (%rdi), %xmm0
758
759 movdqa 32(%rsi), %xmm3
760 palignr $8, 16(%rsi), %xmm3
761 pcmpeqb 16(%rdi), %xmm3
762
763 L(shr_8_gobble_loop):
764 pand %xmm0, %xmm3
765 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
766 pmovmskb %xmm3, %edx
767 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
768
769 movdqa 64(%rsi), %xmm3
770 palignr $8, 48(%rsi), %xmm3
771 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
772 movdqa 48(%rsi), %xmm0
773 palignr $8, 32(%rsi), %xmm0
774 pcmpeqb 32(%rdi), %xmm0
775 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
776 pcmpeqb 48(%rdi), %xmm3
777
778 lea 32(%rdi), %rdi
779 jz L(shr_8_gobble_loop)
780 pand %xmm0, %xmm3
781
782 cmp $0, %rcx
783 jge L(shr_8_gobble_next)
784 inc %edx /* rcx went negative: cancel the borrow in edx */
785 add $32, %rcx /* and restore the remaining byte count */
786 L(shr_8_gobble_next):
787 test %edx, %edx
788 jnz L(exit) /* mismatch in the pair tested inside the loop */
789
790 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
791 movdqa %xmm0, %xmm1
792 lea 32(%rdi), %rdi
793 lea 32(%rsi), %rsi
794 sub $0xffff, %edx
795 jnz L(exit)
796
797 lea 8(%rsi), %rsi /* back to the true source address */
798 add %rcx, %rsi
799 add %rcx, %rdi
800 jmp L(less48bytes)
801
802 # ifndef USE_AS_WMEMCMP
803
804 .p2align 4
805 L(shr_9): /* Source data starts 9 bytes above the aligned rsi; palignr $9 rebuilds each 16-byte source block. */
806 cmp $80, %rcx
807 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
808 mov %edx, %eax /* rax = 9, added back to rsi at L(exit) */
809 jae L(shr_9_gobble) /* rcx was >= 80 */
810
811 movdqa 16(%rsi), %xmm1
812 movdqa %xmm1, %xmm2
813 palignr $9, (%rsi), %xmm1 /* xmm1 = source bytes rsi+9 .. rsi+24 */
814 pcmpeqb (%rdi), %xmm1
815
816 movdqa 32(%rsi), %xmm3
817 palignr $9, %xmm2, %xmm3
818 pcmpeqb 16(%rdi), %xmm3
819
820 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
821 pmovmskb %xmm3, %edx
822 lea 32(%rdi), %rdi
823 lea 32(%rsi), %rsi
824 sub $0xffff, %edx /* zero only when all 32 bytes matched */
825 jnz L(exit)
826 add $9, %rsi /* back to the true source address */
827 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
828 add %rcx, %rdi
829 jmp L(less48bytes)
830
831 .p2align 4
832 L(shr_9_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
833 sub $32, %rcx /* discount the look-ahead pair as well */
834 movdqa 16(%rsi), %xmm0
835 palignr $9, (%rsi), %xmm0
836 pcmpeqb (%rdi), %xmm0
837
838 movdqa 32(%rsi), %xmm3
839 palignr $9, 16(%rsi), %xmm3
840 pcmpeqb 16(%rdi), %xmm3
841
842 L(shr_9_gobble_loop):
843 pand %xmm0, %xmm3
844 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
845 pmovmskb %xmm3, %edx
846 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
847
848 movdqa 64(%rsi), %xmm3
849 palignr $9, 48(%rsi), %xmm3
850 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
851 movdqa 48(%rsi), %xmm0
852 palignr $9, 32(%rsi), %xmm0
853 pcmpeqb 32(%rdi), %xmm0
854 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
855 pcmpeqb 48(%rdi), %xmm3
856
857 lea 32(%rdi), %rdi
858 jz L(shr_9_gobble_loop)
859 pand %xmm0, %xmm3
860
861 cmp $0, %rcx
862 jge L(shr_9_gobble_next)
863 inc %edx /* rcx went negative: cancel the borrow in edx */
864 add $32, %rcx /* and restore the remaining byte count */
865 L(shr_9_gobble_next):
866 test %edx, %edx
867 jnz L(exit) /* mismatch in the pair tested inside the loop */
868
869 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
870 movdqa %xmm0, %xmm1
871 lea 32(%rdi), %rdi
872 lea 32(%rsi), %rsi
873 sub $0xffff, %edx
874 jnz L(exit)
875
876 lea 9(%rsi), %rsi /* back to the true source address */
877 add %rcx, %rsi
878 add %rcx, %rdi
879 jmp L(less48bytes)
880
881 .p2align 4
882 L(shr_10): /* Source data starts 10 bytes above the aligned rsi; palignr $10 rebuilds each 16-byte source block. */
883 cmp $80, %rcx
884 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
885 mov %edx, %eax /* rax = 10, added back to rsi at L(exit) */
886 jae L(shr_10_gobble) /* rcx was >= 80 */
887
888 movdqa 16(%rsi), %xmm1
889 movdqa %xmm1, %xmm2
890 palignr $10, (%rsi), %xmm1 /* xmm1 = source bytes rsi+10 .. rsi+25 */
891 pcmpeqb (%rdi), %xmm1
892
893 movdqa 32(%rsi), %xmm3
894 palignr $10, %xmm2, %xmm3
895 pcmpeqb 16(%rdi), %xmm3
896
897 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
898 pmovmskb %xmm3, %edx
899 lea 32(%rdi), %rdi
900 lea 32(%rsi), %rsi
901 sub $0xffff, %edx /* zero only when all 32 bytes matched */
902 jnz L(exit)
903 add $10, %rsi /* back to the true source address */
904 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
905 add %rcx, %rdi
906 jmp L(less48bytes)
907
908 .p2align 4
909 L(shr_10_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
910 sub $32, %rcx /* discount the look-ahead pair as well */
911 movdqa 16(%rsi), %xmm0
912 palignr $10, (%rsi), %xmm0
913 pcmpeqb (%rdi), %xmm0
914
915 movdqa 32(%rsi), %xmm3
916 palignr $10, 16(%rsi), %xmm3
917 pcmpeqb 16(%rdi), %xmm3
918
919 L(shr_10_gobble_loop):
920 pand %xmm0, %xmm3
921 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
922 pmovmskb %xmm3, %edx
923 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
924
925 movdqa 64(%rsi), %xmm3
926 palignr $10, 48(%rsi), %xmm3
927 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
928 movdqa 48(%rsi), %xmm0
929 palignr $10, 32(%rsi), %xmm0
930 pcmpeqb 32(%rdi), %xmm0
931 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
932 pcmpeqb 48(%rdi), %xmm3
933
934 lea 32(%rdi), %rdi
935 jz L(shr_10_gobble_loop)
936 pand %xmm0, %xmm3
937
938 cmp $0, %rcx
939 jge L(shr_10_gobble_next)
940 inc %edx /* rcx went negative: cancel the borrow in edx */
941 add $32, %rcx /* and restore the remaining byte count */
942 L(shr_10_gobble_next):
943 test %edx, %edx
944 jnz L(exit) /* mismatch in the pair tested inside the loop */
945
946 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
947 movdqa %xmm0, %xmm1
948 lea 32(%rdi), %rdi
949 lea 32(%rsi), %rsi
950 sub $0xffff, %edx
951 jnz L(exit)
952
953 lea 10(%rsi), %rsi /* back to the true source address */
954 add %rcx, %rsi
955 add %rcx, %rdi
956 jmp L(less48bytes)
957
958 .p2align 4
959 L(shr_11): /* Source data starts 11 bytes above the aligned rsi; palignr $11 rebuilds each 16-byte source block. */
960 cmp $80, %rcx
961 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
962 mov %edx, %eax /* rax = 11, added back to rsi at L(exit) */
963 jae L(shr_11_gobble) /* rcx was >= 80 */
964
965 movdqa 16(%rsi), %xmm1
966 movdqa %xmm1, %xmm2
967 palignr $11, (%rsi), %xmm1 /* xmm1 = source bytes rsi+11 .. rsi+26 */
968 pcmpeqb (%rdi), %xmm1
969
970 movdqa 32(%rsi), %xmm3
971 palignr $11, %xmm2, %xmm3
972 pcmpeqb 16(%rdi), %xmm3
973
974 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
975 pmovmskb %xmm3, %edx
976 lea 32(%rdi), %rdi
977 lea 32(%rsi), %rsi
978 sub $0xffff, %edx /* zero only when all 32 bytes matched */
979 jnz L(exit)
980 add $11, %rsi /* back to the true source address */
981 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
982 add %rcx, %rdi
983 jmp L(less48bytes)
984
985 .p2align 4
986 L(shr_11_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
987 sub $32, %rcx /* discount the look-ahead pair as well */
988 movdqa 16(%rsi), %xmm0
989 palignr $11, (%rsi), %xmm0
990 pcmpeqb (%rdi), %xmm0
991
992 movdqa 32(%rsi), %xmm3
993 palignr $11, 16(%rsi), %xmm3
994 pcmpeqb 16(%rdi), %xmm3
995
996 L(shr_11_gobble_loop):
997 pand %xmm0, %xmm3
998 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
999 pmovmskb %xmm3, %edx
1000 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
1001
1002 movdqa 64(%rsi), %xmm3
1003 palignr $11, 48(%rsi), %xmm3
1004 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
1005 movdqa 48(%rsi), %xmm0
1006 palignr $11, 32(%rsi), %xmm0
1007 pcmpeqb 32(%rdi), %xmm0
1008 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
1009 pcmpeqb 48(%rdi), %xmm3
1010
1011 lea 32(%rdi), %rdi
1012 jz L(shr_11_gobble_loop)
1013 pand %xmm0, %xmm3
1014
1015 cmp $0, %rcx
1016 jge L(shr_11_gobble_next)
1017 inc %edx /* rcx went negative: cancel the borrow in edx */
1018 add $32, %rcx /* and restore the remaining byte count */
1019 L(shr_11_gobble_next):
1020 test %edx, %edx
1021 jnz L(exit) /* mismatch in the pair tested inside the loop */
1022
1023 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
1024 movdqa %xmm0, %xmm1
1025 lea 32(%rdi), %rdi
1026 lea 32(%rsi), %rsi
1027 sub $0xffff, %edx
1028 jnz L(exit)
1029
1030 lea 11(%rsi), %rsi /* back to the true source address */
1031 add %rcx, %rsi
1032 add %rcx, %rdi
1033 jmp L(less48bytes)
1034
1035 # endif
1036
1037 .p2align 4
1038 L(shr_12): /* Source data starts 12 bytes above the aligned rsi; palignr $12 rebuilds each 16-byte source block. */
1039 cmp $80, %rcx
1040 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
1041 mov %edx, %eax /* rax = 12, added back to rsi at L(exit) */
1042 jae L(shr_12_gobble) /* rcx was >= 80 */
1043
1044 movdqa 16(%rsi), %xmm1
1045 movdqa %xmm1, %xmm2
1046 palignr $12, (%rsi), %xmm1 /* xmm1 = source bytes rsi+12 .. rsi+27 */
1047 pcmpeqb (%rdi), %xmm1
1048
1049 movdqa 32(%rsi), %xmm3
1050 palignr $12, %xmm2, %xmm3
1051 pcmpeqb 16(%rdi), %xmm3
1052
1053 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
1054 pmovmskb %xmm3, %edx
1055 lea 32(%rdi), %rdi
1056 lea 32(%rsi), %rsi
1057 sub $0xffff, %edx /* zero only when all 32 bytes matched */
1058 jnz L(exit)
1059 add $12, %rsi /* back to the true source address */
1060 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
1061 add %rcx, %rdi
1062 jmp L(less48bytes)
1063
1064 .p2align 4
1065 L(shr_12_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
1066 sub $32, %rcx /* discount the look-ahead pair as well */
1067 movdqa 16(%rsi), %xmm0
1068 palignr $12, (%rsi), %xmm0
1069 pcmpeqb (%rdi), %xmm0
1070
1071 movdqa 32(%rsi), %xmm3
1072 palignr $12, 16(%rsi), %xmm3
1073 pcmpeqb 16(%rdi), %xmm3
1074
1075 L(shr_12_gobble_loop):
1076 pand %xmm0, %xmm3
1077 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
1078 pmovmskb %xmm3, %edx
1079 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
1080
1081 movdqa 64(%rsi), %xmm3
1082 palignr $12, 48(%rsi), %xmm3
1083 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
1084 movdqa 48(%rsi), %xmm0
1085 palignr $12, 32(%rsi), %xmm0
1086 pcmpeqb 32(%rdi), %xmm0
1087 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
1088 pcmpeqb 48(%rdi), %xmm3
1089
1090 lea 32(%rdi), %rdi
1091 jz L(shr_12_gobble_loop)
1092 pand %xmm0, %xmm3
1093
1094 cmp $0, %rcx
1095 jge L(shr_12_gobble_next)
1096 inc %edx /* rcx went negative: cancel the borrow in edx */
1097 add $32, %rcx /* and restore the remaining byte count */
1098 L(shr_12_gobble_next):
1099 test %edx, %edx
1100 jnz L(exit) /* mismatch in the pair tested inside the loop */
1101
1102 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
1103 movdqa %xmm0, %xmm1
1104 lea 32(%rdi), %rdi
1105 lea 32(%rsi), %rsi
1106 sub $0xffff, %edx
1107 jnz L(exit)
1108
1109 lea 12(%rsi), %rsi /* back to the true source address */
1110 add %rcx, %rsi
1111 add %rcx, %rdi
1112 jmp L(less48bytes)
1113
1114 # ifndef USE_AS_WMEMCMP
1115
1116 .p2align 4
1117 L(shr_13): /* Source data starts 13 bytes above the aligned rsi; palignr $13 rebuilds each 16-byte source block. */
1118 cmp $80, %rcx
1119 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
1120 mov %edx, %eax /* rax = 13, added back to rsi at L(exit) */
1121 jae L(shr_13_gobble) /* rcx was >= 80 */
1122
1123 movdqa 16(%rsi), %xmm1
1124 movdqa %xmm1, %xmm2
1125 palignr $13, (%rsi), %xmm1 /* xmm1 = source bytes rsi+13 .. rsi+28 */
1126 pcmpeqb (%rdi), %xmm1
1127
1128 movdqa 32(%rsi), %xmm3
1129 palignr $13, %xmm2, %xmm3
1130 pcmpeqb 16(%rdi), %xmm3
1131
1132 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
1133 pmovmskb %xmm3, %edx
1134 lea 32(%rdi), %rdi
1135 lea 32(%rsi), %rsi
1136 sub $0xffff, %edx /* zero only when all 32 bytes matched */
1137 jnz L(exit)
1138 add $13, %rsi /* back to the true source address */
1139 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
1140 add %rcx, %rdi
1141 jmp L(less48bytes)
1142
1143 .p2align 4
1144 L(shr_13_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
1145 sub $32, %rcx /* discount the look-ahead pair as well */
1146 movdqa 16(%rsi), %xmm0
1147 palignr $13, (%rsi), %xmm0
1148 pcmpeqb (%rdi), %xmm0
1149
1150 movdqa 32(%rsi), %xmm3
1151 palignr $13, 16(%rsi), %xmm3
1152 pcmpeqb 16(%rdi), %xmm3
1153
1154 L(shr_13_gobble_loop):
1155 pand %xmm0, %xmm3
1156 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
1157 pmovmskb %xmm3, %edx
1158 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
1159
1160 movdqa 64(%rsi), %xmm3
1161 palignr $13, 48(%rsi), %xmm3
1162 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
1163 movdqa 48(%rsi), %xmm0
1164 palignr $13, 32(%rsi), %xmm0
1165 pcmpeqb 32(%rdi), %xmm0
1166 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
1167 pcmpeqb 48(%rdi), %xmm3
1168
1169 lea 32(%rdi), %rdi
1170 jz L(shr_13_gobble_loop)
1171 pand %xmm0, %xmm3
1172
1173 cmp $0, %rcx
1174 jge L(shr_13_gobble_next)
1175 inc %edx /* rcx went negative: cancel the borrow in edx */
1176 add $32, %rcx /* and restore the remaining byte count */
1177 L(shr_13_gobble_next):
1178 test %edx, %edx
1179 jnz L(exit) /* mismatch in the pair tested inside the loop */
1180
1181 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
1182 movdqa %xmm0, %xmm1
1183 lea 32(%rdi), %rdi
1184 lea 32(%rsi), %rsi
1185 sub $0xffff, %edx
1186 jnz L(exit)
1187
1188 lea 13(%rsi), %rsi /* back to the true source address */
1189 add %rcx, %rsi
1190 add %rcx, %rdi
1191 jmp L(less48bytes)
1192
1193 .p2align 4
1194 L(shr_14): /* Source data starts 14 bytes above the aligned rsi; palignr $14 rebuilds each 16-byte source block. */
1195 cmp $80, %rcx
1196 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
1197 mov %edx, %eax /* rax = 14, added back to rsi at L(exit) */
1198 jae L(shr_14_gobble) /* rcx was >= 80 */
1199
1200 movdqa 16(%rsi), %xmm1
1201 movdqa %xmm1, %xmm2
1202 palignr $14, (%rsi), %xmm1 /* xmm1 = source bytes rsi+14 .. rsi+29 */
1203 pcmpeqb (%rdi), %xmm1
1204
1205 movdqa 32(%rsi), %xmm3
1206 palignr $14, %xmm2, %xmm3
1207 pcmpeqb 16(%rdi), %xmm3
1208
1209 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
1210 pmovmskb %xmm3, %edx
1211 lea 32(%rdi), %rdi
1212 lea 32(%rsi), %rsi
1213 sub $0xffff, %edx /* zero only when all 32 bytes matched */
1214 jnz L(exit)
1215 add $14, %rsi /* back to the true source address */
1216 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
1217 add %rcx, %rdi
1218 jmp L(less48bytes)
1219
1220 .p2align 4
1221 L(shr_14_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
1222 sub $32, %rcx /* discount the look-ahead pair as well */
1223 movdqa 16(%rsi), %xmm0
1224 palignr $14, (%rsi), %xmm0
1225 pcmpeqb (%rdi), %xmm0
1226
1227 movdqa 32(%rsi), %xmm3
1228 palignr $14, 16(%rsi), %xmm3
1229 pcmpeqb 16(%rdi), %xmm3
1230
1231 L(shr_14_gobble_loop):
1232 pand %xmm0, %xmm3
1233 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
1234 pmovmskb %xmm3, %edx
1235 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
1236
1237 movdqa 64(%rsi), %xmm3
1238 palignr $14, 48(%rsi), %xmm3
1239 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
1240 movdqa 48(%rsi), %xmm0
1241 palignr $14, 32(%rsi), %xmm0
1242 pcmpeqb 32(%rdi), %xmm0
1243 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
1244 pcmpeqb 48(%rdi), %xmm3
1245
1246 lea 32(%rdi), %rdi
1247 jz L(shr_14_gobble_loop)
1248 pand %xmm0, %xmm3
1249
1250 cmp $0, %rcx
1251 jge L(shr_14_gobble_next)
1252 inc %edx /* rcx went negative: cancel the borrow in edx */
1253 add $32, %rcx /* and restore the remaining byte count */
1254 L(shr_14_gobble_next):
1255 test %edx, %edx
1256 jnz L(exit) /* mismatch in the pair tested inside the loop */
1257
1258 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
1259 movdqa %xmm0, %xmm1
1260 lea 32(%rdi), %rdi
1261 lea 32(%rsi), %rsi
1262 sub $0xffff, %edx
1263 jnz L(exit)
1264
1265 lea 14(%rsi), %rsi /* back to the true source address */
1266 add %rcx, %rsi
1267 add %rcx, %rdi
1268 jmp L(less48bytes)
1269
1270 .p2align 4
1271 L(shr_15): /* Source data starts 15 bytes above the aligned rsi; palignr $15 rebuilds each 16-byte source block. */
1272 cmp $80, %rcx
1273 lea -48(%rcx), %rcx /* drop the 16 head bytes and the 32 compared next; lea keeps the cmp flags */
1274 mov %edx, %eax /* rax = 15, added back to rsi at L(exit) */
1275 jae L(shr_15_gobble) /* rcx was >= 80 */
1276
1277 movdqa 16(%rsi), %xmm1
1278 movdqa %xmm1, %xmm2
1279 palignr $15, (%rsi), %xmm1 /* xmm1 = source bytes rsi+15 .. rsi+30 */
1280 pcmpeqb (%rdi), %xmm1
1281
1282 movdqa 32(%rsi), %xmm3
1283 palignr $15, %xmm2, %xmm3
1284 pcmpeqb 16(%rdi), %xmm3
1285
1286 pand %xmm1, %xmm3 /* merge both results; xmm1 keeps the first block for L(exit) */
1287 pmovmskb %xmm3, %edx
1288 lea 32(%rdi), %rdi
1289 lea 32(%rsi), %rsi
1290 sub $0xffff, %edx /* zero only when all 32 bytes matched */
1291 jnz L(exit)
1292 add $15, %rsi /* back to the true source address */
1293 add %rcx, %rsi /* step both pointers over the remaining rcx bytes for L(less48bytes) */
1294 add %rcx, %rdi
1295 jmp L(less48bytes)
1296
1297 .p2align 4
1298 L(shr_15_gobble): /* Long input: 32 bytes per iteration, with the next pair loaded ahead. */
1299 sub $32, %rcx /* discount the look-ahead pair as well */
1300 movdqa 16(%rsi), %xmm0
1301 palignr $15, (%rsi), %xmm0
1302 pcmpeqb (%rdi), %xmm0
1303
1304 movdqa 32(%rsi), %xmm3
1305 palignr $15, 16(%rsi), %xmm3
1306 pcmpeqb 16(%rdi), %xmm3
1307
1308 L(shr_15_gobble_loop):
1309 pand %xmm0, %xmm3
1310 sub $32, %rcx /* a borrow here is folded into edx by the sbb below */
1311 pmovmskb %xmm3, %edx
1312 movdqa %xmm0, %xmm1 /* keep the first-block result for L(exit) */
1313
1314 movdqa 64(%rsi), %xmm3
1315 palignr $15, 48(%rsi), %xmm3
1316 sbb $0xffff, %edx /* nonzero on a mismatch or on borrow */
1317 movdqa 48(%rsi), %xmm0
1318 palignr $15, 32(%rsi), %xmm0
1319 pcmpeqb 32(%rdi), %xmm0
1320 lea 32(%rsi), %rsi /* lea leaves the sbb flags for the jz */
1321 pcmpeqb 48(%rdi), %xmm3
1322
1323 lea 32(%rdi), %rdi
1324 jz L(shr_15_gobble_loop)
1325 pand %xmm0, %xmm3
1326
1327 cmp $0, %rcx
1328 jge L(shr_15_gobble_next)
1329 inc %edx /* rcx went negative: cancel the borrow in edx */
1330 add $32, %rcx /* and restore the remaining byte count */
1331 L(shr_15_gobble_next):
1332 test %edx, %edx
1333 jnz L(exit) /* mismatch in the pair tested inside the loop */
1334
1335 pmovmskb %xmm3, %edx /* now test the pair that was loaded ahead */
1336 movdqa %xmm0, %xmm1
1337 lea 32(%rdi), %rdi
1338 lea 32(%rsi), %rsi
1339 sub $0xffff, %edx
1340 jnz L(exit)
1341
1342 lea 15(%rsi), %rsi /* back to the true source address */
1343 add %rcx, %rsi
1344 add %rcx, %rdi
1345 jmp L(less48bytes)
1346 # endif
1347 .p2align 4
1348 L(exit): /* A 32-byte pair differs. In: rdi/rsi point 32 bytes past the pair start, xmm1 = compare result of its first 16 bytes, edx = merged mask minus 0xffff, rax = source shift. */
1349 pmovmskb %xmm1, %r8d
1350 sub $0xffff, %r8d /* zero when the first 16 bytes all matched */
1351 jz L(first16bytes) /* despite the label name, taken when the difference is in the second 16 bytes */
1352 lea -16(%rsi), %rsi /* difference is in the first block: step both pointers back onto its end */
1353 lea -16(%rdi), %rdi
1354 mov %r8d, %edx /* and use the first-block mask */
1355 L(first16bytes):
1356 add %rax, %rsi /* undo the 16-byte alignment of rsi, then fall through to L(less16bytes) */
1357 L(less16bytes): /* In: rdi/rsi point 16 bytes past a block that differs, edx = equality mask minus 0xffff, so its lowest set bit marks the first differing byte. */
1358 # ifndef USE_AS_WMEMCMP
1359 test %dl, %dl
1360 jz L(next_24_bytes) /* bytes 0..7 all match, look at bytes 8..15 */
1361
1362 test $0x01, %dl
1363 jnz L(Byte16)
1364
1365 test $0x02, %dl
1366 jnz L(Byte17)
1367
1368 test $0x04, %dl
1369 jnz L(Byte18)
1370
1371 test $0x08, %dl
1372 jnz L(Byte19)
1373
1374 test $0x10, %dl
1375 jnz L(Byte20)
1376
1377 test $0x20, %dl
1378 jnz L(Byte21)
1379
1380 test $0x40, %dl
1381 jnz L(Byte22)
1382
1383 movzbl -9(%rdi), %eax /* only bit 7 is left: the eighth byte of this half differs */
1384 movzbl -9(%rsi), %edx
1385 sub %edx, %eax /* zero-extended bytes, so the sign follows the unsigned byte order */
1386 ret
1387
1388 .p2align 4
1389 L(Byte16): /* Each L(ByteNN) returns the difference of the zero-extended bytes at the given negative offset. */
1390 movzbl -16(%rdi), %eax
1391 movzbl -16(%rsi), %edx
1392 sub %edx, %eax
1393 ret
1394
1395 .p2align 4
1396 L(Byte17):
1397 movzbl -15(%rdi), %eax
1398 movzbl -15(%rsi), %edx
1399 sub %edx, %eax
1400 ret
1401
1402 .p2align 4
1403 L(Byte18):
1404 movzbl -14(%rdi), %eax
1405 movzbl -14(%rsi), %edx
1406 sub %edx, %eax
1407 ret
1408
1409 .p2align 4
1410 L(Byte19):
1411 movzbl -13(%rdi), %eax
1412 movzbl -13(%rsi), %edx
1413 sub %edx, %eax
1414 ret
1415
1416 .p2align 4
1417 L(Byte20):
1418 movzbl -12(%rdi), %eax
1419 movzbl -12(%rsi), %edx
1420 sub %edx, %eax
1421 ret
1422
1423 .p2align 4
1424 L(Byte21):
1425 movzbl -11(%rdi), %eax
1426 movzbl -11(%rsi), %edx
1427 sub %edx, %eax
1428 ret
1429
1430 .p2align 4
1431 L(Byte22):
1432 movzbl -10(%rdi), %eax
1433 movzbl -10(%rsi), %edx
1434 sub %edx, %eax
1435 ret
1436
1437 .p2align 4
1438 L(next_24_bytes): /* First difference is in bytes 8..15 of the block. */
1439 lea 8(%rdi), %rdi /* advance by 8 so the L(ByteNN) offsets can be reused */
1440 lea 8(%rsi), %rsi
1441 test $0x01, %dh
1442 jnz L(Byte16)
1443
1444 test $0x02, %dh
1445 jnz L(Byte17)
1446
1447 test $0x04, %dh
1448 jnz L(Byte18)
1449
1450 test $0x08, %dh
1451 jnz L(Byte19)
1452
1453 test $0x10, %dh
1454 jnz L(Byte20)
1455
1456 test $0x20, %dh
1457 jnz L(Byte21)
1458
1459 test $0x40, %dh
1460 jnz L(Byte22)
1461
1462 movzbl -9(%rdi), %eax /* only bit 7 of dh is left: the last byte of the block differs */
1463 movzbl -9(%rsi), %edx
1464 sub %edx, %eax
1465 ret
1466 # else
1467 /* special for wmemcmp: locate the first differing 4-byte element and let L(find_diff) (outside this excerpt) produce the result */
1468 xor %eax, %eax
1469 test %dl, %dl
1470 jz L(next_two_double_words) /* bytes 0..7 all match */
1471 and $15, %dl
1472 jz L(second_double_word) /* bytes 0..3 all match */
1473 mov -16(%rdi), %eax
1474 cmp -16(%rsi), %eax
1475 jne L(find_diff) /* the mask says this element differs, so the jump is expected to be taken */
1476 ret
1477
1478 .p2align 4
1479 L(second_double_word):
1480 mov -12(%rdi), %eax
1481 cmp -12(%rsi), %eax
1482 jne L(find_diff)
1483 ret
1484
1485 .p2align 4
1486 L(next_two_double_words):
1487 and $15, %dh
1488 jz L(fourth_double_word) /* bytes 8..11 all match */
1489 mov -8(%rdi), %eax
1490 cmp -8(%rsi), %eax
1491 jne L(find_diff)
1492 ret
1493
1494 .p2align 4
1495 L(fourth_double_word):
1496 mov -4(%rdi), %eax
1497 cmp -4(%rsi), %eax
1498 jne L(find_diff)
1499 ret
1500 # endif
1501
1502 .p2align 4
1503 L(less48bytes): /* Tail dispatch. In: ecx = bytes still to compare (below 48), rdi/rsi point just past them. Jumps to the L(Nbytes) entry for N = ecx. */
1504 cmp $8, %ecx
1505 jae L(more8bytes)
1506 cmp $0, %ecx
1507 je L(0bytes) /* nothing left; L(0bytes) is outside this excerpt */
1508 # ifndef USE_AS_WMEMCMP
1509 cmp $1, %ecx
1510 je L(1bytes)
1511 cmp $2, %ecx
1512 je L(2bytes)
1513 cmp $3, %ecx
1514 je L(3bytes)
1515 cmp $4, %ecx
1516 je L(4bytes)
1517 cmp $5, %ecx
1518 je L(5bytes)
1519 cmp $6, %ecx
1520 je L(6bytes)
1521 jmp L(7bytes)
1522 # else
1523 jmp L(4bytes) /* wmemcmp lengths are multiples of 4, so a nonzero count below 8 is 4 */
1524 # endif
1525
1526 .p2align 4
1527 L(more8bytes): /* Tail dispatch for ecx >= 8: handles 8..15 here, larger counts go on to L(more16bytes). */
1528 cmp $16, %ecx
1529 jae L(more16bytes)
1530 cmp $8, %ecx
1531 je L(8bytes)
1532 # ifndef USE_AS_WMEMCMP
1533 cmp $9, %ecx
1534 je L(9bytes)
1535 cmp $10, %ecx
1536 je L(10bytes)
1537 cmp $11, %ecx
1538 je L(11bytes)
1539 cmp $12, %ecx
1540 je L(12bytes)
1541 cmp $13, %ecx
1542 je L(13bytes)
1543 cmp $14, %ecx
1544 je L(14bytes)
1545 jmp L(15bytes)
1546 # else
1547 jmp L(12bytes) /* wmemcmp lengths are multiples of 4, so the only other count in 8..15 is 12 */
1548 # endif
1549
1550 .p2align 4
1551 L(more16bytes): /* Tail dispatch for ecx >= 16: handles 16..23 here, larger counts go on to L(more24bytes). */
1552 cmp $24, %ecx
1553 jae L(more24bytes)
1554 cmp $16, %ecx
1555 je L(16bytes)
1556 # ifndef USE_AS_WMEMCMP
1557 cmp $17, %ecx
1558 je L(17bytes)
1559 cmp $18, %ecx
1560 je L(18bytes)
1561 cmp $19, %ecx
1562 je L(19bytes)
1563 cmp $20, %ecx
1564 je L(20bytes)
1565 cmp $21, %ecx
1566 je L(21bytes)
1567 cmp $22, %ecx
1568 je L(22bytes)
1569 jmp L(23bytes)
1570 # else
1571 jmp L(20bytes) /* wmemcmp lengths are multiples of 4, so the only other count in 16..23 is 20 */
1572 # endif
1573
1574 .p2align 4
1575 L(more24bytes): /* Tail dispatch for ecx >= 24: handles 24..31 here, larger counts go on to L(more32bytes). */
1576 cmp $32, %ecx
1577 jae L(more32bytes)
1578 cmp $24, %ecx
1579 je L(24bytes)
1580 # ifndef USE_AS_WMEMCMP
1581 cmp $25, %ecx
1582 je L(25bytes)
1583 cmp $26, %ecx
1584 je L(26bytes)
1585 cmp $27, %ecx
1586 je L(27bytes)
1587 cmp $28, %ecx
1588 je L(28bytes)
1589 cmp $29, %ecx
1590 je L(29bytes)
1591 cmp $30, %ecx
1592 je L(30bytes)
1593 jmp L(31bytes)
1594 # else
1595 jmp L(28bytes) /* wmemcmp lengths are multiples of 4, so the only other count in 24..31 is 28 */
1596 # endif
1597
1598 .p2align 4
1599 L(more32bytes): /* %ecx >= 32 here (checked in L(more24bytes)); dispatch the counts 32..39. */
1600 cmp $40, %ecx
1601 jae L(more40bytes) /* unsigned test: 40 or more bytes remain */
1602 cmp $32, %ecx
1603 je L(32bytes)
1604 # ifndef USE_AS_WMEMCMP
1605 cmp $33, %ecx /* memcmp: linear search over the counts 33..38 */
1606 je L(33bytes)
1607 cmp $34, %ecx
1608 je L(34bytes)
1609 cmp $35, %ecx
1610 je L(35bytes)
1611 cmp $36, %ecx
1612 je L(36bytes)
1613 cmp $37, %ecx
1614 je L(37bytes)
1615 cmp $38, %ecx
1616 je L(38bytes)
1617 jmp L(39bytes) /* only 39 is left */
1618 # else
1619 jmp L(36bytes) /* wmemcmp: not 32, so 36 -- assumes %ecx is a multiple of 4 */
1620 # endif
1621
1622 .p2align 4
1623 L(more40bytes): /* %ecx >= 40 here (checked in L(more32bytes)). There is no upper-bound check: this code relies on %ecx < 48, which holds on the entry path at the top of the file. */
1624 cmp $40, %ecx
1625 je L(40bytes) /* wmemcmp build: nothing follows this branch, so any other count falls through into the next label in the file, L(44bytes) */
1626 # ifndef USE_AS_WMEMCMP
1627 cmp $41, %ecx /* memcmp: linear search over the counts 41..46 */
1628 je L(41bytes)
1629 cmp $42, %ecx
1630 je L(42bytes)
1631 cmp $43, %ecx
1632 je L(43bytes)
1633 cmp $44, %ecx
1634 je L(44bytes)
1635 cmp $45, %ecx
1636 je L(45bytes)
1637 cmp $46, %ecx
1638 je L(46bytes)
1639 jmp L(47bytes) /* every count that is not 40..46 is handled as 47 */
1640
1641 .p2align 4
1642 L(44bytes): /* memcmp ladder for counts that are multiples of 4: each rung compares one 32-bit word at a negative offset from %rdi/%rsi, then falls through to the rung 4 bytes shorter. */
1643 movl -44(%rdi), %eax /* word from the first buffer */
1644 movl -44(%rsi), %ecx /* word from the second buffer; both registers must be live for L(find_diff) */
1645 cmp %ecx, %eax
1646 jne L(find_diff) /* mismatch: L(find_diff) locates the differing byte within %eax/%ecx */
1647 L(40bytes):
1648 movl -40(%rdi), %eax
1649 movl -40(%rsi), %ecx
1650 cmp %ecx, %eax
1651 jne L(find_diff)
1652 L(36bytes):
1653 movl -36(%rdi), %eax
1654 movl -36(%rsi), %ecx
1655 cmp %ecx, %eax
1656 jne L(find_diff)
1657 L(32bytes):
1658 movl -32(%rdi), %eax
1659 movl -32(%rsi), %ecx
1660 cmp %ecx, %eax
1661 jne L(find_diff)
1662 L(28bytes):
1663 movl -28(%rdi), %eax
1664 movl -28(%rsi), %ecx
1665 cmp %ecx, %eax
1666 jne L(find_diff)
1667 L(24bytes):
1668 movl -24(%rdi), %eax
1669 movl -24(%rsi), %ecx
1670 cmp %ecx, %eax
1671 jne L(find_diff)
1672 L(20bytes):
1673 movl -20(%rdi), %eax
1674 movl -20(%rsi), %ecx
1675 cmp %ecx, %eax
1676 jne L(find_diff)
1677 L(16bytes):
1678 movl -16(%rdi), %eax
1679 movl -16(%rsi), %ecx
1680 cmp %ecx, %eax
1681 jne L(find_diff)
1682 L(12bytes):
1683 movl -12(%rdi), %eax
1684 movl -12(%rsi), %ecx
1685 cmp %ecx, %eax
1686 jne L(find_diff)
1687 L(8bytes):
1688 movl -8(%rdi), %eax
1689 movl -8(%rsi), %ecx
1690 cmp %ecx, %eax
1691 jne L(find_diff)
1692 L(4bytes):
1693 movl -4(%rdi), %eax
1694 movl -4(%rsi), %ecx
1695 cmp %ecx, %eax
1696 jne L(find_diff)
1697 L(0bytes): /* every word matched, or the count was 0 */
1698 xor %eax, %eax /* return 0: buffers are equal */
1699 ret
1700 # else
1701 .p2align 4
1702 L(44bytes): /* wmemcmp ladder: each rung compares one 32-bit element at a negative offset from %rdi/%rsi, then falls through to the rung one element shorter. */
1703 movl -44(%rdi), %eax /* element from the first buffer */
1704 cmp -44(%rsi), %eax /* flags = first - second; only the flags are passed on, %ecx is not loaded */
1705 jne L(find_diff) /* mismatch: the wmemcmp L(find_diff) turns these flags into the result */
1706 L(40bytes):
1707 movl -40(%rdi), %eax
1708 cmp -40(%rsi), %eax
1709 jne L(find_diff)
1710 L(36bytes):
1711 movl -36(%rdi), %eax
1712 cmp -36(%rsi), %eax
1713 jne L(find_diff)
1714 L(32bytes):
1715 movl -32(%rdi), %eax
1716 cmp -32(%rsi), %eax
1717 jne L(find_diff)
1718 L(28bytes):
1719 movl -28(%rdi), %eax
1720 cmp -28(%rsi), %eax
1721 jne L(find_diff)
1722 L(24bytes):
1723 movl -24(%rdi), %eax
1724 cmp -24(%rsi), %eax
1725 jne L(find_diff)
1726 L(20bytes):
1727 movl -20(%rdi), %eax
1728 cmp -20(%rsi), %eax
1729 jne L(find_diff)
1730 L(16bytes):
1731 movl -16(%rdi), %eax
1732 cmp -16(%rsi), %eax
1733 jne L(find_diff)
1734 L(12bytes):
1735 movl -12(%rdi), %eax
1736 cmp -12(%rsi), %eax
1737 jne L(find_diff)
1738 L(8bytes):
1739 movl -8(%rdi), %eax
1740 cmp -8(%rsi), %eax
1741 jne L(find_diff)
1742 L(4bytes):
1743 movl -4(%rdi), %eax
1744 cmp -4(%rsi), %eax
1745 jne L(find_diff)
1746 L(0bytes): /* every element matched, or the count was 0 */
1747 xor %eax, %eax /* return 0: buffers are equal */
1748 ret
1749 # endif
1750
1751 # ifndef USE_AS_WMEMCMP
1752 .p2align 4
1753 L(45bytes): /* memcmp ladder for counts of the form 4k+1: 32-bit words from offset -45 down to -5, then the single last byte at L(1bytes). */
1754 movl -45(%rdi), %eax /* word from the first buffer */
1755 movl -45(%rsi), %ecx /* word from the second buffer; both registers must be live for L(find_diff) */
1756 cmp %ecx, %eax
1757 jne L(find_diff)
1758 L(41bytes):
1759 movl -41(%rdi), %eax
1760 movl -41(%rsi), %ecx
1761 cmp %ecx, %eax
1762 jne L(find_diff)
1763 L(37bytes):
1764 movl -37(%rdi), %eax
1765 movl -37(%rsi), %ecx
1766 cmp %ecx, %eax
1767 jne L(find_diff)
1768 L(33bytes):
1769 movl -33(%rdi), %eax
1770 movl -33(%rsi), %ecx
1771 cmp %ecx, %eax
1772 jne L(find_diff)
1773 L(29bytes):
1774 movl -29(%rdi), %eax
1775 movl -29(%rsi), %ecx
1776 cmp %ecx, %eax
1777 jne L(find_diff)
1778 L(25bytes):
1779 movl -25(%rdi), %eax
1780 movl -25(%rsi), %ecx
1781 cmp %ecx, %eax
1782 jne L(find_diff)
1783 L(21bytes):
1784 movl -21(%rdi), %eax
1785 movl -21(%rsi), %ecx
1786 cmp %ecx, %eax
1787 jne L(find_diff)
1788 L(17bytes):
1789 movl -17(%rdi), %eax
1790 movl -17(%rsi), %ecx
1791 cmp %ecx, %eax
1792 jne L(find_diff)
1793 L(13bytes):
1794 movl -13(%rdi), %eax
1795 movl -13(%rsi), %ecx
1796 cmp %ecx, %eax
1797 jne L(find_diff)
1798 L(9bytes):
1799 movl -9(%rdi), %eax
1800 movl -9(%rsi), %ecx
1801 cmp %ecx, %eax
1802 jne L(find_diff)
1803 L(5bytes):
1804 movl -5(%rdi), %eax
1805 movl -5(%rsi), %ecx
1806 cmp %ecx, %eax
1807 jne L(find_diff)
1808 L(1bytes): /* final byte */
1809 movzbl -1(%rdi), %eax /* zero-extended last byte of the first buffer */
1810 cmpb -1(%rsi), %al /* unsigned byte compare; CF is set when first < second */
1811 jne L(set) /* L(set) converts that CF into -1 or +1 */
1812 xor %eax, %eax /* return 0: buffers are equal */
1813 ret
1814
1815 .p2align 4
1816 L(46bytes): /* memcmp ladder for counts of the form 4k+2: 32-bit words from offset -46 down to -6, then the last two bytes at L(2bytes). */
1817 movl -46(%rdi), %eax /* word from the first buffer */
1818 movl -46(%rsi), %ecx /* word from the second buffer; both registers must be live for L(find_diff) */
1819 cmp %ecx, %eax
1820 jne L(find_diff)
1821 L(42bytes):
1822 movl -42(%rdi), %eax
1823 movl -42(%rsi), %ecx
1824 cmp %ecx, %eax
1825 jne L(find_diff)
1826 L(38bytes):
1827 movl -38(%rdi), %eax
1828 movl -38(%rsi), %ecx
1829 cmp %ecx, %eax
1830 jne L(find_diff)
1831 L(34bytes):
1832 movl -34(%rdi), %eax
1833 movl -34(%rsi), %ecx
1834 cmp %ecx, %eax
1835 jne L(find_diff)
1836 L(30bytes):
1837 movl -30(%rdi), %eax
1838 movl -30(%rsi), %ecx
1839 cmp %ecx, %eax
1840 jne L(find_diff)
1841 L(26bytes):
1842 movl -26(%rdi), %eax
1843 movl -26(%rsi), %ecx
1844 cmp %ecx, %eax
1845 jne L(find_diff)
1846 L(22bytes):
1847 movl -22(%rdi), %eax
1848 movl -22(%rsi), %ecx
1849 cmp %ecx, %eax
1850 jne L(find_diff)
1851 L(18bytes):
1852 movl -18(%rdi), %eax
1853 movl -18(%rsi), %ecx
1854 cmp %ecx, %eax
1855 jne L(find_diff)
1856 L(14bytes):
1857 movl -14(%rdi), %eax
1858 movl -14(%rsi), %ecx
1859 cmp %ecx, %eax
1860 jne L(find_diff)
1861 L(10bytes):
1862 movl -10(%rdi), %eax
1863 movl -10(%rsi), %ecx
1864 cmp %ecx, %eax
1865 jne L(find_diff)
1866 L(6bytes):
1867 movl -6(%rdi), %eax
1868 movl -6(%rsi), %ecx
1869 cmp %ecx, %eax
1870 jne L(find_diff)
1871 L(2bytes): /* final two bytes, loaded as one zero-extended 16-bit value per buffer */
1872 movzwl -2(%rdi), %eax
1873 movzwl -2(%rsi), %ecx
1874 cmpb %cl, %al /* lower-address byte first (little-endian load puts it in the low byte) */
1875 jne L(set)
1876 cmp %ecx, %eax /* low bytes are equal, so this is decided by the higher-address byte */
1877 jne L(set) /* L(set) converts CF from the unsigned compare into -1 or +1 */
1878 xor %eax, %eax /* return 0: buffers are equal */
1879 ret
1880
1881 .p2align 4
1882 L(47bytes): /* memcmp ladder for counts of the form 4k+3: 32-bit words from offset -47 down to -7, then the last three bytes at L(3bytes). */
1883 movl -47(%rdi), %eax /* word from the first buffer */
1884 movl -47(%rsi), %ecx /* word from the second buffer; both registers must be live for L(find_diff) */
1885 cmp %ecx, %eax
1886 jne L(find_diff)
1887 L(43bytes):
1888 movl -43(%rdi), %eax
1889 movl -43(%rsi), %ecx
1890 cmp %ecx, %eax
1891 jne L(find_diff)
1892 L(39bytes):
1893 movl -39(%rdi), %eax
1894 movl -39(%rsi), %ecx
1895 cmp %ecx, %eax
1896 jne L(find_diff)
1897 L(35bytes):
1898 movl -35(%rdi), %eax
1899 movl -35(%rsi), %ecx
1900 cmp %ecx, %eax
1901 jne L(find_diff)
1902 L(31bytes):
1903 movl -31(%rdi), %eax
1904 movl -31(%rsi), %ecx
1905 cmp %ecx, %eax
1906 jne L(find_diff)
1907 L(27bytes):
1908 movl -27(%rdi), %eax
1909 movl -27(%rsi), %ecx
1910 cmp %ecx, %eax
1911 jne L(find_diff)
1912 L(23bytes):
1913 movl -23(%rdi), %eax
1914 movl -23(%rsi), %ecx
1915 cmp %ecx, %eax
1916 jne L(find_diff)
1917 L(19bytes):
1918 movl -19(%rdi), %eax
1919 movl -19(%rsi), %ecx
1920 cmp %ecx, %eax
1921 jne L(find_diff)
1922 L(15bytes):
1923 movl -15(%rdi), %eax
1924 movl -15(%rsi), %ecx
1925 cmp %ecx, %eax
1926 jne L(find_diff)
1927 L(11bytes):
1928 movl -11(%rdi), %eax
1929 movl -11(%rsi), %ecx
1930 cmp %ecx, %eax
1931 jne L(find_diff)
1932 L(7bytes):
1933 movl -7(%rdi), %eax
1934 movl -7(%rsi), %ecx
1935 cmp %ecx, %eax
1936 jne L(find_diff)
1937 L(3bytes): /* final three bytes: a 16-bit pair at -3, then the single byte at -1 */
1938 movzwl -3(%rdi), %eax
1939 movzwl -3(%rsi), %ecx
1940 cmpb %cl, %al /* byte at -3 (low byte of the little-endian load) */
1941 jne L(set)
1942 cmp %ecx, %eax /* low bytes are equal, so this is decided by the byte at -2 */
1943 jne L(set)
1944 movzbl -1(%rdi), %eax /* last byte of the first buffer */
1945 cmpb -1(%rsi), %al /* unsigned byte compare; CF is set when first < second */
1946 jne L(set) /* L(set) converts that CF into -1 or +1 */
1947 xor %eax, %eax /* return 0: buffers are equal */
1948 ret
1949
1950 .p2align 4
1951 L(find_diff): /* memcmp: %eax (first buffer) and %ecx (second buffer) hold two 32-bit words that differ. Find the lowest-address differing byte and return -1 or +1 by unsigned byte order. */
1952 cmpb %cl, %al /* byte 0: lowest address, since x86 loads are little-endian */
1953 jne L(set)
1954 cmpw %cx, %ax /* byte 0 is equal, so the 16-bit compare is decided by byte 1 */
1955 jne L(set)
1956 shr $16, %eax /* bring bytes 2 and 3 down into the low 16 bits */
1957 shr $16, %ecx
1958 cmpb %cl, %al /* byte 2 */
1959 jne L(set)
1960
1961 /* We get here only if we already know there is a
1962 difference. */
1963
1964 cmp %ecx, %eax /* bytes 0..2 are equal, so this compare is decided by byte 3 */
1965 L(set): /* entered with flags from an unsigned compare of unequal values: CF set means first < second */
1966 sbb %eax, %eax /* %eax = 0 if CF is clear, -1 if CF is set; CF itself comes out unchanged */
1967 sbb $-1, %eax /* %eax + 1 - CF: +1 if first > second, -1 if first < second */
1968 ret
1969 # else
1970
1971 /* wmemcmp version of L(find_diff): entered with the flags of `cmp second, first` on two unequal 32-bit elements; returns +1 or -1 by signed order. */
1972 .p2align 4
1973 L(find_diff):
1974 mov $1, %eax /* tentative result +1; mov leaves the flags from the caller's cmp intact */
1975 jg L(find_diff_bigger) /* signed greater: first element > second element */
1976 neg %eax /* otherwise first < second: return -1 */
1977 ret
1978
1979 .p2align 4
1980 L(find_diff_bigger): /* first > second: %eax is already +1 */
1981 ret
1982 # endif
1983
1984 .p2align 4
1985 L(equal): /* Return 0. In the visible parts of the file the only jump here is the zero-length check at the wmemcmp entry. */
1986 xor %eax, %eax /* result 0: buffers compare equal */
1987 ret
1988
1989 END (MEMCMP)
1990 #endif