]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/i386/i686/multiarch/memcmp-sse4.S
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / sysdeps / i386 / i686 / multiarch / memcmp-sse4.S
1 /* memcmp with SSE4.2, wmemcmp with SSE4.2
2 Copyright (C) 2010-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #if IS_IN (libc)
20
21 # include <sysdep.h>
22
/* Default name of the entry symbol; used only when MEMCMP has not been defined before this point. */
23 # ifndef MEMCMP
24 # define MEMCMP __memcmp_sse4_2
25 # endif
26
/* Unwind annotations that accompany a 4-byte push of REG: the CFA offset grows by 4 and REG is recorded at offset 0. */
/* NOTE(review): the cfi_* helpers are not defined in this file; presumably they come from <sysdep.h> - confirm. */
27 # define CFI_PUSH(REG) \
28 cfi_adjust_cfa_offset (4); \
29 cfi_rel_offset (REG, 0)
30
/* Unwind annotations that accompany a 4-byte pop of REG: the CFA offset shrinks by 4 and REG is marked restored. */
31 # define CFI_POP(REG) \
32 cfi_adjust_cfa_offset (-4); \
33 cfi_restore (REG)
34
/* pushl/popl paired with the matching unwind annotations above. */
35 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
36 # define POP(REG) popl REG; CFI_POP (REG)
37
/* Offsets of the three stack arguments relative to %esp at function entry, i.e. before any PUSH. */
/* PARMS is 4; on i386 the return address occupies the first 4 bytes at (%esp). */
38 # define PARMS 4
39 # define BLK1 PARMS
40 # define BLK2 BLK1 + 4
41 # define LEN BLK2 + 4
/* Restore %ebx and return. The trailing CFI_PUSH re-declares %ebx as pushed for the code that textually follows the ret. */
42 # define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
43
44
45 # ifdef PIC
/* PIC: each table entry holds the distance from the table base B to the target I. */
46 # define JMPTBL(I, B) I - B
47
48 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
49 jump table with relative offsets. INDEX is a register that contains the
50 index into the jump table. SCALE is the scale of INDEX. */
51
52 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
53 /* We first load PC into EBX. */ \
54 SETUP_PIC_REG(bx); \
55 /* Get the address of the jump table. */ \
56 addl $(TABLE - .), %ebx; \
57 /* Get the entry and convert the relative offset to the \
58 absolute address. */ \
59 addl (%ebx,INDEX,SCALE), %ebx; \
60 /* EBX now holds the absolute address of the target. Go. */ \
61 _CET_NOTRACK jmp *%ebx
62 # else
/* Non-PIC: each table entry holds the absolute address of the target I. */
63 # define JMPTBL(I, B) I
64
65 /* Branch through an entry in a jump table without touching EBX. TABLE
66 is a jump table with absolute addresses. INDEX is a register that
67 contains the index into the jump table. SCALE is the scale of INDEX. */
68 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
69 _CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
70 # endif
71
72
73 /* Warning!
74 wmemcmp has to use SIGNED comparison for elements.
75 memcmp has to use UNSIGNED comparison for elements.
76 */
77
/* Entry point. Loads the first block pointer into %eax, the second into %edx and the length into %ecx, */
/* then dispatches on the length. The result is returned in %eax. */
78 .section .text.sse4.2,"ax",@progbits
79 ENTRY (MEMCMP)
80 movl BLK1(%esp), %eax
81 movl BLK2(%esp), %edx
82 movl LEN(%esp), %ecx
83
84 # ifdef USE_AS_WMEMCMP
/* wmemcmp: scale the element count by 4 to get a byte count; a zero count returns 0 at once. */
85 shl $2, %ecx
86 test %ecx, %ecx
87 jz L(return0)
88 # else
/* memcmp: lengths 0 and 1 are handled at L(less1bytes), which reuses the flags of this cmp. */
89 cmp $1, %ecx
90 jbe L(less1bytes)
91 # endif
92
/* %xmm0 is zeroed here and serves as the second ptest operand in the 16-byte chunk checks of this function. */
93 pxor %xmm0, %xmm0
94 cmp $64, %ecx
95 ja L(64bytesormore)
96 cmp $8, %ecx
97
/* pushl does not modify the flags, so in both variants jb still tests the cmp $8 above. */
/* The memcmp L(less8bytes) uses %bl and pops %ebx, so %ebx is pushed before the branch; */
/* the wmemcmp L(less8bytes) never touches %ebx, so there the push follows the branch. */
98 # ifndef USE_AS_WMEMCMP
99 PUSH (%ebx)
100 jb L(less8bytes)
101 # else
102 jb L(less8bytes)
103 PUSH (%ebx)
104 # endif
105
/* 8..64 bytes: advance both pointers to one past the end; the table targets address the data with negative offsets. */
106 add %ecx, %edx
107 add %ecx, %eax
/* Dispatch on the byte count in %ecx (4-byte table entries). */
108 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
109
/* memcmp only: byte-by-byte compare for lengths 2..7. Reached with %ebx already pushed, */
/* %eax/%edx pointing at the start of the two blocks and %ecx holding the length. */
110 # ifndef USE_AS_WMEMCMP
111 .p2align 4
112 L(less8bytes):
/* The length is at least 2 here, so bytes 0 and 1 are compared without checking %ecx. */
113 mov (%eax), %bl
114 cmpb (%edx), %bl
115 jne L(nonzero)
116
117 mov 1(%eax), %bl
118 cmpb 1(%edx), %bl
119 jne L(nonzero)
120
/* From here on, stop with "equal" as soon as the number of bytes compared reaches the length. */
121 cmp $2, %ecx
122 jz L(0bytes)
123
124 mov 2(%eax), %bl
125 cmpb 2(%edx), %bl
126 jne L(nonzero)
127
128 cmp $3, %ecx
129 jz L(0bytes)
130
131 mov 3(%eax), %bl
132 cmpb 3(%edx), %bl
133 jne L(nonzero)
134
135 cmp $4, %ecx
136 jz L(0bytes)
137
138 mov 4(%eax), %bl
139 cmpb 4(%edx), %bl
140 jne L(nonzero)
141
142 cmp $5, %ecx
143 jz L(0bytes)
144
145 mov 5(%eax), %bl
146 cmpb 5(%edx), %bl
147 jne L(nonzero)
148
149 cmp $6, %ecx
150 jz L(0bytes)
151
/* Length 7: the last byte decides; on a mismatch fall through into L(nonzero). */
152 mov 6(%eax), %bl
153 cmpb 6(%edx), %bl
154 je L(0bytes)
155
/* The flags still come from the mismatching cmpb (popl and mov leave them untouched). */
/* Unsigned "above" means the byte of the first block is larger: return 1, otherwise -1. */
156 L(nonzero):
157 POP (%ebx)
158 mov $1, %eax
159 ja L(above)
160 neg %eax
161 L(above):
162 ret
/* Re-declare %ebx as pushed for the unwind info of the code that follows. */
163 CFI_PUSH (%ebx)
164 # endif
165
/* "Blocks are equal" exit for paths that have %ebx pushed: restore it and return 0. */
/* Also the target of entry 0 of L(table_64bytes). */
166 .p2align 4
167 L(0bytes):
168 POP (%ebx)
169 xor %eax, %eax
170 ret
171
172 # ifdef USE_AS_WMEMCMP
173
174 /* For wmemcmp, case N == 1: a single 4-byte element is compared. */
/* %ebx has not been pushed on this path, so every exit is a plain ret. */
175
176 .p2align 4
177 L(less8bytes):
178 mov (%eax), %ecx
179 cmp (%edx), %ecx
180 je L(return0)
/* mov does not modify the flags; jg is the signed test required for wmemcmp: 1 if greater, else -1. */
181 mov $1, %eax
182 jg L(find_diff_bigger)
183 neg %eax
184 ret
185
186 .p2align 4
187 L(find_diff_bigger):
188 ret
189
/* Return 0; also used by the entry code when the element count is zero. */
190 .p2align 4
191 L(return0):
192 xor %eax, %eax
193 ret
194 # endif
195
/* memcmp only: lengths 0 and 1. %ebx has not been pushed on this path. */
196 # ifndef USE_AS_WMEMCMP
197 .p2align 4
198 L(less1bytes):
/* The flags still come from the "cmp $1, %ecx" at entry: below means the length is 0. */
199 jb L(0bytesend)
/* Length 1: return the difference of the two zero-extended bytes. */
200 movzbl (%eax), %eax
201 movzbl (%edx), %edx
202 sub %edx, %eax
203 ret
204
205 .p2align 4
206 L(0bytesend):
207 xor %eax, %eax
208 ret
209 # endif
/* Lengths above 64: compare 64 bytes per iteration as four unaligned 16-byte chunks. */
/* %ebx = length - 64 (running), %ecx = 64 (the stride). */
210 .p2align 4
211 L(64bytesormore):
212 PUSH (%ebx)
213 mov %ecx, %ebx
214 mov $64, %ecx
215 sub $64, %ebx
216 L(64bytesormore_loop):
/* pxor leaves the XOR of the two chunks in %xmm2. With %xmm0 all zero, ptest sets CF only when */
/* %xmm2 is all zero, so jnc is taken when the chunks differ. */
217 movdqu (%eax), %xmm1
218 movdqu (%edx), %xmm2
219 pxor %xmm1, %xmm2
220 ptest %xmm2, %xmm0
221 jnc L(find_16diff)
222
223 movdqu 16(%eax), %xmm1
224 movdqu 16(%edx), %xmm2
225 pxor %xmm1, %xmm2
226 ptest %xmm2, %xmm0
227 jnc L(find_32diff)
228
229 movdqu 32(%eax), %xmm1
230 movdqu 32(%edx), %xmm2
231 pxor %xmm1, %xmm2
232 ptest %xmm2, %xmm0
233 jnc L(find_48diff)
234
235 movdqu 48(%eax), %xmm1
236 movdqu 48(%edx), %xmm2
237 pxor %xmm1, %xmm2
238 ptest %xmm2, %xmm0
239 jnc L(find_64diff)
/* All 64 bytes matched: step both pointers and keep looping while at least 64 bytes remain (no borrow). */
240 add %ecx, %eax
241 add %ecx, %edx
242 sub %ecx, %ebx
243 jae L(64bytesormore_loop)
/* %ebx is now (bytes left - 64), so %ecx becomes the 0..63 bytes left. Move both pointers to one */
/* past the end and dispatch on the remaining count. */
244 add %ebx, %ecx
245 add %ecx, %edx
246 add %ecx, %eax
247 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
248
249 # ifdef USE_AS_WMEMCMP
250
251 /* This label is needed only to fill the unused slots of table_64bytes. */
252 L(unreal_case):
253 /* no code here */
254
255 # endif
/* Mismatch exits of the 64-byte loop, entered with %ecx == 64. Each fall-through step subtracts 16, */
/* leaving %ecx as the offset just past the differing chunk (16, 32, 48 or 64). Both pointers are */
/* advanced by that amount, then execution falls through into the code that follows. */
256 .p2align 4
257 L(find_16diff):
258 sub $16, %ecx
259 L(find_32diff):
260 sub $16, %ecx
261 L(find_48diff):
262 sub $16, %ecx
263 L(find_64diff):
264 add %ecx, %edx
265 add %ecx, %eax
266
/* Tail handlers for 16/12/8/4 remaining bytes, also reached by fall-through from L(find_64diff). */
/* %eax/%edx point one past the bytes to compare, hence the negative offsets. The bytes are compared */
/* one dword at a time, and a mismatch goes to L(find_diff). */
267 # ifndef USE_AS_WMEMCMP
/* memcmp: both dwords are kept in registers (%ecx and %ebx). */
268 .p2align 4
269 L(16bytes):
270 mov -16(%eax), %ecx
271 mov -16(%edx), %ebx
272 cmp %ebx, %ecx
273 jne L(find_diff)
274 L(12bytes):
275 mov -12(%eax), %ecx
276 mov -12(%edx), %ebx
277 cmp %ebx, %ecx
278 jne L(find_diff)
279 L(8bytes):
280 mov -8(%eax), %ecx
281 mov -8(%edx), %ebx
282 cmp %ebx, %ecx
283 jne L(find_diff)
284 L(4bytes):
285 mov -4(%eax), %ecx
286 mov -4(%edx), %ebx
287 cmp %ebx, %ecx
/* Preset the "equal" result with mov, which leaves the flags of the cmp intact for the jne. */
288 mov $0, %eax
289 jne L(find_diff)
290 RETURN
291 # else
/* wmemcmp: compares straight against memory; only the flags are consumed at L(find_diff). */
292 .p2align 4
293 L(16bytes):
294 mov -16(%eax), %ecx
295 cmp -16(%edx), %ecx
296 jne L(find_diff)
297 L(12bytes):
298 mov -12(%eax), %ecx
299 cmp -12(%edx), %ecx
300 jne L(find_diff)
301 L(8bytes):
302 mov -8(%eax), %ecx
303 cmp -8(%edx), %ecx
304 jne L(find_diff)
305 L(4bytes):
306 mov -4(%eax), %ecx
307 cmp -4(%edx), %ecx
/* Preset the "equal" result with mov, which leaves the flags of the cmp intact for the jne. */
308 mov $0, %eax
309 jne L(find_diff)
310 RETURN
311 # endif
312
/* memcmp only: tail handlers for the byte counts 4k+1, 4k+2 and 4k+3 up to 51, built as three */
/* fall-through chains (49..5, 50..2 and 51..1). %eax/%edx point one past the end of the data. */
/* In the 16-byte steps, %ebx records the negative chunk offset so that L(less16bytes) can locate */
/* the differing dword after a failed ptest. */
313 # ifndef USE_AS_WMEMCMP
314 .p2align 4
315 L(49bytes):
316 movdqu -49(%eax), %xmm1
317 movdqu -49(%edx), %xmm2
318 mov $-49, %ebx
319 pxor %xmm1, %xmm2
320 ptest %xmm2, %xmm0
321 jnc L(less16bytes)
322 L(33bytes):
323 movdqu -33(%eax), %xmm1
324 movdqu -33(%edx), %xmm2
325 mov $-33, %ebx
326 pxor %xmm1, %xmm2
327 ptest %xmm2, %xmm0
328 jnc L(less16bytes)
329 L(17bytes):
330 mov -17(%eax), %ecx
331 mov -17(%edx), %ebx
332 cmp %ebx, %ecx
333 jne L(find_diff)
334 L(13bytes):
335 mov -13(%eax), %ecx
336 mov -13(%edx), %ebx
337 cmp %ebx, %ecx
338 jne L(find_diff)
339 L(9bytes):
340 mov -9(%eax), %ecx
341 mov -9(%edx), %ebx
342 cmp %ebx, %ecx
343 jne L(find_diff)
344 L(5bytes):
345 mov -5(%eax), %ecx
346 mov -5(%edx), %ebx
347 cmp %ebx, %ecx
348 jne L(find_diff)
/* Last single byte. mov presets the "equal" result without disturbing the flags tested by jne. */
349 movzbl -1(%eax), %ecx
350 cmp -1(%edx), %cl
351 mov $0, %eax
352 jne L(end)
353 RETURN
354
355 .p2align 4
356 L(50bytes):
357 mov $-50, %ebx
358 movdqu -50(%eax), %xmm1
359 movdqu -50(%edx), %xmm2
360 pxor %xmm1, %xmm2
361 ptest %xmm2, %xmm0
362 jnc L(less16bytes)
363 L(34bytes):
364 mov $-34, %ebx
365 movdqu -34(%eax), %xmm1
366 movdqu -34(%edx), %xmm2
367 pxor %xmm1, %xmm2
368 ptest %xmm2, %xmm0
369 jnc L(less16bytes)
370 L(18bytes):
371 mov -18(%eax), %ecx
372 mov -18(%edx), %ebx
373 cmp %ebx, %ecx
374 jne L(find_diff)
375 L(14bytes):
376 mov -14(%eax), %ecx
377 mov -14(%edx), %ebx
378 cmp %ebx, %ecx
379 jne L(find_diff)
380 L(10bytes):
381 mov -10(%eax), %ecx
382 mov -10(%edx), %ebx
383 cmp %ebx, %ecx
384 jne L(find_diff)
385 L(6bytes):
386 mov -6(%eax), %ecx
387 mov -6(%edx), %ebx
388 cmp %ebx, %ecx
389 jne L(find_diff)
390 L(2bytes):
/* Last two bytes, loaded as one word: the lower-addressed byte (%cl/%bl) is tested first, then the higher one. */
391 movzwl -2(%eax), %ecx
392 movzwl -2(%edx), %ebx
393 cmp %bl, %cl
394 jne L(end)
395 cmp %bh, %ch
396 mov $0, %eax
397 jne L(end)
398 RETURN
399
400 .p2align 4
401 L(51bytes):
402 mov $-51, %ebx
403 movdqu -51(%eax), %xmm1
404 movdqu -51(%edx), %xmm2
405 pxor %xmm1, %xmm2
406 ptest %xmm2, %xmm0
407 jnc L(less16bytes)
408 L(35bytes):
409 mov $-35, %ebx
410 movdqu -35(%eax), %xmm1
411 movdqu -35(%edx), %xmm2
412 pxor %xmm1, %xmm2
413 ptest %xmm2, %xmm0
414 jnc L(less16bytes)
415 L(19bytes):
416 movl -19(%eax), %ecx
417 movl -19(%edx), %ebx
418 cmp %ebx, %ecx
419 jne L(find_diff)
420 L(15bytes):
421 movl -15(%eax), %ecx
422 movl -15(%edx), %ebx
423 cmp %ebx, %ecx
424 jne L(find_diff)
425 L(11bytes):
426 movl -11(%eax), %ecx
427 movl -11(%edx), %ebx
428 cmp %ebx, %ecx
429 jne L(find_diff)
430 L(7bytes):
431 movl -7(%eax), %ecx
432 movl -7(%edx), %ebx
433 cmp %ebx, %ecx
434 jne L(find_diff)
435 L(3bytes):
/* Bytes -3 and -2 as one word: first the low byte, then the whole word (decided by the second byte). */
436 movzwl -3(%eax), %ecx
437 movzwl -3(%edx), %ebx
438 cmpb %bl, %cl
439 jne L(end)
440 cmp %bx, %cx
441 jne L(end)
442 L(1bytes):
/* Final byte. %eax is overwritten with the "equal" result only after the cmpb has set the flags. */
443 movzbl -1(%eax), %eax
444 cmpb -1(%edx), %al
445 mov $0, %eax
446 jne L(end)
447 RETURN
448 # endif
/* Tail handlers for 52/36/20 remaining bytes (both variants): 16-byte chunks checked with ptest, */
/* then one final dword. %ebx records the negative offset of the chunk just tested for L(less16bytes). */
449 .p2align 4
450 L(52bytes):
451 movdqu -52(%eax), %xmm1
452 movdqu -52(%edx), %xmm2
453 mov $-52, %ebx
454 pxor %xmm1, %xmm2
455 ptest %xmm2, %xmm0
456 jnc L(less16bytes)
457 L(36bytes):
458 movdqu -36(%eax), %xmm1
459 movdqu -36(%edx), %xmm2
460 mov $-36, %ebx
461 pxor %xmm1, %xmm2
462 ptest %xmm2, %xmm0
463 jnc L(less16bytes)
464 L(20bytes):
465 movdqu -20(%eax), %xmm1
466 movdqu -20(%edx), %xmm2
467 mov $-20, %ebx
468 pxor %xmm1, %xmm2
469 ptest %xmm2, %xmm0
470 jnc L(less16bytes)
471 mov -4(%eax), %ecx
472 # ifndef USE_AS_WMEMCMP
473 mov -4(%edx), %ebx
474 cmp %ebx, %ecx
475 # else
476 cmp -4(%edx), %ecx
477 # endif
/* mov (unlike xor) leaves the flags intact, so the jne still tests the dword compare above. */
478 mov $0, %eax
479 jne L(find_diff)
480 RETURN
481
/* memcmp only: tail handlers for 53/37/21, 54/38/22 and 55/39/23 remaining bytes. Each chain tests */
/* 16-byte chunks with ptest (%ebx = negative chunk offset for L(less16bytes)), then finishes the */
/* last 5, 6 or 7 bytes with a dword compare followed by byte/word compares. */
482 # ifndef USE_AS_WMEMCMP
483 .p2align 4
484 L(53bytes):
485 movdqu -53(%eax), %xmm1
486 movdqu -53(%edx), %xmm2
487 mov $-53, %ebx
488 pxor %xmm1, %xmm2
489 ptest %xmm2, %xmm0
490 jnc L(less16bytes)
491 L(37bytes):
492 mov $-37, %ebx
493 movdqu -37(%eax), %xmm1
494 movdqu -37(%edx), %xmm2
495 pxor %xmm1, %xmm2
496 ptest %xmm2, %xmm0
497 jnc L(less16bytes)
498 L(21bytes):
499 mov $-21, %ebx
500 movdqu -21(%eax), %xmm1
501 movdqu -21(%edx), %xmm2
502 pxor %xmm1, %xmm2
503 ptest %xmm2, %xmm0
504 jnc L(less16bytes)
505 mov -5(%eax), %ecx
506 mov -5(%edx), %ebx
507 cmp %ebx, %ecx
508 jne L(find_diff)
509 movzbl -1(%eax), %ecx
510 cmp -1(%edx), %cl
/* Preset the "equal" result; mov keeps the flags of the byte compare for the jne. */
511 mov $0, %eax
512 jne L(end)
513 RETURN
514
515 .p2align 4
516 L(54bytes):
517 movdqu -54(%eax), %xmm1
518 movdqu -54(%edx), %xmm2
519 mov $-54, %ebx
520 pxor %xmm1, %xmm2
521 ptest %xmm2, %xmm0
522 jnc L(less16bytes)
523 L(38bytes):
524 mov $-38, %ebx
525 movdqu -38(%eax), %xmm1
526 movdqu -38(%edx), %xmm2
527 pxor %xmm1, %xmm2
528 ptest %xmm2, %xmm0
529 jnc L(less16bytes)
530 L(22bytes):
531 mov $-22, %ebx
532 movdqu -22(%eax), %xmm1
533 movdqu -22(%edx), %xmm2
534 pxor %xmm1, %xmm2
535 ptest %xmm2, %xmm0
536 jnc L(less16bytes)
537
538 mov -6(%eax), %ecx
539 mov -6(%edx), %ebx
540 cmp %ebx, %ecx
541 jne L(find_diff)
/* Last two bytes as one word: lower-addressed byte first, then the higher one. */
542 movzwl -2(%eax), %ecx
543 movzwl -2(%edx), %ebx
544 cmp %bl, %cl
545 jne L(end)
546 cmp %bh, %ch
547 mov $0, %eax
548 jne L(end)
549 RETURN
550
551 .p2align 4
552 L(55bytes):
553 movdqu -55(%eax), %xmm1
554 movdqu -55(%edx), %xmm2
555 mov $-55, %ebx
556 pxor %xmm1, %xmm2
557 ptest %xmm2, %xmm0
558 jnc L(less16bytes)
559 L(39bytes):
560 mov $-39, %ebx
561 movdqu -39(%eax), %xmm1
562 movdqu -39(%edx), %xmm2
563 pxor %xmm1, %xmm2
564 ptest %xmm2, %xmm0
565 jnc L(less16bytes)
566 L(23bytes):
567 mov $-23, %ebx
568 movdqu -23(%eax), %xmm1
569 movdqu -23(%edx), %xmm2
570 pxor %xmm1, %xmm2
571 ptest %xmm2, %xmm0
572 jnc L(less16bytes)
573 movl -7(%eax), %ecx
574 movl -7(%edx), %ebx
575 cmp %ebx, %ecx
576 jne L(find_diff)
/* Bytes -3 and -2 as one word (low byte, then whole word), followed by the final byte at -1. */
577 movzwl -3(%eax), %ecx
578 movzwl -3(%edx), %ebx
579 cmpb %bl, %cl
580 jne L(end)
581 cmp %bx, %cx
582 jne L(end)
583 movzbl -1(%eax), %eax
584 cmpb -1(%edx), %al
585 mov $0, %eax
586 jne L(end)
587 RETURN
588 # endif
/* Tail handlers for 56/40/24 remaining bytes (both variants): 16-byte chunks checked with ptest, */
/* then two final dwords. %ebx records the negative offset of the chunk just tested for L(less16bytes). */
589 .p2align 4
590 L(56bytes):
591 movdqu -56(%eax), %xmm1
592 movdqu -56(%edx), %xmm2
593 mov $-56, %ebx
594 pxor %xmm1, %xmm2
595 ptest %xmm2, %xmm0
596 jnc L(less16bytes)
597 L(40bytes):
598 mov $-40, %ebx
599 movdqu -40(%eax), %xmm1
600 movdqu -40(%edx), %xmm2
601 pxor %xmm1, %xmm2
602 ptest %xmm2, %xmm0
603 jnc L(less16bytes)
604 L(24bytes):
605 mov $-24, %ebx
606 movdqu -24(%eax), %xmm1
607 movdqu -24(%edx), %xmm2
608 pxor %xmm1, %xmm2
609 ptest %xmm2, %xmm0
610 jnc L(less16bytes)
611
612 mov -8(%eax), %ecx
613 # ifndef USE_AS_WMEMCMP
614 mov -8(%edx), %ebx
615 cmp %ebx, %ecx
616 # else
617 cmp -8(%edx), %ecx
618 # endif
619 jne L(find_diff)
620
621 mov -4(%eax), %ecx
622 # ifndef USE_AS_WMEMCMP
623 mov -4(%edx), %ebx
624 cmp %ebx, %ecx
625 # else
626 cmp -4(%edx), %ecx
627 # endif
/* mov (unlike xor) leaves the flags intact, so the jne still tests the dword compare above. */
628 mov $0, %eax
629 jne L(find_diff)
630 RETURN
631
/* memcmp only: tail handlers for 57/41/25, 58/42/26 and 59/43/27 remaining bytes. Each chain tests */
/* 16-byte chunks with ptest (%ebx = negative chunk offset for L(less16bytes)), then finishes the */
/* last 9, 10 or 11 bytes with two dword compares followed by byte/word compares. */
632 # ifndef USE_AS_WMEMCMP
633 .p2align 4
634 L(57bytes):
635 movdqu -57(%eax), %xmm1
636 movdqu -57(%edx), %xmm2
637 mov $-57, %ebx
638 pxor %xmm1, %xmm2
639 ptest %xmm2, %xmm0
640 jnc L(less16bytes)
641 L(41bytes):
642 mov $-41, %ebx
643 movdqu -41(%eax), %xmm1
644 movdqu -41(%edx), %xmm2
645 pxor %xmm1, %xmm2
646 ptest %xmm2, %xmm0
647 jnc L(less16bytes)
648 L(25bytes):
649 mov $-25, %ebx
650 movdqu -25(%eax), %xmm1
651 movdqu -25(%edx), %xmm2
652 pxor %xmm1, %xmm2
653 ptest %xmm2, %xmm0
654 jnc L(less16bytes)
655 mov -9(%eax), %ecx
656 mov -9(%edx), %ebx
657 cmp %ebx, %ecx
658 jne L(find_diff)
659 mov -5(%eax), %ecx
660 mov -5(%edx), %ebx
661 cmp %ebx, %ecx
662 jne L(find_diff)
663 movzbl -1(%eax), %ecx
664 cmp -1(%edx), %cl
/* Preset the "equal" result; mov keeps the flags of the byte compare for the jne. */
665 mov $0, %eax
666 jne L(end)
667 RETURN
668
669 .p2align 4
670 L(58bytes):
671 movdqu -58(%eax), %xmm1
672 movdqu -58(%edx), %xmm2
673 mov $-58, %ebx
674 pxor %xmm1, %xmm2
675 ptest %xmm2, %xmm0
676 jnc L(less16bytes)
677 L(42bytes):
678 mov $-42, %ebx
679 movdqu -42(%eax), %xmm1
680 movdqu -42(%edx), %xmm2
681 pxor %xmm1, %xmm2
682 ptest %xmm2, %xmm0
683 jnc L(less16bytes)
684 L(26bytes):
685 mov $-26, %ebx
686 movdqu -26(%eax), %xmm1
687 movdqu -26(%edx), %xmm2
688 pxor %xmm1, %xmm2
689 ptest %xmm2, %xmm0
690 jnc L(less16bytes)
691
692 mov -10(%eax), %ecx
693 mov -10(%edx), %ebx
694 cmp %ebx, %ecx
695 jne L(find_diff)
696
697 mov -6(%eax), %ecx
698 mov -6(%edx), %ebx
699 cmp %ebx, %ecx
700 jne L(find_diff)
701
/* Last two bytes as one word: lower-addressed byte first, then the higher one. */
702 movzwl -2(%eax), %ecx
703 movzwl -2(%edx), %ebx
704 cmp %bl, %cl
705 jne L(end)
706 cmp %bh, %ch
707 mov $0, %eax
708 jne L(end)
709 RETURN
710
711 .p2align 4
712 L(59bytes):
713 movdqu -59(%eax), %xmm1
714 movdqu -59(%edx), %xmm2
715 mov $-59, %ebx
716 pxor %xmm1, %xmm2
717 ptest %xmm2, %xmm0
718 jnc L(less16bytes)
719 L(43bytes):
720 mov $-43, %ebx
721 movdqu -43(%eax), %xmm1
722 movdqu -43(%edx), %xmm2
723 pxor %xmm1, %xmm2
724 ptest %xmm2, %xmm0
725 jnc L(less16bytes)
726 L(27bytes):
727 mov $-27, %ebx
728 movdqu -27(%eax), %xmm1
729 movdqu -27(%edx), %xmm2
730 pxor %xmm1, %xmm2
731 ptest %xmm2, %xmm0
732 jnc L(less16bytes)
733 movl -11(%eax), %ecx
734 movl -11(%edx), %ebx
735 cmp %ebx, %ecx
736 jne L(find_diff)
737 movl -7(%eax), %ecx
738 movl -7(%edx), %ebx
739 cmp %ebx, %ecx
740 jne L(find_diff)
/* Bytes -3 and -2 as one word (low byte, then whole word), followed by the final byte at -1. */
741 movzwl -3(%eax), %ecx
742 movzwl -3(%edx), %ebx
743 cmpb %bl, %cl
744 jne L(end)
745 cmp %bx, %cx
746 jne L(end)
747 movzbl -1(%eax), %eax
748 cmpb -1(%edx), %al
749 mov $0, %eax
750 jne L(end)
751 RETURN
752 # endif
/* Tail handlers for 60/44/28 remaining bytes (both variants): 16-byte chunks checked with ptest, */
/* then three final dwords. %ebx records the negative offset of the chunk just tested for L(less16bytes). */
753 .p2align 4
754 L(60bytes):
755 movdqu -60(%eax), %xmm1
756 movdqu -60(%edx), %xmm2
757 mov $-60, %ebx
758 pxor %xmm1, %xmm2
759 ptest %xmm2, %xmm0
760 jnc L(less16bytes)
761 L(44bytes):
762 mov $-44, %ebx
763 movdqu -44(%eax), %xmm1
764 movdqu -44(%edx), %xmm2
765 pxor %xmm1, %xmm2
766 ptest %xmm2, %xmm0
767 jnc L(less16bytes)
768 L(28bytes):
769 mov $-28, %ebx
770 movdqu -28(%eax), %xmm1
771 movdqu -28(%edx), %xmm2
772 pxor %xmm1, %xmm2
773 ptest %xmm2, %xmm0
774 jnc L(less16bytes)
775
776 mov -12(%eax), %ecx
777 # ifndef USE_AS_WMEMCMP
778 mov -12(%edx), %ebx
779 cmp %ebx, %ecx
780 # else
781 cmp -12(%edx), %ecx
782 # endif
783 jne L(find_diff)
784
785 mov -8(%eax), %ecx
786 # ifndef USE_AS_WMEMCMP
787 mov -8(%edx), %ebx
788 cmp %ebx, %ecx
789 # else
790 cmp -8(%edx), %ecx
791 # endif
792 jne L(find_diff)
793
794 mov -4(%eax), %ecx
795 # ifndef USE_AS_WMEMCMP
796 mov -4(%edx), %ebx
797 cmp %ebx, %ecx
798 # else
799 cmp -4(%edx), %ecx
800 # endif
/* mov (unlike xor) leaves the flags intact, so the jne still tests the dword compare above. */
801 mov $0, %eax
802 jne L(find_diff)
803 RETURN
804
/* memcmp only: tail handlers for 61/45/29, 62/46/30 and 63/47/31 remaining bytes. Each chain tests */
/* 16-byte chunks with ptest (%ebx = negative chunk offset for L(less16bytes)), then finishes the */
/* last 13, 14 or 15 bytes with three dword compares followed by byte/word compares. */
805 # ifndef USE_AS_WMEMCMP
806 .p2align 4
807 L(61bytes):
808 movdqu -61(%eax), %xmm1
809 movdqu -61(%edx), %xmm2
810 mov $-61, %ebx
811 pxor %xmm1, %xmm2
812 ptest %xmm2, %xmm0
813 jnc L(less16bytes)
814 L(45bytes):
815 mov $-45, %ebx
816 movdqu -45(%eax), %xmm1
817 movdqu -45(%edx), %xmm2
818 pxor %xmm1, %xmm2
819 ptest %xmm2, %xmm0
820 jnc L(less16bytes)
821 L(29bytes):
822 mov $-29, %ebx
823 movdqu -29(%eax), %xmm1
824 movdqu -29(%edx), %xmm2
825 pxor %xmm1, %xmm2
826 ptest %xmm2, %xmm0
827 jnc L(less16bytes)
828
829 mov -13(%eax), %ecx
830 mov -13(%edx), %ebx
831 cmp %ebx, %ecx
832 jne L(find_diff)
833
834 mov -9(%eax), %ecx
835 mov -9(%edx), %ebx
836 cmp %ebx, %ecx
837 jne L(find_diff)
838
839 mov -5(%eax), %ecx
840 mov -5(%edx), %ebx
841 cmp %ebx, %ecx
842 jne L(find_diff)
843 movzbl -1(%eax), %ecx
844 cmp -1(%edx), %cl
/* Preset the "equal" result; mov keeps the flags of the byte compare for the jne. */
845 mov $0, %eax
846 jne L(end)
847 RETURN
848
849 .p2align 4
850 L(62bytes):
851 movdqu -62(%eax), %xmm1
852 movdqu -62(%edx), %xmm2
853 mov $-62, %ebx
854 pxor %xmm1, %xmm2
855 ptest %xmm2, %xmm0
856 jnc L(less16bytes)
857 L(46bytes):
858 mov $-46, %ebx
859 movdqu -46(%eax), %xmm1
860 movdqu -46(%edx), %xmm2
861 pxor %xmm1, %xmm2
862 ptest %xmm2, %xmm0
863 jnc L(less16bytes)
864 L(30bytes):
865 mov $-30, %ebx
866 movdqu -30(%eax), %xmm1
867 movdqu -30(%edx), %xmm2
868 pxor %xmm1, %xmm2
869 ptest %xmm2, %xmm0
870 jnc L(less16bytes)
871 mov -14(%eax), %ecx
872 mov -14(%edx), %ebx
873 cmp %ebx, %ecx
874 jne L(find_diff)
875 mov -10(%eax), %ecx
876 mov -10(%edx), %ebx
877 cmp %ebx, %ecx
878 jne L(find_diff)
879 mov -6(%eax), %ecx
880 mov -6(%edx), %ebx
881 cmp %ebx, %ecx
882 jne L(find_diff)
/* Last two bytes as one word: lower-addressed byte first, then the higher one. */
883 movzwl -2(%eax), %ecx
884 movzwl -2(%edx), %ebx
885 cmp %bl, %cl
886 jne L(end)
887 cmp %bh, %ch
888 mov $0, %eax
889 jne L(end)
890 RETURN
891
892 .p2align 4
893 L(63bytes):
894 movdqu -63(%eax), %xmm1
895 movdqu -63(%edx), %xmm2
896 mov $-63, %ebx
897 pxor %xmm1, %xmm2
898 ptest %xmm2, %xmm0
899 jnc L(less16bytes)
900 L(47bytes):
901 mov $-47, %ebx
902 movdqu -47(%eax), %xmm1
903 movdqu -47(%edx), %xmm2
904 pxor %xmm1, %xmm2
905 ptest %xmm2, %xmm0
906 jnc L(less16bytes)
907 L(31bytes):
908 mov $-31, %ebx
909 movdqu -31(%eax), %xmm1
910 movdqu -31(%edx), %xmm2
911 pxor %xmm1, %xmm2
912 ptest %xmm2, %xmm0
913 jnc L(less16bytes)
914
915 movl -15(%eax), %ecx
916 movl -15(%edx), %ebx
917 cmp %ebx, %ecx
918 jne L(find_diff)
919 movl -11(%eax), %ecx
920 movl -11(%edx), %ebx
921 cmp %ebx, %ecx
922 jne L(find_diff)
923 movl -7(%eax), %ecx
924 movl -7(%edx), %ebx
925 cmp %ebx, %ecx
926 jne L(find_diff)
/* Bytes -3 and -2 as one word (low byte, then whole word), followed by the final byte at -1. */
927 movzwl -3(%eax), %ecx
928 movzwl -3(%edx), %ebx
929 cmpb %bl, %cl
930 jne L(end)
931 cmp %bx, %cx
932 jne L(end)
933 movzbl -1(%eax), %eax
934 cmpb -1(%edx), %al
935 mov $0, %eax
936 jne L(end)
937 RETURN
938 # endif
939
/* Tail handlers for 64/48/32 remaining bytes (both variants): 16-byte chunks checked with ptest, */
/* then four final dwords. %ebx records the negative offset of the chunk just tested for L(less16bytes). */
940 .p2align 4
941 L(64bytes):
942 movdqu -64(%eax), %xmm1
943 movdqu -64(%edx), %xmm2
944 mov $-64, %ebx
945 pxor %xmm1, %xmm2
946 ptest %xmm2, %xmm0
947 jnc L(less16bytes)
948 L(48bytes):
949 movdqu -48(%eax), %xmm1
950 movdqu -48(%edx), %xmm2
951 mov $-48, %ebx
952 pxor %xmm1, %xmm2
953 ptest %xmm2, %xmm0
954 jnc L(less16bytes)
955 L(32bytes):
956 movdqu -32(%eax), %xmm1
957 movdqu -32(%edx), %xmm2
958 mov $-32, %ebx
959 pxor %xmm1, %xmm2
960 ptest %xmm2, %xmm0
961 jnc L(less16bytes)
962
963 mov -16(%eax), %ecx
964 # ifndef USE_AS_WMEMCMP
965 mov -16(%edx), %ebx
966 cmp %ebx, %ecx
967 # else
968 cmp -16(%edx), %ecx
969 # endif
970 jne L(find_diff)
971
972 mov -12(%eax), %ecx
973 # ifndef USE_AS_WMEMCMP
974 mov -12(%edx), %ebx
975 cmp %ebx, %ecx
976 # else
977 cmp -12(%edx), %ecx
978 # endif
979 jne L(find_diff)
980
981 mov -8(%eax), %ecx
982 # ifndef USE_AS_WMEMCMP
983 mov -8(%edx), %ebx
984 cmp %ebx, %ecx
985 # else
986 cmp -8(%edx), %ecx
987 # endif
988 jne L(find_diff)
989
990 mov -4(%eax), %ecx
991 # ifndef USE_AS_WMEMCMP
992 mov -4(%edx), %ebx
993 cmp %ebx, %ecx
994 # else
995 cmp -4(%edx), %ecx
996 # endif
/* mov (unlike xor) leaves the flags intact, so the jne still tests the dword compare above. */
997 mov $0, %eax
998 jne L(find_diff)
999 RETURN
1000
/* Reached when a 16-byte ptest check in a tail handler found a difference. %ebx holds the negative */
/* offset of that chunk; adding it to both pointers makes them address the start of the chunk, which */
/* is then re-scanned one dword at a time to find the first differing dword for L(find_diff). */
1001 # ifndef USE_AS_WMEMCMP
/* memcmp: both dwords are kept in registers (%ecx and %ebx). */
1002 .p2align 4
1003 L(less16bytes):
1004 add %ebx, %eax
1005 add %ebx, %edx
1006
1007 mov (%eax), %ecx
1008 mov (%edx), %ebx
1009 cmp %ebx, %ecx
1010 jne L(find_diff)
1011
1012 mov 4(%eax), %ecx
1013 mov 4(%edx), %ebx
1014 cmp %ebx, %ecx
1015 jne L(find_diff)
1016
1017 mov 8(%eax), %ecx
1018 mov 8(%edx), %ebx
1019 cmp %ebx, %ecx
1020 jne L(find_diff)
1021
1022 mov 12(%eax), %ecx
1023 mov 12(%edx), %ebx
1024 cmp %ebx, %ecx
/* mov keeps the flags of the cmp above for the jne. */
1025 mov $0, %eax
1026 jne L(find_diff)
1027 RETURN
1028 # else
/* wmemcmp: compares straight against memory; only the flags are consumed at L(find_diff). */
1029 .p2align 4
1030 L(less16bytes):
1031 add %ebx, %eax
1032 add %ebx, %edx
1033
1034 mov (%eax), %ecx
1035 cmp (%edx), %ecx
1036 jne L(find_diff)
1037
1038 mov 4(%eax), %ecx
1039 cmp 4(%edx), %ecx
1040 jne L(find_diff)
1041
1042 mov 8(%eax), %ecx
1043 cmp 8(%edx), %ecx
1044 jne L(find_diff)
1045
1046 mov 12(%eax), %ecx
1047 cmp 12(%edx), %ecx
1048
/* mov keeps the flags of the cmp above for the jne. */
1049 mov $0, %eax
1050 jne L(find_diff)
1051 RETURN
1052 # endif
1053
/* Common mismatch exit, reached with %ebx pushed. Turns a differing dword pair into the result 1 or -1. */
1054 .p2align 4
1055 L(find_diff):
1056 # ifndef USE_AS_WMEMCMP
/* memcmp: %ecx holds the dword from the first block and %ebx the dword from the second. The bytes are */
/* examined from the lowest address upward (x86 is little-endian): byte 0, then the low word (decided */
/* by byte 1), then after shifting the high halves down, byte 2 and finally byte 3. */
1057 cmpb %bl, %cl
1058 jne L(end)
1059 cmp %bx, %cx
1060 jne L(end)
1061 shr $16,%ecx
1062 shr $16,%ebx
1063 cmp %bl, %cl
1064 jne L(end)
1065 cmp %bx, %cx
/* L(end) is entered with the flags of the deciding compare still live (popl and mov keep them). */
/* Unsigned "above" means the first block is larger: return 1, otherwise -1. */
1066 L(end):
1067 POP (%ebx)
1068 mov $1, %eax
1069 ja L(bigger)
1070 neg %eax
1071 L(bigger):
1072 ret
1073 # else
/* wmemcmp: the flags come from the dword cmp executed just before the jump here; the elements are */
/* compared as signed values, so jg selects 1 and the fall-through returns -1. */
1074 POP (%ebx)
1075 mov $1, %eax
1076 jg L(bigger)
1077 neg %eax
1078 ret
1079
1080 .p2align 4
1081 L(bigger):
1082 ret
1083 # endif
1084 END (MEMCMP)
1085
/* Jump table used by BRANCH_TO_JMPTBL_ENTRY: entry N is the handler for N remaining bytes, N = 0..64. */
/* Each entry is 4 bytes and is produced by JMPTBL (target, table base). */
1086 .section .rodata.sse4.2,"a",@progbits
1087 .p2align 2
1088 .type L(table_64bytes), @object
1089 # ifndef USE_AS_WMEMCMP
/* memcmp: every byte count from 0 to 64 has its own handler. */
1090 L(table_64bytes):
1091 .int JMPTBL (L(0bytes), L(table_64bytes))
1092 .int JMPTBL (L(1bytes), L(table_64bytes))
1093 .int JMPTBL (L(2bytes), L(table_64bytes))
1094 .int JMPTBL (L(3bytes), L(table_64bytes))
1095 .int JMPTBL (L(4bytes), L(table_64bytes))
1096 .int JMPTBL (L(5bytes), L(table_64bytes))
1097 .int JMPTBL (L(6bytes), L(table_64bytes))
1098 .int JMPTBL (L(7bytes), L(table_64bytes))
1099 .int JMPTBL (L(8bytes), L(table_64bytes))
1100 .int JMPTBL (L(9bytes), L(table_64bytes))
1101 .int JMPTBL (L(10bytes), L(table_64bytes))
1102 .int JMPTBL (L(11bytes), L(table_64bytes))
1103 .int JMPTBL (L(12bytes), L(table_64bytes))
1104 .int JMPTBL (L(13bytes), L(table_64bytes))
1105 .int JMPTBL (L(14bytes), L(table_64bytes))
1106 .int JMPTBL (L(15bytes), L(table_64bytes))
1107 .int JMPTBL (L(16bytes), L(table_64bytes))
1108 .int JMPTBL (L(17bytes), L(table_64bytes))
1109 .int JMPTBL (L(18bytes), L(table_64bytes))
1110 .int JMPTBL (L(19bytes), L(table_64bytes))
1111 .int JMPTBL (L(20bytes), L(table_64bytes))
1112 .int JMPTBL (L(21bytes), L(table_64bytes))
1113 .int JMPTBL (L(22bytes), L(table_64bytes))
1114 .int JMPTBL (L(23bytes), L(table_64bytes))
1115 .int JMPTBL (L(24bytes), L(table_64bytes))
1116 .int JMPTBL (L(25bytes), L(table_64bytes))
1117 .int JMPTBL (L(26bytes), L(table_64bytes))
1118 .int JMPTBL (L(27bytes), L(table_64bytes))
1119 .int JMPTBL (L(28bytes), L(table_64bytes))
1120 .int JMPTBL (L(29bytes), L(table_64bytes))
1121 .int JMPTBL (L(30bytes), L(table_64bytes))
1122 .int JMPTBL (L(31bytes), L(table_64bytes))
1123 .int JMPTBL (L(32bytes), L(table_64bytes))
1124 .int JMPTBL (L(33bytes), L(table_64bytes))
1125 .int JMPTBL (L(34bytes), L(table_64bytes))
1126 .int JMPTBL (L(35bytes), L(table_64bytes))
1127 .int JMPTBL (L(36bytes), L(table_64bytes))
1128 .int JMPTBL (L(37bytes), L(table_64bytes))
1129 .int JMPTBL (L(38bytes), L(table_64bytes))
1130 .int JMPTBL (L(39bytes), L(table_64bytes))
1131 .int JMPTBL (L(40bytes), L(table_64bytes))
1132 .int JMPTBL (L(41bytes), L(table_64bytes))
1133 .int JMPTBL (L(42bytes), L(table_64bytes))
1134 .int JMPTBL (L(43bytes), L(table_64bytes))
1135 .int JMPTBL (L(44bytes), L(table_64bytes))
1136 .int JMPTBL (L(45bytes), L(table_64bytes))
1137 .int JMPTBL (L(46bytes), L(table_64bytes))
1138 .int JMPTBL (L(47bytes), L(table_64bytes))
1139 .int JMPTBL (L(48bytes), L(table_64bytes))
1140 .int JMPTBL (L(49bytes), L(table_64bytes))
1141 .int JMPTBL (L(50bytes), L(table_64bytes))
1142 .int JMPTBL (L(51bytes), L(table_64bytes))
1143 .int JMPTBL (L(52bytes), L(table_64bytes))
1144 .int JMPTBL (L(53bytes), L(table_64bytes))
1145 .int JMPTBL (L(54bytes), L(table_64bytes))
1146 .int JMPTBL (L(55bytes), L(table_64bytes))
1147 .int JMPTBL (L(56bytes), L(table_64bytes))
1148 .int JMPTBL (L(57bytes), L(table_64bytes))
1149 .int JMPTBL (L(58bytes), L(table_64bytes))
1150 .int JMPTBL (L(59bytes), L(table_64bytes))
1151 .int JMPTBL (L(60bytes), L(table_64bytes))
1152 .int JMPTBL (L(61bytes), L(table_64bytes))
1153 .int JMPTBL (L(62bytes), L(table_64bytes))
1154 .int JMPTBL (L(63bytes), L(table_64bytes))
1155 .int JMPTBL (L(64bytes), L(table_64bytes))
1156 # else
/* wmemcmp: the byte count is always a multiple of 4 (element count shifted left by 2 at entry), so */
/* only every fourth slot has a real handler; the rest point at the code-less L(unreal_case) label. */
1157 L(table_64bytes):
1158 .int JMPTBL (L(0bytes), L(table_64bytes))
1159 .int JMPTBL (L(unreal_case), L(table_64bytes))
1160 .int JMPTBL (L(unreal_case), L(table_64bytes))
1161 .int JMPTBL (L(unreal_case), L(table_64bytes))
1162 .int JMPTBL (L(4bytes), L(table_64bytes))
1163 .int JMPTBL (L(unreal_case), L(table_64bytes))
1164 .int JMPTBL (L(unreal_case), L(table_64bytes))
1165 .int JMPTBL (L(unreal_case), L(table_64bytes))
1166 .int JMPTBL (L(8bytes), L(table_64bytes))
1167 .int JMPTBL (L(unreal_case), L(table_64bytes))
1168 .int JMPTBL (L(unreal_case), L(table_64bytes))
1169 .int JMPTBL (L(unreal_case), L(table_64bytes))
1170 .int JMPTBL (L(12bytes), L(table_64bytes))
1171 .int JMPTBL (L(unreal_case), L(table_64bytes))
1172 .int JMPTBL (L(unreal_case), L(table_64bytes))
1173 .int JMPTBL (L(unreal_case), L(table_64bytes))
1174 .int JMPTBL (L(16bytes), L(table_64bytes))
1175 .int JMPTBL (L(unreal_case), L(table_64bytes))
1176 .int JMPTBL (L(unreal_case), L(table_64bytes))
1177 .int JMPTBL (L(unreal_case), L(table_64bytes))
1178 .int JMPTBL (L(20bytes), L(table_64bytes))
1179 .int JMPTBL (L(unreal_case), L(table_64bytes))
1180 .int JMPTBL (L(unreal_case), L(table_64bytes))
1181 .int JMPTBL (L(unreal_case), L(table_64bytes))
1182 .int JMPTBL (L(24bytes), L(table_64bytes))
1183 .int JMPTBL (L(unreal_case), L(table_64bytes))
1184 .int JMPTBL (L(unreal_case), L(table_64bytes))
1185 .int JMPTBL (L(unreal_case), L(table_64bytes))
1186 .int JMPTBL (L(28bytes), L(table_64bytes))
1187 .int JMPTBL (L(unreal_case), L(table_64bytes))
1188 .int JMPTBL (L(unreal_case), L(table_64bytes))
1189 .int JMPTBL (L(unreal_case), L(table_64bytes))
1190 .int JMPTBL (L(32bytes), L(table_64bytes))
1191 .int JMPTBL (L(unreal_case), L(table_64bytes))
1192 .int JMPTBL (L(unreal_case), L(table_64bytes))
1193 .int JMPTBL (L(unreal_case), L(table_64bytes))
1194 .int JMPTBL (L(36bytes), L(table_64bytes))
1195 .int JMPTBL (L(unreal_case), L(table_64bytes))
1196 .int JMPTBL (L(unreal_case), L(table_64bytes))
1197 .int JMPTBL (L(unreal_case), L(table_64bytes))
1198 .int JMPTBL (L(40bytes), L(table_64bytes))
1199 .int JMPTBL (L(unreal_case), L(table_64bytes))
1200 .int JMPTBL (L(unreal_case), L(table_64bytes))
1201 .int JMPTBL (L(unreal_case), L(table_64bytes))
1202 .int JMPTBL (L(44bytes), L(table_64bytes))
1203 .int JMPTBL (L(unreal_case), L(table_64bytes))
1204 .int JMPTBL (L(unreal_case), L(table_64bytes))
1205 .int JMPTBL (L(unreal_case), L(table_64bytes))
1206 .int JMPTBL (L(48bytes), L(table_64bytes))
1207 .int JMPTBL (L(unreal_case), L(table_64bytes))
1208 .int JMPTBL (L(unreal_case), L(table_64bytes))
1209 .int JMPTBL (L(unreal_case), L(table_64bytes))
1210 .int JMPTBL (L(52bytes), L(table_64bytes))
1211 .int JMPTBL (L(unreal_case), L(table_64bytes))
1212 .int JMPTBL (L(unreal_case), L(table_64bytes))
1213 .int JMPTBL (L(unreal_case), L(table_64bytes))
1214 .int JMPTBL (L(56bytes), L(table_64bytes))
1215 .int JMPTBL (L(unreal_case), L(table_64bytes))
1216 .int JMPTBL (L(unreal_case), L(table_64bytes))
1217 .int JMPTBL (L(unreal_case), L(table_64bytes))
1218 .int JMPTBL (L(60bytes), L(table_64bytes))
1219 .int JMPTBL (L(unreal_case), L(table_64bytes))
1220 .int JMPTBL (L(unreal_case), L(table_64bytes))
1221 .int JMPTBL (L(unreal_case), L(table_64bytes))
1222 .int JMPTBL (L(64bytes), L(table_64bytes))
1223 # endif
1223 # endif
1224 #endif