1 /* Optimized memcmp implementation for POWER7/PowerPC64.
2 Copyright (C) 2010-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* int [r3] memcmp (const char *s1 [r3],
25 /* TODO: change these to the actual instructions when the minimum required
26 binutils allows it. */
27 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
29 # define MEMCMP memcmp
32 ENTRY_TOCLESS (MEMCMP, 4)
36 #define rSTR1 r3 /* First string arg. */
37 #define rSTR2 r4 /* Second string arg. */
38 #define rN r5 /* Max string length. */
39 #define rWORD1 r6 /* Current word in s1. */
40 #define rWORD2 r7 /* Current word in s2. */
41 #define rWORD3 r8 /* Next word in s1. */
42 #define rWORD4 r9 /* Next word in s2. */
43 #define rWORD5 r10 /* Next word in s1. */
44 #define rWORD6 r11 /* Next word in s2. */
46 #define rOFF8 r20 /* 8 bytes offset. */
47 #define rOFF16 r21 /* 16 bytes offset. */
48 #define rOFF24 r22 /* 24 bytes offset. */
49 #define rOFF32 r23 /* 32 bytes offset. */
50 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
51 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
52 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
53 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
54 #define rSHR r28 /* Unaligned shift right count. */
55 #define rSHL r29 /* Unaligned shift left count. */
56 #define rWORD7 r30 /* Next word in s1. */
57 #define rWORD8 r31 /* Next word in s2. */
/* Negative offsets relative to the stack pointer (r1) used to save the
   non-volatile registers above (see the std/ld pairs such as
   "std rWORD8, rWORD8SAVE(r1)").  */
59 #define rWORD8SAVE (-8)
60 #define rWORD7SAVE (-16)
61 #define rOFF8SAVE (-24)
62 #define rOFF16SAVE (-32)
63 #define rOFF24SAVE (-40)
64 #define rOFF32SAVE (-48)
65 #define rSHRSAVE (-56)
66 #define rSHLSAVE (-64)
67 #define rWORD8SHIFTSAVE (-72)
68 #define rWORD2SHIFTSAVE (-80)
69 #define rWORD4SHIFTSAVE (-88)
70 #define rWORD6SHIFTSAVE (-96)
72 #ifdef __LITTLE_ENDIAN__
84 beq- cr6, L(zeroLength)
87 /* If less than 8 bytes or not aligned, use the unaligned
89 blt cr1, L(bytealigned)
91 /* At this point we know both strings have the same alignment and the
92 compare length is at least 8 bytes. r12 contains the low order
93 3 bits of rSTR1 and cr5 contains the result of the logical compare
94 of r12 to 0. If r12 == 0 then we are already double word
95 aligned and can perform the DW aligned loop. */
102 /* Try to align to QW else proceed to DW loop. */
105 /* For the difference to reach QW alignment, load as DW. */
106 clrrdi rSTR1, rSTR1, 3
107 clrrdi rSTR2, rSTR2, 3
113 sld rWORD1, rWORD1, r9
114 sld rWORD2, rWORD2, r9
115 cmpld cr6, rWORD1, rWORD2
125 cmpld cr6, rWORD1, rWORD2
128 bne cr6, L(different)
130 ble cr6, L(zeroLength)
132 /* Now both rSTR1 and rSTR2 are aligned to QW. */
140 ble cr0, L(lessthan64)
144 /* Aligned vector loop. */
150 bnl cr6, L(different3)
154 bnl cr6, L(different2)
158 bnl cr6, L(different3)
162 bnl cr6, L(different2)
163 addi rSTR1, rSTR1, 64
164 addi rSTR2, rSTR2, 64
167 bnl cr6, L(different3)
169 /* Handle remainder for aligned loop. */
179 bnl cr6, L(different1)
187 bnl cr6, L(different1)
195 bnl cr6, L(different1)
203 bnl cr6, L(different1)
206 /* Calculate and return the difference. */
210 bge cr6, L(different2)
211 /* Discard unwanted bytes. */
212 #ifdef __LITTLE_ENDIAN__
226 #ifdef __LITTLE_ENDIAN__
227 /* Reverse bytes for direct comparison. */
238 /* Difference in second DW. */
251 #ifdef __LITTLE_ENDIAN__
252 /* Reverse bytes for direct comparison. */
263 /* Difference in second DW. */
278 /* Skip unwanted bytes. */
281 srd rWORD1, rWORD1, r8
282 srd rWORD2, rWORD2, r8
283 cmpld cr6, rWORD1, rWORD2
294 /* Proceed to DW unaligned loop, if there is a chance of a page cross. */
295 rldicl r9, rSTR1, 0, 52
297 cmpldi cr0, r9, 4096-16
298 bgt cr0, L(unaligned)
299 rldicl r9, rSTR2, 0, 52
301 cmpldi cr0, r9, 4096-16
302 bgt cr0, L(unaligned)
306 /* Check if rSTR1 is aligned to QW. */
307 andi. r11, rSTR1, 0xF
310 /* Compare 16B and align S1 to QW. */
311 #ifdef __LITTLE_ENDIAN__
312 lvsr v10, 0, rSTR1 /* Compute mask. */
313 lvsr v6, 0, rSTR2 /* Compute mask. */
315 lvsl v10, 0, rSTR1 /* Compute mask. */
316 lvsl v6, 0, rSTR2 /* Compute mask. */
320 #ifdef __LITTLE_ENDIAN__
327 #ifdef __LITTLE_ENDIAN__
328 vperm v4, v9, v4, v10
330 vperm v4, v4, v9, v10
333 bnl cr6, L(different1)
335 ble cr6, L(zeroLength)
338 add rSTR1, rSTR1, r11
339 add rSTR2, rSTR2, r11
341 /* As s1 is QW aligned prepare for unaligned loop. */
344 #ifdef __LITTLE_ENDIAN__
353 ble cr0, L(lessthan64_unalign)
356 /* Unaligned vector loop. */
361 #ifdef __LITTLE_ENDIAN__
362 vperm v5, v10, v5, v6
364 vperm v5, v5, v10, v6
367 bnl cr6, L(different2)
371 #ifdef __LITTLE_ENDIAN__
372 vperm v5, v10, v5, v6
374 vperm v5, v5, v10, v6
377 bnl cr6, L(different2)
381 #ifdef __LITTLE_ENDIAN__
382 vperm v5, v10, v5, v6
384 vperm v5, v5, v10, v6
387 bnl cr6, L(different2)
391 #ifdef __LITTLE_ENDIAN__
392 vperm v5, v10, v5, v6
394 vperm v5, v5, v10, v6
397 bnl cr6, L(different2)
399 addi rSTR1, rSTR1, 64
400 addi rSTR2, rSTR2, 64
401 bdnz L(unalign_qwloop)
403 /* Handle remainder for unaligned loop. */
405 L(lessthan64_unalign):
412 #ifdef __LITTLE_ENDIAN__
413 vperm v5, v10, v5, v6
415 vperm v5, v5, v10, v6
418 bnl cr6, L(different1)
426 #ifdef __LITTLE_ENDIAN__
427 vperm v5, v10, v5, v6
429 vperm v5, v5, v10, v6
432 bnl cr6, L(different1)
440 #ifdef __LITTLE_ENDIAN__
441 vperm v5, v10, v5, v6
443 vperm v5, v5, v10, v6
446 bnl cr6, L(different1)
455 #ifdef __LITTLE_ENDIAN__
456 vperm v5, v10, v5, v6
458 vperm v5, v5, v10, v6
461 bnl cr6, L(different1)
464 /* Otherwise we know the two strings have the same alignment (but not
465 yet DW). So we force the string addresses to the next lower DW
466 boundary and special case this first DW using shift left to
467 eliminate bits preceding the first byte. Since we want to join the
468 normal (DW aligned) compare loop, starting at the second double word,
469 we need to adjust the length (rN) and special case the loop
470 versioning for the first DW. This ensures that the loop count is
471 correct and the first DW (shifted) is in the expected register pair. */
474 std rWORD8, rWORD8SAVE(r1)
475 std rWORD7, rWORD7SAVE(r1)
476 std rOFF8, rOFF8SAVE(r1)
477 std rOFF16, rOFF16SAVE(r1)
478 std rOFF24, rOFF24SAVE(r1)
479 std rOFF32, rOFF32SAVE(r1)
480 cfi_offset(rWORD8, rWORD8SAVE)
481 cfi_offset(rWORD7, rWORD7SAVE)
482 cfi_offset(rOFF8, rOFF8SAVE)
483 cfi_offset(rOFF16, rOFF16SAVE)
484 cfi_offset(rOFF24, rOFF24SAVE)
485 cfi_offset(rOFF32, rOFF32SAVE)
491 clrrdi rSTR1, rSTR1, 3
492 clrrdi rSTR2, rSTR2, 3
493 beq cr5, L(DWaligned)
496 srdi r0, rN, 5 /* Divide by 32. */
497 andi. r12, rN, 24 /* Get the DW remainder. */
508 /* Remainder is 8. */
511 sld rWORD5, rWORD1, rWORD6
512 sld rWORD6, rWORD2, rWORD6
513 cmpld cr5, rWORD5, rWORD6
515 /* Do something useful in this cycle since we have to branch anyway. */
516 LD rWORD1, rOFF8, rSTR1
517 LD rWORD2, rOFF8, rSTR2
518 cmpld cr7, rWORD1, rWORD2
520 /* Remainder is 16. */
523 sld rWORD5, rWORD1, rWORD6
524 sld rWORD6, rWORD2, rWORD6
525 cmpld cr6, rWORD5, rWORD6
527 /* Do something useful in this cycle since we have to branch anyway. */
528 LD rWORD7, rOFF8, rSTR1
529 LD rWORD8, rOFF8, rSTR2
530 cmpld cr5, rWORD7, rWORD8
532 /* Remainder is 24. */
535 sld rWORD3, rWORD1, rWORD6
536 sld rWORD4, rWORD2, rWORD6
537 cmpld cr1, rWORD3, rWORD4
539 /* Count is a multiple of 32, remainder is 0. */
543 sld rWORD1, rWORD1, rWORD6
544 sld rWORD2, rWORD2, rWORD6
545 cmpld cr7, rWORD1, rWORD2
548 /* At this point we know both strings are double word aligned and the
549 compare length is at least 8 bytes. */
552 andi. r12, rN, 24 /* Get the DW remainder. */
553 srdi r0, rN, 5 /* Divide by 32. */
561 /* Remainder is 8. */
565 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
566 (8-15 byte compare), we want to use only volatile registers. This
567 means we can avoid restoring non-volatile registers since we did not
568 change any on the early exit path. The key here is the non-early
569 exit path only cares about the condition code (cr5), not about which
570 register pair was used. */
573 cmpld cr5, rWORD5, rWORD6
575 LD rWORD1, rOFF8, rSTR1
576 LD rWORD2, rOFF8, rSTR2
577 cmpld cr7, rWORD1, rWORD2
579 LD rWORD3, rOFF16, rSTR1
580 LD rWORD4, rOFF16, rSTR2
581 cmpld cr1, rWORD3, rWORD4
582 LD rWORD5, rOFF24, rSTR1
583 LD rWORD6, rOFF24, rSTR2
584 cmpld cr6, rWORD5, rWORD6
588 LD rWORD7, rOFF32, rSTR1
589 LD rWORD8, rOFF32, rSTR2
590 addi rSTR1, rSTR1, 32
591 addi rSTR2, rSTR2, 32
593 cmpld cr5, rWORD7, rWORD8
596 ld rWORD8, rWORD8SAVE(r1)
597 ld rWORD7, rWORD7SAVE(r1)
602 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
604 ld rOFF8, rOFF8SAVE(r1)
605 ld rOFF16, rOFF16SAVE(r1)
606 ld rOFF24, rOFF24SAVE(r1)
607 ld rOFF32, rOFF32SAVE(r1)
611 /* Remainder is 16. */
617 cmpld cr6, rWORD5, rWORD6
619 LD rWORD7, rOFF8, rSTR1
620 LD rWORD8, rOFF8, rSTR2
621 cmpld cr5, rWORD7, rWORD8
623 LD rWORD1, rOFF16, rSTR1
624 LD rWORD2, rOFF16, rSTR2
625 cmpld cr7, rWORD1, rWORD2
626 LD rWORD3, rOFF24, rSTR1
627 LD rWORD4, rOFF24, rSTR2
628 cmpld cr1, rWORD3, rWORD4
636 LD rWORD3, rOFF8, rSTR1
637 LD rWORD4, rOFF8, rSTR2
638 cmpld cr1, rWORD3, rWORD4
644 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
646 ld rOFF8, rOFF8SAVE(r1)
647 ld rOFF16, rOFF16SAVE(r1)
648 ld rOFF24, rOFF24SAVE(r1)
649 ld rOFF32, rOFF32SAVE(r1)
653 /* Remainder is 24. */
659 cmpld cr1, rWORD3, rWORD4
661 LD rWORD5, rOFF8, rSTR1
662 LD rWORD6, rOFF8, rSTR2
663 cmpld cr6, rWORD5, rWORD6
665 LD rWORD7, rOFF16, rSTR1
666 LD rWORD8, rOFF16, rSTR2
667 cmpld cr5, rWORD7, rWORD8
668 LD rWORD1, rOFF24, rSTR1
669 LD rWORD2, rOFF24, rSTR2
670 cmpld cr7, rWORD1, rWORD2
671 addi rSTR1, rSTR1, 16
672 addi rSTR2, rSTR2, 16
676 /* Again we are on an early exit path (24-31 byte compare); we want to
677 only use volatile registers and avoid restoring non-volatile
681 LD rWORD1, rOFF16, rSTR1
682 LD rWORD2, rOFF16, rSTR2
683 cmpld cr7, rWORD1, rWORD2
686 addi rSTR1, rSTR1, 16
687 addi rSTR2, rSTR2, 16
689 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
692 ld rOFF8, rOFF8SAVE(r1)
693 ld rOFF16, rOFF16SAVE(r1)
694 ld rOFF24, rOFF24SAVE(r1)
695 ld rOFF32, rOFF32SAVE(r1)
699 /* Count is a multiple of 32, remainder is 0. */
705 cmpld cr7, rWORD1, rWORD2
707 LD rWORD3, rOFF8, rSTR1
708 LD rWORD4, rOFF8, rSTR2
709 cmpld cr1, rWORD3, rWORD4
710 LD rWORD5, rOFF16, rSTR1
711 LD rWORD6, rOFF16, rSTR2
712 cmpld cr6, rWORD5, rWORD6
713 LD rWORD7, rOFF24, rSTR1
714 LD rWORD8, rOFF24, rSTR2
715 addi rSTR1, rSTR1, 24
716 addi rSTR2, rSTR2, 24
717 cmpld cr5, rWORD7, rWORD8
720 bdz- L(d24) /* Adjust CTR as we start with +4. */
721 /* This is the primary loop. */
724 LD rWORD1, rOFF8, rSTR1
725 LD rWORD2, rOFF8, rSTR2
726 cmpld cr1, rWORD3, rWORD4
729 LD rWORD3, rOFF16, rSTR1
730 LD rWORD4, rOFF16, rSTR2
731 cmpld cr6, rWORD5, rWORD6
734 LD rWORD5, rOFF24, rSTR1
735 LD rWORD6, rOFF24, rSTR2
736 cmpld cr5, rWORD7, rWORD8
739 LD rWORD7, rOFF32, rSTR1
740 LD rWORD8, rOFF32, rSTR2
741 addi rSTR1, rSTR1, 32
742 addi rSTR2, rSTR2, 32
744 cmpld cr7, rWORD1, rWORD2
748 cmpld cr1, rWORD3, rWORD4
750 cmpld cr6, rWORD5, rWORD6
752 cmpld cr5, rWORD7, rWORD8
763 ld rWORD8, rWORD8SAVE(r1)
764 ld rWORD7, rWORD7SAVE(r1)
765 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
767 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
768 we are aligned it is safe to load the whole double word, and use
769 shift right double to eliminate bits beyond the compare length. */
771 LD rWORD1, rOFF8, rSTR1
772 LD rWORD2, rOFF8, rSTR2
773 srd rWORD1, rWORD1, rN
774 srd rWORD2, rWORD2, rN
775 cmpld cr7, rWORD1, rWORD2
777 ld rOFF8, rOFF8SAVE(r1)
778 ld rOFF16, rOFF16SAVE(r1)
779 ld rOFF24, rOFF24SAVE(r1)
780 ld rOFF32, rOFF32SAVE(r1)
786 ld rWORD8, rWORD8SAVE(r1)
787 ld rWORD7, rWORD7SAVE(r1)
789 ld rOFF8, rOFF8SAVE(r1)
790 ld rOFF16, rOFF16SAVE(r1)
791 ld rOFF24, rOFF24SAVE(r1)
792 ld rOFF32, rOFF32SAVE(r1)
799 ld rWORD8, rWORD8SAVE(r1)
800 ld rWORD7, rWORD7SAVE(r1)
802 ld rOFF8, rOFF8SAVE(r1)
803 ld rOFF16, rOFF16SAVE(r1)
804 ld rOFF24, rOFF24SAVE(r1)
805 ld rOFF32, rOFF32SAVE(r1)
812 ld rWORD8, rWORD8SAVE(r1)
813 ld rWORD7, rWORD7SAVE(r1)
815 ld rOFF8, rOFF8SAVE(r1)
816 ld rOFF16, rOFF16SAVE(r1)
817 ld rOFF24, rOFF24SAVE(r1)
818 ld rOFF32, rOFF32SAVE(r1)
825 ld rWORD8, rWORD8SAVE(r1)
826 ld rWORD7, rWORD7SAVE(r1)
828 ld rOFF8, rOFF8SAVE(r1)
829 ld rOFF16, rOFF16SAVE(r1)
830 ld rOFF24, rOFF24SAVE(r1)
831 ld rOFF32, rOFF32SAVE(r1)
841 /* We need to prime this loop. This loop is swing modulo scheduled
842 to avoid pipe delays. The dependent instruction latencies (load to
843 compare to conditional branch) are 2 to 3 cycles. In this loop each
844 dispatch group ends in a branch and takes 1 cycle. Effectively
845 the first iteration of the loop only serves to load operands and
846 branches based on compares are delayed until the next loop.
848 So we must precondition some registers and condition codes so that
849 we don't exit the loop early on the first iteration. */
854 cmpld cr7, rWORD1, rWORD2
858 cmpld cr1, rWORD3, rWORD4
859 lbzu rWORD5, 2(rSTR1)
860 lbzu rWORD6, 2(rSTR2)
864 lbzu rWORD1, 1(rSTR1)
865 lbzu rWORD2, 1(rSTR2)
868 cmpld cr6, rWORD5, rWORD6
871 lbzu rWORD3, 1(rSTR1)
872 lbzu rWORD4, 1(rSTR2)
875 cmpld cr7, rWORD1, rWORD2
878 lbzu rWORD5, 1(rSTR1)
879 lbzu rWORD6, 1(rSTR2)
882 cmpld cr1, rWORD3, rWORD4
885 /* We speculatively load bytes before we have tested the previous
886 bytes. But we must avoid overrunning the length (in the ctr) to
887 prevent these speculative loads from causing a segfault. In this
888 case the loop will exit early (before all the pending bytes are
889 tested). If so, we must complete the pending operations
926 sub rRTN, rWORD5, rWORD6
932 sub rRTN, rWORD3, rWORD4
936 sub rRTN, rWORD1, rWORD2
945 /* At this point we know the strings have different alignment and the
946 compare length is at least 8 bytes. r12 contains the low order
947 3 bits of rSTR1 and cr5 contains the result of the logical compare
948 of r12 to 0. If r12 == 0 then rSTR1 is double word
949 aligned and can perform the DWunaligned loop.
951 Otherwise we know that rSTR1 is not already DW aligned yet.
952 So we can force the string addresses to the next lower DW
953 boundary and special case this first DW using shift left to
954 eliminate bits preceding the first byte. Since we want to join the
955 normal (DWaligned) compare loop, starting at the second double word,
956 we need to adjust the length (rN) and special case the loop
957 versioning for the first DW. This ensures that the loop count is
958 correct and the first DW (shifted) is in the expected register pair. */
960 std rWORD8, rWORD8SAVE(r1)
961 std rWORD7, rWORD7SAVE(r1)
962 std rOFF8, rOFF8SAVE(r1)
963 std rOFF16, rOFF16SAVE(r1)
964 std rOFF24, rOFF24SAVE(r1)
965 std rOFF32, rOFF32SAVE(r1)
966 cfi_offset(rWORD8, rWORD8SAVE)
967 cfi_offset(rWORD7, rWORD7SAVE)
968 cfi_offset(rOFF8, rOFF8SAVE)
969 cfi_offset(rOFF16, rOFF16SAVE)
970 cfi_offset(rOFF24, rOFF24SAVE)
971 cfi_offset(rOFF32, rOFF32SAVE)
976 std rSHL, rSHLSAVE(r1)
977 cfi_offset(rSHL, rSHLSAVE)
978 clrldi rSHL, rSTR2, 61
979 beq cr6, L(duzeroLength)
980 std rSHR, rSHRSAVE(r1)
981 cfi_offset(rSHR, rSHRSAVE)
982 beq cr5, L(DWunaligned)
983 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
984 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
985 /* Adjust the logical start of rSTR2 to compensate for the extra bits
986 in the 1st rSTR1 DW. */
987 sub rWORD8_SHIFT, rSTR2, r12
988 /* But do not attempt to address the DW before that DW that contains
989 the actual start of rSTR2. */
990 clrrdi rSTR2, rSTR2, 3
991 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
992 /* Compute the left/right shift counts for the unaligned rSTR2,
993 compensating for the logical (DW aligned) start of rSTR1. */
994 clrldi rSHL, rWORD8_SHIFT, 61
995 clrrdi rSTR1, rSTR1, 3
996 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
998 cmpld cr5, rWORD8_SHIFT, rSTR2
1001 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1002 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1003 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1004 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1005 subfic rSHR, rSHL, 64
1006 srdi r0, rN, 5 /* Divide by 32. */
1007 andi. r12, rN, 24 /* Get the DW remainder. */
1008 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
1009 this special case those bits may be discarded anyway. Also we
1010 must avoid loading a DW where none of the bits are part of rSTR2 as
1011 this may cross a page boundary and cause a page fault. */
1015 addi rSTR2, rSTR2, 8
1016 sld rWORD8, rWORD8, rSHL
1023 srd r12, rWORD2, rSHR
1027 or rWORD8, r12, rWORD8
1031 /* Remainder is 8. */
1034 sld rWORD8_SHIFT, rWORD2, rSHL
1035 sld rWORD7, rWORD1, rWORD6
1036 sld rWORD8, rWORD8, rWORD6
1038 /* At this point we exit early with the first double word compare
1039 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1040 how we handle the remaining bytes. */
1041 cmpld cr5, rWORD7, rWORD8
1048 LD rWORD2, rOFF8, rSTR2
1049 srd r0, rWORD2, rSHR
1051 /* Remainder is 16. */
1054 sld rWORD6_SHIFT, rWORD2, rSHL
1055 sld rWORD5, rWORD1, rWORD6
1056 sld rWORD6, rWORD8, rWORD6
1058 /* Remainder is 24. */
1061 sld rWORD4_SHIFT, rWORD2, rSHL
1062 sld rWORD3, rWORD1, rWORD6
1063 sld rWORD4, rWORD8, rWORD6
1065 /* Count is a multiple of 32, remainder is 0. */
1069 or rWORD8, r12, rWORD8
1070 sld rWORD2_SHIFT, rWORD2, rSHL
1071 sld rWORD1, rWORD1, rWORD6
1072 sld rWORD2, rWORD8, rWORD6
1075 /* At this point we know rSTR1 is double word aligned and the
1076 compare length is at least 8 bytes. */
1079 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1080 clrrdi rSTR2, rSTR2, 3
1081 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1082 srdi r0, rN, 5 /* Divide by 32. */
1083 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1084 andi. r12, rN, 24 /* Get the DW remainder. */
1085 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1086 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
1087 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1088 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1089 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1092 LD rWORD8, rOFF8, rSTR2
1093 addi rSTR2, rSTR2, 8
1097 subfic rSHR, rSHL, 64
1098 sld rWORD6_SHIFT, rWORD6, rSHL
1104 /* Remainder is 8. */
1107 srd r12, rWORD8, rSHR
1109 sld rWORD8_SHIFT, rWORD8, rSHL
1110 or rWORD8, r12, rWORD6_SHIFT
1113 LD rWORD1, rOFF8, rSTR1
1114 LD rWORD2, rOFF8, rSTR2
1115 cmpld cr5, rWORD7, rWORD8
1116 srd r0, rWORD2, rSHR
1117 sld rWORD2_SHIFT, rWORD2, rSHL
1118 or rWORD2, r0, rWORD8_SHIFT
1119 LD rWORD3, rOFF16, rSTR1
1120 LD rWORD4, rOFF16, rSTR2
1121 cmpld cr7, rWORD1, rWORD2
1122 srd r12, rWORD4, rSHR
1123 sld rWORD4_SHIFT, rWORD4, rSHL
1125 or rWORD4, r12, rWORD2_SHIFT
1126 LD rWORD5, rOFF24, rSTR1
1127 LD rWORD6, rOFF24, rSTR2
1128 cmpld cr1, rWORD3, rWORD4
1129 srd r0, rWORD6, rSHR
1130 sld rWORD6_SHIFT, rWORD6, rSHL
1132 or rWORD6, r0, rWORD4_SHIFT
1133 cmpld cr6, rWORD5, rWORD6
1136 /* At this point we exit early with the first double word compare
1137 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1138 how we handle the remaining bytes. */
1140 cmpld cr5, rWORD7, rWORD8
1147 LD rWORD2, rOFF8, rSTR2
1148 srd r0, rWORD2, rSHR
1150 /* Remainder is 16. */
1153 srd r0, rWORD8, rSHR
1155 or rWORD6, r0, rWORD6_SHIFT
1156 sld rWORD6_SHIFT, rWORD8, rSHL
1158 LD rWORD7, rOFF8, rSTR1
1159 LD rWORD8, rOFF8, rSTR2
1160 cmpld cr6, rWORD5, rWORD6
1161 srd r12, rWORD8, rSHR
1162 sld rWORD8_SHIFT, rWORD8, rSHL
1163 or rWORD8, r12, rWORD6_SHIFT
1165 LD rWORD1, rOFF16, rSTR1
1166 LD rWORD2, rOFF16, rSTR2
1167 cmpld cr5, rWORD7, rWORD8
1169 srd r0, rWORD2, rSHR
1170 sld rWORD2_SHIFT, rWORD2, rSHL
1171 or rWORD2, r0, rWORD8_SHIFT
1172 LD rWORD3, rOFF24, rSTR1
1173 LD rWORD4, rOFF24, rSTR2
1174 cmpld cr7, rWORD1, rWORD2
1176 srd r12, rWORD4, rSHR
1177 sld rWORD4_SHIFT, rWORD4, rSHL
1178 or rWORD4, r12, rWORD2_SHIFT
1179 addi rSTR1, rSTR1, 8
1180 addi rSTR2, rSTR2, 8
1181 cmpld cr1, rWORD3, rWORD4
1185 cmpld cr5, rWORD7, rWORD8
1186 addi rSTR1, rSTR1, 8
1187 addi rSTR2, rSTR2, 8
1195 LD rWORD2, rOFF8, rSTR2
1196 srd r0, rWORD2, rSHR
1199 /* Remainder is 24. */
1202 srd r12, rWORD8, rSHR
1204 sld rWORD4_SHIFT, rWORD8, rSHL
1205 or rWORD4, r12, rWORD6_SHIFT
1207 LD rWORD5, rOFF8, rSTR1
1208 LD rWORD6, rOFF8, rSTR2
1209 cmpld cr1, rWORD3, rWORD4
1210 srd r0, rWORD6, rSHR
1211 sld rWORD6_SHIFT, rWORD6, rSHL
1212 or rWORD6, r0, rWORD4_SHIFT
1213 LD rWORD7, rOFF16, rSTR1
1214 LD rWORD8, rOFF16, rSTR2
1215 cmpld cr6, rWORD5, rWORD6
1217 srd r12, rWORD8, rSHR
1218 sld rWORD8_SHIFT, rWORD8, rSHL
1219 or rWORD8, r12, rWORD6_SHIFT
1221 LD rWORD1, rOFF24, rSTR1
1222 LD rWORD2, rOFF24, rSTR2
1223 cmpld cr5, rWORD7, rWORD8
1225 srd r0, rWORD2, rSHR
1226 sld rWORD2_SHIFT, rWORD2, rSHL
1227 or rWORD2, r0, rWORD8_SHIFT
1228 addi rSTR1, rSTR1, 16
1229 addi rSTR2, rSTR2, 16
1230 cmpld cr7, rWORD1, rWORD2
1234 addi rSTR1, rSTR1, 16
1235 addi rSTR2, rSTR2, 16
1236 cmpld cr5, rWORD7, rWORD8
1244 LD rWORD2, rOFF8, rSTR2
1245 srd r0, rWORD2, rSHR
1248 /* Count is a multiple of 32, remainder is 0. */
1252 srd r0, rWORD8, rSHR
1254 sld rWORD2_SHIFT, rWORD8, rSHL
1255 or rWORD2, r0, rWORD6_SHIFT
1257 LD rWORD3, rOFF8, rSTR1
1258 LD rWORD4, rOFF8, rSTR2
1259 cmpld cr7, rWORD1, rWORD2
1260 srd r12, rWORD4, rSHR
1261 sld rWORD4_SHIFT, rWORD4, rSHL
1262 or rWORD4, r12, rWORD2_SHIFT
1263 LD rWORD5, rOFF16, rSTR1
1264 LD rWORD6, rOFF16, rSTR2
1265 cmpld cr1, rWORD3, rWORD4
1267 srd r0, rWORD6, rSHR
1268 sld rWORD6_SHIFT, rWORD6, rSHL
1269 or rWORD6, r0, rWORD4_SHIFT
1270 LD rWORD7, rOFF24, rSTR1
1271 LD rWORD8, rOFF24, rSTR2
1272 addi rSTR1, rSTR1, 24
1273 addi rSTR2, rSTR2, 24
1274 cmpld cr6, rWORD5, rWORD6
1276 srd r12, rWORD8, rSHR
1277 sld rWORD8_SHIFT, rWORD8, rSHL
1278 or rWORD8, r12, rWORD6_SHIFT
1279 cmpld cr5, rWORD7, rWORD8
1280 bdz L(du24) /* Adjust CTR as we start with +4. */
1281 /* This is the primary loop. */
1284 LD rWORD1, rOFF8, rSTR1
1285 LD rWORD2, rOFF8, rSTR2
1286 cmpld cr1, rWORD3, rWORD4
1288 srd r0, rWORD2, rSHR
1289 sld rWORD2_SHIFT, rWORD2, rSHL
1290 or rWORD2, r0, rWORD8_SHIFT
1292 LD rWORD3, rOFF16, rSTR1
1293 LD rWORD4, rOFF16, rSTR2
1294 cmpld cr6, rWORD5, rWORD6
1296 srd r12, rWORD4, rSHR
1297 sld rWORD4_SHIFT, rWORD4, rSHL
1298 or rWORD4, r12, rWORD2_SHIFT
1300 LD rWORD5, rOFF24, rSTR1
1301 LD rWORD6, rOFF24, rSTR2
1302 cmpld cr5, rWORD7, rWORD8
1304 srd r0, rWORD6, rSHR
1305 sld rWORD6_SHIFT, rWORD6, rSHL
1306 or rWORD6, r0, rWORD4_SHIFT
1308 LD rWORD7, rOFF32, rSTR1
1309 LD rWORD8, rOFF32, rSTR2
1310 addi rSTR1, rSTR1, 32
1311 addi rSTR2, rSTR2, 32
1312 cmpld cr7, rWORD1, rWORD2
1314 srd r12, rWORD8, rSHR
1315 sld rWORD8_SHIFT, rWORD8, rSHL
1316 or rWORD8, r12, rWORD6_SHIFT
1320 cmpld cr1, rWORD3, rWORD4
1322 cmpld cr6, rWORD5, rWORD6
1324 cmpld cr5, rWORD7, rWORD8
1334 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1335 shift right double to eliminate bits beyond the compare length.
1337 However it may not be safe to load rWORD2 which may be beyond the
1338 string length. So we compare the bit length of the remainder to
1339 the right shift count (rSHR). If the bit count is less than or equal
1340 we do not need to load rWORD2 (all significant bits are already in
1346 LD rWORD2, rOFF8, rSTR2
1347 srd r0, rWORD2, rSHR
1350 LD rWORD1, rOFF8, rSTR1
1352 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1353 or rWORD2, r0, rWORD8_SHIFT
1354 ld rWORD7, rWORD7SAVE(r1)
1355 ld rSHL, rSHLSAVE(r1)
1356 srd rWORD1, rWORD1, rN
1357 srd rWORD2, rWORD2, rN
1358 ld rSHR, rSHRSAVE(r1)
1359 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1361 cmpld cr7, rWORD1, rWORD2
1362 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1363 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1364 beq cr7, L(dureturn24)
1366 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1367 ld rOFF8, rOFF8SAVE(r1)
1368 ld rOFF16, rOFF16SAVE(r1)
1369 ld rOFF24, rOFF24SAVE(r1)
1370 ld rOFF32, rOFF32SAVE(r1)
1376 ld rWORD8, rWORD8SAVE(r1)
1377 ld rWORD7, rWORD7SAVE(r1)
1379 bgt cr7, L(dureturn29)
1380 ld rSHL, rSHLSAVE(r1)
1381 ld rSHR, rSHRSAVE(r1)
1386 ld rWORD8, rWORD8SAVE(r1)
1387 ld rWORD7, rWORD7SAVE(r1)
1389 bgt cr1, L(dureturn29)
1390 ld rSHL, rSHLSAVE(r1)
1391 ld rSHR, rSHRSAVE(r1)
1396 ld rWORD8, rWORD8SAVE(r1)
1397 ld rWORD7, rWORD7SAVE(r1)
1399 bgt cr6, L(dureturn29)
1400 ld rSHL, rSHLSAVE(r1)
1401 ld rSHR, rSHRSAVE(r1)
1406 ld rWORD8, rWORD8SAVE(r1)
1407 ld rWORD7, rWORD7SAVE(r1)
1409 bgt cr5, L(dureturn29)
1410 ld rSHL, rSHLSAVE(r1)
1411 ld rSHR, rSHRSAVE(r1)
1420 ld rWORD8, rWORD8SAVE(r1)
1421 ld rWORD7, rWORD7SAVE(r1)
1423 ld rSHL, rSHLSAVE(r1)
1424 ld rSHR, rSHRSAVE(r1)
1426 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1427 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1428 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1430 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1431 ld rOFF8, rOFF8SAVE(r1)
1432 ld rOFF16, rOFF16SAVE(r1)
1433 ld rOFF24, rOFF24SAVE(r1)
1434 ld rOFF32, rOFF32SAVE(r1)
1438 ld rOFF8, rOFF8SAVE(r1)
1439 ld rOFF16, rOFF16SAVE(r1)
1440 ld rOFF24, rOFF24SAVE(r1)
1441 ld rOFF32, rOFF32SAVE(r1)
1446 libc_hidden_builtin_def (memcmp)
1447 weak_alias (memcmp, bcmp)