1 /* Optimized memcmp implementation for POWER7/PowerPC64.
2 Copyright (C) 2010-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
21 /* int [r3] memcmp (const char *s1 [r3],
26 # define MEMCMP memcmp
29 ENTRY_TOCLESS (MEMCMP, 4)
33 #define rSTR1 r3 /* First string arg. */
34 #define rSTR2 r4 /* Second string arg. */
35 #define rN r5 /* Max string length. */
36 #define rWORD1 r6 /* Current word in s1. */
37 #define rWORD2 r7 /* Current word in s2. */
38 #define rWORD3 r8 /* Next word in s1. */
39 #define rWORD4 r9 /* Next word in s2. */
40 #define rWORD5 r10 /* Next word in s1. */
41 #define rWORD6 r11 /* Next word in s2. */
43 #define rOFF8 r20 /* 8 bytes offset. */
44 #define rOFF16 r21 /* 16 bytes offset. */
45 #define rOFF24 r22 /* 24 bytes offset. */
46 #define rOFF32 r23 /* 32 bytes offset. */
47 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
48 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
49 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
50 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
51 #define rSHR r28 /* Unaligned shift right count. */
52 #define rSHL r29 /* Unaligned shift left count. */
53 #define rWORD7 r30 /* Next word in s1. */
54 #define rWORD8 r31 /* Next word in s2. */
56 #define rWORD8SAVE (-8)
57 #define rWORD7SAVE (-16)
58 #define rOFF8SAVE (-24)
59 #define rOFF16SAVE (-32)
60 #define rOFF24SAVE (-40)
61 #define rOFF32SAVE (-48)
62 #define rSHRSAVE (-56)
63 #define rSHLSAVE (-64)
64 #define rWORD8SHIFTSAVE (-72)
65 #define rWORD2SHIFTSAVE (-80)
66 #define rWORD4SHIFTSAVE (-88)
67 #define rWORD6SHIFTSAVE (-96)
69 #ifdef __LITTLE_ENDIAN__
81 beq- cr6, L(zeroLength)
84 /* If less than 8 bytes or not aligned, use the unaligned
86 blt cr1, L(bytealigned)
88 /* At this point we know both strings have the same alignment and the
89 compare length is at least 8 bytes. r12 contains the low order
90 3 bits of rSTR1 and cr5 contains the result of the logical compare
91 of r12 to 0. If r12 == 0 then we are already double word
92 aligned and can perform the DW aligned loop. */
99 /* Try to align to QW else proceed to DW loop. */
102 /* For the difference to reach QW alignment, load as DW. */
103 clrrdi rSTR1, rSTR1, 3
104 clrrdi rSTR2, rSTR2, 3
110 sld rWORD1, rWORD1, r9
111 sld rWORD2, rWORD2, r9
112 cmpld cr6, rWORD1, rWORD2
122 cmpld cr6, rWORD1, rWORD2
125 bne cr6, L(different)
127 ble cr6, L(zeroLength)
129 /* Now both rSTR1 and rSTR2 are aligned to QW. */
137 ble cr0, L(lessthan64)
141 /* Aligned vector loop. */
147 bnl cr6, L(different3)
151 bnl cr6, L(different2)
155 bnl cr6, L(different3)
159 bnl cr6, L(different2)
160 addi rSTR1, rSTR1, 64
161 addi rSTR2, rSTR2, 64
164 bnl cr6, L(different3)
166 /* Handle remainder for aligned loop. */
176 bnl cr6, L(different1)
184 bnl cr6, L(different1)
192 bnl cr6, L(different1)
200 bnl cr6, L(different1)
203 /* Calculate and return the difference. */
207 bge cr6, L(different2)
208 /* Discard unwanted bytes. */
209 #ifdef __LITTLE_ENDIAN__
223 #ifdef __LITTLE_ENDIAN__
224 /* Reverse bytes for direct comparison. */
235 /* Difference in second DW. */
248 #ifdef __LITTLE_ENDIAN__
249 /* Reverse bytes for direct comparison. */
260 /* Difference in second DW. */
275 /* Skip unwanted bytes. */
278 srd rWORD1, rWORD1, r8
279 srd rWORD2, rWORD2, r8
280 cmpld cr6, rWORD1, rWORD2
292 /* Proceed to DW unaligned loop, if there is a chance of a page cross. */
292 rldicl r9, rSTR1, 0, 52
294 cmpldi cr0, r9, 4096-16
295 bgt cr0, L(unaligned)
296 rldicl r9, rSTR2, 0, 52
298 cmpldi cr0, r9, 4096-16
299 bgt cr0, L(unaligned)
303 /* Check if rSTR1 is aligned to QW. */
304 andi. r11, rSTR1, 0xF
307 /* Compare 16B and align S1 to QW. */
308 #ifdef __LITTLE_ENDIAN__
309 lvsr v10, 0, rSTR1 /* Compute mask. */
310 lvsr v6, 0, rSTR2 /* Compute mask. */
312 lvsl v10, 0, rSTR1 /* Compute mask. */
313 lvsl v6, 0, rSTR2 /* Compute mask. */
317 #ifdef __LITTLE_ENDIAN__
324 #ifdef __LITTLE_ENDIAN__
325 vperm v4, v9, v4, v10
327 vperm v4, v4, v9, v10
330 bnl cr6, L(different1)
332 ble cr6, L(zeroLength)
335 add rSTR1, rSTR1, r11
336 add rSTR2, rSTR2, r11
338 /* As s1 is QW aligned prepare for unaligned loop. */
341 #ifdef __LITTLE_ENDIAN__
350 ble cr0, L(lessthan64_unalign)
353 /* Unaligned vector loop. */
358 #ifdef __LITTLE_ENDIAN__
359 vperm v5, v10, v5, v6
361 vperm v5, v5, v10, v6
364 bnl cr6, L(different2)
368 #ifdef __LITTLE_ENDIAN__
369 vperm v5, v10, v5, v6
371 vperm v5, v5, v10, v6
374 bnl cr6, L(different2)
378 #ifdef __LITTLE_ENDIAN__
379 vperm v5, v10, v5, v6
381 vperm v5, v5, v10, v6
384 bnl cr6, L(different2)
388 #ifdef __LITTLE_ENDIAN__
389 vperm v5, v10, v5, v6
391 vperm v5, v5, v10, v6
394 bnl cr6, L(different2)
396 addi rSTR1, rSTR1, 64
397 addi rSTR2, rSTR2, 64
398 bdnz L(unalign_qwloop)
400 /* Handle remainder for unaligned loop. */
402 L(lessthan64_unalign):
409 #ifdef __LITTLE_ENDIAN__
410 vperm v5, v10, v5, v6
412 vperm v5, v5, v10, v6
415 bnl cr6, L(different1)
423 #ifdef __LITTLE_ENDIAN__
424 vperm v5, v10, v5, v6
426 vperm v5, v5, v10, v6
429 bnl cr6, L(different1)
437 #ifdef __LITTLE_ENDIAN__
438 vperm v5, v10, v5, v6
440 vperm v5, v5, v10, v6
443 bnl cr6, L(different1)
452 #ifdef __LITTLE_ENDIAN__
453 vperm v5, v10, v5, v6
455 vperm v5, v5, v10, v6
458 bnl cr6, L(different1)
461 /* Otherwise we know the two strings have the same alignment (but not
462 yet DW). So we force the string addresses to the next lower DW
463 boundary and special case this first DW using shift left to
464 eliminate bits preceding the first byte. Since we want to join the
465 normal (DW aligned) compare loop, starting at the second double word,
466 we need to adjust the length (rN) and special case the loop
467 versioning for the first DW. This ensures that the loop count is
468 correct and the first DW (shifted) is in the expected register pair. */
471 std rWORD8, rWORD8SAVE(r1)
472 std rWORD7, rWORD7SAVE(r1)
473 std rOFF8, rOFF8SAVE(r1)
474 std rOFF16, rOFF16SAVE(r1)
475 std rOFF24, rOFF24SAVE(r1)
476 std rOFF32, rOFF32SAVE(r1)
477 cfi_offset(rWORD8, rWORD8SAVE)
478 cfi_offset(rWORD7, rWORD7SAVE)
479 cfi_offset(rOFF8, rOFF8SAVE)
480 cfi_offset(rOFF16, rOFF16SAVE)
481 cfi_offset(rOFF24, rOFF24SAVE)
482 cfi_offset(rOFF32, rOFF32SAVE)
488 clrrdi rSTR1, rSTR1, 3
489 clrrdi rSTR2, rSTR2, 3
490 beq cr5, L(DWaligned)
493 srdi r0, rN, 5 /* Divide by 32. */
494 andi. r12, rN, 24 /* Get the DW remainder. */
505 /* Remainder is 8. */
508 sld rWORD5, rWORD1, rWORD6
509 sld rWORD6, rWORD2, rWORD6
510 cmpld cr5, rWORD5, rWORD6
512 /* Do something useful in this cycle since we have to branch anyway. */
513 LD rWORD1, rOFF8, rSTR1
514 LD rWORD2, rOFF8, rSTR2
515 cmpld cr7, rWORD1, rWORD2
517 /* Remainder is 16. */
520 sld rWORD5, rWORD1, rWORD6
521 sld rWORD6, rWORD2, rWORD6
522 cmpld cr6, rWORD5, rWORD6
524 /* Do something useful in this cycle since we have to branch anyway. */
525 LD rWORD7, rOFF8, rSTR1
526 LD rWORD8, rOFF8, rSTR2
527 cmpld cr5, rWORD7, rWORD8
529 /* Remainder is 24. */
532 sld rWORD3, rWORD1, rWORD6
533 sld rWORD4, rWORD2, rWORD6
534 cmpld cr1, rWORD3, rWORD4
536 /* Count is a multiple of 32, remainder is 0. */
540 sld rWORD1, rWORD1, rWORD6
541 sld rWORD2, rWORD2, rWORD6
542 cmpld cr7, rWORD1, rWORD2
545 /* At this point we know both strings are double word aligned and the
546 compare length is at least 8 bytes. */
549 andi. r12, rN, 24 /* Get the DW remainder. */
550 srdi r0, rN, 5 /* Divide by 32. */
558 /* Remainder is 8. */
562 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
563 (8-15 byte compare), we want to use only volatile registers. This
564 means we can avoid restoring non-volatile registers since we did not
565 change any on the early exit path. The key here is the non-early
566 exit path only cares about the condition code (cr5), not about which
567 register pair was used. */
570 cmpld cr5, rWORD5, rWORD6
572 LD rWORD1, rOFF8, rSTR1
573 LD rWORD2, rOFF8, rSTR2
574 cmpld cr7, rWORD1, rWORD2
576 LD rWORD3, rOFF16, rSTR1
577 LD rWORD4, rOFF16, rSTR2
578 cmpld cr1, rWORD3, rWORD4
579 LD rWORD5, rOFF24, rSTR1
580 LD rWORD6, rOFF24, rSTR2
581 cmpld cr6, rWORD5, rWORD6
585 LD rWORD7, rOFF32, rSTR1
586 LD rWORD8, rOFF32, rSTR2
587 addi rSTR1, rSTR1, 32
588 addi rSTR2, rSTR2, 32
590 cmpld cr5, rWORD7, rWORD8
593 ld rWORD8, rWORD8SAVE(r1)
594 ld rWORD7, rWORD7SAVE(r1)
599 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
601 ld rOFF8, rOFF8SAVE(r1)
602 ld rOFF16, rOFF16SAVE(r1)
603 ld rOFF24, rOFF24SAVE(r1)
604 ld rOFF32, rOFF32SAVE(r1)
608 /* Remainder is 16. */
614 cmpld cr6, rWORD5, rWORD6
616 LD rWORD7, rOFF8, rSTR1
617 LD rWORD8, rOFF8, rSTR2
618 cmpld cr5, rWORD7, rWORD8
620 LD rWORD1, rOFF16, rSTR1
621 LD rWORD2, rOFF16, rSTR2
622 cmpld cr7, rWORD1, rWORD2
623 LD rWORD3, rOFF24, rSTR1
624 LD rWORD4, rOFF24, rSTR2
625 cmpld cr1, rWORD3, rWORD4
633 LD rWORD3, rOFF8, rSTR1
634 LD rWORD4, rOFF8, rSTR2
635 cmpld cr1, rWORD3, rWORD4
641 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
643 ld rOFF8, rOFF8SAVE(r1)
644 ld rOFF16, rOFF16SAVE(r1)
645 ld rOFF24, rOFF24SAVE(r1)
646 ld rOFF32, rOFF32SAVE(r1)
650 /* Remainder is 24. */
656 cmpld cr1, rWORD3, rWORD4
658 LD rWORD5, rOFF8, rSTR1
659 LD rWORD6, rOFF8, rSTR2
660 cmpld cr6, rWORD5, rWORD6
662 LD rWORD7, rOFF16, rSTR1
663 LD rWORD8, rOFF16, rSTR2
664 cmpld cr5, rWORD7, rWORD8
665 LD rWORD1, rOFF24, rSTR1
666 LD rWORD2, rOFF24, rSTR2
667 cmpld cr7, rWORD1, rWORD2
668 addi rSTR1, rSTR1, 16
669 addi rSTR2, rSTR2, 16
673 /* Again we are on an early exit path (24-31 byte compare), we want to
674 only use volatile registers and avoid restoring non-volatile
678 LD rWORD1, rOFF16, rSTR1
679 LD rWORD2, rOFF16, rSTR2
680 cmpld cr7, rWORD1, rWORD2
683 addi rSTR1, rSTR1, 16
684 addi rSTR2, rSTR2, 16
686 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
689 ld rOFF8, rOFF8SAVE(r1)
690 ld rOFF16, rOFF16SAVE(r1)
691 ld rOFF24, rOFF24SAVE(r1)
692 ld rOFF32, rOFF32SAVE(r1)
696 /* Count is a multiple of 32, remainder is 0. */
702 cmpld cr7, rWORD1, rWORD2
704 LD rWORD3, rOFF8, rSTR1
705 LD rWORD4, rOFF8, rSTR2
706 cmpld cr1, rWORD3, rWORD4
707 LD rWORD5, rOFF16, rSTR1
708 LD rWORD6, rOFF16, rSTR2
709 cmpld cr6, rWORD5, rWORD6
710 LD rWORD7, rOFF24, rSTR1
711 LD rWORD8, rOFF24, rSTR2
712 addi rSTR1, rSTR1, 24
713 addi rSTR2, rSTR2, 24
714 cmpld cr5, rWORD7, rWORD8
717 bdz- L(d24) /* Adjust CTR as we start with +4. */
718 /* This is the primary loop. */
721 LD rWORD1, rOFF8, rSTR1
722 LD rWORD2, rOFF8, rSTR2
723 cmpld cr1, rWORD3, rWORD4
726 LD rWORD3, rOFF16, rSTR1
727 LD rWORD4, rOFF16, rSTR2
728 cmpld cr6, rWORD5, rWORD6
731 LD rWORD5, rOFF24, rSTR1
732 LD rWORD6, rOFF24, rSTR2
733 cmpld cr5, rWORD7, rWORD8
736 LD rWORD7, rOFF32, rSTR1
737 LD rWORD8, rOFF32, rSTR2
738 addi rSTR1, rSTR1, 32
739 addi rSTR2, rSTR2, 32
741 cmpld cr7, rWORD1, rWORD2
745 cmpld cr1, rWORD3, rWORD4
747 cmpld cr6, rWORD5, rWORD6
749 cmpld cr5, rWORD7, rWORD8
760 ld rWORD8, rWORD8SAVE(r1)
761 ld rWORD7, rWORD7SAVE(r1)
762 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
764 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
765 we are aligned it is safe to load the whole double word, and use
766 shift right double to eliminate bits beyond the compare length. */
768 LD rWORD1, rOFF8, rSTR1
769 LD rWORD2, rOFF8, rSTR2
770 srd rWORD1, rWORD1, rN
771 srd rWORD2, rWORD2, rN
772 cmpld cr7, rWORD1, rWORD2
774 ld rOFF8, rOFF8SAVE(r1)
775 ld rOFF16, rOFF16SAVE(r1)
776 ld rOFF24, rOFF24SAVE(r1)
777 ld rOFF32, rOFF32SAVE(r1)
783 ld rWORD8, rWORD8SAVE(r1)
784 ld rWORD7, rWORD7SAVE(r1)
786 ld rOFF8, rOFF8SAVE(r1)
787 ld rOFF16, rOFF16SAVE(r1)
788 ld rOFF24, rOFF24SAVE(r1)
789 ld rOFF32, rOFF32SAVE(r1)
796 ld rWORD8, rWORD8SAVE(r1)
797 ld rWORD7, rWORD7SAVE(r1)
799 ld rOFF8, rOFF8SAVE(r1)
800 ld rOFF16, rOFF16SAVE(r1)
801 ld rOFF24, rOFF24SAVE(r1)
802 ld rOFF32, rOFF32SAVE(r1)
809 ld rWORD8, rWORD8SAVE(r1)
810 ld rWORD7, rWORD7SAVE(r1)
812 ld rOFF8, rOFF8SAVE(r1)
813 ld rOFF16, rOFF16SAVE(r1)
814 ld rOFF24, rOFF24SAVE(r1)
815 ld rOFF32, rOFF32SAVE(r1)
822 ld rWORD8, rWORD8SAVE(r1)
823 ld rWORD7, rWORD7SAVE(r1)
825 ld rOFF8, rOFF8SAVE(r1)
826 ld rOFF16, rOFF16SAVE(r1)
827 ld rOFF24, rOFF24SAVE(r1)
828 ld rOFF32, rOFF32SAVE(r1)
838 /* We need to prime this loop. This loop is swing modulo scheduled
839 to avoid pipe delays. The dependent instruction latencies (load to
840 compare to conditional branch) is 2 to 3 cycles. In this loop each
841 dispatch group ends in a branch and takes 1 cycle. Effectively
842 the first iteration of the loop only serves to load operands and
843 branches based on compares are delayed until the next loop.
845 So we must precondition some registers and condition codes so that
846 we don't exit the loop early on the first iteration. */
851 cmpld cr7, rWORD1, rWORD2
855 cmpld cr1, rWORD3, rWORD4
856 lbzu rWORD5, 2(rSTR1)
857 lbzu rWORD6, 2(rSTR2)
861 lbzu rWORD1, 1(rSTR1)
862 lbzu rWORD2, 1(rSTR2)
865 cmpld cr6, rWORD5, rWORD6
868 lbzu rWORD3, 1(rSTR1)
869 lbzu rWORD4, 1(rSTR2)
872 cmpld cr7, rWORD1, rWORD2
875 lbzu rWORD5, 1(rSTR1)
876 lbzu rWORD6, 1(rSTR2)
879 cmpld cr1, rWORD3, rWORD4
882 /* We are speculatively loading bytes before we have tested the previous
883 bytes. But we must avoid overrunning the length (in the ctr) to
884 prevent these speculative loads from causing a segfault. In this
885 case the loop will exit early (before all pending bytes are
886 tested. In this case we must complete the pending operations
923 sub rRTN, rWORD5, rWORD6
929 sub rRTN, rWORD3, rWORD4
933 sub rRTN, rWORD1, rWORD2
942 /* At this point we know the strings have different alignment and the
943 compare length is at least 8 bytes. r12 contains the low order
944 3 bits of rSTR1 and cr5 contains the result of the logical compare
945 of r12 to 0. If r12 == 0 then rStr1 is double word
946 aligned and can perform the DWunaligned loop.
948 Otherwise we know that rSTR1 is not already DW aligned yet.
949 So we can force the string addresses to the next lower DW
950 boundary and special case this first DW using shift left to
951 eliminate bits preceding the first byte. Since we want to join the
952 normal (DWaligned) compare loop, starting at the second double word,
953 we need to adjust the length (rN) and special case the loop
954 versioning for the first DW. This ensures that the loop count is
955 correct and the first DW (shifted) is in the expected register pair. */
957 std rWORD8, rWORD8SAVE(r1)
958 std rWORD7, rWORD7SAVE(r1)
959 std rOFF8, rOFF8SAVE(r1)
960 std rOFF16, rOFF16SAVE(r1)
961 std rOFF24, rOFF24SAVE(r1)
962 std rOFF32, rOFF32SAVE(r1)
963 cfi_offset(rWORD8, rWORD8SAVE)
964 cfi_offset(rWORD7, rWORD7SAVE)
965 cfi_offset(rOFF8, rOFF8SAVE)
966 cfi_offset(rOFF16, rOFF16SAVE)
967 cfi_offset(rOFF24, rOFF24SAVE)
968 cfi_offset(rOFF32, rOFF32SAVE)
973 std rSHL, rSHLSAVE(r1)
974 cfi_offset(rSHL, rSHLSAVE)
975 clrldi rSHL, rSTR2, 61
976 beq cr6, L(duzeroLength)
977 std rSHR, rSHRSAVE(r1)
978 cfi_offset(rSHR, rSHRSAVE)
979 beq cr5, L(DWunaligned)
980 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
981 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
982 /* Adjust the logical start of rSTR2 to compensate for the extra bits
983 in the 1st rSTR1 DW. */
984 sub rWORD8_SHIFT, rSTR2, r12
985 /* But do not attempt to address the DW before that DW that contains
986 the actual start of rSTR2. */
987 clrrdi rSTR2, rSTR2, 3
988 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
989 /* Compute the left/right shift counts for the unaligned rSTR2,
990 compensating for the logical (DW aligned) start of rSTR1. */
991 clrldi rSHL, rWORD8_SHIFT, 61
992 clrrdi rSTR1, rSTR1, 3
993 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
995 cmpld cr5, rWORD8_SHIFT, rSTR2
998 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
999 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1000 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1001 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1002 subfic rSHR, rSHL, 64
1003 srdi r0, rN, 5 /* Divide by 32. */
1004 andi. r12, rN, 24 /* Get the DW remainder. */
1005 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
1006 this special case those bits may be discarded anyway. Also we
1007 must avoid loading a DW where none of the bits are part of rSTR2 as
1008 this may cross a page boundary and cause a page fault. */
1012 addi rSTR2, rSTR2, 8
1013 sld rWORD8, rWORD8, rSHL
1020 srd r12, rWORD2, rSHR
1024 or rWORD8, r12, rWORD8
1028 /* Remainder is 8. */
1031 sld rWORD8_SHIFT, rWORD2, rSHL
1032 sld rWORD7, rWORD1, rWORD6
1033 sld rWORD8, rWORD8, rWORD6
1035 /* At this point we exit early with the first double word compare
1036 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1037 how we handle the remaining bytes. */
1038 cmpld cr5, rWORD7, rWORD8
1045 LD rWORD2, rOFF8, rSTR2
1046 srd r0, rWORD2, rSHR
1048 /* Remainder is 16. */
1051 sld rWORD6_SHIFT, rWORD2, rSHL
1052 sld rWORD5, rWORD1, rWORD6
1053 sld rWORD6, rWORD8, rWORD6
1055 /* Remainder is 24. */
1058 sld rWORD4_SHIFT, rWORD2, rSHL
1059 sld rWORD3, rWORD1, rWORD6
1060 sld rWORD4, rWORD8, rWORD6
1062 /* Count is a multiple of 32, remainder is 0. */
1066 or rWORD8, r12, rWORD8
1067 sld rWORD2_SHIFT, rWORD2, rSHL
1068 sld rWORD1, rWORD1, rWORD6
1069 sld rWORD2, rWORD8, rWORD6
1072 /* At this point we know rSTR1 is double word aligned and the
1073 compare length is at least 8 bytes. */
1076 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1077 clrrdi rSTR2, rSTR2, 3
1078 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1079 srdi r0, rN, 5 /* Divide by 32. */
1080 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1081 andi. r12, rN, 24 /* Get the DW remainder. */
1082 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1083 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
1084 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1085 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1086 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1089 LD rWORD8, rOFF8, rSTR2
1090 addi rSTR2, rSTR2, 8
1094 subfic rSHR, rSHL, 64
1095 sld rWORD6_SHIFT, rWORD6, rSHL
1101 /* Remainder is 8. */
1104 srd r12, rWORD8, rSHR
1106 sld rWORD8_SHIFT, rWORD8, rSHL
1107 or rWORD8, r12, rWORD6_SHIFT
1110 LD rWORD1, rOFF8, rSTR1
1111 LD rWORD2, rOFF8, rSTR2
1112 cmpld cr5, rWORD7, rWORD8
1113 srd r0, rWORD2, rSHR
1114 sld rWORD2_SHIFT, rWORD2, rSHL
1115 or rWORD2, r0, rWORD8_SHIFT
1116 LD rWORD3, rOFF16, rSTR1
1117 LD rWORD4, rOFF16, rSTR2
1118 cmpld cr7, rWORD1, rWORD2
1119 srd r12, rWORD4, rSHR
1120 sld rWORD4_SHIFT, rWORD4, rSHL
1122 or rWORD4, r12, rWORD2_SHIFT
1123 LD rWORD5, rOFF24, rSTR1
1124 LD rWORD6, rOFF24, rSTR2
1125 cmpld cr1, rWORD3, rWORD4
1126 srd r0, rWORD6, rSHR
1127 sld rWORD6_SHIFT, rWORD6, rSHL
1129 or rWORD6, r0, rWORD4_SHIFT
1130 cmpld cr6, rWORD5, rWORD6
1133 /* At this point we exit early with the first double word compare
1134 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1135 how we handle the remaining bytes. */
1137 cmpld cr5, rWORD7, rWORD8
1144 LD rWORD2, rOFF8, rSTR2
1145 srd r0, rWORD2, rSHR
1147 /* Remainder is 16. */
1150 srd r0, rWORD8, rSHR
1152 or rWORD6, r0, rWORD6_SHIFT
1153 sld rWORD6_SHIFT, rWORD8, rSHL
1155 LD rWORD7, rOFF8, rSTR1
1156 LD rWORD8, rOFF8, rSTR2
1157 cmpld cr6, rWORD5, rWORD6
1158 srd r12, rWORD8, rSHR
1159 sld rWORD8_SHIFT, rWORD8, rSHL
1160 or rWORD8, r12, rWORD6_SHIFT
1162 LD rWORD1, rOFF16, rSTR1
1163 LD rWORD2, rOFF16, rSTR2
1164 cmpld cr5, rWORD7, rWORD8
1166 srd r0, rWORD2, rSHR
1167 sld rWORD2_SHIFT, rWORD2, rSHL
1168 or rWORD2, r0, rWORD8_SHIFT
1169 LD rWORD3, rOFF24, rSTR1
1170 LD rWORD4, rOFF24, rSTR2
1171 cmpld cr7, rWORD1, rWORD2
1173 srd r12, rWORD4, rSHR
1174 sld rWORD4_SHIFT, rWORD4, rSHL
1175 or rWORD4, r12, rWORD2_SHIFT
1176 addi rSTR1, rSTR1, 8
1177 addi rSTR2, rSTR2, 8
1178 cmpld cr1, rWORD3, rWORD4
1182 cmpld cr5, rWORD7, rWORD8
1183 addi rSTR1, rSTR1, 8
1184 addi rSTR2, rSTR2, 8
1192 LD rWORD2, rOFF8, rSTR2
1193 srd r0, rWORD2, rSHR
1196 /* Remainder is 24. */
1199 srd r12, rWORD8, rSHR
1201 sld rWORD4_SHIFT, rWORD8, rSHL
1202 or rWORD4, r12, rWORD6_SHIFT
1204 LD rWORD5, rOFF8, rSTR1
1205 LD rWORD6, rOFF8, rSTR2
1206 cmpld cr1, rWORD3, rWORD4
1207 srd r0, rWORD6, rSHR
1208 sld rWORD6_SHIFT, rWORD6, rSHL
1209 or rWORD6, r0, rWORD4_SHIFT
1210 LD rWORD7, rOFF16, rSTR1
1211 LD rWORD8, rOFF16, rSTR2
1212 cmpld cr6, rWORD5, rWORD6
1214 srd r12, rWORD8, rSHR
1215 sld rWORD8_SHIFT, rWORD8, rSHL
1216 or rWORD8, r12, rWORD6_SHIFT
1218 LD rWORD1, rOFF24, rSTR1
1219 LD rWORD2, rOFF24, rSTR2
1220 cmpld cr5, rWORD7, rWORD8
1222 srd r0, rWORD2, rSHR
1223 sld rWORD2_SHIFT, rWORD2, rSHL
1224 or rWORD2, r0, rWORD8_SHIFT
1225 addi rSTR1, rSTR1, 16
1226 addi rSTR2, rSTR2, 16
1227 cmpld cr7, rWORD1, rWORD2
1231 addi rSTR1, rSTR1, 16
1232 addi rSTR2, rSTR2, 16
1233 cmpld cr5, rWORD7, rWORD8
1241 LD rWORD2, rOFF8, rSTR2
1242 srd r0, rWORD2, rSHR
1245 /* Count is a multiple of 32, remainder is 0. */
1249 srd r0, rWORD8, rSHR
1251 sld rWORD2_SHIFT, rWORD8, rSHL
1252 or rWORD2, r0, rWORD6_SHIFT
1254 LD rWORD3, rOFF8, rSTR1
1255 LD rWORD4, rOFF8, rSTR2
1256 cmpld cr7, rWORD1, rWORD2
1257 srd r12, rWORD4, rSHR
1258 sld rWORD4_SHIFT, rWORD4, rSHL
1259 or rWORD4, r12, rWORD2_SHIFT
1260 LD rWORD5, rOFF16, rSTR1
1261 LD rWORD6, rOFF16, rSTR2
1262 cmpld cr1, rWORD3, rWORD4
1264 srd r0, rWORD6, rSHR
1265 sld rWORD6_SHIFT, rWORD6, rSHL
1266 or rWORD6, r0, rWORD4_SHIFT
1267 LD rWORD7, rOFF24, rSTR1
1268 LD rWORD8, rOFF24, rSTR2
1269 addi rSTR1, rSTR1, 24
1270 addi rSTR2, rSTR2, 24
1271 cmpld cr6, rWORD5, rWORD6
1273 srd r12, rWORD8, rSHR
1274 sld rWORD8_SHIFT, rWORD8, rSHL
1275 or rWORD8, r12, rWORD6_SHIFT
1276 cmpld cr5, rWORD7, rWORD8
1277 bdz L(du24) /* Adjust CTR as we start with +4. */
1278 /* This is the primary loop. */
1281 LD rWORD1, rOFF8, rSTR1
1282 LD rWORD2, rOFF8, rSTR2
1283 cmpld cr1, rWORD3, rWORD4
1285 srd r0, rWORD2, rSHR
1286 sld rWORD2_SHIFT, rWORD2, rSHL
1287 or rWORD2, r0, rWORD8_SHIFT
1289 LD rWORD3, rOFF16, rSTR1
1290 LD rWORD4, rOFF16, rSTR2
1291 cmpld cr6, rWORD5, rWORD6
1293 srd r12, rWORD4, rSHR
1294 sld rWORD4_SHIFT, rWORD4, rSHL
1295 or rWORD4, r12, rWORD2_SHIFT
1297 LD rWORD5, rOFF24, rSTR1
1298 LD rWORD6, rOFF24, rSTR2
1299 cmpld cr5, rWORD7, rWORD8
1301 srd r0, rWORD6, rSHR
1302 sld rWORD6_SHIFT, rWORD6, rSHL
1303 or rWORD6, r0, rWORD4_SHIFT
1305 LD rWORD7, rOFF32, rSTR1
1306 LD rWORD8, rOFF32, rSTR2
1307 addi rSTR1, rSTR1, 32
1308 addi rSTR2, rSTR2, 32
1309 cmpld cr7, rWORD1, rWORD2
1311 srd r12, rWORD8, rSHR
1312 sld rWORD8_SHIFT, rWORD8, rSHL
1313 or rWORD8, r12, rWORD6_SHIFT
1317 cmpld cr1, rWORD3, rWORD4
1319 cmpld cr6, rWORD5, rWORD6
1321 cmpld cr5, rWORD7, rWORD8
1331 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1332 shift right double to eliminate bits beyond the compare length.
1334 However it may not be safe to load rWORD2 which may be beyond the
1335 string length. So we compare the bit length of the remainder to
1336 the right shift count (rSHR). If the bit count is less than or equal
1337 we do not need to load rWORD2 (all significant bits are already in
1343 LD rWORD2, rOFF8, rSTR2
1344 srd r0, rWORD2, rSHR
1347 LD rWORD1, rOFF8, rSTR1
1349 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1350 or rWORD2, r0, rWORD8_SHIFT
1351 ld rWORD7, rWORD7SAVE(r1)
1352 ld rSHL, rSHLSAVE(r1)
1353 srd rWORD1, rWORD1, rN
1354 srd rWORD2, rWORD2, rN
1355 ld rSHR, rSHRSAVE(r1)
1356 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1358 cmpld cr7, rWORD1, rWORD2
1359 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1360 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1361 beq cr7, L(dureturn24)
1363 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1364 ld rOFF8, rOFF8SAVE(r1)
1365 ld rOFF16, rOFF16SAVE(r1)
1366 ld rOFF24, rOFF24SAVE(r1)
1367 ld rOFF32, rOFF32SAVE(r1)
1373 ld rWORD8, rWORD8SAVE(r1)
1374 ld rWORD7, rWORD7SAVE(r1)
1376 bgt cr7, L(dureturn29)
1377 ld rSHL, rSHLSAVE(r1)
1378 ld rSHR, rSHRSAVE(r1)
1383 ld rWORD8, rWORD8SAVE(r1)
1384 ld rWORD7, rWORD7SAVE(r1)
1386 bgt cr1, L(dureturn29)
1387 ld rSHL, rSHLSAVE(r1)
1388 ld rSHR, rSHRSAVE(r1)
1393 ld rWORD8, rWORD8SAVE(r1)
1394 ld rWORD7, rWORD7SAVE(r1)
1396 bgt cr6, L(dureturn29)
1397 ld rSHL, rSHLSAVE(r1)
1398 ld rSHR, rSHRSAVE(r1)
1403 ld rWORD8, rWORD8SAVE(r1)
1404 ld rWORD7, rWORD7SAVE(r1)
1406 bgt cr5, L(dureturn29)
1407 ld rSHL, rSHLSAVE(r1)
1408 ld rSHR, rSHRSAVE(r1)
1417 ld rWORD8, rWORD8SAVE(r1)
1418 ld rWORD7, rWORD7SAVE(r1)
1420 ld rSHL, rSHLSAVE(r1)
1421 ld rSHR, rSHRSAVE(r1)
1423 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1424 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1425 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1427 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1428 ld rOFF8, rOFF8SAVE(r1)
1429 ld rOFF16, rOFF16SAVE(r1)
1430 ld rOFF24, rOFF24SAVE(r1)
1431 ld rOFF32, rOFF32SAVE(r1)
1435 ld rOFF8, rOFF8SAVE(r1)
1436 ld rOFF16, rOFF16SAVE(r1)
1437 ld rOFF24, rOFF24SAVE(r1)
1438 ld rOFF32, rOFF32SAVE(r1)
1443 libc_hidden_builtin_def (memcmp)
1444 weak_alias (memcmp, bcmp)