/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
# define MEMCMP memcmp
#ifdef __LITTLE_ENDIAN__
/* Little endian is only available since POWER8, so it's safe to
   specify .machine as power8 (or older), even though this is a POWER4
   file.  Since the little-endian code uses 'ldbrx', power7 is enough.  */
	.machine	power7
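/* As a rough C model (illustrative only, not part of the build) of why
   a byte-reversed load lets a single unsigned compare implement memcmp
   ordering on little-endian: ldbrx yields the big-endian interpretation
   of 8 bytes of memory, so the byte at the lowest address becomes the
   most significant.  Assumes GCC's __builtin_bswap64; the helper names
   are hypothetical.

     #include <stdint.h>
     #include <string.h>

     // Model of ldbrx on a little-endian host: native load, then swap.
     static uint64_t load_be64 (const unsigned char *p)
     {
       uint64_t w;
       memcpy (&w, p, 8);
       return __builtin_bswap64 (w);
     }

     // memcmp ordering of 8 bytes == unsigned compare of the BE words.
     static int cmp_dw (const unsigned char *a, const unsigned char *b)
     {
       uint64_t x = load_be64 (a), y = load_be64 (b);
       return (x > y) - (x < y);
     }
*/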
ENTRY_TOCLESS (MEMCMP, 4)
#define rRTN	r3	/* Return value (overlaps rSTR1).  */
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */
	beq-	cr6, L(zeroLength)
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	cfi_offset(rWORD8, -8)
	cfi_offset(rWORD7, -16)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
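/* A minimal C sketch of the prologue described above (illustrative
   only, not part of the build; helper names are hypothetical).  Both
   strings share the same low 3 address bits (r12), so rounding both
   pointers down and shifting the first big-endian doubleword left by
   8 * r12 bits discards exactly the bytes that precede the strings.

     #include <stdint.h>
     #include <string.h>

     static uint64_t load_be64 (const unsigned char *p)
     {
       uint64_t w;
       memcpy (&w, p, 8);
       return __builtin_bswap64 (w);   // models ldbrx on little-endian
     }

     // Compare the first (partial) doubleword of two equally
     // misaligned strings; nonzero means "first DW equal".
     static int head_dw_equal (const unsigned char *s1,
                               const unsigned char *s2)
     {
       unsigned r12 = (uintptr_t) s1 & 7;   // low 3 bits, same for s2
       const unsigned char *a =
         (const unsigned char *) ((uintptr_t) s1 & ~(uintptr_t) 7);
       const unsigned char *b =
         (const unsigned char *) ((uintptr_t) s2 & ~(uintptr_t) 7);
       unsigned sh = 8 * r12;               // bits before the first byte
       // In big-endian order the leading bytes are the high bits, so a
       // left shift removes everything before the strings start.
       return (load_be64 (a) << sh) == (load_be64 (b) << sh);
     }
*/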
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
/* Remainder is 16 */
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
/* Remainder is 24 */
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
/* Count is a multiple of 32, remainder is 0 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is that the
   non-early-exit path only cares about the condition code (cr5), not
   about which register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
/* Remainder is 16 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
/* Remainder is 24 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
/* Count is a multiple of 32, remainder is 0 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr7, rWORD1, rWORD2
	cmpld	cr1, rWORD3, rWORD4
	cmpld	cr6, rWORD5, rWORD6
	cmpld	cr5, rWORD7, rWORD8
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
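/* A hedged C model of this trim (illustrative only, not part of the
   build).  Reading a full doubleword past the compare length is fine
   in the assembly because the access is aligned and stays on the same
   page; in C the same read would be out of bounds, so this is only a
   model.

     #include <stdint.h>
     #include <string.h>

     // rem is the 1..7 remaining bytes; both pointers are DW aligned.
     static int tail_cmp (const unsigned char *a, const unsigned char *b,
                          unsigned rem)
     {
       uint64_t w1, w2;
       memcpy (&w1, a, 8);               // aligned over-read (asm-safe)
       memcpy (&w2, b, 8);
       w1 = __builtin_bswap64 (w1);      // big-endian view on an LE host
       w2 = __builtin_bswap64 (w2);
       unsigned sh = 64 - 8 * rem;       // bits beyond the length
       w1 >>= sh;                        // srd: drop trailing bytes
       w2 >>= sh;
       return (w1 > w2) - (w1 < w2);
     }
*/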
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
#if 0
/* Huh?  We've already branched on cr6!  */
	beq-	cr6, L(zeroLength)
#endif
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
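/* The shape of such a software-pipelined loop, as a C sketch
   (illustrative only, not part of the build): operands for the next
   test are loaded before the branch that consumes the previous
   compare, so the first iteration only primes registers and the last
   pending compare completes after the loop exits.  The real loop below
   is unrolled by three; this sketch shows a one-deep pipeline.

     // n must be at least 1.
     static int byte_loop (const unsigned char *s1,
                           const unsigned char *s2, unsigned long n)
     {
       int a = *s1++, b = *s2++;         // prime the pipeline
       while (--n)
         {
           int na = *s1++, nb = *s2++;   // load for the NEXT test
           if (a != b)                   // branch on the PREVIOUS load
             return a - b;
           a = na; b = nb;
         }
       return a - b;                     // drain the pending compare
     }
*/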
	cmpld	cr7, rWORD1, rWORD2
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	cmpld	cr7, rWORD1, rWORD2
	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all pending bytes are
   tested).  We must then complete the pending operations before
   returning.  */
	sub	rRTN, rWORD5, rWORD6
	sub	rRTN, rWORD3, rWORD4
	sub	rRTN, rWORD1, rWORD2
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and
   special case this first DW using shift left to eliminate bits
   preceding the first byte.  Since we want to join the normal
   (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
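/* A minimal C sketch of how each logical doubleword of the unaligned
   string is assembled (illustrative only, not part of the build;
   helper names are hypothetical).  With shl = 8 * (misalignment) and
   shr = 64 - shl, each logical DW is built from two consecutive
   aligned big-endian loads, mirroring the sld/srd/or pattern below.

     #include <stdint.h>
     #include <string.h>

     static uint64_t load_be64 (const unsigned char *p)
     {
       uint64_t w;
       memcpy (&w, p, 8);
       return __builtin_bswap64 (w);   // models ldbrx on little-endian
     }

     // prev and next are consecutive aligned DWs covering the logical
     // doubleword; shl is in 8..56.
     static uint64_t logical_dw (const unsigned char *prev,
                                 const unsigned char *next, unsigned shl)
     {
       unsigned shr = 64 - shl;
       return (load_be64 (prev) << shl)    // tail bytes of the prior DW
              | (load_be64 (next) >> shr); // head bytes of the next DW
     }
*/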
#define rSHL		r29	/* Unaligned shift left count.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
	cfi_offset(rSHL, -24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	cfi_offset(rSHR, -32)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, -40(r1)
	cfi_offset(rWORD8_SHIFT, -40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, -56(r1)
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
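/* The safety criterion, as a small C predicate (illustrative only, not
   part of the build): an aligned 8-byte load at 'base' may touch an
   unmapped page unless it overlaps the string [s, s + n), so the code
   below only issues a load whose DW still contains bytes of rSTR2.

     #include <stdint.h>
     #include <stddef.h>

     // Nonzero if the aligned doubleword at 'base' contains at least
     // one byte of the string starting at 's' with length 'n'.
     static int dw_load_overlaps (uintptr_t base, uintptr_t s, size_t n)
     {
       return base + 8 > s && base < s + n;
     }
*/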
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD8, 0, rSTR2
	sld	rWORD8, rWORD8, rSHL
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	srd	r12, rWORD2, rSHR
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
/* Remainder is 16 */
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
/* Remainder is 24 */
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
/* Count is a multiple of 32, remainder is 0 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	std	rWORD8_SHIFT, -40(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, -48(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, -56(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, -64(r1)
	cfi_offset(rWORD8_SHIFT, -40)
	cfi_offset(rWORD2_SHIFT, -48)
	cfi_offset(rWORD4_SHIFT, -56)
	cfi_offset(rWORD6_SHIFT, -64)
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD6, 0, rSTR2
	ldbrx	rWORD8, 0, rSTR2
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
/* Remainder is 16 */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	cmpld	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	srd	r0, rWORD2, rSHR
/* Remainder is 24 */
	srd	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 8
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
/* Huh?  We've already branched on cr1!  */
	cmpld	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	srd	r0, rWORD2, rSHR
/* Count is a multiple of 32, remainder is 0 */
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srd	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 8
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD3, 0, rSTR1
	ldbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD5, 0, rSTR1
	ldbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD7, 0, rSTR1
	ldbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
/* Huh?  We've already branched on cr1!  */
	cmpld	cr1, rWORD3, rWORD4
	cmpld	cr6, rWORD5, rWORD6
	cmpld	cr5, rWORD7, rWORD8
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
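/* A hedged C model of this tail step (illustrative only, not part of
   the build; names are hypothetical).  carry holds the leftover of the
   previous aligned DW (prev << shl), whose top shr bits are valid, so
   the next DW is loaded only when the remainder extends past them.

     #include <stdint.h>
     #include <string.h>

     // rem_bits = 8 * remaining bytes (8..56); shr = 64 - shl.
     static uint64_t s2_tail (const unsigned char *next_aligned,
                              uint64_t carry, unsigned rem_bits,
                              unsigned shr)
     {
       uint64_t w = carry;               // top shr bits already valid
       if (rem_bits > shr)               // tail reaches into the next DW
         {
           uint64_t nxt;
           memcpy (&nxt, next_aligned, 8);
           w |= __builtin_bswap64 (nxt) >> shr;  // BE view on an LE host
         }
       return w >> (64 - rem_bits);      // keep only the remaining bytes
     }
*/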
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	srd	r0, rWORD2, rSHR
#ifdef __LITTLE_ENDIAN__
	ldbrx	rWORD1, 0, rSTR1
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rWORD8_SHIFT, -40(r1)
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	beq	cr7, L(dureturn24)
	ld	rWORD6_SHIFT, -64(r1)
	bgt	cr7, L(dureturn29)
	bgt	cr1, L(dureturn29)
	bgt	cr6, L(dureturn29)
	bgt	cr5, L(dureturn29)
	ld	rWORD8_SHIFT, -40(r1)
	ld	rWORD2_SHIFT, -48(r1)
	ld	rWORD4_SHIFT, -56(r1)
	ld	rWORD6_SHIFT, -64(r1)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)