1 /* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
29 #define rSTR1 r3 /* first string arg */
30 #define rSTR2 r4 /* second string arg */
31 #define rN r5 /* max string length */
32 #define rWORD1 r6 /* current word in s1 */
33 #define rWORD2 r7 /* current word in s2 */
34 #define rWORD3 r8 /* next word in s1 */
35 #define rWORD4 r9 /* next word in s2 */
36 #define rWORD5 r10 /* next word in s1 */
37 #define rWORD6 r11 /* next word in s2 */
38 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
39 #define rWORD7 r30 /* next word in s1 */
40 #define rWORD8 r31 /* next word in s2 */
42 xor rTMP, rSTR2, rSTR1
45 clrldi. rTMP, rTMP, 61
46 clrldi rBITDIF, rSTR1, 61
47 cmpldi cr5, rBITDIF, 0
48 beq- cr6, L(zeroLength)
51 /* If less than 8 bytes or not aligned, use the unaligned
53 blt cr1, L(bytealigned)
57 cfi_offset(rWORD7,-16)
59 /* At this point we know both strings have the same alignment and the
60 compare length is at least 8 bytes. rBITDIF contains the low order
61 3 bits of rSTR1 and cr5 contains the result of the logical compare
62 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
63 aligned and can perform the DWaligned loop.
65 Otherwise we know the two strings have the same alignment (but not
66 yet DW). So we can force the string addresses to the next lower DW
67 boundary and special case this first DW word using shift left to
68 eliminate bits preceding the first byte. Since we want to join the
69 normal (DWaligned) compare loop, starting at the second double word,
70 we need to adjust the length (rN) and special case the loop
71 versioning for the first DW. This ensures that the loop count is
72 correct and the first DW (shifted) is in the expected register pair. */
75 clrrdi rSTR1, rSTR1, 3
76 clrrdi rSTR2, rSTR2, 3
80 srdi rTMP, rN, 5 /* Divide by 32 */
81 andi. rBITDIF, rN, 24 /* Get the DW remainder */
84 cmpldi cr1, rBITDIF, 16
88 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
95 sld rWORD5, rWORD1, r11
96 sld rWORD6, rWORD2, r11
97 cmpld cr5, rWORD5, rWORD6
99 /* Do something useful in this cycle since we have to branch anyway. */
102 cmpld cr0, rWORD1, rWORD2
104 /* Remainder is 16 */
107 sld rWORD5, rWORD1, r11
108 sld rWORD6, rWORD2, r11
109 cmpld cr6, rWORD5, rWORD6
111 /* Do something useful in this cycle since we have to branch anyway. */
114 cmpld cr5, rWORD7, rWORD8
116 /* Remainder is 24 */
119 sld rWORD3, rWORD1, r11
120 sld rWORD4, rWORD2, r11
121 cmpld cr1, rWORD3, rWORD4
123 /* Count is a multiple of 32, remainder is 0 */
126 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
127 sld rWORD1, rWORD1, r11
128 sld rWORD2, rWORD2, r11
129 cmpld cr0, rWORD1, rWORD2
132 /* At this point we know both strings are double word aligned and the
133 compare length is at least 8 bytes. */
136 andi. rBITDIF, rN, 24 /* Get the DW remainder */
137 srdi rTMP, rN, 5 /* Divide by 32 */
138 cmpldi cr1, rBITDIF, 16
148 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
149 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
150 (8-15 byte compare), we want to use only volatile registers. This
151 means we can avoid restoring non-volatile registers since we did not
152 change any on the early exit path. The key here is the non-early
153 exit path only cares about the condition code (cr5), not about which
154 register pair was used. */
157 cmpld cr5, rWORD5, rWORD6
161 cmpld cr0, rWORD1, rWORD2
165 cmpld cr1, rWORD3, rWORD4
168 cmpld cr6, rWORD5, rWORD6
172 ldu rWORD7, 32(rSTR1)
173 ldu rWORD8, 32(rSTR2)
175 cmpld cr5, rWORD7, rWORD8
184 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
189 /* Remainder is 16 */
192 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
195 cmpld cr6, rWORD5, rWORD6
199 cmpld cr5, rWORD7, rWORD8
203 cmpld cr0, rWORD1, rWORD2
206 cmpld cr1, rWORD3, rWORD4
212 /* Again we are on an early exit path (16-23 byte compare), we want to
213 only use volatile registers and avoid restoring non-volatile
219 cmpld cr5, rWORD3, rWORD4
225 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
230 /* Remainder is 24 */
233 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
236 cmpld cr1, rWORD3, rWORD4
240 cmpld cr6, rWORD5, rWORD6
244 cmpld cr5, rWORD7, rWORD8
247 cmpld cr0, rWORD1, rWORD2
248 addi rSTR1, rSTR1, 16
249 addi rSTR2, rSTR2, 16
253 /* Again we are on an early exit path (24-31 byte compare), we want to
254 only use volatile registers and avoid restoring non-volatile
260 cmpld cr5, rWORD1, rWORD2
263 addi rSTR1, rSTR1, 16
264 addi rSTR2, rSTR2, 16
266 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
272 /* Count is a multiple of 32, remainder is 0 */
275 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
278 cmpld cr0, rWORD1, rWORD2
282 cmpld cr1, rWORD3, rWORD4
285 cmpld cr6, rWORD5, rWORD6
286 ldu rWORD7, 24(rSTR1)
287 ldu rWORD8, 24(rSTR2)
288 cmpld cr5, rWORD7, rWORD8
291 bdz- L(d24) /* Adjust CTR as we start with +4 */
292 /* This is the primary loop */
297 cmpld cr1, rWORD3, rWORD4
302 cmpld cr6, rWORD5, rWORD6
307 cmpld cr5, rWORD7, rWORD8
310 ldu rWORD7, 32(rSTR1)
311 ldu rWORD8, 32(rSTR2)
313 cmpld cr0, rWORD1, rWORD2
317 cmpld cr1, rWORD3, rWORD4
319 cmpld cr6, rWORD5, rWORD6
321 cmpld cr5, rWORD7, rWORD8
334 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
336 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
337 we are aligned it is safe to load the whole double word, and use
338 shift right double to eliminate bits beyond the compare length. */
342 srd rWORD1, rWORD1, rN
343 srd rWORD2, rWORD2, rN
344 cmpld cr5, rWORD1, rWORD2
384 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
385 beq- cr6, L(zeroLength)
387 /* We need to prime this loop. This loop is swing modulo scheduled
388 to avoid pipe delays. The dependent instruction latencies (load to
389 compare to conditional branch) is 2 to 3 cycles. In this loop each
390 dispatch group ends in a branch and takes 1 cycle. Effectively
391 the first iteration of the loop only serves to load operands and
392 branches based on compares are delayed until the next loop.
394 So we must precondition some registers and condition codes so that
395 we don't exit the loop early on the first iteration. */
400 cmpld cr0, rWORD1, rWORD2
404 cmpld cr1, rWORD3, rWORD4
405 lbzu rWORD5, 2(rSTR1)
406 lbzu rWORD6, 2(rSTR2)
410 lbzu rWORD1, 1(rSTR1)
411 lbzu rWORD2, 1(rSTR2)
414 cmpld cr6, rWORD5, rWORD6
417 lbzu rWORD3, 1(rSTR1)
418 lbzu rWORD4, 1(rSTR2)
421 cmpld cr0, rWORD1, rWORD2
424 lbzu rWORD5, 1(rSTR1)
425 lbzu rWORD6, 1(rSTR2)
428 cmpld cr1, rWORD3, rWORD4
431 /* We are speculatively loading bytes before we have tested the previous
432 bytes. But we must avoid overrunning the length (in the ctr) to
433 prevent these speculative loads from causing a segfault. In this
434 case the loop will exit early (before all the pending bytes are
435 tested). In this case we must complete the pending operations
472 sub rRTN, rWORD5, rWORD6
478 sub rRTN, rWORD3, rWORD4
482 sub rRTN, rWORD1, rWORD2
493 /* At this point we know the strings have different alignment and the
494 compare length is at least 8 bytes. rBITDIF contains the low order
495 3 bits of rSTR1 and cr5 contains the result of the logical compare
496 of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is double word
497 aligned and we can perform the DWunaligned loop.
499 Otherwise we know that rSTR1 is not already DW aligned yet.
500 So we can force the string addresses to the next lower DW
501 boundary and special case this first DW word using shift left to
502 eliminate bits preceding the first byte. Since we want to join the
503 normal (DWaligned) compare loop, starting at the second double word,
504 we need to adjust the length (rN) and special case the loop
505 versioning for the first DW. This ensures that the loop count is
506 correct and the first DW (shifted) is in the expected register pair. */
507 #define rSHL r29 /* Unaligned shift left count. */
508 #define rSHR r28 /* Unaligned shift right count. */
509 #define rB r27 /* Left rotation temp for rWORD2. */
510 #define rD r26 /* Left rotation temp for rWORD4. */
511 #define rF r25 /* Left rotation temp for rWORD6. */
512 #define rH r24 /* Left rotation temp for rWORD8. */
513 #define rA r0 /* Right rotation temp for rWORD2. */
514 #define rC r12 /* Right rotation temp for rWORD4. */
515 #define rE r0 /* Right rotation temp for rWORD6. */
516 #define rG r12 /* Right rotation temp for rWORD8. */
520 clrldi rSHL, rSTR2, 61
521 beq- cr6, L(duzeroLength)
524 beq cr5, L(DWunaligned)
527 /* Adjust the logical start of rSTR2 to compensate for the extra bits
528 in the 1st rSTR1 DW. */
529 sub r27, rSTR2, rBITDIF
530 /* But do not attempt to address the DW before that DW that contains
531 the actual start of rSTR2. */
532 clrrdi rSTR2, rSTR2, 3
535 /* Compute the left/right shift counts for the unaligned rSTR2,
536 compensating for the logical (DW aligned) start of rSTR1. */
538 clrrdi rSTR1, rSTR1, 3
542 cmpld cr5, r27, rSTR2
547 subfic rSHR, rSHL, 64
548 srdi rTMP, rN, 5 /* Divide by 32 */
549 andi. rBITDIF, rN, 24 /* Get the DW remainder */
550 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
551 this special case those bits may be discarded anyway. Also we
552 must avoid loading a DW where none of the bits are part of rSTR2 as
553 this may cross a page boundary and cause a page fault. */
558 sld rWORD8, rWORD8, rSHL
563 cmpldi cr1, rBITDIF, 16
568 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
569 or rWORD8, rG, rWORD8
577 sld rWORD7, rWORD1, r11
578 sld rWORD8, rWORD8, r11
580 /* At this point we exit early with the first double word compare
581 complete and remainder of 0 to 7 bytes. See L(du14) for details on
582 how we handle the remaining bytes. */
583 cmpld cr5, rWORD7, rWORD8
593 /* Remainder is 16 */
597 sld rWORD5, rWORD1, r11
598 sld rWORD6, rWORD8, r11
600 /* Remainder is 24 */
604 sld rWORD3, rWORD1, r11
605 sld rWORD4, rWORD8, r11
607 /* Count is a multiple of 32, remainder is 0 */
610 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
611 or rWORD8, rG, rWORD8
613 sld rWORD1, rWORD1, r11
614 sld rWORD2, rWORD8, r11
617 /* At this point we know rSTR1 is double word aligned and the
618 compare length is at least 8 bytes. */
623 clrrdi rSTR2, rSTR2, 3
626 srdi rTMP, rN, 5 /* Divide by 32 */
629 andi. rBITDIF, rN, 24 /* Get the DW remainder */
635 cmpldi cr1, rBITDIF, 16
638 subfic rSHR, rSHL, 64
641 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
656 cmpld cr5, rWORD7, rWORD8
662 cmpld cr0, rWORD1, rWORD2
669 cmpld cr1, rWORD3, rWORD4
674 cmpld cr6, rWORD5, rWORD6
677 /* At this point we exit early with the first double word compare
678 complete and remainder of 0 to 7 bytes. See L(du14) for details on
679 how we handle the remaining bytes. */
681 cmpld cr5, rWORD7, rWORD8
691 /* Remainder is 16 */
701 cmpld cr6, rWORD5, rWORD6
708 cmpld cr5, rWORD7, rWORD8
715 cmpld cr0, rWORD1, rWORD2
722 cmpld cr1, rWORD3, rWORD4
726 cmpld cr5, rWORD7, rWORD8
740 /* Remainder is 24 */
750 cmpld cr1, rWORD3, rWORD4
756 cmpld cr6, rWORD5, rWORD6
764 cmpld cr5, rWORD7, rWORD8
769 addi rSTR1, rSTR1, 16
770 addi rSTR2, rSTR2, 16
771 cmpld cr0, rWORD1, rWORD2
775 addi rSTR1, rSTR1, 16
776 addi rSTR2, rSTR2, 16
778 cmpld cr5, rWORD7, rWORD8
790 /* Count is a multiple of 32, remainder is 0 */
793 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
801 cmpld cr0, rWORD1, rWORD2
807 cmpld cr1, rWORD3, rWORD4
812 ldu rWORD7, 24(rSTR1)
813 ldu rWORD8, 24(rSTR2)
814 cmpld cr6, rWORD5, rWORD6
819 cmpld cr5, rWORD7, rWORD8
820 bdz- L(du24) /* Adjust CTR as we start with +4 */
821 /* This is the primary loop */
826 cmpld cr1, rWORD3, rWORD4
834 cmpld cr6, rWORD5, rWORD6
842 cmpld cr5, rWORD7, rWORD8
848 ldu rWORD7, 32(rSTR1)
849 ldu rWORD8, 32(rSTR2)
850 cmpld cr0, rWORD1, rWORD2
859 cmpld cr1, rWORD3, rWORD4
861 cmpld cr6, rWORD5, rWORD6
863 cmpld cr5, rWORD7, rWORD8
873 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
874 shift right double to eliminate bits beyond the compare length.
875 This allows the use of double word subtract to compute the final
878 However it may not be safe to load rWORD2 which may be beyond the
879 string length. So we compare the bit length of the remainder to
880 the right shift count (rSHR). If the bit count is less than or equal
881 we do not need to load rWORD2 (all significant bits are already in
893 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
897 srd rWORD1, rWORD1, rN
898 srd rWORD2, rWORD2, rN
902 cmpld cr0, rWORD1, rWORD2
905 beq cr0, L(dureturn24)
916 bgt cr0, L(dureturn29)
926 bgt cr1, L(dureturn29)
936 bgt cr6, L(dureturn29)
946 bgt cr5, L(dureturn29)
975 libc_hidden_builtin_def (memcmp)
976 weak_alias (memcmp, bcmp)