]>
Commit | Line | Data |
---|---|---|
e23d3d26 | 1 | /* Optimized strcmp implementation for Power7 using 'cmpb' instruction |
b168057a | 2 | Copyright (C) 2014-2015 Free Software Foundation, Inc. |
e23d3d26 VR |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
16 | License along with the GNU C Library; if not, see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* The optimization is achieved here through the cmpb instruction. | |
20 | 8-byte aligned strings are processed with doubleword comparison | |
21 | and unaligned strings are handled effectively with a loop unrolling | |
22 | technique. */ | |
23 | ||
24 | #include <sysdep.h> | |
25 | ||
26 | /* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ | |
27 | ||
28 | EALIGN (strcmp, 4, 0) /* entry: r3 = s1, r4 = s2; the result is returned in r3 */ | |
29 | CALL_MCOUNT 2 | |
30 | ||
31 | or r9, r3, r4 /* OR both addresses so one test covers s1 and s2 */ | |
32 | rldicl. r10, r9, 0, 61 /* keep the low 3 bits: non-zero unless s1 and s2 are both 8-byte aligned */ | |
33 | bne cr0, L(process_unaligned_bytes) /* not both aligned: compare byte by byte */ | |
34 | ||
35 | /* Both pointers are doubleword aligned: check the first 8 bytes of each string for NUL, then for equality. */ | |
36 | ld r9, 0(r4) /* load s2 at offset=0 */ | |
37 | li r10, 0 /* r10 = 0, the byte value to look for */ | |
38 | cmpb r10, r9, r10 /* byte-wise compare of s2 with 0: matching bytes give 0xFF, others 0x00 */ | |
39 | cmpdi cr7, r10, 0 /* non-zero result means a NUL byte (end of string) was found */ | |
40 | bne cr7, L(process_unaligned_bytes) /* NUL in s2: process byte by byte */ | |
41 | ||
42 | ld r10, 0(r3) /* load s1 at offset=0 */ | |
43 | li r8, 0 /* r8 = 0, the byte value to look for */ | |
44 | cmpb r8, r10, r8 /* byte-wise compare of s1 with 0 */ | |
45 | cmpdi cr7, r8, 0 /* non-zero result means a NUL byte was found */ | |
46 | bne cr7, L(process_unaligned_bytes) /* NUL in s1: process byte by byte */ | |
47 | ||
48 | /* Neither doubleword contains a NUL, so compare all 8 bytes at once. */ | |
49 | cmpb r9, r10, r9 /* compare s1 and s2 byte-wise */ | |
50 | cmpdi cr7, r9, -1 /* all bytes equal gives 0xFFFFFFFFFFFFFFFF */ | |
51 | bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ | |
52 | ||
53 | addi r5, r3, 8 /* r5 = s1 + 8, next doubleword of s1 */ | |
54 | addi r11, r4, 8 /* r11 = s2 + 8, next doubleword of s2 */ | |
55 | ld r8, 8(r4) /* load s2 at offset=8 */ | |
56 | li r9, 0 /* r9 = 0, the byte value to look for */ | |
57 | cmpb r9, r8, r9 /* byte-wise compare of s2 with 0 */ | |
58 | cmpdi cr7, r9, 0 /* NUL found ..? */ | |
59 | bne cr7, L(processBytes)/* NUL in s2: continue byte by byte from r5/r11 */ | |
60 | ||
61 | mr r9, r4 /* r9 = s2 base address used by the unrolled loop */ | |
62 | li r10, 0 /* r10 = 0, NUL mask read by every cmpb NUL check in the loop */ | |
63 | ||
64 | ld r7, 8(r3) /* load s1 at offset=8 */ | |
65 | cmpb r6, r7, r10 /* byte-wise compare of s1 with 0 */ | |
66 | cmpdi cr7, r6, 0 /* is NUL found */ | |
67 | bne cr7, L(processBytes)/* NUL in s1: continue byte by byte from r5/r11 */ | |
68 | ||
69 | L(unrollDword): /* loop entry: r7 = s1 dword at 8(r3), r8 = s2 dword at 8(r9), both NUL-free; r5/r11 point at them */ | |
70 | cmpb r8, r7, r8 /* compare s1 and s2 */ | |
71 | cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ | |
72 | bne cr7, L(processBytes)/* s1 and s2 differ: locate the byte via r5/r11 */ | |
73 | ||
74 | addi r5, r3, 16 /* r5 = s1 + 16, copied to r3 by update2processBytes */ | |
75 | addi r4, r9, 16 /* r4 = s2 + 16, updated in place for the byte loop */ | |
76 | ld r8, 16(r9) /* load s2 at offset=16 */ | |
77 | cmpb r7, r8, r10 /* compare bytes at s2 with the zero mask */ | |
78 | cmpdi cr7, r7, 0 /* NUL found ..? */ | |
79 | bne cr7, L(update2processBytes) | |
80 | ||
81 | ld r7, 16(r3) /* load s1 at offset=16 */ | |
82 | cmpb r6, r7, r10 /* check s1 for end of string */ | |
83 | cmpdi cr7, r6, 0 /* end of s1? then handle byte by byte */ | |
84 | bne 7,L(update2processBytes) /* bare 7 names cr7, set by the cmpdi above */ | |
85 | ||
86 | cmpb r8, r7, r8 /* compare s1 and s2 doublewords */ | |
87 | cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ | |
88 | bne cr7,L(update2processBytes) | |
89 | ||
90 | addi r5, r3, 24 /* r5 = s1 + 24 */ | |
91 | addi r4, r9, 24 /* r4 = s2 + 24 */ | |
92 | ||
93 | ld r8, 24(r9) /* load s2 at offset=24 */ | |
94 | cmpb r7, r8, r10 /* check s2 for NUL */ | |
95 | cmpdi cr7, r7, 0 /* verify if s2 is ending now */ | |
96 | bne cr7,L(update2processBytes) | |
97 | ||
98 | ld r7, 24(r3) /* load s1 at offset=24 */ | |
99 | cmpb r6, r7, r10 /* check s1 for NUL */ | |
100 | cmpdi cr7, r6, 0 /* is NUL found */ | |
101 | bne cr7, L(update2processBytes) | |
102 | ||
103 | cmpb r8, r7, r8 /* compare s1 and s2 */ | |
104 | cmpdi cr7, r8, -1 /* are s1 and s2 same ..? */ | |
105 | bne cr7, L(update2processBytes) | |
106 | ||
107 | addi r7, r9, 32 /* r7 = s2 base + 32, becomes the new r9 below */ | |
108 | addi r3, r3, 32 /* advance the s1 base by 32 bytes */ | |
109 | ||
110 | ld r8, 32(r9) /* load s2 at offset=32 of the old base */ | |
111 | mr r4, r7 /* r4 = s2 + 32 so the byte loop can be entered directly */ | |
112 | cmpb r6, r8, r10 /* compare s2 with NUL */ | |
113 | cmpdi cr7, r6, 0 /* end of s2 ..? */ | |
114 | bne cr7, L(process_unaligned_bytes) | |
115 | ||
116 | ld r6, 0(r3) /* load s1 at the advanced base and check it for NUL */ | |
117 | cmpb r5, r6, r10 | |
118 | cmpdi cr7, r5, 0 | |
119 | bne cr7, L(process_unaligned_bytes) | |
120 | ||
121 | cmpb r8, r6, r8 /* compare s1 and s2 */ | |
122 | cmpdi cr7, r8, -1 | |
123 | bne cr7, L(process_unaligned_bytes) | |
124 | ||
125 | addi r5, r3, 8 /* r5 = advanced s1 base + 8 */ | |
126 | addi r11, r9, 40 /* r11 = old s2 base + 40, i.e. advanced s2 base (r7) + 8 */ | |
127 | ||
128 | ld r8, 40(r9) /* load the s2 dword that r11 points at */ | |
129 | cmpb r9, r8, r10 /* NUL check of s2; overwrites r9, which is restored from r7 below */ | |
130 | cmpdi cr7, r9, 0 | |
131 | bne cr7, L(processBytes) | |
132 | ||
133 | mr r9, r7 /* r9 = advanced s2 base */ | |
134 | ld r7, 8(r3) /* load the s1 dword that r5 points at */ | |
135 | cmpb r6, r7, r10 | |
136 | cmpdi cr7, r6, 0 | |
137 | beq cr7, L(unrollDword) /* no NUL in s1: loop again; otherwise fall through to processBytes */ | |
138 | ||
139 | L(processBytes): /* resume byte by byte at the doublewords that r5 (s1) and r11 (s2) point at */ | |
140 | mr r4, r11 /* r4 = s2 cursor */ | |
141 | mr r3, r5 /* r3 = s1 cursor; execution continues into the byte loop that follows */ | |
142 | ||
143 | .p2align 4 | |
144 | L(process_unaligned_bytes): /* byte loop unrolled 4 times: r3 = s1 cursor, r4 = s2 cursor */ | |
145 | lbz r9, 0(r3) /* load byte from s1 */ | |
146 | lbz r10, 0(r4) /* load byte from s2 */ | |
147 | cmpdi cr7, r9, 0 /* compare *s1 with NUL */ | |
148 | beq cr7, L(diffOfNULL) /* if *s1 is NUL, return 0 - *s2 */ | |
149 | cmplw cr7, r9, r10 /* compare *s1 and *s2 */ | |
150 | bne cr7, L(ComputeDiff) /* branch to compute difference and return */ | |
151 | ||
152 | lbz r9, 1(r3) /* load next byte from s1 */ | |
153 | lbz r10, 1(r4) /* load next byte from s2 */ | |
154 | cmpdi cr7, r9, 0 /* compare *s1 with NUL */ | |
155 | beq cr7, L(diffOfNULL) /* if *s1 is NUL, return 0 - *s2 */ | |
156 | cmplw cr7, r9, r10 /* compare *s1 and *s2 */ | |
157 | bne cr7, L(ComputeDiff) /* branch to compute difference and return */ | |
158 | ||
159 | lbz r9, 2(r3) /* third unrolled byte */ | |
160 | lbz r10, 2(r4) | |
161 | cmpdi cr7, r9, 0 | |
162 | beq cr7, L(diffOfNULL) | |
163 | cmplw cr7, r9, r10 | |
164 | bne 7, L(ComputeDiff) /* bare 7 names cr7, set by the cmplw above */ | |
165 | ||
166 | lbz r9, 3(r3) /* fourth unrolled byte */ | |
167 | lbz r10, 3(r4) | |
168 | addi r3, r3, 4 /* increment s1 by unroll factor */ | |
169 | cmpdi cr7, r9, 0 /* cr7: is *s1 NUL? */ | |
170 | cmplw cr6, 9, r10 /* cr6: *s1 (r9, written as bare 9) versus *s2 */ | |
171 | beq cr7, L(diffOfNULL) | |
172 | addi r4, r4, 4 /* increment s2 by unroll factor */ | |
173 | beq cr6, L(process_unaligned_bytes) /* 4 bytes equal: loop again; otherwise fall through to ComputeDiff */ | |
174 | ||
175 | .p2align 4 | |
176 | L(ComputeDiff): /* r9 = byte from s1, r10 = byte from s2 at the first difference */ | |
177 | extsw r9, r9 /* sign-extend the low word of r9 */ | |
178 | subf r10, r10, r9 /* r10 = r9 - r10, i.e. *s1 - *s2 */ | |
179 | extsw r3, r10 /* sign-extend the difference into the return register */ | |
180 | blr /* return */ | |
181 | ||
182 | .p2align 4 | |
183 | L(diffOfNULL): /* reached when the s1 byte is NUL; r10 holds the matching s2 byte */ | |
184 | li r9, 0 /* r9 = 0 stands in for the NUL byte of s1 */ | |
185 | subf r10, r10, r9 /* r10 = 0 - *s2 (zero when s2 ends here too) */ | |
186 | extsw r3, r10 /* sign extend result */ | |
187 | blr /* return */ | |
188 | ||
189 | .p2align 4 | |
190 | L(update2processBytes): /* exit from the doubleword loop where r4 has already been advanced */ | |
191 | mr r3, r5 /* r3 = s1 cursor saved in r5; r4 already holds the s2 cursor */ | |
192 | b L(process_unaligned_bytes) /* find the differing or NUL byte one byte at a time */ | |
193 | ||
194 | END (strcmp) | |
195 | libc_hidden_builtin_def (strcmp) |