/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
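/* For reference, the semantics implemented below match this minimal C
   sketch (an illustration only, not part of the build):

     int memcmp (const char *s1, const char *s2, size_t size)
     {
       const unsigned char *p1 = (const unsigned char *) s1;
       const unsigned char *p2 = (const unsigned char *) s2;
       while (size-- != 0)
	 {
	   if (*p1 != *p2)
	     return *p1 - *p2;
	   p1++;
	   p2++;
	 }
       return 0;
     }

   The code below computes the same result a word (4 bytes) at a time
   whenever the alignment and length allow it.  */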

	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT

#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	r0, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	r0, r0, 30
	clrlwi	r12, rSTR1, 30
	cmplwi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1, -64(r1)
	cfi_adjust_cfa_offset(64)
	stw	rWORD8, 48(r1)
	cfi_offset(rWORD8, (48-64))
	stw	rWORD7, 44(r1)
	cfi_offset(rWORD7, (44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
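/* A worked example of the fixup above (illustrative): if rSTR1 ends
   in binary 10, then r12 = 2 and both addresses are rounded down to
   the enclosing word boundary.  rN is increased by 2 so the loop
   count stays consistent, and rWORD6 = r12 * 8 = 16 is the shift
   count that discards the two bytes preceding the real start of the
   strings when the first words are shifted left.  */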
	.align	4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the word remainder */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align	3
L(dsP1):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align	4
L(dPs2):
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD2, rWORD6
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align	4
L(dPs3):
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD2, rWORD6
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD2, rWORD6
	cmplw	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Waligned):
	andi.	r12, rN, 12	/* Get the word remainder */
	srwi	r0, rN, 4	/* Divide by 16 */
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align	4
L(dP1):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	.align	3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP2):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP3):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align	4
	cfi_adjust_cfa_offset(64)
L(dP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
L(dP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	bne-	cr1, L(dLcr1)
	cmplw	cr7, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
L(d00):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	sub	rRTN, rWORD1, rWORD2
	blr
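/* Worked example of the trim above (illustrative): with 1 byte left,
   r12 = 8 and rN becomes 32 - 8 = 24, so srw keeps only the
   logically first byte of each word (the high-order byte after the
   endian-correcting load).  The shifted words then fit well within
   31 bits, so a plain subtract produces a correctly signed result.  */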

	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr7):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr7x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr1):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr1x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr6):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr6x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
	cfi_adjust_cfa_offset(64)
L(dLcr5):
	lwz	rWORD7, 44(r1)
	lwz	rWORD8, 48(r1)
L(dLcr5x):
	li	rRTN, 1
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
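/* A rough sketch of the pipelining below (illustrative pseudocode;
   names refer to the register defines above):

     load rWORD1/rWORD2, rWORD3/rWORD4, rWORD5/rWORD6   -- priming
     loop:
       load the next byte pair
       branch on a compare issued two pairs earlier
       compare the pair loaded one group earlier

   so each byte pair is loaded, compared, and branched on in three
   successive dispatch groups, hiding the load-compare-branch latency.  */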

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr7, L(bLcr7)

	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmplw	cr7, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested), and we must complete the pending compares before
   returning.  */
L(b1i):
	bne-	cr7, L(bLcr7)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr7, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
#define rSHL	r29	/* Unaligned shift left count.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
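/* How the pieces recombine (a worked example, illustrative): if
   rSTR2 is offset by 1 byte relative to the word-aligned rSTR1,
   then rSHL = 8 and rSHR = 24, and each logical word of s2 is built
   as

	word = (previous aligned word << 8) | (next aligned word >> 24)

   that is, the last three bytes of one aligned load joined with the
   first byte of the next.  The rWORD*_SHIFT registers carry the
   "<< rSHL" piece from one step to the next so each aligned word is
   loaded exactly once.  */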
	cfi_adjust_cfa_offset(64)
L(unaligned):
	stw	rSHL, 40(r1)
	cfi_offset(rSHL, (40-64))
	clrlwi	rSHL, rSTR2, 30
	stw	rSHR, 36(r1)
	cfi_offset(rSHR, (36-64))
	beq	cr5, L(Wunaligned)
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	clrlwi	rSHL, rWORD8_SHIFT, 30
	clrrwi	rSTR1, rSTR1, 2
	stw	rWORD4_SHIFT, 24(r1)
	cfi_offset(rWORD4_SHIFT, (24-64))
	slwi	rSHL, rSHL, 3
	cmplw	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	slwi	rWORD6, r12, 3
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD6_SHIFT, (20-64))
	subfic	rSHR, rSHL, 32
	srwi	r0, rN, 4	/* Divide by 16 */
	andi.	r12, rN, 12	/* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD8, 0(rSTR2)
	addi	rSTR2, rSTR2, 4
#endif
	slw	rWORD8, rWORD8, rSHL

L(dus0):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	srw	r12, rWORD2, rSHR
	clrlwi	rN, rN, 30
	beq	L(duPs4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 4 */
	.align	4
L(dusP1):
	slw	rWORD8_SHIFT, rWORD2, rSHL
	slw	rWORD7, rWORD1, rWORD6
	slw	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duPs2):
	slw	rWORD6_SHIFT, rWORD2, rSHL
	slw	rWORD5, rWORD1, rWORD6
	slw	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 12 */
	.align	4
L(duPs3):
	slw	rWORD4_SHIFT, rWORD2, rSHL
	slw	rWORD3, rWORD1, rWORD6
	slw	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, r12, rWORD8
	slw	rWORD2_SHIFT, rWORD2, rSHL
	slw	rWORD1, rWORD1, rWORD6
	slw	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(Wunaligned):
	stw	rWORD8_SHIFT, 32(r1)
	cfi_offset(rWORD8_SHIFT, (32-64))
	clrrwi	rSTR2, rSTR2, 2
	stw	rWORD2_SHIFT, 28(r1)
	cfi_offset(rWORD2_SHIFT, (28-64))
	srwi	r0, rN, 4	/* Divide by 16 */
	stw	rWORD4_SHIFT, 24(r1)
	cfi_offset(rWORD4_SHIFT, (24-64))
	andi.	r12, rN, 12	/* Get the W remainder */
	stw	rWORD6_SHIFT, 20(r1)
	cfi_offset(rWORD6_SHIFT, (20-64))
	slwi	rSHL, rSHL, 3
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR2, rSTR2, 4
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD6, 0(rSTR2)
	lwzu	rWORD8, 4(rSTR2)
#endif
	cmplwi	cr1, r12, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	subfic	rSHR, rSHL, 32
	slw	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 4 */
	.align	4
L(duP1):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD7, 0(rSTR1)
#endif
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmplw	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 8(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align	4
L(duP2):
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD5, 0(rSTR1)
#endif
	or	rWORD6, r0, rWORD6_SHIFT
	slw	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	cmplw	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmplw	cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#endif
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 12 */
	.align	4
L(duP3):
	srw	r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD3, 0(rSTR1)
#endif
	slw	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
	cmplw	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
#ifndef __LITTLE_ENDIAN__
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
#endif
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
	srw	r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	addi	rSTR1, rSTR1, 4
#else
	lwz	rWORD1, 0(rSTR1)
#endif
	slw	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmplw	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srw	r0, rWORD2, rSHR
	slw	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD3, 0, rSTR1
	lwbrx	rWORD4, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
#endif
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srw	r12, rWORD4, rSHR
	slw	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD5, 0, rSTR1
	lwbrx	rWORD6, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
#endif
	cmplw	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srw	r0, rWORD6, rSHR
	slw	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD7, 0, rSTR1
	lwbrx	rWORD8, 0, rSTR2
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
#else
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
#endif
	cmplw	cr7, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srw	r12, rWORD8, rSHR
	slw	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz+	L(duLoop)

L(duL4):
#if 0
/* Huh?  We've already branched on cr1!  */
	bne	cr1, L(duLcr1)
#endif
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD2, 0, rSTR2
	addi	rSTR2, rSTR2, 4
#else
	lwz	rWORD2, 4(rSTR2)
#endif
	srw	r0, rWORD2, rSHR
	.align	4
L(dutrim):
#ifdef __LITTLE_ENDIAN__
	lwbrx	rWORD1, 0, rSTR1
#else
	lwz	rWORD1, 4(rSTR1)
#endif
	lwz	rWORD8, 48(r1)
	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	lwz	rWORD7, 44(r1)
	lwz	rSHL, 40(r1)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	lwz	rSHR, 36(r1)
	lwz	rWORD8_SHIFT, 32(r1)
	sub	rRTN, rWORD1, rWORD2
	b	L(dureturn26)
	.align	4
L(duLcr7):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	lwz	rWORD8, 48(r1)
	lwz	rWORD7, 44(r1)
L(dureturn29):
	lwz	rSHL, 40(r1)
	lwz	rSHR, 36(r1)
L(dureturn27):
	lwz	rWORD8_SHIFT, 32(r1)
L(dureturn26):
	lwz	rWORD2_SHIFT, 28(r1)
L(dureturn25):
	lwz	rWORD4_SHIFT, 24(r1)
	lwz	rWORD6_SHIFT, 20(r1)
	addi	1, 1, 64
	cfi_adjust_cfa_offset(-64)
	blr
END (memcmp)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)