]>
Commit | Line | Data |
---|---|---|
fe6e95d7 | 1 | /* Optimized memcmp implementation for PowerPC64. |
b168057a | 2 | Copyright (C) 2003-2015 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 | 20 | |
fe6e95d7 AM |
21 | /* int [r3] memcmp (const char *s1 [r3], |
22 | const char *s2 [r4], | |
23 | size_t size [r5]) */ | |
04067002 | 24 | |
a88f47a7 | 25 | .machine power4 |
2d67d91a | 26 | EALIGN (memcmp, 4, 0) |
04067002 UD |
27 | CALL_MCOUNT 3 |
28 | ||
04067002 UD |
29 | #define rRTN r3 |
30 | #define rSTR1 r3 /* first string arg */ | |
31 | #define rSTR2 r4 /* second string arg */ | |
32 | #define rN r5 /* max string length */ | |
04067002 UD |
33 | #define rWORD1 r6 /* current word in s1 */ |
34 | #define rWORD2 r7 /* current word in s2 */ | |
35 | #define rWORD3 r8 /* next word in s1 */ | |
36 | #define rWORD4 r9 /* next word in s2 */ | |
37 | #define rWORD5 r10 /* next word in s1 */ | |
38 | #define rWORD6 r11 /* next word in s2 */ | |
04067002 UD |
39 | #define rWORD7 r30 /* next word in s1 */ |
40 | #define rWORD8 r31 /* next word in s2 */ | |
41 | ||
fe6e95d7 | 42 | xor r0, rSTR2, rSTR1 |
04067002 UD |
43 | cmpldi cr6, rN, 0 |
44 | cmpldi cr1, rN, 12 | |
fe6e95d7 AM |
45 | clrldi. r0, r0, 61 |
46 | clrldi r12, rSTR1, 61 | |
47 | cmpldi cr5, r12, 0 | |
04067002 | 48 | beq- cr6, L(zeroLength) |
fe6e95d7 AM |
49 | dcbt 0, rSTR1 |
50 | dcbt 0, rSTR2 | |
2ccdea26 | 51 | /* If less than 8 bytes or not aligned, use the unaligned |
04067002 UD |
52 | byte loop. */ |
53 | blt cr1, L(bytealigned) | |
fe6e95d7 AM |
54 | std rWORD8, -8(r1) |
55 | cfi_offset(rWORD8, -8) | |
56 | std rWORD7, -16(r1) | |
57 | cfi_offset(rWORD7, -16) | |
04067002 UD |
58 | bne L(unaligned) |
59 | /* At this point we know both strings have the same alignment and the | |
fe6e95d7 | 60 | compare length is at least 8 bytes. r12 contains the low order |
04067002 | 61 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 AM |
62 | of r12 to 0. If r12 == 0 then we are already double word |
63 | aligned and can perform the DW aligned loop. | |
9c84384c | 64 | |
04067002 | 65 | Otherwise we know the two strings have the same alignment (but not |
fe6e95d7 AM |
66 | yet DW). So we force the string addresses to the next lower DW |
67 | boundary and special case this first DW using shift left to | |
2ccdea26 | 68 | eliminate bits preceding the first byte. Since we want to join the |
fe6e95d7 | 69 | normal (DW aligned) compare loop, starting at the second double word, |
04067002 | 70 | we need to adjust the length (rN) and special case the loop |
fe6e95d7 AM |
71 | versioning for the first DW. This ensures that the loop count is |
72 | correct and the first DW (shifted) is in the expected register pair. */ | |
73 | .align 4 | |
04067002 UD |
74 | L(samealignment): |
75 | clrrdi rSTR1, rSTR1, 3 | |
76 | clrrdi rSTR2, rSTR2, 3 | |
77 | beq cr5, L(DWaligned) | |
fe6e95d7 AM |
78 | add rN, rN, r12 |
79 | sldi rWORD6, r12, 3 | |
80 | srdi r0, rN, 5 /* Divide by 32 */ | |
81 | andi. r12, rN, 24 /* Get the DW remainder */ | |
82 | #ifdef __LITTLE_ENDIAN__ | |
83 | ldbrx rWORD1, 0, rSTR1 | |
84 | ldbrx rWORD2, 0, rSTR2 | |
85 | addi rSTR1, rSTR1, 8 | |
86 | addi rSTR2, rSTR2, 8 | |
87 | #else | |
04067002 UD |
88 | ld rWORD1, 0(rSTR1) |
89 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
90 | #endif |
91 | cmpldi cr1, r12, 16 | |
04067002 UD |
92 | cmpldi cr7, rN, 32 |
93 | clrldi rN, rN, 61 | |
94 | beq L(dPs4) | |
fe6e95d7 | 95 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 UD |
96 | bgt cr1, L(dPs3) |
97 | beq cr1, L(dPs2) | |
98 | ||
99 | /* Remainder is 8 */ | |
fe6e95d7 | 100 | .align 3 |
04067002 | 101 | L(dsP1): |
fe6e95d7 AM |
102 | sld rWORD5, rWORD1, rWORD6 |
103 | sld rWORD6, rWORD2, rWORD6 | |
04067002 UD |
104 | cmpld cr5, rWORD5, rWORD6 |
105 | blt cr7, L(dP1x) | |
106 | /* Do something useful in this cycle since we have to branch anyway. */ | |
fe6e95d7 AM |
107 | #ifdef __LITTLE_ENDIAN__ |
108 | ldbrx rWORD1, 0, rSTR1 | |
109 | ldbrx rWORD2, 0, rSTR2 | |
110 | addi rSTR1, rSTR1, 8 | |
111 | addi rSTR2, rSTR2, 8 | |
112 | #else | |
04067002 UD |
113 | ld rWORD1, 8(rSTR1) |
114 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 AM |
115 | #endif |
116 | cmpld cr7, rWORD1, rWORD2 | |
04067002 UD |
117 | b L(dP1e) |
118 | /* Remainder is 16 */ | |
fe6e95d7 | 119 | .align 4 |
04067002 | 120 | L(dPs2): |
fe6e95d7 AM |
121 | sld rWORD5, rWORD1, rWORD6 |
122 | sld rWORD6, rWORD2, rWORD6 | |
04067002 UD |
123 | cmpld cr6, rWORD5, rWORD6 |
124 | blt cr7, L(dP2x) | |
125 | /* Do something useful in this cycle since we have to branch anyway. */ | |
fe6e95d7 AM |
126 | #ifdef __LITTLE_ENDIAN__ |
127 | ldbrx rWORD7, 0, rSTR1 | |
128 | ldbrx rWORD8, 0, rSTR2 | |
129 | addi rSTR1, rSTR1, 8 | |
130 | addi rSTR2, rSTR2, 8 | |
131 | #else | |
04067002 UD |
132 | ld rWORD7, 8(rSTR1) |
133 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 134 | #endif |
04067002 UD |
135 | cmpld cr5, rWORD7, rWORD8 |
136 | b L(dP2e) | |
137 | /* Remainder is 24 */ | |
fe6e95d7 | 138 | .align 4 |
04067002 | 139 | L(dPs3): |
fe6e95d7 AM |
140 | sld rWORD3, rWORD1, rWORD6 |
141 | sld rWORD4, rWORD2, rWORD6 | |
04067002 UD |
142 | cmpld cr1, rWORD3, rWORD4 |
143 | b L(dP3e) | |
144 | /* Count is a multiple of 32, remainder is 0 */ | |
fe6e95d7 | 145 | .align 4 |
04067002 | 146 | L(dPs4): |
fe6e95d7 AM |
147 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
148 | sld rWORD1, rWORD1, rWORD6 | |
149 | sld rWORD2, rWORD2, rWORD6 | |
150 | cmpld cr7, rWORD1, rWORD2 | |
04067002 UD |
151 | b L(dP4e) |
152 | ||
153 | /* At this point we know both strings are double word aligned and the | |
154 | compare length is at least 8 bytes. */ | |
fe6e95d7 | 155 | .align 4 |
04067002 | 156 | L(DWaligned): |
fe6e95d7 AM |
157 | andi. r12, rN, 24 /* Get the DW remainder */ |
158 | srdi r0, rN, 5 /* Divide by 32 */ | |
159 | cmpldi cr1, r12, 16 | |
04067002 UD |
160 | cmpldi cr7, rN, 32 |
161 | clrldi rN, rN, 61 | |
162 | beq L(dP4) | |
163 | bgt cr1, L(dP3) | |
164 | beq cr1, L(dP2) | |
9c84384c | 165 | |
04067002 | 166 | /* Remainder is 8 */ |
fe6e95d7 | 167 | .align 4 |
04067002 | 168 | L(dP1): |
fe6e95d7 | 169 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 | 170 | /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early |
2ccdea26 AB |
171 | (8-15 byte compare), we want to use only volatile registers. This |
172 | means we can avoid restoring non-volatile registers since we did not | |
04067002 | 173 | change any on the early exit path. The key here is the non-early |
9c84384c | 174 | exit path only cares about the condition code (cr5), not about which |
04067002 | 175 | register pair was used. */ |
fe6e95d7 AM |
176 | #ifdef __LITTLE_ENDIAN__ |
177 | ldbrx rWORD5, 0, rSTR1 | |
178 | ldbrx rWORD6, 0, rSTR2 | |
179 | addi rSTR1, rSTR1, 8 | |
180 | addi rSTR2, rSTR2, 8 | |
181 | #else | |
04067002 UD |
182 | ld rWORD5, 0(rSTR1) |
183 | ld rWORD6, 0(rSTR2) | |
fe6e95d7 | 184 | #endif |
04067002 UD |
185 | cmpld cr5, rWORD5, rWORD6 |
186 | blt cr7, L(dP1x) | |
fe6e95d7 AM |
187 | #ifdef __LITTLE_ENDIAN__ |
188 | ldbrx rWORD1, 0, rSTR1 | |
189 | ldbrx rWORD2, 0, rSTR2 | |
190 | addi rSTR1, rSTR1, 8 | |
191 | addi rSTR2, rSTR2, 8 | |
192 | #else | |
04067002 UD |
193 | ld rWORD1, 8(rSTR1) |
194 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 AM |
195 | #endif |
196 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 197 | L(dP1e): |
fe6e95d7 AM |
198 | #ifdef __LITTLE_ENDIAN__ |
199 | ldbrx rWORD3, 0, rSTR1 | |
200 | ldbrx rWORD4, 0, rSTR2 | |
201 | addi rSTR1, rSTR1, 8 | |
202 | addi rSTR2, rSTR2, 8 | |
203 | #else | |
04067002 UD |
204 | ld rWORD3, 16(rSTR1) |
205 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 206 | #endif |
04067002 | 207 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
208 | #ifdef __LITTLE_ENDIAN__ |
209 | ldbrx rWORD5, 0, rSTR1 | |
210 | ldbrx rWORD6, 0, rSTR2 | |
211 | addi rSTR1, rSTR1, 8 | |
212 | addi rSTR2, rSTR2, 8 | |
213 | #else | |
04067002 UD |
214 | ld rWORD5, 24(rSTR1) |
215 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 216 | #endif |
04067002 | 217 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
218 | bne cr5, L(dLcr5x) |
219 | bne cr7, L(dLcr7x) | |
9c84384c | 220 | |
fe6e95d7 AM |
221 | #ifdef __LITTLE_ENDIAN__ |
222 | ldbrx rWORD7, 0, rSTR1 | |
223 | ldbrx rWORD8, 0, rSTR2 | |
224 | addi rSTR1, rSTR1, 8 | |
225 | addi rSTR2, rSTR2, 8 | |
226 | #else | |
04067002 UD |
227 | ldu rWORD7, 32(rSTR1) |
228 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 | 229 | #endif |
04067002 UD |
230 | bne cr1, L(dLcr1) |
231 | cmpld cr5, rWORD7, rWORD8 | |
232 | bdnz L(dLoop) | |
233 | bne cr6, L(dLcr6) | |
fe6e95d7 AM |
234 | ld rWORD8, -8(r1) |
235 | ld rWORD7, -16(r1) | |
236 | .align 3 | |
04067002 UD |
237 | L(dP1x): |
238 | sldi. r12, rN, 3 | |
fe6e95d7 | 239 | bne cr5, L(dLcr5x) |
04067002 UD |
240 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
241 | bne L(d00) | |
242 | li rRTN, 0 | |
243 | blr | |
9c84384c | 244 | |
04067002 | 245 | /* Remainder is 16 */ |
fe6e95d7 | 246 | .align 4 |
04067002 | 247 | L(dP2): |
fe6e95d7 AM |
248 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
249 | #ifdef __LITTLE_ENDIAN__ | |
250 | ldbrx rWORD5, 0, rSTR1 | |
251 | ldbrx rWORD6, 0, rSTR2 | |
252 | addi rSTR1, rSTR1, 8 | |
253 | addi rSTR2, rSTR2, 8 | |
254 | #else | |
04067002 UD |
255 | ld rWORD5, 0(rSTR1) |
256 | ld rWORD6, 0(rSTR2) | |
fe6e95d7 | 257 | #endif |
04067002 UD |
258 | cmpld cr6, rWORD5, rWORD6 |
259 | blt cr7, L(dP2x) | |
fe6e95d7 AM |
260 | #ifdef __LITTLE_ENDIAN__ |
261 | ldbrx rWORD7, 0, rSTR1 | |
262 | ldbrx rWORD8, 0, rSTR2 | |
263 | addi rSTR1, rSTR1, 8 | |
264 | addi rSTR2, rSTR2, 8 | |
265 | #else | |
04067002 UD |
266 | ld rWORD7, 8(rSTR1) |
267 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 268 | #endif |
04067002 UD |
269 | cmpld cr5, rWORD7, rWORD8 |
270 | L(dP2e): | |
fe6e95d7 AM |
271 | #ifdef __LITTLE_ENDIAN__ |
272 | ldbrx rWORD1, 0, rSTR1 | |
273 | ldbrx rWORD2, 0, rSTR2 | |
274 | addi rSTR1, rSTR1, 8 | |
275 | addi rSTR2, rSTR2, 8 | |
276 | #else | |
04067002 UD |
277 | ld rWORD1, 16(rSTR1) |
278 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 AM |
279 | #endif |
280 | cmpld cr7, rWORD1, rWORD2 | |
281 | #ifdef __LITTLE_ENDIAN__ | |
282 | ldbrx rWORD3, 0, rSTR1 | |
283 | ldbrx rWORD4, 0, rSTR2 | |
284 | addi rSTR1, rSTR1, 8 | |
285 | addi rSTR2, rSTR2, 8 | |
286 | #else | |
04067002 UD |
287 | ld rWORD3, 24(rSTR1) |
288 | ld rWORD4, 24(rSTR2) | |
fe6e95d7 | 289 | #endif |
04067002 | 290 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 | 291 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
292 | addi rSTR1, rSTR1, 8 |
293 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 294 | #endif |
04067002 UD |
295 | bne cr6, L(dLcr6) |
296 | bne cr5, L(dLcr5) | |
297 | b L(dLoop2) | |
298 | /* Again we are on an early exit path (16-23 byte compare), we want to |
2ccdea26 | 299 | only use volatile registers and avoid restoring non-volatile |
04067002 | 300 | registers. */ |
fe6e95d7 | 301 | .align 4 |
04067002 | 302 | L(dP2x): |
fe6e95d7 AM |
303 | #ifdef __LITTLE_ENDIAN__ |
304 | ldbrx rWORD3, 0, rSTR1 | |
305 | ldbrx rWORD4, 0, rSTR2 | |
306 | addi rSTR1, rSTR1, 8 | |
307 | addi rSTR2, rSTR2, 8 | |
308 | #else | |
04067002 UD |
309 | ld rWORD3, 8(rSTR1) |
310 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 AM |
311 | #endif |
312 | cmpld cr1, rWORD3, rWORD4 | |
04067002 | 313 | sldi. r12, rN, 3 |
fe6e95d7 AM |
314 | bne cr6, L(dLcr6x) |
315 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
316 | addi rSTR1, rSTR1, 8 |
317 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 AM |
318 | #endif |
319 | bne cr1, L(dLcr1x) | |
04067002 UD |
320 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
321 | bne L(d00) | |
322 | li rRTN, 0 | |
323 | blr | |
9c84384c | 324 | |
04067002 | 325 | /* Remainder is 24 */ |
fe6e95d7 | 326 | .align 4 |
04067002 | 327 | L(dP3): |
fe6e95d7 AM |
328 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
329 | #ifdef __LITTLE_ENDIAN__ | |
330 | ldbrx rWORD3, 0, rSTR1 | |
331 | ldbrx rWORD4, 0, rSTR2 | |
332 | addi rSTR1, rSTR1, 8 | |
333 | addi rSTR2, rSTR2, 8 | |
334 | #else | |
04067002 UD |
335 | ld rWORD3, 0(rSTR1) |
336 | ld rWORD4, 0(rSTR2) | |
fe6e95d7 | 337 | #endif |
04067002 UD |
338 | cmpld cr1, rWORD3, rWORD4 |
339 | L(dP3e): | |
fe6e95d7 AM |
340 | #ifdef __LITTLE_ENDIAN__ |
341 | ldbrx rWORD5, 0, rSTR1 | |
342 | ldbrx rWORD6, 0, rSTR2 | |
343 | addi rSTR1, rSTR1, 8 | |
344 | addi rSTR2, rSTR2, 8 | |
345 | #else | |
04067002 UD |
346 | ld rWORD5, 8(rSTR1) |
347 | ld rWORD6, 8(rSTR2) | |
fe6e95d7 | 348 | #endif |
04067002 UD |
349 | cmpld cr6, rWORD5, rWORD6 |
350 | blt cr7, L(dP3x) | |
fe6e95d7 AM |
351 | #ifdef __LITTLE_ENDIAN__ |
352 | ldbrx rWORD7, 0, rSTR1 | |
353 | ldbrx rWORD8, 0, rSTR2 | |
354 | addi rSTR1, rSTR1, 8 | |
355 | addi rSTR2, rSTR2, 8 | |
356 | #else | |
04067002 UD |
357 | ld rWORD7, 16(rSTR1) |
358 | ld rWORD8, 16(rSTR2) | |
fe6e95d7 | 359 | #endif |
04067002 | 360 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
361 | #ifdef __LITTLE_ENDIAN__ |
362 | ldbrx rWORD1, 0, rSTR1 | |
363 | ldbrx rWORD2, 0, rSTR2 | |
364 | addi rSTR1, rSTR1, 8 | |
365 | addi rSTR2, rSTR2, 8 | |
366 | #else | |
04067002 UD |
367 | ld rWORD1, 24(rSTR1) |
368 | ld rWORD2, 24(rSTR2) | |
fe6e95d7 AM |
369 | #endif |
370 | cmpld cr7, rWORD1, rWORD2 | |
371 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
372 | addi rSTR1, rSTR1, 16 |
373 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 | 374 | #endif |
04067002 UD |
375 | bne cr1, L(dLcr1) |
376 | bne cr6, L(dLcr6) | |
377 | b L(dLoop1) | |
378 | /* Again we are on an early exit path (24-31 byte compare), we want to |
2ccdea26 | 379 | only use volatile registers and avoid restoring non-volatile |
04067002 | 380 | registers. */ |
fe6e95d7 | 381 | .align 4 |
04067002 | 382 | L(dP3x): |
fe6e95d7 AM |
383 | #ifdef __LITTLE_ENDIAN__ |
384 | ldbrx rWORD1, 0, rSTR1 | |
385 | ldbrx rWORD2, 0, rSTR2 | |
386 | addi rSTR1, rSTR1, 8 | |
387 | addi rSTR2, rSTR2, 8 | |
388 | #else | |
04067002 UD |
389 | ld rWORD1, 16(rSTR1) |
390 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 AM |
391 | #endif |
392 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 393 | sldi. r12, rN, 3 |
fe6e95d7 AM |
394 | bne cr1, L(dLcr1x) |
395 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
396 | addi rSTR1, rSTR1, 16 |
397 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
398 | #endif |
399 | bne cr6, L(dLcr6x) | |
04067002 | 400 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
fe6e95d7 | 401 | bne cr7, L(dLcr7x) |
04067002 UD |
402 | bne L(d00) |
403 | li rRTN, 0 | |
404 | blr | |
9c84384c | 405 | |
04067002 | 406 | /* Count is a multiple of 32, remainder is 0 */ |
fe6e95d7 | 407 | .align 4 |
04067002 | 408 | L(dP4): |
fe6e95d7 AM |
409 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
410 | #ifdef __LITTLE_ENDIAN__ | |
411 | ldbrx rWORD1, 0, rSTR1 | |
412 | ldbrx rWORD2, 0, rSTR2 | |
413 | addi rSTR1, rSTR1, 8 | |
414 | addi rSTR2, rSTR2, 8 | |
415 | #else | |
04067002 UD |
416 | ld rWORD1, 0(rSTR1) |
417 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
418 | #endif |
419 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 420 | L(dP4e): |
fe6e95d7 AM |
421 | #ifdef __LITTLE_ENDIAN__ |
422 | ldbrx rWORD3, 0, rSTR1 | |
423 | ldbrx rWORD4, 0, rSTR2 | |
424 | addi rSTR1, rSTR1, 8 | |
425 | addi rSTR2, rSTR2, 8 | |
426 | #else | |
04067002 UD |
427 | ld rWORD3, 8(rSTR1) |
428 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 | 429 | #endif |
04067002 | 430 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
431 | #ifdef __LITTLE_ENDIAN__ |
432 | ldbrx rWORD5, 0, rSTR1 | |
433 | ldbrx rWORD6, 0, rSTR2 | |
434 | addi rSTR1, rSTR1, 8 | |
435 | addi rSTR2, rSTR2, 8 | |
436 | #else | |
04067002 UD |
437 | ld rWORD5, 16(rSTR1) |
438 | ld rWORD6, 16(rSTR2) | |
fe6e95d7 | 439 | #endif |
04067002 | 440 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
441 | #ifdef __LITTLE_ENDIAN__ |
442 | ldbrx rWORD7, 0, rSTR1 | |
443 | ldbrx rWORD8, 0, rSTR2 | |
444 | addi rSTR1, rSTR1, 8 | |
445 | addi rSTR2, rSTR2, 8 | |
446 | #else | |
04067002 UD |
447 | ldu rWORD7, 24(rSTR1) |
448 | ldu rWORD8, 24(rSTR2) | |
fe6e95d7 | 449 | #endif |
04067002 | 450 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 | 451 | bne cr7, L(dLcr7) |
04067002 UD |
452 | bne cr1, L(dLcr1) |
453 | bdz- L(d24) /* Adjust CTR as we start with +4 */ | |
454 | /* This is the primary loop */ | |
fe6e95d7 | 455 | .align 4 |
04067002 | 456 | L(dLoop): |
fe6e95d7 AM |
457 | #ifdef __LITTLE_ENDIAN__ |
458 | ldbrx rWORD1, 0, rSTR1 | |
459 | ldbrx rWORD2, 0, rSTR2 | |
460 | addi rSTR1, rSTR1, 8 | |
461 | addi rSTR2, rSTR2, 8 | |
462 | #else | |
04067002 UD |
463 | ld rWORD1, 8(rSTR1) |
464 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 465 | #endif |
04067002 UD |
466 | cmpld cr1, rWORD3, rWORD4 |
467 | bne cr6, L(dLcr6) | |
468 | L(dLoop1): | |
fe6e95d7 AM |
469 | #ifdef __LITTLE_ENDIAN__ |
470 | ldbrx rWORD3, 0, rSTR1 | |
471 | ldbrx rWORD4, 0, rSTR2 | |
472 | addi rSTR1, rSTR1, 8 | |
473 | addi rSTR2, rSTR2, 8 | |
474 | #else | |
04067002 UD |
475 | ld rWORD3, 16(rSTR1) |
476 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 477 | #endif |
04067002 UD |
478 | cmpld cr6, rWORD5, rWORD6 |
479 | bne cr5, L(dLcr5) | |
480 | L(dLoop2): | |
fe6e95d7 AM |
481 | #ifdef __LITTLE_ENDIAN__ |
482 | ldbrx rWORD5, 0, rSTR1 | |
483 | ldbrx rWORD6, 0, rSTR2 | |
484 | addi rSTR1, rSTR1, 8 | |
485 | addi rSTR2, rSTR2, 8 | |
486 | #else | |
04067002 UD |
487 | ld rWORD5, 24(rSTR1) |
488 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 489 | #endif |
04067002 | 490 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 | 491 | bne cr7, L(dLcr7) |
04067002 | 492 | L(dLoop3): |
fe6e95d7 AM |
493 | #ifdef __LITTLE_ENDIAN__ |
494 | ldbrx rWORD7, 0, rSTR1 | |
495 | ldbrx rWORD8, 0, rSTR2 | |
496 | addi rSTR1, rSTR1, 8 | |
497 | addi rSTR2, rSTR2, 8 | |
498 | #else | |
04067002 UD |
499 | ldu rWORD7, 32(rSTR1) |
500 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 | 501 | #endif |
04067002 | 502 | bne- cr1, L(dLcr1) |
fe6e95d7 | 503 | cmpld cr7, rWORD1, rWORD2 |
9c84384c JM |
504 | bdnz+ L(dLoop) |
505 | ||
04067002 UD |
506 | L(dL4): |
507 | cmpld cr1, rWORD3, rWORD4 | |
508 | bne cr6, L(dLcr6) | |
509 | cmpld cr6, rWORD5, rWORD6 | |
510 | bne cr5, L(dLcr5) | |
511 | cmpld cr5, rWORD7, rWORD8 | |
512 | L(d44): | |
fe6e95d7 | 513 | bne cr7, L(dLcr7) |
04067002 UD |
514 | L(d34): |
515 | bne cr1, L(dLcr1) | |
516 | L(d24): | |
517 | bne cr6, L(dLcr6) | |
518 | L(d14): | |
519 | sldi. r12, rN, 3 | |
9c84384c | 520 | bne cr5, L(dLcr5) |
04067002 | 521 | L(d04): |
fe6e95d7 AM |
522 | ld rWORD8, -8(r1) |
523 | ld rWORD7, -16(r1) | |
04067002 UD |
524 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
525 | beq L(zeroLength) | |
526 | /* At this point we have a remainder of 1 to 7 bytes to compare. Since | |
527 | we are aligned it is safe to load the whole double word, and use | |
2ccdea26 | 528 | shift right double to eliminate bits beyond the compare length. */ |
04067002 | 529 | L(d00): |
fe6e95d7 AM |
530 | #ifdef __LITTLE_ENDIAN__ |
531 | ldbrx rWORD1, 0, rSTR1 | |
532 | ldbrx rWORD2, 0, rSTR2 | |
533 | addi rSTR1, rSTR1, 8 | |
534 | addi rSTR2, rSTR2, 8 | |
535 | #else | |
04067002 | 536 | ld rWORD1, 8(rSTR1) |
9c84384c | 537 | ld rWORD2, 8(rSTR2) |
fe6e95d7 | 538 | #endif |
04067002 UD |
539 | srd rWORD1, rWORD1, rN |
540 | srd rWORD2, rWORD2, rN | |
fe6e95d7 AM |
541 | cmpld cr7, rWORD1, rWORD2 |
542 | bne cr7, L(dLcr7x) | |
04067002 UD |
543 | li rRTN, 0 |
544 | blr | |
fe6e95d7 AM |
545 | |
546 | .align 4 | |
547 | L(dLcr7): | |
548 | ld rWORD8, -8(r1) | |
549 | ld rWORD7, -16(r1) | |
550 | L(dLcr7x): | |
04067002 | 551 | li rRTN, 1 |
fe6e95d7 | 552 | bgtlr cr7 |
04067002 UD |
553 | li rRTN, -1 |
554 | blr | |
fe6e95d7 | 555 | .align 4 |
04067002 | 556 | L(dLcr1): |
fe6e95d7 AM |
557 | ld rWORD8, -8(r1) |
558 | ld rWORD7, -16(r1) | |
559 | L(dLcr1x): | |
04067002 UD |
560 | li rRTN, 1 |
561 | bgtlr cr1 | |
562 | li rRTN, -1 | |
563 | blr | |
fe6e95d7 | 564 | .align 4 |
04067002 | 565 | L(dLcr6): |
fe6e95d7 AM |
566 | ld rWORD8, -8(r1) |
567 | ld rWORD7, -16(r1) | |
568 | L(dLcr6x): | |
04067002 UD |
569 | li rRTN, 1 |
570 | bgtlr cr6 | |
571 | li rRTN, -1 | |
572 | blr | |
fe6e95d7 | 573 | .align 4 |
04067002 | 574 | L(dLcr5): |
fe6e95d7 AM |
575 | ld rWORD8, -8(r1) |
576 | ld rWORD7, -16(r1) | |
04067002 UD |
577 | L(dLcr5x): |
578 | li rRTN, 1 | |
579 | bgtlr cr5 | |
580 | li rRTN, -1 | |
581 | blr | |
9c84384c | 582 | |
fe6e95d7 | 583 | .align 4 |
04067002 | 584 | L(bytealigned): |
fe6e95d7 AM |
585 | mtctr rN /* Power4 wants mtctr 1st in dispatch group */ |
586 | #if 0 | |
587 | /* Huh? We've already branched on cr6! */ | |
04067002 | 588 | beq- cr6, L(zeroLength) |
fe6e95d7 | 589 | #endif |
04067002 UD |
590 | |
591 | /* We need to prime this loop. This loop is swing modulo scheduled | |
9c84384c | 592 | to avoid pipe delays. The dependent instruction latencies (load to |
04067002 UD |
593 | compare to conditional branch) is 2 to 3 cycles. In this loop each |
594 | dispatch group ends in a branch and takes 1 cycle. Effectively | |
9c84384c JM |
595 | the first iteration of the loop only serves to load operands and |
596 | branches based on compares are delayed until the next loop. | |
04067002 UD |
597 | |
598 | So we must precondition some registers and condition codes so that | |
599 | we don't exit the loop early on the first iteration. */ | |
9c84384c | 600 | |
04067002 UD |
601 | lbz rWORD1, 0(rSTR1) |
602 | lbz rWORD2, 0(rSTR2) | |
603 | bdz- L(b11) | |
fe6e95d7 | 604 | cmpld cr7, rWORD1, rWORD2 |
04067002 UD |
605 | lbz rWORD3, 1(rSTR1) |
606 | lbz rWORD4, 1(rSTR2) | |
607 | bdz- L(b12) | |
608 | cmpld cr1, rWORD3, rWORD4 | |
609 | lbzu rWORD5, 2(rSTR1) | |
610 | lbzu rWORD6, 2(rSTR2) | |
611 | bdz- L(b13) | |
fe6e95d7 | 612 | .align 4 |
04067002 UD |
613 | L(bLoop): |
614 | lbzu rWORD1, 1(rSTR1) | |
615 | lbzu rWORD2, 1(rSTR2) | |
fe6e95d7 | 616 | bne- cr7, L(bLcr7) |
04067002 UD |
617 | |
618 | cmpld cr6, rWORD5, rWORD6 | |
619 | bdz- L(b3i) | |
9c84384c | 620 | |
04067002 UD |
621 | lbzu rWORD3, 1(rSTR1) |
622 | lbzu rWORD4, 1(rSTR2) | |
623 | bne- cr1, L(bLcr1) | |
624 | ||
fe6e95d7 | 625 | cmpld cr7, rWORD1, rWORD2 |
04067002 UD |
626 | bdz- L(b2i) |
627 | ||
628 | lbzu rWORD5, 1(rSTR1) | |
629 | lbzu rWORD6, 1(rSTR2) | |
630 | bne- cr6, L(bLcr6) | |
631 | ||
632 | cmpld cr1, rWORD3, rWORD4 | |
633 | bdnz+ L(bLoop) | |
9c84384c | 634 | |
04067002 UD |
635 | /* We speculatively load bytes before we have tested the previous
636 | bytes. But we must avoid overrunning the length (in the ctr) to | |
9c84384c | 637 | prevent these speculative loads from causing a segfault. In this |
04067002 UD |
638 | case the loop will exit early (before all the pending bytes are
639 | tested). In this case we must complete the pending operations |
640 | before returning. */ | |
641 | L(b1i): | |
fe6e95d7 | 642 | bne- cr7, L(bLcr7) |
04067002 UD |
643 | bne- cr1, L(bLcr1) |
644 | b L(bx56) | |
fe6e95d7 | 645 | .align 4 |
04067002 UD |
646 | L(b2i): |
647 | bne- cr6, L(bLcr6) | |
fe6e95d7 | 648 | bne- cr7, L(bLcr7) |
04067002 | 649 | b L(bx34) |
fe6e95d7 | 650 | .align 4 |
04067002 UD |
651 | L(b3i): |
652 | bne- cr1, L(bLcr1) | |
653 | bne- cr6, L(bLcr6) | |
654 | b L(bx12) | |
fe6e95d7 AM |
655 | .align 4 |
656 | L(bLcr7): | |
04067002 | 657 | li rRTN, 1 |
fe6e95d7 | 658 | bgtlr cr7 |
04067002 UD |
659 | li rRTN, -1 |
660 | blr | |
661 | L(bLcr1): | |
662 | li rRTN, 1 | |
663 | bgtlr cr1 | |
664 | li rRTN, -1 | |
665 | blr | |
666 | L(bLcr6): | |
667 | li rRTN, 1 | |
668 | bgtlr cr6 | |
669 | li rRTN, -1 | |
670 | blr | |
671 | ||
672 | L(b13): | |
fe6e95d7 | 673 | bne- cr7, L(bx12) |
04067002 UD |
674 | bne- cr1, L(bx34) |
675 | L(bx56): | |
676 | sub rRTN, rWORD5, rWORD6 | |
677 | blr | |
678 | nop | |
679 | L(b12): | |
fe6e95d7 | 680 | bne- cr7, L(bx12) |
9c84384c | 681 | L(bx34): |
04067002 UD |
682 | sub rRTN, rWORD3, rWORD4 |
683 | blr | |
684 | L(b11): | |
685 | L(bx12): | |
686 | sub rRTN, rWORD1, rWORD2 | |
687 | blr | |
fe6e95d7 | 688 | .align 4 |
04067002 UD |
689 | L(zeroLength): |
690 | li rRTN, 0 | |
691 | blr | |
692 | ||
fe6e95d7 | 693 | .align 4 |
04067002 | 694 | /* At this point we know the strings have different alignment and the |
fe6e95d7 | 695 | compare length is at least 8 bytes. r12 contains the low order |
04067002 | 696 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 | 697 | of r12 to 0. If r12 == 0 then rStr1 is double word |
04067002 | 698 | aligned and can perform the DWunaligned loop. |
9c84384c | 699 | |
2ccdea26 | 700 | Otherwise we know that rSTR1 is not already DW aligned yet. |
04067002 | 701 | So we can force the string addresses to the next lower DW |
fe6e95d7 | 702 | boundary and special case this first DW using shift left to |
2ccdea26 | 703 | eliminate bits preceding the first byte. Since we want to join the |
04067002 UD |
704 | normal (DWaligned) compare loop, starting at the second double word, |
705 | we need to adjust the length (rN) and special case the loop | |
fe6e95d7 | 706 | versioning for the first DW. This ensures that the loop count is |
04067002 | 707 | correct and the first DW (shifted) is in the expected register pair. */
fe6e95d7 AM |
708 | #define rSHL r29 /* Unaligned shift left count. */ |
709 | #define rSHR r28 /* Unaligned shift right count. */ | |
710 | #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ | |
711 | #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ | |
712 | #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ | |
713 | #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ | |
04067002 | 714 | L(unaligned): |
fe6e95d7 AM |
715 | std rSHL, -24(r1) |
716 | cfi_offset(rSHL, -24) | |
04067002 UD |
717 | clrldi rSHL, rSTR2, 61 |
718 | beq- cr6, L(duzeroLength) | |
fe6e95d7 AM |
719 | std rSHR, -32(r1) |
720 | cfi_offset(rSHR, -32) | |
04067002 | 721 | beq cr5, L(DWunaligned) |
fe6e95d7 AM |
722 | std rWORD8_SHIFT, -40(r1) |
723 | cfi_offset(rWORD8_SHIFT, -40) | |
724 | /* Adjust the logical start of rSTR2 to compensate for the extra bits | |
04067002 | 725 | in the 1st rSTR1 DW. */ |
fe6e95d7 | 726 | sub rWORD8_SHIFT, rSTR2, r12 |
04067002 UD |
727 | /* But do not attempt to address the DW before that DW that contains |
728 | the actual start of rSTR2. */ | |
729 | clrrdi rSTR2, rSTR2, 3 | |
fe6e95d7 AM |
730 | std rWORD2_SHIFT, -48(r1) |
731 | cfi_offset(rWORD2_SHIFT, -48) | |
732 | /* Compute the left/right shift counts for the unaligned rSTR2, | |
9c84384c | 733 | compensating for the logical (DW aligned) start of rSTR1. */ |
fe6e95d7 | 734 | clrldi rSHL, rWORD8_SHIFT, 61 |
9c84384c | 735 | clrrdi rSTR1, rSTR1, 3 |
fe6e95d7 AM |
736 | std rWORD4_SHIFT, -56(r1) |
737 | cfi_offset(rWORD4_SHIFT, -56) | |
04067002 | 738 | sldi rSHL, rSHL, 3 |
fe6e95d7 AM |
739 | cmpld cr5, rWORD8_SHIFT, rSTR2 |
740 | add rN, rN, r12 | |
741 | sldi rWORD6, r12, 3 | |
742 | std rWORD6_SHIFT, -64(r1) | |
743 | cfi_offset(rWORD6_SHIFT, -64) | |
04067002 | 744 | subfic rSHR, rSHL, 64 |
fe6e95d7 AM |
745 | srdi r0, rN, 5 /* Divide by 32 */ |
746 | andi. r12, rN, 24 /* Get the DW remainder */ | |
04067002 UD |
747 | /* We normally need to load 2 DWs to start the unaligned rSTR2, but in |
748 | this special case those bits may be discarded anyway. Also we | |
749 | must avoid loading a DW where none of the bits are part of rSTR2 as | |
750 | this may cross a page boundary and cause a page fault. */ | |
751 | li rWORD8, 0 | |
752 | blt cr5, L(dus0) | |
fe6e95d7 AM |
753 | #ifdef __LITTLE_ENDIAN__ |
754 | ldbrx rWORD8, 0, rSTR2 | |
755 | addi rSTR2, rSTR2, 8 | |
756 | #else | |
04067002 | 757 | ld rWORD8, 0(rSTR2) |
fe6e95d7 AM |
758 | addi rSTR2, rSTR2, 8 |
759 | #endif | |
04067002 UD |
760 | sld rWORD8, rWORD8, rSHL |
761 | ||
762 | L(dus0): | |
fe6e95d7 AM |
763 | #ifdef __LITTLE_ENDIAN__ |
764 | ldbrx rWORD1, 0, rSTR1 | |
765 | ldbrx rWORD2, 0, rSTR2 | |
766 | addi rSTR1, rSTR1, 8 | |
767 | addi rSTR2, rSTR2, 8 | |
768 | #else | |
04067002 UD |
769 | ld rWORD1, 0(rSTR1) |
770 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
771 | #endif |
772 | cmpldi cr1, r12, 16 | |
04067002 | 773 | cmpldi cr7, rN, 32 |
fe6e95d7 | 774 | srd r12, rWORD2, rSHR |
04067002 UD |
775 | clrldi rN, rN, 61 |
776 | beq L(duPs4) | |
fe6e95d7 AM |
777 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
778 | or rWORD8, r12, rWORD8 | |
04067002 UD |
779 | bgt cr1, L(duPs3) |
780 | beq cr1, L(duPs2) | |
781 | ||
782 | /* Remainder is 8 */ | |
fe6e95d7 | 783 | .align 4 |
04067002 | 784 | L(dusP1): |
fe6e95d7 AM |
785 | sld rWORD8_SHIFT, rWORD2, rSHL |
786 | sld rWORD7, rWORD1, rWORD6 | |
787 | sld rWORD8, rWORD8, rWORD6 | |
04067002 UD |
788 | bge cr7, L(duP1e) |
789 | /* At this point we exit early with the first double word compare | |
790 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
791 | how we handle the remaining bytes. */ | |
792 | cmpld cr5, rWORD7, rWORD8 | |
793 | sldi. rN, rN, 3 | |
794 | bne cr5, L(duLcr5) | |
795 | cmpld cr7, rN, rSHR | |
796 | beq L(duZeroReturn) | |
fe6e95d7 | 797 | li r0, 0 |
04067002 | 798 | ble cr7, L(dutrim) |
fe6e95d7 AM |
799 | #ifdef __LITTLE_ENDIAN__ |
800 | ldbrx rWORD2, 0, rSTR2 | |
801 | addi rSTR2, rSTR2, 8 | |
802 | #else | |
04067002 | 803 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
804 | #endif |
805 | srd r0, rWORD2, rSHR | |
04067002 UD |
806 | b L(dutrim) |
807 | /* Remainder is 16 */ | |
fe6e95d7 | 808 | .align 4 |
04067002 | 809 | L(duPs2): |
fe6e95d7 AM |
810 | sld rWORD6_SHIFT, rWORD2, rSHL |
811 | sld rWORD5, rWORD1, rWORD6 | |
812 | sld rWORD6, rWORD8, rWORD6 | |
04067002 UD |
813 | b L(duP2e) |
814 | /* Remainder is 24 */ | |
fe6e95d7 | 815 | .align 4 |
04067002 | 816 | L(duPs3): |
fe6e95d7 AM |
817 | sld rWORD4_SHIFT, rWORD2, rSHL |
818 | sld rWORD3, rWORD1, rWORD6 | |
819 | sld rWORD4, rWORD8, rWORD6 | |
04067002 UD |
820 | b L(duP3e) |
821 | /* Count is a multiple of 32, remainder is 0 */ | |
fe6e95d7 | 822 | .align 4 |
04067002 | 823 | L(duPs4): |
fe6e95d7 AM |
824 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
825 | or rWORD8, r12, rWORD8 | |
826 | sld rWORD2_SHIFT, rWORD2, rSHL | |
827 | sld rWORD1, rWORD1, rWORD6 | |
828 | sld rWORD2, rWORD8, rWORD6 | |
04067002 UD |
829 | b L(duP4e) |
830 | ||
831 | /* At this point we know rSTR1 is double word aligned and the | |
832 | compare length is at least 8 bytes. */ | |
fe6e95d7 | 833 | .align 4 |
04067002 | 834 | L(DWunaligned): |
fe6e95d7 AM |
835 | std rWORD8_SHIFT, -40(r1) |
836 | cfi_offset(rWORD8_SHIFT, -40) | |
04067002 | 837 | clrrdi rSTR2, rSTR2, 3 |
fe6e95d7 AM |
838 | std rWORD2_SHIFT, -48(r1) |
839 | cfi_offset(rWORD2_SHIFT, -48) | |
840 | srdi r0, rN, 5 /* Divide by 32 */ | |
841 | std rWORD4_SHIFT, -56(r1) | |
842 | cfi_offset(rWORD4_SHIFT, -56) | |
843 | andi. r12, rN, 24 /* Get the DW remainder */ | |
844 | std rWORD6_SHIFT, -64(r1) | |
845 | cfi_offset(rWORD6_SHIFT, -64) | |
04067002 | 846 | sldi rSHL, rSHL, 3 |
fe6e95d7 AM |
847 | #ifdef __LITTLE_ENDIAN__ |
848 | ldbrx rWORD6, 0, rSTR2 | |
849 | addi rSTR2, rSTR2, 8 | |
850 | ldbrx rWORD8, 0, rSTR2 | |
851 | addi rSTR2, rSTR2, 8 | |
852 | #else | |
04067002 UD |
853 | ld rWORD6, 0(rSTR2) |
854 | ldu rWORD8, 8(rSTR2) | |
fe6e95d7 AM |
855 | #endif |
856 | cmpldi cr1, r12, 16 | |
04067002 UD |
857 | cmpldi cr7, rN, 32 |
858 | clrldi rN, rN, 61 | |
859 | subfic rSHR, rSHL, 64 | |
fe6e95d7 | 860 | sld rWORD6_SHIFT, rWORD6, rSHL |
04067002 | 861 | beq L(duP4) |
fe6e95d7 | 862 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 UD |
863 | bgt cr1, L(duP3) |
864 | beq cr1, L(duP2) | |
9c84384c | 865 | |
04067002 | 866 | /* Remainder is 8 */ |
fe6e95d7 | 867 | .align 4 |
04067002 | 868 | L(duP1): |
fe6e95d7 AM |
869 | srd r12, rWORD8, rSHR |
870 | #ifdef __LITTLE_ENDIAN__ | |
871 | ldbrx rWORD7, 0, rSTR1 | |
872 | addi rSTR1, rSTR1, 8 | |
873 | #else | |
04067002 | 874 | ld rWORD7, 0(rSTR1) |
fe6e95d7 AM |
875 | #endif |
876 | sld rWORD8_SHIFT, rWORD8, rSHL | |
877 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 UD |
878 | blt cr7, L(duP1x) |
879 | L(duP1e): | |
fe6e95d7 AM |
880 | #ifdef __LITTLE_ENDIAN__ |
881 | ldbrx rWORD1, 0, rSTR1 | |
882 | ldbrx rWORD2, 0, rSTR2 | |
883 | addi rSTR1, rSTR1, 8 | |
884 | addi rSTR2, rSTR2, 8 | |
885 | #else | |
04067002 UD |
886 | ld rWORD1, 8(rSTR1) |
887 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 888 | #endif |
04067002 | 889 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
890 | srd r0, rWORD2, rSHR |
891 | sld rWORD2_SHIFT, rWORD2, rSHL | |
892 | or rWORD2, r0, rWORD8_SHIFT | |
893 | #ifdef __LITTLE_ENDIAN__ | |
894 | ldbrx rWORD3, 0, rSTR1 | |
895 | ldbrx rWORD4, 0, rSTR2 | |
896 | addi rSTR1, rSTR1, 8 | |
897 | addi rSTR2, rSTR2, 8 | |
898 | #else | |
04067002 UD |
899 | ld rWORD3, 16(rSTR1) |
900 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 AM |
901 | #endif |
902 | cmpld cr7, rWORD1, rWORD2 | |
903 | srd r12, rWORD4, rSHR | |
904 | sld rWORD4_SHIFT, rWORD4, rSHL | |
04067002 | 905 | bne cr5, L(duLcr5) |
fe6e95d7 AM |
906 | or rWORD4, r12, rWORD2_SHIFT |
907 | #ifdef __LITTLE_ENDIAN__ | |
908 | ldbrx rWORD5, 0, rSTR1 | |
909 | ldbrx rWORD6, 0, rSTR2 | |
910 | addi rSTR1, rSTR1, 8 | |
911 | addi rSTR2, rSTR2, 8 | |
912 | #else | |
04067002 UD |
913 | ld rWORD5, 24(rSTR1) |
914 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 915 | #endif |
04067002 | 916 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
917 | srd r0, rWORD6, rSHR |
918 | sld rWORD6_SHIFT, rWORD6, rSHL | |
919 | bne cr7, L(duLcr7) | |
920 | or rWORD6, r0, rWORD4_SHIFT | |
04067002 | 921 | cmpld cr6, rWORD5, rWORD6 |
9c84384c | 922 | b L(duLoop3) |
fe6e95d7 | 923 | .align 4 |
04067002 UD |
924 | /* At this point we exit early with the first double word compare |
925 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
926 | how we handle the remaining bytes. */ | |
927 | L(duP1x): | |
928 | cmpld cr5, rWORD7, rWORD8 | |
929 | sldi. rN, rN, 3 | |
930 | bne cr5, L(duLcr5) | |
931 | cmpld cr7, rN, rSHR | |
932 | beq L(duZeroReturn) | |
fe6e95d7 | 933 | li r0, 0 |
04067002 | 934 | ble cr7, L(dutrim) |
fe6e95d7 AM |
935 | #ifdef __LITTLE_ENDIAN__ |
936 | ldbrx rWORD2, 0, rSTR2 | |
937 | addi rSTR2, rSTR2, 8 | |
938 | #else | |
04067002 | 939 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
940 | #endif |
941 | srd r0, rWORD2, rSHR | |
04067002 UD |
942 | b L(dutrim) |
943 | /* Remainder is 16 */ | |
fe6e95d7 | 944 | .align 4 |
04067002 | 945 | L(duP2): |
fe6e95d7 AM |
946 | srd r0, rWORD8, rSHR |
947 | #ifdef __LITTLE_ENDIAN__ | |
948 | ldbrx rWORD5, 0, rSTR1 | |
949 | addi rSTR1, rSTR1, 8 | |
950 | #else | |
04067002 | 951 | ld rWORD5, 0(rSTR1) |
fe6e95d7 AM |
952 | #endif |
953 | or rWORD6, r0, rWORD6_SHIFT | |
954 | sld rWORD6_SHIFT, rWORD8, rSHL | |
04067002 | 955 | L(duP2e): |
fe6e95d7 AM |
956 | #ifdef __LITTLE_ENDIAN__ |
957 | ldbrx rWORD7, 0, rSTR1 | |
958 | ldbrx rWORD8, 0, rSTR2 | |
959 | addi rSTR1, rSTR1, 8 | |
960 | addi rSTR2, rSTR2, 8 | |
961 | #else | |
04067002 UD |
962 | ld rWORD7, 8(rSTR1) |
963 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 964 | #endif |
04067002 | 965 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
966 | srd r12, rWORD8, rSHR |
967 | sld rWORD8_SHIFT, rWORD8, rSHL | |
968 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 | 969 | blt cr7, L(duP2x) |
fe6e95d7 AM |
970 | #ifdef __LITTLE_ENDIAN__ |
971 | ldbrx rWORD1, 0, rSTR1 | |
972 | ldbrx rWORD2, 0, rSTR2 | |
973 | addi rSTR1, rSTR1, 8 | |
974 | addi rSTR2, rSTR2, 8 | |
975 | #else | |
04067002 UD |
976 | ld rWORD1, 16(rSTR1) |
977 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 | 978 | #endif |
04067002 UD |
979 | cmpld cr5, rWORD7, rWORD8 |
980 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
981 | srd r0, rWORD2, rSHR |
982 | sld rWORD2_SHIFT, rWORD2, rSHL | |
983 | or rWORD2, r0, rWORD8_SHIFT | |
984 | #ifdef __LITTLE_ENDIAN__ | |
985 | ldbrx rWORD3, 0, rSTR1 | |
986 | ldbrx rWORD4, 0, rSTR2 | |
987 | addi rSTR1, rSTR1, 8 | |
988 | addi rSTR2, rSTR2, 8 | |
989 | #else | |
04067002 UD |
990 | ld rWORD3, 24(rSTR1) |
991 | ld rWORD4, 24(rSTR2) | |
fe6e95d7 AM |
992 | #endif |
993 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 994 | bne cr5, L(duLcr5) |
fe6e95d7 AM |
995 | srd r12, rWORD4, rSHR |
996 | sld rWORD4_SHIFT, rWORD4, rSHL | |
997 | or rWORD4, r12, rWORD2_SHIFT | |
998 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
999 | addi rSTR1, rSTR1, 8 |
1000 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 1001 | #endif |
04067002 UD |
1002 | cmpld cr1, rWORD3, rWORD4 |
1003 | b L(duLoop2) | |
fe6e95d7 | 1004 | .align 4 |
04067002 UD |
1005 | L(duP2x): |
1006 | cmpld cr5, rWORD7, rWORD8 | |
fe6e95d7 | 1007 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
1008 | addi rSTR1, rSTR1, 8 |
1009 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 1010 | #endif |
04067002 UD |
1011 | bne cr6, L(duLcr6) |
1012 | sldi. rN, rN, 3 | |
1013 | bne cr5, L(duLcr5) | |
1014 | cmpld cr7, rN, rSHR | |
1015 | beq L(duZeroReturn) | |
fe6e95d7 | 1016 | li r0, 0 |
04067002 | 1017 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1018 | #ifdef __LITTLE_ENDIAN__ |
1019 | ldbrx rWORD2, 0, rSTR2 | |
1020 | addi rSTR2, rSTR2, 8 | |
1021 | #else | |
04067002 | 1022 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1023 | #endif |
1024 | srd r0, rWORD2, rSHR | |
04067002 | 1025 | b L(dutrim) |
9c84384c | 1026 | |
04067002 | 1027 | /* Remainder is 24 */ |
fe6e95d7 | 1028 | .align 4 |
04067002 | 1029 | L(duP3): |
fe6e95d7 AM |
1030 | srd r12, rWORD8, rSHR |
1031 | #ifdef __LITTLE_ENDIAN__ | |
1032 | ldbrx rWORD3, 0, rSTR1 | |
1033 | addi rSTR1, rSTR1, 8 | |
1034 | #else | |
04067002 | 1035 | ld rWORD3, 0(rSTR1) |
fe6e95d7 AM |
1036 | #endif |
1037 | sld rWORD4_SHIFT, rWORD8, rSHL | |
1038 | or rWORD4, r12, rWORD6_SHIFT | |
04067002 | 1039 | L(duP3e): |
fe6e95d7 AM |
1040 | #ifdef __LITTLE_ENDIAN__ |
1041 | ldbrx rWORD5, 0, rSTR1 | |
1042 | ldbrx rWORD6, 0, rSTR2 | |
1043 | addi rSTR1, rSTR1, 8 | |
1044 | addi rSTR2, rSTR2, 8 | |
1045 | #else | |
04067002 UD |
1046 | ld rWORD5, 8(rSTR1) |
1047 | ld rWORD6, 8(rSTR2) | |
fe6e95d7 | 1048 | #endif |
04067002 | 1049 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
1050 | srd r0, rWORD6, rSHR |
1051 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1052 | or rWORD6, r0, rWORD4_SHIFT | |
1053 | #ifdef __LITTLE_ENDIAN__ | |
1054 | ldbrx rWORD7, 0, rSTR1 | |
1055 | ldbrx rWORD8, 0, rSTR2 | |
1056 | addi rSTR1, rSTR1, 8 | |
1057 | addi rSTR2, rSTR2, 8 | |
1058 | #else | |
04067002 UD |
1059 | ld rWORD7, 16(rSTR1) |
1060 | ld rWORD8, 16(rSTR2) | |
fe6e95d7 | 1061 | #endif |
04067002 UD |
1062 | cmpld cr6, rWORD5, rWORD6 |
1063 | bne cr1, L(duLcr1) | |
fe6e95d7 AM |
1064 | srd r12, rWORD8, rSHR |
1065 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1066 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 | 1067 | blt cr7, L(duP3x) |
fe6e95d7 AM |
1068 | #ifdef __LITTLE_ENDIAN__ |
1069 | ldbrx rWORD1, 0, rSTR1 | |
1070 | ldbrx rWORD2, 0, rSTR2 | |
1071 | addi rSTR1, rSTR1, 8 | |
1072 | addi rSTR2, rSTR2, 8 | |
1073 | #else | |
04067002 UD |
1074 | ld rWORD1, 24(rSTR1) |
1075 | ld rWORD2, 24(rSTR2) | |
fe6e95d7 | 1076 | #endif |
04067002 UD |
1077 | cmpld cr5, rWORD7, rWORD8 |
1078 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
1079 | srd r0, rWORD2, rSHR |
1080 | sld rWORD2_SHIFT, rWORD2, rSHL | |
1081 | or rWORD2, r0, rWORD8_SHIFT | |
1082 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
1083 | addi rSTR1, rSTR1, 16 |
1084 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
1085 | #endif |
1086 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 1087 | b L(duLoop1) |
fe6e95d7 | 1088 | .align 4 |
04067002 | 1089 | L(duP3x): |
fe6e95d7 | 1090 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
1091 | addi rSTR1, rSTR1, 16 |
1092 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
1093 | #endif |
1094 | #if 0 | |
1095 | /* Huh? We've already branched on cr1! */ | |
04067002 | 1096 | bne cr1, L(duLcr1) |
fe6e95d7 | 1097 | #endif |
04067002 UD |
1098 | cmpld cr5, rWORD7, rWORD8 |
1099 | bne cr6, L(duLcr6) | |
1100 | sldi. rN, rN, 3 | |
1101 | bne cr5, L(duLcr5) | |
1102 | cmpld cr7, rN, rSHR | |
1103 | beq L(duZeroReturn) | |
fe6e95d7 | 1104 | li r0, 0 |
04067002 | 1105 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1106 | #ifdef __LITTLE_ENDIAN__ |
1107 | ldbrx rWORD2, 0, rSTR2 | |
1108 | addi rSTR2, rSTR2, 8 | |
1109 | #else | |
04067002 | 1110 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1111 | #endif |
1112 | srd r0, rWORD2, rSHR | |
04067002 | 1113 | b L(dutrim) |
9c84384c | 1114 | |
04067002 | 1115 | /* Count is a multiple of 32, remainder is 0 */ |
fe6e95d7 | 1116 | .align 4 |
04067002 | 1117 | L(duP4): |
fe6e95d7 AM |
1118 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
1119 | srd r0, rWORD8, rSHR | |
1120 | #ifdef __LITTLE_ENDIAN__ | |
1121 | ldbrx rWORD1, 0, rSTR1 | |
1122 | addi rSTR1, rSTR1, 8 | |
1123 | #else | |
04067002 | 1124 | ld rWORD1, 0(rSTR1) |
fe6e95d7 AM |
1125 | #endif |
1126 | sld rWORD2_SHIFT, rWORD8, rSHL | |
1127 | or rWORD2, r0, rWORD6_SHIFT | |
04067002 | 1128 | L(duP4e): |
fe6e95d7 AM |
1129 | #ifdef __LITTLE_ENDIAN__ |
1130 | ldbrx rWORD3, 0, rSTR1 | |
1131 | ldbrx rWORD4, 0, rSTR2 | |
1132 | addi rSTR1, rSTR1, 8 | |
1133 | addi rSTR2, rSTR2, 8 | |
1134 | #else | |
04067002 UD |
1135 | ld rWORD3, 8(rSTR1) |
1136 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 AM |
1137 | #endif |
1138 | cmpld cr7, rWORD1, rWORD2 | |
1139 | srd r12, rWORD4, rSHR | |
1140 | sld rWORD4_SHIFT, rWORD4, rSHL | |
1141 | or rWORD4, r12, rWORD2_SHIFT | |
1142 | #ifdef __LITTLE_ENDIAN__ | |
1143 | ldbrx rWORD5, 0, rSTR1 | |
1144 | ldbrx rWORD6, 0, rSTR2 | |
1145 | addi rSTR1, rSTR1, 8 | |
1146 | addi rSTR2, rSTR2, 8 | |
1147 | #else | |
04067002 UD |
1148 | ld rWORD5, 16(rSTR1) |
1149 | ld rWORD6, 16(rSTR2) | |
fe6e95d7 | 1150 | #endif |
04067002 | 1151 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
1152 | bne cr7, L(duLcr7) |
1153 | srd r0, rWORD6, rSHR | |
1154 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1155 | or rWORD6, r0, rWORD4_SHIFT | |
1156 | #ifdef __LITTLE_ENDIAN__ | |
1157 | ldbrx rWORD7, 0, rSTR1 | |
1158 | ldbrx rWORD8, 0, rSTR2 | |
1159 | addi rSTR1, rSTR1, 8 | |
1160 | addi rSTR2, rSTR2, 8 | |
1161 | #else | |
04067002 UD |
1162 | ldu rWORD7, 24(rSTR1) |
1163 | ldu rWORD8, 24(rSTR2) | |
fe6e95d7 | 1164 | #endif |
04067002 UD |
1165 | cmpld cr6, rWORD5, rWORD6 |
1166 | bne cr1, L(duLcr1) | |
fe6e95d7 AM |
1167 | srd r12, rWORD8, rSHR |
1168 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1169 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 UD |
1170 | cmpld cr5, rWORD7, rWORD8 |
1171 | bdz- L(du24) /* Adjust CTR as we start with +4 */ | |
1172 | /* This is the primary loop */ | |
fe6e95d7 | 1173 | .align 4 |
04067002 | 1174 | L(duLoop): |
fe6e95d7 AM |
1175 | #ifdef __LITTLE_ENDIAN__ |
1176 | ldbrx rWORD1, 0, rSTR1 | |
1177 | ldbrx rWORD2, 0, rSTR2 | |
1178 | addi rSTR1, rSTR1, 8 | |
1179 | addi rSTR2, rSTR2, 8 | |
1180 | #else | |
04067002 UD |
1181 | ld rWORD1, 8(rSTR1) |
1182 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 1183 | #endif |
04067002 UD |
1184 | cmpld cr1, rWORD3, rWORD4 |
1185 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
1186 | srd r0, rWORD2, rSHR |
1187 | sld rWORD2_SHIFT, rWORD2, rSHL | |
1188 | or rWORD2, r0, rWORD8_SHIFT | |
04067002 | 1189 | L(duLoop1): |
fe6e95d7 AM |
1190 | #ifdef __LITTLE_ENDIAN__ |
1191 | ldbrx rWORD3, 0, rSTR1 | |
1192 | ldbrx rWORD4, 0, rSTR2 | |
1193 | addi rSTR1, rSTR1, 8 | |
1194 | addi rSTR2, rSTR2, 8 | |
1195 | #else | |
04067002 UD |
1196 | ld rWORD3, 16(rSTR1) |
1197 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 1198 | #endif |
04067002 UD |
1199 | cmpld cr6, rWORD5, rWORD6 |
1200 | bne cr5, L(duLcr5) | |
fe6e95d7 AM |
1201 | srd r12, rWORD4, rSHR |
1202 | sld rWORD4_SHIFT, rWORD4, rSHL | |
1203 | or rWORD4, r12, rWORD2_SHIFT | |
04067002 | 1204 | L(duLoop2): |
fe6e95d7 AM |
1205 | #ifdef __LITTLE_ENDIAN__ |
1206 | ldbrx rWORD5, 0, rSTR1 | |
1207 | ldbrx rWORD6, 0, rSTR2 | |
1208 | addi rSTR1, rSTR1, 8 | |
1209 | addi rSTR2, rSTR2, 8 | |
1210 | #else | |
04067002 UD |
1211 | ld rWORD5, 24(rSTR1) |
1212 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 1213 | #endif |
04067002 | 1214 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
1215 | bne cr7, L(duLcr7) |
1216 | srd r0, rWORD6, rSHR | |
1217 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1218 | or rWORD6, r0, rWORD4_SHIFT | |
04067002 | 1219 | L(duLoop3): |
fe6e95d7 AM |
1220 | #ifdef __LITTLE_ENDIAN__ |
1221 | ldbrx rWORD7, 0, rSTR1 | |
1222 | ldbrx rWORD8, 0, rSTR2 | |
1223 | addi rSTR1, rSTR1, 8 | |
1224 | addi rSTR2, rSTR2, 8 | |
1225 | #else | |
04067002 UD |
1226 | ldu rWORD7, 32(rSTR1) |
1227 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 AM |
1228 | #endif |
1229 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 1230 | bne- cr1, L(duLcr1) |
fe6e95d7 AM |
1231 | srd r12, rWORD8, rSHR |
1232 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1233 | or rWORD8, r12, rWORD6_SHIFT | |
9c84384c JM |
1234 | bdnz+ L(duLoop) |
1235 | ||
04067002 | 1236 | L(duL4): |
fe6e95d7 AM |
1237 | #if 0 |
1238 | /* Huh? We've already branched on cr1! */ | |
04067002 | 1239 | bne cr1, L(duLcr1) |
fe6e95d7 | 1240 | #endif |
04067002 UD |
1241 | cmpld cr1, rWORD3, rWORD4 |
1242 | bne cr6, L(duLcr6) | |
1243 | cmpld cr6, rWORD5, rWORD6 | |
1244 | bne cr5, L(duLcr5) | |
1245 | cmpld cr5, rWORD7, rWORD8 | |
1246 | L(du44): | |
fe6e95d7 | 1247 | bne cr7, L(duLcr7) |
04067002 UD |
1248 | L(du34): |
1249 | bne cr1, L(duLcr1) | |
1250 | L(du24): | |
1251 | bne cr6, L(duLcr6) | |
1252 | L(du14): | |
1253 | sldi. rN, rN, 3 | |
1254 | bne cr5, L(duLcr5) | |
1255 | /* At this point we have a remainder of 1 to 7 bytes to compare. We use | |
2ccdea26 | 1256 | shift right double to eliminate bits beyond the compare length. |
04067002 | 1257 | |
9c84384c | 1258 | However it may not be safe to load rWORD2 which may be beyond the |
04067002 UD |
1259 | string length. So we compare the bit length of the remainder to |
1260 | the right shift count (rSHR). If the bit count is less than or equal | |
1261 | we do not need to load rWORD2 (all significant bits are already in | |
fe6e95d7 | 1262 | rWORD8_SHIFT). */ |
04067002 UD |
1263 | cmpld cr7, rN, rSHR |
1264 | beq L(duZeroReturn) | |
fe6e95d7 | 1265 | li r0, 0 |
04067002 | 1266 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1267 | #ifdef __LITTLE_ENDIAN__ |
1268 | ldbrx rWORD2, 0, rSTR2 | |
1269 | addi rSTR2, rSTR2, 8 | |
1270 | #else | |
04067002 | 1271 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1272 | #endif |
1273 | srd r0, rWORD2, rSHR | |
1274 | .align 4 | |
04067002 | 1275 | L(dutrim): |
fe6e95d7 AM |
1276 | #ifdef __LITTLE_ENDIAN__ |
1277 | ldbrx rWORD1, 0, rSTR1 | |
1278 | #else | |
04067002 | 1279 | ld rWORD1, 8(rSTR1) |
fe6e95d7 AM |
1280 | #endif |
1281 | ld rWORD8, -8(r1) | |
9c84384c | 1282 | subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ |
fe6e95d7 AM |
1283 | or rWORD2, r0, rWORD8_SHIFT |
1284 | ld rWORD7, -16(r1) | |
1285 | ld rSHL, -24(r1) | |
04067002 UD |
1286 | srd rWORD1, rWORD1, rN |
1287 | srd rWORD2, rWORD2, rN | |
fe6e95d7 AM |
1288 | ld rSHR, -32(r1) |
1289 | ld rWORD8_SHIFT, -40(r1) | |
04067002 | 1290 | li rRTN, 0 |
fe6e95d7 AM |
1291 | cmpld cr7, rWORD1, rWORD2 |
1292 | ld rWORD2_SHIFT, -48(r1) | |
1293 | ld rWORD4_SHIFT, -56(r1) | |
1294 | beq cr7, L(dureturn24) | |
04067002 | 1295 | li rRTN, 1 |
fe6e95d7 AM |
1296 | ld rWORD6_SHIFT, -64(r1) |
1297 | bgtlr cr7 | |
04067002 UD |
1298 | li rRTN, -1 |
1299 | blr | |
fe6e95d7 AM |
1300 | .align 4 |
1301 | L(duLcr7): | |
1302 | ld rWORD8, -8(r1) | |
1303 | ld rWORD7, -16(r1) | |
04067002 | 1304 | li rRTN, 1 |
fe6e95d7 AM |
1305 | bgt cr7, L(dureturn29) |
1306 | ld rSHL, -24(r1) | |
1307 | ld rSHR, -32(r1) | |
04067002 UD |
1308 | li rRTN, -1 |
1309 | b L(dureturn27) | |
fe6e95d7 | 1310 | .align 4 |
04067002 | 1311 | L(duLcr1): |
fe6e95d7 AM |
1312 | ld rWORD8, -8(r1) |
1313 | ld rWORD7, -16(r1) | |
04067002 | 1314 | li rRTN, 1 |
9c84384c | 1315 | bgt cr1, L(dureturn29) |
fe6e95d7 AM |
1316 | ld rSHL, -24(r1) |
1317 | ld rSHR, -32(r1) | |
04067002 UD |
1318 | li rRTN, -1 |
1319 | b L(dureturn27) | |
fe6e95d7 | 1320 | .align 4 |
04067002 | 1321 | L(duLcr6): |
fe6e95d7 AM |
1322 | ld rWORD8, -8(r1) |
1323 | ld rWORD7, -16(r1) | |
04067002 | 1324 | li rRTN, 1 |
9c84384c | 1325 | bgt cr6, L(dureturn29) |
fe6e95d7 AM |
1326 | ld rSHL, -24(r1) |
1327 | ld rSHR, -32(r1) | |
04067002 UD |
1328 | li rRTN, -1 |
1329 | b L(dureturn27) | |
fe6e95d7 | 1330 | .align 4 |
04067002 | 1331 | L(duLcr5): |
fe6e95d7 AM |
1332 | ld rWORD8, -8(r1) |
1333 | ld rWORD7, -16(r1) | |
04067002 | 1334 | li rRTN, 1 |
9c84384c | 1335 | bgt cr5, L(dureturn29) |
fe6e95d7 AM |
1336 | ld rSHL, -24(r1) |
1337 | ld rSHR, -32(r1) | |
04067002 UD |
1338 | li rRTN, -1 |
1339 | b L(dureturn27) | |
1340 | .align 3 | |
1341 | L(duZeroReturn): | |
fe6e95d7 | 1342 | li rRTN, 0 |
04067002 UD |
1343 | .align 4 |
1344 | L(dureturn): | |
fe6e95d7 AM |
1345 | ld rWORD8, -8(r1) |
1346 | ld rWORD7, -16(r1) | |
9c84384c | 1347 | L(dureturn29): |
fe6e95d7 AM |
1348 | ld rSHL, -24(r1) |
1349 | ld rSHR, -32(r1) | |
9c84384c | 1350 | L(dureturn27): |
fe6e95d7 | 1351 | ld rWORD8_SHIFT, -40(r1) |
9c84384c | 1352 | L(dureturn26): |
fe6e95d7 | 1353 | ld rWORD2_SHIFT, -48(r1) |
9c84384c | 1354 | L(dureturn25): |
fe6e95d7 | 1355 | ld rWORD4_SHIFT, -56(r1) |
04067002 | 1356 | L(dureturn24): |
fe6e95d7 | 1357 | ld rWORD6_SHIFT, -64(r1) |
04067002 UD |
1358 | blr |
1359 | L(duzeroLength): | |
fe6e95d7 | 1360 | li rRTN, 0 |
04067002 UD |
1361 | blr |
1362 | ||
2d67d91a | 1363 | END (memcmp) |
04067002 UD |
1364 | libc_hidden_builtin_def (memcmp) |
1365 | weak_alias (memcmp, bcmp) |