]>
Commit | Line | Data |
---|---|---|
fe6e95d7 | 1 | /* Optimized memcmp implementation for PowerPC64. |
04277e02 | 2 | Copyright (C) 2003-2019 Free Software Foundation, Inc. |
04067002 UD |
3 | This file is part of the GNU C Library. |
4 | ||
5 | The GNU C Library is free software; you can redistribute it and/or | |
6 | modify it under the terms of the GNU Lesser General Public | |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
9 | ||
10 | The GNU C Library is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Lesser General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU Lesser General Public | |
59ba27a6 PE |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ | |
04067002 UD |
18 | |
19 | #include <sysdep.h> | |
04067002 | 20 | |
fe6e95d7 AM |
21 | /* int [r3] memcmp (const char *s1 [r3], |
22 | const char *s2 [r4], | |
23 | size_t size [r5]) */ | |
04067002 | 24 | |
b6a66222 WSM |
25 | #ifndef MEMCMP |
26 | # define MEMCMP memcmp | |
27 | #endif | |
28 | ||
9250e661 | 29 | #ifndef __LITTLE_ENDIAN__ |
a88f47a7 | 30 | .machine power4 |
9250e661 GG |
31 | #else |
32 | /* Little endian is only available since POWER8, so it's safe to | |
33 | specify .machine as power8 (or older), even though this is a POWER4 | |
34 | file. Since the little-endian code uses 'ldbrx', power7 is enough. */ | |
35 | .machine power7 | |
36 | #endif | |
d5b41185 | 37 | ENTRY_TOCLESS (MEMCMP, 4) |
04067002 UD |
38 | CALL_MCOUNT 3 |
39 | ||
04067002 UD |
40 | #define rRTN r3 |
41 | #define rSTR1 r3 /* first string arg */ | |
42 | #define rSTR2 r4 /* second string arg */ | |
43 | #define rN r5 /* max string length */ | |
04067002 UD |
44 | #define rWORD1 r6 /* current word in s1 */ |
45 | #define rWORD2 r7 /* current word in s2 */ | |
46 | #define rWORD3 r8 /* next word in s1 */ | |
47 | #define rWORD4 r9 /* next word in s2 */ | |
48 | #define rWORD5 r10 /* next word in s1 */ | |
49 | #define rWORD6 r11 /* next word in s2 */ | |
04067002 UD |
50 | #define rWORD7 r30 /* next word in s1 */ |
51 | #define rWORD8 r31 /* next word in s2 */ | |
52 | ||
fe6e95d7 | 53 | xor r0, rSTR2, rSTR1 |
04067002 UD |
54 | cmpldi cr6, rN, 0 |
55 | cmpldi cr1, rN, 12 | |
fe6e95d7 AM |
56 | clrldi. r0, r0, 61 |
57 | clrldi r12, rSTR1, 61 | |
58 | cmpldi cr5, r12, 0 | |
04067002 | 59 | beq- cr6, L(zeroLength) |
fe6e95d7 AM |
60 | dcbt 0, rSTR1 |
61 | dcbt 0, rSTR2 | |
2ccdea26 | 62 | /* If less than 8 bytes or not aligned, use the unaligned |
04067002 UD |
63 | byte loop. */ |
64 | blt cr1, L(bytealigned) | |
fe6e95d7 | 65 | std rWORD8, -8(r1) |
fe6e95d7 | 66 | std rWORD7, -16(r1) |
869d7180 | 67 | cfi_offset(rWORD8, -8) |
fe6e95d7 | 68 | cfi_offset(rWORD7, -16) |
04067002 UD |
69 | bne L(unaligned) |
70 | /* At this point we know both strings have the same alignment and the | |
fe6e95d7 | 71 | compare length is at least 8 bytes. r12 contains the low order |
04067002 | 72 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 AM |
73 | of r12 to 0. If r12 == 0 then we are already double word |
74 | aligned and can perform the DW aligned loop. | |
9c84384c | 75 | |
04067002 | 76 | Otherwise we know the two strings have the same alignment (but not |
fe6e95d7 AM |
77 | yet DW). So we force the string addresses to the next lower DW |
78 | boundary and special case this first DW using shift left to | |
2ccdea26 | 79 | eliminate bits preceding the first byte. Since we want to join the |
fe6e95d7 | 80 | normal (DW aligned) compare loop, starting at the second double word, |
04067002 | 81 | we need to adjust the length (rN) and special case the loop |
fe6e95d7 AM |
82 | versioning for the first DW. This ensures that the loop count is |
83 | correct and the first DW (shifted) is in the expected register pair. */ | |
84 | .align 4 | |
04067002 UD |
85 | L(samealignment): |
86 | clrrdi rSTR1, rSTR1, 3 | |
87 | clrrdi rSTR2, rSTR2, 3 | |
88 | beq cr5, L(DWaligned) | |
fe6e95d7 AM |
89 | add rN, rN, r12 |
90 | sldi rWORD6, r12, 3 | |
91 | srdi r0, rN, 5 /* Divide by 32 */ | |
92 | andi. r12, rN, 24 /* Get the DW remainder */ | |
93 | #ifdef __LITTLE_ENDIAN__ | |
94 | ldbrx rWORD1, 0, rSTR1 | |
95 | ldbrx rWORD2, 0, rSTR2 | |
96 | addi rSTR1, rSTR1, 8 | |
97 | addi rSTR2, rSTR2, 8 | |
98 | #else | |
04067002 UD |
99 | ld rWORD1, 0(rSTR1) |
100 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
101 | #endif |
102 | cmpldi cr1, r12, 16 | |
04067002 UD |
103 | cmpldi cr7, rN, 32 |
104 | clrldi rN, rN, 61 | |
105 | beq L(dPs4) | |
fe6e95d7 | 106 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 UD |
107 | bgt cr1, L(dPs3) |
108 | beq cr1, L(dPs2) | |
109 | ||
110 | /* Remainder is 8 */ | |
fe6e95d7 | 111 | .align 3 |
04067002 | 112 | L(dsP1): |
fe6e95d7 AM |
113 | sld rWORD5, rWORD1, rWORD6 |
114 | sld rWORD6, rWORD2, rWORD6 | |
04067002 UD |
115 | cmpld cr5, rWORD5, rWORD6 |
116 | blt cr7, L(dP1x) | |
117 | /* Do something useful in this cycle since we have to branch anyway. */ | |
fe6e95d7 AM |
118 | #ifdef __LITTLE_ENDIAN__ |
119 | ldbrx rWORD1, 0, rSTR1 | |
120 | ldbrx rWORD2, 0, rSTR2 | |
121 | addi rSTR1, rSTR1, 8 | |
122 | addi rSTR2, rSTR2, 8 | |
123 | #else | |
04067002 UD |
124 | ld rWORD1, 8(rSTR1) |
125 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 AM |
126 | #endif |
127 | cmpld cr7, rWORD1, rWORD2 | |
04067002 UD |
128 | b L(dP1e) |
129 | /* Remainder is 16 */ | |
fe6e95d7 | 130 | .align 4 |
04067002 | 131 | L(dPs2): |
fe6e95d7 AM |
132 | sld rWORD5, rWORD1, rWORD6 |
133 | sld rWORD6, rWORD2, rWORD6 | |
04067002 UD |
134 | cmpld cr6, rWORD5, rWORD6 |
135 | blt cr7, L(dP2x) | |
136 | /* Do something useful in this cycle since we have to branch anyway. */ | |
fe6e95d7 AM |
137 | #ifdef __LITTLE_ENDIAN__ |
138 | ldbrx rWORD7, 0, rSTR1 | |
139 | ldbrx rWORD8, 0, rSTR2 | |
140 | addi rSTR1, rSTR1, 8 | |
141 | addi rSTR2, rSTR2, 8 | |
142 | #else | |
04067002 UD |
143 | ld rWORD7, 8(rSTR1) |
144 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 145 | #endif |
04067002 UD |
146 | cmpld cr5, rWORD7, rWORD8 |
147 | b L(dP2e) | |
148 | /* Remainder is 24 */ | |
fe6e95d7 | 149 | .align 4 |
04067002 | 150 | L(dPs3): |
fe6e95d7 AM |
151 | sld rWORD3, rWORD1, rWORD6 |
152 | sld rWORD4, rWORD2, rWORD6 | |
04067002 UD |
153 | cmpld cr1, rWORD3, rWORD4 |
154 | b L(dP3e) | |
155 | /* Count is a multiple of 32, remainder is 0 */ | |
fe6e95d7 | 156 | .align 4 |
04067002 | 157 | L(dPs4): |
fe6e95d7 AM |
158 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
159 | sld rWORD1, rWORD1, rWORD6 | |
160 | sld rWORD2, rWORD2, rWORD6 | |
161 | cmpld cr7, rWORD1, rWORD2 | |
04067002 UD |
162 | b L(dP4e) |
163 | ||
164 | /* At this point we know both strings are double word aligned and the | |
165 | compare length is at least 8 bytes. */ | |
fe6e95d7 | 166 | .align 4 |
04067002 | 167 | L(DWaligned): |
fe6e95d7 AM |
168 | andi. r12, rN, 24 /* Get the DW remainder */ |
169 | srdi r0, rN, 5 /* Divide by 32 */ | |
170 | cmpldi cr1, r12, 16 | |
04067002 UD |
171 | cmpldi cr7, rN, 32 |
172 | clrldi rN, rN, 61 | |
173 | beq L(dP4) | |
174 | bgt cr1, L(dP3) | |
175 | beq cr1, L(dP2) | |
9c84384c | 176 | |
04067002 | 177 | /* Remainder is 8 */ |
fe6e95d7 | 178 | .align 4 |
04067002 | 179 | L(dP1): |
fe6e95d7 | 180 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 | 181 | /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early |
2ccdea26 AB |
182 | (8-15 byte compare), we want to use only volatile registers. This |
183 | means we can avoid restoring non-volatile registers since we did not | |
04067002 | 184 | change any on the early exit path. The key here is the non-early |
9c84384c | 185 | exit path only cares about the condition code (cr5), not about which |
04067002 | 186 | register pair was used. */ |
fe6e95d7 AM |
187 | #ifdef __LITTLE_ENDIAN__ |
188 | ldbrx rWORD5, 0, rSTR1 | |
189 | ldbrx rWORD6, 0, rSTR2 | |
190 | addi rSTR1, rSTR1, 8 | |
191 | addi rSTR2, rSTR2, 8 | |
192 | #else | |
04067002 UD |
193 | ld rWORD5, 0(rSTR1) |
194 | ld rWORD6, 0(rSTR2) | |
fe6e95d7 | 195 | #endif |
04067002 UD |
196 | cmpld cr5, rWORD5, rWORD6 |
197 | blt cr7, L(dP1x) | |
fe6e95d7 AM |
198 | #ifdef __LITTLE_ENDIAN__ |
199 | ldbrx rWORD1, 0, rSTR1 | |
200 | ldbrx rWORD2, 0, rSTR2 | |
201 | addi rSTR1, rSTR1, 8 | |
202 | addi rSTR2, rSTR2, 8 | |
203 | #else | |
04067002 UD |
204 | ld rWORD1, 8(rSTR1) |
205 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 AM |
206 | #endif |
207 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 208 | L(dP1e): |
fe6e95d7 AM |
209 | #ifdef __LITTLE_ENDIAN__ |
210 | ldbrx rWORD3, 0, rSTR1 | |
211 | ldbrx rWORD4, 0, rSTR2 | |
212 | addi rSTR1, rSTR1, 8 | |
213 | addi rSTR2, rSTR2, 8 | |
214 | #else | |
04067002 UD |
215 | ld rWORD3, 16(rSTR1) |
216 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 217 | #endif |
04067002 | 218 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
219 | #ifdef __LITTLE_ENDIAN__ |
220 | ldbrx rWORD5, 0, rSTR1 | |
221 | ldbrx rWORD6, 0, rSTR2 | |
222 | addi rSTR1, rSTR1, 8 | |
223 | addi rSTR2, rSTR2, 8 | |
224 | #else | |
04067002 UD |
225 | ld rWORD5, 24(rSTR1) |
226 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 227 | #endif |
04067002 | 228 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
229 | bne cr5, L(dLcr5x) |
230 | bne cr7, L(dLcr7x) | |
9c84384c | 231 | |
fe6e95d7 AM |
232 | #ifdef __LITTLE_ENDIAN__ |
233 | ldbrx rWORD7, 0, rSTR1 | |
234 | ldbrx rWORD8, 0, rSTR2 | |
235 | addi rSTR1, rSTR1, 8 | |
236 | addi rSTR2, rSTR2, 8 | |
237 | #else | |
04067002 UD |
238 | ldu rWORD7, 32(rSTR1) |
239 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 | 240 | #endif |
04067002 UD |
241 | bne cr1, L(dLcr1) |
242 | cmpld cr5, rWORD7, rWORD8 | |
243 | bdnz L(dLoop) | |
244 | bne cr6, L(dLcr6) | |
fe6e95d7 AM |
245 | ld rWORD8, -8(r1) |
246 | ld rWORD7, -16(r1) | |
247 | .align 3 | |
04067002 UD |
248 | L(dP1x): |
249 | sldi. r12, rN, 3 | |
fe6e95d7 | 250 | bne cr5, L(dLcr5x) |
04067002 UD |
251 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
252 | bne L(d00) | |
253 | li rRTN, 0 | |
254 | blr | |
9c84384c | 255 | |
04067002 | 256 | /* Remainder is 16 */ |
fe6e95d7 | 257 | .align 4 |
04067002 | 258 | L(dP2): |
fe6e95d7 AM |
259 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
260 | #ifdef __LITTLE_ENDIAN__ | |
261 | ldbrx rWORD5, 0, rSTR1 | |
262 | ldbrx rWORD6, 0, rSTR2 | |
263 | addi rSTR1, rSTR1, 8 | |
264 | addi rSTR2, rSTR2, 8 | |
265 | #else | |
04067002 UD |
266 | ld rWORD5, 0(rSTR1) |
267 | ld rWORD6, 0(rSTR2) | |
fe6e95d7 | 268 | #endif |
04067002 UD |
269 | cmpld cr6, rWORD5, rWORD6 |
270 | blt cr7, L(dP2x) | |
fe6e95d7 AM |
271 | #ifdef __LITTLE_ENDIAN__ |
272 | ldbrx rWORD7, 0, rSTR1 | |
273 | ldbrx rWORD8, 0, rSTR2 | |
274 | addi rSTR1, rSTR1, 8 | |
275 | addi rSTR2, rSTR2, 8 | |
276 | #else | |
04067002 UD |
277 | ld rWORD7, 8(rSTR1) |
278 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 279 | #endif |
04067002 UD |
280 | cmpld cr5, rWORD7, rWORD8 |
281 | L(dP2e): | |
fe6e95d7 AM |
282 | #ifdef __LITTLE_ENDIAN__ |
283 | ldbrx rWORD1, 0, rSTR1 | |
284 | ldbrx rWORD2, 0, rSTR2 | |
285 | addi rSTR1, rSTR1, 8 | |
286 | addi rSTR2, rSTR2, 8 | |
287 | #else | |
04067002 UD |
288 | ld rWORD1, 16(rSTR1) |
289 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 AM |
290 | #endif |
291 | cmpld cr7, rWORD1, rWORD2 | |
292 | #ifdef __LITTLE_ENDIAN__ | |
293 | ldbrx rWORD3, 0, rSTR1 | |
294 | ldbrx rWORD4, 0, rSTR2 | |
295 | addi rSTR1, rSTR1, 8 | |
296 | addi rSTR2, rSTR2, 8 | |
297 | #else | |
04067002 UD |
298 | ld rWORD3, 24(rSTR1) |
299 | ld rWORD4, 24(rSTR2) | |
fe6e95d7 | 300 | #endif |
04067002 | 301 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 | 302 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
303 | addi rSTR1, rSTR1, 8 |
304 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 305 | #endif |
04067002 UD |
306 | bne cr6, L(dLcr6) |
307 | bne cr5, L(dLcr5) | |
308 | b L(dLoop2) | |
309 | /* Again we are on a early exit path (16-23 byte compare), we want to | |
2ccdea26 | 310 | only use volatile registers and avoid restoring non-volatile |
04067002 | 311 | registers. */ |
fe6e95d7 | 312 | .align 4 |
04067002 | 313 | L(dP2x): |
fe6e95d7 AM |
314 | #ifdef __LITTLE_ENDIAN__ |
315 | ldbrx rWORD3, 0, rSTR1 | |
316 | ldbrx rWORD4, 0, rSTR2 | |
317 | addi rSTR1, rSTR1, 8 | |
318 | addi rSTR2, rSTR2, 8 | |
319 | #else | |
04067002 UD |
320 | ld rWORD3, 8(rSTR1) |
321 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 AM |
322 | #endif |
323 | cmpld cr1, rWORD3, rWORD4 | |
04067002 | 324 | sldi. r12, rN, 3 |
fe6e95d7 AM |
325 | bne cr6, L(dLcr6x) |
326 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
327 | addi rSTR1, rSTR1, 8 |
328 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 AM |
329 | #endif |
330 | bne cr1, L(dLcr1x) | |
04067002 UD |
331 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
332 | bne L(d00) | |
333 | li rRTN, 0 | |
334 | blr | |
9c84384c | 335 | |
04067002 | 336 | /* Remainder is 24 */ |
fe6e95d7 | 337 | .align 4 |
04067002 | 338 | L(dP3): |
fe6e95d7 AM |
339 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
340 | #ifdef __LITTLE_ENDIAN__ | |
341 | ldbrx rWORD3, 0, rSTR1 | |
342 | ldbrx rWORD4, 0, rSTR2 | |
343 | addi rSTR1, rSTR1, 8 | |
344 | addi rSTR2, rSTR2, 8 | |
345 | #else | |
04067002 UD |
346 | ld rWORD3, 0(rSTR1) |
347 | ld rWORD4, 0(rSTR2) | |
fe6e95d7 | 348 | #endif |
04067002 UD |
349 | cmpld cr1, rWORD3, rWORD4 |
350 | L(dP3e): | |
fe6e95d7 AM |
351 | #ifdef __LITTLE_ENDIAN__ |
352 | ldbrx rWORD5, 0, rSTR1 | |
353 | ldbrx rWORD6, 0, rSTR2 | |
354 | addi rSTR1, rSTR1, 8 | |
355 | addi rSTR2, rSTR2, 8 | |
356 | #else | |
04067002 UD |
357 | ld rWORD5, 8(rSTR1) |
358 | ld rWORD6, 8(rSTR2) | |
fe6e95d7 | 359 | #endif |
04067002 UD |
360 | cmpld cr6, rWORD5, rWORD6 |
361 | blt cr7, L(dP3x) | |
fe6e95d7 AM |
362 | #ifdef __LITTLE_ENDIAN__ |
363 | ldbrx rWORD7, 0, rSTR1 | |
364 | ldbrx rWORD8, 0, rSTR2 | |
365 | addi rSTR1, rSTR1, 8 | |
366 | addi rSTR2, rSTR2, 8 | |
367 | #else | |
04067002 UD |
368 | ld rWORD7, 16(rSTR1) |
369 | ld rWORD8, 16(rSTR2) | |
fe6e95d7 | 370 | #endif |
04067002 | 371 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
372 | #ifdef __LITTLE_ENDIAN__ |
373 | ldbrx rWORD1, 0, rSTR1 | |
374 | ldbrx rWORD2, 0, rSTR2 | |
375 | addi rSTR1, rSTR1, 8 | |
376 | addi rSTR2, rSTR2, 8 | |
377 | #else | |
04067002 UD |
378 | ld rWORD1, 24(rSTR1) |
379 | ld rWORD2, 24(rSTR2) | |
fe6e95d7 AM |
380 | #endif |
381 | cmpld cr7, rWORD1, rWORD2 | |
382 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
383 | addi rSTR1, rSTR1, 16 |
384 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 | 385 | #endif |
04067002 UD |
386 | bne cr1, L(dLcr1) |
387 | bne cr6, L(dLcr6) | |
388 | b L(dLoop1) | |
389 | /* Again we are on a early exit path (24-31 byte compare), we want to | |
2ccdea26 | 390 | only use volatile registers and avoid restoring non-volatile |
04067002 | 391 | registers. */ |
fe6e95d7 | 392 | .align 4 |
04067002 | 393 | L(dP3x): |
fe6e95d7 AM |
394 | #ifdef __LITTLE_ENDIAN__ |
395 | ldbrx rWORD1, 0, rSTR1 | |
396 | ldbrx rWORD2, 0, rSTR2 | |
397 | addi rSTR1, rSTR1, 8 | |
398 | addi rSTR2, rSTR2, 8 | |
399 | #else | |
04067002 UD |
400 | ld rWORD1, 16(rSTR1) |
401 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 AM |
402 | #endif |
403 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 404 | sldi. r12, rN, 3 |
fe6e95d7 AM |
405 | bne cr1, L(dLcr1x) |
406 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
407 | addi rSTR1, rSTR1, 16 |
408 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
409 | #endif |
410 | bne cr6, L(dLcr6x) | |
04067002 | 411 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
fe6e95d7 | 412 | bne cr7, L(dLcr7x) |
04067002 UD |
413 | bne L(d00) |
414 | li rRTN, 0 | |
415 | blr | |
9c84384c | 416 | |
04067002 | 417 | /* Count is a multiple of 32, remainder is 0 */ |
fe6e95d7 | 418 | .align 4 |
04067002 | 419 | L(dP4): |
fe6e95d7 AM |
420 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
421 | #ifdef __LITTLE_ENDIAN__ | |
422 | ldbrx rWORD1, 0, rSTR1 | |
423 | ldbrx rWORD2, 0, rSTR2 | |
424 | addi rSTR1, rSTR1, 8 | |
425 | addi rSTR2, rSTR2, 8 | |
426 | #else | |
04067002 UD |
427 | ld rWORD1, 0(rSTR1) |
428 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
429 | #endif |
430 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 431 | L(dP4e): |
fe6e95d7 AM |
432 | #ifdef __LITTLE_ENDIAN__ |
433 | ldbrx rWORD3, 0, rSTR1 | |
434 | ldbrx rWORD4, 0, rSTR2 | |
435 | addi rSTR1, rSTR1, 8 | |
436 | addi rSTR2, rSTR2, 8 | |
437 | #else | |
04067002 UD |
438 | ld rWORD3, 8(rSTR1) |
439 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 | 440 | #endif |
04067002 | 441 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
442 | #ifdef __LITTLE_ENDIAN__ |
443 | ldbrx rWORD5, 0, rSTR1 | |
444 | ldbrx rWORD6, 0, rSTR2 | |
445 | addi rSTR1, rSTR1, 8 | |
446 | addi rSTR2, rSTR2, 8 | |
447 | #else | |
04067002 UD |
448 | ld rWORD5, 16(rSTR1) |
449 | ld rWORD6, 16(rSTR2) | |
fe6e95d7 | 450 | #endif |
04067002 | 451 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
452 | #ifdef __LITTLE_ENDIAN__ |
453 | ldbrx rWORD7, 0, rSTR1 | |
454 | ldbrx rWORD8, 0, rSTR2 | |
455 | addi rSTR1, rSTR1, 8 | |
456 | addi rSTR2, rSTR2, 8 | |
457 | #else | |
04067002 UD |
458 | ldu rWORD7, 24(rSTR1) |
459 | ldu rWORD8, 24(rSTR2) | |
fe6e95d7 | 460 | #endif |
04067002 | 461 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 | 462 | bne cr7, L(dLcr7) |
04067002 UD |
463 | bne cr1, L(dLcr1) |
464 | bdz- L(d24) /* Adjust CTR as we start with +4 */ | |
465 | /* This is the primary loop */ | |
fe6e95d7 | 466 | .align 4 |
04067002 | 467 | L(dLoop): |
fe6e95d7 AM |
468 | #ifdef __LITTLE_ENDIAN__ |
469 | ldbrx rWORD1, 0, rSTR1 | |
470 | ldbrx rWORD2, 0, rSTR2 | |
471 | addi rSTR1, rSTR1, 8 | |
472 | addi rSTR2, rSTR2, 8 | |
473 | #else | |
04067002 UD |
474 | ld rWORD1, 8(rSTR1) |
475 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 476 | #endif |
04067002 UD |
477 | cmpld cr1, rWORD3, rWORD4 |
478 | bne cr6, L(dLcr6) | |
479 | L(dLoop1): | |
fe6e95d7 AM |
480 | #ifdef __LITTLE_ENDIAN__ |
481 | ldbrx rWORD3, 0, rSTR1 | |
482 | ldbrx rWORD4, 0, rSTR2 | |
483 | addi rSTR1, rSTR1, 8 | |
484 | addi rSTR2, rSTR2, 8 | |
485 | #else | |
04067002 UD |
486 | ld rWORD3, 16(rSTR1) |
487 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 488 | #endif |
04067002 UD |
489 | cmpld cr6, rWORD5, rWORD6 |
490 | bne cr5, L(dLcr5) | |
491 | L(dLoop2): | |
fe6e95d7 AM |
492 | #ifdef __LITTLE_ENDIAN__ |
493 | ldbrx rWORD5, 0, rSTR1 | |
494 | ldbrx rWORD6, 0, rSTR2 | |
495 | addi rSTR1, rSTR1, 8 | |
496 | addi rSTR2, rSTR2, 8 | |
497 | #else | |
04067002 UD |
498 | ld rWORD5, 24(rSTR1) |
499 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 500 | #endif |
04067002 | 501 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 | 502 | bne cr7, L(dLcr7) |
04067002 | 503 | L(dLoop3): |
fe6e95d7 AM |
504 | #ifdef __LITTLE_ENDIAN__ |
505 | ldbrx rWORD7, 0, rSTR1 | |
506 | ldbrx rWORD8, 0, rSTR2 | |
507 | addi rSTR1, rSTR1, 8 | |
508 | addi rSTR2, rSTR2, 8 | |
509 | #else | |
04067002 UD |
510 | ldu rWORD7, 32(rSTR1) |
511 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 | 512 | #endif |
04067002 | 513 | bne- cr1, L(dLcr1) |
fe6e95d7 | 514 | cmpld cr7, rWORD1, rWORD2 |
9c84384c JM |
515 | bdnz+ L(dLoop) |
516 | ||
04067002 UD |
517 | L(dL4): |
518 | cmpld cr1, rWORD3, rWORD4 | |
519 | bne cr6, L(dLcr6) | |
520 | cmpld cr6, rWORD5, rWORD6 | |
521 | bne cr5, L(dLcr5) | |
522 | cmpld cr5, rWORD7, rWORD8 | |
523 | L(d44): | |
fe6e95d7 | 524 | bne cr7, L(dLcr7) |
04067002 UD |
525 | L(d34): |
526 | bne cr1, L(dLcr1) | |
527 | L(d24): | |
528 | bne cr6, L(dLcr6) | |
529 | L(d14): | |
530 | sldi. r12, rN, 3 | |
9c84384c | 531 | bne cr5, L(dLcr5) |
04067002 | 532 | L(d04): |
fe6e95d7 AM |
533 | ld rWORD8, -8(r1) |
534 | ld rWORD7, -16(r1) | |
04067002 UD |
535 | subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ |
536 | beq L(zeroLength) | |
537 | /* At this point we have a remainder of 1 to 7 bytes to compare. Since | |
538 | we are aligned it is safe to load the whole double word, and use | |
2ccdea26 | 539 | shift right double to eliminate bits beyond the compare length. */ |
04067002 | 540 | L(d00): |
fe6e95d7 AM |
541 | #ifdef __LITTLE_ENDIAN__ |
542 | ldbrx rWORD1, 0, rSTR1 | |
543 | ldbrx rWORD2, 0, rSTR2 | |
544 | addi rSTR1, rSTR1, 8 | |
545 | addi rSTR2, rSTR2, 8 | |
546 | #else | |
04067002 | 547 | ld rWORD1, 8(rSTR1) |
9c84384c | 548 | ld rWORD2, 8(rSTR2) |
fe6e95d7 | 549 | #endif |
04067002 UD |
550 | srd rWORD1, rWORD1, rN |
551 | srd rWORD2, rWORD2, rN | |
fe6e95d7 AM |
552 | cmpld cr7, rWORD1, rWORD2 |
553 | bne cr7, L(dLcr7x) | |
04067002 UD |
554 | li rRTN, 0 |
555 | blr | |
fe6e95d7 AM |
556 | |
557 | .align 4 | |
558 | L(dLcr7): | |
559 | ld rWORD8, -8(r1) | |
560 | ld rWORD7, -16(r1) | |
561 | L(dLcr7x): | |
04067002 | 562 | li rRTN, 1 |
fe6e95d7 | 563 | bgtlr cr7 |
04067002 UD |
564 | li rRTN, -1 |
565 | blr | |
fe6e95d7 | 566 | .align 4 |
04067002 | 567 | L(dLcr1): |
fe6e95d7 AM |
568 | ld rWORD8, -8(r1) |
569 | ld rWORD7, -16(r1) | |
570 | L(dLcr1x): | |
04067002 UD |
571 | li rRTN, 1 |
572 | bgtlr cr1 | |
573 | li rRTN, -1 | |
574 | blr | |
fe6e95d7 | 575 | .align 4 |
04067002 | 576 | L(dLcr6): |
fe6e95d7 AM |
577 | ld rWORD8, -8(r1) |
578 | ld rWORD7, -16(r1) | |
579 | L(dLcr6x): | |
04067002 UD |
580 | li rRTN, 1 |
581 | bgtlr cr6 | |
582 | li rRTN, -1 | |
583 | blr | |
fe6e95d7 | 584 | .align 4 |
04067002 | 585 | L(dLcr5): |
fe6e95d7 AM |
586 | ld rWORD8, -8(r1) |
587 | ld rWORD7, -16(r1) | |
04067002 UD |
588 | L(dLcr5x): |
589 | li rRTN, 1 | |
590 | bgtlr cr5 | |
591 | li rRTN, -1 | |
592 | blr | |
9c84384c | 593 | |
fe6e95d7 | 594 | .align 4 |
04067002 | 595 | L(bytealigned): |
fe6e95d7 AM |
596 | mtctr rN /* Power4 wants mtctr 1st in dispatch group */ |
597 | #if 0 | |
598 | /* Huh? We've already branched on cr6! */ | |
04067002 | 599 | beq- cr6, L(zeroLength) |
fe6e95d7 | 600 | #endif |
04067002 UD |
601 | |
602 | /* We need to prime this loop. This loop is swing modulo scheduled | |
9c84384c | 603 | to avoid pipe delays. The dependent instruction latencies (load to |
04067002 UD |
604 | compare to conditional branch) is 2 to 3 cycles. In this loop each |
605 | dispatch group ends in a branch and takes 1 cycle. Effectively | |
9c84384c JM |
606 | the first iteration of the loop only serves to load operands and |
607 | branches based on compares are delayed until the next loop. | |
04067002 UD |
608 | |
609 | So we must precondition some registers and condition codes so that | |
610 | we don't exit the loop early on the first iteration. */ | |
9c84384c | 611 | |
04067002 UD |
612 | lbz rWORD1, 0(rSTR1) |
613 | lbz rWORD2, 0(rSTR2) | |
614 | bdz- L(b11) | |
fe6e95d7 | 615 | cmpld cr7, rWORD1, rWORD2 |
04067002 UD |
616 | lbz rWORD3, 1(rSTR1) |
617 | lbz rWORD4, 1(rSTR2) | |
618 | bdz- L(b12) | |
619 | cmpld cr1, rWORD3, rWORD4 | |
620 | lbzu rWORD5, 2(rSTR1) | |
621 | lbzu rWORD6, 2(rSTR2) | |
622 | bdz- L(b13) | |
fe6e95d7 | 623 | .align 4 |
04067002 UD |
624 | L(bLoop): |
625 | lbzu rWORD1, 1(rSTR1) | |
626 | lbzu rWORD2, 1(rSTR2) | |
fe6e95d7 | 627 | bne- cr7, L(bLcr7) |
04067002 UD |
628 | |
629 | cmpld cr6, rWORD5, rWORD6 | |
630 | bdz- L(b3i) | |
9c84384c | 631 | |
04067002 UD |
632 | lbzu rWORD3, 1(rSTR1) |
633 | lbzu rWORD4, 1(rSTR2) | |
634 | bne- cr1, L(bLcr1) | |
635 | ||
fe6e95d7 | 636 | cmpld cr7, rWORD1, rWORD2 |
04067002 UD |
637 | bdz- L(b2i) |
638 | ||
639 | lbzu rWORD5, 1(rSTR1) | |
640 | lbzu rWORD6, 1(rSTR2) | |
641 | bne- cr6, L(bLcr6) | |
642 | ||
643 | cmpld cr1, rWORD3, rWORD4 | |
644 | bdnz+ L(bLoop) | |
9c84384c | 645 | |
04067002 UD |
646 | /* We speculatively loading bytes before we have tested the previous |
647 | bytes. But we must avoid overrunning the length (in the ctr) to | |
9c84384c | 648 | prevent these speculative loads from causing a segfault. In this |
04067002 UD |
649 | case the loop will exit early (before the all pending bytes are |
650 | tested. In this case we must complete the pending operations | |
651 | before returning. */ | |
652 | L(b1i): | |
fe6e95d7 | 653 | bne- cr7, L(bLcr7) |
04067002 UD |
654 | bne- cr1, L(bLcr1) |
655 | b L(bx56) | |
fe6e95d7 | 656 | .align 4 |
04067002 UD |
657 | L(b2i): |
658 | bne- cr6, L(bLcr6) | |
fe6e95d7 | 659 | bne- cr7, L(bLcr7) |
04067002 | 660 | b L(bx34) |
fe6e95d7 | 661 | .align 4 |
04067002 UD |
662 | L(b3i): |
663 | bne- cr1, L(bLcr1) | |
664 | bne- cr6, L(bLcr6) | |
665 | b L(bx12) | |
fe6e95d7 AM |
666 | .align 4 |
667 | L(bLcr7): | |
04067002 | 668 | li rRTN, 1 |
fe6e95d7 | 669 | bgtlr cr7 |
04067002 UD |
670 | li rRTN, -1 |
671 | blr | |
672 | L(bLcr1): | |
673 | li rRTN, 1 | |
674 | bgtlr cr1 | |
675 | li rRTN, -1 | |
676 | blr | |
677 | L(bLcr6): | |
678 | li rRTN, 1 | |
679 | bgtlr cr6 | |
680 | li rRTN, -1 | |
681 | blr | |
682 | ||
683 | L(b13): | |
fe6e95d7 | 684 | bne- cr7, L(bx12) |
04067002 UD |
685 | bne- cr1, L(bx34) |
686 | L(bx56): | |
687 | sub rRTN, rWORD5, rWORD6 | |
688 | blr | |
689 | nop | |
690 | L(b12): | |
fe6e95d7 | 691 | bne- cr7, L(bx12) |
9c84384c | 692 | L(bx34): |
04067002 UD |
693 | sub rRTN, rWORD3, rWORD4 |
694 | blr | |
695 | L(b11): | |
696 | L(bx12): | |
697 | sub rRTN, rWORD1, rWORD2 | |
698 | blr | |
fe6e95d7 | 699 | .align 4 |
04067002 UD |
700 | L(zeroLength): |
701 | li rRTN, 0 | |
702 | blr | |
703 | ||
fe6e95d7 | 704 | .align 4 |
04067002 | 705 | /* At this point we know the strings have different alignment and the |
fe6e95d7 | 706 | compare length is at least 8 bytes. r12 contains the low order |
04067002 | 707 | 3 bits of rSTR1 and cr5 contains the result of the logical compare |
fe6e95d7 | 708 | of r12 to 0. If r12 == 0 then rStr1 is double word |
04067002 | 709 | aligned and can perform the DWunaligned loop. |
9c84384c | 710 | |
2ccdea26 | 711 | Otherwise we know that rSTR1 is not already DW aligned yet. |
04067002 | 712 | So we can force the string addresses to the next lower DW |
fe6e95d7 | 713 | boundary and special case this first DW using shift left to |
2ccdea26 | 714 | eliminate bits preceding the first byte. Since we want to join the |
04067002 UD |
715 | normal (DWaligned) compare loop, starting at the second double word, |
716 | we need to adjust the length (rN) and special case the loop | |
fe6e95d7 | 717 | versioning for the first DW. This ensures that the loop count is |
04067002 | 718 | correct and the first DW (shifted) is in the expected resister pair. */ |
fe6e95d7 AM |
719 | #define rSHL r29 /* Unaligned shift left count. */ |
720 | #define rSHR r28 /* Unaligned shift right count. */ | |
721 | #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ | |
722 | #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ | |
723 | #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ | |
724 | #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ | |
04067002 | 725 | L(unaligned): |
fe6e95d7 AM |
726 | std rSHL, -24(r1) |
727 | cfi_offset(rSHL, -24) | |
04067002 UD |
728 | clrldi rSHL, rSTR2, 61 |
729 | beq- cr6, L(duzeroLength) | |
fe6e95d7 AM |
730 | std rSHR, -32(r1) |
731 | cfi_offset(rSHR, -32) | |
04067002 | 732 | beq cr5, L(DWunaligned) |
fe6e95d7 AM |
733 | std rWORD8_SHIFT, -40(r1) |
734 | cfi_offset(rWORD8_SHIFT, -40) | |
735 | /* Adjust the logical start of rSTR2 to compensate for the extra bits | |
04067002 | 736 | in the 1st rSTR1 DW. */ |
fe6e95d7 | 737 | sub rWORD8_SHIFT, rSTR2, r12 |
04067002 UD |
738 | /* But do not attempt to address the DW before that DW that contains |
739 | the actual start of rSTR2. */ | |
740 | clrrdi rSTR2, rSTR2, 3 | |
fe6e95d7 | 741 | std rWORD2_SHIFT, -48(r1) |
fe6e95d7 | 742 | /* Compute the left/right shift counts for the unaligned rSTR2, |
9c84384c | 743 | compensating for the logical (DW aligned) start of rSTR1. */ |
fe6e95d7 | 744 | clrldi rSHL, rWORD8_SHIFT, 61 |
9c84384c | 745 | clrrdi rSTR1, rSTR1, 3 |
fe6e95d7 | 746 | std rWORD4_SHIFT, -56(r1) |
04067002 | 747 | sldi rSHL, rSHL, 3 |
fe6e95d7 AM |
748 | cmpld cr5, rWORD8_SHIFT, rSTR2 |
749 | add rN, rN, r12 | |
750 | sldi rWORD6, r12, 3 | |
751 | std rWORD6_SHIFT, -64(r1) | |
869d7180 RS |
752 | cfi_offset(rWORD2_SHIFT, -48) |
753 | cfi_offset(rWORD4_SHIFT, -56) | |
fe6e95d7 | 754 | cfi_offset(rWORD6_SHIFT, -64) |
04067002 | 755 | subfic rSHR, rSHL, 64 |
fe6e95d7 AM |
756 | srdi r0, rN, 5 /* Divide by 32 */ |
757 | andi. r12, rN, 24 /* Get the DW remainder */ | |
04067002 UD |
758 | /* We normally need to load 2 DWs to start the unaligned rSTR2, but in |
759 | this special case those bits may be discarded anyway. Also we | |
760 | must avoid loading a DW where none of the bits are part of rSTR2 as | |
761 | this may cross a page boundary and cause a page fault. */ | |
762 | li rWORD8, 0 | |
763 | blt cr5, L(dus0) | |
fe6e95d7 AM |
764 | #ifdef __LITTLE_ENDIAN__ |
765 | ldbrx rWORD8, 0, rSTR2 | |
766 | addi rSTR2, rSTR2, 8 | |
767 | #else | |
04067002 | 768 | ld rWORD8, 0(rSTR2) |
fe6e95d7 AM |
769 | addi rSTR2, rSTR2, 8 |
770 | #endif | |
04067002 UD |
771 | sld rWORD8, rWORD8, rSHL |
772 | ||
773 | L(dus0): | |
fe6e95d7 AM |
774 | #ifdef __LITTLE_ENDIAN__ |
775 | ldbrx rWORD1, 0, rSTR1 | |
776 | ldbrx rWORD2, 0, rSTR2 | |
777 | addi rSTR1, rSTR1, 8 | |
778 | addi rSTR2, rSTR2, 8 | |
779 | #else | |
04067002 UD |
780 | ld rWORD1, 0(rSTR1) |
781 | ld rWORD2, 0(rSTR2) | |
fe6e95d7 AM |
782 | #endif |
783 | cmpldi cr1, r12, 16 | |
04067002 | 784 | cmpldi cr7, rN, 32 |
fe6e95d7 | 785 | srd r12, rWORD2, rSHR |
04067002 UD |
786 | clrldi rN, rN, 61 |
787 | beq L(duPs4) | |
fe6e95d7 AM |
788 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
789 | or rWORD8, r12, rWORD8 | |
04067002 UD |
790 | bgt cr1, L(duPs3) |
791 | beq cr1, L(duPs2) | |
792 | ||
793 | /* Remainder is 8 */ | |
fe6e95d7 | 794 | .align 4 |
04067002 | 795 | L(dusP1): |
fe6e95d7 AM |
796 | sld rWORD8_SHIFT, rWORD2, rSHL |
797 | sld rWORD7, rWORD1, rWORD6 | |
798 | sld rWORD8, rWORD8, rWORD6 | |
04067002 UD |
799 | bge cr7, L(duP1e) |
800 | /* At this point we exit early with the first double word compare | |
801 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
802 | how we handle the remaining bytes. */ | |
803 | cmpld cr5, rWORD7, rWORD8 | |
804 | sldi. rN, rN, 3 | |
805 | bne cr5, L(duLcr5) | |
806 | cmpld cr7, rN, rSHR | |
807 | beq L(duZeroReturn) | |
fe6e95d7 | 808 | li r0, 0 |
04067002 | 809 | ble cr7, L(dutrim) |
fe6e95d7 AM |
810 | #ifdef __LITTLE_ENDIAN__ |
811 | ldbrx rWORD2, 0, rSTR2 | |
812 | addi rSTR2, rSTR2, 8 | |
813 | #else | |
04067002 | 814 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
815 | #endif |
816 | srd r0, rWORD2, rSHR | |
04067002 UD |
817 | b L(dutrim) |
818 | /* Remainder is 16 */ | |
fe6e95d7 | 819 | .align 4 |
04067002 | 820 | L(duPs2): |
fe6e95d7 AM |
821 | sld rWORD6_SHIFT, rWORD2, rSHL |
822 | sld rWORD5, rWORD1, rWORD6 | |
823 | sld rWORD6, rWORD8, rWORD6 | |
04067002 UD |
824 | b L(duP2e) |
825 | /* Remainder is 24 */ | |
fe6e95d7 | 826 | .align 4 |
04067002 | 827 | L(duPs3): |
fe6e95d7 AM |
828 | sld rWORD4_SHIFT, rWORD2, rSHL |
829 | sld rWORD3, rWORD1, rWORD6 | |
830 | sld rWORD4, rWORD8, rWORD6 | |
04067002 UD |
831 | b L(duP3e) |
832 | /* Count is a multiple of 32, remainder is 0 */ | |
fe6e95d7 | 833 | .align 4 |
04067002 | 834 | L(duPs4): |
fe6e95d7 AM |
835 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
836 | or rWORD8, r12, rWORD8 | |
837 | sld rWORD2_SHIFT, rWORD2, rSHL | |
838 | sld rWORD1, rWORD1, rWORD6 | |
839 | sld rWORD2, rWORD8, rWORD6 | |
04067002 UD |
840 | b L(duP4e) |
841 | ||
842 | /* At this point we know rSTR1 is double word aligned and the | |
843 | compare length is at least 8 bytes. */ | |
fe6e95d7 | 844 | .align 4 |
04067002 | 845 | L(DWunaligned): |
fe6e95d7 | 846 | std rWORD8_SHIFT, -40(r1) |
04067002 | 847 | clrrdi rSTR2, rSTR2, 3 |
fe6e95d7 | 848 | std rWORD2_SHIFT, -48(r1) |
fe6e95d7 AM |
849 | srdi r0, rN, 5 /* Divide by 32 */ |
850 | std rWORD4_SHIFT, -56(r1) | |
fe6e95d7 AM |
851 | andi. r12, rN, 24 /* Get the DW remainder */ |
852 | std rWORD6_SHIFT, -64(r1) | |
869d7180 RS |
853 | cfi_offset(rWORD8_SHIFT, -40) |
854 | cfi_offset(rWORD2_SHIFT, -48) | |
855 | cfi_offset(rWORD4_SHIFT, -56) | |
fe6e95d7 | 856 | cfi_offset(rWORD6_SHIFT, -64) |
04067002 | 857 | sldi rSHL, rSHL, 3 |
fe6e95d7 AM |
858 | #ifdef __LITTLE_ENDIAN__ |
859 | ldbrx rWORD6, 0, rSTR2 | |
860 | addi rSTR2, rSTR2, 8 | |
861 | ldbrx rWORD8, 0, rSTR2 | |
862 | addi rSTR2, rSTR2, 8 | |
863 | #else | |
04067002 UD |
864 | ld rWORD6, 0(rSTR2) |
865 | ldu rWORD8, 8(rSTR2) | |
fe6e95d7 AM |
866 | #endif |
867 | cmpldi cr1, r12, 16 | |
04067002 UD |
868 | cmpldi cr7, rN, 32 |
869 | clrldi rN, rN, 61 | |
870 | subfic rSHR, rSHL, 64 | |
fe6e95d7 | 871 | sld rWORD6_SHIFT, rWORD6, rSHL |
04067002 | 872 | beq L(duP4) |
fe6e95d7 | 873 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
04067002 UD |
874 | bgt cr1, L(duP3) |
875 | beq cr1, L(duP2) | |
9c84384c | 876 | |
04067002 | 877 | /* Remainder is 8 */ |
fe6e95d7 | 878 | .align 4 |
04067002 | 879 | L(duP1): |
fe6e95d7 AM |
880 | srd r12, rWORD8, rSHR |
881 | #ifdef __LITTLE_ENDIAN__ | |
882 | ldbrx rWORD7, 0, rSTR1 | |
883 | addi rSTR1, rSTR1, 8 | |
884 | #else | |
04067002 | 885 | ld rWORD7, 0(rSTR1) |
fe6e95d7 AM |
886 | #endif |
887 | sld rWORD8_SHIFT, rWORD8, rSHL | |
888 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 UD |
889 | blt cr7, L(duP1x) |
890 | L(duP1e): | |
fe6e95d7 AM |
891 | #ifdef __LITTLE_ENDIAN__ |
892 | ldbrx rWORD1, 0, rSTR1 | |
893 | ldbrx rWORD2, 0, rSTR2 | |
894 | addi rSTR1, rSTR1, 8 | |
895 | addi rSTR2, rSTR2, 8 | |
896 | #else | |
04067002 UD |
897 | ld rWORD1, 8(rSTR1) |
898 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 899 | #endif |
04067002 | 900 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
901 | srd r0, rWORD2, rSHR |
902 | sld rWORD2_SHIFT, rWORD2, rSHL | |
903 | or rWORD2, r0, rWORD8_SHIFT | |
904 | #ifdef __LITTLE_ENDIAN__ | |
905 | ldbrx rWORD3, 0, rSTR1 | |
906 | ldbrx rWORD4, 0, rSTR2 | |
907 | addi rSTR1, rSTR1, 8 | |
908 | addi rSTR2, rSTR2, 8 | |
909 | #else | |
04067002 UD |
910 | ld rWORD3, 16(rSTR1) |
911 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 AM |
912 | #endif |
913 | cmpld cr7, rWORD1, rWORD2 | |
914 | srd r12, rWORD4, rSHR | |
915 | sld rWORD4_SHIFT, rWORD4, rSHL | |
04067002 | 916 | bne cr5, L(duLcr5) |
fe6e95d7 AM |
917 | or rWORD4, r12, rWORD2_SHIFT |
918 | #ifdef __LITTLE_ENDIAN__ | |
919 | ldbrx rWORD5, 0, rSTR1 | |
920 | ldbrx rWORD6, 0, rSTR2 | |
921 | addi rSTR1, rSTR1, 8 | |
922 | addi rSTR2, rSTR2, 8 | |
923 | #else | |
04067002 UD |
924 | ld rWORD5, 24(rSTR1) |
925 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 926 | #endif |
04067002 | 927 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
928 | srd r0, rWORD6, rSHR |
929 | sld rWORD6_SHIFT, rWORD6, rSHL | |
930 | bne cr7, L(duLcr7) | |
931 | or rWORD6, r0, rWORD4_SHIFT | |
04067002 | 932 | cmpld cr6, rWORD5, rWORD6 |
9c84384c | 933 | b L(duLoop3) |
fe6e95d7 | 934 | .align 4 |
04067002 UD |
935 | /* At this point we exit early with the first double word compare |
936 | complete and remainder of 0 to 7 bytes. See L(du14) for details on | |
937 | how we handle the remaining bytes. */ | |
938 | L(duP1x): | |
939 | cmpld cr5, rWORD7, rWORD8 | |
940 | sldi. rN, rN, 3 | |
941 | bne cr5, L(duLcr5) | |
942 | cmpld cr7, rN, rSHR | |
943 | beq L(duZeroReturn) | |
fe6e95d7 | 944 | li r0, 0 |
04067002 | 945 | ble cr7, L(dutrim) |
fe6e95d7 AM |
946 | #ifdef __LITTLE_ENDIAN__ |
947 | ldbrx rWORD2, 0, rSTR2 | |
948 | addi rSTR2, rSTR2, 8 | |
949 | #else | |
04067002 | 950 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
951 | #endif |
952 | srd r0, rWORD2, rSHR | |
04067002 UD |
953 | b L(dutrim) |
954 | /* Remainder is 16 */ | |
fe6e95d7 | 955 | .align 4 |
04067002 | 956 | L(duP2): |
fe6e95d7 AM |
957 | srd r0, rWORD8, rSHR |
958 | #ifdef __LITTLE_ENDIAN__ | |
959 | ldbrx rWORD5, 0, rSTR1 | |
960 | addi rSTR1, rSTR1, 8 | |
961 | #else | |
04067002 | 962 | ld rWORD5, 0(rSTR1) |
fe6e95d7 AM |
963 | #endif |
964 | or rWORD6, r0, rWORD6_SHIFT | |
965 | sld rWORD6_SHIFT, rWORD8, rSHL | |
04067002 | 966 | L(duP2e): |
fe6e95d7 AM |
967 | #ifdef __LITTLE_ENDIAN__ |
968 | ldbrx rWORD7, 0, rSTR1 | |
969 | ldbrx rWORD8, 0, rSTR2 | |
970 | addi rSTR1, rSTR1, 8 | |
971 | addi rSTR2, rSTR2, 8 | |
972 | #else | |
04067002 UD |
973 | ld rWORD7, 8(rSTR1) |
974 | ld rWORD8, 8(rSTR2) | |
fe6e95d7 | 975 | #endif |
04067002 | 976 | cmpld cr6, rWORD5, rWORD6 |
fe6e95d7 AM |
977 | srd r12, rWORD8, rSHR |
978 | sld rWORD8_SHIFT, rWORD8, rSHL | |
979 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 | 980 | blt cr7, L(duP2x) |
fe6e95d7 AM |
981 | #ifdef __LITTLE_ENDIAN__ |
982 | ldbrx rWORD1, 0, rSTR1 | |
983 | ldbrx rWORD2, 0, rSTR2 | |
984 | addi rSTR1, rSTR1, 8 | |
985 | addi rSTR2, rSTR2, 8 | |
986 | #else | |
04067002 UD |
987 | ld rWORD1, 16(rSTR1) |
988 | ld rWORD2, 16(rSTR2) | |
fe6e95d7 | 989 | #endif |
04067002 UD |
990 | cmpld cr5, rWORD7, rWORD8 |
991 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
992 | srd r0, rWORD2, rSHR |
993 | sld rWORD2_SHIFT, rWORD2, rSHL | |
994 | or rWORD2, r0, rWORD8_SHIFT | |
995 | #ifdef __LITTLE_ENDIAN__ | |
996 | ldbrx rWORD3, 0, rSTR1 | |
997 | ldbrx rWORD4, 0, rSTR2 | |
998 | addi rSTR1, rSTR1, 8 | |
999 | addi rSTR2, rSTR2, 8 | |
1000 | #else | |
04067002 UD |
1001 | ld rWORD3, 24(rSTR1) |
1002 | ld rWORD4, 24(rSTR2) | |
fe6e95d7 AM |
1003 | #endif |
1004 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 1005 | bne cr5, L(duLcr5) |
fe6e95d7 AM |
1006 | srd r12, rWORD4, rSHR |
1007 | sld rWORD4_SHIFT, rWORD4, rSHL | |
1008 | or rWORD4, r12, rWORD2_SHIFT | |
1009 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
1010 | addi rSTR1, rSTR1, 8 |
1011 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 1012 | #endif |
04067002 UD |
1013 | cmpld cr1, rWORD3, rWORD4 |
1014 | b L(duLoop2) | |
fe6e95d7 | 1015 | .align 4 |
04067002 UD |
1016 | L(duP2x): |
1017 | cmpld cr5, rWORD7, rWORD8 | |
fe6e95d7 | 1018 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
1019 | addi rSTR1, rSTR1, 8 |
1020 | addi rSTR2, rSTR2, 8 | |
fe6e95d7 | 1021 | #endif |
04067002 UD |
1022 | bne cr6, L(duLcr6) |
1023 | sldi. rN, rN, 3 | |
1024 | bne cr5, L(duLcr5) | |
1025 | cmpld cr7, rN, rSHR | |
1026 | beq L(duZeroReturn) | |
fe6e95d7 | 1027 | li r0, 0 |
04067002 | 1028 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1029 | #ifdef __LITTLE_ENDIAN__ |
1030 | ldbrx rWORD2, 0, rSTR2 | |
1031 | addi rSTR2, rSTR2, 8 | |
1032 | #else | |
04067002 | 1033 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1034 | #endif |
1035 | srd r0, rWORD2, rSHR | |
04067002 | 1036 | b L(dutrim) |
9c84384c | 1037 | |
04067002 | 1038 | /* Remainder is 24 */ |
fe6e95d7 | 1039 | .align 4 |
04067002 | 1040 | L(duP3): |
fe6e95d7 AM |
1041 | srd r12, rWORD8, rSHR |
1042 | #ifdef __LITTLE_ENDIAN__ | |
1043 | ldbrx rWORD3, 0, rSTR1 | |
1044 | addi rSTR1, rSTR1, 8 | |
1045 | #else | |
04067002 | 1046 | ld rWORD3, 0(rSTR1) |
fe6e95d7 AM |
1047 | #endif |
1048 | sld rWORD4_SHIFT, rWORD8, rSHL | |
1049 | or rWORD4, r12, rWORD6_SHIFT | |
04067002 | 1050 | L(duP3e): |
fe6e95d7 AM |
1051 | #ifdef __LITTLE_ENDIAN__ |
1052 | ldbrx rWORD5, 0, rSTR1 | |
1053 | ldbrx rWORD6, 0, rSTR2 | |
1054 | addi rSTR1, rSTR1, 8 | |
1055 | addi rSTR2, rSTR2, 8 | |
1056 | #else | |
04067002 UD |
1057 | ld rWORD5, 8(rSTR1) |
1058 | ld rWORD6, 8(rSTR2) | |
fe6e95d7 | 1059 | #endif |
04067002 | 1060 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
1061 | srd r0, rWORD6, rSHR |
1062 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1063 | or rWORD6, r0, rWORD4_SHIFT | |
1064 | #ifdef __LITTLE_ENDIAN__ | |
1065 | ldbrx rWORD7, 0, rSTR1 | |
1066 | ldbrx rWORD8, 0, rSTR2 | |
1067 | addi rSTR1, rSTR1, 8 | |
1068 | addi rSTR2, rSTR2, 8 | |
1069 | #else | |
04067002 UD |
1070 | ld rWORD7, 16(rSTR1) |
1071 | ld rWORD8, 16(rSTR2) | |
fe6e95d7 | 1072 | #endif |
04067002 UD |
1073 | cmpld cr6, rWORD5, rWORD6 |
1074 | bne cr1, L(duLcr1) | |
fe6e95d7 AM |
1075 | srd r12, rWORD8, rSHR |
1076 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1077 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 | 1078 | blt cr7, L(duP3x) |
fe6e95d7 AM |
1079 | #ifdef __LITTLE_ENDIAN__ |
1080 | ldbrx rWORD1, 0, rSTR1 | |
1081 | ldbrx rWORD2, 0, rSTR2 | |
1082 | addi rSTR1, rSTR1, 8 | |
1083 | addi rSTR2, rSTR2, 8 | |
1084 | #else | |
04067002 UD |
1085 | ld rWORD1, 24(rSTR1) |
1086 | ld rWORD2, 24(rSTR2) | |
fe6e95d7 | 1087 | #endif |
04067002 UD |
1088 | cmpld cr5, rWORD7, rWORD8 |
1089 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
1090 | srd r0, rWORD2, rSHR |
1091 | sld rWORD2_SHIFT, rWORD2, rSHL | |
1092 | or rWORD2, r0, rWORD8_SHIFT | |
1093 | #ifndef __LITTLE_ENDIAN__ | |
04067002 UD |
1094 | addi rSTR1, rSTR1, 16 |
1095 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
1096 | #endif |
1097 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 1098 | b L(duLoop1) |
fe6e95d7 | 1099 | .align 4 |
04067002 | 1100 | L(duP3x): |
fe6e95d7 | 1101 | #ifndef __LITTLE_ENDIAN__ |
04067002 UD |
1102 | addi rSTR1, rSTR1, 16 |
1103 | addi rSTR2, rSTR2, 16 | |
fe6e95d7 AM |
1104 | #endif |
1105 | #if 0 | |
1106 | /* Huh? We've already branched on cr1! */ | |
04067002 | 1107 | bne cr1, L(duLcr1) |
fe6e95d7 | 1108 | #endif |
04067002 UD |
1109 | cmpld cr5, rWORD7, rWORD8 |
1110 | bne cr6, L(duLcr6) | |
1111 | sldi. rN, rN, 3 | |
1112 | bne cr5, L(duLcr5) | |
1113 | cmpld cr7, rN, rSHR | |
1114 | beq L(duZeroReturn) | |
fe6e95d7 | 1115 | li r0, 0 |
04067002 | 1116 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1117 | #ifdef __LITTLE_ENDIAN__ |
1118 | ldbrx rWORD2, 0, rSTR2 | |
1119 | addi rSTR2, rSTR2, 8 | |
1120 | #else | |
04067002 | 1121 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1122 | #endif |
1123 | srd r0, rWORD2, rSHR | |
04067002 | 1124 | b L(dutrim) |
9c84384c | 1125 | |
04067002 | 1126 | /* Count is a multiple of 32, remainder is 0 */ |
fe6e95d7 | 1127 | .align 4 |
04067002 | 1128 | L(duP4): |
fe6e95d7 AM |
1129 | mtctr r0 /* Power4 wants mtctr 1st in dispatch group */ |
1130 | srd r0, rWORD8, rSHR | |
1131 | #ifdef __LITTLE_ENDIAN__ | |
1132 | ldbrx rWORD1, 0, rSTR1 | |
1133 | addi rSTR1, rSTR1, 8 | |
1134 | #else | |
04067002 | 1135 | ld rWORD1, 0(rSTR1) |
fe6e95d7 AM |
1136 | #endif |
1137 | sld rWORD2_SHIFT, rWORD8, rSHL | |
1138 | or rWORD2, r0, rWORD6_SHIFT | |
04067002 | 1139 | L(duP4e): |
fe6e95d7 AM |
1140 | #ifdef __LITTLE_ENDIAN__ |
1141 | ldbrx rWORD3, 0, rSTR1 | |
1142 | ldbrx rWORD4, 0, rSTR2 | |
1143 | addi rSTR1, rSTR1, 8 | |
1144 | addi rSTR2, rSTR2, 8 | |
1145 | #else | |
04067002 UD |
1146 | ld rWORD3, 8(rSTR1) |
1147 | ld rWORD4, 8(rSTR2) | |
fe6e95d7 AM |
1148 | #endif |
1149 | cmpld cr7, rWORD1, rWORD2 | |
1150 | srd r12, rWORD4, rSHR | |
1151 | sld rWORD4_SHIFT, rWORD4, rSHL | |
1152 | or rWORD4, r12, rWORD2_SHIFT | |
1153 | #ifdef __LITTLE_ENDIAN__ | |
1154 | ldbrx rWORD5, 0, rSTR1 | |
1155 | ldbrx rWORD6, 0, rSTR2 | |
1156 | addi rSTR1, rSTR1, 8 | |
1157 | addi rSTR2, rSTR2, 8 | |
1158 | #else | |
04067002 UD |
1159 | ld rWORD5, 16(rSTR1) |
1160 | ld rWORD6, 16(rSTR2) | |
fe6e95d7 | 1161 | #endif |
04067002 | 1162 | cmpld cr1, rWORD3, rWORD4 |
fe6e95d7 AM |
1163 | bne cr7, L(duLcr7) |
1164 | srd r0, rWORD6, rSHR | |
1165 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1166 | or rWORD6, r0, rWORD4_SHIFT | |
1167 | #ifdef __LITTLE_ENDIAN__ | |
1168 | ldbrx rWORD7, 0, rSTR1 | |
1169 | ldbrx rWORD8, 0, rSTR2 | |
1170 | addi rSTR1, rSTR1, 8 | |
1171 | addi rSTR2, rSTR2, 8 | |
1172 | #else | |
04067002 UD |
1173 | ldu rWORD7, 24(rSTR1) |
1174 | ldu rWORD8, 24(rSTR2) | |
fe6e95d7 | 1175 | #endif |
04067002 UD |
1176 | cmpld cr6, rWORD5, rWORD6 |
1177 | bne cr1, L(duLcr1) | |
fe6e95d7 AM |
1178 | srd r12, rWORD8, rSHR |
1179 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1180 | or rWORD8, r12, rWORD6_SHIFT | |
04067002 UD |
1181 | cmpld cr5, rWORD7, rWORD8 |
1182 | bdz- L(du24) /* Adjust CTR as we start with +4 */ | |
1183 | /* This is the primary loop */ | |
fe6e95d7 | 1184 | .align 4 |
04067002 | 1185 | L(duLoop): |
fe6e95d7 AM |
1186 | #ifdef __LITTLE_ENDIAN__ |
1187 | ldbrx rWORD1, 0, rSTR1 | |
1188 | ldbrx rWORD2, 0, rSTR2 | |
1189 | addi rSTR1, rSTR1, 8 | |
1190 | addi rSTR2, rSTR2, 8 | |
1191 | #else | |
04067002 UD |
1192 | ld rWORD1, 8(rSTR1) |
1193 | ld rWORD2, 8(rSTR2) | |
fe6e95d7 | 1194 | #endif |
04067002 UD |
1195 | cmpld cr1, rWORD3, rWORD4 |
1196 | bne cr6, L(duLcr6) | |
fe6e95d7 AM |
1197 | srd r0, rWORD2, rSHR |
1198 | sld rWORD2_SHIFT, rWORD2, rSHL | |
1199 | or rWORD2, r0, rWORD8_SHIFT | |
04067002 | 1200 | L(duLoop1): |
fe6e95d7 AM |
1201 | #ifdef __LITTLE_ENDIAN__ |
1202 | ldbrx rWORD3, 0, rSTR1 | |
1203 | ldbrx rWORD4, 0, rSTR2 | |
1204 | addi rSTR1, rSTR1, 8 | |
1205 | addi rSTR2, rSTR2, 8 | |
1206 | #else | |
04067002 UD |
1207 | ld rWORD3, 16(rSTR1) |
1208 | ld rWORD4, 16(rSTR2) | |
fe6e95d7 | 1209 | #endif |
04067002 UD |
1210 | cmpld cr6, rWORD5, rWORD6 |
1211 | bne cr5, L(duLcr5) | |
fe6e95d7 AM |
1212 | srd r12, rWORD4, rSHR |
1213 | sld rWORD4_SHIFT, rWORD4, rSHL | |
1214 | or rWORD4, r12, rWORD2_SHIFT | |
04067002 | 1215 | L(duLoop2): |
fe6e95d7 AM |
1216 | #ifdef __LITTLE_ENDIAN__ |
1217 | ldbrx rWORD5, 0, rSTR1 | |
1218 | ldbrx rWORD6, 0, rSTR2 | |
1219 | addi rSTR1, rSTR1, 8 | |
1220 | addi rSTR2, rSTR2, 8 | |
1221 | #else | |
04067002 UD |
1222 | ld rWORD5, 24(rSTR1) |
1223 | ld rWORD6, 24(rSTR2) | |
fe6e95d7 | 1224 | #endif |
04067002 | 1225 | cmpld cr5, rWORD7, rWORD8 |
fe6e95d7 AM |
1226 | bne cr7, L(duLcr7) |
1227 | srd r0, rWORD6, rSHR | |
1228 | sld rWORD6_SHIFT, rWORD6, rSHL | |
1229 | or rWORD6, r0, rWORD4_SHIFT | |
04067002 | 1230 | L(duLoop3): |
fe6e95d7 AM |
1231 | #ifdef __LITTLE_ENDIAN__ |
1232 | ldbrx rWORD7, 0, rSTR1 | |
1233 | ldbrx rWORD8, 0, rSTR2 | |
1234 | addi rSTR1, rSTR1, 8 | |
1235 | addi rSTR2, rSTR2, 8 | |
1236 | #else | |
04067002 UD |
1237 | ldu rWORD7, 32(rSTR1) |
1238 | ldu rWORD8, 32(rSTR2) | |
fe6e95d7 AM |
1239 | #endif |
1240 | cmpld cr7, rWORD1, rWORD2 | |
04067002 | 1241 | bne- cr1, L(duLcr1) |
fe6e95d7 AM |
1242 | srd r12, rWORD8, rSHR |
1243 | sld rWORD8_SHIFT, rWORD8, rSHL | |
1244 | or rWORD8, r12, rWORD6_SHIFT | |
9c84384c JM |
1245 | bdnz+ L(duLoop) |
1246 | ||
04067002 | 1247 | L(duL4): |
fe6e95d7 AM |
1248 | #if 0 |
1249 | /* Huh? We've already branched on cr1! */ | |
04067002 | 1250 | bne cr1, L(duLcr1) |
fe6e95d7 | 1251 | #endif |
04067002 UD |
1252 | cmpld cr1, rWORD3, rWORD4 |
1253 | bne cr6, L(duLcr6) | |
1254 | cmpld cr6, rWORD5, rWORD6 | |
1255 | bne cr5, L(duLcr5) | |
1256 | cmpld cr5, rWORD7, rWORD8 | |
1257 | L(du44): | |
fe6e95d7 | 1258 | bne cr7, L(duLcr7) |
04067002 UD |
1259 | L(du34): |
1260 | bne cr1, L(duLcr1) | |
1261 | L(du24): | |
1262 | bne cr6, L(duLcr6) | |
1263 | L(du14): | |
1264 | sldi. rN, rN, 3 | |
1265 | bne cr5, L(duLcr5) | |
1266 | /* At this point we have a remainder of 1 to 7 bytes to compare. We use | |
2ccdea26 | 1267 | shift right double to eliminate bits beyond the compare length. |
04067002 | 1268 | |
9c84384c | 1269 | However it may not be safe to load rWORD2 which may be beyond the |
04067002 UD |
1270 | string length. So we compare the bit length of the remainder to |
1271 | the right shift count (rSHR). If the bit count is less than or equal | |
1272 | we do not need to load rWORD2 (all significant bits are already in | |
fe6e95d7 | 1273 | rWORD8_SHIFT). */ |
04067002 UD |
1274 | cmpld cr7, rN, rSHR |
1275 | beq L(duZeroReturn) | |
fe6e95d7 | 1276 | li r0, 0 |
04067002 | 1277 | ble cr7, L(dutrim) |
fe6e95d7 AM |
1278 | #ifdef __LITTLE_ENDIAN__ |
1279 | ldbrx rWORD2, 0, rSTR2 | |
1280 | addi rSTR2, rSTR2, 8 | |
1281 | #else | |
04067002 | 1282 | ld rWORD2, 8(rSTR2) |
fe6e95d7 AM |
1283 | #endif |
1284 | srd r0, rWORD2, rSHR | |
1285 | .align 4 | |
04067002 | 1286 | L(dutrim): |
fe6e95d7 AM |
1287 | #ifdef __LITTLE_ENDIAN__ |
1288 | ldbrx rWORD1, 0, rSTR1 | |
1289 | #else | |
04067002 | 1290 | ld rWORD1, 8(rSTR1) |
fe6e95d7 AM |
1291 | #endif |
1292 | ld rWORD8, -8(r1) | |
9c84384c | 1293 | subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ |
fe6e95d7 AM |
1294 | or rWORD2, r0, rWORD8_SHIFT |
1295 | ld rWORD7, -16(r1) | |
1296 | ld rSHL, -24(r1) | |
04067002 UD |
1297 | srd rWORD1, rWORD1, rN |
1298 | srd rWORD2, rWORD2, rN | |
fe6e95d7 AM |
1299 | ld rSHR, -32(r1) |
1300 | ld rWORD8_SHIFT, -40(r1) | |
04067002 | 1301 | li rRTN, 0 |
fe6e95d7 AM |
1302 | cmpld cr7, rWORD1, rWORD2 |
1303 | ld rWORD2_SHIFT, -48(r1) | |
1304 | ld rWORD4_SHIFT, -56(r1) | |
1305 | beq cr7, L(dureturn24) | |
04067002 | 1306 | li rRTN, 1 |
fe6e95d7 AM |
1307 | ld rWORD6_SHIFT, -64(r1) |
1308 | bgtlr cr7 | |
04067002 UD |
1309 | li rRTN, -1 |
1310 | blr | |
fe6e95d7 AM |
1311 | .align 4 |
1312 | L(duLcr7): | |
1313 | ld rWORD8, -8(r1) | |
1314 | ld rWORD7, -16(r1) | |
04067002 | 1315 | li rRTN, 1 |
fe6e95d7 AM |
1316 | bgt cr7, L(dureturn29) |
1317 | ld rSHL, -24(r1) | |
1318 | ld rSHR, -32(r1) | |
04067002 UD |
1319 | li rRTN, -1 |
1320 | b L(dureturn27) | |
fe6e95d7 | 1321 | .align 4 |
04067002 | 1322 | L(duLcr1): |
fe6e95d7 AM |
1323 | ld rWORD8, -8(r1) |
1324 | ld rWORD7, -16(r1) | |
04067002 | 1325 | li rRTN, 1 |
9c84384c | 1326 | bgt cr1, L(dureturn29) |
fe6e95d7 AM |
1327 | ld rSHL, -24(r1) |
1328 | ld rSHR, -32(r1) | |
04067002 UD |
1329 | li rRTN, -1 |
1330 | b L(dureturn27) | |
fe6e95d7 | 1331 | .align 4 |
04067002 | 1332 | L(duLcr6): |
fe6e95d7 AM |
1333 | ld rWORD8, -8(r1) |
1334 | ld rWORD7, -16(r1) | |
04067002 | 1335 | li rRTN, 1 |
9c84384c | 1336 | bgt cr6, L(dureturn29) |
fe6e95d7 AM |
1337 | ld rSHL, -24(r1) |
1338 | ld rSHR, -32(r1) | |
04067002 UD |
1339 | li rRTN, -1 |
1340 | b L(dureturn27) | |
fe6e95d7 | 1341 | .align 4 |
04067002 | 1342 | L(duLcr5): |
fe6e95d7 AM |
1343 | ld rWORD8, -8(r1) |
1344 | ld rWORD7, -16(r1) | |
04067002 | 1345 | li rRTN, 1 |
9c84384c | 1346 | bgt cr5, L(dureturn29) |
fe6e95d7 AM |
1347 | ld rSHL, -24(r1) |
1348 | ld rSHR, -32(r1) | |
04067002 UD |
1349 | li rRTN, -1 |
1350 | b L(dureturn27) | |
1351 | .align 3 | |
1352 | L(duZeroReturn): | |
fe6e95d7 | 1353 | li rRTN, 0 |
04067002 UD |
1354 | .align 4 |
1355 | L(dureturn): | |
fe6e95d7 AM |
1356 | ld rWORD8, -8(r1) |
1357 | ld rWORD7, -16(r1) | |
9c84384c | 1358 | L(dureturn29): |
fe6e95d7 AM |
1359 | ld rSHL, -24(r1) |
1360 | ld rSHR, -32(r1) | |
9c84384c | 1361 | L(dureturn27): |
fe6e95d7 | 1362 | ld rWORD8_SHIFT, -40(r1) |
9c84384c | 1363 | L(dureturn26): |
fe6e95d7 | 1364 | ld rWORD2_SHIFT, -48(r1) |
9c84384c | 1365 | L(dureturn25): |
fe6e95d7 | 1366 | ld rWORD4_SHIFT, -56(r1) |
04067002 | 1367 | L(dureturn24): |
fe6e95d7 | 1368 | ld rWORD6_SHIFT, -64(r1) |
04067002 UD |
1369 | blr |
1370 | L(duzeroLength): | |
fe6e95d7 | 1371 | li rRTN, 0 |
04067002 UD |
1372 | blr |
1373 | ||
b6a66222 | 1374 | END (MEMCMP) |
04067002 UD |
1375 | libc_hidden_builtin_def (memcmp) |
1376 | weak_alias (memcmp, bcmp) |