sysdeps/powerpc/powerpc64/power4/memcmp.S
1 /* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* int [r3] memcmp (const char *s1 [r3],
22 const char *s2 [r4],
23 size_t size [r5]) */
24
25 .machine power4
26 EALIGN (memcmp, 4, 0)
27 CALL_MCOUNT 3
28
29 #define rRTN r3
30 #define rSTR1 r3 /* first string arg */
31 #define rSTR2 r4 /* second string arg */
32 #define rN r5 /* max string length */
33 #define rWORD1 r6 /* current word in s1 */
34 #define rWORD2 r7 /* current word in s2 */
35 #define rWORD3 r8 /* next word in s1 */
36 #define rWORD4 r9 /* next word in s2 */
37 #define rWORD5 r10 /* next word in s1 */
38 #define rWORD6 r11 /* next word in s2 */
39 #define rWORD7 r30 /* next word in s1 */
40 #define rWORD8 r31 /* next word in s2 */
41
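/* Overview of the dispatch below: compares shorter than 12 bytes go to
   the byte-at-a-time loop at L(bytealigned); when both strings have the
   same doubleword alignment we run the doubleword loop at L(DWaligned)
   (with a shifted first DW if they are not DW aligned); otherwise
   L(unaligned) rebuilds each rSTR2 doubleword from two aligned loads and
   runs a similar loop.  In rough, illustrative C each doubleword test
   amounts to: if (a != b) return a > b ? 1 : -1; where a and b are the
   next 8 bytes of s1 and s2 read as big-endian unsigned integers. */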
42 xor r0, rSTR2, rSTR1
43 cmpldi cr6, rN, 0
44 cmpldi cr1, rN, 12
45 clrldi. r0, r0, 61
46 clrldi r12, rSTR1, 61
47 cmpldi cr5, r12, 0
48 beq- cr6, L(zeroLength)
49 dcbt 0, rSTR1
50 dcbt 0, rSTR2
51 /* If the compare length is less than 12 bytes, use the
52 byte compare loop. */
53 blt cr1, L(bytealigned)
54 std rWORD8, -8(r1)
55 std rWORD7, -16(r1)
56 cfi_offset(rWORD8, -8)
57 cfi_offset(rWORD7, -16)
58 bne L(unaligned)
59 /* At this point we know both strings have the same alignment and the
60 compare length is at least 8 bytes. r12 contains the low order
61 3 bits of rSTR1 and cr5 contains the result of the logical compare
62 of r12 to 0. If r12 == 0 then we are already double word
63 aligned and can perform the DW aligned loop.
64
65 Otherwise we know the two strings have the same alignment (but not
66 yet DW). So we force the string addresses to the next lower DW
67 boundary and special case this first DW using shift left to
68 eliminate bits preceding the first byte. Since we want to join the
69 normal (DW aligned) compare loop, starting at the second double word,
70 we need to adjust the length (rN) and special case the loop
71 versioning for the first DW. This ensures that the loop count is
72 correct and the first DW (shifted) is in the expected register pair. */
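/* Worked example: if rSTR1 ends in binary ...101 then r12 = 5, both
   addresses are rounded down by 5 bytes and rN is increased by 5 so the
   loop count still covers the same data.  rWORD6 = r12 * 8 = 40, and the
   sld below discards the 5 leading bytes that precede the real start of
   the strings, leaving the first 3 real bytes in the most significant
   positions for the unsigned compare. */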
73 .align 4
74 L(samealignment):
75 clrrdi rSTR1, rSTR1, 3
76 clrrdi rSTR2, rSTR2, 3
77 beq cr5, L(DWaligned)
78 add rN, rN, r12
79 sldi rWORD6, r12, 3
80 srdi r0, rN, 5 /* Divide by 32 */
81 andi. r12, rN, 24 /* Get the DW remainder */
82 #ifdef __LITTLE_ENDIAN__
83 ldbrx rWORD1, 0, rSTR1
84 ldbrx rWORD2, 0, rSTR2
85 addi rSTR1, rSTR1, 8
86 addi rSTR2, rSTR2, 8
87 #else
88 ld rWORD1, 0(rSTR1)
89 ld rWORD2, 0(rSTR2)
90 #endif
91 cmpldi cr1, r12, 16
92 cmpldi cr7, rN, 32
93 clrldi rN, rN, 61
94 beq L(dPs4)
95 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
96 bgt cr1, L(dPs3)
97 beq cr1, L(dPs2)
98
99 /* Remainder is 8 */
100 .align 3
101 L(dsP1):
102 sld rWORD5, rWORD1, rWORD6
103 sld rWORD6, rWORD2, rWORD6
104 cmpld cr5, rWORD5, rWORD6
105 blt cr7, L(dP1x)
106 /* Do something useful in this cycle since we have to branch anyway. */
107 #ifdef __LITTLE_ENDIAN__
108 ldbrx rWORD1, 0, rSTR1
109 ldbrx rWORD2, 0, rSTR2
110 addi rSTR1, rSTR1, 8
111 addi rSTR2, rSTR2, 8
112 #else
113 ld rWORD1, 8(rSTR1)
114 ld rWORD2, 8(rSTR2)
115 #endif
116 cmpld cr7, rWORD1, rWORD2
117 b L(dP1e)
118 /* Remainder is 16 */
119 .align 4
120 L(dPs2):
121 sld rWORD5, rWORD1, rWORD6
122 sld rWORD6, rWORD2, rWORD6
123 cmpld cr6, rWORD5, rWORD6
124 blt cr7, L(dP2x)
125 /* Do something useful in this cycle since we have to branch anyway. */
126 #ifdef __LITTLE_ENDIAN__
127 ldbrx rWORD7, 0, rSTR1
128 ldbrx rWORD8, 0, rSTR2
129 addi rSTR1, rSTR1, 8
130 addi rSTR2, rSTR2, 8
131 #else
132 ld rWORD7, 8(rSTR1)
133 ld rWORD8, 8(rSTR2)
134 #endif
135 cmpld cr5, rWORD7, rWORD8
136 b L(dP2e)
137 /* Remainder is 24 */
138 .align 4
139 L(dPs3):
140 sld rWORD3, rWORD1, rWORD6
141 sld rWORD4, rWORD2, rWORD6
142 cmpld cr1, rWORD3, rWORD4
143 b L(dP3e)
144 /* Count is a multiple of 32, remainder is 0 */
145 .align 4
146 L(dPs4):
147 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
148 sld rWORD1, rWORD1, rWORD6
149 sld rWORD2, rWORD2, rWORD6
150 cmpld cr7, rWORD1, rWORD2
151 b L(dP4e)
152
153 /* At this point we know both strings are double word aligned and the
154 compare length is at least 8 bytes. */
155 .align 4
156 L(DWaligned):
157 andi. r12, rN, 24 /* Get the DW remainder */
158 srdi r0, rN, 5 /* Divide by 32 */
159 cmpldi cr1, r12, 16
160 cmpldi cr7, rN, 32
161 clrldi rN, rN, 61
162 beq L(dP4)
163 bgt cr1, L(dP3)
164 beq cr1, L(dP2)
165
166 /* Remainder is 8 */
167 .align 4
168 L(dP1):
169 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
170 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
171 (8-15 byte compare), we want to use only volatile registers. This
172 means we can avoid restoring non-volatile registers since we did not
173 change any on the early exit path. The key here is that the non-early
174 exit path only cares about the condition code (cr5), not about which
175 register pair was used. */
176 #ifdef __LITTLE_ENDIAN__
177 ldbrx rWORD5, 0, rSTR1
178 ldbrx rWORD6, 0, rSTR2
179 addi rSTR1, rSTR1, 8
180 addi rSTR2, rSTR2, 8
181 #else
182 ld rWORD5, 0(rSTR1)
183 ld rWORD6, 0(rSTR2)
184 #endif
185 cmpld cr5, rWORD5, rWORD6
186 blt cr7, L(dP1x)
187 #ifdef __LITTLE_ENDIAN__
188 ldbrx rWORD1, 0, rSTR1
189 ldbrx rWORD2, 0, rSTR2
190 addi rSTR1, rSTR1, 8
191 addi rSTR2, rSTR2, 8
192 #else
193 ld rWORD1, 8(rSTR1)
194 ld rWORD2, 8(rSTR2)
195 #endif
196 cmpld cr7, rWORD1, rWORD2
197 L(dP1e):
198 #ifdef __LITTLE_ENDIAN__
199 ldbrx rWORD3, 0, rSTR1
200 ldbrx rWORD4, 0, rSTR2
201 addi rSTR1, rSTR1, 8
202 addi rSTR2, rSTR2, 8
203 #else
204 ld rWORD3, 16(rSTR1)
205 ld rWORD4, 16(rSTR2)
206 #endif
207 cmpld cr1, rWORD3, rWORD4
208 #ifdef __LITTLE_ENDIAN__
209 ldbrx rWORD5, 0, rSTR1
210 ldbrx rWORD6, 0, rSTR2
211 addi rSTR1, rSTR1, 8
212 addi rSTR2, rSTR2, 8
213 #else
214 ld rWORD5, 24(rSTR1)
215 ld rWORD6, 24(rSTR2)
216 #endif
217 cmpld cr6, rWORD5, rWORD6
218 bne cr5, L(dLcr5x)
219 bne cr7, L(dLcr7x)
220
221 #ifdef __LITTLE_ENDIAN__
222 ldbrx rWORD7, 0, rSTR1
223 ldbrx rWORD8, 0, rSTR2
224 addi rSTR1, rSTR1, 8
225 addi rSTR2, rSTR2, 8
226 #else
227 ldu rWORD7, 32(rSTR1)
228 ldu rWORD8, 32(rSTR2)
229 #endif
230 bne cr1, L(dLcr1)
231 cmpld cr5, rWORD7, rWORD8
232 bdnz L(dLoop)
233 bne cr6, L(dLcr6)
234 ld rWORD8, -8(r1)
235 ld rWORD7, -16(r1)
236 .align 3
237 L(dP1x):
238 sldi. r12, rN, 3
239 bne cr5, L(dLcr5x)
240 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
241 bne L(d00)
242 li rRTN, 0
243 blr
244
245 /* Remainder is 16 */
246 .align 4
247 L(dP2):
248 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
249 #ifdef __LITTLE_ENDIAN__
250 ldbrx rWORD5, 0, rSTR1
251 ldbrx rWORD6, 0, rSTR2
252 addi rSTR1, rSTR1, 8
253 addi rSTR2, rSTR2, 8
254 #else
255 ld rWORD5, 0(rSTR1)
256 ld rWORD6, 0(rSTR2)
257 #endif
258 cmpld cr6, rWORD5, rWORD6
259 blt cr7, L(dP2x)
260 #ifdef __LITTLE_ENDIAN__
261 ldbrx rWORD7, 0, rSTR1
262 ldbrx rWORD8, 0, rSTR2
263 addi rSTR1, rSTR1, 8
264 addi rSTR2, rSTR2, 8
265 #else
266 ld rWORD7, 8(rSTR1)
267 ld rWORD8, 8(rSTR2)
268 #endif
269 cmpld cr5, rWORD7, rWORD8
270 L(dP2e):
271 #ifdef __LITTLE_ENDIAN__
272 ldbrx rWORD1, 0, rSTR1
273 ldbrx rWORD2, 0, rSTR2
274 addi rSTR1, rSTR1, 8
275 addi rSTR2, rSTR2, 8
276 #else
277 ld rWORD1, 16(rSTR1)
278 ld rWORD2, 16(rSTR2)
279 #endif
280 cmpld cr7, rWORD1, rWORD2
281 #ifdef __LITTLE_ENDIAN__
282 ldbrx rWORD3, 0, rSTR1
283 ldbrx rWORD4, 0, rSTR2
284 addi rSTR1, rSTR1, 8
285 addi rSTR2, rSTR2, 8
286 #else
287 ld rWORD3, 24(rSTR1)
288 ld rWORD4, 24(rSTR2)
289 #endif
290 cmpld cr1, rWORD3, rWORD4
291 #ifndef __LITTLE_ENDIAN__
292 addi rSTR1, rSTR1, 8
293 addi rSTR2, rSTR2, 8
294 #endif
295 bne cr6, L(dLcr6)
296 bne cr5, L(dLcr5)
297 b L(dLoop2)
298 /* Again we are on an early exit path (16-23 byte compare), so we want
299 to use only volatile registers and avoid restoring non-volatile
300 registers. */
301 .align 4
302 L(dP2x):
303 #ifdef __LITTLE_ENDIAN__
304 ldbrx rWORD3, 0, rSTR1
305 ldbrx rWORD4, 0, rSTR2
306 addi rSTR1, rSTR1, 8
307 addi rSTR2, rSTR2, 8
308 #else
309 ld rWORD3, 8(rSTR1)
310 ld rWORD4, 8(rSTR2)
311 #endif
312 cmpld cr1, rWORD3, rWORD4
313 sldi. r12, rN, 3
314 bne cr6, L(dLcr6x)
315 #ifndef __LITTLE_ENDIAN__
316 addi rSTR1, rSTR1, 8
317 addi rSTR2, rSTR2, 8
318 #endif
319 bne cr1, L(dLcr1x)
320 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
321 bne L(d00)
322 li rRTN, 0
323 blr
324
325 /* Remainder is 24 */
326 .align 4
327 L(dP3):
328 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
329 #ifdef __LITTLE_ENDIAN__
330 ldbrx rWORD3, 0, rSTR1
331 ldbrx rWORD4, 0, rSTR2
332 addi rSTR1, rSTR1, 8
333 addi rSTR2, rSTR2, 8
334 #else
335 ld rWORD3, 0(rSTR1)
336 ld rWORD4, 0(rSTR2)
337 #endif
338 cmpld cr1, rWORD3, rWORD4
339 L(dP3e):
340 #ifdef __LITTLE_ENDIAN__
341 ldbrx rWORD5, 0, rSTR1
342 ldbrx rWORD6, 0, rSTR2
343 addi rSTR1, rSTR1, 8
344 addi rSTR2, rSTR2, 8
345 #else
346 ld rWORD5, 8(rSTR1)
347 ld rWORD6, 8(rSTR2)
348 #endif
349 cmpld cr6, rWORD5, rWORD6
350 blt cr7, L(dP3x)
351 #ifdef __LITTLE_ENDIAN__
352 ldbrx rWORD7, 0, rSTR1
353 ldbrx rWORD8, 0, rSTR2
354 addi rSTR1, rSTR1, 8
355 addi rSTR2, rSTR2, 8
356 #else
357 ld rWORD7, 16(rSTR1)
358 ld rWORD8, 16(rSTR2)
359 #endif
360 cmpld cr5, rWORD7, rWORD8
361 #ifdef __LITTLE_ENDIAN__
362 ldbrx rWORD1, 0, rSTR1
363 ldbrx rWORD2, 0, rSTR2
364 addi rSTR1, rSTR1, 8
365 addi rSTR2, rSTR2, 8
366 #else
367 ld rWORD1, 24(rSTR1)
368 ld rWORD2, 24(rSTR2)
369 #endif
370 cmpld cr7, rWORD1, rWORD2
371 #ifndef __LITTLE_ENDIAN__
372 addi rSTR1, rSTR1, 16
373 addi rSTR2, rSTR2, 16
374 #endif
375 bne cr1, L(dLcr1)
376 bne cr6, L(dLcr6)
377 b L(dLoop1)
378 /* Again we are on an early exit path (24-31 byte compare), so we want
379 to use only volatile registers and avoid restoring non-volatile
380 registers. */
381 .align 4
382 L(dP3x):
383 #ifdef __LITTLE_ENDIAN__
384 ldbrx rWORD1, 0, rSTR1
385 ldbrx rWORD2, 0, rSTR2
386 addi rSTR1, rSTR1, 8
387 addi rSTR2, rSTR2, 8
388 #else
389 ld rWORD1, 16(rSTR1)
390 ld rWORD2, 16(rSTR2)
391 #endif
392 cmpld cr7, rWORD1, rWORD2
393 sldi. r12, rN, 3
394 bne cr1, L(dLcr1x)
395 #ifndef __LITTLE_ENDIAN__
396 addi rSTR1, rSTR1, 16
397 addi rSTR2, rSTR2, 16
398 #endif
399 bne cr6, L(dLcr6x)
400 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
401 bne cr7, L(dLcr7x)
402 bne L(d00)
403 li rRTN, 0
404 blr
405
406 /* Count is a multiple of 32, remainder is 0 */
407 .align 4
408 L(dP4):
409 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
410 #ifdef __LITTLE_ENDIAN__
411 ldbrx rWORD1, 0, rSTR1
412 ldbrx rWORD2, 0, rSTR2
413 addi rSTR1, rSTR1, 8
414 addi rSTR2, rSTR2, 8
415 #else
416 ld rWORD1, 0(rSTR1)
417 ld rWORD2, 0(rSTR2)
418 #endif
419 cmpld cr7, rWORD1, rWORD2
420 L(dP4e):
421 #ifdef __LITTLE_ENDIAN__
422 ldbrx rWORD3, 0, rSTR1
423 ldbrx rWORD4, 0, rSTR2
424 addi rSTR1, rSTR1, 8
425 addi rSTR2, rSTR2, 8
426 #else
427 ld rWORD3, 8(rSTR1)
428 ld rWORD4, 8(rSTR2)
429 #endif
430 cmpld cr1, rWORD3, rWORD4
431 #ifdef __LITTLE_ENDIAN__
432 ldbrx rWORD5, 0, rSTR1
433 ldbrx rWORD6, 0, rSTR2
434 addi rSTR1, rSTR1, 8
435 addi rSTR2, rSTR2, 8
436 #else
437 ld rWORD5, 16(rSTR1)
438 ld rWORD6, 16(rSTR2)
439 #endif
440 cmpld cr6, rWORD5, rWORD6
441 #ifdef __LITTLE_ENDIAN__
442 ldbrx rWORD7, 0, rSTR1
443 ldbrx rWORD8, 0, rSTR2
444 addi rSTR1, rSTR1, 8
445 addi rSTR2, rSTR2, 8
446 #else
447 ldu rWORD7, 24(rSTR1)
448 ldu rWORD8, 24(rSTR2)
449 #endif
450 cmpld cr5, rWORD7, rWORD8
451 bne cr7, L(dLcr7)
452 bne cr1, L(dLcr1)
453 bdz- L(d24) /* Adjust CTR as we start with +4 */
454 /* This is the primary loop */
455 .align 4
456 L(dLoop):
457 #ifdef __LITTLE_ENDIAN__
458 ldbrx rWORD1, 0, rSTR1
459 ldbrx rWORD2, 0, rSTR2
460 addi rSTR1, rSTR1, 8
461 addi rSTR2, rSTR2, 8
462 #else
463 ld rWORD1, 8(rSTR1)
464 ld rWORD2, 8(rSTR2)
465 #endif
466 cmpld cr1, rWORD3, rWORD4
467 bne cr6, L(dLcr6)
468 L(dLoop1):
469 #ifdef __LITTLE_ENDIAN__
470 ldbrx rWORD3, 0, rSTR1
471 ldbrx rWORD4, 0, rSTR2
472 addi rSTR1, rSTR1, 8
473 addi rSTR2, rSTR2, 8
474 #else
475 ld rWORD3, 16(rSTR1)
476 ld rWORD4, 16(rSTR2)
477 #endif
478 cmpld cr6, rWORD5, rWORD6
479 bne cr5, L(dLcr5)
480 L(dLoop2):
481 #ifdef __LITTLE_ENDIAN__
482 ldbrx rWORD5, 0, rSTR1
483 ldbrx rWORD6, 0, rSTR2
484 addi rSTR1, rSTR1, 8
485 addi rSTR2, rSTR2, 8
486 #else
487 ld rWORD5, 24(rSTR1)
488 ld rWORD6, 24(rSTR2)
489 #endif
490 cmpld cr5, rWORD7, rWORD8
491 bne cr7, L(dLcr7)
492 L(dLoop3):
493 #ifdef __LITTLE_ENDIAN__
494 ldbrx rWORD7, 0, rSTR1
495 ldbrx rWORD8, 0, rSTR2
496 addi rSTR1, rSTR1, 8
497 addi rSTR2, rSTR2, 8
498 #else
499 ldu rWORD7, 32(rSTR1)
500 ldu rWORD8, 32(rSTR2)
501 #endif
502 bne- cr1, L(dLcr1)
503 cmpld cr7, rWORD1, rWORD2
504 bdnz+ L(dLoop)
505
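/* Loop epilogue: the software-pipelined loop above exits with several
   compares still pending in cr7, cr1, cr6 and cr5; finish testing them
   here before handling any remaining tail bytes. */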
506 L(dL4):
507 cmpld cr1, rWORD3, rWORD4
508 bne cr6, L(dLcr6)
509 cmpld cr6, rWORD5, rWORD6
510 bne cr5, L(dLcr5)
511 cmpld cr5, rWORD7, rWORD8
512 L(d44):
513 bne cr7, L(dLcr7)
514 L(d34):
515 bne cr1, L(dLcr1)
516 L(d24):
517 bne cr6, L(dLcr6)
518 L(d14):
519 sldi. r12, rN, 3
520 bne cr5, L(dLcr5)
521 L(d04):
522 ld rWORD8, -8(r1)
523 ld rWORD7, -16(r1)
524 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
525 beq L(zeroLength)
526 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
527 we are aligned, it is safe to load the whole double word, and use
528 shift right double to eliminate bits beyond the compare length. */
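/* Worked example: with 3 bytes left, r12 = 24 and rN = 64 - 24 = 40, so
   the srd below shifts both doublewords right by 40 bits, keeping only
   the 3 significant (lowest addressed) bytes, zero extended, and the
   unsigned compare then gives the correct memcmp ordering. */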
529 L(d00):
530 #ifdef __LITTLE_ENDIAN__
531 ldbrx rWORD1, 0, rSTR1
532 ldbrx rWORD2, 0, rSTR2
533 addi rSTR1, rSTR1, 8
534 addi rSTR2, rSTR2, 8
535 #else
536 ld rWORD1, 8(rSTR1)
537 ld rWORD2, 8(rSTR2)
538 #endif
539 srd rWORD1, rWORD1, rN
540 srd rWORD2, rWORD2, rN
541 cmpld cr7, rWORD1, rWORD2
542 bne cr7, L(dLcr7x)
543 li rRTN, 0
544 blr
545
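/* A difference was found: the exits below restore the saved non-volatile
   registers when they were used and turn the recorded unsigned compare
   result into the memcmp return value, +1 or -1. */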
546 .align 4
547 L(dLcr7):
548 ld rWORD8, -8(r1)
549 ld rWORD7, -16(r1)
550 L(dLcr7x):
551 li rRTN, 1
552 bgtlr cr7
553 li rRTN, -1
554 blr
555 .align 4
556 L(dLcr1):
557 ld rWORD8, -8(r1)
558 ld rWORD7, -16(r1)
559 L(dLcr1x):
560 li rRTN, 1
561 bgtlr cr1
562 li rRTN, -1
563 blr
564 .align 4
565 L(dLcr6):
566 ld rWORD8, -8(r1)
567 ld rWORD7, -16(r1)
568 L(dLcr6x):
569 li rRTN, 1
570 bgtlr cr6
571 li rRTN, -1
572 blr
573 .align 4
574 L(dLcr5):
575 ld rWORD8, -8(r1)
576 ld rWORD7, -16(r1)
577 L(dLcr5x):
578 li rRTN, 1
579 bgtlr cr5
580 li rRTN, -1
581 blr
582
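/* Byte-at-a-time compare, reached when the requested length is less
   than 12 bytes. */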
583 .align 4
584 L(bytealigned):
585 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
586 #if 0
587 /* Huh? We've already branched on cr6! */
588 beq- cr6, L(zeroLength)
589 #endif
590
591 /* We need to prime this loop. This loop is swing modulo scheduled
592 to avoid pipe delays. The dependent instruction latency (load to
593 compare to conditional branch) is 2 to 3 cycles. In this loop each
594 dispatch group ends in a branch and takes 1 cycle. Effectively
595 the first iteration of the loop only serves to load operands, and
596 branches based on the compares are delayed until the next iteration.
597
598 So we must precondition some registers and condition codes so that
599 we don't exit the loop early on the first iteration. */
600
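/* Prime the loop: load the first three byte pairs and set cr7 and cr1
   here so that the conditional branches inside L(bLoop) always test
   bytes that were loaded on an earlier pass. */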
601 lbz rWORD1, 0(rSTR1)
602 lbz rWORD2, 0(rSTR2)
603 bdz- L(b11)
604 cmpld cr7, rWORD1, rWORD2
605 lbz rWORD3, 1(rSTR1)
606 lbz rWORD4, 1(rSTR2)
607 bdz- L(b12)
608 cmpld cr1, rWORD3, rWORD4
609 lbzu rWORD5, 2(rSTR1)
610 lbzu rWORD6, 2(rSTR2)
611 bdz- L(b13)
612 .align 4
613 L(bLoop):
614 lbzu rWORD1, 1(rSTR1)
615 lbzu rWORD2, 1(rSTR2)
616 bne- cr7, L(bLcr7)
617
618 cmpld cr6, rWORD5, rWORD6
619 bdz- L(b3i)
620
621 lbzu rWORD3, 1(rSTR1)
622 lbzu rWORD4, 1(rSTR2)
623 bne- cr1, L(bLcr1)
624
625 cmpld cr7, rWORD1, rWORD2
626 bdz- L(b2i)
627
628 lbzu rWORD5, 1(rSTR1)
629 lbzu rWORD6, 1(rSTR2)
630 bne- cr6, L(bLcr6)
631
632 cmpld cr1, rWORD3, rWORD4
633 bdnz+ L(bLoop)
634
635 /* We speculatively load bytes before we have tested the previous
636 bytes, but we must avoid overrunning the length (in the ctr) so
637 that these speculative loads cannot cause a segfault. The loop
638 therefore exits as soon as the count runs out, possibly before all
639 pending bytes have been tested, and the pending compares must be
640 completed here before returning. */
641 L(b1i):
642 bne- cr7, L(bLcr7)
643 bne- cr1, L(bLcr1)
644 b L(bx56)
645 .align 4
646 L(b2i):
647 bne- cr6, L(bLcr6)
648 bne- cr7, L(bLcr7)
649 b L(bx34)
650 .align 4
651 L(b3i):
652 bne- cr1, L(bLcr1)
653 bne- cr6, L(bLcr6)
654 b L(bx12)
655 .align 4
656 L(bLcr7):
657 li rRTN, 1
658 bgtlr cr7
659 li rRTN, -1
660 blr
661 L(bLcr1):
662 li rRTN, 1
663 bgtlr cr1
664 li rRTN, -1
665 blr
666 L(bLcr6):
667 li rRTN, 1
668 bgtlr cr6
669 li rRTN, -1
670 blr
671
672 L(b13):
673 bne- cr7, L(bx12)
674 bne- cr1, L(bx34)
675 L(bx56):
676 sub rRTN, rWORD5, rWORD6
677 blr
678 nop
679 L(b12):
680 bne- cr7, L(bx12)
681 L(bx34):
682 sub rRTN, rWORD3, rWORD4
683 blr
684 L(b11):
685 L(bx12):
686 sub rRTN, rWORD1, rWORD2
687 blr
688 .align 4
689 L(zeroLength):
690 li rRTN, 0
691 blr
692
693 .align 4
694 /* At this point we know the strings have different alignment and the
695 compare length is at least 8 bytes. r12 contains the low order
696 3 bits of rSTR1 and cr5 contains the result of the logical compare
697 of r12 to 0. If r12 == 0 then rSTR1 is double word
698 aligned and we can perform the DWunaligned loop.
699
700 Otherwise we know that rSTR1 is not yet DW aligned.
701 So we can force the string addresses to the next lower DW
702 boundary and special case this first DW using shift left to
703 eliminate bits preceding the first byte. Since we want to join the
704 normal (DWaligned) compare loop, starting at the second double word,
705 we need to adjust the length (rN) and special case the loop
706 versioning for the first DW. This ensures that the loop count is
707 correct and the first DW (shifted) is in the expected register pair. */
708 #define rSHL r29 /* Unaligned shift left count. */
709 #define rSHR r28 /* Unaligned shift right count. */
710 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
711 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
712 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
713 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
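/* Worked example of the shift/merge used below: if the (adjusted) rSTR2
   offset within its doubleword is 3 bytes, then rSHL = 24 and rSHR = 40,
   and each logical rSTR2 doubleword is assembled as
   (earlier aligned DW << rSHL) | (next aligned DW >> rSHR), i.e. the
   tail of one aligned load merged with the head of the following one. */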
714 L(unaligned):
715 std rSHL, -24(r1)
716 cfi_offset(rSHL, -24)
717 clrldi rSHL, rSTR2, 61
718 beq- cr6, L(duzeroLength)
719 std rSHR, -32(r1)
720 cfi_offset(rSHR, -32)
721 beq cr5, L(DWunaligned)
722 std rWORD8_SHIFT, -40(r1)
723 cfi_offset(rWORD8_SHIFT, -40)
724 /* Adjust the logical start of rSTR2 to compensate for the extra bits
725 in the 1st rSTR1 DW. */
726 sub rWORD8_SHIFT, rSTR2, r12
727 /* But do not attempt to address the DW before the one that contains
728 the actual start of rSTR2. */
729 clrrdi rSTR2, rSTR2, 3
730 std rWORD2_SHIFT, -48(r1)
731 /* Compute the left/right shift counts for the unaligned rSTR2,
732 compensating for the logical (DW aligned) start of rSTR1. */
733 clrldi rSHL, rWORD8_SHIFT, 61
734 clrrdi rSTR1, rSTR1, 3
735 std rWORD4_SHIFT, -56(r1)
736 sldi rSHL, rSHL, 3
737 cmpld cr5, rWORD8_SHIFT, rSTR2
738 add rN, rN, r12
739 sldi rWORD6, r12, 3
740 std rWORD6_SHIFT, -64(r1)
741 cfi_offset(rWORD2_SHIFT, -48)
742 cfi_offset(rWORD4_SHIFT, -56)
743 cfi_offset(rWORD6_SHIFT, -64)
744 subfic rSHR, rSHL, 64
745 srdi r0, rN, 5 /* Divide by 32 */
746 andi. r12, rN, 24 /* Get the DW remainder */
747 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
748 this special case those bits may be discarded anyway. Also we
749 must avoid loading a DW where none of the bits are part of rSTR2 as
750 this may cross a page boundary and cause a page fault. */
751 li rWORD8, 0
752 blt cr5, L(dus0)
753 #ifdef __LITTLE_ENDIAN__
754 ldbrx rWORD8, 0, rSTR2
755 addi rSTR2, rSTR2, 8
756 #else
757 ld rWORD8, 0(rSTR2)
758 addi rSTR2, rSTR2, 8
759 #endif
760 sld rWORD8, rWORD8, rSHL
761
762 L(dus0):
763 #ifdef __LITTLE_ENDIAN__
764 ldbrx rWORD1, 0, rSTR1
765 ldbrx rWORD2, 0, rSTR2
766 addi rSTR1, rSTR1, 8
767 addi rSTR2, rSTR2, 8
768 #else
769 ld rWORD1, 0(rSTR1)
770 ld rWORD2, 0(rSTR2)
771 #endif
772 cmpldi cr1, r12, 16
773 cmpldi cr7, rN, 32
774 srd r12, rWORD2, rSHR
775 clrldi rN, rN, 61
776 beq L(duPs4)
777 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
778 or rWORD8, r12, rWORD8
779 bgt cr1, L(duPs3)
780 beq cr1, L(duPs2)
781
782 /* Remainder is 8 */
783 .align 4
784 L(dusP1):
785 sld rWORD8_SHIFT, rWORD2, rSHL
786 sld rWORD7, rWORD1, rWORD6
787 sld rWORD8, rWORD8, rWORD6
788 bge cr7, L(duP1e)
789 /* At this point we exit early with the first double word compare
790 complete and a remainder of 0 to 7 bytes. See L(du14) for details on
791 how we handle the remaining bytes. */
792 cmpld cr5, rWORD7, rWORD8
793 sldi. rN, rN, 3
794 bne cr5, L(duLcr5)
795 cmpld cr7, rN, rSHR
796 beq L(duZeroReturn)
797 li r0, 0
798 ble cr7, L(dutrim)
799 #ifdef __LITTLE_ENDIAN__
800 ldbrx rWORD2, 0, rSTR2
801 addi rSTR2, rSTR2, 8
802 #else
803 ld rWORD2, 8(rSTR2)
804 #endif
805 srd r0, rWORD2, rSHR
806 b L(dutrim)
807 /* Remainder is 16 */
808 .align 4
809 L(duPs2):
810 sld rWORD6_SHIFT, rWORD2, rSHL
811 sld rWORD5, rWORD1, rWORD6
812 sld rWORD6, rWORD8, rWORD6
813 b L(duP2e)
814 /* Remainder is 24 */
815 .align 4
816 L(duPs3):
817 sld rWORD4_SHIFT, rWORD2, rSHL
818 sld rWORD3, rWORD1, rWORD6
819 sld rWORD4, rWORD8, rWORD6
820 b L(duP3e)
821 /* Count is a multiple of 32, remainder is 0 */
822 .align 4
823 L(duPs4):
824 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
825 or rWORD8, r12, rWORD8
826 sld rWORD2_SHIFT, rWORD2, rSHL
827 sld rWORD1, rWORD1, rWORD6
828 sld rWORD2, rWORD8, rWORD6
829 b L(duP4e)
830
831 /* At this point we know rSTR1 is double word aligned and the
832 compare length is at least 8 bytes. */
833 .align 4
834 L(DWunaligned):
835 std rWORD8_SHIFT, -40(r1)
836 clrrdi rSTR2, rSTR2, 3
837 std rWORD2_SHIFT, -48(r1)
838 srdi r0, rN, 5 /* Divide by 32 */
839 std rWORD4_SHIFT, -56(r1)
840 andi. r12, rN, 24 /* Get the DW remainder */
841 std rWORD6_SHIFT, -64(r1)
842 cfi_offset(rWORD8_SHIFT, -40)
843 cfi_offset(rWORD2_SHIFT, -48)
844 cfi_offset(rWORD4_SHIFT, -56)
845 cfi_offset(rWORD6_SHIFT, -64)
846 sldi rSHL, rSHL, 3
847 #ifdef __LITTLE_ENDIAN__
848 ldbrx rWORD6, 0, rSTR2
849 addi rSTR2, rSTR2, 8
850 ldbrx rWORD8, 0, rSTR2
851 addi rSTR2, rSTR2, 8
852 #else
853 ld rWORD6, 0(rSTR2)
854 ldu rWORD8, 8(rSTR2)
855 #endif
856 cmpldi cr1, r12, 16
857 cmpldi cr7, rN, 32
858 clrldi rN, rN, 61
859 subfic rSHR, rSHL, 64
860 sld rWORD6_SHIFT, rWORD6, rSHL
861 beq L(duP4)
862 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
863 bgt cr1, L(duP3)
864 beq cr1, L(duP2)
865
866 /* Remainder is 8 */
867 .align 4
868 L(duP1):
869 srd r12, rWORD8, rSHR
870 #ifdef __LITTLE_ENDIAN__
871 ldbrx rWORD7, 0, rSTR1
872 addi rSTR1, rSTR1, 8
873 #else
874 ld rWORD7, 0(rSTR1)
875 #endif
876 sld rWORD8_SHIFT, rWORD8, rSHL
877 or rWORD8, r12, rWORD6_SHIFT
878 blt cr7, L(duP1x)
879 L(duP1e):
880 #ifdef __LITTLE_ENDIAN__
881 ldbrx rWORD1, 0, rSTR1
882 ldbrx rWORD2, 0, rSTR2
883 addi rSTR1, rSTR1, 8
884 addi rSTR2, rSTR2, 8
885 #else
886 ld rWORD1, 8(rSTR1)
887 ld rWORD2, 8(rSTR2)
888 #endif
889 cmpld cr5, rWORD7, rWORD8
890 srd r0, rWORD2, rSHR
891 sld rWORD2_SHIFT, rWORD2, rSHL
892 or rWORD2, r0, rWORD8_SHIFT
893 #ifdef __LITTLE_ENDIAN__
894 ldbrx rWORD3, 0, rSTR1
895 ldbrx rWORD4, 0, rSTR2
896 addi rSTR1, rSTR1, 8
897 addi rSTR2, rSTR2, 8
898 #else
899 ld rWORD3, 16(rSTR1)
900 ld rWORD4, 16(rSTR2)
901 #endif
902 cmpld cr7, rWORD1, rWORD2
903 srd r12, rWORD4, rSHR
904 sld rWORD4_SHIFT, rWORD4, rSHL
905 bne cr5, L(duLcr5)
906 or rWORD4, r12, rWORD2_SHIFT
907 #ifdef __LITTLE_ENDIAN__
908 ldbrx rWORD5, 0, rSTR1
909 ldbrx rWORD6, 0, rSTR2
910 addi rSTR1, rSTR1, 8
911 addi rSTR2, rSTR2, 8
912 #else
913 ld rWORD5, 24(rSTR1)
914 ld rWORD6, 24(rSTR2)
915 #endif
916 cmpld cr1, rWORD3, rWORD4
917 srd r0, rWORD6, rSHR
918 sld rWORD6_SHIFT, rWORD6, rSHL
919 bne cr7, L(duLcr7)
920 or rWORD6, r0, rWORD4_SHIFT
921 cmpld cr6, rWORD5, rWORD6
922 b L(duLoop3)
923 .align 4
924 /* At this point we exit early with the first double word compare
925 complete and a remainder of 0 to 7 bytes. See L(du14) for details on
926 how we handle the remaining bytes. */
927 L(duP1x):
928 cmpld cr5, rWORD7, rWORD8
929 sldi. rN, rN, 3
930 bne cr5, L(duLcr5)
931 cmpld cr7, rN, rSHR
932 beq L(duZeroReturn)
933 li r0, 0
934 ble cr7, L(dutrim)
935 #ifdef __LITTLE_ENDIAN__
936 ldbrx rWORD2, 0, rSTR2
937 addi rSTR2, rSTR2, 8
938 #else
939 ld rWORD2, 8(rSTR2)
940 #endif
941 srd r0, rWORD2, rSHR
942 b L(dutrim)
943 /* Remainder is 16 */
944 .align 4
945 L(duP2):
946 srd r0, rWORD8, rSHR
947 #ifdef __LITTLE_ENDIAN__
948 ldbrx rWORD5, 0, rSTR1
949 addi rSTR1, rSTR1, 8
950 #else
951 ld rWORD5, 0(rSTR1)
952 #endif
953 or rWORD6, r0, rWORD6_SHIFT
954 sld rWORD6_SHIFT, rWORD8, rSHL
955 L(duP2e):
956 #ifdef __LITTLE_ENDIAN__
957 ldbrx rWORD7, 0, rSTR1
958 ldbrx rWORD8, 0, rSTR2
959 addi rSTR1, rSTR1, 8
960 addi rSTR2, rSTR2, 8
961 #else
962 ld rWORD7, 8(rSTR1)
963 ld rWORD8, 8(rSTR2)
964 #endif
965 cmpld cr6, rWORD5, rWORD6
966 srd r12, rWORD8, rSHR
967 sld rWORD8_SHIFT, rWORD8, rSHL
968 or rWORD8, r12, rWORD6_SHIFT
969 blt cr7, L(duP2x)
970 #ifdef __LITTLE_ENDIAN__
971 ldbrx rWORD1, 0, rSTR1
972 ldbrx rWORD2, 0, rSTR2
973 addi rSTR1, rSTR1, 8
974 addi rSTR2, rSTR2, 8
975 #else
976 ld rWORD1, 16(rSTR1)
977 ld rWORD2, 16(rSTR2)
978 #endif
979 cmpld cr5, rWORD7, rWORD8
980 bne cr6, L(duLcr6)
981 srd r0, rWORD2, rSHR
982 sld rWORD2_SHIFT, rWORD2, rSHL
983 or rWORD2, r0, rWORD8_SHIFT
984 #ifdef __LITTLE_ENDIAN__
985 ldbrx rWORD3, 0, rSTR1
986 ldbrx rWORD4, 0, rSTR2
987 addi rSTR1, rSTR1, 8
988 addi rSTR2, rSTR2, 8
989 #else
990 ld rWORD3, 24(rSTR1)
991 ld rWORD4, 24(rSTR2)
992 #endif
993 cmpld cr7, rWORD1, rWORD2
994 bne cr5, L(duLcr5)
995 srd r12, rWORD4, rSHR
996 sld rWORD4_SHIFT, rWORD4, rSHL
997 or rWORD4, r12, rWORD2_SHIFT
998 #ifndef __LITTLE_ENDIAN__
999 addi rSTR1, rSTR1, 8
1000 addi rSTR2, rSTR2, 8
1001 #endif
1002 cmpld cr1, rWORD3, rWORD4
1003 b L(duLoop2)
1004 .align 4
1005 L(duP2x):
1006 cmpld cr5, rWORD7, rWORD8
1007 #ifndef __LITTLE_ENDIAN__
1008 addi rSTR1, rSTR1, 8
1009 addi rSTR2, rSTR2, 8
1010 #endif
1011 bne cr6, L(duLcr6)
1012 sldi. rN, rN, 3
1013 bne cr5, L(duLcr5)
1014 cmpld cr7, rN, rSHR
1015 beq L(duZeroReturn)
1016 li r0, 0
1017 ble cr7, L(dutrim)
1018 #ifdef __LITTLE_ENDIAN__
1019 ldbrx rWORD2, 0, rSTR2
1020 addi rSTR2, rSTR2, 8
1021 #else
1022 ld rWORD2, 8(rSTR2)
1023 #endif
1024 srd r0, rWORD2, rSHR
1025 b L(dutrim)
1026
1027 /* Remainder is 24 */
1028 .align 4
1029 L(duP3):
1030 srd r12, rWORD8, rSHR
1031 #ifdef __LITTLE_ENDIAN__
1032 ldbrx rWORD3, 0, rSTR1
1033 addi rSTR1, rSTR1, 8
1034 #else
1035 ld rWORD3, 0(rSTR1)
1036 #endif
1037 sld rWORD4_SHIFT, rWORD8, rSHL
1038 or rWORD4, r12, rWORD6_SHIFT
1039 L(duP3e):
1040 #ifdef __LITTLE_ENDIAN__
1041 ldbrx rWORD5, 0, rSTR1
1042 ldbrx rWORD6, 0, rSTR2
1043 addi rSTR1, rSTR1, 8
1044 addi rSTR2, rSTR2, 8
1045 #else
1046 ld rWORD5, 8(rSTR1)
1047 ld rWORD6, 8(rSTR2)
1048 #endif
1049 cmpld cr1, rWORD3, rWORD4
1050 srd r0, rWORD6, rSHR
1051 sld rWORD6_SHIFT, rWORD6, rSHL
1052 or rWORD6, r0, rWORD4_SHIFT
1053 #ifdef __LITTLE_ENDIAN__
1054 ldbrx rWORD7, 0, rSTR1
1055 ldbrx rWORD8, 0, rSTR2
1056 addi rSTR1, rSTR1, 8
1057 addi rSTR2, rSTR2, 8
1058 #else
1059 ld rWORD7, 16(rSTR1)
1060 ld rWORD8, 16(rSTR2)
1061 #endif
1062 cmpld cr6, rWORD5, rWORD6
1063 bne cr1, L(duLcr1)
1064 srd r12, rWORD8, rSHR
1065 sld rWORD8_SHIFT, rWORD8, rSHL
1066 or rWORD8, r12, rWORD6_SHIFT
1067 blt cr7, L(duP3x)
1068 #ifdef __LITTLE_ENDIAN__
1069 ldbrx rWORD1, 0, rSTR1
1070 ldbrx rWORD2, 0, rSTR2
1071 addi rSTR1, rSTR1, 8
1072 addi rSTR2, rSTR2, 8
1073 #else
1074 ld rWORD1, 24(rSTR1)
1075 ld rWORD2, 24(rSTR2)
1076 #endif
1077 cmpld cr5, rWORD7, rWORD8
1078 bne cr6, L(duLcr6)
1079 srd r0, rWORD2, rSHR
1080 sld rWORD2_SHIFT, rWORD2, rSHL
1081 or rWORD2, r0, rWORD8_SHIFT
1082 #ifndef __LITTLE_ENDIAN__
1083 addi rSTR1, rSTR1, 16
1084 addi rSTR2, rSTR2, 16
1085 #endif
1086 cmpld cr7, rWORD1, rWORD2
1087 b L(duLoop1)
1088 .align 4
1089 L(duP3x):
1090 #ifndef __LITTLE_ENDIAN__
1091 addi rSTR1, rSTR1, 16
1092 addi rSTR2, rSTR2, 16
1093 #endif
1094 #if 0
1095 /* Huh? We've already branched on cr1! */
1096 bne cr1, L(duLcr1)
1097 #endif
1098 cmpld cr5, rWORD7, rWORD8
1099 bne cr6, L(duLcr6)
1100 sldi. rN, rN, 3
1101 bne cr5, L(duLcr5)
1102 cmpld cr7, rN, rSHR
1103 beq L(duZeroReturn)
1104 li r0, 0
1105 ble cr7, L(dutrim)
1106 #ifdef __LITTLE_ENDIAN__
1107 ldbrx rWORD2, 0, rSTR2
1108 addi rSTR2, rSTR2, 8
1109 #else
1110 ld rWORD2, 8(rSTR2)
1111 #endif
1112 srd r0, rWORD2, rSHR
1113 b L(dutrim)
1114
1115 /* Count is a multiple of 32, remainder is 0 */
1116 .align 4
1117 L(duP4):
1118 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
1119 srd r0, rWORD8, rSHR
1120 #ifdef __LITTLE_ENDIAN__
1121 ldbrx rWORD1, 0, rSTR1
1122 addi rSTR1, rSTR1, 8
1123 #else
1124 ld rWORD1, 0(rSTR1)
1125 #endif
1126 sld rWORD2_SHIFT, rWORD8, rSHL
1127 or rWORD2, r0, rWORD6_SHIFT
1128 L(duP4e):
1129 #ifdef __LITTLE_ENDIAN__
1130 ldbrx rWORD3, 0, rSTR1
1131 ldbrx rWORD4, 0, rSTR2
1132 addi rSTR1, rSTR1, 8
1133 addi rSTR2, rSTR2, 8
1134 #else
1135 ld rWORD3, 8(rSTR1)
1136 ld rWORD4, 8(rSTR2)
1137 #endif
1138 cmpld cr7, rWORD1, rWORD2
1139 srd r12, rWORD4, rSHR
1140 sld rWORD4_SHIFT, rWORD4, rSHL
1141 or rWORD4, r12, rWORD2_SHIFT
1142 #ifdef __LITTLE_ENDIAN__
1143 ldbrx rWORD5, 0, rSTR1
1144 ldbrx rWORD6, 0, rSTR2
1145 addi rSTR1, rSTR1, 8
1146 addi rSTR2, rSTR2, 8
1147 #else
1148 ld rWORD5, 16(rSTR1)
1149 ld rWORD6, 16(rSTR2)
1150 #endif
1151 cmpld cr1, rWORD3, rWORD4
1152 bne cr7, L(duLcr7)
1153 srd r0, rWORD6, rSHR
1154 sld rWORD6_SHIFT, rWORD6, rSHL
1155 or rWORD6, r0, rWORD4_SHIFT
1156 #ifdef __LITTLE_ENDIAN__
1157 ldbrx rWORD7, 0, rSTR1
1158 ldbrx rWORD8, 0, rSTR2
1159 addi rSTR1, rSTR1, 8
1160 addi rSTR2, rSTR2, 8
1161 #else
1162 ldu rWORD7, 24(rSTR1)
1163 ldu rWORD8, 24(rSTR2)
1164 #endif
1165 cmpld cr6, rWORD5, rWORD6
1166 bne cr1, L(duLcr1)
1167 srd r12, rWORD8, rSHR
1168 sld rWORD8_SHIFT, rWORD8, rSHL
1169 or rWORD8, r12, rWORD6_SHIFT
1170 cmpld cr5, rWORD7, rWORD8
1171 bdz- L(du24) /* Adjust CTR as we start with +4 */
1172 /* This is the primary loop */
1173 .align 4
1174 L(duLoop):
1175 #ifdef __LITTLE_ENDIAN__
1176 ldbrx rWORD1, 0, rSTR1
1177 ldbrx rWORD2, 0, rSTR2
1178 addi rSTR1, rSTR1, 8
1179 addi rSTR2, rSTR2, 8
1180 #else
1181 ld rWORD1, 8(rSTR1)
1182 ld rWORD2, 8(rSTR2)
1183 #endif
1184 cmpld cr1, rWORD3, rWORD4
1185 bne cr6, L(duLcr6)
1186 srd r0, rWORD2, rSHR
1187 sld rWORD2_SHIFT, rWORD2, rSHL
1188 or rWORD2, r0, rWORD8_SHIFT
1189 L(duLoop1):
1190 #ifdef __LITTLE_ENDIAN__
1191 ldbrx rWORD3, 0, rSTR1
1192 ldbrx rWORD4, 0, rSTR2
1193 addi rSTR1, rSTR1, 8
1194 addi rSTR2, rSTR2, 8
1195 #else
1196 ld rWORD3, 16(rSTR1)
1197 ld rWORD4, 16(rSTR2)
1198 #endif
1199 cmpld cr6, rWORD5, rWORD6
1200 bne cr5, L(duLcr5)
1201 srd r12, rWORD4, rSHR
1202 sld rWORD4_SHIFT, rWORD4, rSHL
1203 or rWORD4, r12, rWORD2_SHIFT
1204 L(duLoop2):
1205 #ifdef __LITTLE_ENDIAN__
1206 ldbrx rWORD5, 0, rSTR1
1207 ldbrx rWORD6, 0, rSTR2
1208 addi rSTR1, rSTR1, 8
1209 addi rSTR2, rSTR2, 8
1210 #else
1211 ld rWORD5, 24(rSTR1)
1212 ld rWORD6, 24(rSTR2)
1213 #endif
1214 cmpld cr5, rWORD7, rWORD8
1215 bne cr7, L(duLcr7)
1216 srd r0, rWORD6, rSHR
1217 sld rWORD6_SHIFT, rWORD6, rSHL
1218 or rWORD6, r0, rWORD4_SHIFT
1219 L(duLoop3):
1220 #ifdef __LITTLE_ENDIAN__
1221 ldbrx rWORD7, 0, rSTR1
1222 ldbrx rWORD8, 0, rSTR2
1223 addi rSTR1, rSTR1, 8
1224 addi rSTR2, rSTR2, 8
1225 #else
1226 ldu rWORD7, 32(rSTR1)
1227 ldu rWORD8, 32(rSTR2)
1228 #endif
1229 cmpld cr7, rWORD1, rWORD2
1230 bne- cr1, L(duLcr1)
1231 srd r12, rWORD8, rSHR
1232 sld rWORD8_SHIFT, rWORD8, rSHL
1233 or rWORD8, r12, rWORD6_SHIFT
1234 bdnz+ L(duLoop)
1235
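/* Unaligned loop epilogue: as in L(dL4) above, finish the compares that
   are still pending from the last pipelined iteration. */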
1236 L(duL4):
1237 #if 0
1238 /* Huh? We've already branched on cr1! */
1239 bne cr1, L(duLcr1)
1240 #endif
1241 cmpld cr1, rWORD3, rWORD4
1242 bne cr6, L(duLcr6)
1243 cmpld cr6, rWORD5, rWORD6
1244 bne cr5, L(duLcr5)
1245 cmpld cr5, rWORD7, rWORD8
1246 L(du44):
1247 bne cr7, L(duLcr7)
1248 L(du34):
1249 bne cr1, L(duLcr1)
1250 L(du24):
1251 bne cr6, L(duLcr6)
1252 L(du14):
1253 sldi. rN, rN, 3
1254 bne cr5, L(duLcr5)
1255 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1256 shift right double to eliminate bits beyond the compare length.
1257
1258 However, it may not be safe to load rWORD2, which may be beyond the
1259 string length. So we compare the bit length of the remainder to
1260 the right shift count (rSHR). If the bit count is less than or equal,
1261 we do not need to load rWORD2 (all significant bits are already in
1262 rWORD8_SHIFT). */
1263 cmpld cr7, rN, rSHR
1264 beq L(duZeroReturn)
1265 li r0, 0
1266 ble cr7, L(dutrim)
1267 #ifdef __LITTLE_ENDIAN__
1268 ldbrx rWORD2, 0, rSTR2
1269 addi rSTR2, rSTR2, 8
1270 #else
1271 ld rWORD2, 8(rSTR2)
1272 #endif
1273 srd r0, rWORD2, rSHR
1274 .align 4
1275 L(dutrim):
1276 #ifdef __LITTLE_ENDIAN__
1277 ldbrx rWORD1, 0, rSTR1
1278 #else
1279 ld rWORD1, 8(rSTR1)
1280 #endif
1281 ld rWORD8, -8(r1)
1282 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1283 or rWORD2, r0, rWORD8_SHIFT
1284 ld rWORD7, -16(r1)
1285 ld rSHL, -24(r1)
1286 srd rWORD1, rWORD1, rN
1287 srd rWORD2, rWORD2, rN
1288 ld rSHR, -32(r1)
1289 ld rWORD8_SHIFT, -40(r1)
1290 li rRTN, 0
1291 cmpld cr7, rWORD1, rWORD2
1292 ld rWORD2_SHIFT, -48(r1)
1293 ld rWORD4_SHIFT, -56(r1)
1294 beq cr7, L(dureturn24)
1295 li rRTN, 1
1296 ld rWORD6_SHIFT, -64(r1)
1297 bgtlr cr7
1298 li rRTN, -1
1299 blr
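/* A difference was found in the unaligned path: restore the saved
   registers and turn the recorded unsigned compare into the memcmp
   result, +1 or -1. */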
1300 .align 4
1301 L(duLcr7):
1302 ld rWORD8, -8(r1)
1303 ld rWORD7, -16(r1)
1304 li rRTN, 1
1305 bgt cr7, L(dureturn29)
1306 ld rSHL, -24(r1)
1307 ld rSHR, -32(r1)
1308 li rRTN, -1
1309 b L(dureturn27)
1310 .align 4
1311 L(duLcr1):
1312 ld rWORD8, -8(r1)
1313 ld rWORD7, -16(r1)
1314 li rRTN, 1
1315 bgt cr1, L(dureturn29)
1316 ld rSHL, -24(r1)
1317 ld rSHR, -32(r1)
1318 li rRTN, -1
1319 b L(dureturn27)
1320 .align 4
1321 L(duLcr6):
1322 ld rWORD8, -8(r1)
1323 ld rWORD7, -16(r1)
1324 li rRTN, 1
1325 bgt cr6, L(dureturn29)
1326 ld rSHL, -24(r1)
1327 ld rSHR, -32(r1)
1328 li rRTN, -1
1329 b L(dureturn27)
1330 .align 4
1331 L(duLcr5):
1332 ld rWORD8, -8(r1)
1333 ld rWORD7, -16(r1)
1334 li rRTN, 1
1335 bgt cr5, L(dureturn29)
1336 ld rSHL, -24(r1)
1337 ld rSHR, -32(r1)
1338 li rRTN, -1
1339 b L(dureturn27)
1340 .align 3
1341 L(duZeroReturn):
1342 li rRTN, 0
1343 .align 4
1344 L(dureturn):
1345 ld rWORD8, -8(r1)
1346 ld rWORD7, -16(r1)
1347 L(dureturn29):
1348 ld rSHL, -24(r1)
1349 ld rSHR, -32(r1)
1350 L(dureturn27):
1351 ld rWORD8_SHIFT, -40(r1)
1352 L(dureturn26):
1353 ld rWORD2_SHIFT, -48(r1)
1354 L(dureturn25):
1355 ld rWORD4_SHIFT, -56(r1)
1356 L(dureturn24):
1357 ld rWORD6_SHIFT, -64(r1)
1358 blr
1359 L(duzeroLength):
1360 li rRTN, 0
1361 blr
1362
1363 END (memcmp)
1364 libc_hidden_builtin_def (memcmp)
1365 weak_alias (memcmp, bcmp)