1 /* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* int [r3] memcmp (const char *s1 [r3],
22 const char *s2 [r4],
23 size_t size [r5]) */
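/* The result is negative, zero, or positive according to whether s1
   compares lower than, equal to, or higher than s2 over the first 'size'
   bytes; bytes are compared as unsigned values.  */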
24
25 #ifndef MEMCMP
26 # define MEMCMP memcmp
27 #endif
28
29 #ifndef __LITTLE_ENDIAN__
30 .machine power4
31 #else
32 /* Little endian is only available since POWER8, so it's safe to
33 specify .machine as power8 (or older), even though this is a POWER4
34 file. Since the little-endian code uses 'ldbrx', power7 is enough. */
35 .machine power7
36 #endif
37 ENTRY_TOCLESS (MEMCMP, 4)
38 CALL_MCOUNT 3
39
40 #define rRTN r3
41 #define rSTR1 r3 /* first string arg */
42 #define rSTR2 r4 /* second string arg */
43 #define rN r5 /* max string length */
44 #define rWORD1 r6 /* current word in s1 */
45 #define rWORD2 r7 /* current word in s2 */
46 #define rWORD3 r8 /* next word in s1 */
47 #define rWORD4 r9 /* next word in s2 */
48 #define rWORD5 r10 /* next word in s1 */
49 #define rWORD6 r11 /* next word in s2 */
50 #define rWORD7 r30 /* next word in s1 */
51 #define rWORD8 r31 /* next word in s2 */
52
53 xor r0, rSTR2, rSTR1
54 cmpldi cr6, rN, 0
55 cmpldi cr1, rN, 12
56 clrldi. r0, r0, 61
57 clrldi r12, rSTR1, 61
58 cmpldi cr5, r12, 0
59 beq- cr6, L(zeroLength)
60 dcbt 0, rSTR1
61 dcbt 0, rSTR2
62 /* If the length is less than 12 bytes, use the simple byte-by-byte
63 loop; if the pointers are not mutually aligned, take the unaligned path below. */
64 blt cr1, L(bytealigned)
65 std rWORD8, -8(r1)
66 std rWORD7, -16(r1)
67 cfi_offset(rWORD8, -8)
68 cfi_offset(rWORD7, -16)
69 bne L(unaligned)
70 /* At this point we know both strings have the same alignment and the
71 compare length is at least 8 bytes. r12 contains the low order
72 3 bits of rSTR1 and cr5 contains the result of the logical compare
73 of r12 to 0. If r12 == 0 then we are already double word
74 aligned and can perform the DW aligned loop.
75
76 Otherwise we know the two strings have the same alignment (but not
77 yet DW). So we force the string addresses to the next lower DW
78 boundary and special case this first DW using shift left to
79 eliminate bits preceding the first byte. Since we want to join the
80 normal (DW aligned) compare loop, starting at the second double word,
81 we need to adjust the length (rN) and special case the loop
82 versioning for the first DW. This ensures that the loop count is
83 correct and the first DW (shifted) is in the expected register pair. */
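/* For example, if both strings start 3 bytes past a doubleword boundary,
   r12 = 3, rN is increased by 3, and rWORD6 = 24.  The first doubleword of
   each string is then shifted left by 24 bits, discarding the 3 bytes that
   precede the true start of the strings (with ld on big-endian, or ldbrx on
   little-endian, lower memory addresses map to more significant bits, so the
   preceding bytes sit in the high-order bits and drop out of the compare).  */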
84 .align 4
85 L(samealignment):
86 clrrdi rSTR1, rSTR1, 3
87 clrrdi rSTR2, rSTR2, 3
88 beq cr5, L(DWaligned)
89 add rN, rN, r12
90 sldi rWORD6, r12, 3
91 srdi r0, rN, 5 /* Divide by 32 */
92 andi. r12, rN, 24 /* Get the DW remainder */
93 #ifdef __LITTLE_ENDIAN__
94 ldbrx rWORD1, 0, rSTR1
95 ldbrx rWORD2, 0, rSTR2
96 addi rSTR1, rSTR1, 8
97 addi rSTR2, rSTR2, 8
98 #else
99 ld rWORD1, 0(rSTR1)
100 ld rWORD2, 0(rSTR2)
101 #endif
102 cmpldi cr1, r12, 16
103 cmpldi cr7, rN, 32
104 clrldi rN, rN, 61
105 beq L(dPs4)
106 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
107 bgt cr1, L(dPs3)
108 beq cr1, L(dPs2)
109
110 /* Remainder is 8 */
111 .align 3
112 L(dsP1):
113 sld rWORD5, rWORD1, rWORD6
114 sld rWORD6, rWORD2, rWORD6
115 cmpld cr5, rWORD5, rWORD6
116 blt cr7, L(dP1x)
117 /* Do something useful in this cycle since we have to branch anyway. */
118 #ifdef __LITTLE_ENDIAN__
119 ldbrx rWORD1, 0, rSTR1
120 ldbrx rWORD2, 0, rSTR2
121 addi rSTR1, rSTR1, 8
122 addi rSTR2, rSTR2, 8
123 #else
124 ld rWORD1, 8(rSTR1)
125 ld rWORD2, 8(rSTR2)
126 #endif
127 cmpld cr7, rWORD1, rWORD2
128 b L(dP1e)
129 /* Remainder is 16 */
130 .align 4
131 L(dPs2):
132 sld rWORD5, rWORD1, rWORD6
133 sld rWORD6, rWORD2, rWORD6
134 cmpld cr6, rWORD5, rWORD6
135 blt cr7, L(dP2x)
136 /* Do something useful in this cycle since we have to branch anyway. */
137 #ifdef __LITTLE_ENDIAN__
138 ldbrx rWORD7, 0, rSTR1
139 ldbrx rWORD8, 0, rSTR2
140 addi rSTR1, rSTR1, 8
141 addi rSTR2, rSTR2, 8
142 #else
143 ld rWORD7, 8(rSTR1)
144 ld rWORD8, 8(rSTR2)
145 #endif
146 cmpld cr5, rWORD7, rWORD8
147 b L(dP2e)
148 /* Remainder is 24 */
149 .align 4
150 L(dPs3):
151 sld rWORD3, rWORD1, rWORD6
152 sld rWORD4, rWORD2, rWORD6
153 cmpld cr1, rWORD3, rWORD4
154 b L(dP3e)
155 /* Count is a multiple of 32, remainder is 0 */
156 .align 4
157 L(dPs4):
158 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
159 sld rWORD1, rWORD1, rWORD6
160 sld rWORD2, rWORD2, rWORD6
161 cmpld cr7, rWORD1, rWORD2
162 b L(dP4e)
163
164 /* At this point we know both strings are double word aligned and the
165 compare length is at least 8 bytes. */
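/* The count is split as rN = 32 * CTR + remainder.  The doubleword
   remainder (0, 8, 16 or 24, with 0 meaning an exact multiple of 32)
   selects the entry point L(dP4), L(dP1), L(dP2) or L(dP3), so the main
   loop, which compares 32 bytes per iteration, is entered with the right
   number of doublewords already in flight; the final 0-7 bytes are handled
   separately after the loop (see L(d00)).  */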
166 .align 4
167 L(DWaligned):
168 andi. r12, rN, 24 /* Get the DW remainder */
169 srdi r0, rN, 5 /* Divide by 32 */
170 cmpldi cr1, r12, 16
171 cmpldi cr7, rN, 32
172 clrldi rN, rN, 61
173 beq L(dP4)
174 bgt cr1, L(dP3)
175 beq cr1, L(dP2)
176
177 /* Remainder is 8 */
178 .align 4
179 L(dP1):
180 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
181 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
182 (8-15 byte compare), we want to use only volatile registers. This
183 means we can avoid restoring non-volatile registers since we did not
184 change any on the early exit path. The key here is that the non-early
185 exit path only cares about the condition code (cr5), not about which
186 register pair was used. */
187 #ifdef __LITTLE_ENDIAN__
188 ldbrx rWORD5, 0, rSTR1
189 ldbrx rWORD6, 0, rSTR2
190 addi rSTR1, rSTR1, 8
191 addi rSTR2, rSTR2, 8
192 #else
193 ld rWORD5, 0(rSTR1)
194 ld rWORD6, 0(rSTR2)
195 #endif
196 cmpld cr5, rWORD5, rWORD6
197 blt cr7, L(dP1x)
198 #ifdef __LITTLE_ENDIAN__
199 ldbrx rWORD1, 0, rSTR1
200 ldbrx rWORD2, 0, rSTR2
201 addi rSTR1, rSTR1, 8
202 addi rSTR2, rSTR2, 8
203 #else
204 ld rWORD1, 8(rSTR1)
205 ld rWORD2, 8(rSTR2)
206 #endif
207 cmpld cr7, rWORD1, rWORD2
208 L(dP1e):
209 #ifdef __LITTLE_ENDIAN__
210 ldbrx rWORD3, 0, rSTR1
211 ldbrx rWORD4, 0, rSTR2
212 addi rSTR1, rSTR1, 8
213 addi rSTR2, rSTR2, 8
214 #else
215 ld rWORD3, 16(rSTR1)
216 ld rWORD4, 16(rSTR2)
217 #endif
218 cmpld cr1, rWORD3, rWORD4
219 #ifdef __LITTLE_ENDIAN__
220 ldbrx rWORD5, 0, rSTR1
221 ldbrx rWORD6, 0, rSTR2
222 addi rSTR1, rSTR1, 8
223 addi rSTR2, rSTR2, 8
224 #else
225 ld rWORD5, 24(rSTR1)
226 ld rWORD6, 24(rSTR2)
227 #endif
228 cmpld cr6, rWORD5, rWORD6
229 bne cr5, L(dLcr5x)
230 bne cr7, L(dLcr7x)
231
232 #ifdef __LITTLE_ENDIAN__
233 ldbrx rWORD7, 0, rSTR1
234 ldbrx rWORD8, 0, rSTR2
235 addi rSTR1, rSTR1, 8
236 addi rSTR2, rSTR2, 8
237 #else
238 ldu rWORD7, 32(rSTR1)
239 ldu rWORD8, 32(rSTR2)
240 #endif
241 bne cr1, L(dLcr1)
242 cmpld cr5, rWORD7, rWORD8
243 bdnz L(dLoop)
244 bne cr6, L(dLcr6)
245 ld rWORD8, -8(r1)
246 ld rWORD7, -16(r1)
247 .align 3
248 L(dP1x):
249 sldi. r12, rN, 3
250 bne cr5, L(dLcr5x)
251 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
252 bne L(d00)
253 li rRTN, 0
254 blr
255
256 /* Remainder is 16 */
257 .align 4
258 L(dP2):
259 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
260 #ifdef __LITTLE_ENDIAN__
261 ldbrx rWORD5, 0, rSTR1
262 ldbrx rWORD6, 0, rSTR2
263 addi rSTR1, rSTR1, 8
264 addi rSTR2, rSTR2, 8
265 #else
266 ld rWORD5, 0(rSTR1)
267 ld rWORD6, 0(rSTR2)
268 #endif
269 cmpld cr6, rWORD5, rWORD6
270 blt cr7, L(dP2x)
271 #ifdef __LITTLE_ENDIAN__
272 ldbrx rWORD7, 0, rSTR1
273 ldbrx rWORD8, 0, rSTR2
274 addi rSTR1, rSTR1, 8
275 addi rSTR2, rSTR2, 8
276 #else
277 ld rWORD7, 8(rSTR1)
278 ld rWORD8, 8(rSTR2)
279 #endif
280 cmpld cr5, rWORD7, rWORD8
281 L(dP2e):
282 #ifdef __LITTLE_ENDIAN__
283 ldbrx rWORD1, 0, rSTR1
284 ldbrx rWORD2, 0, rSTR2
285 addi rSTR1, rSTR1, 8
286 addi rSTR2, rSTR2, 8
287 #else
288 ld rWORD1, 16(rSTR1)
289 ld rWORD2, 16(rSTR2)
290 #endif
291 cmpld cr7, rWORD1, rWORD2
292 #ifdef __LITTLE_ENDIAN__
293 ldbrx rWORD3, 0, rSTR1
294 ldbrx rWORD4, 0, rSTR2
295 addi rSTR1, rSTR1, 8
296 addi rSTR2, rSTR2, 8
297 #else
298 ld rWORD3, 24(rSTR1)
299 ld rWORD4, 24(rSTR2)
300 #endif
301 cmpld cr1, rWORD3, rWORD4
302 #ifndef __LITTLE_ENDIAN__
303 addi rSTR1, rSTR1, 8
304 addi rSTR2, rSTR2, 8
305 #endif
306 bne cr6, L(dLcr6)
307 bne cr5, L(dLcr5)
308 b L(dLoop2)
309 /* Again we are on an early exit path (16-23 byte compare); we want to
310 use only volatile registers and avoid restoring non-volatile
311 registers. */
312 .align 4
313 L(dP2x):
314 #ifdef __LITTLE_ENDIAN__
315 ldbrx rWORD3, 0, rSTR1
316 ldbrx rWORD4, 0, rSTR2
317 addi rSTR1, rSTR1, 8
318 addi rSTR2, rSTR2, 8
319 #else
320 ld rWORD3, 8(rSTR1)
321 ld rWORD4, 8(rSTR2)
322 #endif
323 cmpld cr1, rWORD3, rWORD4
324 sldi. r12, rN, 3
325 bne cr6, L(dLcr6x)
326 #ifndef __LITTLE_ENDIAN__
327 addi rSTR1, rSTR1, 8
328 addi rSTR2, rSTR2, 8
329 #endif
330 bne cr1, L(dLcr1x)
331 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
332 bne L(d00)
333 li rRTN, 0
334 blr
335
336 /* Remainder is 24 */
337 .align 4
338 L(dP3):
339 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
340 #ifdef __LITTLE_ENDIAN__
341 ldbrx rWORD3, 0, rSTR1
342 ldbrx rWORD4, 0, rSTR2
343 addi rSTR1, rSTR1, 8
344 addi rSTR2, rSTR2, 8
345 #else
346 ld rWORD3, 0(rSTR1)
347 ld rWORD4, 0(rSTR2)
348 #endif
349 cmpld cr1, rWORD3, rWORD4
350 L(dP3e):
351 #ifdef __LITTLE_ENDIAN__
352 ldbrx rWORD5, 0, rSTR1
353 ldbrx rWORD6, 0, rSTR2
354 addi rSTR1, rSTR1, 8
355 addi rSTR2, rSTR2, 8
356 #else
357 ld rWORD5, 8(rSTR1)
358 ld rWORD6, 8(rSTR2)
359 #endif
360 cmpld cr6, rWORD5, rWORD6
361 blt cr7, L(dP3x)
362 #ifdef __LITTLE_ENDIAN__
363 ldbrx rWORD7, 0, rSTR1
364 ldbrx rWORD8, 0, rSTR2
365 addi rSTR1, rSTR1, 8
366 addi rSTR2, rSTR2, 8
367 #else
368 ld rWORD7, 16(rSTR1)
369 ld rWORD8, 16(rSTR2)
370 #endif
371 cmpld cr5, rWORD7, rWORD8
372 #ifdef __LITTLE_ENDIAN__
373 ldbrx rWORD1, 0, rSTR1
374 ldbrx rWORD2, 0, rSTR2
375 addi rSTR1, rSTR1, 8
376 addi rSTR2, rSTR2, 8
377 #else
378 ld rWORD1, 24(rSTR1)
379 ld rWORD2, 24(rSTR2)
380 #endif
381 cmpld cr7, rWORD1, rWORD2
382 #ifndef __LITTLE_ENDIAN__
383 addi rSTR1, rSTR1, 16
384 addi rSTR2, rSTR2, 16
385 #endif
386 bne cr1, L(dLcr1)
387 bne cr6, L(dLcr6)
388 b L(dLoop1)
389 /* Again we are on an early exit path (24-31 byte compare); we want to
390 use only volatile registers and avoid restoring non-volatile
391 registers. */
392 .align 4
393 L(dP3x):
394 #ifdef __LITTLE_ENDIAN__
395 ldbrx rWORD1, 0, rSTR1
396 ldbrx rWORD2, 0, rSTR2
397 addi rSTR1, rSTR1, 8
398 addi rSTR2, rSTR2, 8
399 #else
400 ld rWORD1, 16(rSTR1)
401 ld rWORD2, 16(rSTR2)
402 #endif
403 cmpld cr7, rWORD1, rWORD2
404 sldi. r12, rN, 3
405 bne cr1, L(dLcr1x)
406 #ifndef __LITTLE_ENDIAN__
407 addi rSTR1, rSTR1, 16
408 addi rSTR2, rSTR2, 16
409 #endif
410 bne cr6, L(dLcr6x)
411 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
412 bne cr7, L(dLcr7x)
413 bne L(d00)
414 li rRTN, 0
415 blr
416
417 /* Count is a multiple of 32, remainder is 0 */
418 .align 4
419 L(dP4):
420 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
421 #ifdef __LITTLE_ENDIAN__
422 ldbrx rWORD1, 0, rSTR1
423 ldbrx rWORD2, 0, rSTR2
424 addi rSTR1, rSTR1, 8
425 addi rSTR2, rSTR2, 8
426 #else
427 ld rWORD1, 0(rSTR1)
428 ld rWORD2, 0(rSTR2)
429 #endif
430 cmpld cr7, rWORD1, rWORD2
431 L(dP4e):
432 #ifdef __LITTLE_ENDIAN__
433 ldbrx rWORD3, 0, rSTR1
434 ldbrx rWORD4, 0, rSTR2
435 addi rSTR1, rSTR1, 8
436 addi rSTR2, rSTR2, 8
437 #else
438 ld rWORD3, 8(rSTR1)
439 ld rWORD4, 8(rSTR2)
440 #endif
441 cmpld cr1, rWORD3, rWORD4
442 #ifdef __LITTLE_ENDIAN__
443 ldbrx rWORD5, 0, rSTR1
444 ldbrx rWORD6, 0, rSTR2
445 addi rSTR1, rSTR1, 8
446 addi rSTR2, rSTR2, 8
447 #else
448 ld rWORD5, 16(rSTR1)
449 ld rWORD6, 16(rSTR2)
450 #endif
451 cmpld cr6, rWORD5, rWORD6
452 #ifdef __LITTLE_ENDIAN__
453 ldbrx rWORD7, 0, rSTR1
454 ldbrx rWORD8, 0, rSTR2
455 addi rSTR1, rSTR1, 8
456 addi rSTR2, rSTR2, 8
457 #else
458 ldu rWORD7, 24(rSTR1)
459 ldu rWORD8, 24(rSTR2)
460 #endif
461 cmpld cr5, rWORD7, rWORD8
462 bne cr7, L(dLcr7)
463 bne cr1, L(dLcr1)
464 bdz- L(d24) /* Adjust CTR as we start with +4 */
465 /* This is the primary loop */
466 .align 4
467 L(dLoop):
468 #ifdef __LITTLE_ENDIAN__
469 ldbrx rWORD1, 0, rSTR1
470 ldbrx rWORD2, 0, rSTR2
471 addi rSTR1, rSTR1, 8
472 addi rSTR2, rSTR2, 8
473 #else
474 ld rWORD1, 8(rSTR1)
475 ld rWORD2, 8(rSTR2)
476 #endif
477 cmpld cr1, rWORD3, rWORD4
478 bne cr6, L(dLcr6)
479 L(dLoop1):
480 #ifdef __LITTLE_ENDIAN__
481 ldbrx rWORD3, 0, rSTR1
482 ldbrx rWORD4, 0, rSTR2
483 addi rSTR1, rSTR1, 8
484 addi rSTR2, rSTR2, 8
485 #else
486 ld rWORD3, 16(rSTR1)
487 ld rWORD4, 16(rSTR2)
488 #endif
489 cmpld cr6, rWORD5, rWORD6
490 bne cr5, L(dLcr5)
491 L(dLoop2):
492 #ifdef __LITTLE_ENDIAN__
493 ldbrx rWORD5, 0, rSTR1
494 ldbrx rWORD6, 0, rSTR2
495 addi rSTR1, rSTR1, 8
496 addi rSTR2, rSTR2, 8
497 #else
498 ld rWORD5, 24(rSTR1)
499 ld rWORD6, 24(rSTR2)
500 #endif
501 cmpld cr5, rWORD7, rWORD8
502 bne cr7, L(dLcr7)
503 L(dLoop3):
504 #ifdef __LITTLE_ENDIAN__
505 ldbrx rWORD7, 0, rSTR1
506 ldbrx rWORD8, 0, rSTR2
507 addi rSTR1, rSTR1, 8
508 addi rSTR2, rSTR2, 8
509 #else
510 ldu rWORD7, 32(rSTR1)
511 ldu rWORD8, 32(rSTR2)
512 #endif
513 bne- cr1, L(dLcr1)
514 cmpld cr7, rWORD1, rWORD2
515 bdnz+ L(dLoop)
516
517 L(dL4):
518 cmpld cr1, rWORD3, rWORD4
519 bne cr6, L(dLcr6)
520 cmpld cr6, rWORD5, rWORD6
521 bne cr5, L(dLcr5)
522 cmpld cr5, rWORD7, rWORD8
523 L(d44):
524 bne cr7, L(dLcr7)
525 L(d34):
526 bne cr1, L(dLcr1)
527 L(d24):
528 bne cr6, L(dLcr6)
529 L(d14):
530 sldi. r12, rN, 3
531 bne cr5, L(dLcr5)
532 L(d04):
533 ld rWORD8, -8(r1)
534 ld rWORD7, -16(r1)
535 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
536 beq L(zeroLength)
537 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
538 we are aligned it is safe to load the whole double word, and use
539 shift right double to eliminate bits beyond the compare length. */
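/* For example, with 3 bytes left, r12 = 24 and the shift count in rN is
   64 - 24 = 40; srd then discards the low-order 40 bits of each doubleword,
   so only the 3 remaining bytes (held in the high-order bits) take part in
   the final compare.  */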
540 L(d00):
541 #ifdef __LITTLE_ENDIAN__
542 ldbrx rWORD1, 0, rSTR1
543 ldbrx rWORD2, 0, rSTR2
544 addi rSTR1, rSTR1, 8
545 addi rSTR2, rSTR2, 8
546 #else
547 ld rWORD1, 8(rSTR1)
548 ld rWORD2, 8(rSTR2)
549 #endif
550 srd rWORD1, rWORD1, rN
551 srd rWORD2, rWORD2, rN
552 cmpld cr7, rWORD1, rWORD2
553 bne cr7, L(dLcr7x)
554 li rRTN, 0
555 blr
556
557 .align 4
558 L(dLcr7):
559 ld rWORD8, -8(r1)
560 ld rWORD7, -16(r1)
561 L(dLcr7x):
562 li rRTN, 1
563 bgtlr cr7
564 li rRTN, -1
565 blr
566 .align 4
567 L(dLcr1):
568 ld rWORD8, -8(r1)
569 ld rWORD7, -16(r1)
570 L(dLcr1x):
571 li rRTN, 1
572 bgtlr cr1
573 li rRTN, -1
574 blr
575 .align 4
576 L(dLcr6):
577 ld rWORD8, -8(r1)
578 ld rWORD7, -16(r1)
579 L(dLcr6x):
580 li rRTN, 1
581 bgtlr cr6
582 li rRTN, -1
583 blr
584 .align 4
585 L(dLcr5):
586 ld rWORD8, -8(r1)
587 ld rWORD7, -16(r1)
588 L(dLcr5x):
589 li rRTN, 1
590 bgtlr cr5
591 li rRTN, -1
592 blr
593
594 .align 4
595 L(bytealigned):
596 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
597 #if 0
598 /* Huh? We've already branched on cr6! */
599 beq- cr6, L(zeroLength)
600 #endif
601
602 /* We need to prime this loop. This loop is swing modulo scheduled
603 to avoid pipe delays. The dependent instruction latencies (load to
604 compare to conditional branch) are 2 to 3 cycles. In this loop each
605 dispatch group ends in a branch and takes 1 cycle. Effectively
606 the first iteration of the loop only serves to load operands, and
607 branches based on the compares are delayed until the next iteration.
608
609 So we must precondition some registers and condition codes so that
610 we don't exit the loop early on the first iteration. */
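/* Concretely, three byte pairs (rWORD1/2, rWORD3/4, rWORD5/6) are kept in
   flight: each iteration loads a new value for every pair but branches on a
   condition register computed from bytes loaded in the previous iteration,
   so the 2-3 cycle load-to-branch latency is hidden behind other work.  */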
611
612 lbz rWORD1, 0(rSTR1)
613 lbz rWORD2, 0(rSTR2)
614 bdz- L(b11)
615 cmpld cr7, rWORD1, rWORD2
616 lbz rWORD3, 1(rSTR1)
617 lbz rWORD4, 1(rSTR2)
618 bdz- L(b12)
619 cmpld cr1, rWORD3, rWORD4
620 lbzu rWORD5, 2(rSTR1)
621 lbzu rWORD6, 2(rSTR2)
622 bdz- L(b13)
623 .align 4
624 L(bLoop):
625 lbzu rWORD1, 1(rSTR1)
626 lbzu rWORD2, 1(rSTR2)
627 bne- cr7, L(bLcr7)
628
629 cmpld cr6, rWORD5, rWORD6
630 bdz- L(b3i)
631
632 lbzu rWORD3, 1(rSTR1)
633 lbzu rWORD4, 1(rSTR2)
634 bne- cr1, L(bLcr1)
635
636 cmpld cr7, rWORD1, rWORD2
637 bdz- L(b2i)
638
639 lbzu rWORD5, 1(rSTR1)
640 lbzu rWORD6, 1(rSTR2)
641 bne- cr6, L(bLcr6)
642
643 cmpld cr1, rWORD3, rWORD4
644 bdnz+ L(bLoop)
645
646 /* We speculatively load bytes before we have tested the previous
647 bytes. But we must avoid overrunning the length (in the ctr) to
648 prevent these speculative loads from causing a segfault. In that
649 case the loop will exit early (before all pending bytes have been
650 tested), so we must complete the pending compares
651 before returning. */
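/* L(b1i) is reached by falling out of the loop at the final bdnz; L(b2i)
   and L(b3i) are reached from the bdz branches inside the loop.  Each one
   finishes the compares still pending for earlier bytes and then returns
   the difference of the last pair loaded (L(bx12), L(bx34) or L(bx56)).  */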
652 L(b1i):
653 bne- cr7, L(bLcr7)
654 bne- cr1, L(bLcr1)
655 b L(bx56)
656 .align 4
657 L(b2i):
658 bne- cr6, L(bLcr6)
659 bne- cr7, L(bLcr7)
660 b L(bx34)
661 .align 4
662 L(b3i):
663 bne- cr1, L(bLcr1)
664 bne- cr6, L(bLcr6)
665 b L(bx12)
666 .align 4
667 L(bLcr7):
668 li rRTN, 1
669 bgtlr cr7
670 li rRTN, -1
671 blr
672 L(bLcr1):
673 li rRTN, 1
674 bgtlr cr1
675 li rRTN, -1
676 blr
677 L(bLcr6):
678 li rRTN, 1
679 bgtlr cr6
680 li rRTN, -1
681 blr
682
683 L(b13):
684 bne- cr7, L(bx12)
685 bne- cr1, L(bx34)
686 L(bx56):
687 sub rRTN, rWORD5, rWORD6
688 blr
689 nop
690 L(b12):
691 bne- cr7, L(bx12)
692 L(bx34):
693 sub rRTN, rWORD3, rWORD4
694 blr
695 L(b11):
696 L(bx12):
697 sub rRTN, rWORD1, rWORD2
698 blr
699 .align 4
700 L(zeroLength):
701 li rRTN, 0
702 blr
703
704 .align 4
705 /* At this point we know the strings have different alignment and the
706 compare length is at least 8 bytes. r12 contains the low order
707 3 bits of rSTR1 and cr5 contains the result of the logical compare
708 of r12 to 0. If r12 == 0 then rSTR1 is double word
709 aligned and we can perform the DWunaligned loop.
710
711 Otherwise we know that rSTR1 is not yet DW aligned.
712 So we force the string addresses to the next lower DW
713 boundary and special case this first DW using shift left to
714 eliminate bits preceding the first byte. Since we want to join the
715 normal (DWaligned) compare loop, starting at the second double word,
716 we need to adjust the length (rN) and special case the loop
717 versioning for the first DW. This ensures that the loop count is
718 correct and the first DW (shifted) is in the expected register pair. */
719 #define rSHL r29 /* Unaligned shift left count. */
720 #define rSHR r28 /* Unaligned shift right count. */
721 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
722 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
723 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
724 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
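/* For example, if rSTR1 starts 2 bytes and rSTR2 starts 5 bytes past a DW
   boundary, the logical start of rSTR2 becomes rSTR2 - 2, which is 3 bytes
   into its doubleword, so rSHL = 24 and rSHR = 40.  Each doubleword of
   rSTR2 data is then assembled by OR-ing the current load shifted right by
   rSHR with the previous load shifted left by rSHL before it is compared
   against an aligned rSTR1 doubleword.  */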
725 L(unaligned):
726 std rSHL, -24(r1)
727 cfi_offset(rSHL, -24)
728 clrldi rSHL, rSTR2, 61
729 beq- cr6, L(duzeroLength)
730 std rSHR, -32(r1)
731 cfi_offset(rSHR, -32)
732 beq cr5, L(DWunaligned)
733 std rWORD8_SHIFT, -40(r1)
734 cfi_offset(rWORD8_SHIFT, -40)
735 /* Adjust the logical start of rSTR2 to compensate for the extra bits
736 in the 1st rSTR1 DW. */
737 sub rWORD8_SHIFT, rSTR2, r12
738 /* But do not attempt to address the DW before the one that contains
739 the actual start of rSTR2. */
740 clrrdi rSTR2, rSTR2, 3
741 std rWORD2_SHIFT, -48(r1)
742 /* Compute the left/right shift counts for the unaligned rSTR2,
743 compensating for the logical (DW aligned) start of rSTR1. */
744 clrldi rSHL, rWORD8_SHIFT, 61
745 clrrdi rSTR1, rSTR1, 3
746 std rWORD4_SHIFT, -56(r1)
747 sldi rSHL, rSHL, 3
748 cmpld cr5, rWORD8_SHIFT, rSTR2
749 add rN, rN, r12
750 sldi rWORD6, r12, 3
751 std rWORD6_SHIFT, -64(r1)
752 cfi_offset(rWORD2_SHIFT, -48)
753 cfi_offset(rWORD4_SHIFT, -56)
754 cfi_offset(rWORD6_SHIFT, -64)
755 subfic rSHR, rSHL, 64
756 srdi r0, rN, 5 /* Divide by 32 */
757 andi. r12, rN, 24 /* Get the DW remainder */
758 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
759 this special case those bits may be discarded anyway. Also we
760 must avoid loading a DW where none of the bits are part of rSTR2 as
761 this may cross a page boundary and cause a page fault. */
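/* For example, if rSTR1 is 5 bytes past a DW boundary but rSTR2 only 2,
   the adjusted address rSTR2 - 5 falls in the doubleword before the one
   holding the start of rSTR2.  That doubleword contains no rSTR2 bytes and
   could sit on an unmapped page, so its load is skipped (blt cr5) and
   rWORD8 = 0 is used in its place.  */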
762 li rWORD8, 0
763 blt cr5, L(dus0)
764 #ifdef __LITTLE_ENDIAN__
765 ldbrx rWORD8, 0, rSTR2
766 addi rSTR2, rSTR2, 8
767 #else
768 ld rWORD8, 0(rSTR2)
769 addi rSTR2, rSTR2, 8
770 #endif
771 sld rWORD8, rWORD8, rSHL
772
773 L(dus0):
774 #ifdef __LITTLE_ENDIAN__
775 ldbrx rWORD1, 0, rSTR1
776 ldbrx rWORD2, 0, rSTR2
777 addi rSTR1, rSTR1, 8
778 addi rSTR2, rSTR2, 8
779 #else
780 ld rWORD1, 0(rSTR1)
781 ld rWORD2, 0(rSTR2)
782 #endif
783 cmpldi cr1, r12, 16
784 cmpldi cr7, rN, 32
785 srd r12, rWORD2, rSHR
786 clrldi rN, rN, 61
787 beq L(duPs4)
788 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
789 or rWORD8, r12, rWORD8
790 bgt cr1, L(duPs3)
791 beq cr1, L(duPs2)
792
793 /* Remainder is 8 */
794 .align 4
795 L(dusP1):
796 sld rWORD8_SHIFT, rWORD2, rSHL
797 sld rWORD7, rWORD1, rWORD6
798 sld rWORD8, rWORD8, rWORD6
799 bge cr7, L(duP1e)
800 /* At this point we exit early with the first double word compare
801 complete and remainder of 0 to 7 bytes. See L(du14) for details on
802 how we handle the remaining bytes. */
803 cmpld cr5, rWORD7, rWORD8
804 sldi. rN, rN, 3
805 bne cr5, L(duLcr5)
806 cmpld cr7, rN, rSHR
807 beq L(duZeroReturn)
808 li r0, 0
809 ble cr7, L(dutrim)
810 #ifdef __LITTLE_ENDIAN__
811 ldbrx rWORD2, 0, rSTR2
812 addi rSTR2, rSTR2, 8
813 #else
814 ld rWORD2, 8(rSTR2)
815 #endif
816 srd r0, rWORD2, rSHR
817 b L(dutrim)
818 /* Remainder is 16 */
819 .align 4
820 L(duPs2):
821 sld rWORD6_SHIFT, rWORD2, rSHL
822 sld rWORD5, rWORD1, rWORD6
823 sld rWORD6, rWORD8, rWORD6
824 b L(duP2e)
825 /* Remainder is 24 */
826 .align 4
827 L(duPs3):
828 sld rWORD4_SHIFT, rWORD2, rSHL
829 sld rWORD3, rWORD1, rWORD6
830 sld rWORD4, rWORD8, rWORD6
831 b L(duP3e)
832 /* Count is a multiple of 32, remainder is 0 */
833 .align 4
834 L(duPs4):
835 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
836 or rWORD8, r12, rWORD8
837 sld rWORD2_SHIFT, rWORD2, rSHL
838 sld rWORD1, rWORD1, rWORD6
839 sld rWORD2, rWORD8, rWORD6
840 b L(duP4e)
841
842 /* At this point we know rSTR1 is double word aligned and the
843 compare length is at least 8 bytes. */
844 .align 4
845 L(DWunaligned):
846 std rWORD8_SHIFT, -40(r1)
847 clrrdi rSTR2, rSTR2, 3
848 std rWORD2_SHIFT, -48(r1)
849 srdi r0, rN, 5 /* Divide by 32 */
850 std rWORD4_SHIFT, -56(r1)
851 andi. r12, rN, 24 /* Get the DW remainder */
852 std rWORD6_SHIFT, -64(r1)
853 cfi_offset(rWORD8_SHIFT, -40)
854 cfi_offset(rWORD2_SHIFT, -48)
855 cfi_offset(rWORD4_SHIFT, -56)
856 cfi_offset(rWORD6_SHIFT, -64)
857 sldi rSHL, rSHL, 3
858 #ifdef __LITTLE_ENDIAN__
859 ldbrx rWORD6, 0, rSTR2
860 addi rSTR2, rSTR2, 8
861 ldbrx rWORD8, 0, rSTR2
862 addi rSTR2, rSTR2, 8
863 #else
864 ld rWORD6, 0(rSTR2)
865 ldu rWORD8, 8(rSTR2)
866 #endif
867 cmpldi cr1, r12, 16
868 cmpldi cr7, rN, 32
869 clrldi rN, rN, 61
870 subfic rSHR, rSHL, 64
871 sld rWORD6_SHIFT, rWORD6, rSHL
872 beq L(duP4)
873 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
874 bgt cr1, L(duP3)
875 beq cr1, L(duP2)
876
877 /* Remainder is 8 */
878 .align 4
879 L(duP1):
880 srd r12, rWORD8, rSHR
881 #ifdef __LITTLE_ENDIAN__
882 ldbrx rWORD7, 0, rSTR1
883 addi rSTR1, rSTR1, 8
884 #else
885 ld rWORD7, 0(rSTR1)
886 #endif
887 sld rWORD8_SHIFT, rWORD8, rSHL
888 or rWORD8, r12, rWORD6_SHIFT
889 blt cr7, L(duP1x)
890 L(duP1e):
891 #ifdef __LITTLE_ENDIAN__
892 ldbrx rWORD1, 0, rSTR1
893 ldbrx rWORD2, 0, rSTR2
894 addi rSTR1, rSTR1, 8
895 addi rSTR2, rSTR2, 8
896 #else
897 ld rWORD1, 8(rSTR1)
898 ld rWORD2, 8(rSTR2)
899 #endif
900 cmpld cr5, rWORD7, rWORD8
901 srd r0, rWORD2, rSHR
902 sld rWORD2_SHIFT, rWORD2, rSHL
903 or rWORD2, r0, rWORD8_SHIFT
904 #ifdef __LITTLE_ENDIAN__
905 ldbrx rWORD3, 0, rSTR1
906 ldbrx rWORD4, 0, rSTR2
907 addi rSTR1, rSTR1, 8
908 addi rSTR2, rSTR2, 8
909 #else
910 ld rWORD3, 16(rSTR1)
911 ld rWORD4, 16(rSTR2)
912 #endif
913 cmpld cr7, rWORD1, rWORD2
914 srd r12, rWORD4, rSHR
915 sld rWORD4_SHIFT, rWORD4, rSHL
916 bne cr5, L(duLcr5)
917 or rWORD4, r12, rWORD2_SHIFT
918 #ifdef __LITTLE_ENDIAN__
919 ldbrx rWORD5, 0, rSTR1
920 ldbrx rWORD6, 0, rSTR2
921 addi rSTR1, rSTR1, 8
922 addi rSTR2, rSTR2, 8
923 #else
924 ld rWORD5, 24(rSTR1)
925 ld rWORD6, 24(rSTR2)
926 #endif
927 cmpld cr1, rWORD3, rWORD4
928 srd r0, rWORD6, rSHR
929 sld rWORD6_SHIFT, rWORD6, rSHL
930 bne cr7, L(duLcr7)
931 or rWORD6, r0, rWORD4_SHIFT
932 cmpld cr6, rWORD5, rWORD6
933 b L(duLoop3)
934 .align 4
935 /* At this point we exit early with the first double word compare
936 complete and remainder of 0 to 7 bytes. See L(du14) for details on
937 how we handle the remaining bytes. */
938 L(duP1x):
939 cmpld cr5, rWORD7, rWORD8
940 sldi. rN, rN, 3
941 bne cr5, L(duLcr5)
942 cmpld cr7, rN, rSHR
943 beq L(duZeroReturn)
944 li r0, 0
945 ble cr7, L(dutrim)
946 #ifdef __LITTLE_ENDIAN__
947 ldbrx rWORD2, 0, rSTR2
948 addi rSTR2, rSTR2, 8
949 #else
950 ld rWORD2, 8(rSTR2)
951 #endif
952 srd r0, rWORD2, rSHR
953 b L(dutrim)
954 /* Remainder is 16 */
955 .align 4
956 L(duP2):
957 srd r0, rWORD8, rSHR
958 #ifdef __LITTLE_ENDIAN__
959 ldbrx rWORD5, 0, rSTR1
960 addi rSTR1, rSTR1, 8
961 #else
962 ld rWORD5, 0(rSTR1)
963 #endif
964 or rWORD6, r0, rWORD6_SHIFT
965 sld rWORD6_SHIFT, rWORD8, rSHL
966 L(duP2e):
967 #ifdef __LITTLE_ENDIAN__
968 ldbrx rWORD7, 0, rSTR1
969 ldbrx rWORD8, 0, rSTR2
970 addi rSTR1, rSTR1, 8
971 addi rSTR2, rSTR2, 8
972 #else
973 ld rWORD7, 8(rSTR1)
974 ld rWORD8, 8(rSTR2)
975 #endif
976 cmpld cr6, rWORD5, rWORD6
977 srd r12, rWORD8, rSHR
978 sld rWORD8_SHIFT, rWORD8, rSHL
979 or rWORD8, r12, rWORD6_SHIFT
980 blt cr7, L(duP2x)
981 #ifdef __LITTLE_ENDIAN__
982 ldbrx rWORD1, 0, rSTR1
983 ldbrx rWORD2, 0, rSTR2
984 addi rSTR1, rSTR1, 8
985 addi rSTR2, rSTR2, 8
986 #else
987 ld rWORD1, 16(rSTR1)
988 ld rWORD2, 16(rSTR2)
989 #endif
990 cmpld cr5, rWORD7, rWORD8
991 bne cr6, L(duLcr6)
992 srd r0, rWORD2, rSHR
993 sld rWORD2_SHIFT, rWORD2, rSHL
994 or rWORD2, r0, rWORD8_SHIFT
995 #ifdef __LITTLE_ENDIAN__
996 ldbrx rWORD3, 0, rSTR1
997 ldbrx rWORD4, 0, rSTR2
998 addi rSTR1, rSTR1, 8
999 addi rSTR2, rSTR2, 8
1000 #else
1001 ld rWORD3, 24(rSTR1)
1002 ld rWORD4, 24(rSTR2)
1003 #endif
1004 cmpld cr7, rWORD1, rWORD2
1005 bne cr5, L(duLcr5)
1006 srd r12, rWORD4, rSHR
1007 sld rWORD4_SHIFT, rWORD4, rSHL
1008 or rWORD4, r12, rWORD2_SHIFT
1009 #ifndef __LITTLE_ENDIAN__
1010 addi rSTR1, rSTR1, 8
1011 addi rSTR2, rSTR2, 8
1012 #endif
1013 cmpld cr1, rWORD3, rWORD4
1014 b L(duLoop2)
1015 .align 4
1016 L(duP2x):
1017 cmpld cr5, rWORD7, rWORD8
1018 #ifndef __LITTLE_ENDIAN__
1019 addi rSTR1, rSTR1, 8
1020 addi rSTR2, rSTR2, 8
1021 #endif
1022 bne cr6, L(duLcr6)
1023 sldi. rN, rN, 3
1024 bne cr5, L(duLcr5)
1025 cmpld cr7, rN, rSHR
1026 beq L(duZeroReturn)
1027 li r0, 0
1028 ble cr7, L(dutrim)
1029 #ifdef __LITTLE_ENDIAN__
1030 ldbrx rWORD2, 0, rSTR2
1031 addi rSTR2, rSTR2, 8
1032 #else
1033 ld rWORD2, 8(rSTR2)
1034 #endif
1035 srd r0, rWORD2, rSHR
1036 b L(dutrim)
1037
1038 /* Remainder is 24 */
1039 .align 4
1040 L(duP3):
1041 srd r12, rWORD8, rSHR
1042 #ifdef __LITTLE_ENDIAN__
1043 ldbrx rWORD3, 0, rSTR1
1044 addi rSTR1, rSTR1, 8
1045 #else
1046 ld rWORD3, 0(rSTR1)
1047 #endif
1048 sld rWORD4_SHIFT, rWORD8, rSHL
1049 or rWORD4, r12, rWORD6_SHIFT
1050 L(duP3e):
1051 #ifdef __LITTLE_ENDIAN__
1052 ldbrx rWORD5, 0, rSTR1
1053 ldbrx rWORD6, 0, rSTR2
1054 addi rSTR1, rSTR1, 8
1055 addi rSTR2, rSTR2, 8
1056 #else
1057 ld rWORD5, 8(rSTR1)
1058 ld rWORD6, 8(rSTR2)
1059 #endif
1060 cmpld cr1, rWORD3, rWORD4
1061 srd r0, rWORD6, rSHR
1062 sld rWORD6_SHIFT, rWORD6, rSHL
1063 or rWORD6, r0, rWORD4_SHIFT
1064 #ifdef __LITTLE_ENDIAN__
1065 ldbrx rWORD7, 0, rSTR1
1066 ldbrx rWORD8, 0, rSTR2
1067 addi rSTR1, rSTR1, 8
1068 addi rSTR2, rSTR2, 8
1069 #else
1070 ld rWORD7, 16(rSTR1)
1071 ld rWORD8, 16(rSTR2)
1072 #endif
1073 cmpld cr6, rWORD5, rWORD6
1074 bne cr1, L(duLcr1)
1075 srd r12, rWORD8, rSHR
1076 sld rWORD8_SHIFT, rWORD8, rSHL
1077 or rWORD8, r12, rWORD6_SHIFT
1078 blt cr7, L(duP3x)
1079 #ifdef __LITTLE_ENDIAN__
1080 ldbrx rWORD1, 0, rSTR1
1081 ldbrx rWORD2, 0, rSTR2
1082 addi rSTR1, rSTR1, 8
1083 addi rSTR2, rSTR2, 8
1084 #else
1085 ld rWORD1, 24(rSTR1)
1086 ld rWORD2, 24(rSTR2)
1087 #endif
1088 cmpld cr5, rWORD7, rWORD8
1089 bne cr6, L(duLcr6)
1090 srd r0, rWORD2, rSHR
1091 sld rWORD2_SHIFT, rWORD2, rSHL
1092 or rWORD2, r0, rWORD8_SHIFT
1093 #ifndef __LITTLE_ENDIAN__
1094 addi rSTR1, rSTR1, 16
1095 addi rSTR2, rSTR2, 16
1096 #endif
1097 cmpld cr7, rWORD1, rWORD2
1098 b L(duLoop1)
1099 .align 4
1100 L(duP3x):
1101 #ifndef __LITTLE_ENDIAN__
1102 addi rSTR1, rSTR1, 16
1103 addi rSTR2, rSTR2, 16
1104 #endif
1105 #if 0
1106 /* Huh? We've already branched on cr1! */
1107 bne cr1, L(duLcr1)
1108 #endif
1109 cmpld cr5, rWORD7, rWORD8
1110 bne cr6, L(duLcr6)
1111 sldi. rN, rN, 3
1112 bne cr5, L(duLcr5)
1113 cmpld cr7, rN, rSHR
1114 beq L(duZeroReturn)
1115 li r0, 0
1116 ble cr7, L(dutrim)
1117 #ifdef __LITTLE_ENDIAN__
1118 ldbrx rWORD2, 0, rSTR2
1119 addi rSTR2, rSTR2, 8
1120 #else
1121 ld rWORD2, 8(rSTR2)
1122 #endif
1123 srd r0, rWORD2, rSHR
1124 b L(dutrim)
1125
1126 /* Count is a multiple of 32, remainder is 0 */
1127 .align 4
1128 L(duP4):
1129 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
1130 srd r0, rWORD8, rSHR
1131 #ifdef __LITTLE_ENDIAN__
1132 ldbrx rWORD1, 0, rSTR1
1133 addi rSTR1, rSTR1, 8
1134 #else
1135 ld rWORD1, 0(rSTR1)
1136 #endif
1137 sld rWORD2_SHIFT, rWORD8, rSHL
1138 or rWORD2, r0, rWORD6_SHIFT
1139 L(duP4e):
1140 #ifdef __LITTLE_ENDIAN__
1141 ldbrx rWORD3, 0, rSTR1
1142 ldbrx rWORD4, 0, rSTR2
1143 addi rSTR1, rSTR1, 8
1144 addi rSTR2, rSTR2, 8
1145 #else
1146 ld rWORD3, 8(rSTR1)
1147 ld rWORD4, 8(rSTR2)
1148 #endif
1149 cmpld cr7, rWORD1, rWORD2
1150 srd r12, rWORD4, rSHR
1151 sld rWORD4_SHIFT, rWORD4, rSHL
1152 or rWORD4, r12, rWORD2_SHIFT
1153 #ifdef __LITTLE_ENDIAN__
1154 ldbrx rWORD5, 0, rSTR1
1155 ldbrx rWORD6, 0, rSTR2
1156 addi rSTR1, rSTR1, 8
1157 addi rSTR2, rSTR2, 8
1158 #else
1159 ld rWORD5, 16(rSTR1)
1160 ld rWORD6, 16(rSTR2)
1161 #endif
1162 cmpld cr1, rWORD3, rWORD4
1163 bne cr7, L(duLcr7)
1164 srd r0, rWORD6, rSHR
1165 sld rWORD6_SHIFT, rWORD6, rSHL
1166 or rWORD6, r0, rWORD4_SHIFT
1167 #ifdef __LITTLE_ENDIAN__
1168 ldbrx rWORD7, 0, rSTR1
1169 ldbrx rWORD8, 0, rSTR2
1170 addi rSTR1, rSTR1, 8
1171 addi rSTR2, rSTR2, 8
1172 #else
1173 ldu rWORD7, 24(rSTR1)
1174 ldu rWORD8, 24(rSTR2)
1175 #endif
1176 cmpld cr6, rWORD5, rWORD6
1177 bne cr1, L(duLcr1)
1178 srd r12, rWORD8, rSHR
1179 sld rWORD8_SHIFT, rWORD8, rSHL
1180 or rWORD8, r12, rWORD6_SHIFT
1181 cmpld cr5, rWORD7, rWORD8
1182 bdz- L(du24) /* Adjust CTR as we start with +4 */
1183 /* This is the primary loop */
1184 .align 4
1185 L(duLoop):
1186 #ifdef __LITTLE_ENDIAN__
1187 ldbrx rWORD1, 0, rSTR1
1188 ldbrx rWORD2, 0, rSTR2
1189 addi rSTR1, rSTR1, 8
1190 addi rSTR2, rSTR2, 8
1191 #else
1192 ld rWORD1, 8(rSTR1)
1193 ld rWORD2, 8(rSTR2)
1194 #endif
1195 cmpld cr1, rWORD3, rWORD4
1196 bne cr6, L(duLcr6)
1197 srd r0, rWORD2, rSHR
1198 sld rWORD2_SHIFT, rWORD2, rSHL
1199 or rWORD2, r0, rWORD8_SHIFT
1200 L(duLoop1):
1201 #ifdef __LITTLE_ENDIAN__
1202 ldbrx rWORD3, 0, rSTR1
1203 ldbrx rWORD4, 0, rSTR2
1204 addi rSTR1, rSTR1, 8
1205 addi rSTR2, rSTR2, 8
1206 #else
1207 ld rWORD3, 16(rSTR1)
1208 ld rWORD4, 16(rSTR2)
1209 #endif
1210 cmpld cr6, rWORD5, rWORD6
1211 bne cr5, L(duLcr5)
1212 srd r12, rWORD4, rSHR
1213 sld rWORD4_SHIFT, rWORD4, rSHL
1214 or rWORD4, r12, rWORD2_SHIFT
1215 L(duLoop2):
1216 #ifdef __LITTLE_ENDIAN__
1217 ldbrx rWORD5, 0, rSTR1
1218 ldbrx rWORD6, 0, rSTR2
1219 addi rSTR1, rSTR1, 8
1220 addi rSTR2, rSTR2, 8
1221 #else
1222 ld rWORD5, 24(rSTR1)
1223 ld rWORD6, 24(rSTR2)
1224 #endif
1225 cmpld cr5, rWORD7, rWORD8
1226 bne cr7, L(duLcr7)
1227 srd r0, rWORD6, rSHR
1228 sld rWORD6_SHIFT, rWORD6, rSHL
1229 or rWORD6, r0, rWORD4_SHIFT
1230 L(duLoop3):
1231 #ifdef __LITTLE_ENDIAN__
1232 ldbrx rWORD7, 0, rSTR1
1233 ldbrx rWORD8, 0, rSTR2
1234 addi rSTR1, rSTR1, 8
1235 addi rSTR2, rSTR2, 8
1236 #else
1237 ldu rWORD7, 32(rSTR1)
1238 ldu rWORD8, 32(rSTR2)
1239 #endif
1240 cmpld cr7, rWORD1, rWORD2
1241 bne- cr1, L(duLcr1)
1242 srd r12, rWORD8, rSHR
1243 sld rWORD8_SHIFT, rWORD8, rSHL
1244 or rWORD8, r12, rWORD6_SHIFT
1245 bdnz+ L(duLoop)
1246
1247 L(duL4):
1248 #if 0
1249 /* Huh? We've already branched on cr1! */
1250 bne cr1, L(duLcr1)
1251 #endif
1252 cmpld cr1, rWORD3, rWORD4
1253 bne cr6, L(duLcr6)
1254 cmpld cr6, rWORD5, rWORD6
1255 bne cr5, L(duLcr5)
1256 cmpld cr5, rWORD7, rWORD8
1257 L(du44):
1258 bne cr7, L(duLcr7)
1259 L(du34):
1260 bne cr1, L(duLcr1)
1261 L(du24):
1262 bne cr6, L(duLcr6)
1263 L(du14):
1264 sldi. rN, rN, 3
1265 bne cr5, L(duLcr5)
1266 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1267 shift right double to eliminate bits beyond the compare length.
1268
1269 However it may not be safe to load rWORD2 which may be beyond the
1270 string length. So we compare the bit length of the remainder to
1271 the right shift count (rSHR). If the bit count is less than or equal
1272 we do not need to load rWORD2 (all significant bits are already in
1273 rWORD8_SHIFT). */
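/* For example, if rSTR2 started 3 bytes into its doubleword then rSHR = 40,
   meaning 5 bytes of rSTR2 are already waiting in rWORD8_SHIFT; with 4
   bytes (rN = 32 bits) left to compare, rN <= rSHR and no further load of
   rSTR2 is needed.  Only when more bits remain than rSHR provides is the
   next doubleword of rSTR2 loaded.  */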
1274 cmpld cr7, rN, rSHR
1275 beq L(duZeroReturn)
1276 li r0, 0
1277 ble cr7, L(dutrim)
1278 #ifdef __LITTLE_ENDIAN__
1279 ldbrx rWORD2, 0, rSTR2
1280 addi rSTR2, rSTR2, 8
1281 #else
1282 ld rWORD2, 8(rSTR2)
1283 #endif
1284 srd r0, rWORD2, rSHR
1285 .align 4
1286 L(dutrim):
1287 #ifdef __LITTLE_ENDIAN__
1288 ldbrx rWORD1, 0, rSTR1
1289 #else
1290 ld rWORD1, 8(rSTR1)
1291 #endif
1292 ld rWORD8, -8(r1)
1293 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1294 or rWORD2, r0, rWORD8_SHIFT
1295 ld rWORD7, -16(r1)
1296 ld rSHL, -24(r1)
1297 srd rWORD1, rWORD1, rN
1298 srd rWORD2, rWORD2, rN
1299 ld rSHR, -32(r1)
1300 ld rWORD8_SHIFT, -40(r1)
1301 li rRTN, 0
1302 cmpld cr7, rWORD1, rWORD2
1303 ld rWORD2_SHIFT, -48(r1)
1304 ld rWORD4_SHIFT, -56(r1)
1305 beq cr7, L(dureturn24)
1306 li rRTN, 1
1307 ld rWORD6_SHIFT, -64(r1)
1308 bgtlr cr7
1309 li rRTN, -1
1310 blr
1311 .align 4
1312 L(duLcr7):
1313 ld rWORD8, -8(r1)
1314 ld rWORD7, -16(r1)
1315 li rRTN, 1
1316 bgt cr7, L(dureturn29)
1317 ld rSHL, -24(r1)
1318 ld rSHR, -32(r1)
1319 li rRTN, -1
1320 b L(dureturn27)
1321 .align 4
1322 L(duLcr1):
1323 ld rWORD8, -8(r1)
1324 ld rWORD7, -16(r1)
1325 li rRTN, 1
1326 bgt cr1, L(dureturn29)
1327 ld rSHL, -24(r1)
1328 ld rSHR, -32(r1)
1329 li rRTN, -1
1330 b L(dureturn27)
1331 .align 4
1332 L(duLcr6):
1333 ld rWORD8, -8(r1)
1334 ld rWORD7, -16(r1)
1335 li rRTN, 1
1336 bgt cr6, L(dureturn29)
1337 ld rSHL, -24(r1)
1338 ld rSHR, -32(r1)
1339 li rRTN, -1
1340 b L(dureturn27)
1341 .align 4
1342 L(duLcr5):
1343 ld rWORD8, -8(r1)
1344 ld rWORD7, -16(r1)
1345 li rRTN, 1
1346 bgt cr5, L(dureturn29)
1347 ld rSHL, -24(r1)
1348 ld rSHR, -32(r1)
1349 li rRTN, -1
1350 b L(dureturn27)
1351 .align 3
1352 L(duZeroReturn):
1353 li rRTN, 0
1354 .align 4
1355 L(dureturn):
1356 ld rWORD8, -8(r1)
1357 ld rWORD7, -16(r1)
1358 L(dureturn29):
1359 ld rSHL, -24(r1)
1360 ld rSHR, -32(r1)
1361 L(dureturn27):
1362 ld rWORD8_SHIFT, -40(r1)
1363 L(dureturn26):
1364 ld rWORD2_SHIFT, -48(r1)
1365 L(dureturn25):
1366 ld rWORD4_SHIFT, -56(r1)
1367 L(dureturn24):
1368 ld rWORD6_SHIFT, -64(r1)
1369 blr
1370 L(duzeroLength):
1371 li rRTN, 0
1372 blr
1373
1374 END (MEMCMP)
1375 libc_hidden_builtin_def (memcmp)
1376 weak_alias (memcmp, bcmp)