1 /* Optimized memcmp implementation for POWER8/PowerPC64.
2 Copyright (C) 2010-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* int [r3] memcmp (const char *s1 [r3],
22 const char *s2 [r4],
23 size_t size [r5]) */
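/* For reference, the semantics this file implements, shown as a minimal
   byte-at-a-time C sketch (illustration only, not part of the build):
   bytes are compared as unsigned char and the sign of the first
   difference is returned.

     int
     memcmp (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       while (n-- != 0)
         {
           if (*p1 != *p2)
             return *p1 - *p2;
           p1++, p2++;
         }
       return 0;
     }  */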
24
25 /* TODO: change these to the actual instructions when the minimum required
26 binutils allows it. */
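/* MFVRD(r, v) hand-encodes the POWER8 mfvrd instruction (move doubleword
   0 of vector register v into GPR r) as a raw .long, so that assemblers
   which predate the mnemonic still accept this file.  */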
27 #define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
28 #ifndef MEMCMP
29 # define MEMCMP memcmp
30 #endif
31 .machine power7
32 ENTRY_TOCLESS (MEMCMP, 4)
33 CALL_MCOUNT 3
34
35 #define rRTN r3
36 #define rSTR1 r3 /* First string arg. */
37 #define rSTR2 r4 /* Second string arg. */
38 #define rN r5 /* Max string length. */
39 #define rWORD1 r6 /* Current word in s1. */
40 #define rWORD2 r7 /* Current word in s2. */
41 #define rWORD3 r8 /* Next word in s1. */
42 #define rWORD4 r9 /* Next word in s2. */
43 #define rWORD5 r10 /* Next word in s1. */
44 #define rWORD6 r11 /* Next word in s2. */
45
46 #define rOFF8 r20 /* 8 bytes offset. */
47 #define rOFF16 r21 /* 16 bytes offset. */
48 #define rOFF24 r22 /* 24 bytes offset. */
49 #define rOFF32 r23 /* 32 bytes offset. */
50 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
51 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
52 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
53 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
54 #define rSHR r28 /* Unaligned shift right count. */
55 #define rSHL r29 /* Unaligned shift left count. */
56 #define rWORD7 r30 /* Next word in s1. */
57 #define rWORD8 r31 /* Next word in s2. */
58
59 #define rWORD8SAVE (-8)
60 #define rWORD7SAVE (-16)
61 #define rOFF8SAVE (-24)
62 #define rOFF16SAVE (-32)
63 #define rOFF24SAVE (-40)
64 #define rOFF32SAVE (-48)
65 #define rSHRSAVE (-56)
66 #define rSHLSAVE (-64)
67 #define rWORD8SHIFTSAVE (-72)
68 #define rWORD2SHIFTSAVE (-80)
69 #define rWORD4SHIFTSAVE (-88)
70 #define rWORD6SHIFTSAVE (-96)
71
72 #ifdef __LITTLE_ENDIAN__
73 # define LD ldbrx
74 #else
75 # define LD ldx
76 #endif
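/* On little-endian, ldbrx loads each double word byte-reversed, so the
   byte at the lowest address ends up most significant, just as a plain
   ldx does on big-endian.  Either way LD yields values whose unsigned
   comparison order matches memcmp's byte order, which is what the cmpld
   instructions below rely on.  */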
77
78 xor r10, rSTR2, rSTR1
79 cmpldi cr6, rN, 0
80 cmpldi cr1, rN, 8
81 clrldi. r0, r10, 61
82 clrldi r12, rSTR1, 61
83 cmpldi cr5, r12, 0
84 beq- cr6, L(zeroLength)
85 dcbt 0, rSTR1
86 dcbt 0, rSTR2
87 /* If less than 8 bytes or not aligned, use the unaligned
88 byte loop. */
89 blt cr1, L(bytealigned)
90 bne L(unalignedqw)
91 /* At this point we know both strings have the same alignment and the
92 compare length is at least 8 bytes. r12 contains the low order
93 3 bits of rSTR1 and cr5 contains the result of the logical compare
94 of r12 to 0. If r12 == 0 then we are already double word
95 aligned and can perform the DW aligned loop. */
96
97 .align 4
98 L(samealignment):
99 or r11, rSTR2, rSTR1
100 clrldi. r11, r11, 60
101 beq L(qw_align)
102 /* Try to align to QW else proceed to DW loop. */
103 clrldi. r10, r10, 60
104 bne L(DW)
105 /* For the difference to reach QW alignment, load as DW. */
106 clrrdi rSTR1, rSTR1, 3
107 clrrdi rSTR2, rSTR2, 3
108 subfic r10, r12, 8
109 LD rWORD1, 0, rSTR1
110 LD rWORD2, 0, rSTR2
111 sldi r9, r10, 3
112 subfic r9, r9, 64
113 sld rWORD1, rWORD1, r9
114 sld rWORD2, rWORD2, r9
115 cmpld cr6, rWORD1, rWORD2
116 addi rSTR1, rSTR1, 8
117 addi rSTR2, rSTR2, 8
118 bne cr6, L(ret_diff)
119 subf rN, r10, rN
120
121 cmpld cr6, r11, r12
122 bgt cr6, L(qw_align)
123 LD rWORD1, 0, rSTR1
124 LD rWORD2, 0, rSTR2
125 cmpld cr6, rWORD1, rWORD2
126 addi rSTR1, rSTR1, 8
127 addi rSTR2, rSTR2, 8
128 bne cr6, L(different)
129 cmpldi cr6, rN, 8
130 ble cr6, L(zeroLength)
131 addi rN, rN, -8
132 /* Now both rSTR1 and rSTR2 are aligned to QW. */
133 .align 4
134 L(qw_align):
135 vspltisb v0, 0
136 srdi. r6, rN, 6
137 li r8, 16
138 li r10, 32
139 li r11, 48
140 ble cr0, L(lessthan64)
141 mtctr r6
142 vspltisb v8, 0
143 vspltisb v6, 0
144 /* Aligned vector loop. */
145 .align 4
146 L(aligned_loop):
147 lvx v4, 0, rSTR1
148 lvx v5, 0, rSTR2
149 vcmpequb. v7, v6, v8
150 bnl cr6, L(different3)
151 lvx v6, rSTR1, r8
152 lvx v8, rSTR2, r8
153 vcmpequb. v7, v5, v4
154 bnl cr6, L(different2)
155 lvx v4, rSTR1, r10
156 lvx v5, rSTR2, r10
157 vcmpequb. v7, v6, v8
158 bnl cr6, L(different3)
159 lvx v6, rSTR1, r11
160 lvx v8, rSTR2, r11
161 vcmpequb. v7, v5, v4
162 bnl cr6, L(different2)
163 addi rSTR1, rSTR1, 64
164 addi rSTR2, rSTR2, 64
165 bdnz L(aligned_loop)
166 vcmpequb. v7, v6, v8
167 bnl cr6, L(different3)
168 clrldi rN, rN, 58
169 /* Handle remainder for aligned loop. */
170 .align 4
171 L(lessthan64):
172 mr r9, rSTR1
173 cmpdi cr6, rN, 0
174 li rSTR1, 0
175 blelr cr6
176 lvx v4, 0, r9
177 lvx v5, 0, rSTR2
178 vcmpequb. v7, v5, v4
179 bnl cr6, L(different1)
180 addi rN, rN, -16
181
182 cmpdi cr6, rN, 0
183 blelr cr6
184 lvx v4, r9, r8
185 lvx v5, rSTR2, r8
186 vcmpequb. v7, v5, v4
187 bnl cr6, L(different1)
188 addi rN, rN, -16
189
190 cmpdi cr6, rN, 0
191 blelr cr6
192 lvx v4, r9, r10
193 lvx v5, rSTR2, r10
194 vcmpequb. v7, v5, v4
195 bnl cr6, L(different1)
196 addi rN, rN, -16
197
198 cmpdi cr6, rN, 0
199 blelr cr6
200 lvx v4, r9, r11
201 lvx v5, rSTR2, r11
202 vcmpequb. v7, v5, v4
203 bnl cr6, L(different1)
204 blr
205
206 /* Calculate and return the difference. */
207 .align 4
208 L(different1):
209 cmpdi cr6, rN, 16
210 bge cr6, L(different2)
211 /* Discard unwanted bytes. */
212 #ifdef __LITTLE_ENDIAN__
213 lvsr v1, 0, rN
214 vperm v4, v4, v0, v1
215 vperm v5, v5, v0, v1
216 #else
217 lvsl v1, 0, rN
218 vperm v4, v0, v4, v1
219 vperm v5, v0, v5, v1
220 #endif
221 vcmpequb. v7, v4, v5
222 li rRTN, 0
223 bltlr cr6
224 .align 4
225 L(different2):
226 #ifdef __LITTLE_ENDIAN__
227 /* Reverse bytes for direct comparison. */
228 lvsl v10, r0, r0
229 vspltisb v8, 15
230 vsububm v9, v8, v10
231 vperm v4, v4, v0, v9
232 vperm v5, v5, v0, v9
233 #endif
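/* In the little-endian block above r0 is zero, so lvsl yields the permute
   control 0,1,...,15 and the vsububm turns it into 15,14,...,0; the vperms
   therefore reverse the 16 bytes of v4 and v5 so that the MFVRD/cmpld
   sequence below compares the data in memcmp (lowest address first)
   order.  */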
234 MFVRD(r7, v4)
235 MFVRD(r9, v5)
236 cmpld cr6, r7, r9
237 bne cr6, L(ret_diff)
238 /* Difference in second DW. */
239 vsldoi v4, v4, v4, 8
240 vsldoi v5, v5, v5, 8
241 MFVRD(r7, v4)
242 MFVRD(r9, v5)
243 cmpld cr6, r7, r9
244 L(ret_diff):
245 li rRTN, 1
246 bgtlr cr6
247 li rRTN, -1
248 blr
249 .align 4
250 L(different3):
251 #ifdef __LITTLE_ENDIAN__
252 /* Reverse bytes for direct comparison. */
253 vspltisb v9, 15
254 lvsl v10, r0, r0
255 vsububm v9, v9, v10
256 vperm v6, v6, v0, v9
257 vperm v8, v8, v0, v9
258 #endif
259 MFVRD(r7, v6)
260 MFVRD(r9, v8)
261 cmpld cr6, r7, r9
262 bne cr6, L(ret_diff)
263 /* Difference in second DW. */
264 vsldoi v6, v6, v6, 8
265 vsldoi v8, v8, v8, 8
266 MFVRD(r7, v6)
267 MFVRD(r9, v8)
268 cmpld cr6, r7, r9
269 li rRTN, 1
270 bgtlr cr6
271 li rRTN, -1
272 blr
273
274 .align 4
275 L(different):
276 cmpldi cr7, rN, 8
277 bgt cr7, L(end)
278 /* Skip unwanted bytes. */
279 sldi r8, rN, 3
280 subfic r8, r8, 64
281 srd rWORD1, rWORD1, r8
282 srd rWORD2, rWORD2, r8
283 cmpld cr6, rWORD1, rWORD2
284 li rRTN, 0
285 beqlr cr6
286 L(end):
287 li rRTN, 1
288 bgtlr cr6
289 li rRTN, -1
290 blr
291
292 .align 4
293 L(unalignedqw):
294 /* Proceed to the DW unaligned loop if there is a chance of a page cross. */
295 rldicl r9, rSTR1, 0, 52
296 add r9, r9, rN
297 cmpldi cr0, r9, 4096-16
298 bgt cr0, L(unaligned)
299 rldicl r9, rSTR2, 0, 52
300 add r9, r9, rN
301 cmpldi cr0, r9, 4096-16
302 bgt cr0, L(unaligned)
303 li r0, 0
304 li r8, 16
305 vspltisb v0, 0
306 /* Check if rSTR1 is aligned to QW. */
307 andi. r11, rSTR1, 0xF
308 beq L(s1_align)
309
310 /* Compare 16B and align S1 to QW. */
311 #ifdef __LITTLE_ENDIAN__
312 lvsr v10, 0, rSTR1 /* Compute mask. */
313 lvsr v6, 0, rSTR2 /* Compute mask. */
314 #else
315 lvsl v10, 0, rSTR1 /* Compute mask. */
316 lvsl v6, 0, rSTR2 /* Compute mask. */
317 #endif
318 lvx v5, 0, rSTR2
319 lvx v9, rSTR2, r8
320 #ifdef __LITTLE_ENDIAN__
321 vperm v5, v9, v5, v6
322 #else
323 vperm v5, v5, v9, v6
324 #endif
325 lvx v4, 0, rSTR1
326 lvx v9, rSTR1, r8
327 #ifdef __LITTLE_ENDIAN__
328 vperm v4, v9, v4, v10
329 #else
330 vperm v4, v4, v9, v10
331 #endif
332 vcmpequb. v7, v5, v4
333 bnl cr6, L(different1)
334 cmpldi cr6, rN, 16
335 ble cr6, L(zeroLength)
336 subfic r11, r11, 16
337 subf rN, r11, rN
338 add rSTR1, rSTR1, r11
339 add rSTR2, rSTR2, r11
340
341 /* As s1 is QW aligned, prepare for the unaligned loop. */
342 .align 4
343 L(s1_align):
344 #ifdef __LITTLE_ENDIAN__
345 lvsr v6, 0, rSTR2
346 #else
347 lvsl v6, 0, rSTR2
348 #endif
349 lvx v5, 0, rSTR2
350 srdi. r6, rN, 6
351 li r10, 32
352 li r11, 48
353 ble cr0, L(lessthan64_unalign)
354 mtctr r6
355 li r9, 64
356 /* Unaligned vector loop. */
357 .align 4
358 L(unalign_qwloop):
359 lvx v4, 0, rSTR1
360 lvx v10, rSTR2, r8
361 #ifdef __LITTLE_ENDIAN__
362 vperm v5, v10, v5, v6
363 #else
364 vperm v5, v5, v10, v6
365 #endif
366 vcmpequb. v7, v5, v4
367 bnl cr6, L(different2)
368 vor v5, v10, v10
369 lvx v4, rSTR1, r8
370 lvx v10, rSTR2, r10
371 #ifdef __LITTLE_ENDIAN__
372 vperm v5, v10, v5, v6
373 #else
374 vperm v5, v5, v10, v6
375 #endif
376 vcmpequb. v7, v5, v4
377 bnl cr6, L(different2)
378 vor v5, v10, v10
379 lvx v4, rSTR1, r10
380 lvx v10, rSTR2, r11
381 #ifdef __LITTLE_ENDIAN__
382 vperm v5, v10, v5, v6
383 #else
384 vperm v5, v5, v10, v6
385 #endif
386 vcmpequb. v7, v5, v4
387 bnl cr6, L(different2)
388 vor v5, v10, v10
389 lvx v4, rSTR1, r11
390 lvx v10, rSTR2, r9
391 #ifdef __LITTLE_ENDIAN__
392 vperm v5, v10, v5, v6
393 #else
394 vperm v5, v5, v10, v6
395 #endif
396 vcmpequb. v7, v5, v4
397 bnl cr6, L(different2)
398 vor v5, v10, v10
399 addi rSTR1, rSTR1, 64
400 addi rSTR2, rSTR2, 64
401 bdnz L(unalign_qwloop)
402 clrldi rN, rN, 58
403 /* Handle remainder for unaligned loop. */
404 .align 4
405 L(lessthan64_unalign):
406 mr r9, rSTR1
407 cmpdi cr6, rN, 0
408 li rSTR1, 0
409 blelr cr6
410 lvx v4, 0, r9
411 lvx v10, rSTR2, r8
412 #ifdef __LITTLE_ENDIAN__
413 vperm v5, v10, v5, v6
414 #else
415 vperm v5, v5, v10, v6
416 #endif
417 vcmpequb. v7, v5, v4
418 bnl cr6, L(different1)
419 vor v5, v10, v10
420 addi rN, rN, -16
421
422 cmpdi cr6, rN, 0
423 blelr cr6
424 lvx v4, r9, r8
425 lvx v10, rSTR2, r10
426 #ifdef __LITTLE_ENDIAN__
427 vperm v5, v10, v5, v6
428 #else
429 vperm v5, v5, v10, v6
430 #endif
431 vcmpequb. v7, v5, v4
432 bnl cr6, L(different1)
433 vor v5, v10, v10
434 addi rN, rN, -16
435
436 cmpdi cr6, rN, 0
437 blelr cr6
438 lvx v4, r9, r10
439 lvx v10, rSTR2, r11
440 #ifdef __LITTLE_ENDIAN__
441 vperm v5, v10, v5, v6
442 #else
443 vperm v5, v5, v10, v6
444 #endif
445 vcmpequb. v7, v5, v4
446 bnl cr6, L(different1)
447 vor v5, v10, v10
448 addi rN, rN, -16
449
450 cmpdi cr6, rN, 0
451 blelr cr6
452 lvx v4, r9, r11
453 addi r11, r11, 16
454 lvx v10, rSTR2, r11
455 #ifdef __LITTLE_ENDIAN__
456 vperm v5, v10, v5, v6
457 #else
458 vperm v5, v5, v10, v6
459 #endif
460 vcmpequb. v7, v5, v4
461 bnl cr6, L(different1)
462 blr
463
464 /* Otherwise we know the two strings have the same alignment (but not
465 yet DW). So we force the string addresses to the next lower DW
466 boundary and special case this first DW using shift left to
467 eliminate bits preceding the first byte. Since we want to join the
468 normal (DW aligned) compare loop, starting at the second double word,
469 we need to adjust the length (rN) and special case the loop
470 versioning for the first DW. This ensures that the loop count is
471 correct and the first DW (shifted) is in the expected register pair. */
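/* For example (illustrative numbers only): if both strings start 3 bytes
   into a double word, r12 = 3 and rWORD6 = 24.  The first LD then picks up
   3 leading bytes that are not part of the strings, the sld by rWORD6
   discards exactly those 24 high-order bits before the first compare, and
   rN is increased by 3 so the loop count still comes out right.  */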
472 .align 4
473 L(DW):
474 std rWORD8, rWORD8SAVE(r1)
475 std rWORD7, rWORD7SAVE(r1)
476 std rOFF8, rOFF8SAVE(r1)
477 std rOFF16, rOFF16SAVE(r1)
478 std rOFF24, rOFF24SAVE(r1)
479 std rOFF32, rOFF32SAVE(r1)
480 cfi_offset(rWORD8, rWORD8SAVE)
481 cfi_offset(rWORD7, rWORD7SAVE)
482 cfi_offset(rOFF8, rOFF8SAVE)
483 cfi_offset(rOFF16, rOFF16SAVE)
484 cfi_offset(rOFF24, rOFF24SAVE)
485 cfi_offset(rOFF32, rOFF32SAVE)
486
487 li rOFF8,8
488 li rOFF16,16
489 li rOFF24,24
490 li rOFF32,32
491 clrrdi rSTR1, rSTR1, 3
492 clrrdi rSTR2, rSTR2, 3
493 beq cr5, L(DWaligned)
494 add rN, rN, r12
495 sldi rWORD6, r12, 3
496 srdi r0, rN, 5 /* Divide by 32. */
497 andi. r12, rN, 24 /* Get the DW remainder. */
498 LD rWORD1, 0, rSTR1
499 LD rWORD2, 0, rSTR2
500 cmpldi cr1, r12, 16
501 cmpldi cr7, rN, 32
502 clrldi rN, rN, 61
503 beq L(dPs4)
504 mtctr r0
505 bgt cr1, L(dPs3)
506 beq cr1, L(dPs2)
507
508 /* Remainder is 8. */
509 .align 3
510 L(dsP1):
511 sld rWORD5, rWORD1, rWORD6
512 sld rWORD6, rWORD2, rWORD6
513 cmpld cr5, rWORD5, rWORD6
514 blt cr7, L(dP1x)
515 /* Do something useful in this cycle since we have to branch anyway. */
516 LD rWORD1, rOFF8, rSTR1
517 LD rWORD2, rOFF8, rSTR2
518 cmpld cr7, rWORD1, rWORD2
519 b L(dP1e)
520 /* Remainder is 16. */
521 .align 4
522 L(dPs2):
523 sld rWORD5, rWORD1, rWORD6
524 sld rWORD6, rWORD2, rWORD6
525 cmpld cr6, rWORD5, rWORD6
526 blt cr7, L(dP2x)
527 /* Do something useful in this cycle since we have to branch anyway. */
528 LD rWORD7, rOFF8, rSTR1
529 LD rWORD8, rOFF8, rSTR2
530 cmpld cr5, rWORD7, rWORD8
531 b L(dP2e)
532 /* Remainder is 24. */
533 .align 4
534 L(dPs3):
535 sld rWORD3, rWORD1, rWORD6
536 sld rWORD4, rWORD2, rWORD6
537 cmpld cr1, rWORD3, rWORD4
538 b L(dP3e)
539 /* Count is a multiple of 32, remainder is 0. */
540 .align 4
541 L(dPs4):
542 mtctr r0
543 sld rWORD1, rWORD1, rWORD6
544 sld rWORD2, rWORD2, rWORD6
545 cmpld cr7, rWORD1, rWORD2
546 b L(dP4e)
547
548 /* At this point we know both strings are double word aligned and the
549 compare length is at least 8 bytes. */
550 .align 4
551 L(DWaligned):
552 andi. r12, rN, 24 /* Get the DW remainder. */
553 srdi r0, rN, 5 /* Divide by 32. */
554 cmpldi cr1, r12, 16
555 cmpldi cr7, rN, 32
556 clrldi rN, rN, 61
557 beq L(dP4)
558 bgt cr1, L(dP3)
559 beq cr1, L(dP2)
560
561 /* Remainder is 8. */
562 .align 4
563 L(dP1):
564 mtctr r0
565 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
566 (8-15 byte compare), we want to use only volatile registers. This
567 means we can avoid restoring non-volatile registers since we did not
568 change any on the early exit path. The key here is the non-early
569 exit path only cares about the condition code (cr5), not about which
570 register pair was used. */
571 LD rWORD5, 0, rSTR1
572 LD rWORD6, 0, rSTR2
573 cmpld cr5, rWORD5, rWORD6
574 blt cr7, L(dP1x)
575 LD rWORD1, rOFF8, rSTR1
576 LD rWORD2, rOFF8, rSTR2
577 cmpld cr7, rWORD1, rWORD2
578 L(dP1e):
579 LD rWORD3, rOFF16, rSTR1
580 LD rWORD4, rOFF16, rSTR2
581 cmpld cr1, rWORD3, rWORD4
582 LD rWORD5, rOFF24, rSTR1
583 LD rWORD6, rOFF24, rSTR2
584 cmpld cr6, rWORD5, rWORD6
585 bne cr5, L(dLcr5x)
586 bne cr7, L(dLcr7x)
587
588 LD rWORD7, rOFF32, rSTR1
589 LD rWORD8, rOFF32, rSTR2
590 addi rSTR1, rSTR1, 32
591 addi rSTR2, rSTR2, 32
592 bne cr1, L(dLcr1)
593 cmpld cr5, rWORD7, rWORD8
594 bdnz L(dLoop)
595 bne cr6, L(dLcr6)
596 ld rWORD8, rWORD8SAVE(r1)
597 ld rWORD7, rWORD7SAVE(r1)
598 .align 3
599 L(dP1x):
600 sldi. r12, rN, 3
601 bne cr5, L(dLcr5x)
602 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
603 bne L(d00)
604 ld rOFF8, rOFF8SAVE(r1)
605 ld rOFF16, rOFF16SAVE(r1)
606 ld rOFF24, rOFF24SAVE(r1)
607 ld rOFF32, rOFF32SAVE(r1)
608 li rRTN, 0
609 blr
610
611 /* Remainder is 16. */
612 .align 4
613 L(dP2):
614 mtctr r0
615 LD rWORD5, 0, rSTR1
616 LD rWORD6, 0, rSTR2
617 cmpld cr6, rWORD5, rWORD6
618 blt cr7, L(dP2x)
619 LD rWORD7, rOFF8, rSTR1
620 LD rWORD8, rOFF8, rSTR2
621 cmpld cr5, rWORD7, rWORD8
622 L(dP2e):
623 LD rWORD1, rOFF16, rSTR1
624 LD rWORD2, rOFF16, rSTR2
625 cmpld cr7, rWORD1, rWORD2
626 LD rWORD3, rOFF24, rSTR1
627 LD rWORD4, rOFF24, rSTR2
628 cmpld cr1, rWORD3, rWORD4
629 addi rSTR1, rSTR1, 8
630 addi rSTR2, rSTR2, 8
631 bne cr6, L(dLcr6)
632 bne cr5, L(dLcr5)
633 b L(dLoop2)
634 .align 4
635 L(dP2x):
636 LD rWORD3, rOFF8, rSTR1
637 LD rWORD4, rOFF8, rSTR2
638 cmpld cr1, rWORD3, rWORD4
639 sldi. r12, rN, 3
640 bne cr6, L(dLcr6x)
641 addi rSTR1, rSTR1, 8
642 addi rSTR2, rSTR2, 8
643 bne cr1, L(dLcr1x)
644 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
645 bne L(d00)
646 ld rOFF8, rOFF8SAVE(r1)
647 ld rOFF16, rOFF16SAVE(r1)
648 ld rOFF24, rOFF24SAVE(r1)
649 ld rOFF32, rOFF32SAVE(r1)
650 li rRTN, 0
651 blr
652
653 /* Remainder is 24. */
654 .align 4
655 L(dP3):
656 mtctr r0
657 LD rWORD3, 0, rSTR1
658 LD rWORD4, 0, rSTR2
659 cmpld cr1, rWORD3, rWORD4
660 L(dP3e):
661 LD rWORD5, rOFF8, rSTR1
662 LD rWORD6, rOFF8, rSTR2
663 cmpld cr6, rWORD5, rWORD6
664 blt cr7, L(dP3x)
665 LD rWORD7, rOFF16, rSTR1
666 LD rWORD8, rOFF16, rSTR2
667 cmpld cr5, rWORD7, rWORD8
668 LD rWORD1, rOFF24, rSTR1
669 LD rWORD2, rOFF24, rSTR2
670 cmpld cr7, rWORD1, rWORD2
671 addi rSTR1, rSTR1, 16
672 addi rSTR2, rSTR2, 16
673 bne cr1, L(dLcr1)
674 bne cr6, L(dLcr6)
675 b L(dLoop1)
676 /* Again we are on an early exit path (24-31 byte compare), so we
677 want to use only volatile registers and avoid restoring non-volatile
678 registers. */
679 .align 4
680 L(dP3x):
681 LD rWORD1, rOFF16, rSTR1
682 LD rWORD2, rOFF16, rSTR2
683 cmpld cr7, rWORD1, rWORD2
684 sldi. r12, rN, 3
685 bne cr1, L(dLcr1x)
686 addi rSTR1, rSTR1, 16
687 addi rSTR2, rSTR2, 16
688 bne cr6, L(dLcr6x)
689 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
690 bne cr7, L(dLcr7x)
691 bne L(d00)
692 ld rOFF8, rOFF8SAVE(r1)
693 ld rOFF16, rOFF16SAVE(r1)
694 ld rOFF24, rOFF24SAVE(r1)
695 ld rOFF32, rOFF32SAVE(r1)
696 li rRTN, 0
697 blr
698
699 /* Count is a multiple of 32, remainder is 0. */
700 .align 4
701 L(dP4):
702 mtctr r0
703 LD rWORD1, 0, rSTR1
704 LD rWORD2, 0, rSTR2
705 cmpld cr7, rWORD1, rWORD2
706 L(dP4e):
707 LD rWORD3, rOFF8, rSTR1
708 LD rWORD4, rOFF8, rSTR2
709 cmpld cr1, rWORD3, rWORD4
710 LD rWORD5, rOFF16, rSTR1
711 LD rWORD6, rOFF16, rSTR2
712 cmpld cr6, rWORD5, rWORD6
713 LD rWORD7, rOFF24, rSTR1
714 LD rWORD8, rOFF24, rSTR2
715 addi rSTR1, rSTR1, 24
716 addi rSTR2, rSTR2, 24
717 cmpld cr5, rWORD7, rWORD8
718 bne cr7, L(dLcr7)
719 bne cr1, L(dLcr1)
720 bdz- L(d24) /* Adjust CTR as we start with +4. */
721 /* This is the primary loop. */
722 .align 4
723 L(dLoop):
724 LD rWORD1, rOFF8, rSTR1
725 LD rWORD2, rOFF8, rSTR2
726 cmpld cr1, rWORD3, rWORD4
727 bne cr6, L(dLcr6)
728 L(dLoop1):
729 LD rWORD3, rOFF16, rSTR1
730 LD rWORD4, rOFF16, rSTR2
731 cmpld cr6, rWORD5, rWORD6
732 bne cr5, L(dLcr5)
733 L(dLoop2):
734 LD rWORD5, rOFF24, rSTR1
735 LD rWORD6, rOFF24, rSTR2
736 cmpld cr5, rWORD7, rWORD8
737 bne cr7, L(dLcr7)
738 L(dLoop3):
739 LD rWORD7, rOFF32, rSTR1
740 LD rWORD8, rOFF32, rSTR2
741 addi rSTR1, rSTR1, 32
742 addi rSTR2, rSTR2, 32
743 bne cr1, L(dLcr1)
744 cmpld cr7, rWORD1, rWORD2
745 bdnz L(dLoop)
746
747 L(dL4):
748 cmpld cr1, rWORD3, rWORD4
749 bne cr6, L(dLcr6)
750 cmpld cr6, rWORD5, rWORD6
751 bne cr5, L(dLcr5)
752 cmpld cr5, rWORD7, rWORD8
753 L(d44):
754 bne cr7, L(dLcr7)
755 L(d34):
756 bne cr1, L(dLcr1)
757 L(d24):
758 bne cr6, L(dLcr6)
759 L(d14):
760 sldi. r12, rN, 3
761 bne cr5, L(dLcr5)
762 L(d04):
763 ld rWORD8, rWORD8SAVE(r1)
764 ld rWORD7, rWORD7SAVE(r1)
765 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
766 beq L(duzeroLength)
767 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
768 we are aligned it is safe to load the whole double word, and use
769 shift right double to eliminate bits beyond the compare length. */
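/* For example (illustrative numbers only): with 3 bytes left, r12 = 24
   and rN becomes 64 - 24 = 40, so the srd below drops the 40 low-order
   bits and only the 3 remaining bytes (the most significant ones in LD
   order) take part in the final compare.  */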
770 L(d00):
771 LD rWORD1, rOFF8, rSTR1
772 LD rWORD2, rOFF8, rSTR2
773 srd rWORD1, rWORD1, rN
774 srd rWORD2, rWORD2, rN
775 cmpld cr7, rWORD1, rWORD2
776 bne cr7, L(dLcr7x)
777 ld rOFF8, rOFF8SAVE(r1)
778 ld rOFF16, rOFF16SAVE(r1)
779 ld rOFF24, rOFF24SAVE(r1)
780 ld rOFF32, rOFF32SAVE(r1)
781 li rRTN, 0
782 blr
783
784 .align 4
785 L(dLcr7):
786 ld rWORD8, rWORD8SAVE(r1)
787 ld rWORD7, rWORD7SAVE(r1)
788 L(dLcr7x):
789 ld rOFF8, rOFF8SAVE(r1)
790 ld rOFF16, rOFF16SAVE(r1)
791 ld rOFF24, rOFF24SAVE(r1)
792 ld rOFF32, rOFF32SAVE(r1)
793 li rRTN, 1
794 bgtlr cr7
795 li rRTN, -1
796 blr
797 .align 4
798 L(dLcr1):
799 ld rWORD8, rWORD8SAVE(r1)
800 ld rWORD7, rWORD7SAVE(r1)
801 L(dLcr1x):
802 ld rOFF8, rOFF8SAVE(r1)
803 ld rOFF16, rOFF16SAVE(r1)
804 ld rOFF24, rOFF24SAVE(r1)
805 ld rOFF32, rOFF32SAVE(r1)
806 li rRTN, 1
807 bgtlr cr1
808 li rRTN, -1
809 blr
810 .align 4
811 L(dLcr6):
812 ld rWORD8, rWORD8SAVE(r1)
813 ld rWORD7, rWORD7SAVE(r1)
814 L(dLcr6x):
815 ld rOFF8, rOFF8SAVE(r1)
816 ld rOFF16, rOFF16SAVE(r1)
817 ld rOFF24, rOFF24SAVE(r1)
818 ld rOFF32, rOFF32SAVE(r1)
819 li rRTN, 1
820 bgtlr cr6
821 li rRTN, -1
822 blr
823 .align 4
824 L(dLcr5):
825 ld rWORD8, rWORD8SAVE(r1)
826 ld rWORD7, rWORD7SAVE(r1)
827 L(dLcr5x):
828 ld rOFF8, rOFF8SAVE(r1)
829 ld rOFF16, rOFF16SAVE(r1)
830 ld rOFF24, rOFF24SAVE(r1)
831 ld rOFF32, rOFF32SAVE(r1)
832 li rRTN, 1
833 bgtlr cr5
834 li rRTN, -1
835 blr
836
837 .align 4
838 L(bytealigned):
839 mtctr rN
840
841 /* We need to prime this loop. This loop is swing modulo scheduled
842 to avoid pipe delays. The dependent instruction latencies (load to
843 compare to conditional branch) are 2 to 3 cycles. In this loop each
844 dispatch group ends in a branch and takes 1 cycle. Effectively
845 the first iteration of the loop only serves to load operands, and
846 branches based on those compares are delayed until the next iteration.
847
848 So we must precondition some registers and condition codes so that
849 we don't exit the loop early on the first iteration. */
850
851 lbz rWORD1, 0(rSTR1)
852 lbz rWORD2, 0(rSTR2)
853 bdz L(b11)
854 cmpld cr7, rWORD1, rWORD2
855 lbz rWORD3, 1(rSTR1)
856 lbz rWORD4, 1(rSTR2)
857 bdz L(b12)
858 cmpld cr1, rWORD3, rWORD4
859 lbzu rWORD5, 2(rSTR1)
860 lbzu rWORD6, 2(rSTR2)
861 bdz L(b13)
862 .align 4
863 L(bLoop):
864 lbzu rWORD1, 1(rSTR1)
865 lbzu rWORD2, 1(rSTR2)
866 bne cr7, L(bLcr7)
867
868 cmpld cr6, rWORD5, rWORD6
869 bdz L(b3i)
870
871 lbzu rWORD3, 1(rSTR1)
872 lbzu rWORD4, 1(rSTR2)
873 bne cr1, L(bLcr1)
874
875 cmpld cr7, rWORD1, rWORD2
876 bdz L(b2i)
877
878 lbzu rWORD5, 1(rSTR1)
879 lbzu rWORD6, 1(rSTR2)
880 bne cr6, L(bLcr6)
881
882 cmpld cr1, rWORD3, rWORD4
883 bdnz L(bLoop)
884
885 /* We speculatively load bytes before we have tested the previous
886 bytes. But we must avoid overrunning the length (in the CTR) to
887 prevent these speculative loads from causing a segfault. In that
888 case the loop will exit early (before all pending bytes have
889 been tested), and we must complete the pending compares
890 before returning. */
891 L(b1i):
892 bne cr7, L(bLcr7)
893 bne cr1, L(bLcr1)
894 b L(bx56)
895 .align 4
896 L(b2i):
897 bne cr6, L(bLcr6)
898 bne cr7, L(bLcr7)
899 b L(bx34)
900 .align 4
901 L(b3i):
902 bne cr1, L(bLcr1)
903 bne cr6, L(bLcr6)
904 b L(bx12)
905 .align 4
906 L(bLcr7):
907 li rRTN, 1
908 bgtlr cr7
909 li rRTN, -1
910 blr
911 L(bLcr1):
912 li rRTN, 1
913 bgtlr cr1
914 li rRTN, -1
915 blr
916 L(bLcr6):
917 li rRTN, 1
918 bgtlr cr6
919 li rRTN, -1
920 blr
921
922 L(b13):
923 bne cr7, L(bx12)
924 bne cr1, L(bx34)
925 L(bx56):
926 sub rRTN, rWORD5, rWORD6
927 blr
928 nop
929 L(b12):
930 bne cr7, L(bx12)
931 L(bx34):
932 sub rRTN, rWORD3, rWORD4
933 blr
934 L(b11):
935 L(bx12):
936 sub rRTN, rWORD1, rWORD2
937 blr
938
939 .align 4
940 L(zeroLength):
941 li rRTN, 0
942 blr
943
944 .align 4
945 /* At this point we know the strings have different alignment and the
946 compare length is at least 8 bytes. r12 contains the low order
947 3 bits of rSTR1 and cr5 contains the result of the logical compare
948 of r12 to 0. If r12 == 0 then rSTR1 is double word
949 aligned and we can perform the DWunaligned loop.
950
951 Otherwise we know that rSTR1 is not yet DW aligned.
952 So we can force the string addresses to the next lower DW
953 boundary and special case this first DW using shift left to
954 eliminate bits preceding the first byte. Since we want to join the
955 normal (DWaligned) compare loop, starting at the second double word,
956 we need to adjust the length (rN) and special case the loop
957 versioning for the first DW. This ensures that the loop count is
958 correct and the first DW (shifted) is in the expected register pair. */
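/* As an illustration (taking the simpler L(DWunaligned) case where rSTR1
   is already DW aligned): if rSTR2 starts 3 bytes into its double word,
   rSHL = 24 and rSHR = 40, and each compare word for rSTR2 is rebuilt as
   (previous DW << rSHL) | (next DW >> rSHR), i.e. the last 5 bytes of one
   source double word followed by the first 3 bytes of the next.  */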
959 L(unaligned):
960 std rWORD8, rWORD8SAVE(r1)
961 std rWORD7, rWORD7SAVE(r1)
962 std rOFF8, rOFF8SAVE(r1)
963 std rOFF16, rOFF16SAVE(r1)
964 std rOFF24, rOFF24SAVE(r1)
965 std rOFF32, rOFF32SAVE(r1)
966 cfi_offset(rWORD8, rWORD8SAVE)
967 cfi_offset(rWORD7, rWORD7SAVE)
968 cfi_offset(rOFF8, rOFF8SAVE)
969 cfi_offset(rOFF16, rOFF16SAVE)
970 cfi_offset(rOFF24, rOFF24SAVE)
971 cfi_offset(rOFF32, rOFF32SAVE)
972 li rOFF8,8
973 li rOFF16,16
974 li rOFF24,24
975 li rOFF32,32
976 std rSHL, rSHLSAVE(r1)
977 cfi_offset(rSHL, rSHLSAVE)
978 clrldi rSHL, rSTR2, 61
979 beq cr6, L(duzeroLength)
980 std rSHR, rSHRSAVE(r1)
981 cfi_offset(rSHR, rSHRSAVE)
982 beq cr5, L(DWunaligned)
983 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
984 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
985 /* Adjust the logical start of rSTR2 to compensate for the extra bits
986 in the 1st rSTR1 DW. */
987 sub rWORD8_SHIFT, rSTR2, r12
988 /* But do not attempt to address the DW before the one that contains
989 the actual start of rSTR2. */
990 clrrdi rSTR2, rSTR2, 3
991 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
992 /* Compute the left/right shift counts for the unaligned rSTR2,
993 compensating for the logical (DW aligned) start of rSTR1. */
994 clrldi rSHL, rWORD8_SHIFT, 61
995 clrrdi rSTR1, rSTR1, 3
996 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
997 sldi rSHL, rSHL, 3
998 cmpld cr5, rWORD8_SHIFT, rSTR2
999 add rN, rN, r12
1000 sldi rWORD6, r12, 3
1001 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1002 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1003 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1004 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1005 subfic rSHR, rSHL, 64
1006 srdi r0, rN, 5 /* Divide by 32. */
1007 andi. r12, rN, 24 /* Get the DW remainder. */
1008 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
1009 this special case those bits may be discarded anyway. Also we
1010 must avoid loading a DW where none of the bits are part of rSTR2 as
1011 this may cross a page boundary and cause a page fault. */
1012 li rWORD8, 0
1013 blt cr5, L(dus0)
1014 LD rWORD8, 0, rSTR2
1015 addi rSTR2, rSTR2, 8
1016 sld rWORD8, rWORD8, rSHL
1017
1018 L(dus0):
1019 LD rWORD1, 0, rSTR1
1020 LD rWORD2, 0, rSTR2
1021 cmpldi cr1, r12, 16
1022 cmpldi cr7, rN, 32
1023 srd r12, rWORD2, rSHR
1024 clrldi rN, rN, 61
1025 beq L(duPs4)
1026 mtctr r0
1027 or rWORD8, r12, rWORD8
1028 bgt cr1, L(duPs3)
1029 beq cr1, L(duPs2)
1030
1031 /* Remainder is 8. */
1032 .align 4
1033 L(dusP1):
1034 sld rWORD8_SHIFT, rWORD2, rSHL
1035 sld rWORD7, rWORD1, rWORD6
1036 sld rWORD8, rWORD8, rWORD6
1037 bge cr7, L(duP1e)
1038 /* At this point we exit early with the first double word compare
1039 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1040 how we handle the remaining bytes. */
1041 cmpld cr5, rWORD7, rWORD8
1042 sldi. rN, rN, 3
1043 bne cr5, L(duLcr5)
1044 cmpld cr7, rN, rSHR
1045 beq L(duZeroReturn)
1046 li r0, 0
1047 ble cr7, L(dutrim)
1048 LD rWORD2, rOFF8, rSTR2
1049 srd r0, rWORD2, rSHR
1050 b L(dutrim)
1051 /* Remainder is 16. */
1052 .align 4
1053 L(duPs2):
1054 sld rWORD6_SHIFT, rWORD2, rSHL
1055 sld rWORD5, rWORD1, rWORD6
1056 sld rWORD6, rWORD8, rWORD6
1057 b L(duP2e)
1058 /* Remainder is 24. */
1059 .align 4
1060 L(duPs3):
1061 sld rWORD4_SHIFT, rWORD2, rSHL
1062 sld rWORD3, rWORD1, rWORD6
1063 sld rWORD4, rWORD8, rWORD6
1064 b L(duP3e)
1065 /* Count is a multiple of 32, remainder is 0. */
1066 .align 4
1067 L(duPs4):
1068 mtctr r0
1069 or rWORD8, r12, rWORD8
1070 sld rWORD2_SHIFT, rWORD2, rSHL
1071 sld rWORD1, rWORD1, rWORD6
1072 sld rWORD2, rWORD8, rWORD6
1073 b L(duP4e)
1074
1075 /* At this point we know rSTR1 is double word aligned and the
1076 compare length is at least 8 bytes. */
1077 .align 4
1078 L(DWunaligned):
1079 std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1080 clrrdi rSTR2, rSTR2, 3
1081 std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1082 srdi r0, rN, 5 /* Divide by 32. */
1083 std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1084 andi. r12, rN, 24 /* Get the DW remainder. */
1085 std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1086 cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
1087 cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
1088 cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
1089 cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
1090 sldi rSHL, rSHL, 3
1091 LD rWORD6, 0, rSTR2
1092 LD rWORD8, rOFF8, rSTR2
1093 addi rSTR2, rSTR2, 8
1094 cmpldi cr1, r12, 16
1095 cmpldi cr7, rN, 32
1096 clrldi rN, rN, 61
1097 subfic rSHR, rSHL, 64
1098 sld rWORD6_SHIFT, rWORD6, rSHL
1099 beq L(duP4)
1100 mtctr r0
1101 bgt cr1, L(duP3)
1102 beq cr1, L(duP2)
1103
1104 /* Remainder is 8. */
1105 .align 4
1106 L(duP1):
1107 srd r12, rWORD8, rSHR
1108 LD rWORD7, 0, rSTR1
1109 sld rWORD8_SHIFT, rWORD8, rSHL
1110 or rWORD8, r12, rWORD6_SHIFT
1111 blt cr7, L(duP1x)
1112 L(duP1e):
1113 LD rWORD1, rOFF8, rSTR1
1114 LD rWORD2, rOFF8, rSTR2
1115 cmpld cr5, rWORD7, rWORD8
1116 srd r0, rWORD2, rSHR
1117 sld rWORD2_SHIFT, rWORD2, rSHL
1118 or rWORD2, r0, rWORD8_SHIFT
1119 LD rWORD3, rOFF16, rSTR1
1120 LD rWORD4, rOFF16, rSTR2
1121 cmpld cr7, rWORD1, rWORD2
1122 srd r12, rWORD4, rSHR
1123 sld rWORD4_SHIFT, rWORD4, rSHL
1124 bne cr5, L(duLcr5)
1125 or rWORD4, r12, rWORD2_SHIFT
1126 LD rWORD5, rOFF24, rSTR1
1127 LD rWORD6, rOFF24, rSTR2
1128 cmpld cr1, rWORD3, rWORD4
1129 srd r0, rWORD6, rSHR
1130 sld rWORD6_SHIFT, rWORD6, rSHL
1131 bne cr7, L(duLcr7)
1132 or rWORD6, r0, rWORD4_SHIFT
1133 cmpld cr6, rWORD5, rWORD6
1134 b L(duLoop3)
1135 .align 4
1136 /* At this point we exit early with the first double word compare
1137 complete and remainder of 0 to 7 bytes. See L(du14) for details on
1138 how we handle the remaining bytes. */
1139 L(duP1x):
1140 cmpld cr5, rWORD7, rWORD8
1141 sldi. rN, rN, 3
1142 bne cr5, L(duLcr5)
1143 cmpld cr7, rN, rSHR
1144 beq L(duZeroReturn)
1145 li r0, 0
1146 ble cr7, L(dutrim)
1147 LD rWORD2, rOFF8, rSTR2
1148 srd r0, rWORD2, rSHR
1149 b L(dutrim)
1150 /* Remainder is 16. */
1151 .align 4
1152 L(duP2):
1153 srd r0, rWORD8, rSHR
1154 LD rWORD5, 0, rSTR1
1155 or rWORD6, r0, rWORD6_SHIFT
1156 sld rWORD6_SHIFT, rWORD8, rSHL
1157 L(duP2e):
1158 LD rWORD7, rOFF8, rSTR1
1159 LD rWORD8, rOFF8, rSTR2
1160 cmpld cr6, rWORD5, rWORD6
1161 srd r12, rWORD8, rSHR
1162 sld rWORD8_SHIFT, rWORD8, rSHL
1163 or rWORD8, r12, rWORD6_SHIFT
1164 blt cr7, L(duP2x)
1165 LD rWORD1, rOFF16, rSTR1
1166 LD rWORD2, rOFF16, rSTR2
1167 cmpld cr5, rWORD7, rWORD8
1168 bne cr6, L(duLcr6)
1169 srd r0, rWORD2, rSHR
1170 sld rWORD2_SHIFT, rWORD2, rSHL
1171 or rWORD2, r0, rWORD8_SHIFT
1172 LD rWORD3, rOFF24, rSTR1
1173 LD rWORD4, rOFF24, rSTR2
1174 cmpld cr7, rWORD1, rWORD2
1175 bne cr5, L(duLcr5)
1176 srd r12, rWORD4, rSHR
1177 sld rWORD4_SHIFT, rWORD4, rSHL
1178 or rWORD4, r12, rWORD2_SHIFT
1179 addi rSTR1, rSTR1, 8
1180 addi rSTR2, rSTR2, 8
1181 cmpld cr1, rWORD3, rWORD4
1182 b L(duLoop2)
1183 .align 4
1184 L(duP2x):
1185 cmpld cr5, rWORD7, rWORD8
1186 addi rSTR1, rSTR1, 8
1187 addi rSTR2, rSTR2, 8
1188 bne cr6, L(duLcr6)
1189 sldi. rN, rN, 3
1190 bne cr5, L(duLcr5)
1191 cmpld cr7, rN, rSHR
1192 beq L(duZeroReturn)
1193 li r0, 0
1194 ble cr7, L(dutrim)
1195 LD rWORD2, rOFF8, rSTR2
1196 srd r0, rWORD2, rSHR
1197 b L(dutrim)
1198
1199 /* Remainder is 24. */
1200 .align 4
1201 L(duP3):
1202 srd r12, rWORD8, rSHR
1203 LD rWORD3, 0, rSTR1
1204 sld rWORD4_SHIFT, rWORD8, rSHL
1205 or rWORD4, r12, rWORD6_SHIFT
1206 L(duP3e):
1207 LD rWORD5, rOFF8, rSTR1
1208 LD rWORD6, rOFF8, rSTR2
1209 cmpld cr1, rWORD3, rWORD4
1210 srd r0, rWORD6, rSHR
1211 sld rWORD6_SHIFT, rWORD6, rSHL
1212 or rWORD6, r0, rWORD4_SHIFT
1213 LD rWORD7, rOFF16, rSTR1
1214 LD rWORD8, rOFF16, rSTR2
1215 cmpld cr6, rWORD5, rWORD6
1216 bne cr1, L(duLcr1)
1217 srd r12, rWORD8, rSHR
1218 sld rWORD8_SHIFT, rWORD8, rSHL
1219 or rWORD8, r12, rWORD6_SHIFT
1220 blt cr7, L(duP3x)
1221 LD rWORD1, rOFF24, rSTR1
1222 LD rWORD2, rOFF24, rSTR2
1223 cmpld cr5, rWORD7, rWORD8
1224 bne cr6, L(duLcr6)
1225 srd r0, rWORD2, rSHR
1226 sld rWORD2_SHIFT, rWORD2, rSHL
1227 or rWORD2, r0, rWORD8_SHIFT
1228 addi rSTR1, rSTR1, 16
1229 addi rSTR2, rSTR2, 16
1230 cmpld cr7, rWORD1, rWORD2
1231 b L(duLoop1)
1232 .align 4
1233 L(duP3x):
1234 addi rSTR1, rSTR1, 16
1235 addi rSTR2, rSTR2, 16
1236 cmpld cr5, rWORD7, rWORD8
1237 bne cr6, L(duLcr6)
1238 sldi. rN, rN, 3
1239 bne cr5, L(duLcr5)
1240 cmpld cr7, rN, rSHR
1241 beq L(duZeroReturn)
1242 li r0, 0
1243 ble cr7, L(dutrim)
1244 LD rWORD2, rOFF8, rSTR2
1245 srd r0, rWORD2, rSHR
1246 b L(dutrim)
1247
1248 /* Count is a multiple of 32, remainder is 0. */
1249 .align 4
1250 L(duP4):
1251 mtctr r0
1252 srd r0, rWORD8, rSHR
1253 LD rWORD1, 0, rSTR1
1254 sld rWORD2_SHIFT, rWORD8, rSHL
1255 or rWORD2, r0, rWORD6_SHIFT
1256 L(duP4e):
1257 LD rWORD3, rOFF8, rSTR1
1258 LD rWORD4, rOFF8, rSTR2
1259 cmpld cr7, rWORD1, rWORD2
1260 srd r12, rWORD4, rSHR
1261 sld rWORD4_SHIFT, rWORD4, rSHL
1262 or rWORD4, r12, rWORD2_SHIFT
1263 LD rWORD5, rOFF16, rSTR1
1264 LD rWORD6, rOFF16, rSTR2
1265 cmpld cr1, rWORD3, rWORD4
1266 bne cr7, L(duLcr7)
1267 srd r0, rWORD6, rSHR
1268 sld rWORD6_SHIFT, rWORD6, rSHL
1269 or rWORD6, r0, rWORD4_SHIFT
1270 LD rWORD7, rOFF24, rSTR1
1271 LD rWORD8, rOFF24, rSTR2
1272 addi rSTR1, rSTR1, 24
1273 addi rSTR2, rSTR2, 24
1274 cmpld cr6, rWORD5, rWORD6
1275 bne cr1, L(duLcr1)
1276 srd r12, rWORD8, rSHR
1277 sld rWORD8_SHIFT, rWORD8, rSHL
1278 or rWORD8, r12, rWORD6_SHIFT
1279 cmpld cr5, rWORD7, rWORD8
1280 bdz L(du24) /* Adjust CTR as we start with +4. */
1281 /* This is the primary loop. */
1282 .align 4
1283 L(duLoop):
1284 LD rWORD1, rOFF8, rSTR1
1285 LD rWORD2, rOFF8, rSTR2
1286 cmpld cr1, rWORD3, rWORD4
1287 bne cr6, L(duLcr6)
1288 srd r0, rWORD2, rSHR
1289 sld rWORD2_SHIFT, rWORD2, rSHL
1290 or rWORD2, r0, rWORD8_SHIFT
1291 L(duLoop1):
1292 LD rWORD3, rOFF16, rSTR1
1293 LD rWORD4, rOFF16, rSTR2
1294 cmpld cr6, rWORD5, rWORD6
1295 bne cr5, L(duLcr5)
1296 srd r12, rWORD4, rSHR
1297 sld rWORD4_SHIFT, rWORD4, rSHL
1298 or rWORD4, r12, rWORD2_SHIFT
1299 L(duLoop2):
1300 LD rWORD5, rOFF24, rSTR1
1301 LD rWORD6, rOFF24, rSTR2
1302 cmpld cr5, rWORD7, rWORD8
1303 bne cr7, L(duLcr7)
1304 srd r0, rWORD6, rSHR
1305 sld rWORD6_SHIFT, rWORD6, rSHL
1306 or rWORD6, r0, rWORD4_SHIFT
1307 L(duLoop3):
1308 LD rWORD7, rOFF32, rSTR1
1309 LD rWORD8, rOFF32, rSTR2
1310 addi rSTR1, rSTR1, 32
1311 addi rSTR2, rSTR2, 32
1312 cmpld cr7, rWORD1, rWORD2
1313 bne cr1, L(duLcr1)
1314 srd r12, rWORD8, rSHR
1315 sld rWORD8_SHIFT, rWORD8, rSHL
1316 or rWORD8, r12, rWORD6_SHIFT
1317 bdnz L(duLoop)
1318
1319 L(duL4):
1320 cmpld cr1, rWORD3, rWORD4
1321 bne cr6, L(duLcr6)
1322 cmpld cr6, rWORD5, rWORD6
1323 bne cr5, L(duLcr5)
1324 cmpld cr5, rWORD7, rWORD8
1325 L(du44):
1326 bne cr7, L(duLcr7)
1327 L(du34):
1328 bne cr1, L(duLcr1)
1329 L(du24):
1330 bne cr6, L(duLcr6)
1331 L(du14):
1332 sldi. rN, rN, 3
1333 bne cr5, L(duLcr5)
1334 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1335 shift right double to eliminate bits beyond the compare length.
1336
1337 However it may not be safe to load rWORD2 which may be beyond the
1338 string length. So we compare the bit length of the remainder to
1339 the right shift count (rSHR). If the bit count is less than or equal
1340 we do not need to load rWORD2 (all significant bits are already in
1341 rWORD8_SHIFT). */
1342 cmpld cr7, rN, rSHR
1343 beq L(duZeroReturn)
1344 li r0, 0
1345 ble cr7, L(dutrim)
1346 LD rWORD2, rOFF8, rSTR2
1347 srd r0, rWORD2, rSHR
1348 .align 4
1349 L(dutrim):
1350 LD rWORD1, rOFF8, rSTR1
1351 ld rWORD8, rWORD8SAVE(r1)
1352 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1353 or rWORD2, r0, rWORD8_SHIFT
1354 ld rWORD7, rWORD7SAVE(r1)
1355 ld rSHL, rSHLSAVE(r1)
1356 srd rWORD1, rWORD1, rN
1357 srd rWORD2, rWORD2, rN
1358 ld rSHR, rSHRSAVE(r1)
1359 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1360 li rRTN, 0
1361 cmpld cr7, rWORD1, rWORD2
1362 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1363 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1364 beq cr7, L(dureturn24)
1365 li rRTN, 1
1366 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1367 ld rOFF8, rOFF8SAVE(r1)
1368 ld rOFF16, rOFF16SAVE(r1)
1369 ld rOFF24, rOFF24SAVE(r1)
1370 ld rOFF32, rOFF32SAVE(r1)
1371 bgtlr cr7
1372 li rRTN, -1
1373 blr
1374 .align 4
1375 L(duLcr7):
1376 ld rWORD8, rWORD8SAVE(r1)
1377 ld rWORD7, rWORD7SAVE(r1)
1378 li rRTN, 1
1379 bgt cr7, L(dureturn29)
1380 ld rSHL, rSHLSAVE(r1)
1381 ld rSHR, rSHRSAVE(r1)
1382 li rRTN, -1
1383 b L(dureturn27)
1384 .align 4
1385 L(duLcr1):
1386 ld rWORD8, rWORD8SAVE(r1)
1387 ld rWORD7, rWORD7SAVE(r1)
1388 li rRTN, 1
1389 bgt cr1, L(dureturn29)
1390 ld rSHL, rSHLSAVE(r1)
1391 ld rSHR, rSHRSAVE(r1)
1392 li rRTN, -1
1393 b L(dureturn27)
1394 .align 4
1395 L(duLcr6):
1396 ld rWORD8, rWORD8SAVE(r1)
1397 ld rWORD7, rWORD7SAVE(r1)
1398 li rRTN, 1
1399 bgt cr6, L(dureturn29)
1400 ld rSHL, rSHLSAVE(r1)
1401 ld rSHR, rSHRSAVE(r1)
1402 li rRTN, -1
1403 b L(dureturn27)
1404 .align 4
1405 L(duLcr5):
1406 ld rWORD8, rWORD8SAVE(r1)
1407 ld rWORD7, rWORD7SAVE(r1)
1408 li rRTN, 1
1409 bgt cr5, L(dureturn29)
1410 ld rSHL, rSHLSAVE(r1)
1411 ld rSHR, rSHRSAVE(r1)
1412 li rRTN, -1
1413 b L(dureturn27)
1414
1415 .align 3
1416 L(duZeroReturn):
1417 li rRTN, 0
1418 .align 4
1419 L(dureturn):
1420 ld rWORD8, rWORD8SAVE(r1)
1421 ld rWORD7, rWORD7SAVE(r1)
1422 L(dureturn29):
1423 ld rSHL, rSHLSAVE(r1)
1424 ld rSHR, rSHRSAVE(r1)
1425 L(dureturn27):
1426 ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
1427 ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
1428 ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
1429 L(dureturn24):
1430 ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
1431 ld rOFF8, rOFF8SAVE(r1)
1432 ld rOFF16, rOFF16SAVE(r1)
1433 ld rOFF24, rOFF24SAVE(r1)
1434 ld rOFF32, rOFF32SAVE(r1)
1435 blr
1436
1437 L(duzeroLength):
1438 ld rOFF8, rOFF8SAVE(r1)
1439 ld rOFF16, rOFF16SAVE(r1)
1440 ld rOFF24, rOFF24SAVE(r1)
1441 ld rOFF32, rOFF32SAVE(r1)
1442 li rRTN, 0
1443 blr
1444
1445 END (MEMCMP)
1446 libc_hidden_builtin_def (memcmp)
1447 weak_alias (memcmp, bcmp)