sysdeps/powerpc/powerpc64/power4/memcmp.S
/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */

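/* For reference, the contract this routine implements is plain memcmp:
   compare size bytes as unsigned char and return negative, zero, or
   positive.  A minimal C sketch of those semantics (illustrative only;
   the name memcmp_ref is hypothetical and not part of glibc):

     #include <stddef.h>

     int
     memcmp_ref (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       for (; n != 0; n--, p1++, p2++)
         if (*p1 != *p2)
           return *p1 < *p2 ? -1 : 1;
       return 0;
     }

   The assembly below computes the same result a doubleword at a time
   rather than byte by byte.  */
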
	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	rTMP, rTMP, 61
	clrldi	rBITDIF, rSTR1, 61
	cmpldi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8,-8(r1)
	cfi_offset(rWORD8,-8)
	std	rWORD7,-16(r1)
	cfi_offset(rWORD7,-16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
   aligned and can perform the DWaligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
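/* As an illustration of the shift-left trick just described, a rough C
   equivalent for the first DW, assuming a big-endian machine (the
   POWER4 target here) and a shared misalignment of 1 to 7 bytes; the
   function name is hypothetical:

     #include <stdint.h>

     static int
     first_dw_cmp (uintptr_t s1, uintptr_t s2, unsigned int offset)
     {
       const uint64_t *p1 = (const uint64_t *) (s1 & ~(uintptr_t) 7);
       const uint64_t *p2 = (const uint64_t *) (s2 & ~(uintptr_t) 7);
       uint64_t w1 = *p1 << (offset * 8);
       uint64_t w2 = *p2 << (offset * 8);
       return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
     }

   The left shift discards exactly the bytes that precede the true
   start of the strings; on big-endian the earliest byte is the most
   significant, so an unsigned doubleword compare yields memcmp
   ordering directly (a nonzero result here is final, equality means
   the compare continues).  */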
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD2, r11
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD2, r11
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	srdi	rTMP, rN, 5	/* Divide by 32 */
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP1e):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)

	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr5, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne-	cr1, L(dLcr1)
	cmpld	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
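/* In C terms this tail step looks roughly as follows, again assuming
   big-endian; rem is the 1-7 remaining bytes and both pointers are
   already doubleword aligned, so the full loads cannot touch an
   unmapped page.  The function name is hypothetical:

     #include <stdint.h>

     static int
     tail_dw_cmp (const uint64_t *p1, const uint64_t *p2,
                  unsigned int rem)
     {
       unsigned int sh = 64 - rem * 8;
       uint64_t w1 = *p1 >> sh;
       uint64_t w2 = *p2 >> sh;
       return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
     }

   On big-endian the bytes beyond the compare length occupy the low
   order bits, so the right shift removes them.  */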
L(d00):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr5, rWORD1, rWORD2
	bne	cr5, L(dLcr5x)
	li	rRTN, 0
	blr
	.align	4
L(dLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
	beq-	cr6, L(zeroLength)

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
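/* A C model of this schedule (hypothetical name; assumes n >= 1 since
   the zero-length case was handled above): each iteration loads the
   next pair of bytes and branches on a previously loaded pair, so no
   load feeds a compare-and-branch within the same dispatch group.
   The three-register rotation (rWORD1/3/5) is collapsed into a single
   stage here for clarity:

     #include <stddef.h>

     static int
     byte_cmp_pipelined (const unsigned char *p1,
                         const unsigned char *p2, size_t n)
     {
       unsigned char a = *p1, b = *p2;
       for (size_t i = 1; i < n; i++)
         {
           unsigned char na = p1[i], nb = p2[i];
           if (a != b)
             return a < b ? -1 : 1;
           a = na;
           b = nb;
         }
       return a == b ? 0 : (a < b ? -1 : 1);
     }

   The loads of the next pair are issued before the branch on the
   previous pair resolves, which is why the first pair must be
   preconditioned as described above.  */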

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr0, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all pending bytes are tested),
   and we must complete the pending operations before returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLengthReturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and
   special case this first DW using shift left to eliminate bits
   preceding the first byte.  Since we want to join the normal
   (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
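/* The reassembly done by the rotation temporaries defined below can be
   written in C as follows (hypothetical name; big-endian; assumes
   0 < shl < 64 so neither shift is an undefined shift by 64).  Each
   compare doubleword of rSTR2 is merged from the aligned doubleword
   pair that straddles it:

     #include <stdint.h>

     static uint64_t
     merge_dw (uint64_t first, uint64_t second, unsigned int shl)
     {
       unsigned int shr = 64 - shl;
       return (first << shl) | (second >> shr);
     }

   In the loops below the "first << shl" piece is saved one iteration
   ahead in rB/rD/rF/rH and OR-ed with the "second >> shr" piece
   (rA/rC/rE/rG) once the next doubleword has been loaded.  */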
#define rSHL	r29	/* Unaligned shift left count.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rB	r27	/* Left rotation temp for rWORD2.  */
#define rD	r26	/* Left rotation temp for rWORD4.  */
#define rF	r25	/* Left rotation temp for rWORD6.  */
#define rH	r24	/* Left rotation temp for rWORD8.  */
#define rA	r0	/* Right rotation temp for rWORD2.  */
#define rC	r12	/* Right rotation temp for rWORD4.  */
#define rE	r0	/* Right rotation temp for rWORD6.  */
#define rG	r12	/* Right rotation temp for rWORD8.  */
L(unaligned):
	std	r29,-24(r1)
	cfi_offset(r29,-24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	r28,-32(r1)
	cfi_offset(r28,-32)
	beq	cr5, L(DWunaligned)
	std	r27,-40(r1)
	cfi_offset(r27,-40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	r27, rSTR2, rBITDIF
/* But do not attempt to address the DW before the DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, r27, 61
	clrrdi	rSTR1, rSTR1, 3
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, r27, rSTR2
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	subfic	rSHR, rSHL, 64
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	ld	rWORD8, 0(rSTR2)
	la	rSTR2, 8(rSTR2)
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	srd	rG, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rB, rWORD2, rSHL
	sld	rWORD7, rWORD1, r11
	sld	rWORD8, rWORD8, r11
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rH, rWORD2, rSHL
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD8, r11
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rF, rWORD2, rSHL
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD8, r11
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	sld	rD, rWORD2, rSHL
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD8, r11
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	r27,-40(r1)
	cfi_offset(r27,-40)
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
	srdi	rTMP, rN, 5	/* Divide by 32 */
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	sldi	rSHL, rSHL, 3
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rH, rWORD6, rSHL
	beq	L(duP4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	rG, rWORD8, rSHR
	ld	rWORD7, 0(rSTR1)
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP1x)
L(duP1e):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, rC, rD
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	bne	cr0, L(duLcr0)
	or	rWORD6, rE, rF
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	rE, rWORD8, rSHR
	ld	rWORD5, 0(rSTR1)
	or	rWORD6, rE, rH
	sld	rH, rWORD8, rSHL
L(duP2e):
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP2x)
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	rC, rWORD8, rSHR
	ld	rWORD3, 0(rSTR1)
	sld	rF, rWORD8, rSHL
	or	rWORD4, rC, rH
L(duP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP3x)
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr0, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(duLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	srd	rA, rWORD8, rSHR
	ld	rWORD1, 0(rSTR1)
	sld	rD, rWORD8, rSHL
	or	rWORD2, rA, rH
L(duP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
L(duLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
L(duLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
L(duLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	bdnz+	L(duLoop)

L(duL4):
	bne	cr1, L(duLcr1)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr0, L(duLcr0)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.
   This allows the use of double word subtract to compute the final
   result.

   However it may not be safe to load rWORD2, which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or
   equal to the shift count we do not need to load rWORD2 (all
   significant bits are already in rB).  */
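/* A C sketch of this guard (hypothetical name; rem_bits is the
   remainder in bits, b_part the saved "first << shl" piece, shr as
   above):

     #include <stdint.h>

     static uint64_t
     tail_second_part (const uint64_t *next_dw, uint64_t b_part,
                       unsigned int rem_bits, unsigned int shr)
     {
       uint64_t a_part = 0;
       if (rem_bits > shr)
         a_part = *next_dw >> shr;
       return a_part | b_part;
     }

   When rem_bits <= shr every byte still to be compared is already in
   b_part, so the potentially faulting load of the next doubleword of
   rSTR2 is skipped.  */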
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	.align	4
L(dutrim):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD8,-8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, rA, rB
	ld	rWORD7,-16(r1)
	ld	r29,-24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	li	rRTN, 0
	cmpld	cr0, rWORD1, rWORD2
	ld	r26,-48(r1)
	ld	r25,-56(r1)
	beq	cr0, L(dureturn24)
	li	rRTN, 1
	ld	r24,-64(r1)
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align	4
L(duLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr0, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dureturn29):
	ld	r29,-24(r1)
	ld	r28,-32(r1)
L(dureturn27):
	ld	r27,-40(r1)
L(dureturn26):
	ld	r26,-48(r1)
L(dureturn25):
	ld	r25,-56(r1)
L(dureturn24):
	ld	r24,-64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (memcmp)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)