/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
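/* For reference, a portable C sketch of the contract implemented below
   (illustrative only, not assembled; assumes <stddef.h>).  Any return
   value with the correct sign satisfies memcmp: the word-compare paths
   below return -1/0/1 while the byte loop returns a raw byte difference.

     int memcmp (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       while (n-- != 0)
	 {
	   if (*p1 != *p2)
	     return *p1 - *p2;
	   p1++;
	   p2++;
	 }
       return 0;
     }
*/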

EALIGN (BP_SYM(memcmp), 4, 0)
	CALL_MCOUNT

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	rTMP, rTMP, 30
	clrlwi	rBITDIF, rSTR1, 30
	cmplwi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 12 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1,-64(1)
	cfi_adjust_cfa_offset(64)
	stw	r31,48(1)
	cfi_offset(31,(48-64))
	stw	r30,44(1)
	cfi_offset(30,(44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
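/* A minimal C sketch of that first-word trick (illustrative only, and
   big-endian specific; the names here are hypothetical, not the
   registers used below): round both pointers down to a word boundary,
   then shift the first words left so the bytes that precede the real
   start drop out and the surviving bytes line up for an unsigned
   compare.

     unsigned off = (unsigned long) s1 & 3;			// rBITDIF
     const unsigned *w1 = (const unsigned *) ((unsigned long) s1 - off);
     const unsigned *w2 = (const unsigned *) ((unsigned long) s2 - off);
     unsigned v1 = *w1 << (off * 8);				// off * 8 is r11
     unsigned v2 = *w2 << (off * 8);
     if (v1 != v2)
       return v1 < v2 ? -1 : 1;
*/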
	.align 4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, rBITDIF
	slwi	r11, rBITDIF, 3
	srwi	rTMP, rN, 4	/* Divide by 16 */
	andi.	rBITDIF, rN, 12	/* Get the word remainder */
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align 3
L(dsP1):
	slw	rWORD5, rWORD1, r11
	slw	rWORD6, rWORD2, r11
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align 4
L(dPs2):
	slw	rWORD5, rWORD1, r11
	slw	rWORD6, rWORD2, r11
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align 4
L(dPs3):
	slw	rWORD3, rWORD1, r11
	slw	rWORD4, rWORD2, r11
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(dPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, r11
	slw	rWORD2, rWORD2, r11
	cmplw	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align 4
L(Waligned):
	andi.	rBITDIF, rN, 12	/* Get the word remainder */
	srwi	rTMP, rN, 4	/* Divide by 16 */
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align 4
L(dP1):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
L(dP1e):
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)

	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	r30,44(1)
	lwz	r31,48(1)
	.align 3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align 4
L(dP2):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP2x):
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
	cmplw	cr5, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align 4
L(dP3):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP3x):
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
	cmplw	cr5, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(dP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
L(dP4e):
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align 4
L(dLoop):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
	bne-	cr1, L(dLcr1)
	cmplw	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	r30,44(1)
	lwz	r31,48(1)
	lwz	1,0(1)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
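/* A C sketch of this tail trim (illustrative only; v1/v2/remaining are
   hypothetical names, and discarding the trailing bytes by shifting
   right assumes big-endian words): with 1-3 bytes left, load the whole
   aligned word and shift right by 32 - 8*remaining so only the bytes
   inside the compare length participate.

     unsigned bits = 32 - 8 * remaining;	// rN after the subfic above
     unsigned v1 = *w1 >> bits;
     unsigned v2 = *w2 >> bits;
     if (v1 == v2)
       return 0;
     return v1 < v2 ? -1 : 1;
*/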
L(d00):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	cmplw	rWORD1,rWORD2
	li	rRTN,0
	beqlr
	li	rRTN,1
	bgtlr
	li	rRTN,-1
	blr

	.align 4
L(dLcr0):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align 4
L(dLcr1):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align 4
L(dLcr6):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align 4
L(dLcr5):
	lwz	r30,44(1)
	lwz	r31,48(1)
L(dLcr5x):
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align 4
L(bytealigned):
	cfi_adjust_cfa_offset(-64)
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands;
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
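/* A rough C analogue of the modulo-scheduled byte loop below (a sketch
   only; the variable names are hypothetical).  Three byte pairs are
   kept in flight, so each branch tests a pair that was loaded, and
   compared, in an earlier dispatch group; the pairs still pending when
   the loop ends must be tested afterwards, as the exit paths below do.

     unsigned char a1 = s1[0], b1 = s2[0];	// prime pair 1
     unsigned char a2 = s1[1], b2 = s2[1];	// prime pair 2
     unsigned char a3 = s1[2], b3 = s2[2];	// prime pair 3
     for (size_t i = 3; i < n; i += 3)
       {
	 if (a1 != b1) break;			// decided one group late
	 a1 = s1[i], b1 = s2[i];		// refill while deciding
	 if (a2 != b2) break;
	 a2 = s1[i + 1], b2 = s2[i + 1];
	 if (a3 != b3) break;
	 a3 = s1[i + 2], b3 = s2[i + 2];
       }
*/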

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align 4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)

	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmplw	cr0, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the CTR) to
   prevent these speculative loads from causing a segfault.  When the
   loop exits early (before all the pending bytes have been tested),
   we must complete the pending operations before returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align 4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align 4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align 4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr

L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align 4
L(zeroLengthReturn):

L(zeroLength):
	li	rRTN, 0
	blr

	cfi_adjust_cfa_offset(64)
	.align 4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is word aligned and we
   can perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not already word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
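/* The core of the unaligned path is merging each rSTR2 word from two
   adjacent aligned loads.  A minimal C sketch (illustrative only and
   big-endian specific; sh_l/sh_r are hypothetical names playing the
   roles of rSHL/rSHR, and sh_l is nonzero on this path, so sh_r < 32):

     unsigned sh_l = 8 * ((unsigned long) s2 & 3);	// rSHL
     unsigned sh_r = 32 - sh_l;				// rSHR
     unsigned prev = w2[0], curr = w2[1];		// aligned loads
     unsigned merged = (prev << sh_l) | (curr >> sh_r);

   Each iteration below keeps the "prev << sh_l" half in a rotation temp
   (rB/rD/rF/rH) so only one new rSTR2 word is loaded per compare.  */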
#define rSHL	r29	/* Unaligned shift left count.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rB	r27	/* Left rotation temp for rWORD2.  */
#define rD	r26	/* Left rotation temp for rWORD4.  */
#define rF	r25	/* Left rotation temp for rWORD6.  */
#define rH	r24	/* Left rotation temp for rWORD8.  */
#define rA	r0	/* Right rotation temp for rWORD2.  */
#define rC	r12	/* Right rotation temp for rWORD4.  */
#define rE	r0	/* Right rotation temp for rWORD6.  */
#define rG	r12	/* Right rotation temp for rWORD8.  */
L(unaligned):
	stw	r29,40(r1)
	cfi_offset(r29,(40-64))
	clrlwi	rSHL, rSTR2, 30
	stw	r28,36(r1)
	cfi_offset(r28,(36-64))
	beq	cr5, L(Wunaligned)
	stw	r27,32(r1)
	cfi_offset(r27,(32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub	r27, rSTR2, rBITDIF
/* But do not attempt to address the W before the W that contains
   the actual start of rSTR2.  */
	clrrwi	rSTR2, rSTR2, 2
	stw	r26,28(r1)
	cfi_offset(r26,(28-64))
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	clrlwi	rSHL, r27, 30
	clrrwi	rSTR1, rSTR1, 2
	stw	r25,24(r1)
	cfi_offset(r25,(24-64))
	slwi	rSHL, rSHL, 3
	cmplw	cr5, r27, rSTR2
	add	rN, rN, rBITDIF
	slwi	r11, rBITDIF, 3
	stw	r24,20(r1)
	cfi_offset(r24,(20-64))
	subfic	rSHR, rSHL, 32
	srwi	rTMP, rN, 4	/* Divide by 16 */
	andi.	rBITDIF, rN, 12	/* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
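/* In C terms (a sketch; the names are hypothetical): only touch the
   aligned word below the real start of rSTR2 when it actually overlaps
   the string, otherwise substitute zero, since a stray load could cross
   into an unmapped page.

     unsigned hi = 0;
     if (logical_start >= aligned_s2)	// cr5 from the cmplw above
       hi = *aligned_s2++ << sh_l;	// safe: word overlaps the string
*/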
	li	rWORD8, 0
	blt	cr5, L(dus0)
	lwz	rWORD8, 0(rSTR2)
	la	rSTR2, 4(rSTR2)
	slw	rWORD8, rWORD8, rSHL

L(dus0):
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	srw	rG, rWORD2, rSHR
	clrlwi	rN, rN, 30
	beq	L(duPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 4 */
	.align 4
L(dusP1):
	slw	rB, rWORD2, rSHL
	slw	rWORD7, rWORD1, r11
	slw	rWORD8, rWORD8, r11
	bge	cr7, L(duP1e)
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	lwz	rWORD2, 4(rSTR2)
	srw	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align 4
L(duPs2):
	slw	rH, rWORD2, rSHL
	slw	rWORD5, rWORD1, r11
	slw	rWORD6, rWORD8, r11
	b	L(duP2e)
/* Remainder is 12 */
	.align 4
L(duPs3):
	slw	rF, rWORD2, rSHL
	slw	rWORD3, rWORD1, r11
	slw	rWORD4, rWORD8, r11
	b	L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(duPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	slw	rD, rWORD2, rSHL
	slw	rWORD1, rWORD1, r11
	slw	rWORD2, rWORD8, r11
	b	L(duP4e)

/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	.align 4
L(Wunaligned):
	stw	r27,32(r1)
	cfi_offset(r27,(32-64))
	clrrwi	rSTR2, rSTR2, 2
	stw	r26,28(r1)
	cfi_offset(r26,(28-64))
	srwi	rTMP, rN, 4	/* Divide by 16 */
	stw	r25,24(r1)
	cfi_offset(r25,(24-64))
	andi.	rBITDIF, rN, 12	/* Get the W remainder */
	stw	r24,20(r1)
	cfi_offset(r24,(20-64))
	slwi	rSHL, rSHL, 3
	lwz	rWORD6, 0(rSTR2)
	lwzu	rWORD8, 4(rSTR2)
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	subfic	rSHR, rSHL, 32
	slw	rH, rWORD6, rSHL
	beq	L(duP4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 4 */
	.align 4
L(duP1):
	srw	rG, rWORD8, rSHR
	lwz	rWORD7, 0(rSTR1)
	slw	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP1x)
L(duP1e):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	srw	rA, rWORD2, rSHR
	slw	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	srw	rC, rWORD4, rSHR
	slw	rF, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, rC, rD
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	srw	rE, rWORD6, rSHR
	slw	rH, rWORD6, rSHL
	bne	cr0, L(duLcr0)
	or	rWORD6, rE, rF
	cmplw	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align 4
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmplw	cr5, rWORD7, rWORD8
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	lwz	rWORD2, 4(rSTR2)
	srw	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 8 */
	.align 4
L(duP2):
	srw	rE, rWORD8, rSHR
	lwz	rWORD5, 0(rSTR1)
	or	rWORD6, rE, rH
	slw	rH, rWORD8, rSHL
L(duP2e):
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	srw	rG, rWORD8, rSHR
	slw	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP2x)
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	rA, rWORD2, rSHR
	slw	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srw	rC, rWORD4, rSHR
	slw	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	cmplw	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align 4
L(duP2x):
	cmplw	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	lwz	rWORD2, 4(rSTR2)
	srw	rA, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 12 */
	.align 4
L(duP3):
	srw	rC, rWORD8, rSHR
	lwz	rWORD3, 0(rSTR1)
	slw	rF, rWORD8, rSHL
	or	rWORD4, rC, rH
L(duP3e):
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	srw	rE, rWORD6, rSHR
	slw	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	rG, rWORD8, rSHR
	slw	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP3x)
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srw	rA, rWORD2, rSHR
	slw	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmplw	cr0, rWORD1, rWORD2
	b	L(duLoop1)
	.align 4
L(duP3x):
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(duLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	lwz	rWORD2, 4(rSTR2)
	srw	rA, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(duP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	srw	rA, rWORD8, rSHR
	lwz	rWORD1, 0(rSTR1)
	slw	rD, rWORD8, rSHL
	or	rWORD2, rA, rH
L(duP4e):
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	srw	rC, rWORD4, rSHR
	slw	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	bne	cr0, L(duLcr0)
	srw	rE, rWORD6, rSHR
	slw	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srw	rG, rWORD8, rSHR
	slw	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	cmplw	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align 4
L(duLoop):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srw	rA, rWORD2, rSHR
	slw	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
L(duLoop1):
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srw	rC, rWORD4, rSHR
	slw	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
L(duLoop2):
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr0, L(duLcr0)
	srw	rE, rWORD6, rSHR
	slw	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
L(duLoop3):
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srw	rG, rWORD8, rSHR
	slw	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	bdnz+	L(duLoop)

L(duL4):
	bne	cr1, L(duLcr1)
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(du44):
	bne	cr0, L(duLcr0)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	slwi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2, which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   to the shift count, we do not need to load rWORD2 (all significant
   bits are already in rB).  */
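/* As a C sketch (names hypothetical): rB already holds the high part of
   the merged rSTR2 word, so the next aligned word is needed only when
   the remaining bits extend past what rB provides.

     unsigned lo = 0;
     if (8 * remaining > sh_r)		// cr7 from the cmplw below
       lo = *next_w2 >> sh_r;		// safe: still inside the string
     unsigned v2 = (lo | hi_part) >> (32 - 8 * remaining);
*/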
	cmplw	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	lwz	rWORD2, 4(rSTR2)
	srw	rA, rWORD2, rSHR
	.align 4
L(dutrim):
	lwz	rWORD1, 4(rSTR1)
	lwz	r31,48(1)
	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
	or	rWORD2, rA, rB
	lwz	r30,44(1)
	lwz	r29,40(r1)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	lwz	r28,36(r1)
	lwz	r27,32(r1)
	cmplw	rWORD1,rWORD2
	li	rRTN,0
	beq	L(dureturn26)
	li	rRTN,1
	bgt	L(dureturn26)
	li	rRTN,-1
	b	L(dureturn26)
	.align 4
L(duLcr0):
	lwz	r31,48(1)
	lwz	r30,44(1)
	li	rRTN, 1
	bgt	cr0, L(dureturn29)
	lwz	r29,40(r1)
	lwz	r28,36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align 4
L(duLcr1):
	lwz	r31,48(1)
	lwz	r30,44(1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	lwz	r29,40(r1)
	lwz	r28,36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align 4
L(duLcr6):
	lwz	r31,48(1)
	lwz	r30,44(1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	lwz	r29,40(r1)
	lwz	r28,36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align 4
L(duLcr5):
	lwz	r31,48(1)
	lwz	r30,44(1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	lwz	r29,40(r1)
	lwz	r28,36(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align 3
L(duZeroReturn):
	li	rRTN,0
	.align 4
L(dureturn):
	lwz	r31,48(1)
	lwz	r30,44(1)
L(dureturn29):
	lwz	r29,40(r1)
	lwz	r28,36(r1)
L(dureturn27):
	lwz	r27,32(r1)
L(dureturn26):
	lwz	r26,28(r1)
L(dureturn25):
	lwz	r25,24(r1)
	lwz	r24,20(r1)
	lwz	1,0(1)
	blr
END (BP_SYM (memcmp))

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)