]> git.ipfire.org Git - thirdparty/glibc.git/blame - powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S
2.5-18.1
[thirdparty/glibc.git] / powerpc-cpu / sysdeps / powerpc / powerpc64 / power4 / memcmp.S
CommitLineData
0ecb606c
JJ
1/* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
18 02110-1301 USA. */
19
20#include <sysdep.h>
21#include <bp-sym.h>
22#include <bp-asm.h>
23
24/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
25
26EALIGN (BP_SYM(memcmp), 4, 0)
27	CALL_MCOUNT 3
28
29#define rTMP	r0
30#define rRTN	r3
31#define rSTR1	r3	/* first string arg */
32#define rSTR2	r4	/* second string arg */
33#define rN	r5	/* max string length */
34/* Note: The Bounded pointer support in this code is broken. This code
35 was inherited from PPC32 and that support was never completed.
36 Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
37#define rWORD1	r6	/* current word in s1 */
38#define rWORD2	r7	/* current word in s2 */
39#define rWORD3	r8	/* next word in s1 */
40#define rWORD4	r9	/* next word in s2 */
41#define rWORD5	r10	/* next word in s1 */
42#define rWORD6	r11	/* next word in s2 */
43#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
44#define rWORD7	r30	/* next word in s1 */
45#define rWORD8	r31	/* next word in s2 */
/* rWORD7/rWORD8 live in non-volatile r30/r31; they are saved to
   -8(r1)/-16(r1) before first use and restored on every exit path
   that reaches them.  */
46
47	xor	rTMP, rSTR2, rSTR1	/* low 3 bits nonzero iff s1/s2 alignments differ */
48	cmpldi	cr6, rN, 0
49	cmpldi	cr1, rN, 12
50	clrldi.	rTMP, rTMP, 61
51	clrldi	rBITDIF, rSTR1, 61	/* rBITDIF = rSTR1 & 7 */
52	cmpldi	cr5, rBITDIF, 0
53	beq-	cr6, L(zeroLength)
54	dcbt	0,rSTR1
55	dcbt	0,rSTR2
56/* If less than 8 bytes or not aligned, use the unaligned
57 byte loop. */
58	blt	cr1, L(bytealigned)
59	std	rWORD8,-8(r1)
60	cfi_offset(rWORD8,-8)
61	std	rWORD7,-16(r1)
62	cfi_offset(rWORD7,-16)
63	bne	L(unaligned)
64/* At this point we know both strings have the same alignment and the
65 compare length is at least 8 bytes. rBITDIF contains the low order
66 3 bits of rSTR1 and cr5 contains the result of the logical compare
67 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
68 aligned and can perform the DWaligned loop.
69
70 Otherwise we know the two strings have the same alignment (but not
71 yet DW). So we can force the string addresses to the next lower DW
72 boundary and special case this first DW word using shift left to
73 eliminate bits preceding the first byte. Since we want to join the
74 normal (DWaligned) compare loop, starting at the second double word,
75 we need to adjust the length (rN) and special case the loop
76 versioning for the first DW. This ensures that the loop count is
77 correct and the first DW (shifted) is in the expected register pair. */
78	.align 4
79L(samealignment):
80	clrrdi	rSTR1, rSTR1, 3
81	clrrdi	rSTR2, rSTR2, 3
82	beq	cr5, L(DWaligned)
83	add	rN, rN, rBITDIF
84	sldi	r11, rBITDIF, 3	/* r11 = bit count to shift off the first DW */
85	srdi	rTMP, rN, 5	/* Divide by 32 */
86	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
87	ld	rWORD1, 0(rSTR1)
88	ld	rWORD2, 0(rSTR2)
89	cmpldi	cr1, rBITDIF, 16
90	cmpldi	cr7, rN, 32
91	clrldi	rN, rN, 61
92	beq	L(dPs4)
93	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
94	bgt	cr1, L(dPs3)
95	beq	cr1, L(dPs2)
96
97/* Remainder is 8 */
98	.align 3
99L(dsP1):
100	sld	rWORD5, rWORD1, r11
101	sld	rWORD6, rWORD2, r11
102	cmpld	cr5, rWORD5, rWORD6
103	blt	cr7, L(dP1x)
104/* Do something useful in this cycle since we have to branch anyway. */
105	ld	rWORD1, 8(rSTR1)
106	ld	rWORD2, 8(rSTR2)
107	cmpld	cr0, rWORD1, rWORD2
108	b	L(dP1e)
109/* Remainder is 16 */
110	.align 4
111L(dPs2):
112	sld	rWORD5, rWORD1, r11
113	sld	rWORD6, rWORD2, r11
114	cmpld	cr6, rWORD5, rWORD6
115	blt	cr7, L(dP2x)
116/* Do something useful in this cycle since we have to branch anyway. */
117	ld	rWORD7, 8(rSTR1)
118	ld	rWORD8, 8(rSTR2)
119	cmpld	cr5, rWORD7, rWORD8
120	b	L(dP2e)
121/* Remainder is 24 */
122	.align 4
123L(dPs3):
124	sld	rWORD3, rWORD1, r11
125	sld	rWORD4, rWORD2, r11
126	cmpld	cr1, rWORD3, rWORD4
127	b	L(dP3e)
128/* Count is a multiple of 32, remainder is 0 */
129	.align 4
130L(dPs4):
131	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
132	sld	rWORD1, rWORD1, r11
133	sld	rWORD2, rWORD2, r11
134	cmpld	cr0, rWORD1, rWORD2
135	b	L(dP4e)
136
137/* At this point we know both strings are double word aligned and the
138 compare length is at least 8 bytes. */
139	.align 4
140L(DWaligned):
141	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
142	srdi	rTMP, rN, 5	/* Divide by 32 */
143	cmpldi	cr1, rBITDIF, 16
144	cmpldi	cr7, rN, 32
145	clrldi	rN, rN, 61
146	beq	L(dP4)
147	bgt	cr1, L(dP3)
148	beq	cr1, L(dP2)
149
150/* Remainder is 8 */
151	.align 4
152L(dP1):
153	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
154/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
155 (8-15 byte compare), we want to use only volatile registers. This
156 means we can avoid restoring non-volatile registers since we did not
157 change any on the early exit path. The key here is the non-early
158 exit path only cares about the condition code (cr5), not about which
159 register pair was used. */
160	ld	rWORD5, 0(rSTR1)
161	ld	rWORD6, 0(rSTR2)
162	cmpld	cr5, rWORD5, rWORD6
163	blt	cr7, L(dP1x)
164	ld	rWORD1, 8(rSTR1)
165	ld	rWORD2, 8(rSTR2)
166	cmpld	cr0, rWORD1, rWORD2
167L(dP1e):
168	ld	rWORD3, 16(rSTR1)
169	ld	rWORD4, 16(rSTR2)
170	cmpld	cr1, rWORD3, rWORD4
171	ld	rWORD5, 24(rSTR1)
172	ld	rWORD6, 24(rSTR2)
173	cmpld	cr6, rWORD5, rWORD6
174	bne	cr5, L(dLcr5)
175	bne	cr0, L(dLcr0)
176
177	ldu	rWORD7, 32(rSTR1)
178	ldu	rWORD8, 32(rSTR2)
179	bne	cr1, L(dLcr1)
180	cmpld	cr5, rWORD7, rWORD8
181	bdnz	L(dLoop)
182	bne	cr6, L(dLcr6)
183	ld	rWORD8,-8(r1)
184	ld	rWORD7,-16(r1)
185	.align 3
186L(dP1x):
187	sldi.	r12, rN, 3	/* remainder in bits; sets cr0 for the bne below */
188	bne	cr5, L(dLcr5)
189	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8). */
190	bne	L(d00)
191	li	rRTN, 0
192	blr
193
194/* Remainder is 16 */
195	.align 4
196L(dP2):
197	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
198	ld	rWORD5, 0(rSTR1)
199	ld	rWORD6, 0(rSTR2)
200	cmpld	cr6, rWORD5, rWORD6
201	blt	cr7, L(dP2x)
202	ld	rWORD7, 8(rSTR1)
203	ld	rWORD8, 8(rSTR2)
204	cmpld	cr5, rWORD7, rWORD8
205L(dP2e):
206	ld	rWORD1, 16(rSTR1)
207	ld	rWORD2, 16(rSTR2)
208	cmpld	cr0, rWORD1, rWORD2
209	ld	rWORD3, 24(rSTR1)
210	ld	rWORD4, 24(rSTR2)
211	cmpld	cr1, rWORD3, rWORD4
212	addi	rSTR1, rSTR1, 8
213	addi	rSTR2, rSTR2, 8
214	bne	cr6, L(dLcr6)
215	bne	cr5, L(dLcr5)
216	b	L(dLoop2)
217/* Again we are on an early exit path (16-23 byte compare), we want to
218 only use volatile registers and avoid restoring non-volatile
219 registers. */
220	.align 4
221L(dP2x):
222	ld	rWORD3, 8(rSTR1)
223	ld	rWORD4, 8(rSTR2)
224	cmpld	cr5, rWORD3, rWORD4
225	sldi.	r12, rN, 3
226	bne	cr6, L(dLcr6)
227	addi	rSTR1, rSTR1, 8
228	addi	rSTR2, rSTR2, 8
229	bne	cr5, L(dLcr5)
230	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8). */
231	bne	L(d00)
232	li	rRTN, 0
233	blr
234
235/* Remainder is 24 */
236	.align 4
237L(dP3):
238	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
239	ld	rWORD3, 0(rSTR1)
240	ld	rWORD4, 0(rSTR2)
241	cmpld	cr1, rWORD3, rWORD4
242L(dP3e):
243	ld	rWORD5, 8(rSTR1)
244	ld	rWORD6, 8(rSTR2)
245	cmpld	cr6, rWORD5, rWORD6
246	blt	cr7, L(dP3x)
247	ld	rWORD7, 16(rSTR1)
248	ld	rWORD8, 16(rSTR2)
249	cmpld	cr5, rWORD7, rWORD8
250	ld	rWORD1, 24(rSTR1)
251	ld	rWORD2, 24(rSTR2)
252	cmpld	cr0, rWORD1, rWORD2
253	addi	rSTR1, rSTR1, 16
254	addi	rSTR2, rSTR2, 16
255	bne	cr1, L(dLcr1)
256	bne	cr6, L(dLcr6)
257	b	L(dLoop1)
258/* Again we are on an early exit path (24-31 byte compare), we want to
259 only use volatile registers and avoid restoring non-volatile
260 registers. */
261	.align 4
262L(dP3x):
263	ld	rWORD1, 16(rSTR1)
264	ld	rWORD2, 16(rSTR2)
265	cmpld	cr5, rWORD1, rWORD2
266	sldi.	r12, rN, 3
267	bne	cr1, L(dLcr1)
268	addi	rSTR1, rSTR1, 16
269	addi	rSTR2, rSTR2, 16
270	bne	cr6, L(dLcr6)
271	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8). */
272	bne	cr5, L(dLcr5)
273	bne	L(d00)
274	li	rRTN, 0
275	blr
276
277/* Count is a multiple of 32, remainder is 0 */
278	.align 4
279L(dP4):
280	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
281	ld	rWORD1, 0(rSTR1)
282	ld	rWORD2, 0(rSTR2)
283	cmpld	cr0, rWORD1, rWORD2
284L(dP4e):
285	ld	rWORD3, 8(rSTR1)
286	ld	rWORD4, 8(rSTR2)
287	cmpld	cr1, rWORD3, rWORD4
288	ld	rWORD5, 16(rSTR1)
289	ld	rWORD6, 16(rSTR2)
290	cmpld	cr6, rWORD5, rWORD6
291	ldu	rWORD7, 24(rSTR1)
292	ldu	rWORD8, 24(rSTR2)
293	cmpld	cr5, rWORD7, rWORD8
294	bne	cr0, L(dLcr0)
295	bne	cr1, L(dLcr1)
296	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
297/* This is the primary loop: 32 bytes (4 DWs) per iteration, with each
   compare's branch deferred into the next group to hide latency. */
298	.align 4
299L(dLoop):
300	ld	rWORD1, 8(rSTR1)
301	ld	rWORD2, 8(rSTR2)
302	cmpld	cr1, rWORD3, rWORD4
303	bne	cr6, L(dLcr6)
304L(dLoop1):
305	ld	rWORD3, 16(rSTR1)
306	ld	rWORD4, 16(rSTR2)
307	cmpld	cr6, rWORD5, rWORD6
308	bne	cr5, L(dLcr5)
309L(dLoop2):
310	ld	rWORD5, 24(rSTR1)
311	ld	rWORD6, 24(rSTR2)
312	cmpld	cr5, rWORD7, rWORD8
313	bne	cr0, L(dLcr0)
314L(dLoop3):
315	ldu	rWORD7, 32(rSTR1)
316	ldu	rWORD8, 32(rSTR2)
317	bne-	cr1, L(dLcr1)
318	cmpld	cr0, rWORD1, rWORD2
319	bdnz+	L(dLoop)
320
321L(dL4):
322	cmpld	cr1, rWORD3, rWORD4
323	bne	cr6, L(dLcr6)
324	cmpld	cr6, rWORD5, rWORD6
325	bne	cr5, L(dLcr5)
326	cmpld	cr5, rWORD7, rWORD8
327L(d44):
328	bne	cr0, L(dLcr0)
329L(d34):
330	bne	cr1, L(dLcr1)
331L(d24):
332	bne	cr6, L(dLcr6)
333L(d14):
334	sldi.	r12, rN, 3
335	bne	cr5, L(dLcr5)
336L(d04):
337	ld	rWORD8,-8(r1)
338	ld	rWORD7,-16(r1)
339	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8). */
340	beq	L(zeroLength)
341/* At this point we have a remainder of 1 to 7 bytes to compare. Since
342 we are aligned it is safe to load the whole double word, and use
343 shift right double to eliminate bits beyond the compare length. */
344L(d00):
345	ld	rWORD1, 8(rSTR1)
346	ld	rWORD2, 8(rSTR2)
347	srd	rWORD1, rWORD1, rN
348	srd	rWORD2, rWORD2, rN
349	cmpld	cr5, rWORD1, rWORD2
350	bne	cr5, L(dLcr5x)
351	li	rRTN, 0
352	blr
353	.align 4
/* Difference found in the aligned path.  Each stub restores the saved
   non-volatile r30/r31 (rWORD7/rWORD8) and returns 1 if the cmpld set
   "greater" in its CR field (s1 > s2, unsigned), else -1.  */
354L(dLcr0):
355	ld	rWORD8,-8(r1)
356	ld	rWORD7,-16(r1)
357	li	rRTN, 1
358	bgtlr	cr0
359	li	rRTN, -1
360	blr
361	.align 4
362L(dLcr1):
363	ld	rWORD8,-8(r1)
364	ld	rWORD7,-16(r1)
365	li	rRTN, 1
366	bgtlr	cr1
367	li	rRTN, -1
368	blr
369	.align 4
370L(dLcr6):
371	ld	rWORD8,-8(r1)
372	ld	rWORD7,-16(r1)
373	li	rRTN, 1
374	bgtlr	cr6
375	li	rRTN, -1
376	blr
377	.align 4
378L(dLcr5):
379	ld	rWORD8,-8(r1)
380	ld	rWORD7,-16(r1)
381L(dLcr5x):	/* entered from L(d00) where r30/r31 are already restored */
382	li	rRTN, 1
383	bgtlr	cr5
384	li	rRTN, -1
385	blr
386
387	.align 4
388L(bytealigned):
389	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
390	beq-	cr6, L(zeroLength)
391
392/* We need to prime this loop. This loop is swing modulo scheduled
393 to avoid pipe delays. The dependent instruction latencies (load to
394 compare to conditional branch) are 2 to 3 cycles. In this loop each
395 dispatch group ends in a branch and takes 1 cycle. Effectively
396 the first iteration of the loop only serves to load operands and
397 branches based on compares are delayed until the next loop.
398
399 So we must precondition some registers and condition codes so that
400 we don't exit the loop early on the first iteration. */
401
402	lbz	rWORD1, 0(rSTR1)
403	lbz	rWORD2, 0(rSTR2)
404	bdz-	L(b11)
405	cmpld	cr0, rWORD1, rWORD2
406	lbz	rWORD3, 1(rSTR1)
407	lbz	rWORD4, 1(rSTR2)
408	bdz-	L(b12)
409	cmpld	cr1, rWORD3, rWORD4
410	lbzu	rWORD5, 2(rSTR1)
411	lbzu	rWORD6, 2(rSTR2)
412	bdz-	L(b13)
413	.align 4
414L(bLoop):
415	lbzu	rWORD1, 1(rSTR1)
416	lbzu	rWORD2, 1(rSTR2)
417	bne-	cr0, L(bLcr0)
418
419	cmpld	cr6, rWORD5, rWORD6
420	bdz-	L(b3i)
421
422	lbzu	rWORD3, 1(rSTR1)
423	lbzu	rWORD4, 1(rSTR2)
424	bne-	cr1, L(bLcr1)
425
426	cmpld	cr0, rWORD1, rWORD2
427	bdz-	L(b2i)
428
429	lbzu	rWORD5, 1(rSTR1)
430	lbzu	rWORD6, 1(rSTR2)
431	bne-	cr6, L(bLcr6)
432
433	cmpld	cr1, rWORD3, rWORD4
434	bdnz+	L(bLoop)
435
436/* We are speculatively loading bytes before we have tested the previous
437 bytes. But we must avoid overrunning the length (in the ctr) to
438 prevent these speculative loads from causing a segfault. In this
439 case the loop will exit early (before all the pending bytes are
440 tested). In that case we must complete the pending operations
441 before returning. */
442L(b1i):
443	bne-	cr0, L(bLcr0)
444	bne-	cr1, L(bLcr1)
445	b	L(bx56)
446	.align 4
447L(b2i):
448	bne-	cr6, L(bLcr6)
449	bne-	cr0, L(bLcr0)
450	b	L(bx34)
451	.align 4
452L(b3i):
453	bne-	cr1, L(bLcr1)
454	bne-	cr6, L(bLcr6)
455	b	L(bx12)
456	.align 4
457L(bLcr0):
458	li	rRTN, 1
459	bgtlr	cr0
460	li	rRTN, -1
461	blr
462L(bLcr1):
463	li	rRTN, 1
464	bgtlr	cr1
465	li	rRTN, -1
466	blr
467L(bLcr6):
468	li	rRTN, 1
469	bgtlr	cr6
470	li	rRTN, -1
471	blr
472
473L(b13):
474	bne-	cr0, L(bx12)
475	bne-	cr1, L(bx34)
476L(bx56):
477	sub	rRTN, rWORD5, rWORD6	/* byte difference is the return value */
478	blr
479	nop
480L(b12):
481	bne-	cr0, L(bx12)
482L(bx34):
483	sub	rRTN, rWORD3, rWORD4
484	blr
485L(b11):
486L(bx12):
487	sub	rRTN, rWORD1, rWORD2
488	blr
489	.align 4
490L(zeroLengthReturn):
491	ld	rWORD8,-8(r1)
492	ld	rWORD7,-16(r1)
493L(zeroLength):
494	li	rRTN, 0
495	blr
496
497	.align 4
498/* At this point we know the strings have different alignment and the
499 compare length is at least 8 bytes. rBITDIF contains the low order
500 3 bits of rSTR1 and cr5 contains the result of the logical compare
501 of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is double word
502 aligned and can perform the DWunaligned loop.
503
504 Otherwise we know that rSTR1 is not yet DW aligned.
505 So we can force the string addresses to the next lower DW
506 boundary and special case this first DW word using shift left to
507 eliminate bits preceding the first byte. Since we want to join the
508 normal (DWaligned) compare loop, starting at the second double word,
509 we need to adjust the length (rN) and special case the loop
510 versioning for the first DW. This ensures that the loop count is
511 correct and the first DW (shifted) is in the expected register pair. */
512#define rSHL	r29	/* Unaligned shift left count. */
513#define rSHR	r28	/* Unaligned shift right count. */
514#define rB	r27	/* Left rotation temp for rWORD2. */
515#define rD	r26	/* Left rotation temp for rWORD4. */
516#define rF	r25	/* Left rotation temp for rWORD6. */
517#define rH	r24	/* Left rotation temp for rWORD8. */
518#define rA	r0	/* Right rotation temp for rWORD2. */
519#define rC	r12	/* Right rotation temp for rWORD4. */
520#define rE	r0	/* Right rotation temp for rWORD6. */
521#define rG	r12	/* Right rotation temp for rWORD8. */
522L(unaligned):
523	std	r29,-24(r1)
524	cfi_offset(r29,-24)
525	clrldi	rSHL, rSTR2, 61
526	beq-	cr6, L(duzeroLength)
527	std	r28,-32(r1)
528	cfi_offset(r28,-32)
529	beq	cr5, L(DWunaligned)
530	std	r27,-40(r1)
531	cfi_offset(r27,-40)
532/* Adjust the logical start of rSTR2 to compensate for the extra bits
533 in the 1st rSTR1 DW. */
534	sub	r27, rSTR2, rBITDIF
535/* But do not attempt to address the DW before that DW that contains
536 the actual start of rSTR2. */
537	clrrdi	rSTR2, rSTR2, 3
538	std	r26,-48(r1)
539	cfi_offset(r26,-48)
540/* Compute the left/right shift counts for the unaligned rSTR2,
541 compensating for the logical (DW aligned) start of rSTR1. */
542	clrldi	rSHL, r27, 61
543	clrrdi	rSTR1, rSTR1, 3
544	std	r25,-56(r1)
545	cfi_offset(r25,-56)
546	sldi	rSHL, rSHL, 3
547	cmpld	cr5, r27, rSTR2
548	add	rN, rN, rBITDIF
549	sldi	r11, rBITDIF, 3
550	std	r24,-64(r1)
551	cfi_offset(r24,-64)
552	subfic	rSHR, rSHL, 64
553	srdi	rTMP, rN, 5	/* Divide by 32 */
554	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
555/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
556 this special case those bits may be discarded anyway. Also we
557 must avoid loading a DW where none of the bits are part of rSTR2 as
558 this may cross a page boundary and cause a page fault. */
559	li	rWORD8, 0
560	blt	cr5, L(dus0)
561	ld	rWORD8, 0(rSTR2)
562	la	rSTR2, 8(rSTR2)
563	sld	rWORD8, rWORD8, rSHL
564
565L(dus0):
566	ld	rWORD1, 0(rSTR1)
567	ld	rWORD2, 0(rSTR2)
568	cmpldi	cr1, rBITDIF, 16
569	cmpldi	cr7, rN, 32
570	srd	rG, rWORD2, rSHR
571	clrldi	rN, rN, 61
572	beq	L(duPs4)
573	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
574	or	rWORD8, rG, rWORD8
575	bgt	cr1, L(duPs3)
576	beq	cr1, L(duPs2)
577
578/* Remainder is 8 */
579	.align 4
580L(dusP1):
581	sld	rB, rWORD2, rSHL
582	sld	rWORD7, rWORD1, r11
583	sld	rWORD8, rWORD8, r11
584	bge	cr7, L(duP1e)
585/* At this point we exit early with the first double word compare
586 complete and remainder of 0 to 7 bytes. See L(du14) for details on
587 how we handle the remaining bytes. */
588	cmpld	cr5, rWORD7, rWORD8
589	sldi.	rN, rN, 3
590	bne	cr5, L(duLcr5)
591	cmpld	cr7, rN, rSHR
592	beq	L(duZeroReturn)
593	li	rA, 0
594	ble	cr7, L(dutrim)
595	ld	rWORD2, 8(rSTR2)
596	srd	rA, rWORD2, rSHR
597	b	L(dutrim)
598/* Remainder is 16 */
599	.align 4
600L(duPs2):
601	sld	rH, rWORD2, rSHL
602	sld	rWORD5, rWORD1, r11
603	sld	rWORD6, rWORD8, r11
604	b	L(duP2e)
605/* Remainder is 24 */
606	.align 4
607L(duPs3):
608	sld	rF, rWORD2, rSHL
609	sld	rWORD3, rWORD1, r11
610	sld	rWORD4, rWORD8, r11
611	b	L(duP3e)
612/* Count is a multiple of 32, remainder is 0 */
613	.align 4
614L(duPs4):
615	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
616	or	rWORD8, rG, rWORD8
617	sld	rD, rWORD2, rSHL
618	sld	rWORD1, rWORD1, r11
619	sld	rWORD2, rWORD8, r11
620	b	L(duP4e)
621
622/* At this point we know rSTR1 is double word aligned and the
623 compare length is at least 8 bytes. */
624	.align 4
625L(DWunaligned):
626	std	r27,-40(r1)
627	cfi_offset(r27,-40)
628	clrrdi	rSTR2, rSTR2, 3
629	std	r26,-48(r1)
630	cfi_offset(r26,-48)
631	srdi	rTMP, rN, 5	/* Divide by 32 */
632	std	r25,-56(r1)
633	cfi_offset(r25,-56)
634	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
635	std	r24,-64(r1)
636	cfi_offset(r24,-64)
637	sldi	rSHL, rSHL, 3
638	ld	rWORD6, 0(rSTR2)
639	ldu	rWORD8, 8(rSTR2)
640	cmpldi	cr1, rBITDIF, 16
641	cmpldi	cr7, rN, 32
642	clrldi	rN, rN, 61
643	subfic	rSHR, rSHL, 64
644	sld	rH, rWORD6, rSHL	/* high part of the first reconstructed s2 DW */
645	beq	L(duP4)
646	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
647	bgt	cr1, L(duP3)
648	beq	cr1, L(duP2)
649
650/* Remainder is 8 */
651	.align 4
652L(duP1):
653	srd	rG, rWORD8, rSHR
654	ld	rWORD7, 0(rSTR1)
655	sld	rB, rWORD8, rSHL
656	or	rWORD8, rG, rH	/* splice the two s2 loads into one aligned DW */
657	blt	cr7, L(duP1x)
658L(duP1e):
659	ld	rWORD1, 8(rSTR1)
660	ld	rWORD2, 8(rSTR2)
661	cmpld	cr5, rWORD7, rWORD8
662	srd	rA, rWORD2, rSHR
663	sld	rD, rWORD2, rSHL
664	or	rWORD2, rA, rB
665	ld	rWORD3, 16(rSTR1)
666	ld	rWORD4, 16(rSTR2)
667	cmpld	cr0, rWORD1, rWORD2
668	srd	rC, rWORD4, rSHR
669	sld	rF, rWORD4, rSHL
670	bne	cr5, L(duLcr5)
671	or	rWORD4, rC, rD
672	ld	rWORD5, 24(rSTR1)
673	ld	rWORD6, 24(rSTR2)
674	cmpld	cr1, rWORD3, rWORD4
675	srd	rE, rWORD6, rSHR
676	sld	rH, rWORD6, rSHL
677	bne	cr0, L(duLcr0)
678	or	rWORD6, rE, rF
679	cmpld	cr6, rWORD5, rWORD6
680	b	L(duLoop3)
681	.align 4
682/* At this point we exit early with the first double word compare
683 complete and remainder of 0 to 7 bytes. See L(du14) for details on
684 how we handle the remaining bytes. */
685L(duP1x):
686	cmpld	cr5, rWORD7, rWORD8
687	sldi.	rN, rN, 3
688	bne	cr5, L(duLcr5)
689	cmpld	cr7, rN, rSHR
690	beq	L(duZeroReturn)
691	li	rA, 0
692	ble	cr7, L(dutrim)
693	ld	rWORD2, 8(rSTR2)
694	srd	rA, rWORD2, rSHR
695	b	L(dutrim)
696/* Remainder is 16 */
697	.align 4
698L(duP2):
699	srd	rE, rWORD8, rSHR
700	ld	rWORD5, 0(rSTR1)
701	or	rWORD6, rE, rH	/* splice the two s2 loads into one aligned DW */
702	sld	rH, rWORD8, rSHL
703L(duP2e):
704	ld	rWORD7, 8(rSTR1)
705	ld	rWORD8, 8(rSTR2)
706	cmpld	cr6, rWORD5, rWORD6
707	srd	rG, rWORD8, rSHR
708	sld	rB, rWORD8, rSHL
709	or	rWORD8, rG, rH
710	blt	cr7, L(duP2x)
711	ld	rWORD1, 16(rSTR1)
712	ld	rWORD2, 16(rSTR2)
713	cmpld	cr5, rWORD7, rWORD8
714	bne	cr6, L(duLcr6)
715	srd	rA, rWORD2, rSHR
716	sld	rD, rWORD2, rSHL
717	or	rWORD2, rA, rB
718	ld	rWORD3, 24(rSTR1)
719	ld	rWORD4, 24(rSTR2)
720	cmpld	cr0, rWORD1, rWORD2
721	bne	cr5, L(duLcr5)
722	srd	rC, rWORD4, rSHR
723	sld	rF, rWORD4, rSHL
724	or	rWORD4, rC, rD
725	addi	rSTR1, rSTR1, 8
726	addi	rSTR2, rSTR2, 8
727	cmpld	cr1, rWORD3, rWORD4
728	b	L(duLoop2)
729	.align 4
730L(duP2x):
731	cmpld	cr5, rWORD7, rWORD8
732	addi	rSTR1, rSTR1, 8
733	addi	rSTR2, rSTR2, 8
734	bne	cr6, L(duLcr6)
735	sldi.	rN, rN, 3
736	bne	cr5, L(duLcr5)
737	cmpld	cr7, rN, rSHR
738	beq	L(duZeroReturn)
739	li	rA, 0
740	ble	cr7, L(dutrim)
741	ld	rWORD2, 8(rSTR2)
742	srd	rA, rWORD2, rSHR
743	b	L(dutrim)
744
745/* Remainder is 24 */
746	.align 4
747L(duP3):
748	srd	rC, rWORD8, rSHR
749	ld	rWORD3, 0(rSTR1)
750	sld	rF, rWORD8, rSHL
751	or	rWORD4, rC, rH
752L(duP3e):
753	ld	rWORD5, 8(rSTR1)
754	ld	rWORD6, 8(rSTR2)
755	cmpld	cr1, rWORD3, rWORD4
756	srd	rE, rWORD6, rSHR
757	sld	rH, rWORD6, rSHL
758	or	rWORD6, rE, rF
759	ld	rWORD7, 16(rSTR1)
760	ld	rWORD8, 16(rSTR2)
761	cmpld	cr6, rWORD5, rWORD6
762	bne	cr1, L(duLcr1)
763	srd	rG, rWORD8, rSHR
764	sld	rB, rWORD8, rSHL
765	or	rWORD8, rG, rH
766	blt	cr7, L(duP3x)
767	ld	rWORD1, 24(rSTR1)
768	ld	rWORD2, 24(rSTR2)
769	cmpld	cr5, rWORD7, rWORD8
770	bne	cr6, L(duLcr6)
771	srd	rA, rWORD2, rSHR
772	sld	rD, rWORD2, rSHL
773	or	rWORD2, rA, rB
774	addi	rSTR1, rSTR1, 16
775	addi	rSTR2, rSTR2, 16
776	cmpld	cr0, rWORD1, rWORD2
777	b	L(duLoop1)
778	.align 4
779L(duP3x):
780	addi	rSTR1, rSTR1, 16
781	addi	rSTR2, rSTR2, 16
782	bne	cr1, L(duLcr1)
783	cmpld	cr5, rWORD7, rWORD8
784	bne	cr6, L(duLcr6)
785	sldi.	rN, rN, 3
786	bne	cr5, L(duLcr5)
787	cmpld	cr7, rN, rSHR
788	beq	L(duZeroReturn)
789	li	rA, 0
790	ble	cr7, L(dutrim)
791	ld	rWORD2, 8(rSTR2)
792	srd	rA, rWORD2, rSHR
793	b	L(dutrim)
794
795/* Count is a multiple of 32, remainder is 0 */
796	.align 4
797L(duP4):
798	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
799	srd	rA, rWORD8, rSHR
800	ld	rWORD1, 0(rSTR1)
801	sld	rD, rWORD8, rSHL
802	or	rWORD2, rA, rH	/* splice the two s2 loads into one aligned DW */
803L(duP4e):
804	ld	rWORD3, 8(rSTR1)
805	ld	rWORD4, 8(rSTR2)
806	cmpld	cr0, rWORD1, rWORD2
807	srd	rC, rWORD4, rSHR
808	sld	rF, rWORD4, rSHL
809	or	rWORD4, rC, rD
810	ld	rWORD5, 16(rSTR1)
811	ld	rWORD6, 16(rSTR2)
812	cmpld	cr1, rWORD3, rWORD4
813	bne	cr0, L(duLcr0)
814	srd	rE, rWORD6, rSHR
815	sld	rH, rWORD6, rSHL
816	or	rWORD6, rE, rF
817	ldu	rWORD7, 24(rSTR1)
818	ldu	rWORD8, 24(rSTR2)
819	cmpld	cr6, rWORD5, rWORD6
820	bne	cr1, L(duLcr1)
821	srd	rG, rWORD8, rSHR
822	sld	rB, rWORD8, rSHL
823	or	rWORD8, rG, rH
824	cmpld	cr5, rWORD7, rWORD8
825	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
826/* This is the primary loop: same 32-bytes-per-iteration shape as
   L(dLoop), with each s2 DW reconstructed from two loads via
   srd/sld/or before its compare. */
827	.align 4
828L(duLoop):
829	ld	rWORD1, 8(rSTR1)
830	ld	rWORD2, 8(rSTR2)
831	cmpld	cr1, rWORD3, rWORD4
832	bne	cr6, L(duLcr6)
833	srd	rA, rWORD2, rSHR
834	sld	rD, rWORD2, rSHL
835	or	rWORD2, rA, rB
836L(duLoop1):
837	ld	rWORD3, 16(rSTR1)
838	ld	rWORD4, 16(rSTR2)
839	cmpld	cr6, rWORD5, rWORD6
840	bne	cr5, L(duLcr5)
841	srd	rC, rWORD4, rSHR
842	sld	rF, rWORD4, rSHL
843	or	rWORD4, rC, rD
844L(duLoop2):
845	ld	rWORD5, 24(rSTR1)
846	ld	rWORD6, 24(rSTR2)
847	cmpld	cr5, rWORD7, rWORD8
848	bne	cr0, L(duLcr0)
849	srd	rE, rWORD6, rSHR
850	sld	rH, rWORD6, rSHL
851	or	rWORD6, rE, rF
852L(duLoop3):
853	ldu	rWORD7, 32(rSTR1)
854	ldu	rWORD8, 32(rSTR2)
855	cmpld	cr0, rWORD1, rWORD2
856	bne-	cr1, L(duLcr1)
857	srd	rG, rWORD8, rSHR
858	sld	rB, rWORD8, rSHL
859	or	rWORD8, rG, rH
860	bdnz+	L(duLoop)
861
862L(duL4):
863	bne	cr1, L(duLcr1)
864	cmpld	cr1, rWORD3, rWORD4
865	bne	cr6, L(duLcr6)
866	cmpld	cr6, rWORD5, rWORD6
867	bne	cr5, L(duLcr5)
868	cmpld	cr5, rWORD7, rWORD8
869L(du44):
870	bne	cr0, L(duLcr0)
871L(du34):
872	bne	cr1, L(duLcr1)
873L(du24):
874	bne	cr6, L(duLcr6)
875L(du14):
876	sldi.	rN, rN, 3
877	bne	cr5, L(duLcr5)
878/* At this point we have a remainder of 1 to 7 bytes to compare. We use
879 shift right double to eliminate bits beyond the compare length.
880 This allows the use of double word subtract to compute the final
881 result.
882
883 However it may not be safe to load rWORD2 which may be beyond the
884 string length. So we compare the bit length of the remainder to
885 the right shift count (rSHR). If the bit count is less than or equal
886 we do not need to load rWORD2 (all significant bits are already in
887 rB). */
888	cmpld	cr7, rN, rSHR
889	beq	L(duZeroReturn)
890	li	rA, 0
891	ble	cr7, L(dutrim)
892	ld	rWORD2, 8(rSTR2)
893	srd	rA, rWORD2, rSHR
894	.align 4
895L(dutrim):
896	ld	rWORD1, 8(rSTR1)
897	ld	rWORD8,-8(r1)
898	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8). */
899	or	rWORD2, rA, rB
900	ld	rWORD7,-16(r1)
901	ld	r29,-24(r1)
902	srd	rWORD1, rWORD1, rN
903	srd	rWORD2, rWORD2, rN
904	ld	r28,-32(r1)
905	ld	r27,-40(r1)
906	li	rRTN, 0
907	cmpld	cr0, rWORD1, rWORD2
908	ld	r26,-48(r1)
909	ld	r25,-56(r1)
910	beq	cr0, L(dureturn24)
911	li	rRTN, 1
912	ld	r24,-64(r1)
913	bgtlr	cr0
914	li	rRTN, -1
915	blr
916	.align 4
/* Difference found in the unaligned path: restore all saved
   non-volatile registers and return 1 if the cmpld set "greater"
   (s1 > s2, unsigned), else -1. */
917L(duLcr0):
918	ld	rWORD8,-8(r1)
919	ld	rWORD7,-16(r1)
920	li	rRTN, 1
921	bgt	cr0, L(dureturn29)
922	ld	r29,-24(r1)
923	ld	r28,-32(r1)
924	li	rRTN, -1
925	b	L(dureturn27)
926	.align 4
927L(duLcr1):
928	ld	rWORD8,-8(r1)
929	ld	rWORD7,-16(r1)
930	li	rRTN, 1
931	bgt	cr1, L(dureturn29)
932	ld	r29,-24(r1)
933	ld	r28,-32(r1)
934	li	rRTN, -1
935	b	L(dureturn27)
936	.align 4
937L(duLcr6):
938	ld	rWORD8,-8(r1)
939	ld	rWORD7,-16(r1)
940	li	rRTN, 1
941	bgt	cr6, L(dureturn29)
942	ld	r29,-24(r1)
943	ld	r28,-32(r1)
944	li	rRTN, -1
945	b	L(dureturn27)
946	.align 4
947L(duLcr5):
948	ld	rWORD8,-8(r1)
949	ld	rWORD7,-16(r1)
950	li	rRTN, 1
951	bgt	cr5, L(dureturn29)
952	ld	r29,-24(r1)
953	ld	r28,-32(r1)
954	li	rRTN, -1
955	b	L(dureturn27)
956	.align 3
/* Length exhausted with no difference: return 0, falling through
   into the register-restore ladder below. */
957L(duZeroReturn):
958	li	rRTN,0
959	.align 4
960L(dureturn):
961	ld	rWORD8,-8(r1)
962	ld	rWORD7,-16(r1)
963L(dureturn29):
964	ld	r29,-24(r1)
965	ld	r28,-32(r1)
966L(dureturn27):
967	ld	r27,-40(r1)
968L(dureturn26):
969	ld	r26,-48(r1)
970L(dureturn25):
971	ld	r25,-56(r1)
972L(dureturn24):
973	ld	r24,-64(r1)
974	blr
975L(duzeroLength):
976	li	rRTN,0
977	blr
978
979END (BP_SYM (memcmp))
980libc_hidden_builtin_def (memcmp)
981weak_alias (memcmp, bcmp)