sysdeps/powerpc/powerpc64/power4/memcmp.S
/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */

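/* For reference, the contract this routine implements is plain memcmp:
   compare size bytes as unsigned char and return negative, zero, or
   positive.  A minimal C sketch of those semantics (illustrative only;
   the name memcmp_ref is hypothetical and not part of glibc):

     #include <stddef.h>

     int
     memcmp_ref (const void *s1, const void *s2, size_t n)
     {
       const unsigned char *p1 = s1, *p2 = s2;
       for (; n != 0; n--, p1++, p2++)
         if (*p1 != *p2)
           return *p1 < *p2 ? -1 : 1;
       return 0;
     }

   The assembly below computes the same result a doubleword at a time
   rather than byte by byte.  */
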
	.machine power4
EALIGN (memcmp, 4, 0)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	rTMP, rTMP, 61
	clrldi	rBITDIF, rSTR1, 61
	cmpldi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8,-8(r1)
	cfi_offset(rWORD8,-8)
	std	rWORD7,-16(r1)
	cfi_offset(rWORD7,-16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
   aligned and can perform the DWaligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
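/* As an illustration of the shift-left trick just described, a rough C
   equivalent for the first DW, assuming a big-endian machine (the
   POWER4 target here) and a shared misalignment of 1 to 7 bytes; the
   function name is hypothetical:

     #include <stdint.h>

     static int
     first_dw_cmp (uintptr_t s1, uintptr_t s2, unsigned int offset)
     {
       const uint64_t *p1 = (const uint64_t *) (s1 & ~(uintptr_t) 7);
       const uint64_t *p2 = (const uint64_t *) (s2 & ~(uintptr_t) 7);
       uint64_t w1 = *p1 << (offset * 8);
       uint64_t w2 = *p2 << (offset * 8);
       return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
     }

   The left shift discards exactly the bytes that precede the true
   start of the strings; on big-endian the earliest byte is the most
   significant, so an unsigned doubleword compare yields memcmp
   ordering directly (a nonzero result here is final, equality means
   the compare continues).  */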
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD2, r11
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD2, r11
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	srdi	rTMP, rN, 5	/* Divide by 32 */
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP1e):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)

	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP2x):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr5, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne-	cr1, L(dLcr1)
	cmpld	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
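/* In C terms this tail step looks roughly as follows, again assuming
   big-endian; rem is the 1-7 remaining bytes and both pointers are
   already doubleword aligned, so the full loads cannot touch an
   unmapped page.  The function name is hypothetical:

     #include <stdint.h>

     static int
     tail_dw_cmp (const uint64_t *p1, const uint64_t *p2,
                  unsigned int rem)
     {
       unsigned int sh = 64 - rem * 8;
       uint64_t w1 = *p1 >> sh;
       uint64_t w2 = *p2 >> sh;
       return w1 == w2 ? 0 : (w1 < w2 ? -1 : 1);
     }

   On big-endian the bytes beyond the compare length occupy the low
   order bits, so the right shift removes them.  */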
L(d00):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr5, rWORD1, rWORD2
	bne	cr5, L(dLcr5x)
	li	rRTN, 0
	blr
	.align	4
L(dLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
	beq-	cr6, L(zeroLength)

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latency (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
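/* A C model of this schedule (hypothetical name; assumes n >= 1 since
   the zero-length case was handled above): each iteration loads the
   next pair of bytes and branches on a previously loaded pair, so no
   load feeds a compare-and-branch within the same dispatch group.
   The three-register rotation (rWORD1/3/5) is collapsed into a single
   stage here for clarity:

     #include <stddef.h>

     static int
     byte_cmp_pipelined (const unsigned char *p1,
                         const unsigned char *p2, size_t n)
     {
       unsigned char a = *p1, b = *p2;
       for (size_t i = 1; i < n; i++)
         {
           unsigned char na = p1[i], nb = p2[i];
           if (a != b)
             return a < b ? -1 : 1;
           a = na;
           b = nb;
         }
       return a == b ? 0 : (a < b ? -1 : 1);
     }

   The loads of the next pair are issued before the branch on the
   previous pair resolves, which is why the first pair must be
   preconditioned as described above.  */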

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)

	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)

	cmpld	cr0, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all pending bytes are tested),
   and we must complete the pending operations before returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align	4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align	4
L(zeroLengthReturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and
   special case this first DW using shift left to eliminate bits
   preceding the first byte.  Since we want to join the normal
   (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
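/* The reassembly done by the rotation temporaries defined below can be
   written in C as follows (hypothetical name; big-endian; assumes
   0 < shl < 64 so neither shift is an undefined shift by 64).  Each
   compare doubleword of rSTR2 is merged from the aligned doubleword
   pair that straddles it:

     #include <stdint.h>

     static uint64_t
     merge_dw (uint64_t first, uint64_t second, unsigned int shl)
     {
       unsigned int shr = 64 - shl;
       return (first << shl) | (second >> shr);
     }

   In the loops below the "first << shl" piece is saved one iteration
   ahead in rB/rD/rF/rH and OR-ed with the "second >> shr" piece
   (rA/rC/rE/rG) once the next doubleword has been loaded.  */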
#define rSHL	r29	/* Unaligned shift left count.  */
#define rSHR	r28	/* Unaligned shift right count.  */
#define rB	r27	/* Left rotation temp for rWORD2.  */
#define rD	r26	/* Left rotation temp for rWORD4.  */
#define rF	r25	/* Left rotation temp for rWORD6.  */
#define rH	r24	/* Left rotation temp for rWORD8.  */
#define rA	r0	/* Right rotation temp for rWORD2.  */
#define rC	r12	/* Right rotation temp for rWORD4.  */
#define rE	r0	/* Right rotation temp for rWORD6.  */
#define rG	r12	/* Right rotation temp for rWORD8.  */
L(unaligned):
	std	r29,-24(r1)
	cfi_offset(r29,-24)
	clrldi	rSHL, rSTR2, 61
	beq-	cr6, L(duzeroLength)
	std	r28,-32(r1)
	cfi_offset(r28,-32)
	beq	cr5, L(DWunaligned)
	std	r27,-40(r1)
	cfi_offset(r27,-40)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	r27, rSTR2, rBITDIF
/* But do not attempt to address the DW before the DW that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, r27, 61
	clrrdi	rSTR1, rSTR1, 3
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, r27, rSTR2
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	subfic	rSHR, rSHL, 64
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	ld	rWORD8, 0(rSTR2)
	la	rSTR2, 8(rSTR2)
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	srd	rG, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rB, rWORD2, rSHL
	sld	rWORD7, rWORD1, r11
	sld	rWORD8, rWORD8, r11
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rH, rWORD2, rSHL
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD8, r11
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rF, rWORD2, rSHL
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD8, r11
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	or	rWORD8, rG, rWORD8
	sld	rD, rWORD2, rSHL
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD8, r11
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	r27,-40(r1)
	cfi_offset(r27,-40)
	clrrdi	rSTR2, rSTR2, 3
	std	r26,-48(r1)
	cfi_offset(r26,-48)
	srdi	rTMP, rN, 5	/* Divide by 32 */
	std	r25,-56(r1)
	cfi_offset(r25,-56)
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	std	r24,-64(r1)
	cfi_offset(r24,-64)
	sldi	rSHL, rSHL, 3
	ld	rWORD6, 0(rSTR2)
	ldu	rWORD8, 8(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rH, rWORD6, rSHL
	beq	L(duP4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	rG, rWORD8, rSHR
	ld	rWORD7, 0(rSTR1)
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP1x)
L(duP1e):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, rC, rD
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	bne	cr0, L(duLcr0)
	or	rWORD6, rE, rF
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	rE, rWORD8, rSHR
	ld	rWORD5, 0(rSTR1)
	or	rWORD6, rE, rH
	sld	rH, rWORD8, rSHL
L(duP2e):
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP2x)
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	rC, rWORD8, rSHR
	ld	rWORD3, 0(rSTR1)
	sld	rF, rWORD8, rSHL
	or	rWORD4, rC, rH
L(duP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	blt	cr7, L(duP3x)
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr0, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(duLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	srd	rA, rWORD8, rSHR
	ld	rWORD1, 0(rSTR1)
	sld	rD, rWORD8, rSHL
	or	rWORD2, rA, rH
L(duP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	cmpld	cr5, rWORD7, rWORD8
	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	rA, rWORD2, rSHR
	sld	rD, rWORD2, rSHL
	or	rWORD2, rA, rB
L(duLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	rC, rWORD4, rSHR
	sld	rF, rWORD4, rSHL
	or	rWORD4, rC, rD
L(duLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(duLcr0)
	srd	rE, rWORD6, rSHR
	sld	rH, rWORD6, rSHL
	or	rWORD6, rE, rF
L(duLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	bne-	cr1, L(duLcr1)
	srd	rG, rWORD8, rSHR
	sld	rB, rWORD8, rSHL
	or	rWORD8, rG, rH
	bdnz+	L(duLoop)

L(duL4):
	bne	cr1, L(duLcr1)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr0, L(duLcr0)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.
   This allows the use of double word subtract to compute the final
   result.

   However it may not be safe to load rWORD2, which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or
   equal to the shift count we do not need to load rWORD2 (all
   significant bits are already in rB).  */
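/* A C sketch of this guard (hypothetical name; rem_bits is the
   remainder in bits, b_part the saved "first << shl" piece, shr as
   above):

     #include <stdint.h>

     static uint64_t
     tail_second_part (const uint64_t *next_dw, uint64_t b_part,
                       unsigned int rem_bits, unsigned int shr)
     {
       uint64_t a_part = 0;
       if (rem_bits > shr)
         a_part = *next_dw >> shr;
       return a_part | b_part;
     }

   When rem_bits <= shr every byte still to be compared is already in
   b_part, so the potentially faulting load of the next doubleword of
   rSTR2 is skipped.  */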
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	rA, 0
	ble	cr7, L(dutrim)
	ld	rWORD2, 8(rSTR2)
	srd	rA, rWORD2, rSHR
	.align	4
L(dutrim):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD8,-8(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, rA, rB
	ld	rWORD7,-16(r1)
	ld	r29,-24(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	li	rRTN, 0
	cmpld	cr0, rWORD1, rWORD2
	ld	r26,-48(r1)
	ld	r25,-56(r1)
	beq	cr0, L(dureturn24)
	li	rRTN, 1
	ld	r24,-64(r1)
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align	4
L(duLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr0, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dureturn29):
	ld	r29,-24(r1)
	ld	r28,-32(r1)
L(dureturn27):
	ld	r27,-40(r1)
L(dureturn26):
	ld	r26,-48(r1)
L(dureturn25):
	ld	r25,-56(r1)
L(dureturn24):
	ld	r24,-64(r1)
	blr
L(duzeroLength):
	li	rRTN, 0
	blr

END (memcmp)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)