/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] memcmp (const char *s1 [r3],
		    const char *s2 [r4],
		    size_t size [r5])  */
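/* Returns zero if the first SIZE bytes of s1 and s2 are equal,
   otherwise a positive or negative value according to the first
   differing byte, compared as unsigned char (standard memcmp
   semantics).  */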
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
	.machine power7
ENTRY_TOCLESS (MEMCMP, 4)
	CALL_MCOUNT 3

#define rRTN		r3
#define rSTR1		r3	/* first string arg */
#define rSTR2		r4	/* second string arg */
#define rN		r5	/* max string length */
#define rWORD1		r6	/* current word in s1 */
#define rWORD2		r7	/* current word in s2 */
#define rWORD3		r8	/* next word in s1 */
#define rWORD4		r9	/* next word in s2 */
#define rWORD5		r10	/* next word in s1 */
#define rWORD6		r11	/* next word in s2 */

#define rOFF8		r20	/* 8 bytes offset.  */
#define rOFF16		r21	/* 16 bytes offset.  */
#define rOFF24		r22	/* 24 bytes offset.  */
#define rOFF32		r23	/* 32 bytes offset.  */
#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
#define rSHR		r28	/* Unaligned shift right count.  */
#define rSHL		r29	/* Unaligned shift left count.  */
#define rWORD7		r30	/* next word in s1 */
#define rWORD8		r31	/* next word in s2 */

#define rWORD8SAVE	(-8)
#define rWORD7SAVE	(-16)
#define rOFF8SAVE	(-24)
#define rOFF16SAVE	(-32)
#define rOFF24SAVE	(-40)
#define rOFF32SAVE	(-48)
#define rSHRSAVE	(-56)
#define rSHLSAVE	(-64)
#define rWORD8SHIFTSAVE	(-72)
#define rWORD2SHIFTSAVE	(-80)
#define rWORD4SHIFTSAVE	(-88)
#define rWORD6SHIFTSAVE	(-96)

#ifdef __LITTLE_ENDIAN__
# define LD	ldbrx
#else
# define LD	ldx
#endif
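/* On little-endian the byte-reversed load (ldbrx) places the byte at
   the lowest address in the most significant byte of the register, so
   an unsigned doubleword compare (cmpld) orders exactly as a
   byte-by-byte unsigned compare would.  A plain ldx already has this
   property on big-endian.  */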

	xor	r0, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	r0, r0, 61
	clrldi	r12, rSTR1, 61
	cmpldi	cr5, r12, 0
	beq-	cr6, L(zeroLength)
	dcbt	0, rSTR1
	dcbt	0, rSTR2
/* If less than 12 bytes, use the byte compare loop, which handles
   any alignment.  */
	blt	cr1, L(bytealigned)
	std	rWORD8, rWORD8SAVE(r1)
	std	rWORD7, rWORD7SAVE(r1)
	std	rOFF8, rOFF8SAVE(r1)
	std	rOFF16, rOFF16SAVE(r1)
	std	rOFF24, rOFF24SAVE(r1)
	std	rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)

	li	rOFF8, 8
	li	rOFF16, 16
	li	rOFF24, 24
	li	rOFF32, 32

	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
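/* For example, if rSTR1 ends in 0b101 (r12 == 5), both addresses are
   rounded down 5 bytes, rN grows by 5, and the first DW pair is
   shifted left by 40 bits (rWORD6 = r12 * 8 below) so the 5 bytes
   preceding the true start drop out of the compare.  */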
	.align	4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr	r0
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align	3
L(dsP1):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align	4
L(dPs2):
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD2, rWORD6
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align	4
L(dPs3):
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD2, rWORD6
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dPs4):
	mtctr	r0
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD2, rWORD6
	cmpld	cr7, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWaligned):
	andi.	r12, rN, 24	/* Get the DW remainder */
	srdi	r0, rN, 5	/* Divide by 32 */
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align	4
L(dP1):
	mtctr	r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
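/* (In the PowerPC64 ELF ABIs r0 and r3-r12 are volatile across calls,
   while r14-r31 must be preserved; that is why rWORD5/rWORD6 (r10/r11)
   can be used here without the save/restore that rWORD7/rWORD8
   (r30/r31) would require.)  */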
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP1e):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5x)
	bne	cr7, L(dLcr7x)

	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	.align	3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align	4
L(dP2):
	mtctr	r0
	LD	rWORD5, 0, rSTR1
	LD	rWORD6, 0, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
	.align	4
L(dP2x):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6x)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Remainder is 24 */
	.align	4
L(dP3):
	mtctr	r0
	LD	rWORD3, 0, rSTR1
	LD	rWORD4, 0, rSTR2
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	.align	4
L(dP3x):
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1x)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6x)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr7, L(dLcr7x)
	bne	L(d00)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(dP4):
	mtctr	r0
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpld	cr7, rWORD1, rWORD2
L(dP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(dLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(dLcr7)
L(dLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	bne	cr1, L(dLcr1)
	cmpld	cr7, rWORD1, rWORD2
	bdnz	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr7, L(dLcr7)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
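/* Each path into L(d00) has set rN to 64 - (remainder * 8) via the
   preceding subfic, so e.g. a 3-byte remainder shifts both doublewords
   right by 40 bits, leaving only the 3 significant bytes (in compare
   order) for the final cmpld.  */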
L(d00):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr7, rWORD1, rWORD2
	bne	cr7, L(dLcr7x)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

	.align	4
L(dLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(dLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align	4
L(dLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align	4
L(dLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align	4
L(bytealigned):
	mtctr	rN

/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands;
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
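/* The three byte streams rotate through cr7, cr1 and cr6: each dispatch
   group loads one byte pair and branches on a compare issued in an
   earlier group, hiding the load-to-compare-to-branch latency.  */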

	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz	L(b11)
	cmpld	cr7, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz	L(b13)
	.align	4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne	cr7, L(bLcr7)

	cmpld	cr6, rWORD5, rWORD6
	bdz	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne	cr1, L(bLcr1)

	cmpld	cr7, rWORD1, rWORD2
	bdz	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne	cr6, L(bLcr6)

	cmpld	cr1, rWORD3, rWORD4
	bdnz	L(bLoop)

/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested).  We must then complete the pending operations before
   returning.  */
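/* L(b1i), L(b2i) and L(b3i) below drain the one or two compares still
   in flight for bytes known to be within the length before falling
   through to compute the final byte difference.  */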
L(b1i):
	bne	cr7, L(bLcr7)
	bne	cr1, L(bLcr1)
	b	L(bx56)
	.align	4
L(b2i):
	bne	cr6, L(bLcr6)
	bne	cr7, L(bLcr7)
	b	L(bx34)
	.align	4
L(b3i):
	bne	cr1, L(bLcr1)
	bne	cr6, L(bLcr6)
	b	L(bx12)
	.align	4
L(bLcr7):
	li	rRTN, 1
	bgtlr	cr7
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne	cr7, L(bx12)
	bne	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne	cr7, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr

	.align	4
L(zeroLength):
	li	rRTN, 0
	blr

	.align	4
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word aligned and we
   can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not yet DW aligned.  So we can
   force the string addresses to the next lower DW boundary and special
   case this first DW using shift left to eliminate bits preceding the
   first byte.  Since we want to join the normal (DWaligned) compare
   loop, starting at the second double word, we need to adjust the
   length (rN) and special case the loop versioning for the first DW.
   This ensures that the loop count is correct and the first DW
   (shifted) is in the expected register pair.  */
L(unaligned):
	std	rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi	rSHL, rSTR2, 61
	beq	cr6, L(duzeroLength)
	std	rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq	cr5, L(DWunaligned)
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 DW.  */
	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi	rSHL, rWORD8_SHIFT, 61
	clrrdi	rSTR1, rSTR1, 3
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	sldi	rSHL, rSHL, 3
	cmpld	cr5, rWORD8_SHIFT, rSTR2
	add	rN, rN, r12
	sldi	rWORD6, r12, 3
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic	rSHR, rSHL, 64
	srdi	r0, rN, 5	/* Divide by 32 */
	andi.	r12, rN, 24	/* Get the DW remainder */
/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a DW where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	li	rWORD8, 0
	blt	cr5, L(dus0)
	LD	rWORD8, 0, rSTR2
	addi	rSTR2, rSTR2, 8
	sld	rWORD8, rWORD8, rSHL

L(dus0):
	LD	rWORD1, 0, rSTR1
	LD	rWORD2, 0, rSTR2
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	srd	r12, rWORD2, rSHR
	clrldi	rN, rN, 61
	beq	L(duPs4)
	mtctr	r0
	or	rWORD8, r12, rWORD8
	bgt	cr1, L(duPs3)
	beq	cr1, L(duPs2)

/* Remainder is 8 */
	.align	4
L(dusP1):
	sld	rWORD8_SHIFT, rWORD2, rSHL
	sld	rWORD7, rWORD1, rWORD6
	sld	rWORD8, rWORD8, rWORD6
	bge	cr7, L(duP1e)
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duPs2):
	sld	rWORD6_SHIFT, rWORD2, rSHL
	sld	rWORD5, rWORD1, rWORD6
	sld	rWORD6, rWORD8, rWORD6
	b	L(duP2e)
/* Remainder is 24 */
	.align	4
L(duPs3):
	sld	rWORD4_SHIFT, rWORD2, rSHL
	sld	rWORD3, rWORD1, rWORD6
	sld	rWORD4, rWORD8, rWORD6
	b	L(duP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duPs4):
	mtctr	r0
	or	rWORD8, r12, rWORD8
	sld	rWORD2_SHIFT, rWORD2, rSHL
	sld	rWORD1, rWORD1, rWORD6
	sld	rWORD2, rWORD8, rWORD6
	b	L(duP4e)

/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */
	.align	4
L(DWunaligned):
	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi	rSTR2, rSTR2, 3
	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi	r0, rN, 5	/* Divide by 32 */
	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi.	r12, rN, 24	/* Get the DW remainder */
	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	sldi	rSHL, rSHL, 3
	LD	rWORD6, 0, rSTR2
	LD	rWORD8, rOFF8, rSTR2
	addi	rSTR2, rSTR2, 8
	cmpldi	cr1, r12, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	subfic	rSHR, rSHL, 64
	sld	rWORD6_SHIFT, rWORD6, rSHL
	beq	L(duP4)
	mtctr	r0
	bgt	cr1, L(duP3)
	beq	cr1, L(duP2)

/* Remainder is 8 */
	.align	4
L(duP1):
	srd	r12, rWORD8, rSHR
	LD	rWORD7, 0, rSTR1
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP1x)
L(duP1e):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	bne	cr5, L(duLcr5)
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	bne	cr7, L(duLcr7)
	or	rWORD6, r0, rWORD4_SHIFT
	cmpld	cr6, rWORD5, rWORD6
	b	L(duLoop3)
	.align	4
/* At this point we exit early with the first double word compare
   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
L(duP1x):
	cmpld	cr5, rWORD7, rWORD8
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)
/* Remainder is 16 */
	.align	4
L(duP2):
	srd	r0, rWORD8, rSHR
	LD	rWORD5, 0, rSTR1
	or	rWORD6, r0, rWORD6_SHIFT
	sld	rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
	LD	rWORD7, rOFF8, rSTR1
	LD	rWORD8, rOFF8, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP2x)
	LD	rWORD1, rOFF16, rSTR1
	LD	rWORD2, rOFF16, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	LD	rWORD3, rOFF24, rSTR1
	LD	rWORD4, rOFF24, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	cmpld	cr1, rWORD3, rWORD4
	b	L(duLoop2)
	.align	4
L(duP2x):
	cmpld	cr5, rWORD7, rWORD8
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Remainder is 24 */
	.align	4
L(duP3):
	srd	r12, rWORD8, rSHR
	LD	rWORD3, 0, rSTR1
	sld	rWORD4_SHIFT, rWORD8, rSHL
	or	rWORD4, r12, rWORD6_SHIFT
L(duP3e):
	LD	rWORD5, rOFF8, rSTR1
	LD	rWORD6, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF16, rSTR1
	LD	rWORD8, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	blt	cr7, L(duP3x)
	LD	rWORD1, rOFF24, rSTR1
	LD	rWORD2, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr7, rWORD1, rWORD2
	b	L(duLoop1)
	.align	4
L(duP3x):
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	cmpld	cr5, rWORD7, rWORD8
	bne	cr6, L(duLcr6)
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	b	L(dutrim)

/* Count is a multiple of 32, remainder is 0 */
	.align	4
L(duP4):
	mtctr	r0
	srd	r0, rWORD8, rSHR
	LD	rWORD1, 0, rSTR1
	sld	rWORD2_SHIFT, rWORD8, rSHL
	or	rWORD2, r0, rWORD6_SHIFT
L(duP4e):
	LD	rWORD3, rOFF8, rSTR1
	LD	rWORD4, rOFF8, rSTR2
	cmpld	cr7, rWORD1, rWORD2
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
	LD	rWORD5, rOFF16, rSTR1
	LD	rWORD6, rOFF16, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
	LD	rWORD7, rOFF24, rSTR1
	LD	rWORD8, rOFF24, rSTR2
	addi	rSTR1, rSTR1, 24
	addi	rSTR2, rSTR2, 24
	cmpld	cr6, rWORD5, rWORD6
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	cmpld	cr5, rWORD7, rWORD8
	bdz	L(du24)	/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align	4
L(duLoop):
	LD	rWORD1, rOFF8, rSTR1
	LD	rWORD2, rOFF8, rSTR2
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	srd	r0, rWORD2, rSHR
	sld	rWORD2_SHIFT, rWORD2, rSHL
	or	rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
	LD	rWORD3, rOFF16, rSTR1
	LD	rWORD4, rOFF16, rSTR2
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	srd	r12, rWORD4, rSHR
	sld	rWORD4_SHIFT, rWORD4, rSHL
	or	rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
	LD	rWORD5, rOFF24, rSTR1
	LD	rWORD6, rOFF24, rSTR2
	cmpld	cr5, rWORD7, rWORD8
	bne	cr7, L(duLcr7)
	srd	r0, rWORD6, rSHR
	sld	rWORD6_SHIFT, rWORD6, rSHL
	or	rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
	LD	rWORD7, rOFF32, rSTR1
	LD	rWORD8, rOFF32, rSTR2
	addi	rSTR1, rSTR1, 32
	addi	rSTR2, rSTR2, 32
	cmpld	cr7, rWORD1, rWORD2
	bne	cr1, L(duLcr1)
	srd	r12, rWORD8, rSHR
	sld	rWORD8_SHIFT, rWORD8, rSHL
	or	rWORD8, r12, rWORD6_SHIFT
	bdnz	L(duLoop)

L(duL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(duLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(duLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(du44):
	bne	cr7, L(duLcr7)
L(du34):
	bne	cr1, L(duLcr1)
L(du24):
	bne	cr6, L(duLcr6)
L(du14):
	sldi.	rN, rN, 3
	bne	cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2, which may lie beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   to it, we do not need to load rWORD2 (all significant bits are
   already in rWORD8_SHIFT).  */
	cmpld	cr7, rN, rSHR
	beq	L(duZeroReturn)
	li	r0, 0
	ble	cr7, L(dutrim)
	LD	rWORD2, rOFF8, rSTR2
	srd	r0, rWORD2, rSHR
	.align	4
L(dutrim):
	LD	rWORD1, rOFF8, rSTR1
	ld	rWORD8, rWORD8SAVE(r1)
	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
	or	rWORD2, r0, rWORD8_SHIFT
	ld	rWORD7, rWORD7SAVE(r1)
	ld	rSHL, rSHLSAVE(r1)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	ld	rSHR, rSHRSAVE(r1)
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	li	rRTN, 0
	cmpld	cr7, rWORD1, rWORD2
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq	cr7, L(dureturn24)
	li	rRTN, 1
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	bgtlr	cr7
	li	rRTN, -1
	blr
	.align	4
L(duLcr7):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr7, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr1):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr1, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr6):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr6, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)
	.align	4
L(duLcr5):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
	li	rRTN, 1
	bgt	cr5, L(dureturn29)
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
	li	rRTN, -1
	b	L(dureturn27)

	.align	3
L(duZeroReturn):
	li	rRTN, 0
	.align	4
L(dureturn):
	ld	rWORD8, rWORD8SAVE(r1)
	ld	rWORD7, rWORD7SAVE(r1)
L(dureturn29):
	ld	rSHL, rSHLSAVE(r1)
	ld	rSHR, rSHRSAVE(r1)
L(dureturn27):
	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	blr

L(duzeroLength):
	ld	rOFF8, rOFF8SAVE(r1)
	ld	rOFF16, rOFF16SAVE(r1)
	ld	rOFF24, rOFF24SAVE(r1)
	ld	rOFF32, rOFF32SAVE(r1)
	li	rRTN, 0
	blr

END (MEMCMP)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)