sysdeps/powerpc/powerpc32/power6/memset.S

   1 /* Optimized 32-bit memset implementation for POWER6.
   2    Copyright (C) 1997-2015 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
  22    Returns 's'.
  23
  24    The memset is done in three sizes: byte (8 bits), word (32 bits),
  25    cache line (1024 bits). There is a special case for setting cache lines
  26    to 0, to take advantage of the dcbz instruction.  */
  27
  28         .machine power6
  29 EALIGN (memset, 7, 0)
  30         CALL_MCOUNT
  31
  32 #define rTMP    r0      /* Scratch target for andi.; only the CR result is used.  */
  33 #define rRTN    r3      /* Initial value of 1st argument.  */
  34 #define rMEMP0  r3      /* Original value of 1st arg; never written below.  */
  35 #define rCHR    r4      /* Char to set in each byte.  */
  36 #define rLEN    r5      /* Length of region to set.  */
  37 #define rMEMP   r6      /* Address at which we are storing.  */
  38 #define rALIGN  r7      /* Number of bytes we are setting now (when aligning). */
  39 #define rMEMP2  r8      /* Second store pointer; later the dcbz index 128.  */
  40
  41 #define rNEG64  r8      /* Same register as rMEMP2; name not referenced below.  */
  42 #define rMEMP3  r9      /* Alt mem pointer.  */
  43 L(_memset):
  44 /* Take care of case for size <= 4.  */
  45         cmplwi  cr1, rLEN, 4
  46         andi.   rALIGN, rMEMP0, 3       /* cr0.eq if s is already word aligned.  */
  47         mr      rMEMP, rMEMP0
  48         ble-    cr1, L(small)
  49 /* Align to word boundary.  */
  50         cmplwi  cr5, rLEN, 31           /* cr5 is consumed at L(aligned).  */
  51         insrwi  rCHR, rCHR, 8, 16       /* Replicate byte to halfword.  */
  52         beq+    L(aligned)
  53         mtcrf   0x01, rMEMP0            /* cr7 = low 4 bits of the address.  */
  54         subfic  rALIGN, rALIGN, 4       /* Bytes up to the word boundary (1..3).  */
  55         add     rMEMP, rMEMP, rALIGN
  56         sub     rLEN, rLEN, rALIGN
  57         bf+     31, L(g0)               /* Even address: a halfword suffices.  */
  58         stb     rCHR, 0(rMEMP0)
  59         bt      30, L(aligned)          /* (s & 3) == 3: the single byte was enough.  */
  60 L(g0):
  61         sth     rCHR, -2(rMEMP)         /* Halfword ending at the new word boundary.  */
  62
  63         .align 4
  64 /* Handle the case of size <= 31 (cr5 compares the entry length with 31).  */
  65 L(aligned):
  66         mtcrf   0x01, rLEN              /* cr7 = low 4 bits of remaining length.  */
  67         insrwi  rCHR, rCHR, 16, 0       /* Replicate halfword to word.  */
  68         ble     cr5, L(medium)
  69 /* Align to 32-byte boundary.  */
  70         andi.   rALIGN, rMEMP, 0x1C     /* Word offset within the 32-byte sector.  */
  71         subfic  rALIGN, rALIGN, 0x20    /* Bytes up to the next 32-byte boundary.  */
  72         beq     L(caligned)             /* cr0 from andi.: already 32-byte aligned.  */
  73         mtcrf   0x01, rALIGN            /* cr7 bits 28/29 = the 8 and 4 bits of rALIGN.  */
  74         add     rMEMP, rMEMP, rALIGN
  75         sub     rLEN, rLEN, rALIGN
  76         cmplwi  cr1, rALIGN, 0x10
  77         mr      rMEMP2, rMEMP           /* rMEMP2 walks backwards from the boundary.  */
  78         bf      28, L(a1)               /* Skip unless 8 bytes are part of the gap.  */
  79         stw     rCHR, -4(rMEMP2)
  80         stwu    rCHR, -8(rMEMP2)
  81         nop
  82 L(a1):  blt     cr1, L(a2)              /* Skip unless the gap is at least 16 bytes.  */
  83         stw     rCHR, -4(rMEMP2)
  84         stw     rCHR, -8(rMEMP2)
  85         stw     rCHR, -12(rMEMP2)
  86         stwu    rCHR, -16(rMEMP2)
  87 L(a2):  bf      29, L(caligned)         /* Skip unless 4 bytes are part of the gap.  */
  88         stw     rCHR, -4(rMEMP2)
  89
  90         .align 3
  91 /* Now aligned to a 32 byte boundary.  */
  92 L(caligned):
  93         cmplwi  cr1, rCHR, 0            /* cr1.eq if the replicated fill word is zero.  */
  94         clrrwi. rALIGN, rLEN, 5         /* rLEN & ~31; cr0.eq if fewer than 32 bytes remain.  */
  95         mtcrf   0x01, rLEN              /* cr7 = low 4 bits of rLEN, used by L(medium).  */
  96         beq     cr1, L(zloopstart) /* Special case for clearing memory using dcbz.  */
  97 L(nondcbz):
  98         beq     L(medium)       /* We may not actually get to do a full line.  */
  99         nop
 100 /* Storing a non-zero "c" value. We are aligned at a sector (32-byte)
 101    boundary but may not be at cache line (128-byte) boundary.  */
 102 L(nzloopstart):
 103 /* memset in 32-byte chunks until we get to a cache line boundary.
 104    If rLEN is less than a full cache line (128 bytes) use
 105    cacheAligned1 code to finish the tail.  */
 106         cmplwi  cr1,rLEN,128
 107
 108         andi.   rTMP,rMEMP,127          /* cr0.eq if already on a 128-byte boundary.  */
 109         blt     cr1,L(cacheAligned1)
 110         addi    rMEMP3,rMEMP,32
 111         beq     L(nzCacheAligned)
 112         addi    rLEN,rLEN,-32           /* First 32-byte chunk.  */
 113         stw     rCHR,0(rMEMP)
 114         stw     rCHR,4(rMEMP)
 115         stw     rCHR,8(rMEMP)
 116         stw     rCHR,12(rMEMP)
 117         stw     rCHR,16(rMEMP)
 118         stw     rCHR,20(rMEMP)
 119         addi    rMEMP,rMEMP,32
 120         andi.   rTMP,rMEMP3,127         /* Is the address after this chunk cache aligned?  */
 121         stw     rCHR,-8(rMEMP3)
 122         stw     rCHR,-4(rMEMP3)
 123
 124         beq     L(nzCacheAligned)
 125         addi    rLEN,rLEN,-32           /* Second 32-byte chunk.  */
 126         stw     rCHR,0(rMEMP3)
 127         stw     rCHR,4(rMEMP3)
 128         addi    rMEMP,rMEMP,32
 129         stw     rCHR,8(rMEMP3)
 130         stw     rCHR,12(rMEMP3)
 131         andi.   rTMP,rMEMP,127
 132         stw     rCHR,16(rMEMP3)
 133         stw     rCHR,20(rMEMP3)
 134         stw     rCHR,24(rMEMP3)
 135         stw     rCHR,28(rMEMP3)
 136
 137         beq     L(nzCacheAligned)
 138         addi    rLEN,rLEN,-32           /* Third 32-byte chunk.  */
 139 /* At this point we can overrun the store queue (pipe reject) so it is
 140    time to slow things down. The store queue can merge two adjacent
 141    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 142    So we add "group ending nops" to guarantee that we dispatch only two
 143    stores every other cycle. */
 144         ori     r1,r1,0
 145         ori     r1,r1,0
 146         stw     rCHR,32(rMEMP3)
 147         stw     rCHR,36(rMEMP3)
 148         addi    rMEMP,rMEMP,32
 149         cmplwi  cr1,rLEN,128            /* Is a full cache line still left?  */
 150         ori     r1,r1,0
 151         ori     r1,r1,0
 152         stw     rCHR,40(rMEMP3)
 153         stw     rCHR,44(rMEMP3)
 154         ori     r1,r1,0
 155         ori     r1,r1,0
 156         stw     rCHR,48(rMEMP3)
 157         stw     rCHR,52(rMEMP3)
 158         ori     r1,r1,0
 159         ori     r1,r1,0
 160         stw     rCHR,56(rMEMP3)
 161         stw     rCHR,60(rMEMP3)
 162         blt     cr1,L(cacheAligned1)
 163         b       L(nzCacheAligned)
 164
 165 /* Now we are aligned to the cache line and can use dcbtst.  */
 166         .align 5
 167 L(nzCacheAligned):
 168         cmplwi  cr1,rLEN,128
 169         cmplwi  cr6,rLEN,256            /* cr6 is tested again after the 128-byte store.  */
 170         blt     cr1,L(cacheAligned1)
 171         blt     cr6,L(nzCacheAligned128) /* Target is also the fall-through label.  */
 172         .align 4
 173 L(nzCacheAligned128):
 174         nop
 175         addi    rMEMP3,rMEMP,64         /* rMEMP3 = second half of this cache line.  */
 176         stw     rCHR,0(rMEMP)
 177         stw     rCHR,4(rMEMP)
 178         stw     rCHR,8(rMEMP)
 179         stw     rCHR,12(rMEMP)
 180         stw     rCHR,16(rMEMP)
 181         stw     rCHR,20(rMEMP)
 182         stw     rCHR,24(rMEMP)
 183         stw     rCHR,28(rMEMP)
 184         stw     rCHR,32(rMEMP)
 185         stw     rCHR,36(rMEMP)
 186         stw     rCHR,40(rMEMP)
 187         stw     rCHR,44(rMEMP)
 188         stw     rCHR,48(rMEMP)
 189         stw     rCHR,52(rMEMP)
 190         stw     rCHR,56(rMEMP)
 191         stw     rCHR,60(rMEMP)
 192         addi    rMEMP,rMEMP3,64         /* rMEMP = start of the next cache line.  */
 193         addi    rLEN,rLEN,-128
 194 /* At this point we can overrun the store queue (pipe reject) so it is
 195    time to slow things down. The store queue can merge two adjacent
 196    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 197    So we add "group ending nops" to guarantee that we dispatch only one
 198    store per cycle. */
 199         stw     rCHR,0(rMEMP3)
 200         ori     r1,r1,0
 201         stw     rCHR,4(rMEMP3)
 202         ori     r1,r1,0
 203         stw     rCHR,8(rMEMP3)
 204         ori     r1,r1,0
 205         stw     rCHR,12(rMEMP3)
 206         ori     r1,r1,0
 207         stw     rCHR,16(rMEMP3)
 208         ori     r1,r1,0
 209         stw     rCHR,20(rMEMP3)
 210         ori     r1,r1,0
 211         stw     rCHR,24(rMEMP3)
 212         ori     r1,r1,0
 213         stw     rCHR,28(rMEMP3)
 214         ori     r1,r1,0
 215         stw     rCHR,32(rMEMP3)
 216         ori     r1,r1,0
 217         stw     rCHR,36(rMEMP3)
 218         ori     r1,r1,0
 219         stw     rCHR,40(rMEMP3)
 220         ori     r1,r1,0
 221         stw     rCHR,44(rMEMP3)
 222         ori     r1,r1,0
 223         stw     rCHR,48(rMEMP3)
 224         ori     r1,r1,0
 225         stw     rCHR,52(rMEMP3)
 226         ori     r1,r1,0
 227         stw     rCHR,56(rMEMP3)
 228         ori     r1,r1,0
 229         stw     rCHR,60(rMEMP3)
 230         blt     cr6,L(cacheAligned1)    /* Length was under 256 before this line: tail only.  */
 231 #if IS_IN (libc)
 232         lfd     0,-128(rMEMP)           /* FPR0 = 8 bytes of the pattern just stored.  */
 233 #endif
 234         b       L(nzCacheAligned256)
 235         .align 5
 236 L(nzCacheAligned256):
 237         cmplwi  cr1,rLEN,256            /* Loop again only if 256 or more bytes remain now.  */
 238         addi    rMEMP3,rMEMP,64         /* rMEMP3 = second half of this cache line.  */
 239 #if !IS_IN (libc)
 240 /* When we are not in libc we should use only GPRs to avoid the FPU lock
 241    interrupt.  */
 242         stw     rCHR,0(rMEMP)
 243         stw     rCHR,4(rMEMP)
 244         stw     rCHR,8(rMEMP)
 245         stw     rCHR,12(rMEMP)
 246         stw     rCHR,16(rMEMP)
 247         stw     rCHR,20(rMEMP)
 248         stw     rCHR,24(rMEMP)
 249         stw     rCHR,28(rMEMP)
 250         stw     rCHR,32(rMEMP)
 251         stw     rCHR,36(rMEMP)
 252         stw     rCHR,40(rMEMP)
 253         stw     rCHR,44(rMEMP)
 254         stw     rCHR,48(rMEMP)
 255         stw     rCHR,52(rMEMP)
 256         stw     rCHR,56(rMEMP)
 257         stw     rCHR,60(rMEMP)
 258         addi    rMEMP,rMEMP3,64
 259         addi    rLEN,rLEN,-128
 260         stw     rCHR,0(rMEMP3)
 261         stw     rCHR,4(rMEMP3)
 262         stw     rCHR,8(rMEMP3)
 263         stw     rCHR,12(rMEMP3)
 264         stw     rCHR,16(rMEMP3)
 265         stw     rCHR,20(rMEMP3)
 266         stw     rCHR,24(rMEMP3)
 267         stw     rCHR,28(rMEMP3)
 268         stw     rCHR,32(rMEMP3)
 269         stw     rCHR,36(rMEMP3)
 270         stw     rCHR,40(rMEMP3)
 271         stw     rCHR,44(rMEMP3)
 272         stw     rCHR,48(rMEMP3)
 273         stw     rCHR,52(rMEMP3)
 274         stw     rCHR,56(rMEMP3)
 275         stw     rCHR,60(rMEMP3)
 276 #else
 277 /* We are in libc and this is a long memset so we can use FPRs and can afford
 278    occasional FPU locked interrupts.  */
 279         stfd    0,0(rMEMP)              /* FPR0 was loaded with the pattern by the lfd above.  */
 280         stfd    0,8(rMEMP)
 281         stfd    0,16(rMEMP)
 282         stfd    0,24(rMEMP)
 283         stfd    0,32(rMEMP)
 284         stfd    0,40(rMEMP)
 285         stfd    0,48(rMEMP)
 286         stfd    0,56(rMEMP)
 287         addi    rMEMP,rMEMP3,64
 288         addi    rLEN,rLEN,-128
 289         stfd    0,0(rMEMP3)
 290         stfd    0,8(rMEMP3)
 291         stfd    0,16(rMEMP3)
 292         stfd    0,24(rMEMP3)
 293         stfd    0,32(rMEMP3)
 294         stfd    0,40(rMEMP3)
 295         stfd    0,48(rMEMP3)
 296         stfd    0,56(rMEMP3)
 297 #endif
 298         bge     cr1,L(nzCacheAligned256) /* cr1 predates the subtraction of 128.  */
 299         dcbtst  0,rMEMP
 300         b       L(cacheAligned1)        /* Fewer than 128 bytes remain.  */
 301
 302         .align 4
 303 /* Storing a zero "c" value. We are aligned at a sector (32-byte)
 304    boundary but may not be at cache line (128-byte) boundary.  If the
 305    remaining length spans a full cache line we can use the Data cache
 306    block zero instruction. */
 307 L(zloopstart):
 308 /* memset in 32-byte chunks until we get to a cache line boundary.
 309    If rLEN is less than a full cache line (128 bytes) use
 310    cacheAligned1 code to finish the tail.  */
 311         cmplwi  cr1,rLEN,128
 312         beq     L(medium)               /* cr0 from clrrwi. at L(caligned): under 32 bytes.  */
 313 L(getCacheAligned):
 314         andi.   rTMP,rMEMP,127          /* cr0.eq if already on a 128-byte boundary.  */
 315         blt     cr1,L(cacheAligned1)
 316         addi    rMEMP3,rMEMP,32
 317         beq     L(cacheAligned)
 318         addi    rLEN,rLEN,-32           /* First 32-byte chunk.  */
 319         stw     rCHR,0(rMEMP)
 320         stw     rCHR,4(rMEMP)
 321         stw     rCHR,8(rMEMP)
 322         stw     rCHR,12(rMEMP)
 323         stw     rCHR,16(rMEMP)
 324         stw     rCHR,20(rMEMP)
 325         addi    rMEMP,rMEMP,32
 326         andi.   rTMP,rMEMP3,127         /* Is the address after this chunk cache aligned?  */
 327         stw     rCHR,-8(rMEMP3)
 328         stw     rCHR,-4(rMEMP3)
 329 L(getCacheAligned2):
 330         beq     L(cacheAligned)
 331         addi    rLEN,rLEN,-32           /* Second 32-byte chunk.  */
 332         addi    rMEMP,rMEMP,32
 333         stw     rCHR,0(rMEMP3)
 334         stw     rCHR,4(rMEMP3)
 335         stw     rCHR,8(rMEMP3)
 336         stw     rCHR,12(rMEMP3)
 337         andi.   rTMP,rMEMP,127
 338         nop
 339         stw     rCHR,16(rMEMP3)
 340         stw     rCHR,20(rMEMP3)
 341         stw     rCHR,24(rMEMP3)
 342         stw     rCHR,28(rMEMP3)
 343 L(getCacheAligned3):
 344         beq     L(cacheAligned)
 345 /* At this point we can overrun the store queue (pipe reject) so it is
 346    time to slow things down. The store queue can merge two adjacent
 347    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 348    So we add "group ending nops" to guarantee that we dispatch only two
 349    stores every other cycle. */
 350         addi    rLEN,rLEN,-32           /* Third 32-byte chunk.  */
 351         ori     r1,r1,0
 352         ori     r1,r1,0
 353         stw     rCHR,32(rMEMP3)
 354         stw     rCHR,36(rMEMP3)
 355         addi    rMEMP,rMEMP,32
 356         cmplwi  cr1,rLEN,128
 357         ori     r1,r1,0
 358         stw     rCHR,40(rMEMP3)
 359         stw     rCHR,44(rMEMP3)
 360         cmplwi  cr6,rLEN,256            /* cr6 is expected by L(cacheAlignedx).  */
 361         li      rMEMP2,128              /* Index for the second dcbz of a pair.  */
 362         ori     r1,r1,0
 363         stw     rCHR,48(rMEMP3)
 364         stw     rCHR,52(rMEMP3)
 365         ori     r1,r1,0
 366         ori     r1,r1,0
 367         stw     rCHR,56(rMEMP3)
 368         stw     rCHR,60(rMEMP3)
 369         blt     cr1,L(cacheAligned1)
 370         blt     cr6,L(cacheAligned128)
 371         b       L(cacheAlignedx)
 372
 373 /* Now we are aligned to the cache line and can use dcbz.  */
 374         .align 4
 375 L(cacheAligned):
 376         cmplwi  cr1,rLEN,128
 377         cmplwi  cr6,rLEN,256
 378         blt     cr1,L(cacheAligned1)
 379         li      rMEMP2,128              /* Index for the second dcbz of a pair.  */
 380 L(cacheAlignedx):
 381         cmplwi  cr5,rLEN,640
 382         blt     cr6,L(cacheAligned128)  /* 128..255 bytes: exactly one full line.  */
 383         bgt     cr5,L(cacheAligned512)  /* More than 640 bytes: simple dcbz loop.  */
 384         cmplwi  cr6,rLEN,512
 385         dcbz    0,rMEMP
 386         cmplwi  cr1,rLEN,384
 387         dcbz    rMEMP2,rMEMP            /* Zero the line at rMEMP + 128.  */
 388         addi    rMEMP,rMEMP,256
 389         addi    rLEN,rLEN,-256
 390         blt     cr1,L(cacheAligned1)    /* Was under 384: fewer than 128 bytes remain.  */
 391         blt     cr6,L(cacheAligned128)  /* Was under 512: one more full line remains.  */
 392         b       L(cacheAligned256)
 393         .align 5
 394 /* A simple loop for the longer (>640 bytes) lengths.  This form limits
 395    branch mispredictions to exactly one, at loop exit.  */
 396 L(cacheAligned512):
 397         cmplwi  cr1,rLEN,128            /* 32-bit unsigned compare, as elsewhere in this file.  */
 398         blt     cr1,L(cacheAligned1)    /* Less than one cache line left: finish the tail.  */
 399         dcbz    0,rMEMP                 /* Zero one 128-byte cache line.  */
 400         addi    rLEN,rLEN,-128
 401         addi    rMEMP,rMEMP,128
 402         b       L(cacheAligned512)
 403         .align 5
 404 L(cacheAligned256):
 405         cmplwi  cr6,rLEN,512            /* Loop again if 256 or more remain after this pass.  */
 406         dcbz    0,rMEMP
 407         cmplwi  cr1,rLEN,384
 408         dcbz    rMEMP2,rMEMP            /* rMEMP2 holds 128: zero the following line.  */
 409         addi    rMEMP,rMEMP,256
 410         addi    rLEN,rLEN,-256
 411         bge     cr6,L(cacheAligned256)
 412         blt     cr1,L(cacheAligned1)    /* Was under 384: fewer than 128 bytes remain.  */
 413         .align 4
 414 L(cacheAligned128):
 415         dcbz    0,rMEMP                 /* Zero one last full cache line.  */
 416         addi    rMEMP,rMEMP,128
 417         addi    rLEN,rLEN,-128
 418         .align 4
 419 L(cacheAligned1):
 420         cmplwi  cr1,rLEN,32
 421         blt     cr1,L(handletail32)
 422         addi    rMEMP3,rMEMP,32
 423         addi    rLEN,rLEN,-32           /* First 32-byte chunk of the tail.  */
 424         stw     rCHR,0(rMEMP)
 425         stw     rCHR,4(rMEMP)
 426         stw     rCHR,8(rMEMP)
 427         stw     rCHR,12(rMEMP)
 428         stw     rCHR,16(rMEMP)
 429         stw     rCHR,20(rMEMP)
 430         addi    rMEMP,rMEMP,32
 431         cmplwi  cr1,rLEN,32
 432         stw     rCHR,-8(rMEMP3)
 433         stw     rCHR,-4(rMEMP3)
 434 L(cacheAligned2):
 435         blt     cr1,L(handletail32)
 436         addi    rLEN,rLEN,-32           /* Second 32-byte chunk of the tail.  */
 437         stw     rCHR,0(rMEMP3)
 438         stw     rCHR,4(rMEMP3)
 439         stw     rCHR,8(rMEMP3)
 440         stw     rCHR,12(rMEMP3)
 441         addi    rMEMP,rMEMP,32
 442         cmplwi  cr1,rLEN,32
 443         stw     rCHR,16(rMEMP3)
 444         stw     rCHR,20(rMEMP3)
 445         stw     rCHR,24(rMEMP3)
 446         stw     rCHR,28(rMEMP3)
 447         nop
 448 L(cacheAligned3):
 449         blt     cr1,L(handletail32)
 450 /* At this point we can overrun the store queue (pipe reject) so it is
 451    time to slow things down. The store queue can merge two adjacent
 452    stores into a single L1/L2 op, but the L2 is clocked at 1/2 the CPU.
 453    So we add "group ending nops" to guarantee that we dispatch only two
 454    stores every other cycle. */
 455         ori     r1,r1,0
 456         ori     r1,r1,0
 457         addi    rMEMP,rMEMP,32
 458         addi    rLEN,rLEN,-32           /* Third chunk; falls through to L(handletail32).  */
 459         ori     r1,r1,0
 460         ori     r1,r1,0
 461         stw     rCHR,32(rMEMP3)
 462         stw     rCHR,36(rMEMP3)
 463         ori     r1,r1,0
 464         ori     r1,r1,0
 465         stw     rCHR,40(rMEMP3)
 466         stw     rCHR,44(rMEMP3)
 467         ori     r1,r1,0
 468         ori     r1,r1,0
 469         stw     rCHR,48(rMEMP3)
 470         stw     rCHR,52(rMEMP3)
 471         ori     r1,r1,0
 472         ori     r1,r1,0
 473         stw     rCHR,56(rMEMP3)
 474         stw     rCHR,60(rMEMP3)
 475
 476 /* We are here because the length or remainder (rLEN) is less than the
 477    cache line/sector size and does not justify aggressive loop unrolling.
 478    So set up the preconditions for L(medium) and go there.  */
 479         .align 3
 480 L(handletail32):
 481         cmplwi  cr1,rLEN,0
 482         beqlr   cr1                     /* Nothing left: return.  */
 483         b       L(medium)
 484
 485         .align 4
 486 L(small):
 487 /* Memset of 4 bytes or less.  */
 488         cmplwi  cr5, rLEN, 1
 489         cmplwi  cr1, rLEN, 3
 490         bltlr   cr5                     /* rLEN == 0: nothing to store.  */
 491         stb     rCHR, 0(rMEMP)
 492         beqlr   cr5                     /* rLEN == 1.  */
 493         stb     rCHR, 1(rMEMP)
 494         bltlr   cr1                     /* rLEN == 2.  */
 495         stb     rCHR, 2(rMEMP)
 496         beqlr   cr1                     /* rLEN == 3.  */
 497         stb     rCHR, 3(rMEMP)
 498         blr                             /* rLEN == 4.  */
 499
 500 /* Memset of 0-31 bytes.  cr7 must hold the low 4 bits of rLEN; the region is filled backwards from its end.  */
 501         .align 5
 502 L(medium):
 503         cmplwi  cr1, rLEN, 16           /* cr1: is a 16-byte piece needed?  */
 504 L(medium_tail2):
 505         add     rMEMP, rMEMP, rLEN      /* rMEMP = one past the last byte to set.  */
 506 L(medium_tail):
 507         bt-     31, L(medium_31t)       /* Length bit 1: store a byte.  */
 508         bt-     30, L(medium_30t)       /* Length bit 2: store a halfword.  */
 509 L(medium_30f):
 510         bt      29, L(medium_29t)       /* Length bit 4: store a word.  */
 511 L(medium_29f):
 512         bge     cr1, L(medium_27t)      /* 16 or more: store four words.  */
 513         bflr    28                      /* Length bit 8 clear: done.  */
 514         stw     rCHR, -4(rMEMP)
 515         stw     rCHR, -8(rMEMP)
 516         blr
 517
 518 L(medium_31t):
 519         stbu    rCHR, -1(rMEMP)
 520         bf-     30, L(medium_30f)
 521 L(medium_30t):
 522         sthu    rCHR, -2(rMEMP)
 523         bf-     29, L(medium_29f)
 524 L(medium_29t):
 525         stwu    rCHR, -4(rMEMP)
 526         blt     cr1, L(medium_27f)
 527 L(medium_27t):
 528         stw     rCHR, -4(rMEMP)
 529         stw     rCHR, -8(rMEMP)
 530         stw     rCHR, -12(rMEMP)
 531         stwu    rCHR, -16(rMEMP)
 532 L(medium_27f):
 533         bflr    28                      /* Length bit 8 clear: done.  */
 534 L(medium_28t):
 535         stw     rCHR, -4(rMEMP)
 536         stw     rCHR, -8(rMEMP)
 537         blr
 538 END (memset)
 539 libc_hidden_builtin_def (memset)