sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S

   1 /* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
   2    Copyright (C) 2017-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 #ifndef XCC
  22 # define XCC    xcc
  23 #endif
  24         .register       %g2,#scratch
  25         .register       %g3,#scratch
  26         .register       %g6,#scratch
  27
  28 #define FPRS_FEF        0x04
  29
  30 /*
  31  * ASI_STBI_P marks the cache line as "least recently used"
  32  * which means if many threads are active, it has a high chance
  33  * of being pushed out of the cache between the first initializing
  34  * store and the final stores.
  35  * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
  36  * cache line as "most recently used" for all but the last cache
  37  * line.
  38  */
  39
  40 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
  41 #define ASI_ST_BLK_INIT_MRU_P   0xf2
  42
  43 #define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
  44 #define ASI_STBIMRU_P   ASI_ST_BLK_INIT_MRU_P
  45
  46 #define BLOCK_SIZE      64      /* L2 data cache line size  */
  47 #define SHORTCOPY       3
  48 #define SHORTCHECK      14
  49 #define SHORT_LONG      64      /* max copy for short longword-aligned case  */
  50                                 /* must be at least 64  */
  51 #define SMALL_MAX       255     /* max small copy for word/long aligned  */
  52 #define SMALL_UMAX      128     /* max small copy for unaligned case  */
  53 #define MED_WMAX        1023    /* max copy for medium word-aligned case  */
  54 #define MED_MAX         511     /* max copy for medium longword-aligned case  */
  55 #define ST_CHUNK        20      /* ST_CHUNK - block of values for BIS Store  */
  56 /* on T4, prefetch 20 is a strong read prefetch to L1 and L2 data cache
  57  * prefetch 20 can cause inst pipeline to delay if data is in memory
  58  * prefetch 21 is a strong read prefetch to L2 data cache, not L1 data cache  */
  59 #define ALIGN_PRE       20      /* distance for aligned prefetch loop  */
  60
  61 #define EX_ST(x)        x
  62 #define EX_RETVAL(x)    x
  63 #define STORE_ASI(src,addr)     stxa src, [addr] ASI_STBIMRU_P
  64 #define STORE_INIT(src,addr)    stxa src, [addr] ASI_STBI_P
  65
  66 #if IS_IN (libc)
  67
  68         .text
  69
  70 ENTRY(__memmove_niagara7)
  71         /* %o0=dst, %o1=src, %o2=len */
  72         cmp     %o1, %o0        /* if from address is >= to use forward copy  */
  73         bgeu,pn %XCC, .Lforcpy  /* else use backward if ...  */
  74          sub    %o0, %o1, %o4   /* get difference of two addresses  */
  75         cmp     %o2, %o4        /* compare size and difference of addresses  */
  76         bleu,pn %XCC, .Lforcpy  /* if size is bigger, do overlapped copy  */
  77          add    %o1, %o2, %o5   /* get to end of source space  */
  78
  79 /* an overlapped copy that must be done "backwards"  */
  80 .Lchksize:
  81         cmp     %o2, 8                  /* less than 8 byte do byte copy  */
  82         blu,pn %XCC, 2f                 /* else continue  */
  83
  84 /* Now size is bigger than 8  */
  85 .Ldbalign:
  86          add    %o0, %o2, %g1           /* get to end of dest space  */
  87         andcc   %g1, 7, %o3             /* %o3 has cnt til dst 8 byte align  */
  88         bz,a,pn %XCC, .Ldbbck           /* skip if dst is 8 byte aligned  */
  89          andn   %o2, 7, %o3             /* force %o3 cnt to multiple of 8  */
  90         sub     %o2, %o3, %o2           /* update o2 with new count  */
  91
  92 1:      dec     %o5                     /* decrement source  */
  93         ldub    [%o5], %g1              /* load one byte  */
  94         deccc   %o3                     /* decrement count  */
  95         bgu,pt  %XCC, 1b                /* if not done keep copying  */
  96          stb    %g1, [%o5+%o4]          /* store one byte into dest  */
  97         andncc  %o2, 7, %o3             /* force %o3 cnt to multiple of 8  */
  98         bz,pn   %XCC, 2f                /* if size < 8, move to byte copy  */
  99
 100 /* Now Destination is 8 byte aligned  */
 101 .Ldbbck:
 102          andcc  %o5, 7, %o0             /* %o0 has src offset  */
 103         bz,a,pn %XCC, .Ldbcopybc        /* if src is aligned do fast memmove  */
 104          sub    %o2, %o3, %o2           /* Residue bytes in %o2  */
 105
 106 .Lcpy_dbwdbc:                           /* alignment of src is needed  */
 107         sub     %o2, 8, %o2             /* set size one loop ahead  */
 108         sll     %o0, 3, %g1             /* %g1 is left shift  */
 109         mov     64, %g5                 /* init %g5 to be 64  */
 110         sub     %g5, %g1, %g5           /* %g5 rightshift = (64 - leftshift)  */
 111         sub     %o5, %o0, %o5           /* align the src at 8 bytes.  */
 112         add     %o4, %o0, %o4           /* increase diff between src & dst  */
 113         ldx     [%o5], %o1              /* load first 8 bytes  */
 114         srlx    %o1, %g5, %o1
 115 1:      sub     %o5, 8, %o5             /* subtract 8 from src  */
 116         ldx     [%o5], %o0              /* load 8 byte  */
 117         sllx    %o0, %g1, %o3           /* shift loaded val left to tmp reg  */
 118         or      %o1, %o3, %o3           /* align data  */
 119         stx     %o3, [%o5+%o4]          /* store 8 byte  */
 120         subcc   %o2, 8, %o2             /* subtract 8 byte from size  */
 121         bg,pt   %XCC, 1b                /* if size > 0 continue  */
 122          srlx   %o0, %g5, %o1           /* move extra byte for the next use  */
 123
 124         srl     %g1, 3, %o0             /* restore %o0 value for alignment  */
 125         add     %o5, %o0, %o5           /* restore src alignment  */
 126         sub     %o4, %o0, %o4           /* restore diff between src & dest  */
 127
 128         ba      2f                      /* branch to the trailing byte copy  */
 129          add    %o2, 8, %o2             /* restore size value  */
 130
 131 .Ldbcopybc:                             /* alignment of src is not needed  */
 132 1:      sub     %o5, 8, %o5             /* subtract from src  */
 133         ldx     [%o5], %g1              /* load 8 bytes  */
 134         subcc   %o3, 8, %o3             /* subtract from size  */
 135         bgu,pt  %XCC, 1b                /* if size is bigger 0 continue  */
 136          stx    %g1, [%o5+%o4]          /* store 8 bytes to destination  */
 137
 138         ba      2f
 139          nop
 140
 141 .Lbcbyte:
 142 1:      ldub    [%o5], %g1              /* load one byte  */
 143         stb     %g1, [%o5+%o4]          /* store one byte  */
 144 2:      deccc   %o2                     /* decrement size  */
 145         bgeu,a,pt %XCC, 1b              /* if size is >= 0 continue  */
 146          dec    %o5                     /* decrement from address  */
 147
 148 .Lexitbc:                               /* exit from backward copy  */
 149         retl
 150          add    %o5, %o4, %o0           /* restore dest addr  */
 151
 152
 153 /* Check to see if memmove is large aligned copy
 154  * If so, use special version of copy that avoids
 155  * use of block store init.  */
 156 .Lforcpy:
 157         cmp     %o2, SMALL_MAX          /* check for not small case  */
 158         blt,pn  %XCC, .Lmv_short        /* merge with memcpy  */
 159          mov    %o0, %g1                /* save %o0  */
 160         neg     %o0, %o5
 161         andcc   %o5, 7, %o5             /* bytes till DST 8 byte aligned  */
 162         brz,pt  %o5, .Lmv_dst_aligned_on_8
 163
 164 /* %o5 has the bytes to be written in partial store.  */
 165          sub    %o2, %o5, %o2
 166         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 167 7:                                      /* dst aligning loop  */
 168         ldub    [%o1+%o0], %o4          /* load one byte  */
 169         subcc   %o5, 1, %o5
 170         stb     %o4, [%o0]
 171         bgu,pt  %XCC, 7b
 172          add    %o0, 1, %o0             /* advance dst  */
 173         add     %o1, %o0, %o1           /* restore %o1  */
 174 .Lmv_dst_aligned_on_8:
 175         andcc   %o1, 7, %o5
 176         brnz,pn %o5, .Lsrc_dst_unaligned_on_8
 177          prefetch [%o1 + (1 * BLOCK_SIZE)], 20
 178
 179 .Lmv_src_dst_aligned_on_8:
 180 /* check if we are copying MED_MAX or more bytes  */
 181         cmp     %o2, MED_MAX            /* limit to store buffer size  */
 182         bleu,pt %XCC, .Lmedlong
 183          prefetch [%o1 + (2 * BLOCK_SIZE)], 20
 184
 185 /* The mv_align loop below mimics the memcpy code for large aligned copies,
 186  * but does not use the ASI_STBI_P (block initializing store) performance
 187  * optimization.  This is used when memcpy is incorrectly invoked with
 188  * overlapping buffers.  */
 189
 190 .Lmv_large_align8_copy:                 /* Src and dst share 8 byte align  */
 191                                         /* align dst to 64 byte boundary  */
 192         andcc   %o0, 0x3f, %o3          /* check for dst 64 byte aligned  */
 193         brz,pn  %o3, .Lmv_aligned_on_64
 194          sub    %o3, 64, %o3            /* %o3 has negative bytes to move  */
 195         add     %o2, %o3, %o2           /* adjust remaining count  */
 196 .Lmv_align_to_64:
 197         ldx     [%o1], %o4
 198         add     %o1, 8, %o1             /* increment src ptr  */
 199         addcc   %o3, 8, %o3
 200         stx     %o4, [%o0]
 201         brnz,pt %o3, .Lmv_align_to_64
 202          add    %o0, 8, %o0             /* increment dst ptr  */
 203
 204 .Lmv_aligned_on_64:
 205         andn    %o2, 0x3f, %o5          /* %o5 is multiple of block size  */
 206         and     %o2, 0x3f, %o2          /* residue bytes in %o2  */
 207 .Lmv_align_loop:
 208         ldx     [%o1],%o4
 209         stx     %o4,[%o0]
 210         prefetch [%o0 + (10 * BLOCK_SIZE)], 22
 211         prefetch [%o1 + (10 * BLOCK_SIZE)], 21
 212         subcc   %o5, 64, %o5
 213         ldx     [%o1+8],%o4
 214         stx     %o4,[%o0+8]
 215         ldx     [%o1+16],%o4
 216         stx     %o4,[%o0+16]
 217         ldx     [%o1+24],%o4
 218         stx     %o4,[%o0+24]
 219         ldx     [%o1+32],%o4
 220         stx     %o4,[%o0+32]
 221         ldx     [%o1+40],%o4
 222         stx     %o4,[%o0+40]
 223         ldx     [%o1+48],%o4
 224         add     %o1, 64, %o1
 225         stx     %o4,[%o0+48]
 226         add     %o0, 64, %o0
 227         ldx     [%o1-8],%o4
 228         bgt,pt  %XCC, .Lmv_align_loop
 229          stx    %o4,[%o0-8]
 230
 231         ba      .Lmedlong
 232          nop
 233 END(__memmove_niagara7)
 234
 235 ENTRY(__mempcpy_niagara7)
 236         /* %o0=dst, %o1=src, %o2=len */
 237         ba,pt   %icc, 101f
 238          add    %o0, %o2, %g1           /* save dst + len  */
 239 END(__mempcpy_niagara7)
 240
 241         .align  32
 242 ENTRY(__memcpy_niagara7)
 243 100:    /* %o0=dst, %o1=src, %o2=len */
 244         mov     %o0, %g1                /* save %o0  */
 245 101:
 246 #ifndef __arch64__
 247         srl     %o2, 0, %o2
 248 #endif
 249         cmp     %o2, SMALL_MAX          /* check for not small case  */
 250         bgeu,pn %XCC, .Lmedium          /* go to larger cases  */
 251 .Lmv_short:
 252          cmp    %o2, SHORTCOPY          /* check for really short case  */
 253         ble,pn  %XCC, .Lsmallfin
 254          or     %o0, %o1, %o4           /* prepare alignment check  */
 255         andcc   %o4, 0x3, %o5           /* test for word alignment  */
 256         bnz,pn  %XCC, .Lsmallunalign    /* branch to non-word aligned case  */
 257          nop
 258         subcc   %o2, 7, %o2             /* adjust count  */
 259         ble,pn  %XCC, .Lsmallwordx
 260          andcc  %o4, 0x7, %o5           /* test for long alignment  */
 261 /* 8 or more bytes, src and dest start on word boundary
 262  * %o4 contains or %o0, %o1  */
 263 .Lsmalllong:
 264         bnz,pn  %XCC, .Lsmallwords      /* branch to word aligned case  */
 265          cmp    %o2, SHORT_LONG-7
 266         bge,a   %XCC, .Lmedl64          /* if we branch  */
 267          sub    %o2,56,%o2              /* adjust %o2 to -63 off count  */
 268
 269 /* slightly unroll the small_long_loop to improve very short copies  */
 270         cmp     %o2, 32-7
 271         blt,a,pn %XCC, .Lsmall_long_l
 272          sub    %o1, %o0, %o1           /* %o1 gets the difference  */
 273
 274         ldx     [%o1], %o5
 275         ldx     [%o1+8], %o4
 276         ldx     [%o1+16], %o3
 277
 278         subcc   %o2, 24, %o2
 279         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 280
 281         stx     %o5, [%o0]              /* write word  */
 282         stx     %o4, [%o0+8]            /* write word  */
 283         stx     %o3, [%o0+16]           /* write word  */
 284
 285         add     %o0, 24, %o0
 286
 287 /* end loop unroll  */
 288
 289 .Lsmall_long_l:
 290         ldx     [%o1+%o0], %o3
 291         subcc   %o2, 8, %o2
 292         add     %o0, 8, %o0
 293         bgu,pn  %XCC, .Lsmall_long_l    /* loop until done  */
 294          stx    %o3, [%o0-8]            /* write word  */
 295         addcc   %o2, 7, %o2             /* restore %o2 to correct count  */
 296         bnz,pn  %XCC, .Lsmall_long_x    /* check for completion  */
 297          add    %o1, %o0, %o1           /* restore %o1  */
 298         retl
 299          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 300 .Lsmall_long_x:
 301         cmp     %o2, 4                  /* check for 4 or more bytes left  */
 302         blt,pn  %XCC, .Lsmallleft3      /* if not, go to finish up  */
 303          nop
 304         lduw    [%o1], %o3
 305         add     %o1, 4, %o1
 306         subcc   %o2, 4, %o2
 307         stw     %o3, [%o0]
 308         bnz,pn  %XCC, .Lsmallleft3
 309          add    %o0, 4, %o0
 310         retl
 311          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 312
 313         .align 32
 314 /* src and dest start on word boundary; 7 or fewer bytes  */
 315 .Lsmallwordx:
 316         lduw    [%o1], %o3              /* read word  */
 317         addcc   %o2, 3, %o2             /* restore count  */
 318         bz,pt   %XCC, .Lsmallexit
 319          stw    %o3, [%o0]              /* write word  */
 320         deccc   %o2                     /* reduce count for cc test  */
 321         ldub    [%o1+4], %o3            /* load one byte  */
 322         bz,pt   %XCC, .Lsmallexit
 323          stb    %o3, [%o0+4]            /* store one byte  */
 324         ldub    [%o1+5], %o3            /* load second byte  */
 325         deccc   %o2
 326         bz,pt   %XCC, .Lsmallexit
 327          stb    %o3, [%o0+5]            /* store second byte  */
 328         ldub    [%o1+6], %o3            /* load third byte  */
 329         stb     %o3, [%o0+6]            /* store third byte  */
 330 .Lsmallexit:
 331         retl
 332          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 333
 334         .align 32
 335 .Lsmallunalign:
 336         cmp     %o2, SHORTCHECK
 337         ble,pn  %XCC, .Lsmallrest
 338          cmp    %o2, SMALL_UMAX
 339         bge,pt  %XCC, .Lmedium_join
 340          andcc  %o1, 0x3, %o5           /* is src word aligned  */
 341         bz,pn   %XCC, .Laldst
 342          cmp    %o5, 2                  /* is src half-word aligned  */
 343         be,pt   %XCC, .Ls2algn
 344          cmp    %o5, 3                  /* src is byte aligned  */
 345 .Ls1algn:
 346         ldub    [%o1], %o3              /* move 1 or 3 bytes to align it  */
 347         inc     1, %o1
 348         stb     %o3, [%o0]              /* move a byte to align src  */
 349         inc     1, %o0
 350         bne,pt  %XCC, .Ls2algn
 351          dec    %o2
 352         b       .Lald                   /* now go align dest  */
 353          andcc  %o0, 0x3, %o5
 354
 355 .Ls2algn:
 356         lduh    [%o1], %o3              /* know src is 2 byte aligned  */
 357         inc     2, %o1
 358         srl     %o3, 8, %o4
 359         stb     %o4, [%o0]              /* have to do bytes,  */
 360         stb     %o3, [%o0 + 1]          /* do not know dst alignment  */
 361         inc     2, %o0
 362         dec     2, %o2
 363
 364 .Laldst:
 365         andcc   %o0, 0x3, %o5           /* align the destination address  */
 366 .Lald:
 367         bz,pn   %XCC, .Lw4cp
 368          cmp    %o5, 2
 369         be,pn   %XCC, .Lw2cp
 370          cmp    %o5, 3
 371 .Lw3cp: lduw    [%o1], %o4
 372         inc     4, %o1
 373         srl     %o4, 24, %o5
 374         stb     %o5, [%o0]
 375         bne,pt  %XCC, .Lw1cp
 376          inc    %o0
 377         dec     1, %o2
 378         andn    %o2, 3, %o3             /* %o3 is aligned word count  */
 379         dec     4, %o3                  /* avoid reading beyond tail of src  */
 380         sub     %o1, %o0, %o1           /*  %o1 gets the difference  */
 381
 382 1:      sll     %o4, 8, %g5             /* save residual bytes  */
 383         lduw    [%o1+%o0], %o4
 384         deccc   4, %o3
 385         srl     %o4, 24, %o5            /* merge with residual  */
 386         or      %o5, %g5, %g5
 387         st      %g5, [%o0]
 388         bnz,pt  %XCC, 1b
 389          inc    4, %o0
 390         sub     %o1, 3, %o1             /* used one byte of last word read  */
 391         and     %o2, 3, %o2
 392         b       7f
 393          inc    4, %o2
 394
 395 .Lw1cp: srl     %o4, 8, %o5
 396         sth     %o5, [%o0]
 397         inc     2, %o0
 398         dec     3, %o2
 399         andn    %o2, 3, %o3             /* %o3 is aligned word count  */
 400         dec     4, %o3                  /* avoid reading beyond tail of src  */
 401         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 402
 403 2:      sll     %o4, 24, %g5            /* save residual bytes  */
 404         lduw    [%o1+%o0], %o4
 405         deccc   4, %o3
 406         srl     %o4, 8, %o5             /* merge with residual  */
 407         or      %o5, %g5, %g5
 408         st      %g5, [%o0]
 409         bnz,pt  %XCC, 2b
 410          inc    4, %o0
 411         sub     %o1, 1, %o1             /* used 3 bytes of last word read  */
 412         and     %o2, 3, %o2
 413         b       7f
 414          inc    4, %o2
 415
 416 .Lw2cp: lduw    [%o1], %o4
 417         inc     4, %o1
 418         srl     %o4, 16, %o5
 419         sth     %o5, [%o0]
 420         inc     2, %o0
 421         dec     2, %o2
 422         andn    %o2, 3, %o3             /* %o3 is aligned word count  */
 423         dec     4, %o3                  /* avoid reading beyond tail of src  */
 424         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 425
 426 3:      sll     %o4, 16, %g5            /* save residual bytes  */
 427         lduw    [%o1+%o0], %o4
 428         deccc   4, %o3
 429         srl     %o4, 16, %o5            /* merge with residual  */
 430         or      %o5, %g5, %g5
 431         st      %g5, [%o0]
 432         bnz,pt  %XCC, 3b
 433          inc    4, %o0
 434         sub     %o1, 2, %o1             /* used two bytes of last word read  */
 435         and     %o2, 3, %o2
 436         b       7f
 437          inc    4, %o2
 438
 439 .Lw4cp: andn    %o2, 3, %o3             /* %o3 is aligned word count  */
 440         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 441
 442 1:      lduw    [%o1+%o0], %o4          /* read from address  */
 443         deccc   4, %o3                  /* decrement count  */
 444         st      %o4, [%o0]              /* write at destination address  */
 445         bgu,pt  %XCC, 1b
 446          inc    4, %o0                  /* increment to address  */
 447         and     %o2, 3, %o2             /* number of leftover bytes, if any  */
 448
 449         /* simple finish up byte copy, works with any alignment  */
 450 7:
 451         add     %o1, %o0, %o1           /* restore %o1  */
 452 .Lsmallrest:
 453         tst     %o2
 454         bz,pt   %XCC, .Lsmallx
 455          cmp    %o2, 4
 456         blt,pn  %XCC, .Lsmallleft3
 457          nop
 458         sub     %o2, 3, %o2
 459 .Lsmallnotalign4:
 460         ldub    [%o1], %o3              /* read byte  */
 461         subcc   %o2, 4, %o2             /* reduce count by 4  */
 462         stb     %o3, [%o0]              /* write byte  */
 463         ldub    [%o1+1], %o3            /* repeat for total of 4 bytes  */
 464         add     %o1, 4, %o1             /* advance SRC by 4  */
 465         stb     %o3, [%o0+1]
 466         ldub    [%o1-2], %o3
 467         add     %o0, 4, %o0             /* advance DST by 4  */
 468         stb     %o3, [%o0-2]
 469         ldub    [%o1-1], %o3
 470         bgu,pt  %XCC, .Lsmallnotalign4  /* loop til 3 or fewer bytes remain  */
 471          stb    %o3, [%o0-1]
 472         addcc   %o2, 3, %o2             /* restore count  */
 473         bz,pt   %XCC, .Lsmallx
 474 .Lsmallleft3:                           /* 1, 2, or 3 bytes remain  */
 475          subcc  %o2, 1, %o2
 476         ldub    [%o1], %o3              /* load one byte  */
 477         bz,pt   %XCC, .Lsmallx
 478          stb    %o3, [%o0]              /* store one byte  */
 479         ldub    [%o1+1], %o3            /* load second byte  */
 480         subcc   %o2, 1, %o2
 481         bz,pt   %XCC, .Lsmallx
 482          stb    %o3, [%o0+1]            /* store second byte  */
 483         ldub    [%o1+2], %o3            /* load third byte  */
 484         stb     %o3, [%o0+2]            /* store third byte  */
 485 .Lsmallx:
 486         retl
 487          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 488
 489 .Lsmallfin:
 490         tst     %o2
 491         bnz,pn  %XCC, .Lsmallleft3
 492          nop
 493         retl
 494          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 495
 496         .align 16
 497 .Lsmallwords:
 498         lduw    [%o1], %o3              /* read word  */
 499         subcc   %o2, 8, %o2             /* update count  */
 500         stw     %o3, [%o0]              /* write word  */
 501         add     %o1, 8, %o1             /* update SRC  */
 502         lduw    [%o1-4], %o3            /* read word  */
 503         add     %o0, 8, %o0             /* update DST  */
 504         bgu,pt  %XCC, .Lsmallwords      /* loop until done  */
 505          stw    %o3, [%o0-4]            /* write word  */
 506         addcc   %o2, 7, %o2             /* restore count  */
 507         bz,pt   %XCC, .Lsmallexit       /* check for completion  */
 508          cmp    %o2, 4                  /* check for 4 or more bytes left  */
 509         blt,pt  %XCC, .Lsmallleft3      /* if not, go to finish up  */
 510          nop
 511         lduw    [%o1], %o3
 512         add     %o1, 4, %o1
 513         subcc   %o2, 4, %o2
 514         add     %o0, 4, %o0
 515         bnz,pn  %XCC, .Lsmallleft3
 516          stw    %o3, [%o0-4]
 517         retl
 518          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 519
 520         .align 16
 521 .Lmedium:
 522 .Lmedium_join:
 523         neg     %o0, %o5
 524         andcc   %o5, 7, %o5             /* bytes till DST 8 byte aligned  */
 525         brz,pt  %o5, .Ldst_aligned_on_8
 526
 527         /* %o5 has the bytes to be written in partial store.  */
 528          sub    %o2, %o5, %o2
 529         sub     %o1, %o0, %o1           /* %o1 gets the difference  */
 530 7:                                      /* dst aligning loop  */
 531         ldub    [%o1+%o0], %o4          /* load one byte  */
 532         subcc   %o5, 1, %o5
 533         stb     %o4, [%o0]
 534         bgu,pt  %XCC, 7b
 535          add    %o0, 1, %o0             /* advance dst  */
 536         add     %o1, %o0, %o1           /* restore %o1  */
 537 .Ldst_aligned_on_8:
 538         andcc   %o1, 7, %o5
 539         brnz,pt %o5, .Lsrc_dst_unaligned_on_8
 540          nop
 541
 542 .Lsrc_dst_aligned_on_8:
 543         /* check if we are copying MED_MAX or more bytes  */
 544         cmp     %o2, MED_MAX            /* limit to store buffer size  */
 545         bgu,pn  %XCC, .Llarge_align8_copy
 546          nop
 547 /*
 548  * Special case for handling when src and dest are both long word aligned
 549  * and total data to move is less than MED_MAX bytes
 550  */
 551 .Lmedlong:
 552         subcc   %o2, 63, %o2            /* adjust length to allow cc test  */
 553         ble,pn  %XCC, .Lmedl63          /* skip big loop if < 64 bytes  */
 554          nop
 555 .Lmedl64:
 556         ldx     [%o1], %o4              /* load  */
 557         subcc   %o2, 64, %o2            /* decrement length count  */
 558         stx     %o4, [%o0]              /* and store  */
 559         ldx     [%o1+8], %o3            /* a block of 64 bytes  */
 560         stx     %o3, [%o0+8]
 561         ldx     [%o1+16], %o4
 562         stx     %o4, [%o0+16]
 563         ldx     [%o1+24], %o3
 564         stx     %o3, [%o0+24]
 565         ldx     [%o1+32], %o4           /* load  */
 566         stx     %o4, [%o0+32]           /* and store  */
 567         ldx     [%o1+40], %o3           /* a block of 64 bytes  */
 568         add     %o1, 64, %o1            /* increase src ptr by 64  */
 569         stx     %o3, [%o0+40]
 570         ldx     [%o1-16], %o4
 571         add     %o0, 64, %o0            /* increase dst ptr by 64  */
 572         stx     %o4, [%o0-16]
 573         ldx     [%o1-8], %o3
 574         bgu,pt  %XCC, .Lmedl64          /* repeat if at least 64 bytes left  */
 575          stx    %o3, [%o0-8]
 576 .Lmedl63:
 577         addcc   %o2, 32, %o2            /* adjust remaining count  */
 578         ble,pt  %XCC, .Lmedl31          /* to skip if 31 or fewer bytes left  */
 579          nop
 580         ldx     [%o1], %o4              /* load  */
 581         sub     %o2, 32, %o2            /* decrement length count  */
 582         stx     %o4, [%o0]              /* and store  */
 583         ldx     [%o1+8], %o3            /* a block of 32 bytes  */
 584         add     %o1, 32, %o1            /* increase src ptr by 32  */
 585         stx     %o3, [%o0+8]
 586         ldx     [%o1-16], %o4
 587         add     %o0, 32, %o0            /* increase dst ptr by 32  */
 588         stx     %o4, [%o0-16]
 589         ldx     [%o1-8], %o3
 590         stx     %o3, [%o0-8]
 591 .Lmedl31:
 592         addcc   %o2, 16, %o2            /* adjust remaining count  */
 593         ble,pt  %XCC, .Lmedl15          /* skip if 15 or fewer bytes left  */
 594          nop
 595         ldx     [%o1], %o4              /* load and store 16 bytes  */
 596         add     %o1, 16, %o1            /* increase src ptr by 16  */
 597         stx     %o4, [%o0]
 598         sub     %o2, 16, %o2            /* decrease count by 16  */
 599         ldx     [%o1-8], %o3
 600         add     %o0, 16, %o0            /* increase dst ptr by 16  */
 601         stx     %o3, [%o0-8]
 602 .Lmedl15:
 603         addcc   %o2, 15, %o2            /* restore count  */
 604         bz,pt   %XCC, .Lsmallexit       /* exit if finished  */
 605          cmp    %o2, 8
 606         blt,pt  %XCC, .Lmedw7           /* skip if 7 or fewer bytes left  */
 607          tst    %o2
 608         ldx     [%o1], %o4              /* load 8 bytes  */
 609         add     %o1, 8, %o1             /* increase src ptr by 8  */
 610         add     %o0, 8, %o0             /* increase dst ptr by 8  */
 611         subcc   %o2, 8, %o2             /* decrease count by 8  */
 612         bnz,pn  %XCC, .Lmedw7
 613          stx    %o4, [%o0-8]            /* and store 8 bytes  */
 614         retl
 615          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 616
 617         .align 16
 618 .Lsrc_dst_unaligned_on_8:
 619         /* DST is 8-byte aligned, src is not  */
 620         andcc   %o1, 0x3, %o5           /* test word alignment  */
 621         bnz,pt  %XCC, .Lunalignsetup    /* branch if not word aligned  */
 622          nop
 623
 624 /*
 625  * Handle all cases where src and dest are aligned on word
 626  * boundaries. Use unrolled loops for better performance.
 627  * This option wins over standard large data move when
 628  * source and destination is in cache for medium
 629  * to short data moves.
 630  */
 631         cmp %o2, MED_WMAX               /* limit to store buffer size  */
 632         bge,pt  %XCC, .Lunalignrejoin   /* otherwise rejoin main loop  */
 633          nop
 634
 635         subcc   %o2, 31, %o2            /* adjust length to allow cc test  */
 636                                         /* for end of loop  */
 637         ble,pt  %XCC, .Lmedw31          /* skip big loop if less than 16  */
 638 .Lmedw32:
 639          ld     [%o1], %o4              /* move a block of 32 bytes  */
 640         sllx    %o4, 32, %o5
 641         ld      [%o1+4], %o4
 642         or      %o4, %o5, %o5
 643         stx     %o5, [%o0]
 644         subcc   %o2, 32, %o2            /* decrement length count  */
 645         ld      [%o1+8], %o4
 646         sllx    %o4, 32, %o5
 647         ld      [%o1+12], %o4
 648         or      %o4, %o5, %o5
 649         stx     %o5, [%o0+8]
 650         add     %o1, 32, %o1            /* increase src ptr by 32  */
 651         ld      [%o1-16], %o4
 652         sllx    %o4, 32, %o5
 653         ld      [%o1-12], %o4
 654         or      %o4, %o5, %o5
 655         stx     %o5, [%o0+16]
 656         add     %o0, 32, %o0            /* increase dst ptr by 32  */
 657         ld      [%o1-8], %o4
 658         sllx    %o4, 32, %o5
 659         ld      [%o1-4], %o4
 660         or      %o4, %o5, %o5
 661         bgu,pt  %XCC, .Lmedw32          /* repeat if at least 32 bytes left  */
 662          stx    %o5, [%o0-8]
 663 .Lmedw31:
 664         addcc   %o2, 31, %o2            /* restore count  */
 665         bz,pt   %XCC, .Lsmallexit       /* exit if finished  */
 666          cmp    %o2, 16
 667         blt,pt  %XCC, .Lmedw15
 668          nop
 669         ld      [%o1], %o4              /* move a block of 16 bytes  */
 670         sllx    %o4, 32, %o5
 671         subcc   %o2, 16, %o2            /* decrement length count  */
 672         ld      [%o1+4], %o4
 673         or      %o4, %o5, %o5
 674         stx     %o5, [%o0]
 675         add     %o1, 16, %o1            /* increase src ptr by 16  */
 676         ld      [%o1-8], %o4
 677         add     %o0, 16, %o0            /* increase dst ptr by 16  */
 678         sllx    %o4, 32, %o5
 679         ld      [%o1-4], %o4
 680         or      %o4, %o5, %o5
 681         stx     %o5, [%o0-8]
 682 .Lmedw15:
 683         bz,pt   %XCC, .Lsmallexit       /* exit if finished  */
 684          cmp    %o2, 8
 685         blt,pn  %XCC, .Lmedw7           /* skip if 7 or fewer bytes left  */
 686          tst    %o2
 687         ld      [%o1], %o4              /* load 4 bytes  */
 688         subcc   %o2, 8, %o2             /* decrease count by 8  */
 689         stw     %o4, [%o0]              /* and store 4 bytes  */
 690         add     %o1, 8, %o1             /* increase src ptr by 8  */
 691         ld      [%o1-4], %o3            /* load 4 bytes  */
 692         add     %o0, 8, %o0             /* increase dst ptr by 8  */
 693         stw     %o3, [%o0-4]            /* and store 4 bytes  */
 694         bz,pt   %XCC, .Lsmallexit       /* exit if finished  */
 695 .Lmedw7:                                /* count is ge 1, less than 8  */
 696          cmp    %o2, 4                  /* check for 4 bytes left  */
 697         blt,pn  %XCC, .Lsmallleft3      /* skip if 3 or fewer bytes left  */
 698          nop
 699         ld      [%o1], %o4              /* load 4 bytes  */
 700         add     %o1, 4, %o1             /* increase src ptr by 4  */
 701         add     %o0, 4, %o0             /* increase dst ptr by 4  */
 702         subcc   %o2, 4, %o2             /* decrease count by 4  */
 703         bnz,pt  %XCC, .Lsmallleft3
 704          stw    %o4, [%o0-4]            /* and store 4 bytes  */
 705         retl
 706          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 707
 708         .align 16
 709 .Llarge_align8_copy:                    /* Src and dst 8 byte aligned  */
 710         /* align dst to 64 byte boundary  */
 711         andcc   %o0, 0x3f, %o3          /* check for dst 64 byte aligned  */
 712         brz,pn  %o3, .Laligned_to_64
 713          andcc  %o0, 8, %o3             /* odd long words to move?  */
 714         brz,pt  %o3, .Laligned_to_16
 715          nop
 716         ldx     [%o1], %o4
 717         sub     %o2, 8, %o2
 718         add     %o1, 8, %o1             /* increment src ptr  */
 719         add     %o0, 8, %o0             /* increment dst ptr  */
 720         stx     %o4, [%o0-8]
 721 .Laligned_to_16:
 722         andcc   %o0, 16, %o3            /* pair of long words to move?  */
 723         brz,pt  %o3, .Laligned_to_32
 724          nop
 725         ldx     [%o1], %o4
 726         sub     %o2, 16, %o2
 727         stx     %o4, [%o0]
 728         add     %o1, 16, %o1            /* increment src ptr  */
 729         ldx     [%o1-8], %o4
 730         add     %o0, 16, %o0            /* increment dst ptr  */
 731         stx     %o4, [%o0-8]
 732 .Laligned_to_32:
 733         andcc   %o0, 32, %o3            /* four long words to move?  */
 734         brz,pt  %o3, .Laligned_to_64
 735          nop
 736         ldx     [%o1], %o4
 737         sub     %o2, 32, %o2
 738         stx     %o4, [%o0]
 739         ldx     [%o1+8], %o4
 740         stx     %o4, [%o0+8]
 741         ldx     [%o1+16], %o4
 742         stx     %o4, [%o0+16]
 743         add     %o1, 32, %o1            /* increment src ptr  */
 744         ldx     [%o1-8], %o4
 745         add     %o0, 32, %o0            /* increment dst ptr  */
 746         stx     %o4, [%o0-8]
 747 .Laligned_to_64:
 748 /*      Following test is included to avoid issues where existing executables
 749  *      incorrectly call memcpy with overlapping src and dest instead of memmove
 750  *
 751  *      if ( (src ge dst) and (dst+len > src)) go to overlap case
 752  *      if ( (src lt dst) and (src+len > dst)) go to overlap case
 753  */
 754         cmp     %o1,%o0
 755         bge,pt  %XCC, 1f
 756          nop
 757 /*                              src+len > dst?  */
 758         add     %o1, %o2, %o4
 759         cmp     %o4, %o0
 760         bgt,pt  %XCC, .Lmv_aligned_on_64
 761          nop
 762         ba      2f
 763          nop
 764 1:
 765 /*                              dst+len > src?  */
 766         add     %o0, %o2, %o4
 767         cmp     %o4, %o1
 768         bgt,pt  %XCC, .Lmv_aligned_on_64
 769          nop
 770 2:
 771 /*      handle non-overlapped copies
 772  *
 773  *      Using block init store (BIS) instructions to avoid fetching cache
 774  *      lines from memory. Use ST_CHUNK stores to first element of each cache
 775  *      line (similar to prefetching) to avoid overfilling STQ or miss buffers.
 776  *      Gives existing cache lines time to be moved out of L1/L2/L3 cache.
 777  */
 778         andn    %o2, 0x3f, %o5          /* %o5 is multiple of block size  */
 779         and     %o2, 0x3f, %o2          /* residue bytes in %o2  */
 780
 781 /*      We use ASI_STBIMRU_P for the first store to each cache line
 782  *      followed by ASI_STBI_P (mark as LRU) for the last store. That
 783  *      mixed approach reduces the chances the cache line is removed
 784  *      before we finish setting it, while minimizing the effects on
 785  *      other cached values during a large memcpy
 786  *
 787  *      Intermediate stores can be normal since first BIS activates the
 788  *      cache line in the L2 cache.
 789  *
 790  *      ST_CHUNK batches up initial BIS operations for several cache lines
 791  *      to allow multiple requests to not be blocked by overflowing the
 792  *      the store miss buffer. Then the matching stores for all those
 793  *      BIS operations are executed.
 794  */
 795
 796 .Lalign_loop:
 797         cmp     %o5, ST_CHUNK*64
 798         blu,pt  %XCC, .Lalign_short
 799          mov    ST_CHUNK, %o3
 800         sllx    %o3, 6, %g5             /* ST_CHUNK*64  */
 801
 802 .Lalign_loop_start:
 803         prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
 804         subcc   %o3, 2, %o3
 805         ldx     [%o1], %o4
 806         add     %o1, 128, %o1
 807         EX_ST(STORE_ASI(%o4, %o0))
 808         add     %o0, 64, %o0
 809         ldx     [%o1-64], %o4
 810         EX_ST(STORE_ASI(%o4, %o0))
 811         add     %o0, 64, %o0
 812         bgu,pt  %XCC, .Lalign_loop_start
 813          prefetch [%o1 + ((ALIGN_PRE-1) * BLOCK_SIZE)], 21
 814
 815         mov     ST_CHUNK, %o3
 816         sub     %o1, %g5, %o1           /* reset %o1  */
 817         sub     %o0, %g5, %o0           /* reset %o0  */
 818
 819         sub     %o0, 8, %o0             /* adjust %o0 for ASI alignment  */
 820 .Lalign_loop_rest:
 821         ldx     [%o1+8],%o4
 822         add     %o0, 64, %o0
 823         stx     %o4, [%o0-48]
 824         subcc   %o3, 1, %o3
 825         ldx     [%o1+16],%o4
 826         stx     %o4, [%o0-40]
 827         sub     %o5, 64, %o5
 828         ldx     [%o1+24],%o4
 829         stx     %o4, [%o0-32]
 830         ldx     [%o1+32],%o4
 831         stx     %o4, [%o0-24]
 832         ldx     [%o1+40],%o4
 833         stx     %o4, [%o0-16]
 834         ldx     [%o1+48],%o4
 835         stx     %o4, [%o0-8]
 836         add     %o1, 64, %o1
 837         ldx     [%o1-8],%o4
 838         bgu,pt  %XCC, .Lalign_loop_rest
 839          EX_ST(STORE_INIT(%o4,%o0))     /* mark cache line as LRU  */
 840
 841         mov     ST_CHUNK, %o3
 842         cmp     %o5, ST_CHUNK*64
 843         bgu,pt  %XCC, .Lalign_loop_start
 844          add    %o0, 8, %o0             /* restore %o0 from ASI alignment  */
 845
 846         cmp     %o5, 0
 847         beq,pt  %XCC, .Lalign_done
 848
 849 /* no prefetches needed in these loops
 850  * since we are within ALIGN_PRE of the end */
 851 .Lalign_short:
 852          srl    %o5, 6, %o3
 853 .Lalign_loop_short:
 854         subcc   %o3, 1, %o3
 855         ldx     [%o1], %o4
 856         add     %o1, 64, %o1
 857         EX_ST(STORE_ASI(%o4, %o0))
 858         bgu,pt  %XCC, .Lalign_loop_short
 859          add    %o0, 64, %o0
 860
 861         sub     %o1, %o5, %o1           /* reset %o1  */
 862         sub     %o0, %o5, %o0           /* reset %o0  */
 863
 864         sub     %o0, 8, %o0             /* adjust %o0 for ASI alignment  */
 865 .Lalign_short_rest:
 866         ldx     [%o1+8],%o4
 867         add     %o0, 64, %o0
 868         stx     %o4, [%o0-48]
 869         ldx     [%o1+16],%o4
 870         subcc   %o5, 64, %o5
 871         stx     %o4, [%o0-40]
 872         ldx     [%o1+24],%o4
 873         stx     %o4, [%o0-32]
 874         ldx     [%o1+32],%o4
 875         stx     %o4, [%o0-24]
 876         ldx     [%o1+40],%o4
 877         stx     %o4, [%o0-16]
 878         ldx     [%o1+48],%o4
 879         stx     %o4, [%o0-8]
 880         add     %o1, 64, %o1
 881         ldx     [%o1-8],%o4
 882         bgu,pt  %XCC, .Lalign_short_rest
 883          EX_ST(STORE_INIT(%o4,%o0))     /* mark cache line as LRU  */
 884
 885         add     %o0, 8, %o0             /* restore %o0 from ASI alignment  */
 886
 887 .Lalign_done:
 888         cmp     %o2, 0
 889         membar  #StoreStore
 890         bne,pt  %XCC, .Lmedl63
 891          subcc  %o2, 63, %o2            /* adjust length to allow cc test  */
 892         retl
 893          mov    EX_RETVAL(%g1), %o0     /* restore %o0  */
 894
 895         .align 16
 896         /* Dst is on 8 byte boundary; src is not; remaining cnt > SMALL_MAX  */
 897         /* Since block load/store and BIS are not in use for unaligned data,
 898          * no need to align dst on 64 byte cache line boundary  */
 899 .Lunalignsetup:
 900 .Lunalignrejoin:
 901         rd      %fprs, %g5              /* check for unused fp  */
 902         /* if fprs.fef == 0, set it.
 903          * Setting it when already set costs more than checking */
 904         andcc   %g5, FPRS_FEF, %g5      /* test FEF, fprs.du = fprs.dl = 0  */
 905         bz,a    %XCC, 1f
 906          wr     %g0, FPRS_FEF, %fprs    /* fprs.fef = 1  */
 907 1:
 908         andn    %o2, 0x3f, %o5          /* %o5 is multiple of block size  */
 909         and     %o2, 0x3f, %o2          /* residue bytes in %o2  */
 910         cmp     %o2, 8                  /* Insure we do not load beyond  */
 911         bgt,pt  %XCC, .Lunalign_adjust  /* end of source buffer  */
 912          andn   %o1, 0x7, %o4           /* %o4 has 8 byte aligned src addr  */
 913         add     %o2, 64, %o2            /* adjust to leave loop  */
 914         sub     %o5, 64, %o5            /* early if necessary  */
 915 .Lunalign_adjust:
 916         alignaddr %o1, %g0, %g0         /* generate %gsr  */
 917         add     %o1, %o5, %o1           /* advance %o1 to after blocks  */
 918         ldd     [%o4], %f0
 919 .Lunalign_loop:
 920         prefetch [%o0 + (9 * BLOCK_SIZE)], 20
 921         ldd     [%o4+8], %f2
 922         faligndata %f0, %f2, %f16
 923         ldd     [%o4+16], %f4
 924         subcc   %o5, BLOCK_SIZE, %o5
 925         std     %f16, [%o0]
 926         faligndata %f2, %f4, %f18
 927         ldd     [%o4+24], %f6
 928         std     %f18, [%o0+8]
 929         faligndata %f4, %f6, %f20
 930         ldd     [%o4+32], %f8
 931         std     %f20, [%o0+16]
 932         faligndata %f6, %f8, %f22
 933         ldd     [%o4+40], %f10
 934         std     %f22, [%o0+24]
 935         faligndata %f8, %f10, %f24
 936         ldd     [%o4+48], %f12
 937         std     %f24, [%o0+32]
 938         faligndata %f10, %f12, %f26
 939         ldd     [%o4+56], %f14
 940         add     %o4, BLOCK_SIZE, %o4
 941         std     %f26, [%o0+40]
 942         faligndata %f12, %f14, %f28
 943         ldd     [%o4], %f0
 944         std     %f28, [%o0+48]
 945         faligndata %f14, %f0, %f30
 946         std     %f30, [%o0+56]
 947         add     %o0, BLOCK_SIZE, %o0
 948         bgu,pt  %XCC, .Lunalign_loop
 949          prefetch [%o4 + (11 * BLOCK_SIZE)], 20
 950
 951         /* Handle trailing bytes, 64 to 127
 952          * Dest long word aligned, Src not long word aligned  */
 953         cmp     %o2, 15
 954         bleu,pt %XCC, .Lunalign_short
 955
 956          andn   %o2, 0x7, %o5           /* %o5 is multiple of 8  */
 957         and     %o2, 0x7, %o2           /* residue bytes in %o2  */
 958         add     %o2, 8, %o2
 959         sub     %o5, 8, %o5             /* do not load past end of src  */
 960         andn    %o1, 0x7, %o4           /* %o4 has 8 byte aligned src addr  */
 961         add     %o1, %o5, %o1           /* move %o1 to after multiple of 8  */
 962         ldd     [%o4], %f0              /* fetch partial word  */
 963 .Lunalign_by8:
 964         ldd     [%o4+8], %f2
 965         add     %o4, 8, %o4
 966         faligndata %f0, %f2, %f16
 967         subcc   %o5, 8, %o5
 968         std     %f16, [%o0]
 969         fsrc2   %f2, %f0
 970         bgu,pt  %XCC, .Lunalign_by8
 971          add    %o0, 8, %o0
 972
 973 .Lunalign_short:                        /* restore fprs state */
 974         brnz,pt %g5, .Lsmallrest
 975          nop
 976         ba      .Lsmallrest
 977          wr     %g5, %g0, %fprs
 978 END(__memcpy_niagara7)
 979
 980 #endif