]> git.ipfire.org Git - thirdparty/glibc.git/blob - sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S
a1c558840a7127a423c1cdff1d393d1b577d537f
[thirdparty/glibc.git] / sysdeps / sparc / sparc64 / multiarch / memcpy-niagara1.S
1 /* Copy SIZE bytes from SRC to DEST. For SUN4V Niagara.
2 Copyright (C) 2006-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by David S. Miller (davem@davemloft.net)
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 #include <sysdep.h>
21
/* Address Space Identifiers (SUN4V/Niagara).  Per the standard SPARC V9
   ASI encodings: 0x80 = primary, 0x82 = primary non-faulting, and 0xe2
   = the Niagara block-init quad-load/store primary ASI, whose stores
   initialize a whole L2 cache line without reading it from memory.  */
22 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
23 #define ASI_P 0x80
24 #define ASI_PNF 0x82
25
/* LOAD: normal load of the given type from the primary address space.  */
26 #define LOAD(type,addr,dest) type##a [addr] ASI_P, dest
/* LOAD_TWIN: 16-byte load into the register pair starting at dest0.
   Only dest0 appears in the expansion; per ldda pair semantics the
   second register is implied, so dest1 is documentation only.
   addr_reg must be 16-byte aligned.  */
27 #define LOAD_TWIN(addr_reg,dest0,dest1) \
28 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
29
/* STORE: plain store.  STORE_INIT: store through the %asi register,
   which the block-copy code sets to ASI_BLK_INIT_QUAD_LDD_P so that
   destination cache lines are initialized rather than fetched.  */
30 #define STORE(type,src,addr) type src, [addr]
31 #define STORE_INIT(src,addr) stxa src, [addr] %asi
32
/* Default to the 64-bit condition codes; when nothing overrode XCC we
   also have branch-on-register available (USE_BPR).  */
33 #ifndef XCC
34 #define USE_BPR
35 #define XCC xcc
36 #endif
37
38 #if IS_IN (libc)
39
/* Declare the application-reserved globals we clobber as scratch so the
   assembler accepts their use under the V9 ABI.
   NOTE(review): %g6 is declared here but does not appear to be
   referenced in this file -- confirm.  */
40 .register %g2,#scratch
41 .register %g3,#scratch
42 .register %g6,#scratch
43
44 .text
45
/* void *__mempcpy_niagara1 (void *dst, const void *src, size_t len)
 * mempcpy returns dst + len instead of dst.  Compute that return value
 * into %g5 in the branch delay slot, then join the shared memcpy body
 * at label 101 (the common epilogue returns %g5).  */
46 ENTRY(__mempcpy_niagara1)
47 ba,pt %XCC, 101f ! join __memcpy_niagara1 past its "mov %o0, %g5"
48 add %o0, %o2, %g5 ! delay slot: %g5 = dst + len (return value)
49 END(__mempcpy_niagara1)
50
51 .align 32
/* void *__memcpy_niagara1 (void *dst, const void *src, size_t len)
 *
 * In:    %o0 = dst, %o1 = src, %o2 = len
 * Out:   %o0 = value saved in %g5 (dst for memcpy; __mempcpy_niagara1
 *        enters at 101 below with %g5 = dst + len already computed).
 * Scratch: %o3-%o5, %g1-%g3, the %asi register, condition codes.
 *
 * Dispatch: len < 16 -> 80 (word/byte copy); 16 <= len < 128 -> 70;
 * len >= 128 -> align dst to 64 bytes, then a block-init store loop.
 */
52 ENTRY(__memcpy_niagara1)
53 100: /* %o0=dst, %o1=src, %o2=len */
54 mov %o0, %g5 ! memcpy returns dst: save it for the epilogue
/* Common entry; __mempcpy_niagara1 branches here with %g5 preset.  */
55 101:
56 # ifndef USE_BPR
57 srl %o2, 0, %o2 ! no USE_BPR: len is 32-bit, clear the upper half
58 # endif
59 cmp %o2, 0
60 be,pn %XCC, 85f ! len == 0: nothing to copy, return
61 218: or %o0, %o1, %o3 ! delay slot: %o3 = dst | src (alignment probe)
62 cmp %o2, 16
63 blu,a,pn %XCC, 80f ! len < 16: tiny copy (",a": slot runs only if taken)
64 or %o3, %o2, %o3 ! fold len into the probe for 80's word check
65
66 /* 2 blocks (128 bytes) is the minimum we can do the block
67 * copy with. We need to ensure that we'll iterate at least
68 * once in the block copy loop. At worst we'll need to align
69 * the destination to a 64-byte boundary which can chew up
70 * to (64 - 1) bytes from the length before we perform the
71 * block copy loop.
72 */
73 cmp %o2, (2 * 64)
74 blu,pt %XCC, 70f ! len < 128: medium-sized copy
75 andcc %o3, 0x7, %g0 ! delay slot: 8-byte-alignment flags for 70
76
77 /* %o0: dst
78 * %o1: src
79 * %o2: len (known to be >= 128)
80 *
81 * The block copy loops will use %o4/%o5,%g2/%g3 as
82 * temporaries while copying the data.
83 */
84
85 LOAD(prefetch, %o1, #one_read)
86 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi ! STORE_INIT goes through %asi
87
88 /* Align destination on 64-byte boundary. */
89 andcc %o0, (64 - 1), %o4
90 be,pt %XCC, 2f
91 sub %o4, 64, %o4
92 sub %g0, %o4, %o4 ! bytes to align dst
93 sub %o2, %o4, %o2
/* Byte-copy until dst reaches the next 64-byte boundary.  */
94 1: subcc %o4, 1, %o4
95 LOAD(ldub, %o1, %g1)
96 STORE(stb, %g1, %o0)
97 add %o1, 1, %o1
98 bne,pt %XCC, 1b
99 add %o0, 1, %o0
100
101 /* If the source is on a 16-byte boundary we can do
102 * the direct block copy loop. If it is 8-byte aligned
103 * we can do the 16-byte loads offset by -8 bytes and the
104 * init stores offset by one register.
105 *
106 * If the source is not even 8-byte aligned, we need to do
107 * shifting and masking (basically integer faligndata).
108 *
109 * The careful bit with init stores is that if we store
110 * to any part of the cache line we have to store the whole
111 * cacheline else we can end up with corrupt L2 cache line
112 * contents. Since the loop works on 64-bytes of 64-byte
113 * aligned store data at a time, this is easy to ensure.
114 */
115 2:
116 andcc %o1, (16 - 1), %o4 ! src offset within 16 bytes
117 andn %o2, (64 - 1), %g1 ! block copy loop iterator
118 sub %o2, %g1, %o2 ! final sub-block copy bytes
119 be,pt %XCC, 50f ! src 16-byte aligned: direct twin-load loop
120 cmp %o4, 8 ! delay slot: is src 8-byte aligned instead?
121 be,a,pt %XCC, 10f ! yes: offset twin-load loop
122 sub %o1, 0x8, %o1 ! (annulled slot) bias src by -8 for 10f
123
124 /* Neither 8-byte nor 16-byte aligned, shift and mask. */
125 mov %g1, %o4 ! %o4 = block byte count
126 and %o1, 0x7, %g1
127 sll %g1, 3, %g1 ! %g1 = left shift in bits = (src & 7) * 8
128 mov 64, %o3
129 andn %o1, 0x7, %o1 ! round src down to 8-byte boundary
130 LOAD(ldx, %o1, %g2)
131 sub %o3, %g1, %o3 ! %o3 = complementary right shift
132 sllx %g2, %g1, %g2 ! prime the carried-over high part
133
/* Emit one 8-byte store: merge the bits carried over from the previous
 * aligned word (PRE_VAL) with the top of the freshly loaded word -- an
 * integer version of faligndata.  */
134 #define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
135 LOAD(ldx, SRC, TMP1); \
136 srlx TMP1, PRE_SHIFT, TMP2; \
137 or TMP2, PRE_VAL, TMP2; \
138 STORE_INIT(TMP2, DST); \
139 sllx TMP1, POST_SHIFT, PRE_VAL;
140
141 1: add %o1, 0x8, %o1
142 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
143 add %o1, 0x8, %o1
144 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
145 add %o1, 0x8, %o1
146 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
147 add %o1, 0x8, %o1
148 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
149 add %o1, 32, %o1 ! hop ahead to prefetch the next block
150 LOAD(prefetch, %o1, #one_read)
151 sub %o1, 32 - 8, %o1
152 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
153 add %o1, 8, %o1
154 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
155 add %o1, 8, %o1
156 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
157 add %o1, 8, %o1
158 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
159 subcc %o4, 64, %o4
160 bne,pt %XCC, 1b
161 add %o0, 64, %o0
162
163 #undef SWIVEL_ONE_DWORD
164
165 srl %g1, 3, %g1 ! bits back to bytes...
166 ba,pt %XCC, 60f
167 add %o1, %g1, %o1 ! ...and restore the true unaligned src
168
169 10: /* Destination is 64-byte aligned, source was only 8-byte
170 * aligned but it has been subtracted by 8 and we perform
171 * one twin load ahead, then add 8 back into source when
172 * we finish the loop.
173 */
174 LOAD_TWIN(%o1, %o4, %o5)
175 1: add %o1, 16, %o1
176 LOAD_TWIN(%o1, %g2, %g3)
177 add %o1, 16 + 32, %o1
178 LOAD(prefetch, %o1, #one_read)
179 sub %o1, 32, %o1
180 STORE_INIT(%o5, %o0 + 0x00) ! initializes cache line
181 STORE_INIT(%g2, %o0 + 0x08)
182 LOAD_TWIN(%o1, %o4, %o5)
183 add %o1, 16, %o1
184 STORE_INIT(%g3, %o0 + 0x10)
185 STORE_INIT(%o4, %o0 + 0x18)
186 LOAD_TWIN(%o1, %g2, %g3)
187 add %o1, 16, %o1
188 STORE_INIT(%o5, %o0 + 0x20)
189 STORE_INIT(%g2, %o0 + 0x28)
190 LOAD_TWIN(%o1, %o4, %o5)
191 STORE_INIT(%g3, %o0 + 0x30)
192 STORE_INIT(%o4, %o0 + 0x38)
193 subcc %g1, 64, %g1
194 bne,pt %XCC, 1b
195 add %o0, 64, %o0
196
197 ba,pt %XCC, 60f
198 add %o1, 0x8, %o1 ! delay slot: undo the -8 bias from above
199
200 50: /* Destination is 64-byte aligned, and source is 16-byte
201 * aligned.
202 */
203 1: LOAD_TWIN(%o1, %o4, %o5)
204 add %o1, 16, %o1
205 LOAD_TWIN(%o1, %g2, %g3)
206 add %o1, 16 + 32, %o1
207 LOAD(prefetch, %o1, #one_read)
208 sub %o1, 32, %o1
209 STORE_INIT(%o4, %o0 + 0x00) ! initializes cache line
210 STORE_INIT(%o5, %o0 + 0x08)
211 LOAD_TWIN(%o1, %o4, %o5)
212 add %o1, 16, %o1
213 STORE_INIT(%g2, %o0 + 0x10)
214 STORE_INIT(%g3, %o0 + 0x18)
215 LOAD_TWIN(%o1, %g2, %g3)
216 add %o1, 16, %o1
217 STORE_INIT(%o4, %o0 + 0x20)
218 STORE_INIT(%o5, %o0 + 0x28)
219 STORE_INIT(%g2, %o0 + 0x30)
220 STORE_INIT(%g3, %o0 + 0x38)
221 subcc %g1, 64, %g1
222 bne,pt %XCC, 1b
223 add %o0, 64, %o0
224 /* fall through */
225
226 60:
227 /* %o2 contains any final bytes still needed to be copied
228 * over. If anything is left, we copy it one byte at a time.
229 */
230 wr %g0, ASI_PNF, %asi ! NOTE(review): presumably restores the expected default %asi -- confirm
231 brz,pt %o2, 85f ! no tail bytes: return
232 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src for the byte loop
233 ba,a,pt %XCC, 90f
234
235 .align 64
236 70: /* 16 < len <= 64 */
/* NOTE(review): this path is entered for 16 <= len < 128, so the range
 * in the comment above looks stale -- confirm.  */
237 bne,pn %XCC, 75f ! flags from "andcc %o3, 0x7": dst|src unaligned
238 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src
239
240 72:
/* dst and src mutually 8-byte aligned: move 16 bytes per iteration;
 * stores address dst as %o1 + %o3.  */
241 andn %o2, 0xf, %o4 ! %o4 = bytes handled in 16-byte chunks
242 and %o2, 0xf, %o2 ! %o2 = leftover 0-15 bytes
243 1: subcc %o4, 0x10, %o4
244 LOAD(ldx, %o1, %o5)
245 add %o1, 0x08, %o1
246 LOAD(ldx, %o1, %g1)
247 sub %o1, 0x08, %o1
248 STORE(stx, %o5, %o1 + %o3)
249 add %o1, 0x8, %o1
250 STORE(stx, %g1, %o1 + %o3)
251 bgu,pt %XCC, 1b
252 add %o1, 0x8, %o1
/* Tail: at most one 8-byte word, one 4-byte word, then odd bytes.  */
253 73: andcc %o2, 0x8, %g0
254 be,pt %XCC, 1f
255 nop
256 sub %o2, 0x8, %o2
257 LOAD(ldx, %o1, %o5)
258 STORE(stx, %o5, %o1 + %o3)
259 add %o1, 0x8, %o1
260 1: andcc %o2, 0x4, %g0
261 be,pt %XCC, 1f
262 nop
263 sub %o2, 0x4, %o2
264 LOAD(lduw, %o1, %o5)
265 STORE(stw, %o5, %o1 + %o3)
266 add %o1, 0x4, %o1
267 1: cmp %o2, 0
268 be,pt %XCC, 85f ! nothing left: return
269 nop
270 ba,pt %XCC, 90f ! 1-3 odd bytes: byte loop
271 nop
272
273 75:
/* dst and/or src not 8-byte aligned: byte-copy until dst is aligned,
 * then rejoin 72/73 if src became aligned too, otherwise fall into the
 * shift-and-mask loop at 8f below.  */
274 andcc %o0, 0x7, %g1
275 sub %g1, 0x8, %g1
276 be,pn %icc, 2f ! dst already 8-byte aligned
277 sub %g0, %g1, %g1 ! delay slot: %g1 = 8 - (dst & 7) bytes to copy
278 sub %o2, %g1, %o2
279
280 1: subcc %g1, 1, %g1
281 LOAD(ldub, %o1, %o5)
282 STORE(stb, %o5, %o1 + %o3)
283 bgu,pt %icc, 1b
284 add %o1, 1, %o1
285
286 2: add %o1, %o3, %o0 ! recompute dst = src + (dst - src)
287 andcc %o1, 0x7, %g1
288 bne,pt %icc, 8f ! src still unaligned: shift-and-mask copy
289 sll %g1, 3, %g1 ! delay slot: %g1 = (src & 7) * 8 bit shift
290
291 cmp %o2, 16
292 bgeu,pt %icc, 72b ! both aligned now and >= 16 left: bulk loop
293 nop
294 ba,a,pt %XCC, 73b ! < 16 left: straight to the tail
295
/* Sub-block version of the SWIVEL loop: load aligned 8-byte words and
 * merge adjacent halves to reconstruct the unaligned source stream.  */
296 8: mov 64, %o3
297 andn %o1, 0x7, %o1 ! round src down to 8-byte boundary
298 LOAD(ldx, %o1, %g2)
299 sub %o3, %g1, %o3 ! %o3 = complementary right shift (bits)
300 andn %o2, 0x7, %o4 ! %o4 = whole 8-byte words to produce
301 sllx %g2, %g1, %g2
302 1: add %o1, 0x8, %o1
303 LOAD(ldx, %o1, %g3)
304 subcc %o4, 0x8, %o4
305 srlx %g3, %o3, %o5
306 or %o5, %g2, %o5
307 STORE(stx, %o5, %o0)
308 add %o0, 0x8, %o0
309 bgu,pt %icc, 1b
310 sllx %g3, %g1, %g2 ! delay slot: carry low bits into next word
311
312 srl %g1, 3, %g1 ! bits back to bytes...
313 andcc %o2, 0x7, %o2 ! leftover 0-7 bytes
314 be,pn %icc, 85f ! none: return
315 add %o1, %g1, %o1 ! delay slot: restore true unaligned src
316 ba,pt %XCC, 90f
317 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src for byte loop
318
319 .align 64
320 80: /* 0 < len <= 16 */
321 andcc %o3, 0x3, %g0 ! %o3 = dst|src|len: all multiples of 4?
322 bne,pn %XCC, 90f ! no: byte-at-a-time
323 sub %o0, %o1, %o3 ! delay slot: %o3 = dst - src
324
/* Word loop; len is a nonzero multiple of 4 here.  */
325 1:
326 subcc %o2, 4, %o2
327 LOAD(lduw, %o1, %g1)
328 STORE(stw, %g1, %o1 + %o3)
329 bgu,pt %XCC, 1b
330 add %o1, 4, %o1
331
/* Common return: hand back the value stashed in %g5 (dst, or
 * dst + len for mempcpy).  */
332 85: retl
333 mov %g5, %o0
334
335 .align 32
/* Byte-at-a-time copy of the remaining %o2 (> 0) bytes; %o3 holds
 * dst - src.  */
336 90:
337 subcc %o2, 1, %o2
338 LOAD(ldub, %o1, %g1)
339 STORE(stb, %g1, %o1 + %o3)
340 bgu,pt %XCC, 90b
341 add %o1, 1, %o1
342 retl
343 mov %g5, %o0
344
345 END(__memcpy_niagara1)
346
347 #endif