sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S

   1 /* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
   2    Copyright (C) 2006, 2008 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by David S. Miller (davem@davemloft.net)
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <sysdep.h>
  22
  23 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2    /* ASI named by LOAD_TWIN.  */
  24 #define ASI_P                   0x80    /* ASI named by every LOAD.  */
  25 #define ASI_PNF                 0x82
  26
  27 #define LOAD(insn,mem,reg)      insn##a [mem] ASI_P, reg
  28 #define LOAD_TWIN(base,reg0,reg1) \
  29         ldda [base] ASI_BLK_INIT_QUAD_LDD_P, reg0       /* reg1 is not used in the expansion.  */
  30
  31 #define STORE(insn,reg,mem)     insn reg, [mem]
  32 #define STORE_INIT(reg,mem)     stxa reg, [mem] %asi    /* Uses whatever ASI is currently in %asi.  */
  33
  34 #if !defined XCC
  35 # define USE_BPR
  36 # define XCC xcc
  37 #endif
  38
  39 #if !defined NOT_IN_libc
  40
  41         .register       %g2,#scratch
  42         .register       %g3,#scratch
  43         .register       %g6,#scratch
  44
  45         .text
  46 /* In: %o0=dst, %o1=src, %o2=len.  Out: %o0 = the dst value seen on entry (kept in %g5).  */
  47         .align          32
  48 ENTRY(__memcpy_niagara1)
  49 # ifndef USE_BPR
  50         srl             %o2, 0, %o2             ! keep only the low 32 bits of len
  51 # endif
  52 100:    /* %o0=dst, %o1=src, %o2=len */
  53         mov             %o0, %g5                ! remember dst for the return value
  54         cmp             %o2, 0
  55         be,pn           %XCC, 85f               ! len == 0, just return
  56 218:     or             %o0, %o1, %o3           ! (delay slot) %o3 = dst | src
  57         cmp             %o2, 16
  58         blu,a,pn        %XCC, 80f               ! len < 16
  59          or             %o3, %o2, %o3           ! (annulled slot, taken path only) fold len into %o3
  60
  61         /* 2 blocks (128 bytes) is the minimum we can do the block
  62          * copy with.  We need to ensure that we'll iterate at least
  63          * once in the block copy loop.  At worst we'll need to align
  64          * the destination to a 64-byte boundary which can chew up
  65          * to (64 - 1) bytes from the length before we perform the
  66          * block copy loop.
  67          */
  68         cmp             %o2, (2 * 64)
  69         blu,pt          %XCC, 70f
  70          andcc          %o3, 0x7, %g0           ! (delay slot) flags for 70, is dst | src 8-byte aligned
  71
  72         /* %o0: dst
  73          * %o1: src
  74          * %o2: len  (known to be >= 128)
  75          *
  76          * The block copy loops will use %o4/%o5,%g2/%g3 as
  77          * temporaries while copying the data.
  78          */
  79
  80         LOAD(prefetch, %o1, #one_read)
  81         wr              %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi      /* STORE_INIT goes through %asi from here on.  */
  82
  83         /* Align destination on 64-byte boundary.  */
  84         andcc           %o0, (64 - 1), %o4
  85         be,pt           %XCC, 2f
  86          sub            %o4, 64, %o4            ! (delay slot) value is recomputed at 2 when the branch is taken
  87         sub             %g0, %o4, %o4   ! bytes to align dst
  88         sub             %o2, %o4, %o2
  89 1:      subcc           %o4, 1, %o4
  90         LOAD(ldub, %o1, %g1)
  91         STORE(stb, %g1, %o0)
  92         add             %o1, 1, %o1
  93         bne,pt          %XCC, 1b
  94          add            %o0, 1, %o0
  95
  96         /* If the source is on a 16-byte boundary we can do
  97          * the direct block copy loop.  If it is 8-byte aligned
  98          * we can do the 16-byte loads offset by -8 bytes and the
  99          * init stores offset by one register.
 100          *
 101          * If the source is not even 8-byte aligned, we need to do
 102          * shifting and masking (basically integer faligndata).
 103          *
 104          * The careful bit with init stores is that if we store
 105          * to any part of the cache line we have to store the whole
 106          * cacheline else we can end up with corrupt L2 cache line
 107          * contents.  Since the loop works on 64-bytes of 64-byte
 108          * aligned store data at a time, this is easy to ensure.
 109          */
 110 2:
 111         andcc           %o1, (16 - 1), %o4
 112         andn            %o2, (64 - 1), %g1      ! block copy loop iterator
 113         sub             %o2, %g1, %o2           ! final sub-block copy bytes
 114         be,pt           %XCC, 50f               ! src is 16-byte aligned
 115          cmp            %o4, 8
 116         be,a,pt         %XCC, 10f               ! src is 8-byte aligned
 117          sub            %o1, 0x8, %o1
 118
 119         /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
 120         mov             %g1, %o4                ! byte count moves to %o4, %g1 becomes the left shift
 121         and             %o1, 0x7, %g1
 122         sll             %g1, 3, %g1
 123         mov             64, %o3
 124         andn            %o1, 0x7, %o1
 125         LOAD(ldx, %o1, %g2)
 126         sub             %o3, %g1, %o3           ! %o3 = 64 - left shift, the right shift
 127         sllx            %g2, %g1, %g2
 128
 129 #define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
 130         LOAD(ldx, SRC, TMP1); \
 131         srlx            TMP1, PRE_SHIFT, TMP2; \
 132         or              TMP2, PRE_VAL, TMP2; \
 133         STORE_INIT(TMP2, DST); \
 134         sllx            TMP1, POST_SHIFT, PRE_VAL;
 135
 136 1:      add             %o1, 0x8, %o1
 137         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
 138         add             %o1, 0x8, %o1
 139         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
 140         add             %o1, 0x8, %o1
 141         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
 142         add             %o1, 0x8, %o1
 143         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
 144         add             %o1, 32, %o1
 145         LOAD(prefetch, %o1, #one_read)
 146         sub             %o1, 32 - 8, %o1
 147         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
 148         add             %o1, 8, %o1
 149         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
 150         add             %o1, 8, %o1
 151         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
 152         add             %o1, 8, %o1
 153         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
 154         subcc           %o4, 64, %o4
 155         bne,pt          %XCC, 1b
 156          add            %o0, 64, %o0
 157
 158 #undef SWIVEL_ONE_DWORD
 159
 160         srl             %g1, 3, %g1             ! bit shift back to a byte offset
 161         ba,pt           %XCC, 60f
 162          add            %o1, %g1, %o1           ! (delay slot) put back the low 3 bits masked off src
 163
 164 10:     /* Destination is 64-byte aligned, source was only 8-byte
 165          * aligned but it has been subtracted by 8 and we perform
 166          * one twin load ahead, then add 8 back into source when
 167          * we finish the loop.
 168          */
 169         LOAD_TWIN(%o1, %o4, %o5)
 170 1:      add             %o1, 16, %o1
 171         LOAD_TWIN(%o1, %g2, %g3)
 172         add             %o1, 16 + 32, %o1
 173         LOAD(prefetch, %o1, #one_read)
 174         sub             %o1, 32, %o1
 175         STORE_INIT(%o5, %o0 + 0x00)             ! initializes cache line
 176         STORE_INIT(%g2, %o0 + 0x08)
 177         LOAD_TWIN(%o1, %o4, %o5)
 178         add             %o1, 16, %o1
 179         STORE_INIT(%g3, %o0 + 0x10)
 180         STORE_INIT(%o4, %o0 + 0x18)
 181         LOAD_TWIN(%o1, %g2, %g3)
 182         add             %o1, 16, %o1
 183         STORE_INIT(%o5, %o0 + 0x20)
 184         STORE_INIT(%g2, %o0 + 0x28)
 185         LOAD_TWIN(%o1, %o4, %o5)
 186         STORE_INIT(%g3, %o0 + 0x30)
 187         STORE_INIT(%o4, %o0 + 0x38)
 188         subcc           %g1, 64, %g1
 189         bne,pt          %XCC, 1b
 190          add            %o0, 64, %o0
 191
 192         ba,pt           %XCC, 60f
 193          add            %o1, 0x8, %o1
 194
 195 50:     /* Destination is 64-byte aligned, and source is 16-byte
 196          * aligned.
 197          */
 198 1:      LOAD_TWIN(%o1, %o4, %o5)
 199         add     %o1, 16, %o1
 200         LOAD_TWIN(%o1, %g2, %g3)
 201         add     %o1, 16 + 32, %o1
 202         LOAD(prefetch, %o1, #one_read)
 203         sub     %o1, 32, %o1
 204         STORE_INIT(%o4, %o0 + 0x00)             ! initializes cache line
 205         STORE_INIT(%o5, %o0 + 0x08)
 206         LOAD_TWIN(%o1, %o4, %o5)
 207         add     %o1, 16, %o1
 208         STORE_INIT(%g2, %o0 + 0x10)
 209         STORE_INIT(%g3, %o0 + 0x18)
 210         LOAD_TWIN(%o1, %g2, %g3)
 211         add     %o1, 16, %o1
 212         STORE_INIT(%o4, %o0 + 0x20)
 213         STORE_INIT(%o5, %o0 + 0x28)
 214         STORE_INIT(%g2, %o0 + 0x30)
 215         STORE_INIT(%g3, %o0 + 0x38)
 216         subcc   %g1, 64, %g1
 217         bne,pt  %XCC, 1b
 218          add    %o0, 64, %o0
 219         /* fall through */
 220
 221 60:     membar          #Sync                   /* Let the block-init stores complete before anything that follows.  */
 222         /* %o2 contains any final bytes still needed to be copied
 223          * over. If anything is left, we copy it one byte at a time.
 224          */
 225         wr              %g0, ASI_PNF, %asi
 226         brz,pt          %o2, 85f
 227          sub            %o0, %o1, %o3           ! (delay slot) %o3 = dst - src, used by the byte loop at 90
 228         ba,a,pt         %XCC, 90f
 229
 230         .align          64
 231 70: /* 16 <= len < 128 */
 232         bne,pn          %XCC, 75f               ! dst | src not 8-byte aligned
 233          sub            %o0, %o1, %o3           ! (delay slot) %o3 = dst - src, dst is addressed as %o1 + %o3
 234
 235 72:
 236         andn            %o2, 0xf, %o4           ! bytes handled 16 at a time
 237         and             %o2, 0xf, %o2           ! bytes left for 73 and below
 238 1:      subcc           %o4, 0x10, %o4
 239         LOAD(ldx, %o1, %o5)
 240         add             %o1, 0x08, %o1
 241         LOAD(ldx, %o1, %g1)
 242         sub             %o1, 0x08, %o1
 243         STORE(stx, %o5, %o1 + %o3)
 244         add             %o1, 0x8, %o1
 245         STORE(stx, %g1, %o1 + %o3)
 246         bgu,pt          %XCC, 1b
 247          add            %o1, 0x8, %o1
 248 73:     andcc           %o2, 0x8, %g0
 249         be,pt           %XCC, 1f
 250          nop
 251         sub             %o2, 0x8, %o2
 252         LOAD(ldx, %o1, %o5)
 253         STORE(stx, %o5, %o1 + %o3)
 254         add             %o1, 0x8, %o1
 255 1:      andcc           %o2, 0x4, %g0
 256         be,pt           %XCC, 1f
 257          nop
 258         sub             %o2, 0x4, %o2
 259         LOAD(lduw, %o1, %o5)
 260         STORE(stw, %o5, %o1 + %o3)
 261         add             %o1, 0x4, %o1
 262 1:      cmp             %o2, 0
 263         be,pt           %XCC, 85f
 264          nop
 265         ba,pt           %XCC, 90f
 266          nop
 267
 268 75:
 269         andcc           %o0, 0x7, %g1
 270         sub             %g1, 0x8, %g1
 271         be,pn           %icc, 2f                ! dst already 8-byte aligned
 272          sub            %g0, %g1, %g1           ! (delay slot) %g1 = 8 - (dst & 7)
 273         sub             %o2, %g1, %o2
 274
 275 1:      subcc           %g1, 1, %g1
 276         LOAD(ldub, %o1, %o5)
 277         STORE(stb, %o5, %o1 + %o3)
 278         bgu,pt          %icc, 1b
 279          add            %o1, 1, %o1
 280
 281 2:      add             %o1, %o3, %o0           ! rebuild dst from src + (dst - src)
 282         andcc           %o1, 0x7, %g1
 283         bne,pt          %icc, 8f                ! src not 8-byte aligned, shift and mask
 284          sll            %g1, 3, %g1
 285
 286         cmp             %o2, 16
 287         bgeu,pt         %icc, 72b
 288          nop
 289         ba,a,pt         %XCC, 73b
 290
 291 8:      mov             64, %o3
 292         andn            %o1, 0x7, %o1
 293         LOAD(ldx, %o1, %g2)
 294         sub             %o3, %g1, %o3
 295         andn            %o2, 0x7, %o4
 296         sllx            %g2, %g1, %g2
 297 1:      add             %o1, 0x8, %o1
 298         LOAD(ldx, %o1, %g3)
 299         subcc           %o4, 0x8, %o4
 300         srlx            %g3, %o3, %o5
 301         or              %o5, %g2, %o5
 302         STORE(stx, %o5, %o0)
 303         add             %o0, 0x8, %o0
 304         bgu,pt          %icc, 1b
 305          sllx           %g3, %g1, %g2
 306
 307         srl             %g1, 3, %g1
 308         andcc           %o2, 0x7, %o2
 309         be,pn           %icc, 85f
 310          add            %o1, %g1, %o1
 311         ba,pt           %XCC, 90f
 312          sub            %o0, %o1, %o3
 313
 314         .align          64
 315 80: /* 0 < len < 16 */
 316         andcc           %o3, 0x3, %g0           ! %o3 = dst | src | len here
 317         bne,pn          %XCC, 90f               ! something is not a multiple of 4, copy bytewise
 318          sub            %o0, %o1, %o3
 319
 320 1:
 321         subcc           %o2, 4, %o2
 322         LOAD(lduw, %o1, %g1)
 323         STORE(stw, %g1, %o1 + %o3)
 324         bgu,pt          %XCC, 1b
 325          add            %o1, 4, %o1
 326
 327 85:     retl
 328          mov            %g5, %o0                ! (delay slot) return the saved dst
 329
 330         .align          32
 331 90:
 332         subcc           %o2, 1, %o2
 333         LOAD(ldub, %o1, %g1)
 334         STORE(stb, %g1, %o1 + %o3)
 335         bgu,pt          %XCC, 90b
 336          add            %o1, 1, %o1
 337         retl
 338          mov            %g5, %o0
 339
 340 END(__memcpy_niagara1)
 341
 342 #endif