sysdeps/powerpc/powerpc64/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC64.
   2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  22    Returns 'dst'.
  23
  24    Memcpy handles short copies (< 32-bytes) using binary move blocks
  25    (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
  26    with the appropriate combination of byte and halfword load/stores.
  27    There is minimal effort to optimize the alignment of short moves.
  28    The 64-bit implementations of POWER3 and POWER4 do a reasonable job
  29    of handling unaligned load/stores that do not cross 32-byte boundaries.
  30
  31    Longer moves (>= 32-bytes) justify the effort to get at least the
  32    destination doubleword (8-byte) aligned.  Further optimization is
  33    possible when both source and destination are doubleword aligned.
  34    Each case has an optimized unrolled loop.   */
  35
  36 EALIGN (memcpy, 5, 0)
  37         CALL_MCOUNT 3
  38
  39     cmpldi cr1,5,31     /* cr1 = len compared with 31.  */
  40     neg   0,3           /* r0 = -dst; masked to 3 bits below.  */
  41     std   3,-16(1)      /* Save dst; every exit reloads it as the result.  */
  42     std   31,-8(1)      /* Save r31; it holds the byte count on the long path.  */
  43     cfi_offset(31,-8)
  44     andi. 11,3,7        /* check alignment of dst (r11 = dst & 7, sets cr0).  */
  45     clrldi 0,0,61       /* Number of bytes until the 1st doubleword of dst.  */
  46     clrldi 10,4,61      /* check alignment of src (r10 = src & 7).  */
  47     cmpldi cr6,5,8      /* cr6 = len compared with 8; consumed at .L2.  */
  48     ble-  cr1,.L2       /* If move < 32 bytes use short move code.  */
  49     cmpld cr6,10,11     /* cr6 = src alignment compared with dst alignment.  */
  50     mr    12,4          /* r12 = working source pointer.  */
  51     srdi  9,5,3         /* Number of full double words remaining.  */
  52     mtcrf 0x01,0        /* Low 4 bits of r0 -> CR bits 28-31 for bf below.  */
  53     mr    31,5          /* r31 = bytes left to copy.  */
  54     beq   .L0           /* cr0 from andi.: dst is already DW aligned.  */
  55
  56     subf  31,0,5        /* r31 = len less the alignment bytes moved next.  */
  57   /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
  58 1:  bf    31,2f         /* Skip the byte move unless r0 & 1.  */
  59     lbz   6,0(12)
  60     addi  12,12,1
  61     stb   6,0(3)
  62     addi  3,3,1
  63 2:  bf    30,4f         /* Skip the halfword move unless r0 & 2.  */
  64     lhz   6,0(12)
  65     addi  12,12,2
  66     sth   6,0(3)
  67     addi  3,3,2
  68 4:  bf    29,0f         /* Skip the word move unless r0 & 4.  */
  69     lwz   6,0(12)
  70     addi  12,12,4
  71     stw   6,0(3)
  72     addi  3,3,4
  73 0:
  74     clrldi 10,12,61     /* check alignment of src again.  */
  75     srdi  9,31,3        /* Number of full double words remaining.  */
  76
  77   /* Copy doublewords from source to destination, assuming the
  78      destination is aligned on a doubleword boundary.
  79
  80      At this point we know there are at least 25 bytes left (32-7) to copy.
  81      The next step is to determine if the source is also doubleword aligned.
  82      If not branch to the unaligned move code at .L6, which uses
  83      a load, shift, store strategy.
  84
  85      Otherwise source and destination are doubleword aligned, and we can
  86      use the optimized doubleword copy loop.  */
  87 .L0:
  88     clrldi      11,31,61        /* r11 = bytes left & 7 (size of the tail).  */
  89     mtcrf 0x01,9        /* Low 4 bits of the DW count -> CR bits 28-31.  */
  90     bne-  cr6,.L6   /* If source is not DW aligned.  */
  91
  92   /* Move doublewords where destination and source are DW aligned.
  93      Use an unrolled loop to copy 4 doublewords (32-bytes) per iteration.
  94      If the copy is not an exact multiple of 32 bytes, 1-3
  95      doublewords are copied as needed to set up the main loop.  After
  96      the main loop exits there may be a tail of 1-7 bytes. These bytes are
  97      copied a word/halfword/byte at a time as needed to preserve alignment.  */
  98
  99     srdi  8,31,5        /* r8 = number of 32-byte blocks.  */
 100     cmpldi      cr1,9,4         /* cr1 = DW count compared with 4.  */
 101     cmpldi      cr6,11,0        /* cr6 = tail size compared with 0.  */
 102     mr    11,12         /* r11 = source read pointer for the DW moves.  */
 103
 104     bf    30,1f         /* Skip the two-DW move unless DW count & 2.  */
 105     ld    6,0(12)
 106     ld    7,8(12)
 107     addi  11,12,16
 108     mtctr 8             /* CTR = 32-byte block count for the main loop.  */
 109     std   6,0(3)
 110     std   7,8(3)
 111     addi  10,3,16       /* r10 = destination write pointer.  */
 112     bf    31,4f         /* No third DW unless DW count & 1.  */
 113     ld    0,16(12)
 114     std   0,16(3)
 115     blt   cr1,3f        /* Fewer than 4 DWs in total: skip the main loop.  */
 116     addi  11,12,24
 117     addi  10,3,24
 118     b     4f
 119     .align  4
 120 1:
 121     mr    10,3          /* r10 = destination write pointer.  */
 122     mtctr 8             /* CTR = 32-byte block count for the main loop.  */
 123     bf    31,4f         /* No single DW unless DW count & 1.  */
 124     ld    6,0(12)
 125     addi  11,12,8
 126     std   6,0(3)
 127     addi  10,3,8
 128
 129     .align  4
 130 4:                      /* Main loop: 4 doublewords per iteration.  */
 131     ld    6,0(11)
 132     ld    7,8(11)
 133     ld    8,16(11)
 134     ld    0,24(11)
 135     addi  11,11,32
 136 2:
 137     std   6,0(10)
 138     std   7,8(10)
 139     std   8,16(10)
 140     std   0,24(10)
 141     addi  10,10,32
 142     bdnz  4b
 143 3:
 144
 145     rldicr 0,31,0,60    /* r0 = r31 with the low 3 bits cleared.  */
 146     mtcrf 0x01,31       /* Low 4 bits of r31 -> CR bits 28-31 (tail size).  */
 147     beq   cr6,0f        /* Tail is 0 bytes: go straight to the return.  */
 148 .L9:
 149     add   3,3,0         /* Step dst past the doublewords already stored.  */
 150     add   12,12,0       /* Step src by the same amount.  */
 151
 152 /*  At this point we have a tail of 0-7 bytes and we know that the
 153     destination is double word aligned.  */
 154 4:  bf    29,2f         /* Skip the word move unless tail & 4.  */
 155     lwz   6,0(12)
 156     addi  12,12,4
 157     stw   6,0(3)
 158     addi  3,3,4
 159 2:  bf    30,1f         /* Skip the halfword move unless tail & 2.  */
 160     lhz   6,0(12)
 161     addi  12,12,2
 162     sth   6,0(3)
 163     addi  3,3,2
 164 1:  bf    31,0f         /* Skip the byte move unless tail & 1.  */
 165     lbz   6,0(12)
 166     stb   6,0(3)
 167 0:
 168   /* Return original dst pointer.  */
 169     ld 31,-8(1)
 170     ld 3,-16(1)
 171     blr
 172
 173 /* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and 9-31
 174    bytes.  Each case is handled without loops, using binary (1,2,4,8)
 175    tests.
 176
 177    In the short (0-8 byte) case no attempt is made to force alignment
 178    of either source or destination.  The hardware will handle the
 179    unaligned load/stores with small delays for crossing 32-, 64-byte, and
 180    4096-byte boundaries. Since these short moves are unlikely to be
 181    unaligned or cross these boundaries, the overhead to force
 182    alignment is not justified.
 183
 184    The longer (9-31 byte) move is more likely to cross 32- or 64-byte
 185    boundaries.  Since only loads are sensitive to the 32-/64-byte
 186    boundaries it is more important to align the source than the
 187    destination.  If the source is not already word aligned, we first
 188    move 1-3 bytes as needed.  Since we are only word aligned we don't
 189    use double word load/stores to ensure that all loads are aligned.
 190    While the destination and stores may still be unaligned, this
 191    is only an issue for page (4096 byte boundary) crossing, which
 192    should be rare for these short moves.  The hardware handles this
 193    case automatically with a small delay.  */
 194
 195     .align  4
 196 .L2:
 197     mtcrf 0x01,5        /* Low 4 bits of len -> CR bits 28-31 for bf below.  */
 198     neg   8,4           /* r8 = -src.  */
 199     clrrdi      11,4,2          /* r11 = src rounded down to a word boundary.  */
 200     andi. 0,8,3         /* r0 = bytes until src is word aligned; sets cr0.  */
 201     ble   cr6,.LE8      /* Handle moves of 0-8 bytes.  */
 202 /* At least 9 bytes left.  Get the source word aligned.  */
 203     cmpldi      cr1,5,16        /* cr1 = len compared with 16.  */
 204     mr    10,5          /* r10 = bytes left to copy.  */
 205     mr    12,4          /* r12 = working source pointer.  */
 206     cmpldi      cr6,0,2         /* cr6 = alignment byte count compared with 2.  */
 207     beq   .L3   /* If the source is already word aligned skip this.  */
 208 /* Copy 1-3 bytes to get source address word aligned.  */
 209     lwz   6,0(11)       /* Aligned word that contains the leading src bytes.  */
 210     subf  10,0,5        /* r10 = len less the alignment bytes.  */
 211     add   12,4,0        /* r12 = src advanced to the word boundary.  */
 212     blt   cr6,5f        /* 1 byte to store.  */
 213     srdi  7,6,16        /* r7 = upper halfword of the loaded word.  */
 214     bgt   cr6,3f        /* 3 bytes to store; fall through for 2.  */
 215 #ifdef __LITTLE_ENDIAN__
 216     sth   7,0(3)
 217 #else
 218     sth   6,0(3)
 219 #endif
 220     b     7f
 221     .align  4
 222 3:
 223 #ifdef __LITTLE_ENDIAN__
 224     rotlwi 6,6,24
 225     stb   6,0(3)
 226     sth   7,1(3)
 227 #else
 228     stb   7,0(3)
 229     sth   6,1(3)
 230 #endif
 231     b     7f
 232     .align  4
 233 5:
 234 #ifdef __LITTLE_ENDIAN__
 235     rotlwi 6,6,8
 236 #endif
 237     stb   6,0(3)
 238 7:
 239     cmpldi      cr1,10,16       /* cr1 = bytes left compared with 16.  */
 240     add   3,3,0         /* Step dst past the bytes just stored.  */
 241     mtcrf 0x01,10       /* Low 4 bits of bytes left -> CR bits 28-31.  */
 242     .align  4
 243 .L3:
 244 /* At least 6 bytes left and the source is word aligned.  */
 245     blt   cr1,8f        /* Fewer than 16 bytes left: skip the 16-byte move.  */
 246 16: /* Move 16 bytes.  */
 247     lwz   6,0(12)
 248     lwz   7,4(12)
 249     stw   6,0(3)
 250     lwz   6,8(12)
 251     stw   7,4(3)
 252     lwz   7,12(12)
 253     addi  12,12,16
 254     stw   6,8(3)
 255     stw   7,12(3)
 256     addi  3,3,16
 257 8:  /* Move 8 bytes.  */
 258     bf    28,4f         /* Skip unless bytes left & 8.  */
 259     lwz   6,0(12)
 260     lwz   7,4(12)
 261     addi  12,12,8
 262     stw   6,0(3)
 263     stw   7,4(3)
 264     addi  3,3,8
 265 4:  /* Move 4 bytes.  */
 266     bf    29,2f         /* Skip unless bytes left & 4.  */
 267     lwz   6,0(12)
 268     addi  12,12,4
 269     stw   6,0(3)
 270     addi  3,3,4
 271 2:  /* Move 2-3 bytes.  */
 272     bf    30,1f         /* Skip unless bytes left & 2.  */
 273     lhz   6,0(12)
 274     sth   6,0(3)
 275     bf    31,0f         /* No third byte unless bytes left & 1.  */
 276     lbz   7,2(12)
 277     stb   7,2(3)
 278     ld 3,-16(1)         /* Return original dst pointer.  */
 279     blr
 280 1:  /* Move 1 byte.  */
 281     bf    31,0f         /* Skip unless bytes left & 1.  */
 282     lbz   6,0(12)
 283     stb   6,0(3)
 284 0:
 285   /* Return original dst pointer.  */
 286     ld    3,-16(1)
 287     blr
 288
 289 /* Special case to copy 0-8 bytes.  */
 290     .align  4
 291 .LE8:
 292     mr    12,4          /* r12 = src, as used by the code reached via 2b.  */
 293     bne   cr6,4f        /* cr6 is len compared with 8: fewer than 8 bytes.  */
 294 /* Would have liked to use ld/std here but the 630 processors are
 295    slow for load/store doubles that are not at least word aligned.
 296    Unaligned Load/Store word execute with only a 1 cycle penalty.  */
 297     lwz   6,0(4)
 298     lwz   7,4(4)
 299     stw   6,0(3)
 300     stw   7,4(3)
 301   /* Return original dst pointer.  */
 302     ld    3,-16(1)
 303     blr
 304     .align  4
 305 4:  bf    29,2b         /* Under 4 bytes: finish in the 2-3/1 byte code above.  */
 306     lwz   6,0(4)
 307     stw   6,0(3)
 308 6:
 309     bf    30,5f         /* No halfword at offset 4 unless len & 2.  */
 310     lhz   7,4(4)
 311     sth   7,4(3)
 312     bf    31,0f         /* No byte at offset 6 unless len & 1.  */
 313     lbz   8,6(4)
 314     stb   8,6(3)
 315     ld 3,-16(1)         /* Return original dst pointer.  */
 316     blr
 317     .align  4
 318 5:
 319     bf    31,0f         /* No byte at offset 4 unless len & 1.  */
 320     lbz   6,4(4)
 321     stb   6,4(3)
 322     .align  4
 323 0:
 324   /* Return original dst pointer.  */
 325     ld    3,-16(1)
 326     blr
 327
 328     .align  4
 329 .L6:
 330
 331   /* Copy doublewords where the destination is aligned but the source is
 332      not.  Use aligned doubleword loads from the source, shifted to realign
 333      the data, to allow aligned destination stores.  */
 334     subf  5,10,12       /* r5 = src rounded down to a doubleword boundary.  */
 335     andi. 0,9,1         /* cr0 EQ when the DW count is even.  */
 336     cmpldi cr6,11,0     /* cr6 = tail size compared with 0.  */
 337     sldi  10,10,3       /* r10 = src misalignment in bits.  */
 338     mr    11,9          /* r11 = doublewords left to store.  */
 339     mr    4,3           /* r4 = destination write pointer.  */
 340     ld    6,0(5)
 341     ld    7,8(5)
 342     subfic  9,10,64     /* r9 = 64 - r10, the complementary shift count.  */
 343     beq   2f            /* Even DW count: enter the loop at its top.  */
 344 #ifdef __LITTLE_ENDIAN__
 345     srd   0,6,10
 346 #else
 347     sld   0,6,10
 348 #endif
 349     cmpldi  11,1        /* cr0 EQ if this is the last DW; tested by beq below.  */
 350     mr    6,7
 351     addi  4,4,-8        /* Bias r4 so the store at 8(r4) lands on dst.  */
 352     addi  11,11,-1
 353     b     1f            /* Odd DW count: enter the loop at its midpoint.  */
 354 2:  addi  5,5,8
 355     .align  4
 356 #ifdef __LITTLE_ENDIAN__
 357 0:  srd   0,6,10
 358     sld   8,7,9
 359 #else
 360 0:  sld   0,6,10
 361     srd   8,7,9
 362 #endif
 363     cmpldi  11,2        /* cr0 EQ if these are the last two DWs.  */
 364     ld    6,8(5)
 365     or    0,0,8         /* Merge the two shifted parts into one DW.  */
 366     addi  11,11,-2
 367     std   0,0(4)
 368 #ifdef __LITTLE_ENDIAN__
 369     srd   0,7,10
 370 1:  sld   8,6,9
 371 #else
 372     sld   0,7,10
 373 1:  srd   8,6,9
 374 #endif
 375     or    0,0,8         /* Merge the two shifted parts into one DW.  */
 376     beq   8f            /* Last DW is in r0: store it at 8: and finish.  */
 377     ld    7,16(5)
 378     std   0,8(4)
 379     addi  5,5,16
 380     addi  4,4,16
 381     b     0b
 382     .align 4
 383 8:
 384     std   0,8(4)
 385     rldicr 0,31,0,60    /* r0 = r31 with the low 3 bits cleared.  */
 386     mtcrf 0x01,31       /* Low 4 bits of r31 -> CR bits 28-31 (tail size).  */
 387     bne   cr6,.L9       /* Non-zero tail: finish at .L9, else we are done.  */
 388   /* Return original dst pointer.  */
 389     ld 31,-8(1)
 390     ld 3,-16(1)
 391     blr
 392 END_GEN_TB (memcpy,TB_TOCLESS)
 393 libc_hidden_builtin_def (memcpy)