/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2017 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11          /* Use r11 so r3 kept unchanged.  */
#define src 4
#define cnt 5

        .machine power7
ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        cmpldi  cr1,cnt,31
        neg     0,3
        ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
                                       code.  */

/* Align the copy to a quadword before using VSX instructions.  This avoids
   alignment traps when memcpy is used on non-cacheable memory (for
   instance, memory-mapped I/O).  */
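/* r10 and r11 get the low 4 bits of DST and SRC.  If the two residues
   differ, the quadword-realigning path below is taken; if they are equal
   and DST is already 16-byte aligned, the copy can start immediately;
   otherwise both pointers are advanced together until DST (and hence SRC)
   is 16-byte aligned.  */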
        andi.   10,3,15
        clrldi  11,4,60
        cmpld   cr6,10,11       /* SRC and DST alignments match?  */

        mr      dst,3
        bne     cr6,L(copy_GE_32_unaligned)
        beq     L(aligned_copy)

        mtocrf  0x01,0
        clrldi  0,0,60

        /* Get the DST and SRC aligned to 16 bytes.  */
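        /* r0 = -DST, so its low 4 bits give the number of bytes needed to
           reach the next 16-byte boundary.  mtocrf 0x01,0 copied those bits
           into CR7: bit 31 selects a 1-byte move, bit 30 a 2-byte move,
           bit 29 a 4-byte move and bit 28 an 8-byte move.  The same value,
           masked by the clrldi above, is subtracted from CNT at 16: below.  */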
1:
        bf      31,2f
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1
2:
        bf      30,4f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
4:
        bf      29,8f
        lwz     6,0(src)
        addi    src,src,4
        stw     6,0(dst)
        addi    dst,dst,4
8:
        bf      28,16f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
16:
        subf    cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
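/* The loop is software-pipelined: the first 32 bytes of each iteration are
   loaded before entering it (and reloaded at L(aligned_128head) for the
   following iterations), so loads overlap the stores of the previous data.
   r6/r7/r8 hold the constant offsets 16/32/48 for the indexed lxvd2x and
   stxvd2x, each of which moves 16 bytes.  r12 = CNT / 128 is the iteration
   count; mtocrf 0x02,cnt saves the 64/32/16-byte remainder bits in CR6 for
   L(aligned_tail).  */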
L(aligned_copy):
        li      6,16
        li      7,32
        li      8,48
        mtocrf  0x02,cnt
        srdi    12,cnt,7
        cmpdi   12,0
        beq     L(aligned_tail)
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        mtctr   12
        b       L(aligned_128loop)

        .align  4
L(aligned_128head):
        /* For the second and later iterations of this loop.  */
        lxvd2x  6,0,src
        lxvd2x  7,src,6
L(aligned_128loop):
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        stxvd2x 6,0,dst
        addi    src,src,64
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        addi    dst,dst,64
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        addi    src,src,64
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        addi    dst,dst,64
        bdnz    L(aligned_128head)

L(aligned_tail):
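        /* Copy the remaining 0~127 bytes.  CR6 (set from CNT above) selects
           the 64-, 32- and 16-byte chunks via bits 25, 26 and 27; CR7 (set
           from CNT just below) selects the final 8-, 4-, 2- and 1-byte
           moves via bits 28 to 31.  */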
        mtocrf  0x01,cnt
        bf      25,32f
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        lxvd2x  8,src,7
        lxvd2x  9,src,8
        addi    src,src,64
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        stxvd2x 8,dst,7
        stxvd2x 9,dst,8
        addi    dst,dst,64
32:
        bf      26,16f
        lxvd2x  6,0,src
        lxvd2x  7,src,6
        addi    src,src,32
        stxvd2x 6,0,dst
        stxvd2x 7,dst,6
        addi    dst,dst,32
16:
        bf      27,8f
        lxvd2x  6,0,src
        addi    src,src,16
        stxvd2x 6,0,dst
        addi    dst,dst,16
8:
        bf      28,4f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
4:      /* Copies 4~7 bytes.  */
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr


/* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        mr      dst,3
        cmpldi  cr6,cnt,8
        mtocrf  0x01,cnt
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        andi.   0,8,3
        cmpldi  cr1,cnt,16
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
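        /* r8 = -SRC, so r0 = r8 & 3 is the number of bytes (0~3) needed to
           make SRC word-aligned; the andi. above also set cr0 for the beq.
           mtocrf 0x01,0 exposes those bits in CR7: bit 30 requests a 2-byte
           move and bit 31 a 1-byte move, and CNT is reduced accordingly.  */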
        mtocrf  0x01,0
        subf    cnt,0,cnt
2:
        bf      30,1f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
1:
        bf      31,L(end_4bytes_alignment)
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1

        .align  4
L(end_4bytes_alignment):
        cmpldi  cr1,cnt,16
        mtocrf  0x01,cnt

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(src)
        lwz     7,4(src)
        stw     6,0(dst)
        lwz     8,8(src)
        stw     7,4(dst)
        lwz     6,12(src)
        addi    src,src,16
        stw     8,8(dst)
        stw     6,12(dst)
        addi    dst,dst,16
8:      /* Copy 8 bytes.  */
        bf      28,L(tail4)
        lwz     6,0(src)
        lwz     7,4(src)
        addi    src,src,8
        stw     6,0(dst)
        stw     7,4(dst)
        addi    dst,dst,8

        .align  4
        /* Copies 4~7 bytes.  */
L(tail4):
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr

        .align  4
        /* Copies 2~3 bytes.  */
L(tail2):
        bf      30,1f
        lhz     6,0(src)
        sth     6,0(dst)
        bflr    31
        lbz     7,2(src)
        stb     7,2(dst)
        blr

        .align  4
L(tail5):
        bflr    31
        lbz     6,4(src)
        stb     6,4(dst)
        blr

        .align  4
1:
        bflr    31
        lbz     6,0(src)
        stb     6,0(dst)
        /* Return original DST pointer.  */
        blr


/* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,L(tail4)
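        /* cr6 still holds the comparison of CNT with 8, so the branch above
           is taken for fewer than 8 bytes; only the exact 8-byte case is
           handled here, with two word copies.  */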

        /* Though we could have used ld/std here, they would still be
           slow for unaligned accesses.  */

        lwz     6,0(src)
        lwz     7,4(src)
        stw     6,0(dst)
        stw     7,4(dst)
        blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
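/* The realignment works as follows: lvx ignores the low 4 bits of its
   effective address, so it always fetches the aligned quadword containing
   SRC.  lvsl (or lvsr on little-endian) builds a permute-control vector
   from SRC's misalignment, and vperm applied to two consecutive aligned
   quadwords extracts the 16 bytes that start at the unaligned SRC, ready
   to be stored with an aligned stvx to DST.  */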
        .align  4
L(copy_GE_32_unaligned):
        clrldi  0,0,60        /* Number of bytes until the 1st dst quadword.  */
        srdi    9,cnt,4       /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned; get it aligned.  */

        mtocrf  0x01,0
        subf    cnt,0,cnt

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
        bf      31,2f
        lbz     6,0(src)
        addi    src,src,1
        stb     6,0(dst)
        addi    dst,dst,1
2:
        bf      30,4f
        lhz     6,0(src)
        addi    src,src,2
        sth     6,0(dst)
        addi    dst,dst,2
4:
        bf      29,8f
        lwz     6,0(src)
        addi    src,src,4
        stw     6,0(dst)
        addi    dst,dst,4
8:
        bf      28,0f
        ld      6,0(src)
        addi    src,src,8
        std     6,0(dst)
        addi    dst,dst,8
0:
        srdi    9,cnt,4       /* Number of full quadwords remaining.  */

        /* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two indexes to speed up the indexed vector operations.  */
        clrldi  10,cnt,60
        li      6,16          /* Index for 16-byte offsets.  */
        li      7,32          /* Index for 32-byte offsets.  */
        cmpldi  cr1,10,0
        srdi    8,cnt,5       /* Set up the loop counter.  */
        mtocrf  0x01,9
        cmpldi  cr6,9,1
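        /* At this point r9 holds the number of quadwords to copy, r10 the
           0~15 tail bytes and r8 the number of 32-byte loop iterations.
           cr1 records whether there is a tail at all, cr6 whether the main
           loop can be skipped (at most one quadword), and CR7 bit 31 (from
           r9) whether an odd quadword must be copied before the loop.  */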
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,src
#else
        lvsl    5,0,src
#endif
        lvx     3,0,src
        li      0,0
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes so the remaining quadwords can be moved
           32 bytes at a time by the loop below.  */
        lvx     4,src,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    src,src,16
        stvx    6,0,dst
        addi    dst,dst,16
        vor     3,4,4
        clrrdi  0,src,60

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is still faster than using
           unaligned vector accesses, though.  */

        lvx     4,src,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,src,7
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    src,src,32
        stvx    6,0,dst
        stvx    10,dst,6
        addi    dst,dst,32
        bdnz    L(unaligned_loop)

        clrrdi  0,src,60

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
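        /* cr1 (computed from CNT & 0xf above) allows an early return when
           there is no tail; otherwise CR7 is reloaded with the low bits of
           CNT so the scalar copies below can use the usual bf 28..31
           dispatch for the remaining 1~15 bytes.  */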
        mtocrf  0x01,cnt
        beqlr   cr1

        add     src,src,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
        /* Copy 8 bytes.  */
        bf      28,4f
        lwz     6,0(src)
        lwz     7,4(src)
        addi    src,src,8
        stw     6,0(dst)
        stw     7,4(dst)
        addi    dst,dst,8
4:      /* Copy 4~7 bytes.  */
        bf      29,L(tail2)
        lwz     6,0(src)
        stw     6,0(dst)
        bf      30,L(tail5)
        lhz     7,4(src)
        sth     7,4(dst)
        bflr    31
        lbz     8,6(src)
        stb     8,6(dst)
        /* Return original DST pointer.  */
        blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)