/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2018 Free Software Foundation, Inc.
   Contributed by Luis Machado <luisgpm@br.ibm.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

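/* The code below selects among three strategies:
   1) copies of less than 32 bytes use scalar loads and stores
      (L(copy_LT_32) and L(copy_LE_8));
   2) copies of 32+ bytes where SRC and DST have the same alignment
      within a doubleword align to 8 bytes and then run a 32-byte
      (4x doubleword) main loop;
   3) copies of 32+ bytes with mismatched alignment use aligned VMX
      loads plus lvsl/lvsr and vperm to realign the data
      (L(copy_GE_32_unaligned)).  */
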
#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine  power7
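/* ENTRY_TOCLESS marks an entry point that does not need the TOC pointer
   (r2); CALL_MCOUNT expands to the gprof profiling hook when glibc is
   built with profiling enabled and to nothing otherwise.  */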
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
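	/* DST (r3) is saved in the protected area below the stack pointer so
	   that the return value DST + LEN can be rebuilt at exit; r31 is a
	   non-volatile register, so it is saved before being used to track
	   the remaining length.  */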
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
				      code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

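	/* The low bits of r0 (the byte count needed to reach alignment) were
	   copied into CR7 by the mtcrf above, so each bf below skips the 1-,
	   2- or 4-byte move when the corresponding bit is clear.  */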
1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

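	/* CR7 was loaded from the doubleword count (mtcrf 0x01,9 above), so
	   the bf 30/bf 31 tests below choose how many (0~3) leading
	   doublewords to copy before entering the 32-byte loop.  */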
	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

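	/* r0 is the remaining length rounded down to a multiple of 8, i.e.
	   the bytes consumed by the doubleword copies above.  SRC (r12) and
	   DST (r3) were not advanced inside the loop (r11/r10 were), so step
	   them past the copied area; CR7 now holds the low bits of the
	   remaining length for the tail tests.  */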
.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

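	/* CR7 holds the low bits of the remaining length, so the
	   bf 28/29/30/31 tests below select the 8-, 4-, 2- and 1-byte moves
	   without further compares.  */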
L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2-3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
	   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
	   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until DST is quadword
				 aligned.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
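	/* vr5 is a permute control vector derived from the SRC misalignment;
	   each vperm below merges two adjacent aligned 16-byte loads into one
	   quadword of correctly shifted data, so only aligned lvx/stvx
	   accesses are needed.  */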
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes so the main loop operates on 32-byte chunks.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied, but in
	   order to get proper alignment, we may have to copy some portions
	   again.  This is still faster than performing unaligned vector
	   accesses.  */

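	/* vperm numbers vector elements in big-endian order, so on
	   little-endian the source operands are swapped and lvsr (rather than
	   lvsl) supplies the control vector to produce the same result.  */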
	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

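	/* r0 is the remaining length rounded down to a multiple of 16, i.e.
	   the bytes already copied by the vector code.  SRC (r12) and DST
	   (r3) were not advanced in the loop (r11/r10 were), so step them
	   past the copied area before handling the tail.  */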
	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)