sysdeps/powerpc/powerpc32/power7/mempcpy.S

   1 /* Optimized mempcpy implementation for POWER7.
   2    Copyright (C) 2010-2019 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  23         Returns 'dst' + 'len'.  */
  24
  25         .machine  power7
  26 EALIGN (__mempcpy, 5, 0)
  27         CALL_MCOUNT
  28
  29         stwu    1,-32(1)
  30         cfi_adjust_cfa_offset(32)
  31         stw     30,20(1)
  32         cfi_offset(30,(20-32))
  33         stw     31,24(1)
  34         mr      30,3
  35         cmplwi  cr1,5,31
  36         neg     0,3
  37         cfi_offset(31,-8)
  38         ble     cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
  39                                         code.  */
  40
  41         andi.   11,3,7        /* Check alignment of DST.  */
  42         clrlwi  10,4,29       /* Check alignment of SRC.  */
  43         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  44         mr      12,4
  45         mr      31,5
  46         bne     cr6,L(copy_GE_32_unaligned)
  47
  48         srwi    9,5,3         /* Number of full quadwords remaining.  */
  49
  50         beq     L(copy_GE_32_aligned_cont)
  51
  52         clrlwi  0,0,29
  53         mtcrf   0x01,0
  54         subf    31,0,5
  55
  56         /* Get the SRC aligned to 8 bytes.  */
  57
  58 1:      bf      31,2f
  59         lbz     6,0(12)
  60         addi    12,12,1
  61         stb     6,0(3)
  62         addi    3,3,1
  63 2:      bf      30,4f
  64         lhz     6,0(12)
  65         addi    12,12,2
  66         sth     6,0(3)
  67         addi    3,3,2
  68 4:      bf      29,0f
  69         lwz     6,0(12)
  70         addi    12,12,4
  71         stw     6,0(3)
  72         addi    3,3,4
  73 0:
  74         clrlwi  10,12,29      /* Check alignment of SRC again.  */
  75         srwi    9,31,3        /* Number of full doublewords remaining.  */
  76
  77 L(copy_GE_32_aligned_cont):
  78
  79         clrlwi  11,31,29
  80         mtcrf   0x01,9
  81
  82         srwi    8,31,5
  83         cmplwi  cr1,9,4
  84         cmplwi  cr6,11,0
  85         mr      11,12
  86
  87         /* Copy 1~3 doublewords so the main loop starts
  88         at a multiple of 32 bytes.  */
  89
  90         bf      30,1f
  91         lfd     6,0(12)
  92         lfd     7,8(12)
  93         addi    11,12,16
  94         mtctr   8
  95         stfd    6,0(3)
  96         stfd    7,8(3)
  97         addi    10,3,16
  98         bf      31,4f
  99         lfd     0,16(12)
 100         stfd    0,16(3)
 101         blt     cr1,3f
 102         addi    11,12,24
 103         addi    10,3,24
 104         b       4f
 105
 106         .align  4
 107 1:      /* Copy 1 doubleword and set the counter.  */
 108         mr      10,3
 109         mtctr   8
 110         bf      31,4f
 111         lfd     6,0(12)
 112         addi    11,12,8
 113         stfd    6,0(3)
 114         addi    10,3,8
 115
 116         .align  4
 117 4:      /* Main aligned copy loop. Copies 32-bytes at a time.  */
 118         lfd     6,0(11)
 119         lfd     7,8(11)
 120         lfd     8,16(11)
 121         lfd     0,24(11)
 122         addi    11,11,32
 123
 124         stfd    6,0(10)
 125         stfd    7,8(10)
 126         stfd    8,16(10)
 127         stfd    0,24(10)
 128         addi    10,10,32
 129         bdnz    4b
 130 3:
 131
 132         /* Check for tail bytes.  */
 133
 134         clrrwi  0,31,3
 135         mtcrf   0x01,31
 136         beq     cr6,0f
 137
 138 .L9:
 139         add     3,3,0
 140         add     12,12,0
 141
 142         /*  At this point we have a tail of 0-7 bytes and we know that the
 143         destination is doubleword-aligned.  */
 144 4:      /* Copy 4 bytes.  */
 145         bf      29,2f
 146
 147         lwz     6,0(12)
 148         addi    12,12,4
 149         stw     6,0(3)
 150         addi    3,3,4
 151 2:      /* Copy 2 bytes.  */
 152         bf      30,1f
 153
 154         lhz     6,0(12)
 155         addi    12,12,2
 156         sth     6,0(3)
 157         addi    3,3,2
 158 1:      /* Copy 1 byte.  */
 159         bf      31,0f
 160
 161         lbz     6,0(12)
 162         stb     6,0(3)
 163 0:      /* Return DST + LEN pointer.  */
 164         add     3,30,5
 165         lwz     30,20(1)
 166         lwz     31,24(1)
 167         addi    1,1,32
 168         blr
 169
 170         /* Handle copies of 0~31 bytes.  */
 171         .align  4
 172 L(copy_LT_32):
 173         cmplwi  cr6,5,8
 174         mr      12,4
 175         mtcrf   0x01,5
 176         ble     cr6,L(copy_LE_8)
 177
 178         /* At least 9 bytes to go.  */
 179         neg     8,4
 180         clrrwi  11,4,2
 181         andi.   0,8,3
 182         cmplwi  cr1,5,16
 183         mr      10,5
 184         beq     L(copy_LT_32_aligned)
 185
 186         /* Force 4-bytes alignment for SRC.  */
 187         mtocrf  0x01,0
 188         subf    10,0,5
 189 2:      bf      30,1f
 190
 191         lhz     6,0(12)
 192         addi    12,12,2
 193         sth     6,0(3)
 194         addi    3,3,2
 195 1:      bf      31,L(end_4bytes_alignment)
 196
 197         lbz     6,0(12)
 198         addi    12,12,1
 199         stb     6,0(3)
 200         addi    3,3,1
 201
 202         .align  4
 203 L(end_4bytes_alignment):
 204         cmplwi  cr1,10,16
 205         mtcrf   0x01,10
 206
 207 L(copy_LT_32_aligned):
 208         /* At least 6 bytes to go, and SRC is word-aligned.  */
 209         blt     cr1,8f
 210
 211         /* Copy 16 bytes.  */
 212         lwz     6,0(12)
 213         lwz     7,4(12)
 214         stw     6,0(3)
 215         lwz     8,8(12)
 216         stw     7,4(3)
 217         lwz     6,12(12)
 218         addi    12,12,16
 219         stw     8,8(3)
 220         stw     6,12(3)
 221         addi    3,3,16
 222 8:      /* Copy 8 bytes.  */
 223         bf      28,4f
 224
 225         lwz     6,0(12)
 226         lwz     7,4(12)
 227         addi    12,12,8
 228         stw     6,0(3)
 229         stw     7,4(3)
 230         addi    3,3,8
 231 4:      /* Copy 4 bytes.  */
 232         bf      29,2f
 233
 234         lwz     6,0(12)
 235         addi    12,12,4
 236         stw     6,0(3)
 237         addi    3,3,4
 238 2:      /* Copy 2-3 bytes.  */
 239         bf      30,1f
 240
 241         lhz     6,0(12)
 242         sth     6,0(3)
 243         bf      31,0f
 244         lbz     7,2(12)
 245         stb     7,2(3)
 246
 247         /* Return DST + LEN pointer.  */
 248         add     3,30,5
 249         lwz     30,20(1)
 250         addi    1,1,32
 251         blr
 252
 253         .align  4
 254 1:      /* Copy 1 byte.  */
 255         bf      31,0f
 256
 257         lbz     6,0(12)
 258         stb     6,0(3)
 259 0:      /* Return DST + LEN pointer.  */
 260         add     3,30,5
 261         lwz     30,20(1)
 262         addi    1,1,32
 263         blr
 264
 265         /* Handles copies of 0~8 bytes.  */
 266         .align  4
 267 L(copy_LE_8):
 268         bne     cr6,4f
 269
 270         /* Though we could've used lfd/stfd here, they are still
 271         slow for unaligned cases.  */
 272
 273         lwz     6,0(4)
 274         lwz     7,4(4)
 275         stw     6,0(3)
 276         stw     7,4(3)
 277
 278         /* Return DST + LEN pointer.  */
 279         add     3,30,5
 280         lwz     30,20(1)
 281         addi    1,1,32
 282         blr
 283
 284         .align  4
 285 4:      /* Copies 4~7 bytes.  */
 286         bf      29,2b
 287
 288         lwz     6,0(4)
 289         stw     6,0(3)
 290         bf      30,5f
 291         lhz     7,4(4)
 292         sth     7,4(3)
 293         bf      31,0f
 294         lbz     8,6(4)
 295         stb     8,6(3)
 296
 297         /* Return DST + LEN pointer.  */
 298         add     3,30,5
 299         lwz     30,20(1)
 300         addi    1,1,32
 301         blr
 302
 303         .align  4
 304 5:      /* Copy 1 byte.  */
 305         bf      31,0f
 306
 307         lbz     6,4(4)
 308         stb     6,4(3)
 309
 310 0:      /* Return DST + LEN pointer.  */
 311         add     3,30,5
 312         lwz     30,20(1)
 313         addi    1,1,32
 314         blr
 315
 316         /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
 317         SRC is not. Use aligned quadword loads from SRC, shifted to realign
 318         the data, allowing for aligned DST stores.  */
 319         .align  4
 320 L(copy_GE_32_unaligned):
 321         andi.   11,3,15       /* Check alignment of DST.  */
 322         clrlwi  0,0,28        /* Number of bytes until the 1st
 323                                  quadword of DST.  */
 324         srwi    9,5,4         /* Number of full quadwords remaining.  */
 325
 326         beq     L(copy_GE_32_unaligned_cont)
 327
 328         /* DST is not quadword aligned, get it aligned.  */
 329
 330         mtcrf   0x01,0
 331         subf    31,0,5
 332
 333         /* Vector instructions work best when proper alignment (16-bytes)
 334         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 335 1:      /* Copy 1 byte.  */
 336         bf      31,2f
 337
 338         lbz     6,0(12)
 339         addi    12,12,1
 340         stb     6,0(3)
 341         addi    3,3,1
 342 2:      /* Copy 2 bytes.  */
 343         bf              30,4f
 344
 345         lhz     6,0(12)
 346         addi    12,12,2
 347         sth     6,0(3)
 348         addi    3,3,2
 349 4:      /* Copy 4 bytes.  */
 350         bf      29,8f
 351
 352         lwz     6,0(12)
 353         addi    12,12,4
 354         stw     6,0(3)
 355         addi    3,3,4
 356 8:      /* Copy 8 bytes.  */
 357         bf      28,0f
 358
 359         lfd     6,0(12)
 360         addi    12,12,8
 361         stfd    6,0(3)
 362         addi    3,3,8
 363 0:
 364         clrlwi  10,12,28      /* Check alignment of SRC.  */
 365         srwi    9,31,4        /* Number of full quadwords remaining.  */
 366
 367         /* The proper alignment is present, it is OK to copy the bytes now.  */
 368 L(copy_GE_32_unaligned_cont):
 369
 370         /* Setup two indexes to speed up the indexed vector operations.  */
 371         clrlwi  11,31,28
 372         li      6,16          /* Index for 16-bytes offsets.  */
 373         li      7,32          /* Index for 32-bytes offsets.  */
 374         cmplwi  cr1,11,0
 375         srwi    8,31,5        /* Setup the loop counter.  */
 376         mr      10,3
 377         mr      11,12
 378         mtcrf   0x01,9
 379         cmplwi  cr6,9,1
 380 #ifdef __LITTLE_ENDIAN__
 381         lvsr    5,0,12
 382 #else
 383         lvsl    5,0,12
 384 #endif
 385         lvx     3,0,12
 386         bf      31,L(setup_unaligned_loop)
 387
 388         /* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 389         lvx     4,12,6
 390 #ifdef __LITTLE_ENDIAN__
 391         vperm   6,4,3,5
 392 #else
 393         vperm   6,3,4,5
 394 #endif
 395         addi    11,12,16
 396         addi    10,3,16
 397         stvx    6,0,3
 398         vor     3,4,4
 399
 400 L(setup_unaligned_loop):
 401         mtctr   8
 402         ble     cr6,L(end_unaligned_loop)
 403
 404         /* Copy 32 bytes at a time using vector instructions.  */
 405         .align  4
 406 L(unaligned_loop):
 407
 408         /* Note: vr6/vr10 may contain data that was already copied,
 409         but in order to get proper alignment, we may have to copy
 410         some portions again. This is faster than having unaligned
 411         vector instructions though.  */
 412
 413         lvx     4,11,6        /* vr4 = r11+16.  */
 414 #ifdef __LITTLE_ENDIAN__
 415         vperm   6,4,3,5
 416 #else
 417         vperm   6,3,4,5
 418 #endif
 419         lvx     3,11,7        /* vr3 = r11+32.  */
 420 #ifdef __LITTLE_ENDIAN__
 421         vperm   10,3,4,5
 422 #else
 423         vperm   10,4,3,5
 424 #endif
 425         addi    11,11,32
 426         stvx    6,0,10
 427         stvx    10,10,6
 428         addi    10,10,32
 429
 430         bdnz    L(unaligned_loop)
 431
 432         .align  4
 433 L(end_unaligned_loop):
 434
 435         /* Check for tail bytes.  */
 436         clrrwi  0,31,4
 437         mtcrf   0x01,31
 438         beq     cr1,0f
 439
 440         add     3,3,0
 441         add     12,12,0
 442
 443         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 444 8:      /* Copy 8 bytes.  */
 445         bf      28,4f
 446
 447         lwz     6,0(12)
 448         lwz     7,4(12)
 449         addi    12,12,8
 450         stw     6,0(3)
 451         stw     7,4(3)
 452         addi    3,3,8
 453 4:      /* Copy 4 bytes.  */
 454         bf      29,2f
 455
 456         lwz     6,0(12)
 457         addi    12,12,4
 458         stw     6,0(3)
 459         addi    3,3,4
 460 2:      /* Copy 2~3 bytes.  */
 461         bf      30,1f
 462
 463         lhz     6,0(12)
 464         addi    12,12,2
 465         sth     6,0(3)
 466         addi    3,3,2
 467 1:      /* Copy 1 byte.  */
 468         bf      31,0f
 469
 470         lbz     6,0(12)
 471         stb     6,0(3)
 472 0:      /* Return DST + LEN pointer.  */
 473         add     3,30,5
 474         lwz     30,20(1)
 475         lwz     31,24(1)
 476         addi    1,1,32
 477         blr
 478
 479 END (__mempcpy)
 480 libc_hidden_def (__mempcpy)
 481 weak_alias (__mempcpy, mempcpy)
 482 libc_hidden_builtin_def (mempcpy)