sysdeps/powerpc/powerpc64/power7/mempcpy.S

   1 /* Optimized mempcpy implementation for POWER7.
   2    Copyright (C) 2010-2021 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21
  22 /* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
  23     Copies 'len' bytes from 'src' to 'dst' and returns 'dst' + 'len'.  */
  24         /* The entry symbol defaults to __mempcpy unless MEMPCPY is already defined.  */
  25 #ifndef MEMPCPY
  26 # define MEMPCPY __mempcpy
  27 #endif
  28         .machine  power7
  29 ENTRY_TOCLESS (MEMPCPY, 5)
  30         CALL_MCOUNT 3
  31         /* GPR r5 (LEN) is never written, so every exit returns the saved DST + r5.  */
  32         cmpldi  cr1,5,31      /* cr1 = LEN compared against 31.  */
  33         neg     0,3           /* r0 = -DST, used later to count bytes up to alignment.  */
  34         std     3,-16(1)      /* Keep the original DST below the stack pointer.  */
  35         std     31,-8(1)      /* Save r31 (the 32+ byte paths use it as a counter).  */
  36         cfi_offset(31,-8)
  37         ble     cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
  38                                        code.  */
  39
  40         andi.   11,3,7        /* Check alignment of DST (sets cr0 for the beq below).  */
  41
  42
  43         clrldi  10,4,61       /* Check alignment of SRC.  */
  44         cmpld   cr6,10,11     /* SRC and DST alignments match?  */
  45         mr      12,4          /* r12 = running SRC pointer.  */
  46         mr      31,5          /* r31 = bytes still to copy.  */
  47         bne     cr6,L(copy_GE_32_unaligned)
  48
  49         srdi    9,5,3         /* Number of full doublewords remaining.  */
  50
  51         beq     L(copy_GE_32_aligned_cont)   /* DST, and so SRC, already 8-byte aligned.  */
  52
  53         clrldi  0,0,61        /* r0 = (-DST) & 7 = bytes up to the next doubleword.  */
  54         mtcrf   0x01,0        /* cr7 = low 4 bits of r0, tested by the bf checks below.  */
  55         subf    31,0,5        /* r31 = LEN minus the head bytes copied below.  */
  56
  57         /* Get DST aligned to 8 bytes.  SRC has the same misalignment on this path.  */
  58         /* cr7 bits 31, 30 and 29 request a 1-, 2- and 4-byte copy respectively.  */
  59 1:      bf      31,2f         /* Skip unless a 1-byte copy is needed.  */
  60         lbz     6,0(12)
  61         addi    12,12,1
  62         stb     6,0(3)
  63         addi    3,3,1
  64 2:      bf      30,4f         /* Skip unless a 2-byte copy is needed.  */
  65         lhz     6,0(12)
  66         addi    12,12,2
  67         sth     6,0(3)
  68         addi    3,3,2
  69 4:      bf      29,0f         /* Skip unless a 4-byte copy is needed.  */
  70         lwz     6,0(12)
  71         addi    12,12,4
  72         stw     6,0(3)
  73         addi    3,3,4
  74 0:
  75         clrldi  10,12,61      /* Check alignment of SRC again.  */
  76         srdi    9,31,3        /* Number of full doublewords remaining.  */
  77
  78 L(copy_GE_32_aligned_cont):
  79         /* Here r3 and r12 are 8-byte aligned, r31 = bytes left and r9 = r31 / 8.  */
  80         clrldi  11,31,61      /* r11 = r31 & 7 = number of tail bytes.  */
  81         mtcrf   0x01,9        /* cr7 = low 4 bits of the doubleword count.  */
  82
  83         srdi    8,31,5        /* r8 = number of 32-byte blocks for the main loop.  */
  84         cmpldi  cr1,9,4       /* cr1: fewer than 4 doublewords in total?  */
  85         cmpldi  cr6,11,0      /* cr6: are there no tail bytes?  */
  86         mr      11,12         /* r11 = SRC read pointer (r10 becomes the DST write pointer).  */
  87
  88         /* Copy the 0~3 doublewords that do not fill a 32-byte block (the count
  89         modulo 4) before entering the main loop.  */
  90
  91         bf      30,1f         /* Bit 30 clear: no pair of doublewords to copy here.  */
  92         ld      6,0(12)
  93         ld      7,8(12)
  94         addi    11,12,16
  95         mtctr   8             /* CTR = number of main loop iterations.  */
  96         std     6,0(3)
  97         std     7,8(3)
  98         addi    10,3,16
  99         bf      31,4f         /* Bit 31 clear: no third doubleword, enter the loop.  */
 100         ld      0,16(12)
 101         std     0,16(3)
 102         blt     cr1,3f        /* Fewer than 4 doublewords: nothing left for the loop.  */
 103         addi    11,12,24
 104         addi    10,3,24
 105         b       4f
 106
 107         .align  4
 108 1:      /* Copy 1 doubleword and set the counter.  */
 109         mr      10,3
 110         mtctr   8
 111         bf      31,4f         /* Bit 31 clear: no odd doubleword, enter the loop.  */
 112         ld      6,0(12)
 113         addi    11,12,8
 114         std     6,0(3)
 115         addi    10,3,8
 116
 117         /* Main aligned copy loop. Copies 32-bytes at a time.  */
 118         .align  4
 119 4:
 120         ld      6,0(11)
 121         ld      7,8(11)
 122         ld      8,16(11)
 123         ld      0,24(11)
 124         addi    11,11,32
 125
 126         std     6,0(10)
 127         std     7,8(10)
 128         std     8,16(10)
 129         std     0,24(10)
 130         addi    10,10,32
 131         bdnz    4b
 132 3:
 133
 134         /* Check for tail bytes.  */
 135         rldicr  0,31,0,60     /* r0 = r31 with its low 3 bits cleared.  */
 136         mtcrf   0x01,31       /* cr7 = low 4 bits of r31 for the tail tests.  */
 137         beq     cr6,0f        /* No tail bytes: go straight to the return.  */
 138
 139 .L9:
 140         add     3,3,0         /* The copies above left r3 and r12 unchanged, */
 141         add     12,12,0       /* so step both past the doublewords already copied.  */
 142
 143         /*  At this point we have a tail of 0-7 bytes and we know that the
 144         destination is doubleword-aligned.  */
 145 4:      /* Copy 4 bytes.  */
 146         bf      29,2f
 147
 148         lwz     6,0(12)
 149         addi    12,12,4
 150         stw     6,0(3)
 151         addi    3,3,4
 152 2:      /* Copy 2 bytes.  */
 153         bf      30,1f
 154
 155         lhz     6,0(12)
 156         addi    12,12,2
 157         sth     6,0(3)
 158         addi    3,3,2
 159 1:      /* Copy 1 byte.  */
 160         bf      31,0f
 161
 162         lbz     6,0(12)
 163         stb     6,0(3)
 164 0:      /* Return DST + LEN pointer.  */
 165         ld      31,-8(1)      /* Restore the r31 value saved at entry.  */
 166         ld      3,-16(1)      /* Reload the original DST saved at entry.  */
 167         add     3,3,5
 168         blr
 169
 170         /* Handle copies of 0~31 bytes.  r31 is not written here, so the exits do not reload it.  */
 171         .align  4
 172 L(copy_LT_32):
 173         cmpldi  cr6,5,8       /* cr6 = LEN compared against 8.  */
 174         mr      12,4          /* r12 = running SRC pointer.  */
 175         mtcrf   0x01,5        /* cr7 = low 4 bits of LEN.  */
 176         ble     cr6,L(copy_LE_8)
 177
 178         /* At least 9 bytes to go.  */
 179         neg     8,4
 180         clrrdi  11,4,2        /* NOTE(review): r11 does not appear to be read again on this path.  */
 181         andi.   0,8,3         /* r0 = (-SRC) & 3 = bytes needed to word-align SRC.  */
 182         cmpldi  cr1,5,16      /* cr1: fewer than 16 bytes to copy?  */
 183         mr      10,5          /* r10 = bytes still to copy.  */
 184         beq     L(copy_LT_32_aligned)   /* SRC is already word-aligned.  */
 185
 186         /* Force 4-bytes alignment for SRC.  */
 187         mtocrf  0x01,0        /* cr7 = low bits of r0 (1~3 head bytes).  */
 188         subf    10,0,5        /* r10 = LEN minus the head bytes.  */
 189 2:      bf      30,1f         /* Skip unless a 2-byte head copy is needed.  */
 190
 191         lhz     6,0(12)
 192         addi    12,12,2
 193         sth     6,0(3)
 194         addi    3,3,2
 195 1:      bf      31,L(end_4bytes_alignment)   /* Skip unless a 1-byte head copy is needed.  */
 196
 197         lbz     6,0(12)
 198         addi    12,12,1
 199         stb     6,0(3)
 200         addi    3,3,1
 201
 202         .align  4
 203 L(end_4bytes_alignment):
 204         cmpldi  cr1,10,16     /* Redo the 16-byte test on the reduced count.  */
 205         mtcrf   0x01,10       /* cr7 = low 4 bits of the reduced count.  */
 206
 207 L(copy_LT_32_aligned):
 208         /* At least 6 bytes to go, and SRC is word-aligned.  */
 209         blt     cr1,8f        /* Fewer than 16 bytes left: skip the 16-byte copy.  */
 210
 211         /* Copy 16 bytes.  */
 212         lwz     6,0(12)
 213         lwz     7,4(12)
 214         stw     6,0(3)
 215         lwz     8,8(12)
 216         stw     7,4(3)
 217         lwz     6,12(12)
 218         addi    12,12,16
 219         stw     8,8(3)
 220         stw     6,12(3)
 221         addi    3,3,16
 222 8:      /* Copy 8 bytes.  */
 223         bf      28,4f
 224
 225         lwz     6,0(12)
 226         lwz     7,4(12)
 227         addi    12,12,8
 228         stw     6,0(3)
 229         stw     7,4(3)
 230         addi    3,3,8
 231 4:      /* Copy 4 bytes.  */
 232         bf      29,2f
 233
 234         lwz     6,0(12)
 235         addi    12,12,4
 236         stw     6,0(3)
 237         addi    3,3,4
 238 2:      /* Copy 2-3 bytes.  Also reached from copy_LE_8 through its 2b branch.  */
 239         bf      30,1f
 240
 241         lhz     6,0(12)
 242         sth     6,0(3)
 243         bf      31,0f         /* No third byte: go to the shared return.  */
 244         lbz     7,2(12)
 245         stb     7,2(3)
 246         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 247         add     3,3,5
 248         blr
 249
 250         .align  4
 251 1:      /* Copy 1 byte.  */
 252         bf      31,0f
 253
 254         lbz     6,0(12)
 255         stb     6,0(3)
 256 0:      /* Return DST + LEN pointer.  */
 257         ld      3,-16(1)
 258         add     3,3,5
 259         blr
 260
 261         /* Handles copies of 0~8 bytes.  */
 262         .align  4
 263 L(copy_LE_8):
 264         bne     cr6,4f        /* cr6 holds LEN versus 8, so fall through only for LEN == 8.  */
 265
 266         /* Though we could've used ld/std here, they are still
 267         slow for unaligned cases.  */
 268
 269         lwz     6,0(4)
 270         lwz     7,4(4)
 271         stw     6,0(3)
 272         stw     7,4(3)
 273         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 274         add     3,3,5
 275         blr
 276
 277         .align  4
 278 4:      /* Copies 4~7 bytes.  */
 279         bf      29,2b         /* LEN < 4: reuse the 0~3 byte code at the preceding 2: label.  */
 280
 281         lwz     6,0(4)
 282         stw     6,0(3)
 283         bf      30,5f         /* No 2-byte piece: LEN is 4 or 5.  */
 284         lhz     7,4(4)
 285         sth     7,4(3)
 286         bf      31,0f         /* LEN == 6: done.  */
 287         lbz     8,6(4)
 288         stb     8,6(3)
 289         ld      3,-16(1)      /* Return DST + LEN pointer.  */
 290         add     3,3,5
 291         blr
 292
 293         .align  4
 294 5:      /* Copy 1 byte.  */
 295         bf      31,0f
 296
 297         lbz     6,4(4)
 298         stb     6,4(3)
 299
 300 0:      /* Return DST + LEN pointer.  */
 301         ld      3,-16(1)
 302         add     3,3,5
 303         blr
 304
 305         /* Handle copies of 32+ bytes where the SRC and DST alignments differ.
 306         DST is brought to a quadword boundary first.  SRC is then read with aligned
 307         quadword loads and permuted to realign the data for aligned DST stores.  */
 308         .align  4
 309 L(copy_GE_32_unaligned):
 310         clrldi  0,0,60        /* Number of bytes until DST reaches the
 311                                  1st quadword boundary.  */
 312         andi.   11,3,15       /* Check alignment of DST (against
 313                                  quadwords).  */
 314         srdi    9,5,4         /* Number of full quadwords remaining.  */
 315
 316         beq     L(copy_GE_32_unaligned_cont)   /* DST is already quadword-aligned.  */
 317
 318         /* DST is not quadword aligned, get it aligned.  */
 319
 320         mtcrf   0x01,0        /* cr7 = low 4 bits of r0, tested by the bf checks below.  */
 321         subf    31,0,5        /* r31 = LEN minus the head bytes copied below.  */
 322
 323         /* Vector instructions work best when proper alignment (16-bytes)
 324         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 325 1:      /* Copy 1 byte.  */
 326         bf      31,2f
 327
 328         lbz     6,0(12)
 329         addi    12,12,1
 330         stb     6,0(3)
 331         addi    3,3,1
 332 2:      /* Copy 2 bytes.  */
 333         bf      30,4f
 334
 335         lhz     6,0(12)
 336         addi    12,12,2
 337         sth     6,0(3)
 338         addi    3,3,2
 339 4:      /* Copy 4 bytes.  */
 340         bf      29,8f
 341
 342         lwz     6,0(12)
 343         addi    12,12,4
 344         stw     6,0(3)
 345         addi    3,3,4
 346 8:      /* Copy 8 bytes.  */
 347         bf      28,0f
 348
 349         ld      6,0(12)
 350         addi    12,12,8
 351         std     6,0(3)
 352         addi    3,3,8
 353 0:
 354         clrldi  10,12,60      /* Check alignment of SRC.  */
 355         srdi    9,31,4        /* Number of full quadwords remaining.  */
 356
 357         /* The proper alignment is present, it is OK to copy the bytes now.  */
 358 L(copy_GE_32_unaligned_cont):
 359         /* Here r3 is quadword-aligned, r31 = bytes left and r9 = r31 / 16.  */
 360         /* Setup two indexes to speed up the indexed vector operations.  */
 361         clrldi  11,31,60      /* r11 = r31 & 15 = number of tail bytes.  */
 362         li      6,16          /* Index for 16-bytes offsets.  */
 363         li      7,32          /* Index for 32-bytes offsets.  */
 364         cmpldi  cr1,11,0      /* cr1: are there no tail bytes?  */
 365         srdi    8,31,5        /* Setup the loop counter.  */
 366         mr      10,3          /* r10 = DST write pointer.  */
 367         mr      11,12         /* r11 = SRC read pointer.  */
 368         mtcrf   0x01,9        /* cr7 bit 31 = odd number of quadwords.  */
 369         cmpldi  cr6,9,1       /* cr6 = quadword count compared against 1.  */
 370 #ifdef __LITTLE_ENDIAN__
 371         lvsr    5,0,12        /* v5 = permute control built from the SRC address.  */
 372 #else
 373         lvsl    5,0,12        /* v5 = permute control built from the SRC address.  */
 374 #endif
 375         lvx     3,0,12        /* v3 = aligned quadword containing the byte at r12.  */
 376         bf      31,L(setup_unaligned_loop)   /* Even quadword count: no extra 16 bytes.  */
 377
 378         /* Copy another 16 bytes to align to 32-bytes due to the loop.  */
 379         lvx     4,12,6        /* v4 = aligned quadword at r12 + 16.  */
 380 #ifdef __LITTLE_ENDIAN__
 381         vperm   6,4,3,5
 382 #else
 383         vperm   6,3,4,5
 384 #endif
 385         addi    11,12,16
 386         addi    10,3,16
 387         stvx    6,0,3
 388         vor     3,4,4         /* v3 = v4, the quadword the loop continues from.  */
 389
 390 L(setup_unaligned_loop):
 391         mtctr   8             /* CTR = number of 32-byte loop iterations.  */
 392         ble     cr6,L(end_unaligned_loop)   /* At most one quadword: skip the loop.  */
 393
 394         /* Copy 32 bytes at a time using vector instructions.  */
 395         .align  4
 396 L(unaligned_loop):
 397
 398         /* Note: vr6/vr10 may contain data that was already copied,
 399         but in order to get proper alignment, we may have to copy
 400         some portions again. This is faster than having unaligned
 401         vector instructions though.  */
 402
 403         lvx     4,11,6        /* vr4 = r11+16.  */
 404 #ifdef __LITTLE_ENDIAN__
 405         vperm   6,4,3,5
 406 #else
 407         vperm   6,3,4,5
 408 #endif
 409         lvx     3,11,7        /* vr3 = r11+32.  */
 410 #ifdef __LITTLE_ENDIAN__
 411         vperm   10,3,4,5
 412 #else
 413         vperm   10,4,3,5
 414 #endif
 415         addi    11,11,32
 416         stvx    6,0,10        /* Store vr6 at r10.  */
 417         stvx    10,10,6       /* Store vr10 at r10 + 16.  */
 418         addi    10,10,32
 419
 420         bdnz    L(unaligned_loop)
 421
 422         .align  4
 423 L(end_unaligned_loop):
 424
 425         /* Check for tail bytes.  */
 426         rldicr  0,31,0,59     /* r0 = r31 with its low 4 bits cleared.  */
 427         mtcrf   0x01,31       /* cr7 = low 4 bits of r31 for the tail tests.  */
 428         beq     cr1,0f        /* No tail bytes: go straight to the return.  */
 429
 430         add     3,3,0         /* The vector copies left r3 and r12 unchanged, */
 431         add     12,12,0       /* so step both past the quadwords already copied.  */
 432
 433         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 434 8:      /* Copy 8 bytes.  */
 435         bf      28,4f
 436
 437         lwz     6,0(12)
 438         lwz     7,4(12)
 439         addi    12,12,8
 440         stw     6,0(3)
 441         stw     7,4(3)
 442         addi    3,3,8
 443 4:      /* Copy 4 bytes.  */
 444         bf      29,2f
 445
 446         lwz     6,0(12)
 447         addi    12,12,4
 448         stw     6,0(3)
 449         addi    3,3,4
 450 2:      /* Copy 2 bytes.  */
 451         bf      30,1f
 452
 453         lhz     6,0(12)
 454         addi    12,12,2
 455         sth     6,0(3)
 456         addi    3,3,2
 457 1:      /* Copy 1 byte.  */
 458         bf      31,0f
 459
 460         lbz     6,0(12)
 461         stb     6,0(3)
 462 0:      /* Return DST + LEN pointer.  */
 463         ld      31,-8(1)      /* Restore the r31 value saved at entry.  */
 464         ld      3,-16(1)      /* Reload the original DST saved at entry.  */
 465         add     3,3,5
 466         blr
 467         /* Macros defined outside this file.  The last three use literal names, not MEMPCPY.  */
 468 END_GEN_TB (MEMPCPY,TB_TOCLESS)
 469 libc_hidden_def (__mempcpy)
 470 weak_alias (__mempcpy, mempcpy)
 471 libc_hidden_builtin_def (mempcpy)