/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define tmp1	x14
/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration (an illustrative C-level sketch of
   this dispatch follows this comment).

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals, since the former would unnecessarily break across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   bumping the small-copy limit up to 32 bytes allows us to do that without
   cost and also allows us to reduce the size of the prep code before loop64.

   All copies are done only via two registers, x6 and x7 (A_l/A_h).  This
   is to ensure that all loads hit a single hardware prefetcher which can
   get correctly trained to prefetch a single stream.

   The non-temporal stores help optimize cache utilization.  */
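
/* Illustrative C-level sketch of the size dispatch described above.  It is
   not part of the build; copy_small, copy_medium and copy_large are
   hypothetical helpers standing in for the L(copy32), medium and
   L(copy_long) paths below:

     void *__memcpy_falkor_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 32)
         copy_small (dst, src, n);    // L(copy32): overlapping head/tail
       else if (n <= 128)
         copy_medium (dst, src, n);   // fully unrolled head + tail copies
       else
         copy_large (dst, src, n);    // align src, 64-byte loop, L(last64)
       return dst;
     }  */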

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

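	/* Compute the end pointers and dispatch on size: branch to the small
	   path for at most 32 bytes; otherwise copy the first 16 bytes
	   eagerly, branch to the large path for more than 128 bytes, and
	   fall through for 33..128 bytes.  */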
	cmp	count, 32
	add	srcend, src, count
	add	dstend, dstin, count
	b.ls	L(copy32)
	ldp	A_l, A_h, [src]
	cmp	count, 128
	stp	A_l, A_h, [dstin]
	b.hi	L(copy_long)

	/* Medium copies: 33..128 bytes.  */
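	/* The first 16 bytes were copied at entry.  Copy bytes 16..31 from
	   the start and the last 32 bytes from the end; when bit 6 of
	   count - 1 is set (count > 64), also copy bytes 32..63 from the
	   start and 32 more bytes from the end.  Head and tail overlap as
	   needed, so every size up to 128 is covered.  */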
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src, 16]
	stp	A_l, A_h, [dstin, 16]
	tbz	tmp1, 6, 1f
	ldp	A_l, A_h, [src, 32]
	stp	A_l, A_h, [dstin, 32]
	ldp	A_l, A_h, [src, 48]
	stp	A_l, A_h, [dstin, 48]
	ldp	A_l, A_h, [srcend, -64]
	stp	A_l, A_h, [dstend, -64]
	ldp	A_l, A_h, [srcend, -48]
	stp	A_l, A_h, [dstend, -48]
1:
	ldp	A_l, A_h, [srcend, -32]
	stp	A_l, A_h, [dstend, -32]
	ldp	A_l, A_h, [srcend, -16]
	stp	A_l, A_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..32 bytes.  */
L(copy32):
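	/* Each class below copies from the start and (except for the final
	   0-1 case) from the end with accesses of a single width; the two
	   accesses may overlap, so no byte-by-byte loop is needed.  */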
	/* 16-32 */
	cmp	count, 16
	b.lo	1f
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dstin]
	ldp	A_l, A_h, [srcend, -16]
	stp	A_l, A_h, [dstend, -16]
	ret
	.p2align 4
1:
	/* 8-15 */
	tbz	count, 3, 1f
	ldr	A_l, [src]
	str	A_l, [dstin]
	ldr	A_l, [srcend, -8]
	str	A_l, [dstend, -8]
	ret
	.p2align 4
1:
	/* 4-7 */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	str	A_lw, [dstin]
	ldr	A_lw, [srcend, -4]
	str	A_lw, [dstend, -4]
	ret
	.p2align 4
1:
	/* 2-3 */
	tbz	count, 1, 1f
	ldrh	A_lw, [src]
	strh	A_lw, [dstin]
	ldrh	A_lw, [srcend, -2]
	strh	A_lw, [dstend, -2]
	ret
	.p2align 4
1:
	/* 0-1 */
	tbz	count, 0, 1f
	ldrb	A_lw, [src]
	strb	A_lw, [dstin]
1:
	ret

	/* Align SRC to 16 bytes and copy; that way at least one of the
	   accesses is aligned throughout the copy sequence.

	   The count is off by 0 to 15 bytes, but this is OK because we trim
	   off the last 64 bytes and copy them from the end.  Due to this the
	   loop never runs out of bounds.  */
	.p2align 6
L(copy_long):
	sub	count, count, 64 + 16
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1
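	/* src is now 16-byte aligned and dst carries the same downward bias,
	   so [src, off] and [dst, off] still address corresponding bytes.
	   The loop below starts copying at [src, 16], i.e. 16 - tmp1 bytes
	   into the original buffer; everything before that was covered by
	   the initial 16-byte copy at entry.  */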

L(loop64):
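	/* Copy 64 bytes per iteration: four pre-indexed 16-byte loads through
	   A_l/A_h, four non-temporal store pairs at dst + 16..64, then
	   advance dst by 64.  Using a single register pair keeps all loads
	   in one prefetch stream (see the design notes above).  */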
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]!
	subs	count, count, 64
	stnp	A_l, A_h, [dst, 32]
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 48]
	ldp	A_l, A_h, [src, 16]!
	stnp	A_l, A_h, [dst, 64]
	add	dst, dst, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(last64):
	ldp	A_l, A_h, [srcend, -64]
	stnp	A_l, A_h, [dstend, -64]
	ldp	A_l, A_h, [srcend, -48]
	stnp	A_l, A_h, [dstend, -48]
	ldp	A_l, A_h, [srcend, -32]
	stnp	A_l, A_h, [dstend, -32]
	ldp	A_l, A_h, [srcend, -16]
	stnp	A_l, A_h, [dstend, -16]
	ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)
#endif