sysdeps/alpha/alphaev6/memcpy.S

   1 /* Copyright (C) 2000-2018 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 /*
  20  * Much of the information about 21264 scheduling/coding comes from:
  21  *      Compiler Writer's Guide for the Alpha 21264
  22  *      abbreviated as 'CWG' in other comments here
  23  *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  24  * Scheduling notation:
  25  *      E       - either cluster
  26  *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  27  *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  28  *
  29  * Temp usage notes:
  30  *      $0              - destination address
  31  *      $1,$2,          - scratch
  32  */
  33
  34 #include <sysdep.h>
  35
  36         .arch ev6                       /* select EV6 as the target processor */
  37         .set noreorder                  /* instruction order below is hand-scheduled; ask for it to be kept as written */
  38         .set noat                       /* do not let macro expansion use the assembler temporary register */
  39
  40 ENTRY(memcpy)
  41         .prologue 0
  42
             /*
              * Copy $18 bytes from the source buffer at $17 to the destination
              * buffer at $16.  A count that is zero or negative (as a signed
              * value) copies nothing.
              *
              * Register roles:
              *      $16     - dest pointer on the same-alignment paths, advanced
              *                as bytes are stored; in the $mis_quad loop it is
              *                reused to hold loaded data
              *      $17     - source pointer, advanced as bytes are loaded
              *      $18     - bytes still to copy
              *      $0      - original dest, set once and never changed again
              *      $4      - dest pointer on the $misaligned path (also a data
              *                temporary in $unroll_body)
              *      $7      - address passed to wh64 in $unroll_body
              *      $1 - $6 - temporaries
              *
              * Strategy:
              *      source and dest equal mod 8:
              *              bytes until both are 0mod8 ($head_align), then for
              *              counts above 127 quads until dest is 0mod64
              *              ($single_head_quad) and 64 bytes per trip
              *              ($unroll_body); finally single quads ($move_a_quad)
              *              and single bytes ($tail_bytes).
              *      otherwise ($misaligned):
              *              bytes until dest is 0mod8 ($aligndest), then aligned
              *              quad stores assembled from pairs of ldq_u loads
              *              ($mis_quad), then single bytes ($misalign_byte).
              */
  43         mov     $16, $0                 # E : copy dest to return ($0 is not written again)
  44         ble     $18, $nomoredata        # U : count <= 0 - done with the copy
  45         xor     $16, $17, $1            # E : are source and dest alignments the same?
  46         and     $1, 7, $1               # E : are they the same mod 8? ($1 == 0 if so)
  47
  48         bne     $1, $misaligned         # U : Nope - gotta do this the slow way
  49         /* source and dest are same mod 8 address */
  50         and     $16, 7, $1              # E : Are both 0mod8?
  51         beq     $1, $both_0mod8         # U : Yes - no head bytes to move
  52         nop                             # E :
  53
  54         /*
  55          * source and dest are same misalignment.  move a byte at a time
  56          * until a 0mod8 alignment for both is reached.
  57          * At least one byte more to move
  58          */
  59
  60 $head_align:
  61         ldbu    $1, 0($17)              # L : grab a byte
  62         subq    $18, 1, $18             # E : count--
  63         addq    $17, 1, $17             # E : src++
  64         stb     $1, 0($16)              # L : store it at dest
  65         addq    $16, 1, $16             # E : dest++
  66         and     $16, 7, $1              # E : Are we at 0mod8 yet?
  67         ble     $18, $nomoredata        # U : count used up before alignment was reached?
  68         bne     $1, $head_align         # U : not 0mod8 yet - move another byte
  69
  70 $both_0mod8:
  71         cmple   $18, 127, $1            # E : Can we unroll the loop? ($1 = count <= 127)
  72         bne     $1, $no_unroll          # U : No - 127 bytes or fewer, use the plain quad loop
  73         and     $16, 63, $1             # E : get mod64 alignment
  74         beq     $1, $do_unroll          # U : no single quads to fiddle
  75
             /*
              * Move single quads until dest is 0mod64.  count is at least 128
              * on entry and dest is 0mod8, so at most seven quads (56 bytes)
              * are moved here; that is why the loop does not test count.
              */
  76 $single_head_quad:
  77         ldq     $1, 0($17)              # L : get 8 bytes
  78         subq    $18, 8, $18             # E : count -= 8
  79         addq    $17, 8, $17             # E : src += 8
  80         nop                             # E :
  81
  82         stq     $1, 0($16)              # L : store
  83         addq    $16, 8, $16             # E : dest += 8
  84         and     $16, 63, $1             # E : get mod64 alignment
  85         bne     $1, $single_head_quad   # U : still not fully aligned
  86
  87 $do_unroll:
  88         addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
  89         cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
  90         bne     $1, $tail_quads         # U : Nope
  91         nop                             # E :
  92
  93 $unroll_body:
             /*
              * Each trip moves 64 bytes as two 32-byte halves.  $7 is the
              * address for the wh64 at the top of the trip; it starts out one
              * 64-byte block ahead of dest.  Inside the trip $7 is advanced by
              * 64, but if fewer than 192 bytes were left when the trip began
              * the cmovlt replaces it with $1 (dest + 64 taken before dest is
              * advanced), which is exactly where dest will point at the top of
              * the next trip.  NOTE(review): presumably this keeps wh64 from
              * naming a block that will not be completely overwritten -
              * confirm against the wh64 description.
              */
  94         wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
  95                                         # ($7) are about to be over-written
  96         ldq     $6, 0($17)              # L0 : bytes 0..7
  97         nop                             # E :
  98         nop                             # E :
  99
 100         ldq     $4, 8($17)              # L : bytes 8..15
 101         ldq     $5, 16($17)             # L : bytes 16..23
 102         addq    $7, 64, $7              # E : Update next wh64 address
 103         nop                             # E :
 104
 105         ldq     $3, 24($17)             # L : bytes 24..31
 106         addq    $16, 64, $1             # E : fallback value for wh64 (dest of the next trip)
 107         nop                             # E :
 108         nop                             # E :
 109
 110         addq    $17, 32, $17            # E : src += 32 bytes
 111         stq     $6, 0($16)              # L : bytes 0..7
 112         nop                             # E :
 113         nop                             # E :
 114
 115         stq     $4, 8($16)              # L : bytes 8..15
 116         stq     $5, 16($16)             # L : bytes 16..23
 117         subq    $18, 192, $2            # E : At least two more trips to go? ($2 < 0 if not)
 118         nop                             # E :
 119
 120         stq     $3, 24($16)             # L : bytes 24..31
 121         addq    $16, 32, $16            # E : dest += 32 bytes
 122         nop                             # E :
 123         nop                             # E :
 124
 125         ldq     $6, 0($17)              # L : bytes 0..7
 126         ldq     $4, 8($17)              # L : bytes 8..15
 127         cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
 128                                         # fallback wh64 address if < 2 more trips
 129         nop                             # E :
 130
 131         ldq     $5, 16($17)             # L : bytes 16..23
 132         ldq     $3, 24($17)             # L : bytes 24..31
 133         addq    $16, 32, $16            # E : dest += 32 (second half is stored at negative offsets)
 134         subq    $18, 64, $18            # E : count -= 64
 135
 136         addq    $17, 32, $17            # E : src += 32
 137         stq     $6, -32($16)            # L : bytes 0..7
 138         stq     $4, -24($16)            # L : bytes 8..15
 139         cmple   $18, 63, $1             # E : At least one more trip?
 140
 141         stq     $5, -16($16)            # L : bytes 16..23
 142         stq     $3, -8($16)             # L : bytes 24..31
 143         nop                             # E :
 144         beq     $1, $unroll_body        # count > 63 - go around for another 64 bytes
 145
 146 $tail_quads:
 147 $no_unroll:
 148         .align 4
 149         subq    $18, 8, $18             # E : At least a quad left? (count stays biased by -8 in the quad loop)
 150         blt     $18, $less_than_8       # U : Nope
 151         nop                             # E :
 152         nop                             # E :
 153
 154 $move_a_quad:
 155         ldq     $1, 0($17)              # L : fetch 8
 156         subq    $18, 8, $18             # E : count -= 8
 157         addq    $17, 8, $17             # E : src += 8
 158         nop                             # E :
 159
 160         stq     $1, 0($16)              # L : store 8
 161         addq    $16, 8, $16             # E : dest += 8
 162         bge     $18, $move_a_quad       # U : biased count >= 0 - another whole quad remains
 163         nop                             # E :
 164
 165 $less_than_8:
 166         .align 4
 167         addq    $18, 8, $18             # E : add back for trailing bytes (undo the -8 bias)
 168         ble     $18, $nomoredata        # U : All-done
 169         nop                             # E :
 170         nop                             # E :
 171
 172         /* Trailing bytes */
 173 $tail_bytes:
 174         subq    $18, 1, $18             # E : count--
 175         ldbu    $1, 0($17)              # L : fetch a byte
 176         addq    $17, 1, $17             # E : src++
 177         nop                             # E :
 178
 179         stb     $1, 0($16)              # L : store a byte
 180         addq    $16, 1, $16             # E : dest++
 181         bgt     $18, $tail_bytes        # U : more to be done?
 182         nop                             # E :
 183
 184         /* branching to exit takes 3 extra cycles, so replicate exit here */
 185         ret     $31, ($26), 1           # L0 :
 186         nop                             # E :
 187         nop                             # E :
 188         nop                             # E :
 189
 190 $misaligned:
             /*
              * Source and dest differ mod 8.  The dest pointer for this path
              * lives in $4, seeded from $0 (still equal to $16 at this point),
              * because $16 is overwritten with loaded data in $mis_quad.
              */
 191         mov     $0, $4                  # E : dest temp
 192         and     $0, 7, $1               # E : dest alignment mod8
 193         beq     $1, $dest_0mod8         # U : dest already 0mod8 - skip the byte loop
 194         nop
 195
 196 $aligndest:
 197         ble     $18, $nomoredata        # U : count used up - nothing left to move
 198         ldbu    $1, 0($17)              # L : fetch a byte
 199         subq    $18, 1, $18             # E : count--
 200         addq    $17, 1, $17             # E : src++
 201
 202         stb     $1, 0($4)               # L : store it
 203         addq    $4, 1, $4               # E : dest++
 204         and     $4, 7, $1               # E : dest 0mod8 yet?
 205         bne     $1, $aligndest          # U : go until we are aligned.
 206
 207         /* Source has unknown alignment, but dest is known to be 0mod8 */
 208 $dest_0mod8:
 209         subq    $18, 8, $18             # E : At least a quad left? (count stays biased by -8 in the quad loop)
 210         blt     $18, $misalign_tail     # U : Nope
 211         ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
 212         nop                             # E :
 213
 214 $mis_quad:
             /*
              * $3 holds the aligned source quad that contains the byte at $17,
              * $16 receives the aligned quad that follows it.  extql and extqh
              * both take their byte shift from the low bits of $17, and the two
              * results are ORed into the quad that is stored at the aligned
              * dest.  The newly loaded quad then becomes $3 for the next trip.
              */
 215         ldq_u   $16, 8($17)             # L : Fetch next 8
 216         extql   $3, $17, $3             # U : masking - low part, taken from the current quad
 217         extqh   $16, $17, $1            # U : masking - high part, taken from the next quad
 218         bis     $3, $1, $1              # E : merged bytes to store
 219
 220         subq    $18, 8, $18             # E : count -= 8
 221         addq    $17, 8, $17             # E : src += 8
 222         stq     $1, 0($4)               # L : store 8 (aligned)
 223         mov     $16, $3                 # E : "rotate" source data
 224
 225         addq    $4, 8, $4               # E : dest += 8
 226         bge     $18, $mis_quad          # U : More quads to move
 227         nop
 228         nop
 229
 230 $misalign_tail:
 231         addq    $18, 8, $18             # E : account for tail stuff (undo the -8 bias)
 232         ble     $18, $nomoredata        # U : no trailing bytes - done
 233         nop
 234         nop
 235
 236 $misalign_byte:
 237         ldbu    $1, 0($17)              # L : fetch 1
 238         subq    $18, 1, $18             # E : count--
 239         addq    $17, 1, $17             # E : src++
 240         nop                             # E :
 241
 242         stb     $1, 0($4)               # L : store
 243         addq    $4, 1, $4               # E : dest++
 244         bgt     $18, $misalign_byte     # U : more to go?
 245         nop
 246
 247
 248 $nomoredata:
 249         ret     $31, ($26), 1           # L0 : return through $26; $0 still holds the original dest
 250         nop                             # E :
 251         nop                             # E :
 252         nop                             # E :
 253
 254 END(memcpy)
 255 libc_hidden_builtin_def (memcpy)