/* Copyright (C) 1996-2021 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

/* Copy a null-terminated string from SRC to DST.

   This is an internal routine used by strcpy, stpcpy, and strcat.
   As such, it uses special linkage conventions to make implementation
   of these public functions more efficient.

   On input:
        t9 = return address
        a0 = DST
        a1 = SRC

   On output:
        t8 = bitmask (with one bit set) indicating the last byte written
        a0 = unaligned address of the last *word* written

   Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/

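/* A rough C model of what those two outputs encode (illustrative only;
   the helper below is hypothetical and not part of the build):

     // Recover the address of the copied NUL terminator from the
     // register results described above.
     static inline char *
     last_byte_addr (unsigned long a0_out, unsigned long t8_out)
     {
       // a0_out: unaligned address of the last word written
       // t8_out: exactly one bit set, marking the last byte written
       return (char *) (a0_out & ~7UL) + __builtin_ctzl (t8_out);
     }

   strcpy can ignore this information, while stpcpy needs precisely this
   address for its return value.  */
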
/* This is generally scheduled for the EV5, but should still be pretty
   good for the EV4 too.  */

#include <sysdep.h>

        .set noat
        .set noreorder

        .text
        .type   __stxcpy, @function
        .globl  __stxcpy
        .usepv  __stxcpy, no

        cfi_startproc
        cfi_return_column (t9)

        /* On entry to this basic block:
           t0 == the first destination word for masking back in
           t1 == the first source word.  */
        .align 3
stxcpy_aligned:
        /* Create the 1st output word and detect 0's in the 1st input word.  */
        lda     t2, -1          # e1 : build a mask against false zero
        mskqh   t2, a1, t2      # e0 :   detection in the src word
        mskqh   t1, a1, t3      # e0 :
        ornot   t1, t2, t2      # .. e1 :
        mskql   t0, a1, t0      # e0 : assemble the first output word
        cmpbge  zero, t2, t7    # .. e1 : bits set iff null found
        or      t0, t3, t1      # e0 :
        bne     t7, $a_eos      # .. e1 :

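/* The cmpbge above is the heart of the terminator search: cmpbge zero,
   x, y sets bit i of y exactly when byte i of x is zero.  A C model of
   that one instruction (illustrative only; Alpha here is little-endian,
   so byte 0 is the least significant byte):

     static inline unsigned long
     cmpbge_zero (unsigned long x)
     {
       unsigned long mask = 0;
       for (int i = 0; i < 8; i++)
         if (((x >> (8 * i)) & 0xff) == 0)  // byte i of x is zero ...
           mask |= 1UL << i;                // ... so set bit i of the result
       return mask;
     }

   The mskqh of an all-ones word just before forces the garbage bytes
   that precede the string's first byte to be nonzero, so they can never
   be mistaken for the terminator.  */
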
        /* On entry to this basic block:
           t0 == the first destination word for masking back in
           t1 == a source word not containing a null.  */
$a_loop:
        stq_u   t1, 0(a0)       # e0 :
        addq    a0, 8, a0       # .. e1 :
        ldq_u   t1, 0(a1)       # e0 :
        addq    a1, 8, a1       # .. e1 :
        cmpbge  zero, t1, t7    # e0 (stall)
        beq     t7, $a_loop     # .. e1 (zdb)

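/* In C terms, the loop above is a word-at-a-time copy that stops as soon
   as a freshly loaded word contains a zero byte (a sketch only: dst and
   src stand for unsigned long pointers that are already co-aligned, word
   is the current NUL-free source word, and cmpbge_zero is the helper
   modeled earlier):

     while (cmpbge_zero (word) == 0)
       {
         *dst++ = word;    // store the previous, known NUL-free word
         word = *src++;    // fetch the next aligned source word
       }
     // On exit, word is the source word containing the terminator.

   Exactly one word is in flight at a time: it is loaded and tested in
   one iteration and only stored once it is known to be free of the
   terminator.  */
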
        /* Take care of the final (partial) word store.
           On entry to this basic block we have:
           t1 == the source word containing the null
           t7 == the cmpbge mask that found it.  */
$a_eos:
        negq    t7, t6          # e0 : find low bit set
        and     t7, t6, t8      # e1 (stall)

        /* For the sake of the cache, don't read a destination word
           if we're not going to need it.  */
        and     t8, 0x80, t6    # e0 :
        bne     t6, 1f          # .. e1 (zdb)

        /* We're doing a partial word store and so need to combine
           our source and original destination words.  */
        ldq_u   t0, 0(a0)       # e0 :
        subq    t8, 1, t6       # .. e1 :
        zapnot  t1, t6, t1      # e0 : clear src bytes >= null
        or      t8, t6, t7      # .. e1 :
        zap     t0, t7, t0      # e0 : clear dst bytes <= null
        or      t0, t1, t1      # e1 :

1:      stq_u   t1, 0(a0)       # e0 :
        ret     (t9)            # .. e1 :

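/* Two small bit tricks drive the tail code above (a sketch, with "mask"
   standing in for t7, the cmpbge result for the word holding the NUL):

     unsigned long t8       = mask & -mask;    // negq + and: lowest set bit,
                                               // i.e. the byte holding the NUL
     unsigned long keep_src = t8 - 1;          // bytes strictly below the NUL
     unsigned long zap_dst  = t8 | keep_src;   // bytes at or below the NUL

   zapnot keeps only the source bytes selected by keep_src, zap clears
   the destination bytes selected by zap_dst, and or-ing the two gives a
   word whose low bytes come from the source, whose NUL byte is zero,
   and whose upper bytes are the old destination contents.  When the NUL
   lands in byte 7 (t8 & 0x80), the whole word belongs to the source and
   the destination read is skipped entirely.  */
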
        .align 3
__stxcpy:
        /* Are source and destination co-aligned?  */
        xor     a0, a1, t0      # e0 :
        unop                    #   :
        and     t0, 7, t0       # e0 :
        bne     t0, $unaligned  # .. e1 :

        /* We are co-aligned; take care of a partial first word.  */
        ldq_u   t1, 0(a1)       # e0 : load first src word
        and     a0, 7, t0       # .. e1 : take care not to load a word ...
        addq    a1, 8, a1       # e0 :
        beq     t0, stxcpy_aligned # .. e1 : ... if we won't need it
        ldq_u   t0, 0(a0)       # e0 :
        br      stxcpy_aligned  # .. e1 :

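/* The dispatch above amounts to a single C test (sketch only; the casts
   merely make the register-level arithmetic explicit):

     if ((((unsigned long) dst ^ (unsigned long) src) & 7) != 0)
       goto unaligned_case;   // pointers disagree modulo 8

   When the two pointers agree modulo 8 we can drop straight into the
   aligned loop after peeling off the destination's leading bytes.  Note
   that ldq_u ignores the low three address bits, so the first source
   load is safe even when src itself is not 8-byte aligned.  */
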
        /* The source and destination are not co-aligned.  Align the destination
           and cope.  We have to be very careful about not reading too much and
           causing a SEGV.  */

        .align 3
$u_head:
        /* We know just enough now to be able to assemble the first
           full source word.  We can still find a zero at the end of it
           that prevents us from outputting the whole thing.

           On entry to this basic block:
           t0 == the first dest word, for masking back in, if needed else 0
           t1 == the low bits of the first source word
           t6 == bytemask that is -1 in dest word bytes */

        ldq_u   t2, 8(a1)       # e0 :
        addq    a1, 8, a1       # .. e1 :

        extql   t1, a1, t1      # e0 :
        extqh   t2, a1, t4      # e0 :
        mskql   t0, a0, t0      # e0 :
        or      t1, t4, t1      # .. e1 :
        mskqh   t1, a0, t1      # e0 :
        or      t0, t1, t1      # e1 :

        or      t1, t6, t6      # e0 :
        cmpbge  zero, t6, t7    # .. e1 :
        lda     t6, -1          # e0 : for masking just below
        bne     t7, $u_final    # .. e1 :

        mskql   t6, a1, t6      # e0 : mask out the bits we have
        or      t6, t2, t2      # e1 :   already extracted before
        cmpbge  zero, t2, t7    # e0 : testing eos
        bne     t7, $u_late_head_exit # .. e1 (zdb)

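/* The extql/extqh pair above implements the classic Alpha recipe for
   reading an unaligned quadword: combine the two aligned words that
   straddle it.  A C model (illustrative only; ofs is the source
   misalignment relative to the destination, i.e. a1 & 7 after the
   adjustment made at $unaligned, and it is nonzero on this path):

     static inline unsigned long
     merge_unaligned (unsigned long lo, unsigned long hi, unsigned int ofs)
     {
       return (lo >> (8 * ofs))          // extql: high bytes of the low word
            | (hi << (8 * (8 - ofs)));   // extqh: low bytes of the high word
     }

   Every output word below is produced this way from two consecutive
   aligned source words.  */
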
        /* Finally, we've got all the stupid leading edge cases taken care
           of and we can set up to enter the main loop.  */

        stq_u   t1, 0(a0)       # e0 : store first output word
        addq    a0, 8, a0       # .. e1 :
        extql   t2, a1, t0      # e0 : position ho-bits of lo word
        ldq_u   t2, 8(a1)       # .. e1 : read next high-order source word
        addq    a1, 8, a1       # e0 :
        cmpbge  zero, t2, t7    # .. e1 :
        nop                     # e0 :
        bne     t7, $u_eos      # .. e1 :

        /* Unaligned copy main loop.  In order to avoid reading too much,
           the loop is structured to detect zeros in aligned source words.
           This has, unfortunately, effectively pulled half of a loop
           iteration out into the head and half into the tail, but it does
           prevent nastiness from accumulating in the very thing we want
           to run as fast as possible.

           On entry to this basic block:
           t0 == the shifted high-order bits from the previous source word
           t2 == the unshifted current source word

           We further know that t2 does not contain a null terminator.  */

        .align 3
$u_loop:
        extqh   t2, a1, t1      # e0 : extract high bits for current word
        addq    a1, 8, a1       # .. e1 :
        extql   t2, a1, t3      # e0 : extract low bits for next time
        addq    a0, 8, a0       # .. e1 :
        or      t0, t1, t1      # e0 : current dst word now complete
        ldq_u   t2, 0(a1)       # .. e1 : load high word for next time
        stq_u   t1, -8(a0)      # e0 : save the current word
        mov     t3, t0          # .. e1 :
        cmpbge  zero, t2, t7    # e0 : test new word for eos
        beq     t7, $u_loop     # .. e1 :

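/* A C outline of that software pipeline (sketch only: src and dst are
   unsigned long pointers to aligned words, ofs is the fixed misalignment
   as above, cur is the current aligned source word, already known to be
   NUL-free, and carry holds its high bytes shifted into position for the
   next store):

     for (;;)
       {
         unsigned long out = carry | (cur << (8 * (8 - ofs)));  // extqh + or
         carry = cur >> (8 * ofs);                              // extql, for next time
         cur = *++src;                                          // next aligned word
         *dst++ = out;                                          // store completed word
         if (cmpbge_zero (cur) != 0)                            // terminator reached?
           break;
       }

   Each aligned source word is loaded and tested exactly once, so the
   loop never touches memory beyond the word that contains the
   terminator.  */
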
        /* We've found a zero somewhere in the source word we just read.
           If it resides in the lower half, we have one (probably partial)
           word to write out, and if it resides in the upper half, we
           have one full and one partial word left to write out.

           On entry to this basic block:
           t0 == the shifted high-order bits from the previous source word
           t2 == the unshifted current source word.  */
$u_eos:
        extqh   t2, a1, t1      # e0 :
        or      t0, t1, t1      # e1 : first (partial) source word complete

        cmpbge  zero, t1, t7    # e0 : is the null in this first word?
        bne     t7, $u_final    # .. e1 (zdb)

$u_late_head_exit:
        stq_u   t1, 0(a0)       # e0 : the null was in the high-order bits
        addq    a0, 8, a0       # .. e1 :
        extql   t2, a1, t1      # e0 :
        cmpbge  zero, t1, t7    # .. e1 :
        /* Take care of a final (probably partial) result word.
           On entry to this basic block:
           t1 == assembled source word
           t7 == cmpbge mask that found the null.  */
$u_final:
        negq    t7, t6          # e0 : isolate low bit set
        and     t6, t7, t8      # e1 :

        and     t8, 0x80, t6    # e0 : avoid dest word load if we can
        bne     t6, 1f          # .. e1 (zdb)

        ldq_u   t0, 0(a0)       # e0 :
        subq    t8, 1, t6       # .. e1 :
        or      t6, t8, t7      # e0 :
        zapnot  t1, t6, t1      # .. e1 : kill source bytes >= null
        zap     t0, t7, t0      # e0 : kill dest bytes <= null
        or      t0, t1, t1      # e1 :

1:      stq_u   t1, 0(a0)       # e0 :
        ret     (t9)            # .. e1 :

        /* Unaligned copy entry point.  */
        .align 3
$unaligned:

        ldq_u   t1, 0(a1)       # e0 : load first source word

        and     a0, 7, t4       # .. e1 : find dest misalignment
        and     a1, 7, t5       # e0 : find src misalignment

        /* Conditionally load the first destination word and a bytemask
           with 0xff indicating that the destination byte is sacrosanct.  */

        mov     zero, t0        # .. e1 :
        mov     zero, t6        # e0 :
        beq     t4, 1f          # .. e1 :
        ldq_u   t0, 0(a0)       # e0 :
        lda     t6, -1          # .. e1 :
        mskql   t6, a0, t6      # e0 :
1:
        subq    a1, t4, a1      # .. e1 : sub dest misalignment from src addr

        /* If source misalignment is larger than dest misalignment, we need
           extra startup checks to avoid SEGV.  */

        cmplt   t4, t5, t8      # e0 :
        beq     t8, $u_head     # .. e1 (zdb)

        lda     t2, -1          # e1 : mask out leading garbage in source
        mskqh   t2, t5, t2      # e0 :
        nop                     # e0 :
        ornot   t1, t2, t3      # .. e1 :
        cmpbge  zero, t3, t7    # e0 : is there a zero?
        beq     t7, $u_head     # .. e1 (zdb)

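/* The guard above only matters when the source is more misaligned than
   the destination: in that case the second source word that $u_head
   reads may lie entirely beyond the string's terminator, so it is read
   only after verifying that the terminator is not already in the first
   word.  A C model of the test (sketch only; first_word is the aligned
   quadword ldq_u fetched at the original src, src_ofs = src & 7, which
   is nonzero on this path, and cmpbge_zero is the helper modeled
   earlier):

     static inline int
     nul_in_first_word (unsigned long first_word, unsigned int src_ofs)
     {
       // Force the garbage bytes that precede the string to be nonzero,
       // then look for a zero among the bytes that really belong to it.
       unsigned long masked = first_word | ~(~0UL << (8 * src_ofs));
       return cmpbge_zero (masked) != 0;
     }

   If the test fires we fall through to the special exit below, which
   finishes the copy without reading any further source word.  */
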
        /* At this point we've found a zero in the first partial word of
           the source.  We need to isolate the valid source data and mask
           it into the original destination data.  (Incidentally, we know
           that we'll need at least one byte of that original dest word.)  */

        ldq_u   t0, 0(a0)       # e0 :

        negq    t7, t6          # .. e1 : build bitmask of bytes <= zero
        and     t6, t7, t8      # e0 :
        and     a1, 7, t5       # .. e1 :
        subq    t8, 1, t6       # e0 :
        or      t6, t8, t7      # e1 :
        srl     t8, t5, t8      # e0 : adjust final null return value

        zapnot  t2, t7, t2      # .. e1 : prepare source word; mirror changes
        and     t1, t2, t1      # e1 :   to source validity mask
        extql   t2, a1, t2      # .. e0 :
        extql   t1, a1, t1      # e0 :

        andnot  t0, t2, t0      # .. e1 : zero place for source to reside
        or      t0, t1, t1      # e1 :   and put it there
        stq_u   t1, 0(a0)       # .. e0 :
        ret     (t9)

        cfi_endproc