1 /* Copyright (C) 2000-2015 Free Software Foundation, Inc.
2 Contributed by Richard Henderson (rth@tamu.edu)
3 EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library. If not, see
18 <http://www.gnu.org/licenses/>. */
/* Copy a null-terminated string from SRC to DST.

   This is an internal routine used by strcpy, stpcpy, and strcat.
   As such, it uses special linkage conventions to make implementation
   of these public functions more efficient.

   On input:
	t9 = return address
	a0 = DST
	a1 = SRC

   On output:
	t8  = bitmask (with one bit set) indicating the last byte written
	a0  = unaligned address of the last *word* written

   Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/
/* Mark __stxcpy as a function symbol.
   NOTE(review): no .globl, .text or cfi_startproc directive is visible in
   this excerpt -- confirm they are present in the full file.  */
46 .type __stxcpy, @function
/* Every exit in this file is `ret (t9)', so t9 (not the usual return
   register) is declared as the CFI return column.  */
51 cfi_return_column (t9)
/* stxcpy_aligned: build the first output word of a co-aligned copy.

   On entry to this basic block:
   t0 == the first destination word for masking back in
   t1 == the first source word
   a1 == source pointer; only its low three bits (the byte offset of
	 the string start within the word) are used here.

   Falls through into the aligned copy loop when the first word holds
   no terminator; otherwise branches to $a_eos with
   t1 == the word to store and t10 == the cmpbge mask.  */
	.align 4
stxcpy_aligned:
	/* Create the 1st output word and detect 0's in the 1st input word.
	   Source bytes that lie before the start of the string are forced
	   to 0xff in t2 so that a stray zero there is not mistaken for the
	   terminator.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U : t3 = src bytes from the start offset up
	ornot	t1, t2, t2	# E : t2 = src with leading bytes = 0xff (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t10	# E : bits set iff null found
	or	t0, t3, t1	# E : dst bytes below offset | src bytes (stall)
	bne	t10, $a_eos	# U : (stall)
/* Aligned copy main loop.

   On entry to this basic block:
   t0 == the first destination word for masking back in
   t1 == a source word not containing a null
   a0 == address at which t1 is to be stored
   a1 == address of the next source word to read.

   Each iteration stores one full word and fetches the next one; the
   loop is left (falling through) as soon as the fetched word contains
   a zero byte, with that word in t1 and its cmpbge mask in t10.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L : write the completed word
	addq	a0, 8, a0	# E : advance destination
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E : advance source
	cmpbge	zero, t1, t10	# E : (3 cycle stall)
	beq	t10, $a_loop	# U : (stall for t10)
/* Take care of the final (partial) word store.
   On entry to this basic block we have:
   t1 == the source word containing the null
   t10 == the cmpbge mask that found it
   a0 == address of the word to be written.

   On exit (return through t9):
   t8 == single-bit byte mask marking the terminator's position.  */
$a_eos:
	negq	t10, t6		# E : find low bit set
	and	t10, t6, t8	# E : t8 = lowest set bit of t10 (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  A null in byte 7 means every
	   byte of the word comes from the source.  */
	and	t8, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t8, 1, t6	# E : t6 = mask of the bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t8, t6, t10	# E : t10 = bytes up to and incl. null (stall)

	zap	t0, t10, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop
/* __stxcpy entry point.
   a0 = DST, a1 = SRC, return address in t9.

   Dispatches to $unaligned when the two pointers differ in their low
   three bits; otherwise loads the first source word (and, only if DST
   is not word aligned, the first destination word) and continues at
   stxcpy_aligned.  */
	.align 4
__stxcpy:
	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E : differing address bits
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)		# L : load first src word
	and	a0, 7, t0		# E : take care not to load a word ...
	addq	a1, 8, a1		# E : step past the word just loaded
	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop
/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L : next source word
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U : keep dst bytes below the dst offset
	or	t1, t4, t1	# E : first source word, dst-aligned
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	/* Test the assembled word for a null, with the preserved
	   destination bytes forced non-zero via t6.  */
	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t10	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t10, $u_final	# U : (stall)

	mskql	t6, a1, t6		# U : mask out the bits we have
	or	t6, t2, t2		# E :   already extracted before (stall)
	cmpbge	zero, t2, t10		# E :   testing eos (stall)
	bne	t10, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# U : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t10	# E : (stall for t2)
	nop			# E :
	bne	t10, $u_eos	# U : (stall)
/* Unaligned copy main loop.  In order to avoid reading too much,
   the loop is structured to detect zeros in aligned source words.
   This has, unfortunately, effectively pulled half of a loop
   iteration out into the head and half into the tail, but it does
   prevent nastiness from accumulating in the very thing we want
   to run as fast as possible.

   On entry to this basic block:
   t0 == the shifted high-order bits from the previous source word
   t2 == the unshifted current source word

   We further know that t2 does not contain a null terminator.  */

	.align 4
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E : dst advances; store below uses -8(a0)

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E : carry the low bits into the next pass

	cmpbge	zero, t2, t10	# E : test new word for eos
	beq	t10, $u_loop	# U : (stall)
	nop
	nop
/* We've found a zero somewhere in the source word we just read.
   If it resides in the lower half, we have one (probably partial)
   word to write out, and if it resides in the upper half, we
   have one full and one partial word left to write out.

   On entry to this basic block:
   t0 == the shifted high-order bits from the previous source word
   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t10	# E : is the null in this first bit? (stall)
	bne	t10, $u_final	# U : (stall)

	/* Also entered from $u_head with t1 == a complete output word and
	   t2 == the source word whose remaining bytes hold the null.  */
$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E : final word goes to the next dst word
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t10	# E : (stall)
	/* Falls through to the final-word store with t1 and t10 set.  */
/* Take care of a final (probably partial) result word.
   On entry to this basic block:
   t1 == assembled source word
   t10 == cmpbge mask that found the null
   a0 == address of the word to be written.

   On exit (return through t9):
   t8 == single-bit byte mask marking the terminator's position.  */
$u_final:
	negq	t10, t6		# E : isolate low bit set
	and	t6, t10, t8	# E : (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# E :
	subq	t8, 1, t6	# E : t6 = mask of the bytes below the null
	or	t6, t8, t10	# E : t10 = bytes up to and incl. null (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t10, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop
/* Unaligned copy entry point.
   Reached from __stxcpy when SRC and DST are not co-aligned.

   Hands over to $u_head with:
   t0 == the first dest word, or 0 when DST is word aligned
   t1 == the first source word
   t6 == bytemask, 0xff in the dest bytes that must be preserved
   a1 == SRC minus the destination misalignment.

   When the source misalignment exceeds the destination misalignment
   and the first source word already contains the terminator, control
   falls through to the code that follows this block instead.  */
	.align 4
$unaligned:
	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E : default: no dest word needed

	mov	zero, t6	# E : default: no bytes to preserve
	beq	t4, 1f		# U : DST aligned, keep the defaults
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U : 0xff only in bytes below the dst offset
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t8	# E :
	beq	t8, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t10	# E : is there a zero? (stall)
	beq	t10, $u_head	# U : (stall)
290 /* At this point we've found a zero in the first partial word of
291 the source. We need to isolate the valid source data and mask
292 it into the original destination data. (Incidentally, we know
293 that we'll need at least one byte of that original dest word.) */
295 ldq_u t0, 0(a0) # L :
296 negq t10, t6 # E : build bitmask of bytes <= zero
297 and t6, t10, t8 # E : (stall)
301 or t6, t8, t10 # E : (stall)
302 srl t8, t5, t8 # U : adjust final null return value
303 zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall)
305 and t1, t2, t1 # E : to source validity mask
306 extql t2, a1, t2 # U :
307 extql t1, a1, t1 # U : (stall)
308 andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall)
310 or t0, t1, t1 # e1 : and put it there
311 stq_u t1, 0(a0) # .. e0 : (stall)