/* Copyright (C) 2000-2014 Free Software Foundation, Inc.
   Contributed by Richard Henderson (rth@tamu.edu)
   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Copy a null-terminated string from SRC to DST.

   This is an internal routine used by strcpy, stpcpy, and strcat.
   As such, it uses special linkage conventions to make implementation
   of these public functions more efficient.

   On input:
	t9 = return address
	a0 = DST
	a1 = SRC

   On output:
	t8 = bitmask (with one bit set) indicating the last byte written
	a0 = unaligned address of the last *word* written

   Furthermore, v0, a3-a5, t11, and t12 are untouched.
*/
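/* In other words, on return the byte that actually holds the terminating
   null lives at (a0 & ~7) + N, where N is the index of the single bit set
   in t8; a caller such as stpcpy, which must return a pointer to that
   terminator, can recover its address this way.  */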


#include <sysdep.h>

	.arch ev6
	.set noat
	.set noreorder

	.text
	.type	__stxcpy, @function
	.globl	__stxcpy
	.usepv	__stxcpy, no

	cfi_startproc
	cfi_return_column (t9)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */
	.align 4
stxcpy_aligned:
	/* Create the 1st output word and detect 0's in the 1st input word.  */
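	/* cmpbge zero, x, y sets bit i of y iff byte i of x is zero (an
	   unsigned byte compares >= zero only when it is zero), so a
	   nonzero result means x holds the terminating null.  The
	   lda/mskqh/ornot sequence first builds, in t2, a copy of the
	   source word whose bytes below the alignment offset -- unrelated
	   memory picked up by the unaligned load -- are forced to 0xff,
	   so the cmpbge cannot report a false null there.  */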
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U : detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t10	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t10, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t10	# E : (3 cycle stall)
	beq	t10, $a_loop	# U : (stall for t10)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t10 == the cmpbge mask that found it.  */
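	/* The negq/and pair below computes t10 & -t10, isolating the
	   lowest set bit, i.e. the byte lane of the first null.  If that
	   lane is byte 7 (the 0x80 bit), the null is the last byte of the
	   quadword and the old destination word need not be read at all.  */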
$a_eos:
	negq	t10, t6		# E : find low bit set
	and	t10, t6, t8	# E : (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t8, 0x80, t6	# E : (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
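	/* With t8 marking the null's byte lane, t8-1 covers the lanes
	   below it and t8|(t8-1) the lanes up to and including it:
	   zapnot keeps only the source bytes below the null (everything
	   from the null's lane upward is cleared), zap clears the lanes
	   up to and including the null in the old destination word, and
	   the final OR merges the two, leaving a zero in the null's lane
	   and the original destination bytes above it.  */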
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t8, 1, t6	# E :
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t8, t6, t10	# E : (stall)

	zap	t0, t10, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.align 4
__stxcpy:
	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	and	a0, 7, t0	# E : take care not to load a word ...
	addq	a1, 8, a1	# E :
	beq	t0, stxcpy_aligned # U : ... if we won't need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */
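	/* extql shifts a source quadword right by the byte offset now in
	   a1, extqh shifts the following quadword left by the complementary
	   amount; OR-ing the two yields eight consecutive string bytes.
	   Because a1 already had the destination misalignment subtracted
	   on entry, that extraction lands the bytes on the destination's
	   byte lanes.  */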

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t10	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t10, $u_final	# U : (stall)

	mskql	t6, a1, t6	# U : mask out the bits we have
	or	t6, t2, t2	# E : already extracted before (stall)
	cmpbge	zero, t2, t10	# E : testing eos (stall)
	bne	t10, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# U : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t10	# E : (stall for t2)
	nop			# E :
	bne	t10, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */
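	/* Each iteration stores one aligned destination word assembled
	   (via the OR) from the bits carried over in t0 and the bits
	   newly extracted into t1, while the ldq_u for the next source
	   word is issued well ahead of its cmpbge/branch to hide the
	   load latency.  */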

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t10	# E : test new word for eos
	beq	t10, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t10	# E : is the null in this first bit? (stall)
	bne	t10, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t10	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t10 == cmpbge mask that found the null.  */
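	/* Same byte-lane merge as the aligned tail above: isolate the
	   null's lane, keep the source bytes below it, and preserve the
	   original destination bytes above it.  */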
$u_final:
	negq	t10, t6		# E : isolate low bit set
	and	t6, t10, t8	# E : (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# E :
	subq	t8, 1, t6	# E :
	or	t6, t8, t10	# E : (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t10, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
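	/* If the destination is already quadword aligned (t4 == 0), t0
	   and t6 stay zero; otherwise t0 gets the old destination word
	   and t6 gets 0xff in every byte lane below the destination
	   offset, marking bytes that precede DST in memory and must be
	   preserved.  */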
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
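	/* With t5 > t4, the biased a1 still points into the quadword we
	   have already loaded, so $u_head's ldq_u at 8(a1) would touch a
	   fresh quadword that may lie beyond the terminator and off the
	   end of a mapped page; check the bytes we already have for a
	   null before risking that load.  */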
	cmplt	t4, t5, t8	# E :
	beq	t8, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t10	# E : is there a zero? (stall)
	beq	t10, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.)  */
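	/* The extql extractions below shift the stored data down by the
	   recomputed source offset in t5, so the null's byte lane moves
	   down by the same amount; the srl applies the matching shift to
	   t8 so the returned bitmask still marks the lane that actually
	   receives the terminator.  */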

	ldq_u	t0, 0(a0)	# L :
	negq	t10, t6		# E : build bitmask of bytes <= zero
	and	t6, t10, t8	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t8, 1, t6	# E :
	or	t6, t8, t10	# E : (stall)
	srl	t8, t5, t8	# U : adjust final null return value
	zapnot	t2, t10, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# E : zero place for source to reside (stall)

	or	t0, t1, t1	# E : and put it there
	stq_u	t1, 0(a0)	# L : (stall)
	ret	(t9)		# L0 :

	cfi_endproc