]>
Commit | Line | Data |
---|---|---|
c65ebc55 JW |
1 | #ifdef L__divdf3 |
2 | // Compute a 64-bit IEEE double quotient, farg0 / farg1, returned in fret0. | |
3 | // | |
4 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
5 | // alternative. | |
6 | // | |
7 | // farg0 holds the dividend (a). farg1 holds the divisor (b). | |
8 | ||
9 | .text | |
10 | .align 16 | |
11 | .global __divdf3 | |
12 | .proc __divdf3 | |
13 | __divdf3: | |
14 | frcpa f10, p6 = farg0, farg1 | | // f10 = y0, the initial estimate of 1/b. Every refinement step below is predicated on p6.
15 | ;; | |
16 | (p6) fma.s1 f11 = farg0, f10, f0 | | // f11 = q0 = a * y0
17 | (p6) fnma.s1 f12 = farg1, f10, f1 | | // f12 = e0 = 1 - b * y0
18 | ;; | |
19 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q1 = q0 + e0 * q0
20 | (p6) fma.s1 f13 = f12, f12, f0 | | // f13 = e1 = e0 * e0
21 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y1 = y0 + e0 * y0
22 | ;; | |
23 | (p6) fma.s1 f11 = f13, f11, f11 | | // f11 = q2 = q1 + e1 * q1
24 | (p6) fma.s1 f12 = f13, f13, f0 | | // f12 = e2 = e1 * e1
25 | (p6) fma.s1 f10 = f13, f10, f10 | | // f10 = y2 = y1 + e1 * y1
26 | ;; | |
27 | (p6) fma.d.s1 f11 = f12, f11, f11 | | // f11 = q3 = q2 + e2 * q2, computed with the .d completer
28 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y3 = y2 + e2 * y2
29 | ;; | |
30 | (p6) fnma.d.s1 f8 = farg1, f11, farg0 | | // f8 = r = a - b * q3, the residual of the quotient estimate
31 | ;; | |
32 | (p6) fma.d f10 = f8, f10, f11 | | // f10 = q3 + r * y3, the final quotient
33 | ;; | |
34 | mov fret0 = f10 | | // Unpredicated: with p6 clear this returns what frcpa left in f10. NOTE(review): presumably the special-case result - confirm.
35 | br.ret.sptk rp | |
36 | ;; | |
37 | .endp __divdf3 | |
38 | #endif | |
39 | ||
40 | #ifdef L__divsf3 | |
41 | // Compute a 32-bit IEEE float quotient, farg0 / farg1, returned in fret0. | |
42 | // | |
43 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
44 | // alternative. | |
45 | // | |
46 | // farg0 holds the dividend (a). farg1 holds the divisor (b). | |
47 | ||
48 | .text | |
49 | .align 16 | |
50 | .global __divsf3 | |
51 | .proc __divsf3 | |
52 | __divsf3: | |
53 | frcpa f10, p6 = farg0, farg1 | | // f10 = y0, the initial estimate of 1/b. Every refinement step below is predicated on p6.
54 | ;; | |
55 | (p6) fma.s1 f8 = farg0, f10, f0 | | // f8 = q0 = a * y0
56 | (p6) fnma.s1 f9 = farg1, f10, f1 | | // f9 = e0 = 1 - b * y0
57 | ;; | |
58 | (p6) fma.s1 f8 = f9, f8, f8 | | // f8 = q1 = q0 + e0 * q0
59 | (p6) fma.s1 f9 = f9, f9, f0 | | // f9 = e1 = e0 * e0
60 | ;; | |
61 | (p6) fma.s1 f8 = f9, f8, f8 | | // f8 = q2 = q1 + e1 * q1
62 | (p6) fma.s1 f9 = f9, f9, f0 | | // f9 = e2 = e1 * e1
63 | ;; | |
64 | (p6) fma.d.s1 f8 = f9, f8, f8 | | // f8 = q3 = q2 + e2 * q2, computed with the .d completer
65 | ;; | |
66 | (p6) fma.s f10 = f8, f1, f0 | | // f10 = q3 * 1 + 0 with the .s completer, i.e. the quotient in single precision
67 | ;; | |
68 | mov fret0 = f10 | | // Unpredicated: with p6 clear this returns what frcpa left in f10. NOTE(review): presumably the special-case result - confirm.
69 | br.ret.sptk rp | |
70 | ;; | |
71 | .endp __divsf3 | |
72 | #endif | |
73 | ||
74 | #ifdef L__divdi3 | |
75 | // Compute a 64-bit signed integer quotient, in0 / in1, returned in ret0. | |
76 | // | |
77 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
78 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
79 | // to get more than the 64 bits of precision that we need for DImode. | |
80 | // | |
81 | // Must use max precision for the reciprocal computations to get 64 bits of | |
82 | // precision. | |
83 | // | |
84 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
85 | // f10 holds the reciprocal approximation. f11 holds the quotient | |
86 | // approximation. f12 and f13 hold the error terms. | |
87 | ||
88 | .text | |
89 | .align 16 | |
90 | .global __divdi3 | |
91 | .proc __divdi3 | |
92 | __divdi3: | |
93 | .regstk 2,0,0,0 | |
94 | // Transfer inputs to FP registers. | |
95 | setf.sig f8 = in0 | |
96 | setf.sig f9 = in1 | |
97 | ;; | |
98 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
99 | fcvt.xf f8 = f8 | |
100 | fcvt.xf f9 = f9 | |
101 | ;; | |
102 | // Compute the reciprocal approximation. | |
660a0ebd | 103 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
104 | ;; |
105 | // 3 Newton-Raphson iterations, all predicated on p6 from frcpa. | |
106 | (p6) fma.s1 f11 = farg0, f10, f0 | | // f11 = q0 = a * y0. NOTE(review): farg0/farg1 are used here for the values loaded into f8/f9 above - confirm they name the same registers.
107 | (p6) fnma.s1 f12 = farg1, f10, f1 | | // f12 = e0 = 1 - b * y0
108 | ;; | |
109 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q1 = q0 + e0 * q0
110 | (p6) fma.s1 f13 = f12, f12, f0 | | // f13 = e1 = e0 * e0
111 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y1 = y0 + e0 * y0
112 | ;; | |
113 | (p6) fma.s1 f11 = f13, f11, f11 | | // f11 = q2 = q1 + e1 * q1
114 | (p6) fma.s1 f12 = f13, f13, f0 | | // f12 = e2 = e1 * e1
115 | (p6) fma.s1 f10 = f13, f10, f10 | | // f10 = y2 = y1 + e1 * y1
116 | ;; | |
117 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q3 = q2 + e2 * q2
118 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y3 = y2 + e2 * y2
119 | ;; | |
120 | (p6) fnma.s1 f8 = f9, f11, f8 | | // f8 = r = a - b * q3 (the dividend in f8 is no longer needed)
121 | ;; | |
660a0ebd | 122 | (p6) fma.s1 f10 = f8, f10, f11 |
c65ebc55 JW |
123 | ;; |
124 | // Truncate quotient (f10 = q3 + r * y3) to an integer. | |
660a0ebd | 125 | fcvt.fx.trunc.s1 f8 = f10 |
c65ebc55 JW |
126 | ;; |
127 | // Transfer result to GP registers. | |
128 | getf.sig ret0 = f8 | |
129 | br.ret.sptk rp | |
130 | ;; | |
131 | .endp __divdi3 | |
132 | #endif | |
133 | ||
134 | #ifdef L__moddi3 | |
135 | // Compute a 64-bit signed integer modulus, in0 % in1, returned in ret0. | |
136 | // | |
137 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
138 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
139 | // to get more than the 64 bits of precision that we need for DImode. | |
140 | // | |
141 | // Must use max precision for the reciprocal computations to get 64 bits of | |
142 | // precision. | |
143 | // | |
144 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
145 | // f10 holds the reciprocal approximation, then the quotient. f11 holds the | |
146 | // quotient approximation. f12 and f13 hold the error terms. | |
147 | ||
148 | .text | |
149 | .align 16 | |
150 | .global __moddi3 | |
151 | .proc __moddi3 | |
152 | __moddi3: | |
153 | .regstk 2,0,0,0 | |
154 | // Transfer inputs to FP registers. | |
155 | setf.sig f8 = in0 | |
156 | setf.sig f9 = in1 | |
157 | ;; | |
158 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
159 | fcvt.xf f8 = f8 | |
160 | fcvt.xf f9 = f9 | |
161 | ;; | |
162 | // Compute the reciprocal approximation. | |
660a0ebd | 163 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
164 | ;; |
165 | // 3 Newton-Raphson iterations, all predicated on p6 from frcpa. | |
166 | (p6) fma.s1 f11 = farg0, f10, f0 | | // f11 = q0 = a * y0. NOTE(review): farg0/farg1 are used here for the values loaded into f8/f9 above - confirm they name the same registers.
167 | (p6) fnma.s1 f12 = farg1, f10, f1 | | // f12 = e0 = 1 - b * y0
168 | ;; | |
169 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q1 = q0 + e0 * q0
170 | (p6) fma.s1 f13 = f12, f12, f0 | | // f13 = e1 = e0 * e0
171 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y1 = y0 + e0 * y0
172 | ;; | |
173 | (p6) fma.s1 f11 = f13, f11, f11 | | // f11 = q2 = q1 + e1 * q1
174 | (p6) fma.s1 f12 = f13, f13, f0 | | // f12 = e2 = e1 * e1
175 | (p6) fma.s1 f10 = f13, f10, f10 | | // f10 = y2 = y1 + e1 * y1
176 | ;; | |
177 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q3 = q2 + e2 * q2
178 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y3 = y2 + e2 * y2
179 | ;; | |
180 | (p6) fnma.s1 f12 = f9, f11, f8 | | // f12 = r = a - b * q3 (f8 is kept intact for the remainder step below)
181 | ;; | |
660a0ebd | 182 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
183 | ;; |
184 | // Truncate quotient (f10 = q3 + r * y3) to an integer. | |
660a0ebd | 185 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 JW |
186 | ;; |
187 | // Renormalize: convert the integer quotient back to FP. | |
188 | fcvt.xf f10 = f10 | |
189 | ;; | |
190 | // Compute remainder: f8 = a - quotient * b. | |
660a0ebd | 191 | fnma.s1 f8 = f10, f9, f8 |
c65ebc55 JW |
192 | ;; |
193 | // Convert remainder to an integer (truncating). | |
660a0ebd | 194 | fcvt.fx.trunc.s1 f8 = f8 |
c65ebc55 JW |
195 | ;; |
196 | // Transfer result to GP registers. | |
197 | getf.sig ret0 = f8 | |
198 | br.ret.sptk rp | |
199 | ;; | |
200 | .endp __moddi3 | |
201 | #endif | |
202 | ||
203 | #ifdef L__udivdi3 | |
204 | // Compute a 64-bit unsigned integer quotient, in0 / in1, returned in ret0. | |
205 | // | |
206 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
207 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
208 | // to get more than the 64 bits of precision that we need for DImode. | |
209 | // | |
210 | // Must use max precision for the reciprocal computations to get 64 bits of | |
211 | // precision. | |
212 | // | |
213 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
214 | // f10 holds the reciprocal approximation. f11 holds the quotient | |
215 | // approximation. f12 and f13 hold the error terms. | |
216 | ||
217 | .text | |
218 | .align 16 | |
219 | .global __udivdi3 | |
220 | .proc __udivdi3 | |
221 | __udivdi3: | |
222 | .regstk 2,0,0,0 | |
223 | // Transfer inputs to FP registers. | |
224 | setf.sig f8 = in0 | |
225 | setf.sig f9 = in1 | |
226 | ;; | |
227 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
660a0ebd JW |
228 | fcvt.xuf.s1 f8 = f8 |
229 | fcvt.xuf.s1 f9 = f9 | |
c65ebc55 JW |
230 | ;; |
231 | // Compute the reciprocal approximation. | |
660a0ebd | 232 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
233 | ;; |
234 | // 3 Newton-Raphson iterations, all predicated on p6 from frcpa. | |
235 | (p6) fma.s1 f11 = farg0, f10, f0 | | // f11 = q0 = a * y0. NOTE(review): farg0/farg1 are used here for the values loaded into f8/f9 above - confirm they name the same registers.
236 | (p6) fnma.s1 f12 = farg1, f10, f1 | | // f12 = e0 = 1 - b * y0
237 | ;; | |
238 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q1 = q0 + e0 * q0
239 | (p6) fma.s1 f13 = f12, f12, f0 | | // f13 = e1 = e0 * e0
240 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y1 = y0 + e0 * y0
241 | ;; | |
242 | (p6) fma.s1 f11 = f13, f11, f11 | | // f11 = q2 = q1 + e1 * q1
243 | (p6) fma.s1 f12 = f13, f13, f0 | | // f12 = e2 = e1 * e1
244 | (p6) fma.s1 f10 = f13, f10, f10 | | // f10 = y2 = y1 + e1 * y1
245 | ;; | |
246 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q3 = q2 + e2 * q2
247 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y3 = y2 + e2 * y2
248 | ;; | |
249 | (p6) fnma.s1 f8 = f9, f11, f8 | | // f8 = r = a - b * q3 (the dividend in f8 is no longer needed)
250 | ;; | |
660a0ebd | 251 | (p6) fma.s1 f10 = f8, f10, f11 |
c65ebc55 JW |
252 | ;; |
253 | // Truncate quotient (f10 = q3 + r * y3) to an unsigned integer. | |
660a0ebd | 254 | fcvt.fxu.trunc.s1 f8 = f10 |
c65ebc55 JW |
255 | ;; |
256 | // Transfer result to GP registers. | |
257 | getf.sig ret0 = f8 | |
258 | br.ret.sptk rp | |
259 | ;; | |
260 | .endp __udivdi3 | |
261 | #endif | |
262 | ||
263 | #ifdef L__umoddi3 | |
264 | // Compute a 64-bit unsigned integer modulus, in0 % in1, returned in ret0. | |
265 | // | |
266 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
267 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
268 | // to get more than the 64 bits of precision that we need for DImode. | |
269 | // | |
270 | // Must use max precision for the reciprocal computations to get 64 bits of | |
271 | // precision. | |
272 | // | |
273 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
274 | // f10 holds the reciprocal approximation, then the quotient. f11 holds the | |
275 | // quotient approximation. f12 and f13 hold the error terms. | |
276 | ||
277 | .text | |
278 | .align 16 | |
279 | .global __umoddi3 | |
280 | .proc __umoddi3 | |
281 | __umoddi3: | |
282 | .regstk 2,0,0,0 | |
283 | // Transfer inputs to FP registers. | |
284 | setf.sig f8 = in0 | |
285 | setf.sig f9 = in1 | |
286 | ;; | |
287 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
660a0ebd JW |
288 | fcvt.xuf.s1 f8 = f8 |
289 | fcvt.xuf.s1 f9 = f9 | |
c65ebc55 JW |
290 | ;; |
291 | // Compute the reciprocal approximation. | |
660a0ebd | 292 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
293 | ;; |
294 | // 3 Newton-Raphson iterations, all predicated on p6 from frcpa. | |
295 | (p6) fma.s1 f11 = farg0, f10, f0 | | // f11 = q0 = a * y0. NOTE(review): farg0/farg1 are used here for the values loaded into f8/f9 above - confirm they name the same registers.
296 | (p6) fnma.s1 f12 = farg1, f10, f1 | | // f12 = e0 = 1 - b * y0
297 | ;; | |
298 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q1 = q0 + e0 * q0
299 | (p6) fma.s1 f13 = f12, f12, f0 | | // f13 = e1 = e0 * e0
300 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y1 = y0 + e0 * y0
301 | ;; | |
302 | (p6) fma.s1 f11 = f13, f11, f11 | | // f11 = q2 = q1 + e1 * q1
303 | (p6) fma.s1 f12 = f13, f13, f0 | | // f12 = e2 = e1 * e1
304 | (p6) fma.s1 f10 = f13, f10, f10 | | // f10 = y2 = y1 + e1 * y1
305 | ;; | |
306 | (p6) fma.s1 f11 = f12, f11, f11 | | // f11 = q3 = q2 + e2 * q2
307 | (p6) fma.s1 f10 = f12, f10, f10 | | // f10 = y3 = y2 + e2 * y2
308 | ;; | |
309 | (p6) fnma.s1 f12 = f9, f11, f8 | | // f12 = r = a - b * q3 (f8 is kept intact for the remainder step below)
310 | ;; | |
660a0ebd | 311 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
312 | ;; |
313 | // Truncate quotient (f10 = q3 + r * y3) to an unsigned integer. | |
660a0ebd | 314 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 JW |
315 | ;; |
316 | // Renormalize: convert the integer quotient back to FP. | |
660a0ebd | 317 | fcvt.xuf.s1 f10 = f10 |
c65ebc55 JW |
318 | ;; |
319 | // Compute remainder: f8 = a - quotient * b. | |
660a0ebd | 320 | fnma.s1 f8 = f10, f9, f8 |
c65ebc55 JW |
321 | ;; |
322 | // Convert remainder to an unsigned integer (truncating). | |
660a0ebd | 323 | fcvt.fxu.trunc.s1 f8 = f8 |
c65ebc55 JW |
324 | ;; |
325 | // Transfer result to GP registers. | |
326 | getf.sig ret0 = f8 | |
327 | br.ret.sptk rp | |
328 | ;; | |
329 | .endp __umoddi3 | |
330 | #endif | |
331 | ||
332 | #ifdef L__divsi3 | |
333 | // Compute a 32-bit signed integer quotient, in0 / in1, returned in ret0. | |
334 | // | |
335 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
336 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
337 | // to get more than the 32 bits of precision that we need for SImode. | |
338 | // | |
339 | // ??? This is currently not used. It needs to be fixed to be more like the | |
340 | // above DImode routines. | |
341 | // | |
342 | // ??? Check whether the error is less than .5ulp. We may need | |
343 | // some adjustment code to get precise enough results. | |
344 | // | |
345 | // ??? Should probably use max precision for the reciprocal computations. | |
346 | // | |
347 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
348 | // f10 holds the value 2.0. f11 holds the reciprocal approximation (y). | |
349 | // f12 is a temporary. | |
350 | ||
351 | .text | |
352 | .align 16 | |
353 | .global __divsi3 | |
354 | .proc __divsi3 | |
355 | __divsi3: | |
356 | .regstk 2,0,0,0 | |
357 | setf.sig f8 = in0 | | // move the integer inputs into FP registers
358 | setf.sig f9 = in1 | |
359 | ;; | |
360 | fcvt.xf f8 = f8 | | // convert both signed integers to FP
361 | fcvt.xf f9 = f9 | |
362 | ;; | |
363 | frcpa f11, p6 = f8, f9 | | // f11 = y0, the initial estimate of 1/b. p6 is written but never tested in this routine.
364 | fadd f10 = f1, f1 | | // f10 = 2.0
365 | ;; | |
366 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y0
367 | ;; | |
368 | fmpy f11 = f11, f12 | | // y1 = y0 * (2 - b * y0), first Newton-Raphson step
369 | ;; | |
370 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y1
371 | ;; | |
372 | fmpy f11 = f11, f12 | | // y2 = y1 * (2 - b * y1), second Newton-Raphson step
373 | ;; | |
374 | fmpy f8 = f8, f11 | | // f8 = a * y2, the quotient in FP
375 | ;; | |
376 | fcvt.fx.trunc f8 = f8 | | // truncate the quotient to a signed integer
377 | ;; | |
378 | getf.sig ret0 = f8 | | // move the result to the integer return register
379 | br.ret.sptk rp | |
380 | ;; | |
381 | .endp __divsi3 | |
382 | #endif | |
383 | ||
384 | #ifdef L__modsi3 | |
385 | // Compute a 32-bit signed integer modulus, r32 % r33, returned in ret0. | |
386 | // | |
387 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
388 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
389 | // to get more than the 32 bits of precision that we need for SImode. | |
390 | // | |
391 | // ??? This is currently not used. It needs to be fixed to be more like the | |
392 | // above DImode routines. | |
393 | // | |
394 | // ??? Check whether the error is less than .5ulp. We may need | |
395 | // some adjustment code to get precise enough results. | |
396 | // | |
397 | // ??? Should probably use max precision for the reciprocal computations. | |
398 | // | |
399 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
400 | // f10 holds the value 2.0, then the quotient. f11 holds the reciprocal | |
401 | // approximation (y). f12 is a temporary. | |
402 | ||
403 | .text | |
404 | .align 16 | |
405 | .global __modsi3 | |
406 | .proc __modsi3 | |
407 | __modsi3: | |
408 | .regstk 2,0,0,0 | |
409 | setf.sig f8 = r32 | | // move the integer inputs into FP registers
410 | setf.sig f9 = r33 | |
411 | ;; | |
412 | fcvt.xf f8 = f8 | | // convert both signed integers to FP
413 | fcvt.xf f9 = f9 | |
414 | ;; | |
415 | frcpa f11, p6 = f8, f9 | | // f11 = y0, the initial estimate of 1/b. p6 is written but never tested in this routine.
416 | fadd f10 = f1, f1 | | // f10 = 2.0
417 | ;; | |
418 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y0
419 | ;; | |
420 | fmpy f11 = f11, f12 | | // y1 = y0 * (2 - b * y0), first Newton-Raphson step
421 | ;; | |
422 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y1
423 | ;; | |
424 | fmpy f11 = f11, f12 | | // y2 = y1 * (2 - b * y1), second Newton-Raphson step
425 | ;; | |
426 | fmpy f10 = f8, f11 | | // f10 = a * y2, the quotient in FP (2.0 is no longer needed)
427 | ;; | |
428 | fcvt.fx.trunc f10 = f10 | | // truncate the quotient to a signed integer
429 | ;; | |
430 | fcvt.xf f10 = f10 | | // convert the integer quotient back to FP
431 | ;; | |
432 | fnma f8 = f10, f9, f8 | | // f8 = a - quotient * b, the remainder
433 | ;; | |
434 | fcvt.fx f8 = f8 | | // convert the remainder to a signed integer
435 | ;; | |
436 | getf.sig ret0 = f8 | | // Return the remainder in ret0, as __divsi3 and __moddi3 do. Writing it to the input register r32 never delivered it to the caller.
437 | br.ret.sptk rp | |
438 | ;; | |
439 | .endp __modsi3 | |
440 | #endif | |
441 | ||
442 | #ifdef L__udivsi3 | |
443 | // Compute a 32-bit unsigned integer quotient, r32 / r33, returned in ret0. | |
444 | // | |
445 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
446 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
447 | // to get more than the 32 bits of precision that we need for SImode. | |
448 | // | |
449 | // ??? This is currently not used. It needs to be fixed to be more like the | |
450 | // above DImode routines. | |
451 | // | |
452 | // ??? Check whether the error is less than .5ulp. We may need | |
453 | // some adjustment code to get precise enough results. | |
454 | // | |
455 | // ??? Should probably use max precision for the reciprocal computations. | |
456 | // | |
457 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
458 | // f10 holds the value 2.0. f11 holds the reciprocal approximation (y). | |
459 | // f12 is a temporary. | |
460 | // | |
461 | // This is the same as divsi3, except that we don't need fcvt instructions | |
462 | // before the frcpa. | |
463 | ||
464 | .text | |
465 | .align 16 | |
466 | .global __udivsi3 | |
467 | .proc __udivsi3 | |
468 | __udivsi3: | |
469 | .regstk 2,0,0,0 | |
470 | setf.sig f8 = r32 | | // move the integer inputs into FP registers
471 | setf.sig f9 = r33 | |
472 | ;; | |
473 | frcpa f11, p6 = f8, f9 | | // f11 = y0, the initial estimate of 1/b. p6 is never tested. NOTE(review): no fcvt.xuf on the operands here, unlike __udivdi3, which converts them to avoid FP software-assist faults.
474 | fadd f10 = f1, f1 | | // f10 = 2.0
475 | ;; | |
476 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y0
477 | ;; | |
478 | fmpy f11 = f11, f12 | | // y1 = y0 * (2 - b * y0), first Newton-Raphson step
479 | ;; | |
480 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y1
481 | ;; | |
482 | fmpy f11 = f11, f12 | | // y2 = y1 * (2 - b * y1), second Newton-Raphson step
483 | ;; | |
484 | fmpy f8 = f8, f11 | | // f8 = a * y2, the quotient in FP
485 | ;; | |
486 | fcvt.fxu.trunc f8 = f8 | | // truncate the quotient to an unsigned integer
487 | ;; | |
488 | getf.sig ret0 = f8 | | // move the result to the integer return register
489 | br.ret.sptk rp | |
490 | ;; | |
491 | .endp __udivsi3 | |
492 | #endif | |
493 | ||
494 | #ifdef L__umodsi3 | |
495 | // Compute a 32-bit unsigned integer modulus, r32 % r33, returned in ret0. | |
496 | // | |
497 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
498 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
499 | // to get more than the 32 bits of precision that we need for SImode. | |
500 | // | |
501 | // ??? This is currently not used. It needs to be fixed to be more like the | |
502 | // above DImode routines. | |
503 | // | |
504 | // ??? Check whether the error is less than .5ulp. We may need | |
505 | // some adjustment code to get precise enough results. | |
506 | // | |
507 | // ??? Should probably use max precision for the reciprocal computations. | |
508 | // | |
509 | // r32/f8 holds the dividend (a). r33/f9 holds the divisor (b). | |
510 | // f10 holds the value 2.0, then the quotient. f11 holds the reciprocal | |
511 | // approximation (y). f12 is a temporary. | |
512 | // | |
513 | // This is the same as modsi3, except that we don't need fcvt instructions | |
514 | // before the frcpa. | |
515 | ||
516 | .text | |
517 | .align 16 | |
518 | .global __umodsi3 | |
519 | .proc __umodsi3 | |
520 | __umodsi3: | |
521 | .regstk 2,0,0,0 | |
522 | setf.sig f8 = r32 | | // move the integer inputs into FP registers
523 | setf.sig f9 = r33 | |
524 | ;; | |
525 | frcpa f11, p6 = f8, f9 | | // f11 = y0, the initial estimate of 1/b. p6 is written but never tested in this routine.
526 | fadd f10 = f1, f1 | | // f10 = 2.0
527 | ;; | |
528 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y0
529 | ;; | |
530 | fmpy f11 = f11, f12 | | // y1 = y0 * (2 - b * y0), first Newton-Raphson step
531 | ;; | |
532 | fnma f12 = f9, f11, f10 | | // f12 = 2 - b * y1
533 | ;; | |
534 | fmpy f11 = f11, f12 | | // y2 = y1 * (2 - b * y1), second Newton-Raphson step
535 | ;; | |
536 | fmpy f10 = f8, f11 | | // f10 = a * y2, the quotient in FP (2.0 is no longer needed)
537 | ;; | |
538 | fcvt.fxu.trunc f10 = f10 | | // truncate the quotient to an unsigned integer
539 | ;; | |
540 | fcvt.xuf f10 = f10 | | // convert the integer quotient back to FP
541 | ;; | |
542 | fnma f8 = f10, f9, f8 | | // f8 = a - quotient * b, the remainder
543 | ;; | |
544 | fcvt.fxu f8 = f8 | | // convert the remainder to an unsigned integer
545 | ;; | |
546 | getf.sig ret0 = f8 | | // Return the remainder in ret0, as __udivsi3 and __umoddi3 do. Writing it to the input register r32 never delivered it to the caller.
547 | br.ret.sptk rp | |
548 | ;; | |
549 | .endp __umodsi3 | |
550 | #endif | |
551 | ||
552 | #ifdef L__save_stack_nonlocal | |
553 | // Notes on save/restore stack nonlocal: We read ar.bsp but write | |
554 | // ar.bspstore. This is because ar.bsp can be read at all times | |
555 | // (independent of the RSE mode) but since it's read-only we need to | |
556 | // restore the value via ar.bspstore. This is OK because | |
557 | // ar.bsp==ar.bspstore after executing "flushrs". Save area layout written below: +0 stack_pointer, +8 ar.bsp, +16 ar.rnat, +24 ar.pfs. | |
558 | ||
559 | // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) | |
560 | ||
561 | .text | |
562 | .align 16 | |
563 | .global __ia64_save_stack_nonlocal | |
564 | .proc __ia64_save_stack_nonlocal | |
565 | __ia64_save_stack_nonlocal: | |
97e242b0 RH |
566 | { .mmf |
567 | alloc r18 = ar.pfs, 2, 0, 0, 0 | | // r18 = ar.pfs, stored at save_area + 24 below
568 | mov r19 = ar.rsc | | // r19 = current ar.rsc
569 | ;; | |
570 | } | |
571 | { .mmi | |
572 | flushrs | |
573 | st8 [in0] = in1, 24 | | // save_area[0] = stack_pointer, then in0 += 24
574 | and r19 = 0x1c, r19 | | // keep only bits 2-4 of the ar.rsc value (the low two bits become 0)
575 | ;; | |
576 | } | |
577 | { .mmi | |
578 | st8 [in0] = r18, -16 | | // save_area[24] = ar.pfs value, then in0 -= 16 (now save_area + 8)
579 | mov ar.rsc = r19 | | // install the masked ar.rsc before ar.bsp and ar.rnat are read
580 | or r19 = 0x3, r19 | | // r19 = masked ar.rsc with the low two bits set, written back on exit
581 | ;; | |
582 | } | |
583 | { .mmi | |
584 | mov r16 = ar.bsp | |
585 | mov r17 = ar.rnat | |
586 | adds r2 = 8, in0 | | // r2 = save_area + 16
587 | ;; | |
588 | } | |
589 | { .mmi | |
590 | st8 [in0] = r16 | | // save_area[8] = ar.bsp
591 | st8 [r2] = r17 | | // save_area[16] = ar.rnat
592 | } | |
593 | { .mib | |
594 | mov ar.rsc = r19 | | // NOTE(review): the low two bits are forced to 1 rather than restored from the value read on entry - confirm intended
595 | br.ret.sptk.few rp | |
596 | ;; | |
597 | } | |
c65ebc55 JW |
598 | .endp __ia64_save_stack_nonlocal |
599 | #endif | |
600 | ||
601 | #ifdef L__nonlocal_goto | |
97e242b0 | 602 | // void __ia64_nonlocal_goto(void *target_label, void *save_area, |
c65ebc55 JW |
603 | // void *static_chain); |
604 | ||
605 | .text | |
606 | .align 16 | |
607 | .global __ia64_nonlocal_goto | |
608 | .proc __ia64_nonlocal_goto | |
609 | __ia64_nonlocal_goto: | |
97e242b0 RH |
610 | { .mmi |
611 | alloc r20 = ar.pfs, 3, 0, 0, 0 | | // declares the three inputs; r20 is not read again in this routine
612 | ld8 r12 = [in1], 8 | | // r12 = save_area[0], then in1 += 8. NOTE(review): presumably the saved stack pointer - confirm against the save routine.
613 | mov.ret.sptk rp = in0, .L0 | | // rp = target_label; .L0 tags the bundle containing the br.ret that uses rp
614 | ;; | |
615 | } | |
616 | { .mmf | |
617 | ld8 r16 = [in1], 8 | | // r16 = save_area[8], written to ar.bspstore below
618 | mov r19 = ar.rsc | | // r19 = current ar.rsc
619 | ;; | |
620 | } | |
621 | { .mmi | |
622 | flushrs | |
623 | ld8 r17 = [in1], 8 | | // r17 = save_area[16], written to ar.rnat below
624 | and r19 = 0x1c, r19 | | // keep only bits 2-4 of the ar.rsc value (the low two bits become 0)
625 | ;; | |
626 | } | |
627 | { .mmi | |
628 | ld8 r18 = [in1] | | // r18 = save_area[24], written to ar.pfs below
629 | mov ar.rsc = r19 | | // install the masked ar.rsc before ar.bspstore and ar.rnat are written
630 | or r19 = 0x3, r19 | | // r19 = masked ar.rsc with the low two bits set, written back at .L0
631 | ;; | |
632 | } | |
633 | { .mmi | |
634 | mov ar.bspstore = r16 | |
635 | ;; | |
636 | mov ar.rnat = r17 | |
637 | ;; | |
638 | } | |
639 | { .mmi | |
640 | loadrs | |
641 | invala | |
642 | mov r15 = in2 | | // r15 = static_chain, left in r15 for the code at target_label
643 | ;; | |
644 | } | |
645 | .L0: { .mib | |
646 | mov ar.rsc = r19 | |
647 | mov ar.pfs = r18 | |
648 | br.ret.sptk.few rp | | // branches to target_label, since rp was overwritten above
649 | ;; | |
c65ebc55 | 650 | } |
c65ebc55 JW |
651 | .endp __ia64_nonlocal_goto |
652 | #endif | |
9525c690 JW |
653 | |
654 | #ifdef L__restore_stack_nonlocal | |
655 | // This is mostly the same as nonlocal_goto above, without the target label and static chain: it reloads r12, ar.bspstore, ar.rnat and ar.pfs from save_area and returns through the unchanged rp. | |
656 | // ??? This has not been tested yet. | |
657 | ||
658 | // void __ia64_restore_stack_nonlocal(void *save_area) | |
659 | ||
660 | .text | |
661 | .align 16 | |
662 | .global __ia64_restore_stack_nonlocal | |
663 | .proc __ia64_restore_stack_nonlocal | |
664 | __ia64_restore_stack_nonlocal: | |
97e242b0 RH |
665 | { .mmf |
666 | alloc r20 = ar.pfs, 4, 0, 0, 0 | | // NOTE(review): declares 4 inputs although the prototype above has one argument; r20 is not read again
667 | ld8 r12 = [in0], 8 | | // r12 = save_area[0], then in0 += 8
668 | ;; | |
669 | } | |
670 | { .mmb | |
671 | ld8 r16=[in0], 8 | | // r16 = save_area[8], written to ar.bspstore below
672 | mov r19 = ar.rsc | | // r19 = current ar.rsc
673 | ;; | |
674 | } | |
675 | { .mmi | |
676 | flushrs | |
677 | ld8 r17 = [in0], 8 | | // r17 = save_area[16], written to ar.rnat below
678 | and r19 = 0x1c, r19 | | // keep only bits 2-4 of the ar.rsc value (the low two bits become 0)
679 | ;; | |
680 | } | |
681 | { .mmf | |
682 | ld8 r18 = [in0] | | // r18 = save_area[24], written to ar.pfs below
683 | mov ar.rsc = r19 | | // install the masked ar.rsc before ar.bspstore and ar.rnat are written
684 | ;; | |
685 | } | |
686 | { .mmi | |
687 | mov ar.bspstore = r16 | |
688 | ;; | |
689 | mov ar.rnat = r17 | |
690 | or r19 = 0x3, r19 | | // r19 = masked ar.rsc with the low two bits set, written back at .L0
691 | ;; | |
692 | } | |
693 | { .mmf | |
694 | loadrs | |
695 | invala | |
696 | ;; | |
697 | } | |
698 | .L0: { .mib | | // .L0 is not referenced anywhere in this routine
699 | mov ar.rsc = r19 | |
700 | mov ar.pfs = r18 | |
701 | br.ret.sptk.few rp | |
702 | ;; | |
9525c690 | 703 | } |
9525c690 JW |
704 | .endp __ia64_restore_stack_nonlocal |
705 | #endif | |
97e242b0 RH |
706 | |
707 | #ifdef L__trampoline | |
708 | // Implement the nested function trampoline. This is out of line | |
709 | // so that we don't have to bother with flushing the icache, as | |
710 | // well as making the on-stack trampoline smaller. | |
711 | // | |
712 | // The trampoline has the following form: | |
713 | // | |
714 | // +-------------------+ \ | |
715 | // TRAMP: | __ia64_trampoline | | | |
716 | // +-------------------+ > fake function descriptor | |
717 | // | TRAMP+16 | | | |
718 | // +-------------------+ / | |
719 | // | target descriptor | | |
720 | // +-------------------+ | |
721 | // | static link | | |
722 | // +-------------------+ | |
723 | ||
724 | .text | |
725 | .align 16 | |
726 | .global __ia64_trampoline | |
727 | .proc __ia64_trampoline | |
728 | __ia64_trampoline: | |
729 | { .mmi | |
730 | ld8 r2 = [r1], 8 | | // r2 = word at [r1], then r1 += 8. NOTE(review): assumes r1 arrives as TRAMP+16 from the fake descriptor, making this the target descriptor pointer.
731 | ;; | |
732 | ld8 r15 = [r1] | | // r15 = next word, the static link in the diagram above
733 | } | |
734 | { .mmi | |
735 | ld8 r3 = [r2], 8 | | // r3 = first word of the target descriptor, then r2 += 8
736 | ;; | |
737 | ld8 r1 = [r2] | | // r1 = second word of the target descriptor
738 | mov b6 = r3 | | // b6 = branch target taken from the descriptor's first word
739 | } | |
740 | { .bbb | |
741 | br.sptk.many b6 | | // branch (not call) to the target, with r1 and r15 set up for it
742 | ;; | |
743 | } | |
744 | .endp __ia64_trampoline | |
745 | #endif