]>
Commit | Line | Data |
---|---|---|
c65ebc55 JW |
1 | #ifdef L__divdf3 |
2 | // Compute a 64-bit IEEE double quotient: fret0 = farg0 / farg1. | |
3 | // frcpa writes f10 and predicate p6. Every later arithmetic step is predicated on p6, so when p6 is clear the f10 written by frcpa is returned unchanged. | |
4 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
5 | // alternative. | |
6 | // When p6 is set: f12 = 1 - farg1*f10 is the error term (squared into f13, then squared again into f12), f11 is the quotient estimate (starting at farg0*f10), f10 is the reciprocal estimate, and f8 = farg0 - farg1*f11 is the remainder folded in by the final f10 = f8*f10 + f11. | |
7 | // farg0 holds the dividend. farg1 holds the divisor. The result is copied from f10 to fret0. Registers written: f8, f10, f11, f12, f13, p6. | |
8 | ||
9 | .text | |
10 | .align 16 | |
11 | .global __divdf3 | |
12 | .proc __divdf3 | |
13 | __divdf3: | |
14 | frcpa f10, p6 = farg0, farg1 | |
15 | ;; | |
16 | (p6) fma.s1 f11 = farg0, f10, f0 | |
17 | (p6) fnma.s1 f12 = farg1, f10, f1 | |
18 | ;; | |
19 | (p6) fma.s1 f11 = f12, f11, f11 | |
20 | (p6) fma.s1 f13 = f12, f12, f0 | |
21 | (p6) fma.s1 f10 = f12, f10, f10 | |
22 | ;; | |
23 | (p6) fma.s1 f11 = f13, f11, f11 | |
24 | (p6) fma.s1 f12 = f13, f13, f0 | |
25 | (p6) fma.s1 f10 = f13, f10, f10 | |
26 | ;; | |
27 | (p6) fma.d.s1 f11 = f12, f11, f11 | |
28 | (p6) fma.s1 f10 = f12, f10, f10 | |
29 | ;; | |
30 | (p6) fnma.d.s1 f8 = farg1, f11, farg0 | |
31 | ;; | |
32 | (p6) fma.d f10 = f8, f10, f11 | |
33 | ;; | |
34 | mov fret0 = f10 | |
35 | br.ret.sptk rp | |
36 | ;; | |
37 | .endp __divdf3 | |
38 | #endif | |
39 | ||
40 | #ifdef L__divsf3 | |
41 | // Compute a 32-bit IEEE float quotient: fret0 = farg0 / farg1. | |
42 | // frcpa writes f10 and predicate p6. Every later arithmetic step is predicated on p6, so when p6 is clear the f10 written by frcpa is returned unchanged. | |
43 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
44 | // alternative. | |
45 | // When p6 is set: f8 accumulates the quotient estimate (starting at farg0*f10) and f9 holds the error term (1 - farg1*f10, squared each round); the final fma.s computes f8*f1 + f0 into f10 using the .s (single) completer. | |
46 | // farg0 holds the dividend. farg1 holds the divisor. The result is copied from f10 to fret0. Registers written: f8, f9, f10, p6. | |
47 | ||
48 | .text | |
49 | .align 16 | |
50 | .global __divsf3 | |
51 | .proc __divsf3 | |
52 | __divsf3: | |
53 | frcpa f10, p6 = farg0, farg1 | |
54 | ;; | |
55 | (p6) fma.s1 f8 = farg0, f10, f0 | |
56 | (p6) fnma.s1 f9 = farg1, f10, f1 | |
57 | ;; | |
58 | (p6) fma.s1 f8 = f9, f8, f8 | |
59 | (p6) fma.s1 f9 = f9, f9, f0 | |
60 | ;; | |
61 | (p6) fma.s1 f8 = f9, f8, f8 | |
62 | (p6) fma.s1 f9 = f9, f9, f0 | |
63 | ;; | |
64 | (p6) fma.d.s1 f8 = f9, f8, f8 | |
65 | ;; | |
66 | (p6) fma.s f10 = f8, f1, f0 | |
67 | ;; | |
68 | mov fret0 = f10 | |
69 | br.ret.sptk rp | |
70 | ;; | |
71 | .endp __divsf3 | |
72 | #endif | |
73 | ||
74 | #ifdef L__divdi3 | |
75 | // Compute a 64-bit integer quotient: ret0 = in0 / in1 (signed conversions, truncated). | |
76 | // | |
77 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
78 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
79 | // to get more than the 64 bits of precision that we need for DImode. | |
80 | // | |
81 | // Must use max precision for the reciprocal computations to get 64 bits of | |
82 | // precision. | |
83 | // | |
84 | // r32/f8 holds the dividend. r33/f9 holds the divisor (farg0/farg1 in the first refinement step are the assembler names for f8/f9). | |
85 | // f10 holds the reciprocal approximation. f11 holds the quotient approximation. | |
86 | // f12 and f13 hold the error term 1 - f9*f10 and its powers; f8 is reused for the remainder f8 - f9*f11. The refinement is predicated on p6 from frcpa; when p6 is clear, the f10 written by frcpa is converted as-is. | |
87 | ||
88 | .text | |
89 | .align 16 | |
90 | .global __divdi3 | |
91 | .proc __divdi3 | |
92 | __divdi3: | |
93 | .regstk 2,0,0,0 | |
94 | // Transfer inputs to FP registers. | |
95 | setf.sig f8 = in0 | |
96 | setf.sig f9 = in1 | |
97 | ;; | |
98 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
99 | fcvt.xf f8 = f8 | |
100 | fcvt.xf f9 = f9 | |
101 | ;; | |
102 | // Compute the reciprocal approximation (f10) and the predicate p6 that guards the refinement. | |
103 | frcpa f10, p6 = f8, f9 | |
104 | ;; | |
105 | // 3 Newton-Raphson iterations, then one remainder-based correction of the quotient. | |
106 | (p6) fma.s1 f11 = farg0, f10, f0 | |
107 | (p6) fnma.s1 f12 = farg1, f10, f1 | |
108 | ;; | |
109 | (p6) fma.s1 f11 = f12, f11, f11 | |
110 | (p6) fma.s1 f13 = f12, f12, f0 | |
111 | (p6) fma.s1 f10 = f12, f10, f10 | |
112 | ;; | |
113 | (p6) fma.s1 f11 = f13, f11, f11 | |
114 | (p6) fma.s1 f12 = f13, f13, f0 | |
115 | (p6) fma.s1 f10 = f13, f10, f10 | |
116 | ;; | |
117 | (p6) fma.s1 f11 = f12, f11, f11 | |
118 | (p6) fma.s1 f10 = f12, f10, f10 | |
119 | ;; | |
120 | (p6) fnma.s1 f8 = f9, f11, f8 | |
121 | ;; | |
122 | (p6) fma f10 = f8, f10, f11 | |
123 | ;; | |
124 | // Truncate the quotient to a signed integer. | |
125 | fcvt.fx.trunc f8 = f10 | |
126 | ;; | |
127 | // Transfer result to GP registers. | |
128 | getf.sig ret0 = f8 | |
129 | br.ret.sptk rp | |
130 | ;; | |
131 | .endp __divdi3 | |
132 | #endif | |
133 | ||
134 | #ifdef L__moddi3 | |
135 | // Compute a 64-bit integer modulus: ret0 = in0 - trunc(in0 / in1) * in1 (signed conversions). | |
136 | // | |
137 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
138 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
139 | // to get more than the 64 bits of precision that we need for DImode. | |
140 | // | |
141 | // Must use max precision for the reciprocal computations to get 64 bits of | |
142 | // precision. | |
143 | // | |
144 | // r32/f8 holds the dividend. r33/f9 holds the divisor (farg0/farg1 in the first refinement step are the assembler names for f8/f9). | |
145 | // f10 holds the reciprocal approximation and later the quotient. f11 holds the quotient approximation. | |
146 | // f12 and f13 hold the error term 1 - f9*f10 and its powers; f12 is then reused for f8 - f9*f11 so that f8 keeps the dividend. The refinement is predicated on p6 from frcpa; when p6 is clear, the f10 written by frcpa is used as-is. | |
147 | ||
148 | .text | |
149 | .align 16 | |
150 | .global __moddi3 | |
151 | .proc __moddi3 | |
152 | __moddi3: | |
153 | .regstk 2,0,0,0 | |
154 | // Transfer inputs to FP registers. | |
155 | setf.sig f8 = in0 | |
156 | setf.sig f9 = in1 | |
157 | ;; | |
158 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
159 | fcvt.xf f8 = f8 | |
160 | fcvt.xf f9 = f9 | |
161 | ;; | |
162 | // Compute the reciprocal approximation (f10) and the predicate p6 that guards the refinement. | |
163 | frcpa f10, p6 = f8, f9 | |
164 | ;; | |
165 | // 3 Newton-Raphson iterations, then one remainder-based correction of the quotient. | |
166 | (p6) fma.s1 f11 = farg0, f10, f0 | |
167 | (p6) fnma.s1 f12 = farg1, f10, f1 | |
168 | ;; | |
169 | (p6) fma.s1 f11 = f12, f11, f11 | |
170 | (p6) fma.s1 f13 = f12, f12, f0 | |
171 | (p6) fma.s1 f10 = f12, f10, f10 | |
172 | ;; | |
173 | (p6) fma.s1 f11 = f13, f11, f11 | |
174 | (p6) fma.s1 f12 = f13, f13, f0 | |
175 | (p6) fma.s1 f10 = f13, f10, f10 | |
176 | ;; | |
177 | (p6) fma.s1 f11 = f12, f11, f11 | |
178 | (p6) fma.s1 f10 = f12, f10, f10 | |
179 | ;; | |
180 | (p6) fnma.s1 f12 = f9, f11, f8 | |
181 | ;; | |
182 | (p6) fma f10 = f12, f10, f11 | |
183 | ;; | |
184 | // Truncate the quotient to a signed integer. | |
185 | fcvt.fx.trunc f10 = f10 | |
186 | ;; | |
187 | // Renormalize: convert the integer quotient back to FP. | |
188 | fcvt.xf f10 = f10 | |
189 | ;; | |
190 | // Compute remainder: f8 = dividend - quotient * divisor. | |
191 | fnma f8 = f10, f9, f8 | |
192 | ;; | |
193 | // Convert the remainder to a signed integer. | |
194 | fcvt.fx.trunc f8 = f8 | |
195 | ;; | |
196 | // Transfer result to GP registers. | |
197 | getf.sig ret0 = f8 | |
198 | br.ret.sptk rp | |
199 | ;; | |
200 | .endp __moddi3 | |
201 | #endif | |
202 | ||
203 | #ifdef L__udivdi3 | |
204 | // Compute a 64-bit unsigned integer quotient: ret0 = in0 / in1 (unsigned conversions, truncated). | |
205 | // | |
206 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
207 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
208 | // to get more than the 64 bits of precision that we need for DImode. | |
209 | // | |
210 | // Must use max precision for the reciprocal computations to get 64 bits of | |
211 | // precision. | |
212 | // | |
213 | // r32/f8 holds the dividend. r33/f9 holds the divisor (farg0/farg1 in the first refinement step are the assembler names for f8/f9). | |
214 | // f10 holds the reciprocal approximation. f11 holds the quotient approximation. | |
215 | // f12 and f13 hold the error term 1 - f9*f10 and its powers; f8 is reused for the remainder f8 - f9*f11. The refinement is predicated on p6 from frcpa; when p6 is clear, the f10 written by frcpa is converted as-is. | |
216 | ||
217 | .text | |
218 | .align 16 | |
219 | .global __udivdi3 | |
220 | .proc __udivdi3 | |
221 | __udivdi3: | |
222 | .regstk 2,0,0,0 | |
223 | // Transfer inputs to FP registers. | |
224 | setf.sig f8 = in0 | |
225 | setf.sig f9 = in1 | |
226 | ;; | |
227 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
228 | fcvt.xuf f8 = f8 | |
229 | fcvt.xuf f9 = f9 | |
230 | ;; | |
231 | // Compute the reciprocal approximation (f10) and the predicate p6 that guards the refinement. | |
232 | frcpa f10, p6 = f8, f9 | |
233 | ;; | |
234 | // 3 Newton-Raphson iterations, then one remainder-based correction of the quotient. | |
235 | (p6) fma.s1 f11 = farg0, f10, f0 | |
236 | (p6) fnma.s1 f12 = farg1, f10, f1 | |
237 | ;; | |
238 | (p6) fma.s1 f11 = f12, f11, f11 | |
239 | (p6) fma.s1 f13 = f12, f12, f0 | |
240 | (p6) fma.s1 f10 = f12, f10, f10 | |
241 | ;; | |
242 | (p6) fma.s1 f11 = f13, f11, f11 | |
243 | (p6) fma.s1 f12 = f13, f13, f0 | |
244 | (p6) fma.s1 f10 = f13, f10, f10 | |
245 | ;; | |
246 | (p6) fma.s1 f11 = f12, f11, f11 | |
247 | (p6) fma.s1 f10 = f12, f10, f10 | |
248 | ;; | |
249 | (p6) fnma.s1 f8 = f9, f11, f8 | |
250 | ;; | |
251 | (p6) fma f10 = f8, f10, f11 | |
252 | ;; | |
253 | // Truncate the quotient to an unsigned integer. | |
254 | fcvt.fxu.trunc f8 = f10 | |
255 | ;; | |
256 | // Transfer result to GP registers. | |
257 | getf.sig ret0 = f8 | |
258 | br.ret.sptk rp | |
259 | ;; | |
260 | .endp __udivdi3 | |
261 | #endif | |
262 | ||
263 | #ifdef L__umoddi3 | |
264 | // Compute a 64-bit unsigned integer modulus: ret0 = in0 - trunc(in0 / in1) * in1 (unsigned conversions). | |
265 | // | |
266 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
267 | // quotient. frcpa gives 8.6 significant bits, so we need 3 iterations | |
268 | // to get more than the 64 bits of precision that we need for DImode. | |
269 | // | |
270 | // Must use max precision for the reciprocal computations to get 64 bits of | |
271 | // precision. | |
272 | // | |
273 | // r32/f8 holds the dividend. r33/f9 holds the divisor (farg0/farg1 in the first refinement step are the assembler names for f8/f9). | |
274 | // f10 holds the reciprocal approximation and later the quotient. f11 holds the quotient approximation. | |
275 | // f12 and f13 hold the error term 1 - f9*f10 and its powers; f12 is then reused for f8 - f9*f11 so that f8 keeps the dividend. The refinement is predicated on p6 from frcpa; when p6 is clear, the f10 written by frcpa is used as-is. | |
276 | ||
277 | .text | |
278 | .align 16 | |
279 | .global __umoddi3 | |
280 | .proc __umoddi3 | |
281 | __umoddi3: | |
282 | .regstk 2,0,0,0 | |
283 | // Transfer inputs to FP registers. | |
284 | setf.sig f8 = in0 | |
285 | setf.sig f9 = in1 | |
286 | ;; | |
287 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
288 | fcvt.xuf f8 = f8 | |
289 | fcvt.xuf f9 = f9 | |
290 | ;; | |
291 | // Compute the reciprocal approximation (f10) and the predicate p6 that guards the refinement. | |
292 | frcpa f10, p6 = f8, f9 | |
293 | ;; | |
294 | // 3 Newton-Raphson iterations, then one remainder-based correction of the quotient. | |
295 | (p6) fma.s1 f11 = farg0, f10, f0 | |
296 | (p6) fnma.s1 f12 = farg1, f10, f1 | |
297 | ;; | |
298 | (p6) fma.s1 f11 = f12, f11, f11 | |
299 | (p6) fma.s1 f13 = f12, f12, f0 | |
300 | (p6) fma.s1 f10 = f12, f10, f10 | |
301 | ;; | |
302 | (p6) fma.s1 f11 = f13, f11, f11 | |
303 | (p6) fma.s1 f12 = f13, f13, f0 | |
304 | (p6) fma.s1 f10 = f13, f10, f10 | |
305 | ;; | |
306 | (p6) fma.s1 f11 = f12, f11, f11 | |
307 | (p6) fma.s1 f10 = f12, f10, f10 | |
308 | ;; | |
309 | (p6) fnma.s1 f12 = f9, f11, f8 | |
310 | ;; | |
311 | (p6) fma f10 = f12, f10, f11 | |
312 | ;; | |
313 | // Truncate the quotient to an unsigned integer. | |
314 | fcvt.fxu.trunc f10 = f10 | |
315 | ;; | |
316 | // Renormalize: convert the integer quotient back to FP. | |
317 | fcvt.xuf f10 = f10 | |
318 | ;; | |
319 | // Compute remainder: f8 = dividend - quotient * divisor. | |
320 | fnma f8 = f10, f9, f8 | |
321 | ;; | |
322 | // Convert the remainder to an unsigned integer. | |
323 | fcvt.fxu.trunc f8 = f8 | |
324 | ;; | |
325 | // Transfer result to GP registers. | |
326 | getf.sig ret0 = f8 | |
327 | br.ret.sptk rp | |
328 | ;; | |
329 | .endp __umoddi3 | |
330 | #endif | |
331 | ||
332 | #ifdef L__divsi3 | |
333 | // Compute a 32-bit integer quotient: ret0 = in0 / in1 (signed conversions, truncated). | |
334 | // | |
335 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
336 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
337 | // to get more than the 32 bits of precision that we need for SImode. | |
338 | // | |
339 | // ??? This is currently not used. It needs to be fixed to be more like the | |
340 | // above DImode routines. | |
341 | // | |
342 | // ??? Check to see if the error is less than .5ulp. We may need | |
343 | // some adjustment code to get precise enough results. | |
344 | // | |
345 | // ??? Should probably use max precision for the reciprocal computations. | |
346 | // Note: unlike the DImode routines, nothing here is predicated on the p6 written by frcpa. | |
347 | // r32/f8 holds the dividend. r33/f9 holds the divisor. | |
348 | // f10 holds the value 2.0. f11 holds the reciprocal approximation. | |
349 | // f12 is a temporary holding 2.0 - f9*f11, the factor each iteration multiplies into f11. | |
350 | ||
351 | .text | |
352 | .align 16 | |
353 | .global __divsi3 | |
354 | .proc __divsi3 | |
355 | __divsi3: | |
356 | .regstk 2,0,0,0 | |
357 | setf.sig f8 = in0 | |
358 | setf.sig f9 = in1 | |
359 | ;; | |
360 | fcvt.xf f8 = f8 | |
361 | fcvt.xf f9 = f9 | |
362 | ;; | |
363 | frcpa f11, p6 = f8, f9 | |
364 | fadd f10 = f1, f1 | |
365 | ;; | |
366 | fnma f12 = f9, f11, f10 | |
367 | ;; | |
368 | fmpy f11 = f11, f12 | |
369 | ;; | |
370 | fnma f12 = f9, f11, f10 | |
371 | ;; | |
372 | fmpy f11 = f11, f12 | |
373 | ;; | |
374 | fmpy f8 = f8, f11 | |
375 | ;; | |
376 | fcvt.fx.trunc f8 = f8 | |
377 | ;; | |
378 | getf.sig ret0 = f8 | |
379 | br.ret.sptk rp | |
380 | ;; | |
381 | .endp __divsi3 | |
382 | #endif | |
383 | ||
384 | #ifdef L__modsi3 | |
385 | // Compute a 32-bit integer modulus: ret0 = r32 - trunc(r32 / r33) * r33 (signed conversions). | |
386 | // | |
387 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
388 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
389 | // to get more than the 32 bits of precision that we need for SImode. | |
390 | // | |
391 | // ??? This is currently not used. It needs to be fixed to be more like the | |
392 | // above DImode routines. | |
393 | // | |
394 | // ??? Check to see if the error is less than .5ulp. We may need | |
395 | // some adjustment code to get precise enough results. | |
396 | // | |
397 | // ??? Should probably use max precision for the reciprocal computations. | |
398 | // Note: nothing here is predicated on the p6 written by frcpa. The result is returned in ret0, like every other routine in this file. | |
399 | // r32/f8 holds the dividend. r33/f9 holds the divisor. | |
400 | // f10 holds the value 2.0 during the iterations and the quotient afterwards. f11 holds the reciprocal approximation. | |
401 | // f12 is a temporary holding 2.0 - f9*f11, the factor each iteration multiplies into f11. | |
402 | ||
403 | .text | |
404 | .align 16 | |
405 | .global __modsi3 | |
406 | .proc __modsi3 | |
407 | __modsi3: | |
408 | .regstk 2,0,0,0 | |
409 | setf.sig f8 = r32 | |
410 | setf.sig f9 = r33 | |
411 | ;; | |
412 | fcvt.xf f8 = f8 | |
413 | fcvt.xf f9 = f9 | |
414 | ;; | |
415 | frcpa f11, p6 = f8, f9 | |
416 | fadd f10 = f1, f1 | |
417 | ;; | |
418 | fnma f12 = f9, f11, f10 | |
419 | ;; | |
420 | fmpy f11 = f11, f12 | |
421 | ;; | |
422 | fnma f12 = f9, f11, f10 | |
423 | ;; | |
424 | fmpy f11 = f11, f12 | |
425 | ;; | |
426 | fmpy f10 = f8, f11 | |
427 | ;; | |
428 | fcvt.fx.trunc f10 = f10 | |
429 | ;; | |
430 | fcvt.xf f10 = f10 | |
431 | ;; | |
432 | fnma f8 = f10, f9, f8 | |
433 | ;; | |
434 | fcvt.fx f8 = f8 | |
435 | ;; | |
436 | getf.sig ret0 = f8 | |
437 | br.ret.sptk rp | |
438 | ;; | |
439 | .endp __modsi3 | |
440 | #endif | |
441 | ||
442 | #ifdef L__udivsi3 | |
443 | // Compute a 32-bit unsigned integer quotient: ret0 = r32 / r33 (truncated, unsigned conversion). | |
444 | // | |
445 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
446 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
447 | // to get more than the 32 bits of precision that we need for SImode. | |
448 | // | |
449 | // ??? This is currently not used. It needs to be fixed to be more like the | |
450 | // above DImode routines. | |
451 | // | |
452 | // ??? Check to see if the error is less than .5ulp. We may need | |
453 | // some adjustment code to get precise enough results. | |
454 | // | |
455 | // ??? Should probably use max precision for the reciprocal computations. | |
456 | // Note: nothing here is predicated on the p6 written by frcpa. | |
457 | // r32/f8 holds the dividend. r33/f9 holds the divisor. | |
458 | // f10 holds the value 2.0. f11 holds the reciprocal approximation. | |
459 | // f12 is a temporary holding 2.0 - f9*f11, the factor each iteration multiplies into f11. | |
460 | // | |
461 | // This is the same as divsi3, except that we don't need fcvt instructions | |
462 | // before the frcpa, and the final conversion is the unsigned fcvt.fxu.trunc. | |
463 | ||
464 | .text | |
465 | .align 16 | |
466 | .global __udivsi3 | |
467 | .proc __udivsi3 | |
468 | __udivsi3: | |
469 | .regstk 2,0,0,0 | |
470 | setf.sig f8 = r32 | |
471 | setf.sig f9 = r33 | |
472 | ;; | |
473 | frcpa f11, p6 = f8, f9 | |
474 | fadd f10 = f1, f1 | |
475 | ;; | |
476 | fnma f12 = f9, f11, f10 | |
477 | ;; | |
478 | fmpy f11 = f11, f12 | |
479 | ;; | |
480 | fnma f12 = f9, f11, f10 | |
481 | ;; | |
482 | fmpy f11 = f11, f12 | |
483 | ;; | |
484 | fmpy f8 = f8, f11 | |
485 | ;; | |
486 | fcvt.fxu.trunc f8 = f8 | |
487 | ;; | |
488 | getf.sig ret0 = f8 | |
489 | br.ret.sptk rp | |
490 | ;; | |
491 | .endp __udivsi3 | |
492 | #endif | |
493 | ||
494 | #ifdef L__umodsi3 | |
495 | // Compute a 32-bit unsigned integer modulus: ret0 = r32 - trunc(r32 / r33) * r33 (unsigned conversions). | |
496 | // | |
497 | // Use reciprocal approximation and Newton-Raphson iteration to compute the | |
498 | // quotient. frcpa gives 8.6 significant bits, so we need 2 iterations | |
499 | // to get more than the 32 bits of precision that we need for SImode. | |
500 | // | |
501 | // ??? This is currently not used. It needs to be fixed to be more like the | |
502 | // above DImode routines. | |
503 | // | |
504 | // ??? Check to see if the error is less than .5ulp. We may need | |
505 | // some adjustment code to get precise enough results. | |
506 | // | |
507 | // ??? Should probably use max precision for the reciprocal computations. | |
508 | // Note: nothing here is predicated on the p6 written by frcpa. The result is returned in ret0, like every other routine in this file. | |
509 | // r32/f8 holds the dividend. r33/f9 holds the divisor. | |
510 | // f10 holds the value 2.0 during the iterations and the quotient afterwards. f11 holds the reciprocal approximation. | |
511 | // f12 is a temporary holding 2.0 - f9*f11, the factor each iteration multiplies into f11. | |
512 | // | |
513 | // This is the same as modsi3, except that we don't need fcvt instructions | |
514 | // before the frcpa, and the later conversions are the unsigned forms. | |
515 | ||
516 | .text | |
517 | .align 16 | |
518 | .global __umodsi3 | |
519 | .proc __umodsi3 | |
520 | __umodsi3: | |
521 | .regstk 2,0,0,0 | |
522 | setf.sig f8 = r32 | |
523 | setf.sig f9 = r33 | |
524 | ;; | |
525 | frcpa f11, p6 = f8, f9 | |
526 | fadd f10 = f1, f1 | |
527 | ;; | |
528 | fnma f12 = f9, f11, f10 | |
529 | ;; | |
530 | fmpy f11 = f11, f12 | |
531 | ;; | |
532 | fnma f12 = f9, f11, f10 | |
533 | ;; | |
534 | fmpy f11 = f11, f12 | |
535 | ;; | |
536 | fmpy f10 = f8, f11 | |
537 | ;; | |
538 | fcvt.fxu.trunc f10 = f10 | |
539 | ;; | |
540 | fcvt.xuf f10 = f10 | |
541 | ;; | |
542 | fnma f8 = f10, f9, f8 | |
543 | ;; | |
544 | fcvt.fxu f8 = f8 | |
545 | ;; | |
546 | getf.sig ret0 = f8 | |
547 | br.ret.sptk rp | |
548 | ;; | |
549 | .endp __umodsi3 | |
550 | #endif | |
551 | ||
552 | #ifdef L__save_stack_nonlocal | |
553 | // Notes on save/restore stack nonlocal: We read ar.bsp but write | |
554 | // ar.bspstore. This is because ar.bsp can be read at all times | |
555 | // (independent of the RSE mode) but since it's read-only we need to | |
556 | // restore the value via ar.bspstore. This is OK because | |
557 | // ar.bsp==ar.bspstore after executing "flushrs". | |
558 | ||
559 | // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer): stores four 8-byte words at save_area: +0 stack_pointer, +8 ar.bsp, +16 ar.rnat, +24 the ar.pfs value returned by alloc. ar.rsc is masked with 0x1c (clearing its low two bits, presumably the RSE mode field) while ar.bsp and ar.rnat are read, and is then rewritten with the low two bits set to 3. Scratch registers used: r2, r16-r19; in0 is advanced by the post-increment stores. | |
560 | ||
561 | .text | |
562 | .align 16 | |
563 | .global __ia64_save_stack_nonlocal | |
564 | .proc __ia64_save_stack_nonlocal | |
565 | __ia64_save_stack_nonlocal: | |
566 | alloc r18=ar.pfs,2,0,0,0 | |
567 | st8 [in0]=in1,8 | |
568 | mov r19=ar.rsc | |
569 | ;; | |
570 | flushrs | |
571 | and r19=0x1c,r19 | |
572 | mov ar.pfs=r18 | |
573 | ;; | |
574 | mov ar.rsc=r19 | |
575 | mov r16=ar.bsp | |
576 | adds r2=16,in0 | |
577 | ;; | |
578 | mov r17=ar.rnat | |
579 | st8 [in0]=r16,8 | |
580 | or r19=0x3,r19 | |
581 | ;; | |
582 | st8 [in0]=r17 | |
583 | mov ar.rsc=r19 | |
584 | st8 [r2]=r18 | |
585 | mov ar.pfs=r18 | |
586 | br.ret.sptk.few rp | |
587 | ;; | |
588 | .endp __ia64_save_stack_nonlocal | |
589 | #endif | |
590 | ||
591 | #ifdef L__nonlocal_goto | |
592 | // void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area, void *static_chain); | |
593 | // Reloads state from the four 8-byte words at save_area: +0 into r12, +8 into ar.bspstore, +16 into ar.rnat, +24 into ar.pfs. It then copies fp (r32) to r7 and static_chain (r35) to r15, and returns through rp, which was set to target_label (r33). ar.rsc is masked with 0x1c while the RSE registers are rewritten and is finally written with its low two bits set to 3. Scratch registers used: r2, r16-r20. | |
594 | ||
595 | .text | |
596 | .align 16 | |
597 | .global __ia64_nonlocal_goto | |
598 | .proc __ia64_nonlocal_goto | |
599 | __ia64_nonlocal_goto: | |
600 | alloc r20=ar.pfs,4,0,0,0 | |
601 | mov r19=ar.rsc | |
602 | adds r2=8,in2 | |
603 | ld8 r12=[in2],16 | |
604 | mov.ret.sptk.few.dc.dc rp = r33, .L0 | |
605 | // ??? flushrs must be the first instruction of a group. Gas unfortunately | |
606 | // puts the stop bit before the padding nop instead of after it, making | |
607 | // flushrs the first instruction of its bundle, but the second instruction | |
608 | // of its group. We explicitly add the nop to avoid this problem. | |
609 | nop.i 0 | |
610 | ;; | |
611 | flushrs | |
612 | ld8 r16=[r2],16 | |
613 | and r19=0x1c,r19 | |
614 | ld8 r17=[in2] | |
615 | ;; | |
616 | ld8 r18=[r2] | |
617 | mov ar.rsc=r19 | |
618 | ;; | |
619 | mov ar.bspstore=r16 | |
620 | ;; | |
621 | mov ar.rnat=r17 | |
622 | mov ar.pfs=r18 | |
623 | or r19=0x3,r19 | |
624 | ;; | |
625 | loadrs | |
626 | invala | |
627 | mov r7=r32 | |
628 | .L0: { | |
629 | mov ar.rsc=r19 | |
630 | mov r15=r35 | |
631 | br.ret.sptk.few rp | |
632 | } | |
633 | ;; | |
634 | .endp __ia64_nonlocal_goto | |
635 | #endif | |