Commit | Line | Data |
---|---|---|
3f622353 RH |
1 | #ifdef L__divtf3 |
2 | // Compute a 80-bit IEEE double-extended quotient. | |
3 | // | |
4 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
5 | // alternative. | |
6 | // | |
7 | // farg0 holds the dividend. farg1 holds the divisor. | |
8 | ||
9 | .text | |
10 | .align 16 | |
11 | .global __divtf3 | |
12 | .proc __divtf3 | |
13 | __divtf3: | |
// NOTE(review): p7 is preset true here and cleared only under p6 below.
// frcpa sets p6 when its reciprocal seed needs Newton-Raphson refinement;
// otherwise p7 stays set and the (p7) mov at the end returns frcpa's own
// result for the special-operand cases.
3392dafc RH |
14 | cmp.eq p7, p0 = r0, r0 |
15 | frcpa.s0 f10, p6 = farg0, farg1 | |
3f622353 | 16 | ;; |
3392dafc RH |
17 | (p6) cmp.ne p7, p0 = r0, r0 |
18 | .pred.rel.mutex p6, p7 | |
3f622353 | 19 | (p6) fnma.s1 f11 = farg1, f10, f1 |
3392dafc | 20 | (p6) fmpy.s1 f12 = farg0, f10 |
3f622353 | 21 | ;; |
3392dafc RH |
22 | (p6) fmpy.s1 f13 = f11, f11 |
23 | (p6) fma.s1 f14 = f11, f11, f11 | |
3f622353 | 24 | ;; |
3392dafc RH |
25 | (p6) fma.s1 f11 = f13, f13, f11 |
26 | (p6) fma.s1 f13 = f14, f10, f10 | |
3f622353 | 27 | ;; |
3392dafc RH |
28 | (p6) fma.s1 f10 = f13, f11, f10 |
29 | (p6) fnma.s1 f12 = farg1, f12, farg0 | |
3f622353 | 30 | ;; |
3392dafc RH |
31 | (p6) fma.s1 f11 = f11, f10, f12 |
32 | (p6) fnma.s1 f13 = farg1, f10, f1 | |
3f622353 | 33 | ;; |
3392dafc RH |
34 | (p6) fma.s1 f10 = f12, f10, f10 |
35 | (p6) fnma.s1 f12 = farg1, f11, farg0 | |
3f622353 | 36 | ;; |
// Final step: fret0 = f12 * f10 + f11, i.e. remainder * reciprocal plus
// the quotient estimate, rounded in the caller-visible mode (no .s1).
3392dafc RH |
37 | (p6) fma fret0 = f12, f10, f11 |
38 | (p7) mov fret0 = f10 | |
3f622353 RH |
39 | br.ret.sptk rp |
40 | ;; | |
41 | .endp __divtf3 | |
42 | #endif | |
43 | ||
c65ebc55 JW |
44 | #ifdef L__divdf3 |
45 | // Compute a 64-bit IEEE double quotient. | |
46 | // | |
47 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
48 | // alternative. | |
49 | // | |
50 | // farg0 holds the dividend. farg1 holds the divisor. | |
51 | ||
52 | .text | |
53 | .align 16 | |
54 | .global __divdf3 | |
55 | .proc __divdf3 | |
56 | __divdf3: | |
// p7 is preset true and cleared under p6: it selects frcpa's direct result
// when no Newton-Raphson refinement is performed (special operands).
3392dafc RH |
57 | cmp.eq p7, p0 = r0, r0 |
58 | frcpa.s0 f10, p6 = farg0, farg1 | |
c65ebc55 | 59 | ;; |
3392dafc RH |
60 | (p6) cmp.ne p7, p0 = r0, r0 |
61 | .pred.rel.mutex p6, p7 | |
62 | (p6) fmpy.s1 f11 = farg0, f10 | |
c65ebc55 JW |
63 | (p6) fnma.s1 f12 = farg1, f10, f1 |
64 | ;; | |
65 | (p6) fma.s1 f11 = f12, f11, f11 | |
3392dafc | 66 | (p6) fmpy.s1 f13 = f12, f12 |
c65ebc55 | 67 | ;; |
3392dafc | 68 | (p6) fma.s1 f10 = f12, f10, f10 |
c65ebc55 | 69 | (p6) fma.s1 f11 = f13, f11, f11 |
3392dafc RH |
70 | ;; |
71 | (p6) fmpy.s1 f12 = f13, f13 | |
c65ebc55 JW |
72 | (p6) fma.s1 f10 = f13, f10, f10 |
73 | ;; | |
74 | (p6) fma.d.s1 f11 = f12, f11, f11 | |
75 | (p6) fma.s1 f10 = f12, f10, f10 | |
76 | ;; | |
// f8 = farg0 - farg1 * f11: remainder of the double-rounded quotient
// estimate, folded back in by the final fma.d below.
77 | (p6) fnma.d.s1 f8 = farg1, f11, farg0 | |
78 | ;; | |
3392dafc RH |
79 | (p6) fma.d fret0 = f8, f10, f11 |
80 | (p7) mov fret0 = f10 | |
c65ebc55 JW |
81 | br.ret.sptk rp |
82 | ;; | |
83 | .endp __divdf3 | |
84 | #endif | |
85 | ||
86 | #ifdef L__divsf3 | |
87 | // Compute a 32-bit IEEE float quotient. | |
88 | // | |
89 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
90 | // alternative. | |
91 | // | |
92 | // farg0 holds the dividend. farg1 holds the divisor. | |
93 | ||
94 | .text | |
95 | .align 16 | |
96 | .global __divsf3 | |
97 | .proc __divsf3 | |
98 | __divsf3: | |
// p7 is preset true and cleared under p6: it selects frcpa's direct result
// when no Newton-Raphson refinement is performed (special operands).
938566fb | 99 | cmp.eq p7, p0 = r0, r0 |
3392dafc | 100 | frcpa.s0 f10, p6 = farg0, farg1 |
c65ebc55 | 101 | ;; |
938566fb RH |
102 | (p6) cmp.ne p7, p0 = r0, r0 |
103 | .pred.rel.mutex p6, p7 | |
3392dafc | 104 | (p6) fmpy.s1 f8 = farg0, f10 |
c65ebc55 JW |
105 | (p6) fnma.s1 f9 = farg1, f10, f1 |
106 | ;; | |
107 | (p6) fma.s1 f8 = f9, f8, f8 | |
3392dafc | 108 | (p6) fmpy.s1 f9 = f9, f9 |
c65ebc55 JW |
109 | ;; |
110 | (p6) fma.s1 f8 = f9, f8, f8 | |
3392dafc | 111 | (p6) fmpy.s1 f9 = f9, f9 |
c65ebc55 | 112 | ;; |
3392dafc | 113 | (p6) fma.d.s1 f10 = f9, f8, f8 |
c65ebc55 | 114 | ;; |
// fnorm.s rounds the refined quotient down to single precision in the
// user-visible status field (.s0) so traps/flags are raised correctly.
938566fb RH |
115 | (p6) fnorm.s.s0 fret0 = f10 |
116 | (p7) mov fret0 = f10 | |
c65ebc55 JW |
117 | br.ret.sptk rp |
118 | ;; | |
119 | .endp __divsf3 | |
120 | #endif | |
121 | ||
122 | #ifdef L__divdi3 | |
123 | // Compute a 64-bit integer quotient. | |
124 | // | |
d8d7a286 RH |
125 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
126 | // alternative. | |
c65ebc55 | 127 | // |
d8d7a286 | 128 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
129 | |
130 | .text | |
131 | .align 16 | |
132 | .global __divdi3 | |
133 | .proc __divdi3 | |
134 | __divdi3: | |
135 | .regstk 2,0,0,0 | |
136 | // Transfer inputs to FP registers. | |
137 | setf.sig f8 = in0 | |
138 | setf.sig f9 = in1 | |
139 | ;; | |
140 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
141 | fcvt.xf f8 = f8 | |
142 | fcvt.xf f9 = f9 | |
143 | ;; | |
144 | // Compute the reciprocal approximation. | |
660a0ebd | 145 | frcpa.s1 f10, p6 = f8, f9 |
2a7ffc85 | 146 | ;; |
c65ebc55 | 147 | // 3 Newton-Raphson iterations. |
d8d7a286 RH |
148 | (p6) fnma.s1 f11 = f9, f10, f1 |
149 | (p6) fmpy.s1 f12 = f8, f10 | |
c65ebc55 | 150 | ;; |
d8d7a286 RH |
151 | (p6) fmpy.s1 f13 = f11, f11 |
152 | (p6) fma.s1 f12 = f11, f12, f12 | |
c65ebc55 | 153 | ;; |
d8d7a286 RH |
154 | (p6) fma.s1 f10 = f11, f10, f10 |
155 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 156 | ;; |
d8d7a286 RH |
157 | (p6) fma.s1 f10 = f13, f10, f10 |
158 | (p6) fnma.s1 f12 = f9, f11, f8 | |
c65ebc55 | 159 | ;; |
// q = f12 * f10 + f11 (remainder * reciprocal + quotient estimate).
// NOTE(review): presumably accurate enough that the truncation below is
// exact for all int64 operands -- per Intel's divide algorithm notes.
d8d7a286 | 160 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
161 | ;; |
162 | // Round quotient to an integer. | |
d8d7a286 | 163 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 JW |
164 | ;; |
165 | // Transfer result to GP registers. | |
d8d7a286 | 166 | getf.sig ret0 = f10 |
c65ebc55 JW |
167 | br.ret.sptk rp |
168 | ;; | |
169 | .endp __divdi3 | |
170 | #endif | |
171 | ||
172 | #ifdef L__moddi3 | |
173 | // Compute a 64-bit integer modulus. | |
174 | // | |
d8d7a286 RH |
175 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
176 | // alternative. | |
c65ebc55 | 177 | // |
d8d7a286 | 178 | // in0 holds the dividend (a). in1 holds the divisor (b). |
c65ebc55 JW |
179 | |
180 | .text | |
181 | .align 16 | |
182 | .global __moddi3 | |
183 | .proc __moddi3 | |
184 | __moddi3: | |
185 | .regstk 2,0,0,0 | |
186 | // Transfer inputs to FP registers. | |
// f14 keeps the raw integer dividend for the final xma.l fixup below.
d8d7a286 | 187 | setf.sig f14 = in0 |
c65ebc55 JW |
188 | setf.sig f9 = in1 |
189 | ;; | |
190 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
d8d7a286 | 191 | fcvt.xf f8 = f14 |
c65ebc55 JW |
192 | fcvt.xf f9 = f9 |
193 | ;; | |
194 | // Compute the reciprocal approximation. | |
660a0ebd | 195 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
196 | ;; |
197 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
198 | (p6) fmpy.s1 f12 = f8, f10 |
199 | (p6) fnma.s1 f11 = f9, f10, f1 | |
c65ebc55 | 200 | ;; |
d8d7a286 RH |
201 | (p6) fma.s1 f12 = f11, f12, f12 |
202 | (p6) fmpy.s1 f13 = f11, f11 | |
c65ebc55 | 203 | ;; |
d8d7a286 RH |
204 | (p6) fma.s1 f10 = f11, f10, f10 |
205 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 206 | ;; |
// The divisor is negated on the integer side in parallel with the FP
// iteration so the final xma.l can compute r = q * (-b) + a directly.
d8d7a286 RH |
207 | sub in1 = r0, in1 |
208 | (p6) fma.s1 f10 = f13, f10, f10 | |
c65ebc55 JW |
209 | (p6) fnma.s1 f12 = f9, f11, f8 |
210 | ;; | |
d8d7a286 | 211 | setf.sig f9 = in1 |
660a0ebd | 212 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 | 213 | ;; |
214 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 215 | ;; |
d8d7a286 RH |
216 | // r = q * (-b) + a |
217 | xma.l f10 = f10, f9, f14 | |
c65ebc55 JW |
218 | ;; |
219 | // Transfer result to GP registers. | |
d8d7a286 | 220 | getf.sig ret0 = f10 |
c65ebc55 JW |
221 | br.ret.sptk rp |
222 | ;; | |
223 | .endp __moddi3 | |
224 | #endif | |
225 | ||
226 | #ifdef L__udivdi3 | |
227 | // Compute a 64-bit unsigned integer quotient. | |
228 | // | |
d8d7a286 RH |
229 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
230 | // alternative. | |
c65ebc55 | 231 | // |
d8d7a286 | 232 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
233 | |
234 | .text | |
235 | .align 16 | |
236 | .global __udivdi3 | |
237 | .proc __udivdi3 | |
238 | __udivdi3: | |
239 | .regstk 2,0,0,0 | |
240 | // Transfer inputs to FP registers. | |
241 | setf.sig f8 = in0 | |
242 | setf.sig f9 = in1 | |
243 | ;; | |
244 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
660a0ebd JW |
245 | fcvt.xuf.s1 f8 = f8 |
246 | fcvt.xuf.s1 f9 = f9 | |
c65ebc55 JW |
247 | ;; |
248 | // Compute the reciprocal approximation. | |
660a0ebd | 249 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
250 | ;; |
251 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
252 | (p6) fnma.s1 f11 = f9, f10, f1 |
253 | (p6) fmpy.s1 f12 = f8, f10 | |
c65ebc55 | 254 | ;; |
d8d7a286 RH |
255 | (p6) fmpy.s1 f13 = f11, f11 |
256 | (p6) fma.s1 f12 = f11, f12, f12 | |
c65ebc55 | 257 | ;; |
d8d7a286 RH |
258 | (p6) fma.s1 f10 = f11, f10, f10 |
259 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 260 | ;; |
d8d7a286 RH |
261 | (p6) fma.s1 f10 = f13, f10, f10 |
262 | (p6) fnma.s1 f12 = f9, f11, f8 | |
c65ebc55 | 263 | ;; |
// q = f12 * f10 + f11 (remainder * reciprocal + quotient estimate).
2a7ffc85 | 264 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
265 | ;; |
266 | // Round quotient to an unsigned integer. | |
d8d7a286 | 267 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 JW |
268 | ;; |
269 | // Transfer result to GP registers. | |
d8d7a286 | 270 | getf.sig ret0 = f10 |
c65ebc55 JW |
271 | br.ret.sptk rp |
272 | ;; | |
273 | .endp __udivdi3 | |
274 | #endif | |
275 | ||
276 | #ifdef L__umoddi3 | |
277 | // Compute a 64-bit unsigned integer modulus. | |
278 | // | |
d8d7a286 RH |
279 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
280 | // alternative. | |
c65ebc55 | 281 | // |
d8d7a286 | 282 | // in0 holds the dividend (a). in1 holds the divisor (b). |
c65ebc55 JW |
283 | |
284 | .text | |
285 | .align 16 | |
286 | .global __umoddi3 | |
287 | .proc __umoddi3 | |
288 | __umoddi3: | |
289 | .regstk 2,0,0,0 | |
290 | // Transfer inputs to FP registers. | |
// f14 keeps the raw integer dividend for the final xma.l fixup below.
d8d7a286 | 291 | setf.sig f14 = in0 |
c65ebc55 JW |
292 | setf.sig f9 = in1 |
293 | ;; | |
294 | // Convert the inputs to FP, to avoid FP software assist faults. | |
d8d7a286 | 295 | fcvt.xuf.s1 f8 = f14 |
660a0ebd | 296 | fcvt.xuf.s1 f9 = f9 |
c65ebc55 JW |
297 | ;; |
298 | // Compute the reciprocal approximation. | |
660a0ebd | 299 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
300 | ;; |
301 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
302 | (p6) fmpy.s1 f12 = f8, f10 |
303 | (p6) fnma.s1 f11 = f9, f10, f1 | |
c65ebc55 | 304 | ;; |
d8d7a286 RH |
305 | (p6) fma.s1 f12 = f11, f12, f12 |
306 | (p6) fmpy.s1 f13 = f11, f11 | |
c65ebc55 | 307 | ;; |
d8d7a286 RH |
308 | (p6) fma.s1 f10 = f11, f10, f10 |
309 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 310 | ;; |
// The divisor is negated on the integer side in parallel with the FP
// iteration so the final xma.l can compute r = q * (-b) + a directly.
d8d7a286 RH |
311 | sub in1 = r0, in1 |
312 | (p6) fma.s1 f10 = f13, f10, f10 | |
c65ebc55 JW |
313 | (p6) fnma.s1 f12 = f9, f11, f8 |
314 | ;; | |
d8d7a286 | 315 | setf.sig f9 = in1 |
660a0ebd | 316 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
317 | ;; |
318 | // Round quotient to an unsigned integer. | |
660a0ebd | 319 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 320 | ;; |
d8d7a286 RH |
321 | // r = q * (-b) + a |
322 | xma.l f10 = f10, f9, f14 | |
c65ebc55 JW |
323 | ;; |
324 | // Transfer result to GP registers. | |
d8d7a286 | 325 | getf.sig ret0 = f10 |
c65ebc55 JW |
326 | br.ret.sptk rp |
327 | ;; | |
328 | .endp __umoddi3 | |
329 | #endif | |
330 | ||
331 | #ifdef L__divsi3 | |
332 | // Compute a 32-bit integer quotient. | |
333 | // | |
d8d7a286 RH |
334 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
335 | // alternative. | |
c65ebc55 | 336 | // |
d8d7a286 | 337 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
338 | |
339 | .text | |
340 | .align 16 | |
341 | .global __divsi3 | |
342 | .proc __divsi3 | |
343 | __divsi3: | |
344 | .regstk 2,0,0,0 | |
// Sign-extend the 32-bit operands before converting, so the FP divide
// below sees their true signed values.
d8d7a286 RH |
345 | sxt4 in0 = in0 |
346 | sxt4 in1 = in1 | |
347 | ;; | |
c65ebc55 JW |
348 | setf.sig f8 = in0 |
349 | setf.sig f9 = in1 | |
350 | ;; | |
// NOTE(review): 0x0ffdd is a biased exponent (register-format bias
// 0xffff), so setf.exp below presumably builds f11 = 1.0 * 2^-34; it pads
// the correction term so this shortened iteration truncates exactly for
// 32-bit operands -- confirm against Intel's divide algorithm note.
d8d7a286 | 351 | mov r2 = 0x0ffdd |
c65ebc55 JW |
352 | fcvt.xf f8 = f8 |
353 | fcvt.xf f9 = f9 | |
354 | ;; | |
d8d7a286 | 355 | setf.exp f11 = r2 |
4287b5f1 | 356 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 357 | ;; |
d8d7a286 RH |
358 | (p6) fmpy.s1 f8 = f8, f10 |
359 | (p6) fnma.s1 f9 = f9, f10, f1 | |
c65ebc55 | 360 | ;; |
d8d7a286 RH |
361 | (p6) fma.s1 f8 = f9, f8, f8 |
362 | (p6) fma.s1 f9 = f9, f9, f11 | |
c65ebc55 | 363 | ;; |
d8d7a286 | 364 | (p6) fma.s1 f10 = f9, f8, f8 |
c65ebc55 | 365 | ;; |
366 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 367 | ;; |
d8d7a286 | 368 | getf.sig ret0 = f10 |
c65ebc55 JW |
369 | br.ret.sptk rp |
370 | ;; | |
371 | .endp __divsi3 | |
372 | #endif | |
373 | ||
374 | #ifdef L__modsi3 | |
375 | // Compute a 32-bit integer modulus. | |
376 | // | |
d8d7a286 RH |
377 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
378 | // alternative. | |
c65ebc55 | 379 | // |
d8d7a286 | 380 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
381 | |
382 | .text | |
383 | .align 16 | |
384 | .global __modsi3 | |
385 | .proc __modsi3 | |
386 | __modsi3: | |
387 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff); it pads the correction term of the short iteration below.
d8d7a286 RH |
388 | mov r2 = 0x0ffdd |
389 | sxt4 in0 = in0 | |
390 | sxt4 in1 = in1 | |
391 | ;; | |
// r32/r33 are in0/in1 under .regstk 2.  f13 keeps the integer dividend
// for the final xma.l; f9 is reloaded with the negated divisor below.
392 | setf.sig f13 = r32 | |
c65ebc55 JW |
393 | setf.sig f9 = r33 |
394 | ;; | |
d8d7a286 RH |
395 | sub in1 = r0, in1 |
396 | fcvt.xf f8 = f13 | |
c65ebc55 JW |
397 | fcvt.xf f9 = f9 |
398 | ;; | |
d8d7a286 | 399 | setf.exp f11 = r2 |
4287b5f1 | 400 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 401 | ;; |
d8d7a286 RH |
402 | (p6) fmpy.s1 f12 = f8, f10 |
403 | (p6) fnma.s1 f10 = f9, f10, f1 | |
c65ebc55 | 404 | ;; |
d8d7a286 RH |
405 | setf.sig f9 = in1 |
406 | (p6) fma.s1 f12 = f10, f12, f12 | |
407 | (p6) fma.s1 f10 = f10, f10, f11 | |
c65ebc55 | 408 | ;; |
d8d7a286 | 409 | (p6) fma.s1 f10 = f10, f12, f12 |
c65ebc55 | 410 | ;; |
411 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 412 | ;; |
// r = q * (-b) + a, all in the 64-bit integer multiply-add unit.
d8d7a286 | 413 | xma.l f10 = f10, f9, f13 |
c65ebc55 | 414 | ;; |
415 | getf.sig ret0 = f10 |
c65ebc55 JW |
416 | br.ret.sptk rp |
417 | ;; | |
418 | .endp __modsi3 | |
419 | #endif | |
420 | ||
421 | #ifdef L__udivsi3 | |
422 | // Compute a 32-bit unsigned integer quotient. | |
423 | // | |
d8d7a286 RH |
424 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
425 | // alternative. | |
c65ebc55 | 426 | // |
d8d7a286 | 427 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
428 | |
429 | .text | |
430 | .align 16 | |
431 | .global __udivsi3 | |
432 | .proc __udivsi3 | |
433 | __udivsi3: | |
434 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff), padding the correction term of the shortened iteration.
// zxt4 zero-extends the 32-bit unsigned operands first.
d8d7a286 RH |
435 | mov r2 = 0x0ffdd |
436 | zxt4 in0 = in0 | |
437 | zxt4 in1 = in1 | |
c65ebc55 | 438 | ;; |
d8d7a286 RH |
439 | setf.sig f8 = in0 |
440 | setf.sig f9 = in1 | |
c65ebc55 | 441 | ;; |
4287b5f1 RH |
442 | fcvt.xf f8 = f8 |
443 | fcvt.xf f9 = f9 | |
444 | ;; | |
d8d7a286 | 445 | setf.exp f11 = r2 |
4287b5f1 | 446 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 447 | ;; |
d8d7a286 RH |
448 | (p6) fmpy.s1 f8 = f8, f10 |
449 | (p6) fnma.s1 f9 = f9, f10, f1 | |
c65ebc55 | 450 | ;; |
d8d7a286 RH |
451 | (p6) fma.s1 f8 = f9, f8, f8 |
452 | (p6) fma.s1 f9 = f9, f9, f11 | |
c65ebc55 | 453 | ;; |
d8d7a286 | 454 | (p6) fma.s1 f10 = f9, f8, f8 |
c65ebc55 | 455 | ;; |
456 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 457 | ;; |
d8d7a286 | 458 | getf.sig ret0 = f10 |
c65ebc55 JW |
459 | br.ret.sptk rp |
460 | ;; | |
461 | .endp __udivsi3 | |
462 | #endif | |
463 | ||
464 | #ifdef L__umodsi3 | |
465 | // Compute a 32-bit unsigned integer modulus. | |
466 | // | |
d8d7a286 RH |
467 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
468 | // alternative. | |
c65ebc55 | 469 | // |
d8d7a286 | 470 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
471 | |
472 | .text | |
473 | .align 16 | |
474 | .global __umodsi3 | |
475 | .proc __umodsi3 | |
476 | __umodsi3: | |
477 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff), padding the correction term of the shortened iteration.
d8d7a286 RH |
478 | mov r2 = 0x0ffdd |
479 | zxt4 in0 = in0 | |
480 | zxt4 in1 = in1 | |
c65ebc55 | 481 | ;; |
// f13 keeps the integer dividend for the final xma.l fixup.
d8d7a286 RH |
482 | setf.sig f13 = in0 |
483 | setf.sig f9 = in1 | |
c65ebc55 | 484 | ;; |
// Negate the divisor on the integer side so the final xma.l computes
// r = q * (-b) + a directly.
d8d7a286 RH |
485 | sub in1 = r0, in1 |
486 | fcvt.xf f8 = f13 | |
487 | fcvt.xf f9 = f9 | |
c65ebc55 | 488 | ;; |
d8d7a286 | 489 | setf.exp f11 = r2 |
4287b5f1 | 490 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 491 | ;; |
d8d7a286 RH |
492 | (p6) fmpy.s1 f12 = f8, f10 |
493 | (p6) fnma.s1 f10 = f9, f10, f1 | |
c65ebc55 | 494 | ;; |
2a7ffc85 | 495 | setf.sig f9 = in1 |
d8d7a286 RH |
496 | (p6) fma.s1 f12 = f10, f12, f12 |
497 | (p6) fma.s1 f10 = f10, f10, f11 | |
c65ebc55 | 498 | ;; |
d8d7a286 | 499 | (p6) fma.s1 f10 = f10, f12, f12 |
c65ebc55 | 500 | ;; |
501 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 502 | ;; |
d8d7a286 | 503 | xma.l f10 = f10, f9, f13 |
c65ebc55 | 504 | ;; |
505 | getf.sig ret0 = f10 |
c65ebc55 JW |
506 | br.ret.sptk rp |
507 | ;; | |
508 | .endp __umodsi3 | |
509 | #endif | |
510 | ||
511 | #ifdef L__save_stack_nonlocal | |
512 | // Notes on save/restore stack nonlocal: We read ar.bsp but write | |
513 | // ar.bspstore. This is because ar.bsp can be read at all times | |
514 | // (independent of the RSE mode) but since it's read-only we need to | |
515 | // restore the value via ar.bspstore. This is OK because | |
516 | // ar.bsp==ar.bspstore after executing "flushrs". | |
517 | ||
518 | // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) | |
519 | ||
520 | .text | |
521 | .align 16 | |
522 | .global __ia64_save_stack_nonlocal | |
523 | .proc __ia64_save_stack_nonlocal | |
524 | __ia64_save_stack_nonlocal: | |
// Save-area layout as written by the stores below:
//   [0]  = stack pointer (in1)      [8]  = ar.bsp
//   [16] = ar.rnat                  [24] = ar.pfs (from alloc)
// ar.rsc handling: the mode bits are masked off (and 0x1c) while ar.rnat
// is read, then 0x3 is or'ed back in before returning.
// NOTE(review): confirm the rsc bit layout against the architecture manual.
97e242b0 RH |
525 | { .mmf |
526 | alloc r18 = ar.pfs, 2, 0, 0, 0 | |
527 | mov r19 = ar.rsc | |
528 | ;; | |
529 | } | |
530 | { .mmi | |
531 | flushrs | |
532 | st8 [in0] = in1, 24 | |
533 | and r19 = 0x1c, r19 | |
534 | ;; | |
535 | } | |
536 | { .mmi | |
537 | st8 [in0] = r18, -16 | |
538 | mov ar.rsc = r19 | |
539 | or r19 = 0x3, r19 | |
540 | ;; | |
541 | } | |
542 | { .mmi | |
543 | mov r16 = ar.bsp | |
544 | mov r17 = ar.rnat | |
545 | adds r2 = 8, in0 | |
546 | ;; | |
547 | } | |
548 | { .mmi | |
549 | st8 [in0] = r16 | |
550 | st8 [r2] = r17 | |
551 | } | |
552 | { .mib | |
553 | mov ar.rsc = r19 | |
554 | br.ret.sptk.few rp | |
555 | ;; | |
556 | } | |
c65ebc55 JW |
557 | .endp __ia64_save_stack_nonlocal |
558 | #endif | |
559 | ||
560 | #ifdef L__nonlocal_goto | |
561 | // void __ia64_nonlocal_goto(void *target_label, void *save_area, |
c65ebc55 JW |
562 | // void *static_chain); |
563 | ||
564 | .text | |
565 | .align 16 | |
566 | .global __ia64_nonlocal_goto | |
567 | .proc __ia64_nonlocal_goto | |
568 | __ia64_nonlocal_goto: | |
// Reads the save area in the order written by __ia64_save_stack_nonlocal:
// sp (-> r12), bsp (-> ar.bspstore), rnat, ar.pfs.  rp is pre-loaded with
// the target label, so the final br.ret both unwinds the register stack
// and transfers control there.  r15 carries the static chain --
// NOTE(review): register choice must match the compiler's
// STATIC_CHAIN_REGNUM; confirm.
97e242b0 RH |
569 | { .mmi |
570 | alloc r20 = ar.pfs, 3, 0, 0, 0 | |
571 | ld8 r12 = [in1], 8 | |
572 | mov.ret.sptk rp = in0, .L0 | |
573 | ;; | |
574 | } | |
575 | { .mmf | |
576 | ld8 r16 = [in1], 8 | |
577 | mov r19 = ar.rsc | |
578 | ;; | |
579 | } | |
580 | { .mmi | |
581 | flushrs | |
582 | ld8 r17 = [in1], 8 | |
583 | and r19 = 0x1c, r19 | |
584 | ;; | |
585 | } | |
586 | { .mmi | |
587 | ld8 r18 = [in1] | |
588 | mov ar.rsc = r19 | |
589 | or r19 = 0x3, r19 | |
590 | ;; | |
591 | } | |
592 | { .mmi | |
593 | mov ar.bspstore = r16 | |
594 | ;; | |
595 | mov ar.rnat = r17 | |
596 | ;; | |
597 | } | |
598 | { .mmi | |
599 | loadrs | |
600 | invala | |
601 | mov r15 = in2 | |
602 | ;; | |
603 | } | |
604 | .L0: { .mib | |
605 | mov ar.rsc = r19 | |
606 | mov ar.pfs = r18 | |
607 | br.ret.sptk.few rp | |
608 | ;; | |
c65ebc55 | 609 | }
c65ebc55 JW |
610 | .endp __ia64_nonlocal_goto
611 | #endif | |
9525c690 JW |
612 | |
613 | #ifdef L__restore_stack_nonlocal | |
614 | // This is mostly the same as nonlocal_goto above. | |
615 | // ??? This has not been tested yet. | |
616 | ||
617 | // void __ia64_restore_stack_nonlocal(void *save_area) | |
618 | ||
619 | .text | |
620 | .align 16 | |
621 | .global __ia64_restore_stack_nonlocal | |
622 | .proc __ia64_restore_stack_nonlocal | |
623 | __ia64_restore_stack_nonlocal: | |
// Restores sp/bspstore/rnat/ar.pfs from the save area, then returns to
// the caller's rp (unlike nonlocal_goto, rp is not redirected).
// NOTE(review): alloc declares 4 inputs although the prototype above
// takes one; the extra input slots appear unused here -- verify.
97e242b0 RH |
624 | { .mmf |
625 | alloc r20 = ar.pfs, 4, 0, 0, 0 | |
626 | ld8 r12 = [in0], 8 | |
627 | ;; | |
628 | } | |
629 | { .mmb | |
630 | ld8 r16=[in0], 8 | |
631 | mov r19 = ar.rsc | |
632 | ;; | |
633 | } | |
634 | { .mmi | |
635 | flushrs | |
636 | ld8 r17 = [in0], 8 | |
637 | and r19 = 0x1c, r19 | |
638 | ;; | |
639 | } | |
640 | { .mmf | |
641 | ld8 r18 = [in0] | |
642 | mov ar.rsc = r19 | |
643 | ;; | |
644 | } | |
645 | { .mmi | |
646 | mov ar.bspstore = r16 | |
647 | ;; | |
648 | mov ar.rnat = r17 | |
649 | or r19 = 0x3, r19 | |
650 | ;; | |
651 | } | |
652 | { .mmf | |
653 | loadrs | |
654 | invala | |
655 | ;; | |
656 | } | |
657 | .L0: { .mib | |
658 | mov ar.rsc = r19 | |
659 | mov ar.pfs = r18 | |
660 | br.ret.sptk.few rp | |
661 | ;; | |
9525c690 | 662 | }
9525c690 JW |
663 | .endp __ia64_restore_stack_nonlocal
664 | #endif | |
97e242b0 RH |
665 | |
666 | #ifdef L__trampoline | |
667 | // Implement the nested function trampoline. This is out of line | |
668 | // so that we don't have to bother with flushing the icache, as | |
669 | // well as making the on-stack trampoline smaller. | |
670 | // | |
671 | // The trampoline has the following form: | |
672 | // | |
0024a804 | 673 | // +-------------------+ > |
97e242b0 RH |
674 | // TRAMP: | __ia64_trampoline | | |
675 | // +-------------------+ > fake function descriptor | |
676 | // | TRAMP+16 | | | |
0024a804 | 677 | // +-------------------+ > |
97e242b0 RH |
678 | // | target descriptor | |
679 | // +-------------------+ | |
680 | // | static link | | |
681 | // +-------------------+ | |
682 | ||
683 | .text | |
684 | .align 16 | |
685 | .global __ia64_trampoline | |
686 | .proc __ia64_trampoline | |
687 | __ia64_trampoline: | |
// On entry, the fake descriptor has set r1 (gp) = TRAMP+16, i.e. the
// trampoline's data words.  Load the target descriptor address and the
// static link (-> r15), then the real entry point (-> b6) and the real
// gp (-> r1) from the target descriptor, and tail-branch to the target.
688 | { .mmi | |
689 | ld8 r2 = [r1], 8 | |
690 | ;; | |
691 | ld8 r15 = [r1] | |
692 | } | |
693 | { .mmi | |
694 | ld8 r3 = [r2], 8 | |
695 | ;; | |
696 | ld8 r1 = [r2] | |
697 | mov b6 = r3 | |
698 | } | |
699 | { .bbb | |
700 | br.sptk.many b6 | |
701 | ;; | |
702 | } | |
703 | .endp __ia64_trampoline | |
704 | #endif |