/* Copyright (C) 2000-2020 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	// p7 starts true; it is cleared below iff frcpa asks for the
	// Newton-Raphson refinement (p6).  So p7 == "frcpa already
	// produced the final result in f10" and the two predicates
	// are mutually exclusive.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of the reciprocal, then of the quotient.
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	// Final fma in .s0 rounds to the user-visible result.
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif
72 | ||
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// p7 = !p6: taken when frcpa delivered the final result itself.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of reciprocal (f10) and quotient (f11).
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// Remainder step, then final correction rounded to double.
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
114 | ||
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// p7 = !p6: taken when frcpa delivered the final result itself.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of the quotient estimate.
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	// fnorm.s rounds the double-precision intermediate to float.
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif
150 | ||
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif
203 | ||
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The raw dividend is kept in
	// f14 for the final remainder computation.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor in parallel, for the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
260 | ||
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif
313 | ||
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The raw dividend is kept in
	// f14 for the final remainder computation.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor in parallel, for the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif
371 | ||
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// 0x0ffdd is a biased FP exponent (bias 0xffff), used via setf.exp
	// as a small correction constant in the single refinement step.
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	// One Newton-Raphson step suffices for 32-bit operands.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif
417 | ||
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is a biased FP exponent (bias 0xffff), used via setf.exp
	// as a small correction constant in the single refinement step.
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Keep the raw dividend in f13 for the final remainder computation.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate the divisor for the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// One Newton-Raphson step suffices for 32-bit operands.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
467 | ||
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is a biased FP exponent (bias 0xffff), used via setf.exp
	// as a small correction constant in the single refinement step.
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Zero-extended 32-bit values are non-negative, so plain fcvt.xf
	// is safe here.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	// One Newton-Raphson step suffices for 32-bit operands.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif
513 | ||
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// 0x0ffdd is a biased FP exponent (bias 0xffff), used via setf.exp
	// as a small correction constant in the single refinement step.
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// Keep the raw dividend in f13 for the final remainder computation.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate the divisor for the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Trap (break fault) on division by zero.
(p7)	break 1
	;;
	// One Newton-Raphson step suffices for 32-bit operands.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif
563 | ||
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc		// save current RSE mode bits
	  ;;
	}
	{ .mmi
	  flushrs			// force ar.bsp == ar.bspstore
	  st8 [in0] = in1, 24		// save_area[0] = stack_pointer
	  and r19 = 0x1c, r19		// RSE mode = enforced lazy
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16		// save_area[3] = ar.pfs
	  mov ar.rsc = r19
	  or r19 = 0x3, r19		// mode bits for later restore
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16		// save_area[1] = ar.bsp
	  st8 [r2] = r17		// save_area[2] = ar.rnat
	}
	{ .mib
	  mov ar.rsc = r19		// restore RSE mode
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif
612 | ||
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8		// restore stack pointer
	  mov.ret.sptk rp = in0, .L0	// "return" will jump to the label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8		// saved ar.bsp
	  mov r19 = ar.rsc		// save current RSE mode bits
	  ;;
	}
	{ .mmi
	  flushrs			// force ar.bsp == ar.bspstore
	  ld8 r17 = [in1], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// RSE mode = enforced lazy
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]		// saved ar.pfs
	  mov ar.rsc = r19
	  or r19 = 0x3, r19		// mode bits for later restore
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16		// restore register backing store
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs			// invalidate dirty stacked regs
	  invala
	  mov r15 = in2			// static chain
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19		// restore RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp		// branch to the target label
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
9525c690 | 665 | |
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8		// restore stack pointer
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8		// saved ar.bsp
	  mov r19 = ar.rsc		// save current RSE mode bits
	  ;;
	}
	{ .mmi
	  flushrs			// force ar.bsp == ar.bspstore
	  ld8 r17 = [in0], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// RSE mode = enforced lazy
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]		// saved ar.pfs
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16		// restore register backing store
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19		// mode bits for later restore
	  ;;
	}
	{ .mmf
	  loadrs			// invalidate dirty stacked regs
	  invala
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19		// restore RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif
97e242b0 | 718 | |
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16          |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8		// r1 = TRAMP+16 (fake gp); load
	  ;;				//   address of target descriptor
	  ld8 r15 = [r1]		// static link
	}
	{ .mmi
	  ld8 r3 = [r2], 8		// target entry point
	  ;;
	  ld8 r1 = [r2]			// target gp
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6		// tail-jump to the target
	  ;;
	}
	.endp __ia64_trampoline
#endif
02befdf4 | 758 | |
#ifdef SHARED
// Thunks for backward compatibility.  The TFmode entry points simply
// tail-branch to the corresponding XFmode implementations.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
#endif