]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/ia64/lib1funcs.asm
Add ia64 port.
[thirdparty/gcc.git] / gcc / config / ia64 / lib1funcs.asm
CommitLineData
c65ebc55
JW
1#ifdef L__divdf3
2// Compute a 64-bit IEEE double quotient: fret0 = farg0 / farg1.
3//
4// From the Intel IA-64 Optimization Guide, choose the minimum latency
5// alternative.
6// f10 = reciprocal estimate y, f11 = quotient estimate q, f12/f13 = error e.
7// farg0 holds the dividend (a). farg1 holds the divisor (b).
8
9 .text
10 .align 16
11 .global __divdf3
12 .proc __divdf3
13__divdf3:
14 frcpa f10, p6 = farg0, farg1 // y0 ~ 1/b, the steps below run only if p6 is set
15 ;;
16(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
17(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
18 ;;
19(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
20(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
21(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
22 ;;
23(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
24(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
25(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
26 ;;
27(p6) fma.d.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2, double precision
28(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
29 ;;
30(p6) fnma.d.s1 f8 = farg1, f11, farg0 // r = a - b * q3
31 ;;
32(p6) fma.d f10 = f8, f10, f11 // result = q3 + r * y3 (no .s1 on this last step)
33 ;;
34 mov fret0 = f10 // refined quotient, or the frcpa value when p6 was clear
35 br.ret.sptk rp
36 ;;
37 .endp __divdf3
38#endif
39
40#ifdef L__divsf3
41// Compute a 32-bit IEEE float quotient: fret0 = farg0 / farg1.
42//
43// From the Intel IA-64 Optimization Guide, choose the minimum latency
44// alternative.
45// f10 = reciprocal estimate y0, f8 = quotient estimate q, f9 = error e.
46// farg0 holds the dividend (a). farg1 holds the divisor (b).
47
48 .text
49 .align 16
50 .global __divsf3
51 .proc __divsf3
52__divsf3:
53 frcpa f10, p6 = farg0, farg1 // y0 ~ 1/b, the steps below run only if p6 is set
54 ;;
55(p6) fma.s1 f8 = farg0, f10, f0 // q0 = a * y0
56(p6) fnma.s1 f9 = farg1, f10, f1 // e0 = 1 - b * y0
57 ;;
58(p6) fma.s1 f8 = f9, f8, f8 // q1 = q0 + e0 * q0
59(p6) fma.s1 f9 = f9, f9, f0 // e1 = e0 * e0
60 ;;
61(p6) fma.s1 f8 = f9, f8, f8 // q2 = q1 + e1 * q1
62(p6) fma.s1 f9 = f9, f9, f0 // e2 = e1 * e1
63 ;;
64(p6) fma.d.s1 f8 = f9, f8, f8 // q3 = q2 + e2 * q2, double precision
65 ;;
66(p6) fma.s f10 = f8, f1, f0 // result = q3 * 1 + 0, rounded to single precision
67 ;;
68 mov fret0 = f10 // rounded quotient, or the frcpa value when p6 was clear
69 br.ret.sptk rp
70 ;;
71 .endp __divsf3
72#endif
73
74#ifdef L__divdi3
75// Compute a 64-bit signed integer quotient.
76//
77// Use reciprocal approximation and Newton-Raphson iteration to compute the
78// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
79// to get more than the 64 bits of precision that we need for DImode.
80//
81// Must use max precision for the reciprocal computations to get 64 bits of
82// precision.
83//
84// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
85// f10 holds the reciprocal approximation (y). f11 holds the quotient
86// approximation (q). f12 and f13 hold error terms (e).
87
88 .text
89 .align 16
90 .global __divdi3
91 .proc __divdi3
92__divdi3:
93 .regstk 2,0,0,0
94 // Transfer inputs to FP registers.
95 setf.sig f8 = in0
96 setf.sig f9 = in1
97 ;;
98 // Convert the inputs to FP, so that they won't be treated as unsigned.
99 fcvt.xf f8 = f8
100 fcvt.xf f9 = f9
101 ;;
102 // Compute the reciprocal approximation.
103 frcpa f10, p6 = f8, f9 // y0 ~ 1/b, the iterations below run only if p6 is set
104 ;;
105 // 3 Newton-Raphson iterations.
106(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0 (farg0 is assumed to be the f8 alias)
107(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0 (farg1 is assumed to be the f9 alias)
108 ;;
109(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
110(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
111(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
112 ;;
113(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
114(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
115(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
116 ;;
117(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
118(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
119 ;;
120(p6) fnma.s1 f8 = f9, f11, f8 // r = a - b * q3 (the dividend in f8 is overwritten)
121 ;;
122(p6) fma f10 = f8, f10, f11 // q = q3 + r * y3
123 ;;
124 // Round quotient to an integer.
125 fcvt.fx.trunc f8 = f10
126 ;;
127 // Transfer result to GP registers.
128 getf.sig ret0 = f8
129 br.ret.sptk rp
130 ;;
131 .endp __divdi3
132#endif
133
134#ifdef L__moddi3
135// Compute a 64-bit signed integer modulus.
136//
137// Use reciprocal approximation and Newton-Raphson iteration to compute the
138// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
139// to get more than the 64 bits of precision that we need for DImode.
140//
141// Must use max precision for the reciprocal computations to get 64 bits of
142// precision.
143//
144// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
145// f10 holds the reciprocal approximation (y), later the quotient. f11 holds
146// the quotient approximation (q). f12 and f13 hold error terms (e).
147
148 .text
149 .align 16
150 .global __moddi3
151 .proc __moddi3
152__moddi3:
153 .regstk 2,0,0,0
154 // Transfer inputs to FP registers.
155 setf.sig f8 = in0
156 setf.sig f9 = in1
157 ;;
158 // Convert the inputs to FP, so that they won't be treated as unsigned.
159 fcvt.xf f8 = f8
160 fcvt.xf f9 = f9
161 ;;
162 // Compute the reciprocal approximation.
163 frcpa f10, p6 = f8, f9 // y0 ~ 1/b, the iterations below run only if p6 is set
164 ;;
165 // 3 Newton-Raphson iterations.
166(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0 (farg0 is assumed to be the f8 alias)
167(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0 (farg1 is assumed to be the f9 alias)
168 ;;
169(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
170(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
171(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
172 ;;
173(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
174(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
175(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
176 ;;
177(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
178(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
179 ;;
180(p6) fnma.s1 f12 = f9, f11, f8 // r = a - b * q3 (kept in f12 so f8 still holds a)
181 ;;
182(p6) fma f10 = f12, f10, f11 // q = q3 + r * y3
183 ;;
184 // Round quotient to an integer.
185 fcvt.fx.trunc f10 = f10
186 ;;
187 // Renormalize: convert the integer quotient back to FP.
188 fcvt.xf f10 = f10
189 ;;
190 // Compute remainder: a - q * b.
191 fnma f8 = f10, f9, f8
192 ;;
193 // Round remainder to an integer.
194 fcvt.fx.trunc f8 = f8
195 ;;
196 // Transfer result to GP registers.
197 getf.sig ret0 = f8
198 br.ret.sptk rp
199 ;;
200 .endp __moddi3
201#endif
202
203#ifdef L__udivdi3
204// Compute a 64-bit unsigned integer quotient.
205//
206// Use reciprocal approximation and Newton-Raphson iteration to compute the
207// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
208// to get more than the 64 bits of precision that we need for DImode.
209//
210// Must use max precision for the reciprocal computations to get 64 bits of
211// precision.
212//
213// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
214// f10 holds the reciprocal approximation (y). f11 holds the quotient
215// approximation (q). f12 and f13 hold error terms (e).
216
217 .text
218 .align 16
219 .global __udivdi3
220 .proc __udivdi3
221__udivdi3:
222 .regstk 2,0,0,0
223 // Transfer inputs to FP registers.
224 setf.sig f8 = in0
225 setf.sig f9 = in1
226 ;;
227 // Convert the inputs to FP, to avoid FP software-assist faults.
228 fcvt.xuf f8 = f8
229 fcvt.xuf f9 = f9
230 ;;
231 // Compute the reciprocal approximation.
232 frcpa f10, p6 = f8, f9 // y0 ~ 1/b, the iterations below run only if p6 is set
233 ;;
234 // 3 Newton-Raphson iterations.
235(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0 (farg0 is assumed to be the f8 alias)
236(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0 (farg1 is assumed to be the f9 alias)
237 ;;
238(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
239(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
240(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
241 ;;
242(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
243(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
244(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
245 ;;
246(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
247(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
248 ;;
249(p6) fnma.s1 f8 = f9, f11, f8 // r = a - b * q3 (the dividend in f8 is overwritten)
250 ;;
251(p6) fma f10 = f8, f10, f11 // q = q3 + r * y3
252 ;;
253 // Round quotient to an unsigned integer.
254 fcvt.fxu.trunc f8 = f10
255 ;;
256 // Transfer result to GP registers.
257 getf.sig ret0 = f8
258 br.ret.sptk rp
259 ;;
260 .endp __udivdi3
261#endif
262
263#ifdef L__umoddi3
264// Compute a 64-bit unsigned integer modulus.
265//
266// Use reciprocal approximation and Newton-Raphson iteration to compute the
267// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
268// to get more than the 64 bits of precision that we need for DImode.
269//
270// Must use max precision for the reciprocal computations to get 64 bits of
271// precision.
272//
273// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
274// f10 holds the reciprocal approximation (y), later the quotient. f11 holds
275// the quotient approximation (q). f12 and f13 hold error terms (e).
276
277 .text
278 .align 16
279 .global __umoddi3
280 .proc __umoddi3
281__umoddi3:
282 .regstk 2,0,0,0
283 // Transfer inputs to FP registers.
284 setf.sig f8 = in0
285 setf.sig f9 = in1
286 ;;
287 // Convert the inputs to FP, to avoid FP software assist faults.
288 fcvt.xuf f8 = f8
289 fcvt.xuf f9 = f9
290 ;;
291 // Compute the reciprocal approximation.
292 frcpa f10, p6 = f8, f9 // y0 ~ 1/b, the iterations below run only if p6 is set
293 ;;
294 // 3 Newton-Raphson iterations.
295(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0 (farg0 is assumed to be the f8 alias)
296(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0 (farg1 is assumed to be the f9 alias)
297 ;;
298(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
299(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
300(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
301 ;;
302(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
303(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
304(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
305 ;;
306(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
307(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
308 ;;
309(p6) fnma.s1 f12 = f9, f11, f8 // r = a - b * q3 (kept in f12 so f8 still holds a)
310 ;;
311(p6) fma f10 = f12, f10, f11 // q = q3 + r * y3
312 ;;
313 // Round quotient to an unsigned integer.
314 fcvt.fxu.trunc f10 = f10
315 ;;
316 // Renormalize: convert the integer quotient back to FP.
317 fcvt.xuf f10 = f10
318 ;;
319 // Compute remainder: a - q * b.
320 fnma f8 = f10, f9, f8
321 ;;
322 // Round remainder to an unsigned integer.
323 fcvt.fxu.trunc f8 = f8
324 ;;
325 // Transfer result to GP registers.
326 getf.sig ret0 = f8
327 br.ret.sptk rp
328 ;;
329 .endp __umoddi3
330#endif
331
332#ifdef L__divsi3
333// Compute a 32-bit signed integer quotient.
334//
335// Use reciprocal approximation and Newton-Raphson iteration to compute the
336// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
337// to get more than the 32 bits of precision that we need for SImode.
338//
339// ??? This is currently not used. It needs to be fixed to be more like the
340// above DImode routines.
341//
342// ??? Check to see if the error is less than .5ulp. We may need
343// some adjustment code to get precise enough results.
344//
345// ??? Should probably use max precision for the reciprocal computations.
346//
347// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
348// f10 holds the value 2.0. f11 holds the reciprocal approximation (y).
349// f12 is a temporary (t).
350
351 .text
352 .align 16
353 .global __divsi3
354 .proc __divsi3
355__divsi3:
356 .regstk 2,0,0,0
357 setf.sig f8 = in0 // move the integer inputs into FP registers
358 setf.sig f9 = in1
359 ;;
360 fcvt.xf f8 = f8 // signed integer -> FP
361 fcvt.xf f9 = f9
362 ;;
363 frcpa f11, p6 = f8, f9 // y0 ~ 1/b (p6 is not consulted by the code below)
364 fadd f10 = f1, f1 // f10 = 2.0
365 ;;
366 fnma f12 = f9, f11, f10 // t = 2 - b * y0
367 ;;
368 fmpy f11 = f11, f12 // y1 = y0 * t
369 ;;
370 fnma f12 = f9, f11, f10 // t = 2 - b * y1
371 ;;
372 fmpy f11 = f11, f12 // y2 = y1 * t
373 ;;
374 fmpy f8 = f8, f11 // q = a * y2
375 ;;
376 fcvt.fx.trunc f8 = f8 // truncate q to a signed integer
377 ;;
378 getf.sig ret0 = f8 // return the quotient in ret0
379 br.ret.sptk rp
380 ;;
381 .endp __divsi3
382#endif
383
384#ifdef L__modsi3
385// Compute a 32-bit signed integer modulus.
386//
387// Use reciprocal approximation and Newton-Raphson iteration to compute the
388// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
389// to get more than the 32 bits of precision that we need for SImode.
390//
391// ??? This is currently not used. It needs to be fixed to be more like the
392// above DImode routines.
393//
394// ??? Check to see if the error is less than .5ulp. We may need
395// some adjustment code to get precise enough results.
396//
397// ??? Should probably use max precision for the reciprocal computations.
398//
399// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
400// f10 holds the value 2.0, later the quotient. f11 holds the reciprocal
401// approximation (y). f12 is a temporary (t). The result is returned in ret0.
402
403 .text
404 .align 16
405 .global __modsi3
406 .proc __modsi3
407__modsi3:
408 .regstk 2,0,0,0
409 setf.sig f8 = r32 // move the integer inputs into FP registers
410 setf.sig f9 = r33
411 ;;
412 fcvt.xf f8 = f8 // signed integer -> FP
413 fcvt.xf f9 = f9
414 ;;
415 frcpa f11, p6 = f8, f9 // y0 ~ 1/b (p6 is not consulted by the code below)
416 fadd f10 = f1, f1 // f10 = 2.0
417 ;;
418 fnma f12 = f9, f11, f10 // t = 2 - b * y0
419 ;;
420 fmpy f11 = f11, f12 // y1 = y0 * t
421 ;;
422 fnma f12 = f9, f11, f10 // t = 2 - b * y1
423 ;;
424 fmpy f11 = f11, f12 // y2 = y1 * t
425 ;;
426 fmpy f10 = f8, f11 // q = a * y2 (f10 no longer holds 2.0)
427 ;;
428 fcvt.fx.trunc f10 = f10 // truncate q to a signed integer
429 ;;
430 fcvt.xf f10 = f10 // convert the integer quotient back to FP
431 ;;
432 fnma f8 = f10, f9, f8 // remainder = a - q * b
433 ;;
434 fcvt.fx f8 = f8 // remainder -> signed integer
435 ;;
436 getf.sig ret0 = f8 // return in ret0 like the other routines, not in input r32
437 br.ret.sptk rp
438 ;;
439 .endp __modsi3
440#endif
441
442#ifdef L__udivsi3
443// Compute a 32-bit unsigned integer quotient.
444//
445// Use reciprocal approximation and Newton-Raphson iteration to compute the
446// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
447// to get more than the 32 bits of precision that we need for SImode.
448//
449// ??? This is currently not used. It needs to be fixed to be more like the
450// above DImode routines.
451//
452// ??? Check to see if the error is less than .5ulp. We may need
453// some adjustment code to get precise enough results.
454//
455// ??? Should probably use max precision for the reciprocal computations.
456//
457// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
458// f10 holds the value 2.0. f11 holds the reciprocal approximation (y).
459// f12 is a temporary (t).
460//
461// This is the same as divsi3, except that we don't need fcvt instructions
462// before the frcpa.
463
464 .text
465 .align 16
466 .global __udivsi3
467 .proc __udivsi3
468__udivsi3:
469 .regstk 2,0,0,0
470 setf.sig f8 = r32 // move the integer inputs into FP registers
471 setf.sig f9 = r33
472 ;;
473 frcpa f11, p6 = f8, f9 // y0 ~ 1/b (p6 is not consulted by the code below)
474 fadd f10 = f1, f1 // f10 = 2.0
475 ;;
476 fnma f12 = f9, f11, f10 // t = 2 - b * y0
477 ;;
478 fmpy f11 = f11, f12 // y1 = y0 * t
479 ;;
480 fnma f12 = f9, f11, f10 // t = 2 - b * y1
481 ;;
482 fmpy f11 = f11, f12 // y2 = y1 * t
483 ;;
484 fmpy f8 = f8, f11 // q = a * y2
485 ;;
486 fcvt.fxu.trunc f8 = f8 // truncate q to an unsigned integer
487 ;;
488 getf.sig ret0 = f8 // return the quotient in ret0
489 br.ret.sptk rp
490 ;;
491 .endp __udivsi3
492#endif
493
494#ifdef L__umodsi3
495// Compute a 32-bit unsigned integer modulus.
496//
497// Use reciprocal approximation and Newton-Raphson iteration to compute the
498// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
499// to get more than the 32 bits of precision that we need for SImode.
500//
501// ??? This is currently not used. It needs to be fixed to be more like the
502// above DImode routines.
503//
504// ??? Check to see if the error is less than .5ulp. We may need
505// some adjustment code to get precise enough results.
506//
507// ??? Should probably use max precision for the reciprocal computations.
508//
509// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
510// f10 holds the value 2.0, later the quotient. f11 holds the reciprocal
511// approximation (y). f12 is a temporary (t). The result is returned in ret0.
512//
513// This is the same as modsi3, except that we don't need fcvt instructions
514// before the frcpa.
515
516 .text
517 .align 16
518 .global __umodsi3
519 .proc __umodsi3
520__umodsi3:
521 .regstk 2,0,0,0
522 setf.sig f8 = r32 // move the integer inputs into FP registers
523 setf.sig f9 = r33
524 ;;
525 frcpa f11, p6 = f8, f9 // y0 ~ 1/b (p6 is not consulted by the code below)
526 fadd f10 = f1, f1 // f10 = 2.0
527 ;;
528 fnma f12 = f9, f11, f10 // t = 2 - b * y0
529 ;;
530 fmpy f11 = f11, f12 // y1 = y0 * t
531 ;;
532 fnma f12 = f9, f11, f10 // t = 2 - b * y1
533 ;;
534 fmpy f11 = f11, f12 // y2 = y1 * t
535 ;;
536 fmpy f10 = f8, f11 // q = a * y2 (f10 no longer holds 2.0)
537 ;;
538 fcvt.fxu.trunc f10 = f10 // truncate q to an unsigned integer
539 ;;
540 fcvt.xuf f10 = f10 // convert the integer quotient back to FP
541 ;;
542 fnma f8 = f10, f9, f8 // remainder = a - q * b
543 ;;
544 fcvt.fxu f8 = f8 // remainder -> unsigned integer
545 ;;
546 getf.sig ret0 = f8 // return in ret0 like the other routines, not in input r32
547 br.ret.sptk rp
548 ;;
549 .endp __umodsi3
550#endif
551
552#ifdef L__save_stack_nonlocal
553// Notes on save/restore stack nonlocal: We read ar.bsp but write
554// ar.bspstore. This is because ar.bsp can be read at all times
555// (independent of the RSE mode) but since it's read-only we need to
556// restore the value via ar.bspstore. This is OK because
557// ar.bsp==ar.bspstore after executing "flushrs".
558
559// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
560
561 .text
562 .align 16
563 .global __ia64_save_stack_nonlocal
564 .proc __ia64_save_stack_nonlocal
565__ia64_save_stack_nonlocal:
566 alloc r18=ar.pfs,2,0,0,0 // r18 = ar.pfs, two inputs: in0 = save_area, in1 = stack_pointer
567 st8 [in0]=in1,8 // store stack_pointer at save_area + 0, then in0 = save_area + 8
568 mov r19=ar.rsc // r19 = current ar.rsc value
569 ;;
570 flushrs // per the notes above, ar.bsp == ar.bspstore after this
571 and r19=0x1c,r19 // keep only bits 2..4 of the ar.rsc value (presumably clears the RSE mode bits - TODO confirm)
572 mov ar.pfs=r18
573 ;;
574 mov ar.rsc=r19 // install the masked ar.rsc value
575 mov r16=ar.bsp // r16 = ar.bsp
576 adds r2=16,in0 // r2 = save_area + 24 (in0 is save_area + 8 here)
577 ;;
578 mov r17=ar.rnat // r17 = ar.rnat
579 st8 [in0]=r16,8 // store ar.bsp at save_area + 8, then in0 = save_area + 16
580 or r19=0x3,r19 // set both low bits of the ar.rsc value
581 ;;
582 st8 [in0]=r17 // store ar.rnat at save_area + 16
583 mov ar.rsc=r19 // install the ar.rsc value with the low bits set
584 st8 [r2]=r18 // store the saved ar.pfs at save_area + 24
585 mov ar.pfs=r18 // NOTE(review): ar.pfs was already written with r18 above - confirm whether both writes are needed
586 br.ret.sptk.few rp
587 ;;
588 .endp __ia64_save_stack_nonlocal
589#endif
590
591#ifdef L__nonlocal_goto
592// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
593// void *static_chain);
594
595 .text
596 .align 16
597 .global __ia64_nonlocal_goto
598 .proc __ia64_nonlocal_goto
599__ia64_nonlocal_goto:
600 alloc r20=ar.pfs,4,0,0,0 // four inputs, in0..in3 (used below as r32..r35)
601 mov r19=ar.rsc // r19 = current ar.rsc value
602 adds r2=8,in2 // r2 = save_area + 8
603 ld8 r12=[in2],16 // r12 = word at save_area + 0, then in2 = save_area + 16
604 mov.ret.sptk.few.dc.dc rp = r33, .L0 // rp = target_label, with a hint naming the return at .L0
605// ??? flushrs must be first instruction of a group. Gas is unfortunately
606// putting the stop bit before the padding nop instead of after it, making
607// flushrs the first instruction of its bundle, but the second instruction
608// of its group. We explicitly add the nop to avoid this problem.
609 nop.i 0
610 ;;
611 flushrs
612 ld8 r16=[r2],16 // r16 = word at save_area + 8, then r2 = save_area + 24
613 and r19=0x1c,r19 // keep only bits 2..4 of the ar.rsc value
614 ld8 r17=[in2] // r17 = word at save_area + 16
615 ;;
616 ld8 r18=[r2] // r18 = word at save_area + 24
617 mov ar.rsc=r19 // install the masked ar.rsc value before touching ar.bspstore
618 ;;
619 mov ar.bspstore=r16 // restore the backing store pointer from save_area + 8
620 ;;
621 mov ar.rnat=r17 // restore ar.rnat from save_area + 16
622 mov ar.pfs=r18 // restore ar.pfs from save_area + 24
623 or r19=0x3,r19 // set both low bits of the ar.rsc value
624 ;;
625 loadrs
626 invala
627 mov r7=r32 // r7 = fp argument
628.L0: {
629 mov ar.rsc=r19 // install the ar.rsc value with the low bits set
630 mov r15=r35 // r15 = static_chain argument
631 br.ret.sptk.few rp // branch to target_label through rp
632 }
633 ;;
634 .endp __ia64_nonlocal_goto
635#endif