[thirdparty/gcc.git] / gcc / config / ia64 / lib1funcs.asm

#ifdef L__divdf3
// __divdf3 -- 64-bit IEEE double quotient.
//
// The instruction sequence is the minimum latency alternative from the
// Intel IA-64 Optimization Guide.
//
// Register use:
//   f8   dividend a on entry (farg0); reused for the remainder; holds the
//        result on return (fret0)
//   f9   divisor b (farg1)
//   f10  reciprocal estimate y; receives the final quotient
//   f11  quotient estimate q
//   f12  error term e, later e^4
//   f13  e^2
//   p6   written by frcpa.  Every refinement step is predicated on it, so
//        when it is clear the value frcpa left in f10 is returned as is.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	frcpa f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f11 = f8, f10		// q = a * y
(p6)	fnma.s1 f12 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e * q
(p6)	fmpy.s1 f13 = f12, f12		// e^2
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e * y
	;;
(p6)	fma.s1 f11 = f13, f11, f11	// q = q + e^2 * q
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// y = y + e^2 * y
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11	// q = q + e^4 * q
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e^4 * y
	;;
(p6)	fnma.d.s1 f8 = f9, f11, f8	// r = a - b * q
	;;
(p6)	fma.d f10 = f8, f10, f11	// quotient = q + r * y
	;;
	mov f8 = f10
	br.ret.sptk b0
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// __divsf3 -- 32-bit IEEE float quotient.
//
// The instruction sequence is the minimum latency alternative from the
// Intel IA-64 Optimization Guide.
//
// Register use:
//   f8   dividend a on entry (farg0), then the quotient estimate q;
//        holds the result on return (fret0)
//   f9   divisor b on entry (farg1), then the error term e and its powers
//   f10  reciprocal estimate y; receives the final quotient
//   p6   written by frcpa.  Every refinement step is predicated on it, so
//        when it is clear the value frcpa left in f10 is returned as is.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	frcpa f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10		// q = a * y
(p6)	fnma.s1 f9 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// q = q + e * q
(p6)	fmpy.s1 f9 = f9, f9		// e <- e^2
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// q = q + e^2 * q
(p6)	fmpy.s1 f9 = f9, f9		// e <- e^4
	;;
(p6)	fma.d.s1 f8 = f9, f8, f8	// q = q + e^4 * q
	;;
(p6)	fnorm.s f10 = f8		// q * 1.0 + 0.0 with the .s completer
	;;
	mov f8 = f10
	br.ret.sptk b0
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// __divdi3 -- 64-bit signed integer quotient.
//
// Both operands are converted to floating point, a quotient is built from
// the frcpa reciprocal approximation and refined, and the result is
// truncated back to an integer.  frcpa gives 8.6 significant bits, so we
// need 3 iterations to get more than the 64 bits of precision that we need
// for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// Register use:
//   r32/f8  dividend a; f8 is later reused for the remainder and the
//           integer result
//   r33/f9  divisor b
//   f10     reciprocal estimate y, then the corrected quotient
//   f11     quotient estimate q
//   f12     error term e, later e^4
//   f13     e^2
//   p6      written by frcpa; guards every refinement step

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Signed conversion to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Refinement steps.
(p6)	fmpy.s1 f11 = f8, f10		// q = a * y
(p6)	fnma.s1 f12 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e * q
(p6)	fmpy.s1 f13 = f12, f12		// e^2
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e * y
	;;
(p6)	fma.s1 f11 = f13, f11, f11	// q = q + e^2 * q
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// y = y + e^2 * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e^4 * q
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e^4 * y
	;;
(p6)	fnma.s1 f8 = f9, f11, f8	// r = a - b * q
	;;
(p6)	fma.s1 f10 = f8, f10, f11	// quotient = q + r * y
	;;
	// Truncate the quotient to an integer.
	fcvt.fx.trunc.s1 f8 = f10
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// __moddi3 -- 64-bit signed integer modulus.
//
// The quotient is computed as in the DImode divide (frcpa reciprocal
// approximation plus refinement), truncated to an integer, and the
// remainder is then formed as dividend - quotient * divisor.  frcpa gives
// 8.6 significant bits, so we need 3 iterations to get more than the 64
// bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// Register use:
//   r32/f8  dividend a; f8 finally holds the remainder
//   r33/f9  divisor b
//   f10     reciprocal estimate y, then the quotient
//   f11     quotient estimate q
//   f12     error term e, later e^4, then the correction remainder r
//   f13     e^2
//   p6      written by frcpa; guards every refinement step

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Signed conversion to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Refinement steps.
(p6)	fmpy.s1 f11 = f8, f10		// q = a * y
(p6)	fnma.s1 f12 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e * q
(p6)	fmpy.s1 f13 = f12, f12		// e^2
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e * y
	;;
(p6)	fma.s1 f11 = f13, f11, f11	// q = q + e^2 * q
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// y = y + e^2 * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e^4 * q
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e^4 * y
	;;
(p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b * q
	;;
(p6)	fma.s1 f10 = f12, f10, f11	// quotient = q + r * y
	;;
	// Truncate the quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Convert the integer quotient back to FP form.
	fcvt.xf f10 = f10
	;;
	// remainder = a - quotient * b
	fnma.s1 f8 = f10, f9, f8
	;;
	// Convert the remainder to an integer.
	fcvt.fx.trunc.s1 f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// __udivdi3 -- 64-bit unsigned integer quotient.
//
// Both operands are converted to floating point, a quotient is built from
// the frcpa reciprocal approximation and refined, and the result is
// truncated back to an unsigned integer.  frcpa gives 8.6 significant
// bits, so we need 3 iterations to get more than the 64 bits of precision
// that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// Register use:
//   r32/f8  dividend a; f8 is later reused for the remainder and the
//           integer result
//   r33/f9  divisor b
//   f10     reciprocal estimate y, then the corrected quotient
//   f11     quotient estimate q
//   f12     error term e, later e^4
//   f13     e^2
//   p6      written by frcpa; guards every refinement step

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Unsigned conversion to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Refinement steps.
(p6)	fmpy.s1 f11 = f8, f10		// q = a * y
(p6)	fnma.s1 f12 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e * q
(p6)	fmpy.s1 f13 = f12, f12		// e^2
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e * y
	;;
(p6)	fma.s1 f11 = f13, f11, f11	// q = q + e^2 * q
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// y = y + e^2 * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e^4 * q
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e^4 * y
	;;
(p6)	fnma.s1 f8 = f9, f11, f8	// r = a - b * q
	;;
(p6)	fma.s1 f10 = f8, f10, f11	// quotient = q + r * y
	;;
	// Truncate the quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f8 = f10
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// __umoddi3 -- 64-bit unsigned integer modulus.
//
// The quotient is computed as in the unsigned DImode divide (frcpa
// reciprocal approximation plus refinement), truncated to an unsigned
// integer, and the remainder is then formed as
// dividend - quotient * divisor.  frcpa gives 8.6 significant bits, so we
// need 3 iterations to get more than the 64 bits of precision that we need
// for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// Register use:
//   r32/f8  dividend a; f8 finally holds the remainder
//   r33/f9  divisor b
//   f10     reciprocal estimate y, then the quotient
//   f11     quotient estimate q
//   f12     error term e, later e^4, then the correction remainder r
//   f13     e^2
//   p6      written by frcpa; guards every refinement step

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Unsigned conversion to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Refinement steps.
(p6)	fmpy.s1 f11 = f8, f10		// q = a * y
(p6)	fnma.s1 f12 = f9, f10, f1	// e = 1 - b * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e * q
(p6)	fmpy.s1 f13 = f12, f12		// e^2
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e * y
	;;
(p6)	fma.s1 f11 = f13, f11, f11	// q = q + e^2 * q
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// y = y + e^2 * y
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// q = q + e^4 * q
(p6)	fma.s1 f10 = f12, f10, f10	// y = y + e^4 * y
	;;
(p6)	fnma.s1 f12 = f9, f11, f8	// r = a - b * q
	;;
(p6)	fma.s1 f10 = f12, f10, f11	// quotient = q + r * y
	;;
	// Truncate the quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Convert the integer quotient back to FP form.
	fcvt.xuf.s1 f10 = f10
	;;
	// remainder = a - quotient * b
	fnma.s1 f8 = f10, f9, f8
	;;
	// Convert the remainder to an unsigned integer.
	fcvt.fxu.trunc.s1 f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// __divsi3 -- 32-bit signed integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// Register use:
//   r32/f8  dividend a; f8 finally holds the quotient
//   r33/f9  divisor b
//   f10     the constant 2.0
//   f11     reciprocal estimate y
//   f12     temporary t = 2 - b * y
//   p6      written by frcpa but never consulted here

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Signed conversion to FP.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	frcpa f11, p6 = f8, f9		// y = reciprocal approximation
	fadd f10 = f1, f1		// 2.0 = 1.0 + 1.0
	;;
	// First iteration: y = y * (2 - b * y).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration.
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = a * y, truncated to an integer.
	fmpy f8 = f8, f11
	;;
	fcvt.fx.trunc f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// __modsi3 -- 32-bit signed integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
// The remainder is then dividend - trunc(quotient) * divisor.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// NOTE(review): the operands are transferred from in0/in1 as full 64-bit
// register values with no sign extension -- confirm that callers pass them
// already extended.
//
// Register use:
//   in0/f8  dividend a; f8 finally holds the remainder
//   in1/f9  divisor b
//   f10     the constant 2.0 during the iterations, then the quotient
//   f11     reciprocal estimate y
//   f12     temporary t = 2 - b * y
//   p6      written by frcpa but never consulted here
//   ret0    remainder on return

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Signed conversion to FP.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First iteration: y = y * (2 - b * y).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration.
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = a * y, truncated to an integer and converted back to FP.
	fmpy f10 = f8, f11
	;;
	fcvt.fx.trunc f10 = f10
	;;
	fcvt.xf f10 = f10
	;;
	// remainder = a - quotient * b
	fnma f8 = f10, f9, f8
	;;
	fcvt.fx f8 = f8
	;;
	// Return the remainder in ret0.  The result used to be written to r32,
	// which is the first input register, so the caller never saw it.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// __udivsi3 -- 32-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// This is the same as divsi3, except that we don't need fcvt instructions
// before the frcpa.
//
// Register use:
//   in0/f8  dividend a; f8 finally holds the quotient
//   in1/f9  divisor b
//   f10     the constant 2.0
//   f11     reciprocal estimate y
//   f12     temporary t = 2 - b * y
//   p6      written by frcpa but never consulted here

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	frcpa f11, p6 = f8, f9		// y = reciprocal approximation
	fadd f10 = f1, f1		// 2.0 = 1.0 + 1.0
	;;
	// First iteration: y = y * (2 - b * y).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration.
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = a * y, truncated to an unsigned integer.
	fmpy f8 = f8, f11
	;;
	fcvt.fxu.trunc f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk b0
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// __umodsi3 -- 32-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
// The remainder is then dividend - trunc(quotient) * divisor.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// This is the same as modsi3, except that we don't need fcvt instructions
// before the frcpa.
//
// NOTE(review): the operands are transferred from in0/in1 as full 64-bit
// register values with no zero extension -- confirm that callers pass them
// already extended.
//
// Register use:
//   in0/f8  dividend a; f8 finally holds the remainder
//   in1/f9  divisor b
//   f10     the constant 2.0 during the iterations, then the quotient
//   f11     reciprocal estimate y
//   f12     temporary t = 2 - b * y
//   p6      written by frcpa but never consulted here
//   ret0    remainder on return

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First iteration: y = y * (2 - b * y).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration.
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// quotient = a * y, truncated to an unsigned integer and converted
	// back to FP.
	fmpy f10 = f8, f11
	;;
	fcvt.fxu.trunc f10 = f10
	;;
	fcvt.xuf f10 = f10
	;;
	// remainder = a - quotient * b
	fnma f8 = f10, f9, f8
	;;
	fcvt.fxu f8 = f8
	;;
	// Return the remainder in ret0.  The result used to be written to r32,
	// which is the first input register, so the caller never saw it.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Fills the four 8-byte slots of save_area (in0):
//   save_area +  0   stack_pointer (in1)
//   save_area +  8   ar.bsp
//   save_area + 16   ar.rnat
//   save_area + 24   ar.pfs as returned by alloc
//
// Scratch registers written: r2, r16, r17, r18, r19.  in0 is modified.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  // r18 = ar.pfs, r19 = current ar.rsc.
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  // Slot 0 = stack_pointer; in0 now points at slot 3.
	  st8 [in0] = in1, 24
	  // Keep only bits 2-4 of the RSC value; the low two bits become zero.
	  // NOTE(review): presumably the low two bits are the RSE mode field --
	  // confirm against the architecture manual.
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // Slot 3 = ar.pfs; in0 now points at slot 1.
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  // Value written back to ar.rsc on exit: the low two bits are set
	  // regardless of what they were on entry.
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  // r2 points at slot 2.
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  // Slot 1 = ar.bsp, slot 2 = ar.rnat.
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Reloads state from the four 8-byte slots of save_area (in1) and
// "returns" to target_label (in0):
//   save_area +  0   -> r12
//   save_area +  8   -> ar.bspstore
//   save_area + 16   -> ar.rnat
//   save_area + 24   -> ar.pfs
// static_chain (in2) is copied to r15.
//
// Scratch registers written: r16, r17, r18, r19, r20.  in1 is modified.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  // r12 = slot 0.
	  ld8 r12 = [in1], 8
	  // The final br.ret goes to target_label; .L0 marks that branch's bundle.
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  // r16 = slot 1 (value for ar.bspstore); r19 = current ar.rsc.
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  // r17 = slot 2 (value for ar.rnat).
	  ld8 r17 = [in1], 8
	  // Keep only bits 2-4 of the RSC value; the low two bits become zero.
	  // NOTE(review): presumably the low two bits are the RSE mode field --
	  // confirm against the architecture manual.
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // r18 = slot 3 (value for ar.pfs).
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  // Value written back to ar.rsc at .L0: the low two bits are set
	  // regardless of what they were on entry.
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  // Pass the static chain in r15.
	  mov r15 = in2
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reloads state from the four 8-byte slots of save_area (in0) and returns
// through rp:
//   save_area +  0   -> r12
//   save_area +  8   -> ar.bspstore
//   save_area + 16   -> ar.rnat
//   save_area + 24   -> ar.pfs
//
// Scratch registers written: r16, r17, r18, r19, r20.  in0 is modified.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  // NOTE(review): four input registers are requested although the
	  // prototype has a single argument -- confirm whether this is intended.
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  // r12 = slot 0.
	  ld8 r12 = [in0], 8
	  ;;
	}
	{ .mmb
	  // r16 = slot 1 (value for ar.bspstore); r19 = current ar.rsc.
	  ld8 r16=[in0], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  // r17 = slot 2 (value for ar.rnat).
	  ld8 r17 = [in0], 8
	  // Keep only bits 2-4 of the RSC value; the low two bits become zero.
	  // NOTE(review): presumably the low two bits are the RSE mode field --
	  // confirm against the architecture manual.
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  // r18 = slot 3 (value for ar.pfs).
	  ld8 r18 = [in0]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  // Value written back to ar.rsc at .L0: the low two bits are set
	  // regardless of what they were on entry.
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
	// .L0 is not referenced anywhere in this routine.
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ --.
//	TRAMP:	| __ia64_trampoline |   |
//		+-------------------+   |  fake function descriptor
//		| TRAMP+16          |   |
//		+-------------------+ --'
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 is expected to hold the second word of the fake descriptor,
// i.e. TRAMP+16, the address of the "target descriptor" slot.  On exit
// r15 holds the static link, r1 holds the second word of the target
// descriptor, and control has branched to the address in its first word.
// Scratch registers written: r2, r3, b6.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  // r2 = address of the target descriptor; r1 advances to the
	  // static link slot.
	  ld8 r2 = [r1], 8
	  ;;
	  // r15 = static link.
	  ld8 r15 = [r1]
	}
	{ .mmi
	  // r3 = first word of the target descriptor (branch address).
	  ld8 r3 = [r2], 8
	  ;;
	  // r1 = second word of the target descriptor.
	  ld8 r1 = [r2]
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif
Commit	Line	Data
c65ebc55 JW	1	#ifdef L__divdf3
	2	// Compute a 64-bit IEEE double quotient.
	3	//
	4	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	5	// alternative.
	6	//
	7	// farg0 holds the dividend. farg1 holds the divisor.
	8
	9	.text
	10	.align 16
	11	.global __divdf3
	12	.proc __divdf3
	13	__divdf3:
	14	frcpa f10, p6 = farg0, farg1
	15	;;
	16	(p6) fma.s1 f11 = farg0, f10, f0
	17	(p6) fnma.s1 f12 = farg1, f10, f1
	18	;;
	19	(p6) fma.s1 f11 = f12, f11, f11
	20	(p6) fma.s1 f13 = f12, f12, f0
	21	(p6) fma.s1 f10 = f12, f10, f10
	22	;;
	23	(p6) fma.s1 f11 = f13, f11, f11
	24	(p6) fma.s1 f12 = f13, f13, f0
	25	(p6) fma.s1 f10 = f13, f10, f10
	26	;;
	27	(p6) fma.d.s1 f11 = f12, f11, f11
	28	(p6) fma.s1 f10 = f12, f10, f10
	29	;;
	30	(p6) fnma.d.s1 f8 = farg1, f11, farg0
	31	;;
	32	(p6) fma.d f10 = f8, f10, f11
	33	;;
	34	mov fret0 = f10
	35	br.ret.sptk rp
	36	;;
	37	.endp __divdf3
	38	#endif
	39
	40	#ifdef L__divsf3
	41	// Compute a 32-bit IEEE float quotient.
	42	//
	43	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	44	// alternative.
	45	//
	46	// farg0 holds the dividend. farg1 holds the divisor.
	47
	48	.text
	49	.align 16
	50	.global __divsf3
	51	.proc __divsf3
	52	__divsf3:
	53	frcpa f10, p6 = farg0, farg1
	54	;;
	55	(p6) fma.s1 f8 = farg0, f10, f0
	56	(p6) fnma.s1 f9 = farg1, f10, f1
	57	;;
	58	(p6) fma.s1 f8 = f9, f8, f8
	59	(p6) fma.s1 f9 = f9, f9, f0
	60	;;
	61	(p6) fma.s1 f8 = f9, f8, f8
	62	(p6) fma.s1 f9 = f9, f9, f0
	63	;;
	64	(p6) fma.d.s1 f8 = f9, f8, f8
65	;;
66	(p6) fma.s f10 = f8, f1, f0
67	;;
68	mov fret0 = f10
69	br.ret.sptk rp
70	;;
71	.endp __divsf3
72	#endif
73
74	#ifdef L__divdi3
75	// Compute a 64-bit integer quotient.
76	//
77	// Use reciprocal approximation and Newton-Raphson iteration to compute the
78	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
79	// to get more than the 64 bits of precision that we need for DImode.
80	//
81	// Must use max precision for the reciprocal computations to get 64 bits of
82	// precision.
83	//
84	// r32/f8 holds the dividend. r33/f9 holds the divisor.
85	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
86	// f12 is a temporary.
87
88	.text
89	.align 16
90	.global __divdi3
91	.proc __divdi3
92	__divdi3:
93	.regstk 2,0,0,0
94	// Transfer inputs to FP registers.
95	setf.sig f8 = in0
96	setf.sig f9 = in1
97	;;
98	// Convert the inputs to FP, so that they won't be treated as unsigned.
99	fcvt.xf f8 = f8
100	fcvt.xf f9 = f9
101	;;
102	// Compute the reciprocal approximation.
660a0ebd	103	frcpa.s1 f10, p6 = f8, f9
c65ebc55 JW	104	;;
	105	// 3 Newton-Raphson iterations.
	106	(p6) fma.s1 f11 = farg0, f10, f0
	107	(p6) fnma.s1 f12 = farg1, f10, f1
	108	;;
	109	(p6) fma.s1 f11 = f12, f11, f11
	110	(p6) fma.s1 f13 = f12, f12, f0
	111	(p6) fma.s1 f10 = f12, f10, f10
	112	;;
	113	(p6) fma.s1 f11 = f13, f11, f11
	114	(p6) fma.s1 f12 = f13, f13, f0
	115	(p6) fma.s1 f10 = f13, f10, f10
	116	;;
	117	(p6) fma.s1 f11 = f12, f11, f11
	118	(p6) fma.s1 f10 = f12, f10, f10
	119	;;
	120	(p6) fnma.s1 f8 = f9, f11, f8
	121	;;
660a0ebd	122	(p6) fma.s1 f10 = f8, f10, f11
c65ebc55 JW	123	;;
c65ebc55 JW	124	// Round quotient to an integer.
660a0ebd	125	fcvt.fx.trunc.s1 f8 = f10
c65ebc55 JW	126	;;
	127	// Transfer result to GP registers.
	128	getf.sig ret0 = f8
	129	br.ret.sptk rp
	130	;;
	131	.endp __divdi3
	132	#endif
	133
	134	#ifdef L__moddi3
	135	// Compute a 64-bit integer modulus.
	136	//
	137	// Use reciprocal approximation and Newton-Raphson iteration to compute the
	138	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
	139	// to get more than the 64 bits of precision that we need for DImode.
	140	//
	141	// Must use max precision for the reciprocal computations to get 64 bits of
	142	// precision.
	143	//
	144	// r32/f8 holds the dividend. r33/f9 holds the divisor.
	145	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
	146	// f12 is a temporary.
	147
	148	.text
	149	.align 16
	150	.global __moddi3
	151	.proc __moddi3
	152	__moddi3:
	153	.regstk 2,0,0,0
	154	// Transfer inputs to FP registers.
	155	setf.sig f8 = in0
	156	setf.sig f9 = in1
	157	;;
	158	// Convert the inputs to FP, so that they won't be treated as unsigned.
	159	fcvt.xf f8 = f8
	160	fcvt.xf f9 = f9
	161	;;
	162	// Compute the reciprocal approximation.
660a0ebd	163	frcpa.s1 f10, p6 = f8, f9
c65ebc55 JW	164	;;
	165	// 3 Newton-Raphson iterations.
	166	(p6) fma.s1 f11 = farg0, f10, f0
	167	(p6) fnma.s1 f12 = farg1, f10, f1
	168	;;
	169	(p6) fma.s1 f11 = f12, f11, f11
	170	(p6) fma.s1 f13 = f12, f12, f0
	171	(p6) fma.s1 f10 = f12, f10, f10
	172	;;
	173	(p6) fma.s1 f11 = f13, f11, f11
	174	(p6) fma.s1 f12 = f13, f13, f0
	175	(p6) fma.s1 f10 = f13, f10, f10
	176	;;
	177	(p6) fma.s1 f11 = f12, f11, f11
	178	(p6) fma.s1 f10 = f12, f10, f10
	179	;;
	180	(p6) fnma.s1 f12 = f9, f11, f8
	181	;;
660a0ebd	182	(p6) fma.s1 f10 = f12, f10, f11
c65ebc55 JW	183	;;
c65ebc55 JW	184	// Round quotient to an integer.
660a0ebd	185	fcvt.fx.trunc.s1 f10 = f10
c65ebc55 JW	186	;;
	187	// Renormalize.
	188	fcvt.xf f10 = f10
	189	;;
	190	// Compute remainder.
660a0ebd	191	fnma.s1 f8 = f10, f9, f8
c65ebc55 JW	192	;;
c65ebc55 JW	193	// Round remainder to an integer.
660a0ebd	194	fcvt.fx.trunc.s1 f8 = f8
c65ebc55 JW	195	;;
	196	// Transfer result to GP registers.
	197	getf.sig ret0 = f8
	198	br.ret.sptk rp
	199	;;
	200	.endp __moddi3
	201	#endif
	202
	203	#ifdef L__udivdi3
	204	// Compute a 64-bit unsigned integer quotient.
	205	//
	206	// Use reciprocal approximation and Newton-Raphson iteration to compute the
	207	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
	208	// to get more than the 64 bits of precision that we need for DImode.
	209	//
	210	// Must use max precision for the reciprocal computations to get 64 bits of
	211	// precision.
	212	//
	213	// r32/f8 holds the dividend. r33/f9 holds the divisor.
	214	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
	215	// f12 is a temporary.
	216
	217	.text
	218	.align 16
	219	.global __udivdi3
	220	.proc __udivdi3
	221	__udivdi3:
	222	.regstk 2,0,0,0
	223	// Transfer inputs to FP registers.
	224	setf.sig f8 = in0
	225	setf.sig f9 = in1
	226	;;
	227	// Convert the inputs to FP, to avoid FP software-assist faults.
660a0ebd JW	228	fcvt.xuf.s1 f8 = f8
660a0ebd JW	229	fcvt.xuf.s1 f9 = f9
c65ebc55 JW	230	;;
c65ebc55 JW	231	// Compute the reciprocal approximation.
660a0ebd	232	frcpa.s1 f10, p6 = f8, f9
c65ebc55 JW	233	;;
	234	// 3 Newton-Raphson iterations.
	235	(p6) fma.s1 f11 = farg0, f10, f0
	236	(p6) fnma.s1 f12 = farg1, f10, f1
	237	;;
	238	(p6) fma.s1 f11 = f12, f11, f11
	239	(p6) fma.s1 f13 = f12, f12, f0
	240	(p6) fma.s1 f10 = f12, f10, f10
	241	;;
	242	(p6) fma.s1 f11 = f13, f11, f11
	243	(p6) fma.s1 f12 = f13, f13, f0
	244	(p6) fma.s1 f10 = f13, f10, f10
	245	;;
	246	(p6) fma.s1 f11 = f12, f11, f11
	247	(p6) fma.s1 f10 = f12, f10, f10
	248	;;
	249	(p6) fnma.s1 f8 = f9, f11, f8
	250	;;
660a0ebd	251	(p6) fma.s1 f10 = f8, f10, f11
c65ebc55 JW	252	;;
c65ebc55 JW	253	// Round quotient to an unsigned integer.
660a0ebd	254	fcvt.fxu.trunc.s1 f8 = f10
c65ebc55 JW	255	;;
	256	// Transfer result to GP registers.
	257	getf.sig ret0 = f8
	258	br.ret.sptk rp
	259	;;
	260	.endp __udivdi3
	261	#endif
	262
	263	#ifdef L__umoddi3
	264	// Compute a 64-bit unsigned integer modulus.
	265	//
	266	// Use reciprocal approximation and Newton-Raphson iteration to compute the
	267	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
	268	// to get more than the 64 bits of precision that we need for DImode.
	269	//
	270	// Must use max precision for the reciprocal computations to get 64 bits of
	271	// precision.
	272	//
	273	// r32/f8 holds the dividend. r33/f9 holds the divisor.
	274	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
	275	// f12 is a temporary.
	276
	277	.text
	278	.align 16
	279	.global __umoddi3
	280	.proc __umoddi3
	281	__umoddi3:
	282	.regstk 2,0,0,0
	283	// Transfer inputs to FP registers.
	284	setf.sig f8 = in0
	285	setf.sig f9 = in1
	286	;;
	287	// Convert the inputs to FP, to avoid FP software assist faults.
660a0ebd JW	288	fcvt.xuf.s1 f8 = f8
660a0ebd JW	289	fcvt.xuf.s1 f9 = f9
c65ebc55 JW	290	;;
c65ebc55 JW	291	// Compute the reciprocal approximation.
660a0ebd	292	frcpa.s1 f10, p6 = f8, f9
c65ebc55 JW	293	;;
	294	// 3 Newton-Raphson iterations.
	295	(p6) fma.s1 f11 = farg0, f10, f0
	296	(p6) fnma.s1 f12 = farg1, f10, f1
	297	;;
	298	(p6) fma.s1 f11 = f12, f11, f11
	299	(p6) fma.s1 f13 = f12, f12, f0
	300	(p6) fma.s1 f10 = f12, f10, f10
	301	;;
	302	(p6) fma.s1 f11 = f13, f11, f11
	303	(p6) fma.s1 f12 = f13, f13, f0
	304	(p6) fma.s1 f10 = f13, f10, f10
	305	;;
	306	(p6) fma.s1 f11 = f12, f11, f11
	307	(p6) fma.s1 f10 = f12, f10, f10
	308	;;
	309	(p6) fnma.s1 f12 = f9, f11, f8
	310	;;
660a0ebd	311	(p6) fma.s1 f10 = f12, f10, f11
c65ebc55 JW	312	;;
c65ebc55 JW	313	// Round quotient to an unsigned integer.
660a0ebd	314	fcvt.fxu.trunc.s1 f10 = f10
c65ebc55 JW	315	;;
c65ebc55 JW	316	// Renormalize.
660a0ebd	317	fcvt.xuf.s1 f10 = f10
c65ebc55 JW	318	;;
c65ebc55 JW	319	// Compute remainder.
660a0ebd	320	fnma.s1 f8 = f10, f9, f8
c65ebc55 JW	321	;;
c65ebc55 JW	322	// Round remainder to an integer.
660a0ebd	323	fcvt.fxu.trunc.s1 f8 = f8
c65ebc55 JW	324	;;
	325	// Transfer result to GP registers.
	326	getf.sig ret0 = f8
	327	br.ret.sptk rp
	328	;;
	329	.endp __umoddi3
	330	#endif
	331
	332	#ifdef L__divsi3
	333	// Compute a 32-bit integer quotient.
	334	//
	335	// Use reciprocal approximation and Newton-Raphson iteration to compute the
	336	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
	337	// to get more than the 32 bits of precision that we need for SImode.
	338	//
	339	// ??? This is currently not used. It needs to be fixed to be more like the
	340	// above DImode routines.
	341	//
	342	// ??? Check to see if the error is less than >.5ulp error. We may need
	343	// some adjustment code to get precise enough results.
	344	//
	345	// ??? Should probably use max precision for the reciprocal computations.
	346	//
	347	// r32/f8 holds the dividend. r33/f9 holds the divisor.
	348	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
	349	// f12 is a temporary.
	350
	351	.text
	352	.align 16
	353	.global __divsi3
	354	.proc __divsi3
	355	__divsi3:
	356	.regstk 2,0,0,0
	357	setf.sig f8 = in0
	358	setf.sig f9 = in1
	359	;;
	360	fcvt.xf f8 = f8
	361	fcvt.xf f9 = f9
	362	;;
	363	frcpa f11, p6 = f8, f9
	364	fadd f10 = f1, f1
	365	;;
	366	fnma f12 = f9, f11, f10
	367	;;
	368	fmpy f11 = f11, f12
	369	;;
	370	fnma f12 = f9, f11, f10
	371	;;
	372	fmpy f11 = f11, f12
	373	;;
	374	fmpy f8 = f8, f11
	375	;;
	376	fcvt.fx.trunc f8 = f8
	377	;;
	378	getf.sig ret0 = f8
	379	br.ret.sptk rp
	380	;;
	381	.endp __divsi3
	382	#endif
	383
	384	#ifdef L__modsi3
	385	// Compute a 32-bit integer modulus.
	386	//
	387	// Use reciprocal approximation and Newton-Raphson iteration to compute the
388	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
389	// to get more than the 32 bits of precision that we need for SImode.
390	//
391	// ??? This is currently not used. It needs to be fixed to be more like the
392	// above DImode routines.
393	//
394	// ??? Check to see if the error is less than >.5ulp error. We may need
395	// some adjustment code to get precise enough results.
396	//
397	// ??? Should probably use max precision for the reciprocal computations.
398	//
399	// r32/f8 holds the dividend. r33/f9 holds the divisor.
400	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
401	// f12 is a temporary.
402
403	.text
404	.align 16
405	.global __modsi3
406	.proc __modsi3
407	__modsi3:
408	.regstk 2,0,0,0
409	setf.sig f8 = r32
410	setf.sig f9 = r33
411	;;
412	fcvt.xf f8 = f8
413	fcvt.xf f9 = f9
414	;;
415	frcpa f11, p6 = f8, f9
416	fadd f10 = f1, f1
417	;;
418	fnma f12 = f9, f11, f10
419	;;
420	fmpy f11 = f11, f12
421	;;
422	fnma f12 = f9, f11, f10
423	;;
424	fmpy f11 = f11, f12
425	;;
426	fmpy f10 = f8, f11
427	;;
428	fcvt.fx.trunc f10 = f10
429	;;
430	fcvt.xf f10 = f10
431	;;
432	fnma f8 = f10, f9, f8
433	;;
434	fcvt.fx f8 = f8
435	;;
436	getf.sig r32 = f8
437	br.ret.sptk rp
438	;;
439	.endp __modsi3
440	#endif
441
442	#ifdef L__udivsi3
443	// Compute a 32-bit unsigned integer quotient.
444	//
445	// Use reciprocal approximation and Newton-Raphson iteration to compute the
446	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
447	// to get more than the 32 bits of precision that we need for SImode.
448	//
449	// ??? This is currently not used. It needs to be fixed to be more like the
450	// above DImode routines.
451	//
452	// ??? Check to see if the error is less than >.5ulp error. We may need
453	// some adjustment code to get precise enough results.
454	//
455	// ??? Should probably use max precision for the reciprocal computations.
456	//
457	// r32/f8 holds the dividend. r33/f9 holds the divisor.
458	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
459	// f12 is a temporary.
460	//
461	// This is the same as divsi3, except that we don't need fcvt instructions
462	// before the frcpa.
463
464	.text
465	.align 16
466	.global __udivsi3
467	.proc __udivsi3
468	__udivsi3:
469	.regstk 2,0,0,0
470	setf.sig f8 = r32
471	setf.sig f9 = r33
472	;;
473	frcpa f11, p6 = f8, f9
474	fadd f10 = f1, f1
475	;;
476	fnma f12 = f9, f11, f10
477	;;
478	fmpy f11 = f11, f12
479	;;
480	fnma f12 = f9, f11, f10
481	;;
482	fmpy f11 = f11, f12
483	;;
484	fmpy f8 = f8, f11
485	;;
486	fcvt.fxu.trunc f8 = f8
487	;;
488	getf.sig ret0 = f8
489	br.ret.sptk rp
490	;;
491	.endp __udivsi3
492	#endif
493
494	#ifdef L__umodsi3
495	// Compute a 32-bit unsigned integer modulus.
496	//
497	// Use reciprocal approximation and Newton-Raphson iteration to compute the
498	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
499	// to get more than the 32 bits of precision that we need for SImode.
500	//
501	// ??? This is currently not used. It needs to be fixed to be more like the
502	// above DImode routines.
503	//
504	// ??? Check to see if the error is less than >.5ulp error. We may need
505	// some adjustment code to get precise enough results.
506	//
507	// ??? Should probably use max precision for the reciprocal computations.
508	//
509	// r32/f8 holds the dividend. r33/f9 holds the divisor.
510	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
511	// f12 is a temporary.
512	//
513	// This is the same as modsi3, except that we don't need fcvt instructions
514	// before the frcpa.
515
516	.text
517	.align 16
518	.global __umodsi3
519	.proc __umodsi3
520	__umodsi3:
521	.regstk 2,0,0,0
522	setf.sig f8 = r32
523	setf.sig f9 = r33
524	;;
525	frcpa f11, p6 = f8, f9
526	fadd f10 = f1, f1
527	;;
528	fnma f12 = f9, f11, f10
529	;;
530	fmpy f11 = f11, f12
531	;;
532	fnma f12 = f9, f11, f10
533	;;
534	fmpy f11 = f11, f12
535	;;
536	fmpy f10 = f8, f11
537	;;
538	fcvt.fxu.trunc f10 = f10
539	;;
540	fcvt.xuf f10 = f10
541	;;
542	fnma f8 = f10, f9, f8
543	;;
544	fcvt.fxu f8 = f8
545	;;
546	getf.sig r32 = f8
547	br.ret.sptk rp
548	;;
549	.endp __umodsi3
550	#endif
551
552	#ifdef L__save_stack_nonlocal
553	// Notes on save/restore stack nonlocal: We read ar.bsp but write
554	// ar.bspstore. This is because ar.bsp can be read at all times
555	// (independent of the RSE mode) but since it's read-only we need to
556	// restore the value via ar.bspstore. This is OK because
557	// ar.bsp==ar.bspstore after executing "flushrs".
558
559	// void __ia64_save_stack_nonlocal(void save_area, void stack_pointer)
560
561	.text
562	.align 16
563	.global __ia64_save_stack_nonlocal
564	.proc __ia64_save_stack_nonlocal
565	__ia64_save_stack_nonlocal:
97e242b0 RH	566	{ .mmf
	567	alloc r18 = ar.pfs, 2, 0, 0, 0
	568	mov r19 = ar.rsc
	569	;;
	570	}
	571	{ .mmi
	572	flushrs
	573	st8 [in0] = in1, 24
	574	and r19 = 0x1c, r19
	575	;;
	576	}
	577	{ .mmi
	578	st8 [in0] = r18, -16
	579	mov ar.rsc = r19
	580	or r19 = 0x3, r19
	581	;;
	582	}
	583	{ .mmi
	584	mov r16 = ar.bsp
	585	mov r17 = ar.rnat
	586	adds r2 = 8, in0
	587	;;
	588	}
	589	{ .mmi
	590	st8 [in0] = r16
	591	st8 [r2] = r17
	592	}
	593	{ .mib
	594	mov ar.rsc = r19
	595	br.ret.sptk.few rp
	596	;;
	597	}
c65ebc55 JW	598	.endp __ia64_save_stack_nonlocal
	599	#endif
	600
	601	#ifdef L__nonlocal_goto
97e242b0	602	// void __ia64_nonlocal_goto(void target_label, void save_area,
c65ebc55 JW	603	// void *static_chain);
	604
	605	.text
	606	.align 16
	607	.global __ia64_nonlocal_goto
	608	.proc __ia64_nonlocal_goto
	609	__ia64_nonlocal_goto:
97e242b0 RH	610	{ .mmi
	611	alloc r20 = ar.pfs, 3, 0, 0, 0
	612	ld8 r12 = [in1], 8
	613	mov.ret.sptk rp = in0, .L0
	614	;;
	615	}
	616	{ .mmf
	617	ld8 r16 = [in1], 8
	618	mov r19 = ar.rsc
	619	;;
	620	}
	621	{ .mmi
	622	flushrs
	623	ld8 r17 = [in1], 8
	624	and r19 = 0x1c, r19
	625	;;
	626	}
	627	{ .mmi
	628	ld8 r18 = [in1]
	629	mov ar.rsc = r19
	630	or r19 = 0x3, r19
	631	;;
	632	}
	633	{ .mmi
	634	mov ar.bspstore = r16
	635	;;
	636	mov ar.rnat = r17
	637	;;
	638	}
	639	{ .mmi
	640	loadrs
	641	invala
	642	mov r15 = in2
	643	;;
	644	}
	645	.L0: { .mib
	646	mov ar.rsc = r19
	647	mov ar.pfs = r18
	648	br.ret.sptk.few rp
	649	;;
c65ebc55	650	}
c65ebc55 JW	651	.endp __ia64_nonlocal_goto
c65ebc55 JW	652	#endif
9525c690 JW	653
	654	#ifdef L__restore_stack_nonlocal
	655	// This is mostly the same as nonlocal_goto above.
	656	// ??? This has not been tested yet.
	657
	658	// void __ia64_restore_stack_nonlocal(void *save_area)
	659
	660	.text
	661	.align 16
	662	.global __ia64_restore_stack_nonlocal
	663	.proc __ia64_restore_stack_nonlocal
	664	__ia64_restore_stack_nonlocal:
97e242b0 RH	665	{ .mmf
	666	alloc r20 = ar.pfs, 4, 0, 0, 0
	667	ld8 r12 = [in0], 8
	668	;;
	669	}
	670	{ .mmb
	671	ld8 r16=[in0], 8
	672	mov r19 = ar.rsc
	673	;;
	674	}
	675	{ .mmi
	676	flushrs
	677	ld8 r17 = [in0], 8
	678	and r19 = 0x1c, r19
	679	;;
	680	}
	681	{ .mmf
	682	ld8 r18 = [in0]
	683	mov ar.rsc = r19
	684	;;
	685	}
	686	{ .mmi
	687	mov ar.bspstore = r16
	688	;;
	689	mov ar.rnat = r17
	690	or r19 = 0x3, r19
	691	;;
	692	}
	693	{ .mmf
	694	loadrs
	695	invala
	696	;;
	697	}
	698	.L0: { .mib
	699	mov ar.rsc = r19
	700	mov ar.pfs = r18
	701	br.ret.sptk.few rp
	702	;;
9525c690	703	}
9525c690 JW	704	.endp __ia64_restore_stack_nonlocal
9525c690 JW	705	#endif
97e242b0 RH	706
	707	#ifdef L__trampoline
	708	// Implement the nested function trampoline. This is out of line
	709	// so that we don't have to bother with flushing the icache, as
	710	// well as making the on-stack trampoline smaller.
	711	//
	712	// The trampoline has the following form:
	713	//
	714	// +-------------------+ \
	715	// TRAMP: \| __ia64_trampoline \| \|
	716	// +-------------------+ > fake function descriptor
	717	// \| TRAMP+16 \| \|
	718	// +-------------------+ /
	719	// \| target descriptor \|
	720	// +-------------------+
	721	// \| static link \|
	722	// +-------------------+
	723
	724	.text
	725	.align 16
	726	.global __ia64_trampoline
	727	.proc __ia64_trampoline
	728	__ia64_trampoline:
	729	{ .mmi
	730	ld8 r2 = [r1], 8
	731	;;
	732	ld8 r15 = [r1]
	733	}
	734	{ .mmi
	735	ld8 r3 = [r2], 8
	736	;;
	737	ld8 r1 = [r2]
	738	mov b6 = r3
	739	}
	740	{ .bbb
	741	br.sptk.many b6
	742	;;
	743	}
	744	.endp __ia64_trampoline
	745	#endif