[thirdparty/gcc.git] / gcc / config / ia64 / lib1funcs.asm

#ifdef L__divdf3
// __divdf3 -- divide two IEEE double-precision values.
//
// The sequence below is the minimum-latency alternative given in the Intel
// IA-64 Optimization Guide.
//
// In:	f8 (farg0) = dividend a, f9 (farg1) = divisor b.
// Out:	f8 (fret0) = a / b.
// Temporaries: f10 = reciprocal estimate y, f11 = quotient estimate q,
// f12 and f13 = error terms, p6 = predicate written by frcpa.
//
// Every refinement step is predicated on p6.  When frcpa leaves p6 clear
// none of them execute and the value frcpa wrote to f10 is returned as is.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0,  q0 = a*y0.
(p6)	fnma.s1 f12 = f9, f10, f1
(p6)	fmpy.s1 f11 = f8, f10
	;;
	// e1 = e0*e0,  y1 = y0 + e0*y0,  q1 = q0 + e0*q0.
(p6)	fmpy.s1 f13 = f12, f12
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// e2 = e1*e1,  y2 = y1 + e1*y1,  q2 = q1 + e1*q1.
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	// y3 = y2 + e2*y2,  q3 = q2 + e2*q2 (q3 is rounded to double).
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.d.s1 f11 = f12, f11, f11
	;;
	// r = a - b*q3.  This overwrites the dividend in f8.
(p6)	fnma.d.s1 f8 = f9, f11, f8
	;;
	// q = q3 + r*y3, rounded to double; note there is no .s1 completer
	// on this final step.
(p6)	fma.d f10 = f8, f10, f11
	;;
	// Hand the quotient back in the FP return register.
	mov f8 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// __divsf3 -- divide two IEEE single-precision values.
//
// The sequence below is the minimum-latency alternative given in the Intel
// IA-64 Optimization Guide.
//
// In:	f8 (farg0) = dividend a, f9 (farg1) = divisor b.
// Out:	f8 (fret0) = a / b.
// Temporaries: f10 = reciprocal estimate from frcpa (later the rounded
// result), p6 = predicate written by frcpa.  f8 and f9 are reused in
// place for the quotient estimate and the error term.
//
// Every refinement step is predicated on p6.  When frcpa leaves p6 clear
// none of them execute and the value frcpa wrote to f10 is returned as is.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0 (replaces b in f9),  q0 = a*y0 (replaces a in f8).
(p6)	fnma.s1 f9 = f9, f10, f1
(p6)	fmpy.s1 f8 = f8, f10
	;;
	// q1 = q0 + e0*q0, then e1 = e0*e0.  Order matters inside this group:
	// f9 has to be read by the quotient update before it is rewritten.
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
	// q2 = q1 + e1*q1, then e2 = e1*e1 (same ordering constraint).
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
	// q3 = q2 + e2*q2, rounded to double precision.
(p6)	fma.d.s1 f8 = f9, f8, f8
	;;
	// Round q3 to single precision (q3 * 1.0 + 0.0).
(p6)	fnorm.s f10 = f8
	;;
	// Hand the quotient back in the FP return register.
	mov f8 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// __divdi3 -- signed 64-bit integer division, truncating the quotient.
//
// Both operands are converted to floating point, the quotient is built from
// a reciprocal approximation refined by Newton-Raphson iteration, and the
// result is truncated back to an integer.  frcpa gives 8.6 significant bits,
// so 3 iterations are needed to get more than the 64 bits of precision that
// DImode requires.
//
// The reciprocal computations must use max precision to get 64 bits of
// precision; the refinement steps carry the .s1 completer.
//
// In:	r32 (in0) = dividend, r33 (in1) = divisor.
// Out:	r8 (ret0) = quotient.
// FP usage: f8 = dividend a, f9 = divisor b, f10 = reciprocal estimate y,
// f11 = quotient estimate q, f12 and f13 = error terms.  p6 is written by
// frcpa and guards every refinement step.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Convert them as signed integers, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0,  q0 = a*y0.
(p6)	fnma.s1 f12 = f9, f10, f1
(p6)	fmpy.s1 f11 = f8, f10
	;;
	// e1 = e0*e0,  y1 = y0 + e0*y0,  q1 = q0 + e0*q0.
(p6)	fmpy.s1 f13 = f12, f12
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// e2 = e1*e1,  y2 = y1 + e1*y1,  q2 = q1 + e1*q1.
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	// y3 = y2 + e2*y2,  q3 = q2 + e2*q2.
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// r = a - b*q3.  This overwrites the dividend in f8.
(p6)	fnma.s1 f8 = f9, f11, f8
	;;
	// q = q3 + r*y3.
(p6)	fma f10 = f8, f10, f11
	;;
	// Truncate the quotient to a signed integer.
	fcvt.fx.trunc f8 = f10
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// __moddi3 -- signed 64-bit integer remainder.
//
// The truncated quotient is computed exactly as in the DImode divide: both
// operands are converted to floating point and a reciprocal approximation
// is refined by Newton-Raphson iteration.  frcpa gives 8.6 significant
// bits, so 3 iterations are needed to get more than the 64 bits of
// precision that DImode requires.  The remainder is then
// dividend - quotient * divisor.
//
// The reciprocal computations must use max precision to get 64 bits of
// precision; the refinement steps carry the .s1 completer.
//
// In:	r32 (in0) = dividend, r33 (in1) = divisor.
// Out:	r8 (ret0) = remainder.
// FP usage: f8 = dividend a (kept intact until the remainder is formed),
// f9 = divisor b, f10 = reciprocal estimate y and later the quotient,
// f11 = quotient estimate q, f12 and f13 = error terms.  p6 is written by
// frcpa and guards every refinement step.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Convert them as signed integers, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0,  q0 = a*y0.
(p6)	fnma.s1 f12 = f9, f10, f1
(p6)	fmpy.s1 f11 = f8, f10
	;;
	// e1 = e0*e0,  y1 = y0 + e0*y0,  q1 = q0 + e0*q0.
(p6)	fmpy.s1 f13 = f12, f12
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// e2 = e1*e1,  y2 = y1 + e1*y1,  q2 = q1 + e1*q1.
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	// y3 = y2 + e2*y2,  q3 = q2 + e2*q2.
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// r = a - b*q3, kept in f12 so that f8 still holds the dividend.
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	// q = q3 + r*y3.
(p6)	fma f10 = f12, f10, f11
	;;
	// Truncate the quotient to a signed integer ...
	fcvt.fx.trunc f10 = f10
	;;
	// ... and convert it back to a normalized FP value.
	fcvt.xf f10 = f10
	;;
	// remainder = a - q*b.
	fnma f8 = f10, f9, f8
	;;
	// Convert the remainder to a signed integer.
	fcvt.fx.trunc f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// __udivdi3 -- unsigned 64-bit integer division, truncating the quotient.
//
// Both operands are converted to floating point, the quotient is built from
// a reciprocal approximation refined by Newton-Raphson iteration, and the
// result is truncated back to an unsigned integer.  frcpa gives 8.6
// significant bits, so 3 iterations are needed to get more than the 64 bits
// of precision that DImode requires.
//
// The reciprocal computations must use max precision to get 64 bits of
// precision; the refinement steps carry the .s1 completer.
//
// In:	r32 (in0) = dividend, r33 (in1) = divisor.
// Out:	r8 (ret0) = quotient.
// FP usage: f8 = dividend a, f9 = divisor b, f10 = reciprocal estimate y,
// f11 = quotient estimate q, f12 and f13 = error terms.  p6 is written by
// frcpa and guards every refinement step.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Convert them as unsigned integers; this is done to avoid FP
	// software-assist faults.
	fcvt.xuf f8 = f8
	fcvt.xuf f9 = f9
	;;
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0,  q0 = a*y0.
(p6)	fnma.s1 f12 = f9, f10, f1
(p6)	fmpy.s1 f11 = f8, f10
	;;
	// e1 = e0*e0,  y1 = y0 + e0*y0,  q1 = q0 + e0*q0.
(p6)	fmpy.s1 f13 = f12, f12
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// e2 = e1*e1,  y2 = y1 + e1*y1,  q2 = q1 + e1*q1.
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	// y3 = y2 + e2*y2,  q3 = q2 + e2*q2.
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// r = a - b*q3.  This overwrites the dividend in f8.
(p6)	fnma.s1 f8 = f9, f11, f8
	;;
	// q = q3 + r*y3.
(p6)	fma f10 = f8, f10, f11
	;;
	// Truncate the quotient to an unsigned integer.
	fcvt.fxu.trunc f8 = f10
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// __umoddi3 -- unsigned 64-bit integer remainder.
//
// The truncated quotient is computed exactly as in the unsigned DImode
// divide: both operands are converted to floating point and a reciprocal
// approximation is refined by Newton-Raphson iteration.  frcpa gives 8.6
// significant bits, so 3 iterations are needed to get more than the 64 bits
// of precision that DImode requires.  The remainder is then
// dividend - quotient * divisor.
//
// The reciprocal computations must use max precision to get 64 bits of
// precision; the refinement steps carry the .s1 completer.
//
// In:	r32 (in0) = dividend, r33 (in1) = divisor.
// Out:	r8 (ret0) = remainder.
// FP usage: f8 = dividend a (kept intact until the remainder is formed),
// f9 = divisor b, f10 = reciprocal estimate y and later the quotient,
// f11 = quotient estimate q, f12 and f13 = error terms.  p6 is written by
// frcpa and guards every refinement step.

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Move the integer operands into FP registers.
	setf.sig f8 = r32
	setf.sig f9 = r33
	;;
	// Convert them as unsigned integers; this is done to avoid FP
	// software assist faults.
	fcvt.xuf f8 = f8
	fcvt.xuf f9 = f9
	;;
	// y0 = initial approximation of 1/b.
	frcpa f10, p6 = f8, f9
	;;
	// e0 = 1 - b*y0,  q0 = a*y0.
(p6)	fnma.s1 f12 = f9, f10, f1
(p6)	fmpy.s1 f11 = f8, f10
	;;
	// e1 = e0*e0,  y1 = y0 + e0*y0,  q1 = q0 + e0*q0.
(p6)	fmpy.s1 f13 = f12, f12
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// e2 = e1*e1,  y2 = y1 + e1*y1,  q2 = q1 + e1*q1.
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
	// y3 = y2 + e2*y2,  q3 = q2 + e2*q2.
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f12, f11, f11
	;;
	// r = a - b*q3, kept in f12 so that f8 still holds the dividend.
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	// q = q3 + r*y3.
(p6)	fma f10 = f12, f10, f11
	;;
	// Truncate the quotient to an unsigned integer ...
	fcvt.fxu.trunc f10 = f10
	;;
	// ... and convert it back to a normalized FP value.
	fcvt.xuf f10 = f10
	;;
	// remainder = a - q*b.
	fnma f8 = f10, f9, f8
	;;
	// Convert the remainder to an unsigned integer.
	fcvt.fxu.trunc f8 = f8
	;;
	// Move the result to the integer return register.
	getf.sig r8 = f8
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than >.5ulp error.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// In:	in0 (r32) = dividend, in1 (r33) = divisor.  Only the low 32 bits
//	of each are used; both are sign-extended on entry.
// Out:	ret0 (r8) = quotient, truncated.
// FP usage: f8 = dividend, f9 = divisor, f10 = the value 2.0,
// f11 = reciprocal approximation y, f12 = temporary.  p6 is written by
// frcpa but never tested, so none of the steps is predicated.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Sign-extend the operands to 64 bits.  setf.sig copies all 64 bits
	// of the register, and the bits above bit 31 of a 32-bit argument
	// are not guaranteed to hold the sign.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// y0 = reciprocal approximation; f10 = 1.0 + 1.0 = 2.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First Newton-Raphson iteration: y1 = y0 * (2 - divisor*y0).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration: y2 = y1 * (2 - divisor*y1).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Quotient estimate = dividend * y2.
	fmpy f8 = f8, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc f8 = f8
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
// The modulus is then dividend - trunc(quotient) * divisor.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than >.5ulp error.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// In:	in0 (r32) = dividend, in1 (r33) = divisor.  Only the low 32 bits
//	of each are used; both are sign-extended on entry.
// Out:	ret0 (r8) = modulus.
// FP usage: f8 = dividend, later the modulus; f9 = divisor; f10 = the
// value 2.0, later the quotient; f11 = reciprocal approximation y;
// f12 = temporary.  p6 is written by frcpa but never tested, so none of
// the steps is predicated.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// Sign-extend the operands to 64 bits.  setf.sig copies all 64 bits
	// of the register, and the bits above bit 31 of a 32-bit argument
	// are not guaranteed to hold the sign.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// y0 = reciprocal approximation; f10 = 1.0 + 1.0 = 2.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First Newton-Raphson iteration: y1 = y0 * (2 - divisor*y0).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration: y2 = y1 * (2 - divisor*y1).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Quotient estimate = dividend * y2 (f10 no longer holds 2.0).
	fmpy f10 = f8, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc f10 = f10
	;;
	// Renormalize.
	fcvt.xf f10 = f10
	;;
	// Compute remainder: dividend - quotient * divisor.
	fnma f8 = f10, f9, f8
	;;
	// Convert remainder to an integer.
	fcvt.fx f8 = f8
	;;
	// Transfer result to GP registers.  The result must go to ret0 (r8);
	// writing it to r32 would leave the caller's return register unset.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than >.5ulp error.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// In:	in0 (r32) = dividend, in1 (r33) = divisor.  Only the low 32 bits
//	of each are used; both are zero-extended on entry.
// Out:	ret0 (r8) = quotient, truncated.
// FP usage: f8 = dividend, f9 = divisor, f10 = the value 2.0,
// f11 = reciprocal approximation y, f12 = temporary.  p6 is written by
// frcpa but never tested, so none of the steps is predicated.
//
// This is the same as divsi3, except that the operands are zero-extended
// and the conversions to and from FP are the unsigned ones.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// Zero-extend the operands to 64 bits.  setf.sig copies all 64 bits
	// of the register, and the bits above bit 31 of a 32-bit argument
	// are not guaranteed to be clear.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults (the
	// same step the unsigned DImode routines perform before frcpa).
	fcvt.xuf f8 = f8
	fcvt.xuf f9 = f9
	;;
	// y0 = reciprocal approximation; f10 = 1.0 + 1.0 = 2.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First Newton-Raphson iteration: y1 = y0 * (2 - divisor*y0).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration: y2 = y1 * (2 - divisor*y1).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Quotient estimate = dividend * y2.
	fmpy f8 = f8, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc f8 = f8
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
// The modulus is then dividend - trunc(quotient) * divisor.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check to see if the error is less than >.5ulp error.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// In:	in0 (r32) = dividend, in1 (r33) = divisor.  Only the low 32 bits
//	of each are used; both are zero-extended on entry.
// Out:	ret0 (r8) = modulus.
// FP usage: f8 = dividend, later the modulus; f9 = divisor; f10 = the
// value 2.0, later the quotient; f11 = reciprocal approximation y;
// f12 = temporary.  p6 is written by frcpa but never tested, so none of
// the steps is predicated.
//
// This is the same as modsi3, except that the operands are zero-extended
// and the conversions to and from FP are the unsigned ones.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// Zero-extend the operands to 64 bits.  setf.sig copies all 64 bits
	// of the register, and the bits above bit 31 of a 32-bit argument
	// are not guaranteed to be clear.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults (the
	// same step the unsigned DImode routines perform before frcpa).
	fcvt.xuf f8 = f8
	fcvt.xuf f9 = f9
	;;
	// y0 = reciprocal approximation; f10 = 1.0 + 1.0 = 2.0.
	frcpa f11, p6 = f8, f9
	fadd f10 = f1, f1
	;;
	// First Newton-Raphson iteration: y1 = y0 * (2 - divisor*y0).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Second iteration: y2 = y1 * (2 - divisor*y1).
	fnma f12 = f9, f11, f10
	;;
	fmpy f11 = f11, f12
	;;
	// Quotient estimate = dividend * y2 (f10 no longer holds 2.0).
	fmpy f10 = f8, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc f10 = f10
	;;
	// Renormalize.
	fcvt.xuf f10 = f10
	;;
	// Compute remainder: dividend - quotient * divisor.
	fnma f8 = f10, f9, f8
	;;
	// Convert remainder to an unsigned integer.
	fcvt.fxu f8 = f8
	;;
	// Transfer result to GP registers.  The result must go to ret0 (r8);
	// writing it to r32 would leave the caller's return register unset.
	getf.sig ret0 = f8
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Records the state a later nonlocal goto needs in four consecutive
// 8-byte words of *save_area:
//	save_area +  0	stack_pointer (in1)
//	save_area +  8	ar.bsp, read after the flushrs
//	save_area + 16	ar.rnat
//	save_area + 24	ar.pfs as captured by the alloc
// __ia64_nonlocal_goto loads the same four offsets.
//
// Uses r2 and r16-r19 as temporaries and advances in0 while storing.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	// r18 = ar.pfs; declare a frame with two input registers.
	alloc r18=ar.pfs,2,0,0,0
	// save_area[0] = stack_pointer; in0 now points at save_area + 8.
	st8 [in0]=in1,8
	// r19 = current RSE configuration.
	mov r19=ar.rsc
	;;
	// flushrs comes first in its instruction group (it follows the stop).
	flushrs
	// Keep only bits 2-4 of the ar.rsc image; bits 0-1 (the RSE mode
	// field per the architecture manual) become zero.
	and r19=0x1c,r19
	mov ar.pfs=r18
	;;
	// Install the modified ar.rsc before ar.rnat is read below.
	mov ar.rsc=r19
	mov r16=ar.bsp
	// in0 is save_area + 8 here, so r2 = save_area + 24.
	adds r2=16,in0
	;;
	mov r17=ar.rnat
	// save_area[8] = ar.bsp; in0 now points at save_area + 16.
	st8 [in0]=r16,8
	// Build the ar.rsc value to leave behind: bits 0-1 both set.
	// Bits 0-1 and everything above bit 4 of the entry value are not
	// preserved.
	or r19=0x3,r19
	;;
	// save_area[16] = ar.rnat.
	st8 [in0]=r17
	mov ar.rsc=r19
	// save_area[24] = ar.pfs.
	st8 [r2]=r18
	// NOTE(review): ar.pfs was already written with r18 above and nothing
	// visible here changes it in between -- this second write looks
	// redundant; confirm before removing.
	mov ar.pfs=r18
	br.ret.sptk.few rp
	;;
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
//			     void *static_chain);
//
// Reloads the state stored in *save_area and returns to target_label
// instead of to the caller.  The words read from save_area are:
//	save_area +  0	-> r12
//	save_area +  8	-> ar.bspstore
//	save_area + 16	-> ar.rnat
//	save_area + 24	-> ar.pfs
// which is the layout __ia64_save_stack_nonlocal writes.  On exit
// r7 = fp and r15 = static_chain.
//
// Uses r2 and r16-r20 as temporaries and advances in2 while loading.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	// Declare a frame with four input registers.  r20 receives ar.pfs
	// but is not read again in this routine.
	alloc r20=ar.pfs,4,0,0,0
	// r19 = current RSE configuration.
	mov r19=ar.rsc
	// r2 = save_area + 8.
	adds r2=8,in2
	// r12 = save_area[0]; in2 now points at save_area + 16.
	ld8 r12=[in2],16
	// rp = target_label (r33 is in1); .L0 names the bundle holding the
	// branch that will use rp.
	mov.ret.sptk.few.dc.dc rp = r33, .L0
// ??? flushrs must be first instruction of a group.  Gas is unfortunately
// putting the stop bit before the padding nop instead of after it, making
// flushrs the first instruction of its bundle, but the second instruction
// of its group.  We explicitly add the nop to avoid this problem.
	nop.i 0
	;;
	flushrs
	// r16 = save_area[8]; r2 now points at save_area + 24.
	ld8 r16=[r2],16
	// Keep only bits 2-4 of the ar.rsc image; bits 0-1 (the RSE mode
	// field per the architecture manual) become zero.
	and r19=0x1c,r19
	// r17 = save_area[16].
	ld8 r17=[in2]
	;;
	// r18 = save_area[24].
	ld8 r18=[r2]
	// Install the modified ar.rsc before ar.bspstore and ar.rnat are
	// written below.
	mov ar.rsc=r19
	;;
	mov ar.bspstore=r16
	;;
	mov ar.rnat=r17
	mov ar.pfs=r18
	// Build the ar.rsc value to leave behind: bits 0-1 both set.
	or r19=0x3,r19
	;;
	// NOTE(review): the ar.rsc image installed above has every bit over
	// bit 4 cleared, which includes the loadrs count field -- presumably
	// this loadrs therefore only discards stale stacked-register state;
	// confirm against the architecture manual.
	loadrs
	invala
	// r7 = fp (r32 is in0).
	mov r7=r32
.L0:	{
	mov ar.rsc=r19
	// r15 = static_chain (r35 is in3).
	mov r15=r35
	// rp was set to target_label above, so this transfers control there.
	br.ret.sptk.few rp
	}
	;;
	.endp __ia64_nonlocal_goto
#endif
Commit	Line	Data
c65ebc55 JW	1	#ifdef L__divdf3
	2	// Compute a 64-bit IEEE double quotient.
	3	//
	4	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	5	// alternative.
	6	//
	7	// farg0 holds the dividend. farg1 holds the divisor.
	8
	9	.text
	10	.align 16
	11	.global __divdf3
	12	.proc __divdf3
	13	__divdf3:
	14	frcpa f10, p6 = farg0, farg1
	15	;;
	16	(p6) fma.s1 f11 = farg0, f10, f0
	17	(p6) fnma.s1 f12 = farg1, f10, f1
	18	;;
	19	(p6) fma.s1 f11 = f12, f11, f11
	20	(p6) fma.s1 f13 = f12, f12, f0
	21	(p6) fma.s1 f10 = f12, f10, f10
	22	;;
	23	(p6) fma.s1 f11 = f13, f11, f11
	24	(p6) fma.s1 f12 = f13, f13, f0
	25	(p6) fma.s1 f10 = f13, f10, f10
	26	;;
	27	(p6) fma.d.s1 f11 = f12, f11, f11
	28	(p6) fma.s1 f10 = f12, f10, f10
	29	;;
	30	(p6) fnma.d.s1 f8 = farg1, f11, farg0
	31	;;
	32	(p6) fma.d f10 = f8, f10, f11
	33	;;
	34	mov fret0 = f10
	35	br.ret.sptk rp
	36	;;
	37	.endp __divdf3
	38	#endif
	39
	40	#ifdef L__divsf3
	41	// Compute a 32-bit IEEE float quotient.
	42	//
	43	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	44	// alternative.
	45	//
	46	// farg0 holds the dividend. farg1 holds the divisor.
	47
	48	.text
	49	.align 16
	50	.global __divsf3
	51	.proc __divsf3
	52	__divsf3:
	53	frcpa f10, p6 = farg0, farg1
	54	;;
	55	(p6) fma.s1 f8 = farg0, f10, f0
	56	(p6) fnma.s1 f9 = farg1, f10, f1
	57	;;
	58	(p6) fma.s1 f8 = f9, f8, f8
	59	(p6) fma.s1 f9 = f9, f9, f0
	60	;;
	61	(p6) fma.s1 f8 = f9, f8, f8
	62	(p6) fma.s1 f9 = f9, f9, f0
	63	;;
	64	(p6) fma.d.s1 f8 = f9, f8, f8
65	;;
66	(p6) fma.s f10 = f8, f1, f0
67	;;
68	mov fret0 = f10
69	br.ret.sptk rp
70	;;
71	.endp __divsf3
72	#endif
73
74	#ifdef L__divdi3
75	// Compute a 64-bit integer quotient.
76	//
77	// Use reciprocal approximation and Newton-Raphson iteration to compute the
78	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
79	// to get more than the 64 bits of precision that we need for DImode.
80	//
81	// Must use max precision for the reciprocal computations to get 64 bits of
82	// precision.
83	//
84	// r32/f8 holds the dividend. r33/f9 holds the divisor.
85	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
86	// f12 is a temporary.
87
88	.text
89	.align 16
90	.global __divdi3
91	.proc __divdi3
92	__divdi3:
93	.regstk 2,0,0,0
94	// Transfer inputs to FP registers.
95	setf.sig f8 = in0
96	setf.sig f9 = in1
97	;;
98	// Convert the inputs to FP, so that they won't be treated as unsigned.
99	fcvt.xf f8 = f8
100	fcvt.xf f9 = f9
101	;;
102	// Compute the reciprocal approximation.
103	frcpa f10, p6 = f8, f9
104	;;
105	// 3 Newton-Raphson iterations.
106	(p6) fma.s1 f11 = farg0, f10, f0
107	(p6) fnma.s1 f12 = farg1, f10, f1
108	;;
109	(p6) fma.s1 f11 = f12, f11, f11
110	(p6) fma.s1 f13 = f12, f12, f0
111	(p6) fma.s1 f10 = f12, f10, f10
112	;;
113	(p6) fma.s1 f11 = f13, f11, f11
114	(p6) fma.s1 f12 = f13, f13, f0
115	(p6) fma.s1 f10 = f13, f10, f10
116	;;
117	(p6) fma.s1 f11 = f12, f11, f11
118	(p6) fma.s1 f10 = f12, f10, f10
119	;;
120	(p6) fnma.s1 f8 = f9, f11, f8
121	;;
122	(p6) fma f10 = f8, f10, f11
123	;;
124	// Round quotient to an integer.
125	fcvt.fx.trunc f8 = f10
126	;;
127	// Transfer result to GP registers.
128	getf.sig ret0 = f8
129	br.ret.sptk rp
130	;;
131	.endp __divdi3
132	#endif
133
134	#ifdef L__moddi3
135	// Compute a 64-bit integer modulus.
136	//
137	// Use reciprocal approximation and Newton-Raphson iteration to compute the
138	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
139	// to get more than the 64 bits of precision that we need for DImode.
140	//
141	// Must use max precision for the reciprocal computations to get 64 bits of
142	// precision.
143	//
144	// r32/f8 holds the dividend. r33/f9 holds the divisor.
145	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
146	// f12 is a temporary.
147
148	.text
149	.align 16
150	.global __moddi3
151	.proc __moddi3
152	__moddi3:
153	.regstk 2,0,0,0
154	// Transfer inputs to FP registers.
155	setf.sig f8 = in0
156	setf.sig f9 = in1
157	;;
158	// Convert the inputs to FP, so that they won't be treated as unsigned.
159	fcvt.xf f8 = f8
160	fcvt.xf f9 = f9
161	;;
162	// Compute the reciprocal approximation.
163	frcpa f10, p6 = f8, f9
164	;;
165	// 3 Newton-Raphson iterations.
166	(p6) fma.s1 f11 = farg0, f10, f0
167	(p6) fnma.s1 f12 = farg1, f10, f1
168	;;
169	(p6) fma.s1 f11 = f12, f11, f11
170	(p6) fma.s1 f13 = f12, f12, f0
171	(p6) fma.s1 f10 = f12, f10, f10
172	;;
173	(p6) fma.s1 f11 = f13, f11, f11
174	(p6) fma.s1 f12 = f13, f13, f0
175	(p6) fma.s1 f10 = f13, f10, f10
176	;;
177	(p6) fma.s1 f11 = f12, f11, f11
178	(p6) fma.s1 f10 = f12, f10, f10
179	;;
180	(p6) fnma.s1 f12 = f9, f11, f8
181	;;
182	(p6) fma f10 = f12, f10, f11
183	;;
184	// Round quotient to an integer.
185	fcvt.fx.trunc f10 = f10
186	;;
187	// Renormalize.
188	fcvt.xf f10 = f10
189	;;
190	// Compute remainder.
191	fnma f8 = f10, f9, f8
192	;;
193	// Round remainder to an integer.
194	fcvt.fx.trunc f8 = f8
195	;;
196	// Transfer result to GP registers.
197	getf.sig ret0 = f8
198	br.ret.sptk rp
199	;;
200	.endp __moddi3
201	#endif
202
203	#ifdef L__udivdi3
204	// Compute a 64-bit unsigned integer quotient.
205	//
206	// Use reciprocal approximation and Newton-Raphson iteration to compute the
207	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
208	// to get more than the 64 bits of precision that we need for DImode.
209	//
210	// Must use max precision for the reciprocal computations to get 64 bits of
211	// precision.
212	//
213	// r32/f8 holds the dividend. r33/f9 holds the divisor.
214	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
215	// f12 is a temporary.
216
217	.text
218	.align 16
219	.global __udivdi3
220	.proc __udivdi3
221	__udivdi3:
222	.regstk 2,0,0,0
223	// Transfer inputs to FP registers.
224	setf.sig f8 = in0
225	setf.sig f9 = in1
226	;;
227	// Convert the inputs to FP, to avoid FP software-assist faults.
228	fcvt.xuf f8 = f8
229	fcvt.xuf f9 = f9
230	;;
231	// Compute the reciprocal approximation.
232	frcpa f10, p6 = f8, f9
233	;;
234	// 3 Newton-Raphson iterations.
235	(p6) fma.s1 f11 = farg0, f10, f0
236	(p6) fnma.s1 f12 = farg1, f10, f1
237	;;
238	(p6) fma.s1 f11 = f12, f11, f11
239	(p6) fma.s1 f13 = f12, f12, f0
240	(p6) fma.s1 f10 = f12, f10, f10
241	;;
242	(p6) fma.s1 f11 = f13, f11, f11
243	(p6) fma.s1 f12 = f13, f13, f0
244	(p6) fma.s1 f10 = f13, f10, f10
245	;;
246	(p6) fma.s1 f11 = f12, f11, f11
247	(p6) fma.s1 f10 = f12, f10, f10
248	;;
249	(p6) fnma.s1 f8 = f9, f11, f8
250	;;
251	(p6) fma f10 = f8, f10, f11
252	;;
253	// Round quotient to an unsigned integer.
254	fcvt.fxu.trunc f8 = f10
255	;;
256	// Transfer result to GP registers.
257	getf.sig ret0 = f8
258	br.ret.sptk rp
259	;;
260	.endp __udivdi3
261	#endif
262
263	#ifdef L__umoddi3
264	// Compute a 64-bit unsigned integer modulus.
265	//
266	// Use reciprocal approximation and Newton-Raphson iteration to compute the
267	// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
268	// to get more than the 64 bits of precision that we need for DImode.
269	//
270	// Must use max precision for the reciprocal computations to get 64 bits of
271	// precision.
272	//
273	// r32/f8 holds the dividend. r33/f9 holds the divisor.
274	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
275	// f12 is a temporary.
276
277	.text
278	.align 16
279	.global __umoddi3
280	.proc __umoddi3
281	__umoddi3:
282	.regstk 2,0,0,0
283	// Transfer inputs to FP registers.
284	setf.sig f8 = in0
285	setf.sig f9 = in1
286	;;
287	// Convert the inputs to FP, to avoid FP software assist faults.
288	fcvt.xuf f8 = f8
289	fcvt.xuf f9 = f9
290	;;
291	// Compute the reciprocal approximation.
292	frcpa f10, p6 = f8, f9
293	;;
294	// 3 Newton-Raphson iterations.
295	(p6) fma.s1 f11 = farg0, f10, f0
296	(p6) fnma.s1 f12 = farg1, f10, f1
297	;;
298	(p6) fma.s1 f11 = f12, f11, f11
299	(p6) fma.s1 f13 = f12, f12, f0
300	(p6) fma.s1 f10 = f12, f10, f10
301	;;
302	(p6) fma.s1 f11 = f13, f11, f11
303	(p6) fma.s1 f12 = f13, f13, f0
304	(p6) fma.s1 f10 = f13, f10, f10
305	;;
306	(p6) fma.s1 f11 = f12, f11, f11
307	(p6) fma.s1 f10 = f12, f10, f10
308	;;
309	(p6) fnma.s1 f12 = f9, f11, f8
310	;;
311	(p6) fma f10 = f12, f10, f11
312	;;
313	// Round quotient to an unsigned integer.
314	fcvt.fxu.trunc f10 = f10
315	;;
316	// Renormalize.
317	fcvt.xuf f10 = f10
318	;;
319	// Compute remainder.
320	fnma f8 = f10, f9, f8
321	;;
322	// Round remainder to an integer.
323	fcvt.fxu.trunc f8 = f8
324	;;
325	// Transfer result to GP registers.
326	getf.sig ret0 = f8
327	br.ret.sptk rp
328	;;
329	.endp __umoddi3
330	#endif
331
332	#ifdef L__divsi3
333	// Compute a 32-bit integer quotient.
334	//
335	// Use reciprocal approximation and Newton-Raphson iteration to compute the
336	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
337	// to get more than the 32 bits of precision that we need for SImode.
338	//
339	// ??? This is currently not used. It needs to be fixed to be more like the
340	// above DImode routines.
341	//
342	// ??? Check to see if the error is less than >.5ulp error. We may need
343	// some adjustment code to get precise enough results.
344	//
345	// ??? Should probably use max precision for the reciprocal computations.
346	//
347	// r32/f8 holds the dividend. r33/f9 holds the divisor.
348	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
349	// f12 is a temporary.
350
351	.text
352	.align 16
353	.global __divsi3
354	.proc __divsi3
355	__divsi3:
356	.regstk 2,0,0,0
357	setf.sig f8 = in0
358	setf.sig f9 = in1
359	;;
360	fcvt.xf f8 = f8
361	fcvt.xf f9 = f9
362	;;
363	frcpa f11, p6 = f8, f9
364	fadd f10 = f1, f1
365	;;
366	fnma f12 = f9, f11, f10
367	;;
368	fmpy f11 = f11, f12
369	;;
370	fnma f12 = f9, f11, f10
371	;;
372	fmpy f11 = f11, f12
373	;;
374	fmpy f8 = f8, f11
375	;;
376	fcvt.fx.trunc f8 = f8
377	;;
378	getf.sig ret0 = f8
379	br.ret.sptk rp
380	;;
381	.endp __divsi3
382	#endif
383
384	#ifdef L__modsi3
385	// Compute a 32-bit integer modulus.
386	//
387	// Use reciprocal approximation and Newton-Raphson iteration to compute the
388	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
389	// to get more than the 32 bits of precision that we need for SImode.
390	//
391	// ??? This is currently not used. It needs to be fixed to be more like the
392	// above DImode routines.
393	//
394	// ??? Check to see if the error is less than >.5ulp error. We may need
395	// some adjustment code to get precise enough results.
396	//
397	// ??? Should probably use max precision for the reciprocal computations.
398	//
399	// r32/f8 holds the dividend. r33/f9 holds the divisor.
400	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
401	// f12 is a temporary.
402
403	.text
404	.align 16
405	.global __modsi3
406	.proc __modsi3
407	__modsi3:
408	.regstk 2,0,0,0
409	setf.sig f8 = r32
410	setf.sig f9 = r33
411	;;
412	fcvt.xf f8 = f8
413	fcvt.xf f9 = f9
414	;;
415	frcpa f11, p6 = f8, f9
416	fadd f10 = f1, f1
417	;;
418	fnma f12 = f9, f11, f10
419	;;
420	fmpy f11 = f11, f12
421	;;
422	fnma f12 = f9, f11, f10
423	;;
424	fmpy f11 = f11, f12
425	;;
426	fmpy f10 = f8, f11
427	;;
428	fcvt.fx.trunc f10 = f10
429	;;
430	fcvt.xf f10 = f10
431	;;
432	fnma f8 = f10, f9, f8
433	;;
434	fcvt.fx f8 = f8
435	;;
436	getf.sig r32 = f8
437	br.ret.sptk rp
438	;;
439	.endp __modsi3
440	#endif
441
442	#ifdef L__udivsi3
443	// Compute a 32-bit unsigned integer quotient.
444	//
445	// Use reciprocal approximation and Newton-Raphson iteration to compute the
446	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
447	// to get more than the 32 bits of precision that we need for SImode.
448	//
449	// ??? This is currently not used. It needs to be fixed to be more like the
450	// above DImode routines.
451	//
452	// ??? Check to see if the error is less than >.5ulp error. We may need
453	// some adjustment code to get precise enough results.
454	//
455	// ??? Should probably use max precision for the reciprocal computations.
456	//
457	// r32/f8 holds the dividend. r33/f9 holds the divisor.
458	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
459	// f12 is a temporary.
460	//
461	// This is the same as divsi3, except that we don't need fcvt instructions
462	// before the frcpa.
463
464	.text
465	.align 16
466	.global __udivsi3
467	.proc __udivsi3
468	__udivsi3:
469	.regstk 2,0,0,0
470	setf.sig f8 = r32
471	setf.sig f9 = r33
472	;;
473	frcpa f11, p6 = f8, f9
474	fadd f10 = f1, f1
475	;;
476	fnma f12 = f9, f11, f10
477	;;
478	fmpy f11 = f11, f12
479	;;
480	fnma f12 = f9, f11, f10
481	;;
482	fmpy f11 = f11, f12
483	;;
484	fmpy f8 = f8, f11
485	;;
486	fcvt.fxu.trunc f8 = f8
487	;;
488	getf.sig ret0 = f8
489	br.ret.sptk rp
490	;;
491	.endp __udivsi3
492	#endif
493
494	#ifdef L__umodsi3
495	// Compute a 32-bit unsigned integer modulus.
496	//
497	// Use reciprocal approximation and Newton-Raphson iteration to compute the
498	// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
499	// to get more than the 32 bits of precision that we need for SImode.
500	//
501	// ??? This is currently not used. It needs to be fixed to be more like the
502	// above DImode routines.
503	//
504	// ??? Check to see if the error is less than >.5ulp error. We may need
505	// some adjustment code to get precise enough results.
506	//
507	// ??? Should probably use max precision for the reciprocal computations.
508	//
509	// r32/f8 holds the dividend. r33/f9 holds the divisor.
510	// f10 holds the value 2.0. f11 holds the reciprocal approximation.
511	// f12 is a temporary.
512	//
513	// This is the same as modsi3, except that we don't need fcvt instructions
514	// before the frcpa.
515
516	.text
517	.align 16
518	.global __umodsi3
519	.proc __umodsi3
520	__umodsi3:
521	.regstk 2,0,0,0
522	setf.sig f8 = r32
523	setf.sig f9 = r33
524	;;
525	frcpa f11, p6 = f8, f9
526	fadd f10 = f1, f1
527	;;
528	fnma f12 = f9, f11, f10
529	;;
530	fmpy f11 = f11, f12
531	;;
532	fnma f12 = f9, f11, f10
533	;;
534	fmpy f11 = f11, f12
535	;;
536	fmpy f10 = f8, f11
537	;;
538	fcvt.fxu.trunc f10 = f10
539	;;
540	fcvt.xuf f10 = f10
541	;;
542	fnma f8 = f10, f9, f8
543	;;
544	fcvt.fxu f8 = f8
545	;;
546	getf.sig r32 = f8
547	br.ret.sptk rp
548	;;
549	.endp __umodsi3
550	#endif
551
552	#ifdef L__save_stack_nonlocal
553	// Notes on save/restore stack nonlocal: We read ar.bsp but write
554	// ar.bspstore. This is because ar.bsp can be read at all times
555	// (independent of the RSE mode) but since it's read-only we need to
556	// restore the value via ar.bspstore. This is OK because
557	// ar.bsp==ar.bspstore after executing "flushrs".
558
	559	// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
560
561	.text
562	.align 16
563	.global __ia64_save_stack_nonlocal
564	.proc __ia64_save_stack_nonlocal
565	__ia64_save_stack_nonlocal:
566	alloc r18=ar.pfs,2,0,0,0
567	st8 [in0]=in1,8
568	mov r19=ar.rsc
569	;;
570	flushrs
571	and r19=0x1c,r19
572	mov ar.pfs=r18
573	;;
574	mov ar.rsc=r19
575	mov r16=ar.bsp
576	adds r2=16,in0
577	;;
578	mov r17=ar.rnat
579	st8 [in0]=r16,8
580	or r19=0x3,r19
581	;;
582	st8 [in0]=r17
583	mov ar.rsc=r19
584	st8 [r2]=r18
585	mov ar.pfs=r18
586	br.ret.sptk.few rp
587	;;
588	.endp __ia64_save_stack_nonlocal
589	#endif
590
591	#ifdef L__nonlocal_goto
	592	// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
593	// void *static_chain);
594
595	.text
596	.align 16
597	.global __ia64_nonlocal_goto
598	.proc __ia64_nonlocal_goto
599	__ia64_nonlocal_goto:
600	alloc r20=ar.pfs,4,0,0,0
601	mov r19=ar.rsc
602	adds r2=8,in2
603	ld8 r12=[in2],16
604	mov.ret.sptk.few.dc.dc rp = r33, .L0
605	// ??? flushrs must be first instruction of a group. Gas is unfortunately
606	// putting the stop bit before the padding nop instead of after it, making
607	// flushrs the first instruction of its bundle, but the second instruction
608	// of its group. We explicitly add the nop to avoid this problem.
609	nop.i 0
610	;;
611	flushrs
612	ld8 r16=[r2],16
613	and r19=0x1c,r19
614	ld8 r17=[in2]
615	;;
616	ld8 r18=[r2]
617	mov ar.rsc=r19
618	;;
619	mov ar.bspstore=r16
620	;;
621	mov ar.rnat=r17
622	mov ar.pfs=r18
623	or r19=0x3,r19
624	;;
625	loadrs
626	invala
627	mov r7=r32
628	.L0: {
629	mov ar.rsc=r19
630	mov r15=r35
631	br.ret.sptk.few rp
632	}
633	;;
634	.endp __ia64_nonlocal_goto
635	#endif