Commit | Line | Data |
---|---|---|
3f622353 RH |
1 | #ifdef L__divtf3 |
2 | // Compute a 80-bit IEEE double-extended quotient. | |
3 | // | |
4 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
5 | // alternative. | |
6 | // | |
7 | // farg0 holds the dividend. farg1 holds the divisor. | |
8 | ||
9 | .text | |
10 | .align 16 | |
11 | .global __divtf3 | |
12 | .proc __divtf3 | |
13 | __divtf3: | |
// NOTE(review): p7 is preset true here and cleared only under p6 below.
// frcpa sets p6 when its reciprocal seed needs Newton-Raphson refinement;
// otherwise p7 stays set and the (p7) mov at the end returns frcpa's own
// result for the special-operand cases.
3392dafc RH |
14 | cmp.eq p7, p0 = r0, r0 |
15 | frcpa.s0 f10, p6 = farg0, farg1 | |
3f622353 | 16 | ;; |
3392dafc RH |
17 | (p6) cmp.ne p7, p0 = r0, r0 |
18 | .pred.rel.mutex p6, p7 | |
3f622353 | 19 | (p6) fnma.s1 f11 = farg1, f10, f1 |
3392dafc | 20 | (p6) fmpy.s1 f12 = farg0, f10 |
3f622353 | 21 | ;; |
3392dafc RH |
22 | (p6) fmpy.s1 f13 = f11, f11 |
23 | (p6) fma.s1 f14 = f11, f11, f11 | |
3f622353 | 24 | ;; |
3392dafc RH |
25 | (p6) fma.s1 f11 = f13, f13, f11 |
26 | (p6) fma.s1 f13 = f14, f10, f10 | |
3f622353 | 27 | ;; |
3392dafc RH |
28 | (p6) fma.s1 f10 = f13, f11, f10 |
29 | (p6) fnma.s1 f12 = farg1, f12, farg0 | |
3f622353 | 30 | ;; |
3392dafc RH |
31 | (p6) fma.s1 f11 = f11, f10, f12 |
32 | (p6) fnma.s1 f13 = farg1, f10, f1 | |
3f622353 | 33 | ;; |
3392dafc RH |
34 | (p6) fma.s1 f10 = f12, f10, f10 |
35 | (p6) fnma.s1 f12 = farg1, f11, farg0 | |
3f622353 | 36 | ;; |
// Final step: fret0 = f12 * f10 + f11, i.e. remainder * reciprocal plus
// the quotient estimate, rounded in the caller-visible mode (no .s1).
3392dafc RH |
37 | (p6) fma fret0 = f12, f10, f11 |
38 | (p7) mov fret0 = f10 | |
3f622353 RH |
39 | br.ret.sptk rp |
40 | ;; | |
41 | .endp __divtf3 | |
42 | #endif | |
43 | ||
c65ebc55 JW |
44 | #ifdef L__divdf3 |
45 | // Compute a 64-bit IEEE double quotient. | |
46 | // | |
47 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
48 | // alternative. | |
49 | // | |
50 | // farg0 holds the dividend. farg1 holds the divisor. | |
51 | ||
52 | .text | |
53 | .align 16 | |
54 | .global __divdf3 | |
55 | .proc __divdf3 | |
56 | __divdf3: | |
// p7 is preset true and cleared under p6: it selects frcpa's direct result
// when no Newton-Raphson refinement is performed (special operands).
3392dafc RH |
57 | cmp.eq p7, p0 = r0, r0 |
58 | frcpa.s0 f10, p6 = farg0, farg1 | |
c65ebc55 | 59 | ;; |
3392dafc RH |
60 | (p6) cmp.ne p7, p0 = r0, r0 |
61 | .pred.rel.mutex p6, p7 | |
62 | (p6) fmpy.s1 f11 = farg0, f10 | |
c65ebc55 JW |
63 | (p6) fnma.s1 f12 = farg1, f10, f1 |
64 | ;; | |
65 | (p6) fma.s1 f11 = f12, f11, f11 | |
3392dafc | 66 | (p6) fmpy.s1 f13 = f12, f12 |
c65ebc55 | 67 | ;; |
3392dafc | 68 | (p6) fma.s1 f10 = f12, f10, f10 |
c65ebc55 | 69 | (p6) fma.s1 f11 = f13, f11, f11 |
3392dafc RH |
70 | ;; |
71 | (p6) fmpy.s1 f12 = f13, f13 | |
c65ebc55 JW |
72 | (p6) fma.s1 f10 = f13, f10, f10 |
73 | ;; | |
74 | (p6) fma.d.s1 f11 = f12, f11, f11 | |
75 | (p6) fma.s1 f10 = f12, f10, f10 | |
76 | ;; | |
// f8 = farg0 - farg1 * f11: remainder of the double-rounded quotient
// estimate, folded back in by the final fma.d below.
77 | (p6) fnma.d.s1 f8 = farg1, f11, farg0 | |
78 | ;; | |
3392dafc RH |
79 | (p6) fma.d fret0 = f8, f10, f11 |
80 | (p7) mov fret0 = f10 | |
c65ebc55 JW |
81 | br.ret.sptk rp |
82 | ;; | |
83 | .endp __divdf3 | |
84 | #endif | |
85 | ||
86 | #ifdef L__divsf3 | |
87 | // Compute a 32-bit IEEE float quotient. | |
88 | // | |
89 | // From the Intel IA-64 Optimization Guide, choose the minimum latency | |
90 | // alternative. | |
91 | // | |
92 | // farg0 holds the dividend. farg1 holds the divisor. | |
93 | ||
94 | .text | |
95 | .align 16 | |
96 | .global __divsf3 | |
97 | .proc __divsf3 | |
98 | __divsf3: | |
// p7 is preset true and cleared under p6: it selects frcpa's direct result
// when no Newton-Raphson refinement is performed (special operands).
938566fb | 99 | cmp.eq p7, p0 = r0, r0 |
3392dafc | 100 | frcpa.s0 f10, p6 = farg0, farg1 |
c65ebc55 | 101 | ;; |
938566fb RH |
102 | (p6) cmp.ne p7, p0 = r0, r0 |
103 | .pred.rel.mutex p6, p7 | |
3392dafc | 104 | (p6) fmpy.s1 f8 = farg0, f10 |
c65ebc55 JW |
105 | (p6) fnma.s1 f9 = farg1, f10, f1 |
106 | ;; | |
107 | (p6) fma.s1 f8 = f9, f8, f8 | |
3392dafc | 108 | (p6) fmpy.s1 f9 = f9, f9 |
c65ebc55 JW |
109 | ;; |
110 | (p6) fma.s1 f8 = f9, f8, f8 | |
3392dafc | 111 | (p6) fmpy.s1 f9 = f9, f9 |
c65ebc55 | 112 | ;; |
3392dafc | 113 | (p6) fma.d.s1 f10 = f9, f8, f8 |
c65ebc55 | 114 | ;; |
// fnorm.s rounds the refined quotient down to single precision in the
// user-visible status field (.s0) so traps/flags are raised correctly.
938566fb RH |
115 | (p6) fnorm.s.s0 fret0 = f10 |
116 | (p7) mov fret0 = f10 | |
c65ebc55 JW |
117 | br.ret.sptk rp |
118 | ;; | |
119 | .endp __divsf3 | |
120 | #endif | |
121 | ||
122 | #ifdef L__divdi3 | |
123 | // Compute a 64-bit integer quotient. | |
124 | // | |
d8d7a286 RH |
125 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
126 | // alternative. | |
c65ebc55 | 127 | // |
d8d7a286 | 128 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
129 | |
130 | .text | |
131 | .align 16 | |
132 | .global __divdi3 | |
133 | .proc __divdi3 | |
134 | __divdi3: | |
135 | .regstk 2,0,0,0 | |
136 | // Transfer inputs to FP registers. | |
137 | setf.sig f8 = in0 | |
138 | setf.sig f9 = in1 | |
139 | ;; | |
140 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
141 | fcvt.xf f8 = f8 | |
142 | fcvt.xf f9 = f9 | |
143 | ;; | |
144 | // Compute the reciprocal approximation. | |
660a0ebd | 145 | frcpa.s1 f10, p6 = f8, f9 |
2a7ffc85 | 146 | ;; |
c65ebc55 | 147 | // 3 Newton-Raphson iterations. |
d8d7a286 RH |
148 | (p6) fnma.s1 f11 = f9, f10, f1 |
149 | (p6) fmpy.s1 f12 = f8, f10 | |
c65ebc55 | 150 | ;; |
d8d7a286 RH |
151 | (p6) fmpy.s1 f13 = f11, f11 |
152 | (p6) fma.s1 f12 = f11, f12, f12 | |
c65ebc55 | 153 | ;; |
d8d7a286 RH |
154 | (p6) fma.s1 f10 = f11, f10, f10 |
155 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 156 | ;; |
d8d7a286 RH |
157 | (p6) fma.s1 f10 = f13, f10, f10 |
158 | (p6) fnma.s1 f12 = f9, f11, f8 | |
c65ebc55 | 159 | ;; |
// q = f12 * f10 + f11 (remainder * reciprocal + quotient estimate).
// NOTE(review): presumably accurate enough that the truncation below is
// exact for all int64 operands -- per Intel's divide algorithm notes.
d8d7a286 | 160 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
161 | ;; |
162 | // Round quotient to an integer. | |
d8d7a286 | 163 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 JW |
164 | ;; |
165 | // Transfer result to GP registers. | |
d8d7a286 | 166 | getf.sig ret0 = f10 |
c65ebc55 JW |
167 | br.ret.sptk rp |
168 | ;; | |
169 | .endp __divdi3 | |
170 | #endif | |
171 | ||
172 | #ifdef L__moddi3 | |
173 | // Compute a 64-bit integer modulus. | |
174 | // | |
d8d7a286 RH |
175 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
176 | // alternative. | |
c65ebc55 | 177 | // |
d8d7a286 | 178 | // in0 holds the dividend (a). in1 holds the divisor (b). |
c65ebc55 JW |
179 | |
180 | .text | |
181 | .align 16 | |
182 | .global __moddi3 | |
183 | .proc __moddi3 | |
184 | __moddi3: | |
185 | .regstk 2,0,0,0 | |
186 | // Transfer inputs to FP registers. | |
// f14 keeps the raw integer dividend for the final xma.l fixup below.
d8d7a286 | 187 | setf.sig f14 = in0 |
c65ebc55 JW |
188 | setf.sig f9 = in1 |
189 | ;; | |
190 | // Convert the inputs to FP, so that they won't be treated as unsigned. | |
d8d7a286 | 191 | fcvt.xf f8 = f14 |
c65ebc55 JW |
192 | fcvt.xf f9 = f9 |
193 | ;; | |
194 | // Compute the reciprocal approximation. | |
660a0ebd | 195 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
196 | ;; |
197 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
198 | (p6) fmpy.s1 f12 = f8, f10 |
199 | (p6) fnma.s1 f11 = f9, f10, f1 | |
c65ebc55 | 200 | ;; |
d8d7a286 RH |
201 | (p6) fma.s1 f12 = f11, f12, f12 |
202 | (p6) fmpy.s1 f13 = f11, f11 | |
c65ebc55 | 203 | ;; |
d8d7a286 RH |
204 | (p6) fma.s1 f10 = f11, f10, f10 |
205 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 206 | ;; |
// The divisor is negated on the integer side in parallel with the FP
// iteration so the final xma.l can compute r = q * (-b) + a directly.
d8d7a286 RH |
207 | sub in1 = r0, in1 |
208 | (p6) fma.s1 f10 = f13, f10, f10 | |
c65ebc55 JW |
209 | (p6) fnma.s1 f12 = f9, f11, f8 |
210 | ;; | |
d8d7a286 | 211 | setf.sig f9 = in1 |
660a0ebd | 212 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 | 213 | ;; |
214 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 215 | ;; |
d8d7a286 RH |
216 | // r = q * (-b) + a |
217 | xma.l f10 = f10, f9, f14 | |
c65ebc55 JW |
218 | ;; |
219 | // Transfer result to GP registers. | |
d8d7a286 | 220 | getf.sig ret0 = f10 |
c65ebc55 JW |
221 | br.ret.sptk rp |
222 | ;; | |
223 | .endp __moddi3 | |
224 | #endif | |
225 | ||
226 | #ifdef L__udivdi3 | |
227 | // Compute a 64-bit unsigned integer quotient. | |
228 | // | |
d8d7a286 RH |
229 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
230 | // alternative. | |
c65ebc55 | 231 | // |
d8d7a286 | 232 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
233 | |
234 | .text | |
235 | .align 16 | |
236 | .global __udivdi3 | |
237 | .proc __udivdi3 | |
238 | __udivdi3: | |
239 | .regstk 2,0,0,0 | |
240 | // Transfer inputs to FP registers. | |
241 | setf.sig f8 = in0 | |
242 | setf.sig f9 = in1 | |
243 | ;; | |
244 | // Convert the inputs to FP, to avoid FP software-assist faults. | |
660a0ebd JW |
245 | fcvt.xuf.s1 f8 = f8 |
246 | fcvt.xuf.s1 f9 = f9 | |
c65ebc55 JW |
247 | ;; |
248 | // Compute the reciprocal approximation. | |
660a0ebd | 249 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
250 | ;; |
251 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
252 | (p6) fnma.s1 f11 = f9, f10, f1 |
253 | (p6) fmpy.s1 f12 = f8, f10 | |
c65ebc55 | 254 | ;; |
d8d7a286 RH |
255 | (p6) fmpy.s1 f13 = f11, f11 |
256 | (p6) fma.s1 f12 = f11, f12, f12 | |
c65ebc55 | 257 | ;; |
d8d7a286 RH |
258 | (p6) fma.s1 f10 = f11, f10, f10 |
259 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 260 | ;; |
d8d7a286 RH |
261 | (p6) fma.s1 f10 = f13, f10, f10 |
262 | (p6) fnma.s1 f12 = f9, f11, f8 | |
c65ebc55 | 263 | ;; |
// q = f12 * f10 + f11 (remainder * reciprocal + quotient estimate).
2a7ffc85 | 264 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
265 | ;; |
266 | // Round quotient to an unsigned integer. | |
d8d7a286 | 267 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 JW |
268 | ;; |
269 | // Transfer result to GP registers. | |
d8d7a286 | 270 | getf.sig ret0 = f10 |
c65ebc55 JW |
271 | br.ret.sptk rp |
272 | ;; | |
273 | .endp __udivdi3 | |
274 | #endif | |
275 | ||
276 | #ifdef L__umoddi3 | |
277 | // Compute a 64-bit unsigned integer modulus. | |
278 | // | |
d8d7a286 RH |
279 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
280 | // alternative. | |
c65ebc55 | 281 | // |
d8d7a286 | 282 | // in0 holds the dividend (a). in1 holds the divisor (b). |
c65ebc55 JW |
283 | |
284 | .text | |
285 | .align 16 | |
286 | .global __umoddi3 | |
287 | .proc __umoddi3 | |
288 | __umoddi3: | |
289 | .regstk 2,0,0,0 | |
290 | // Transfer inputs to FP registers. | |
// f14 keeps the raw integer dividend for the final xma.l fixup below.
d8d7a286 | 291 | setf.sig f14 = in0 |
c65ebc55 JW |
292 | setf.sig f9 = in1 |
293 | ;; | |
294 | // Convert the inputs to FP, to avoid FP software assist faults. | |
d8d7a286 | 295 | fcvt.xuf.s1 f8 = f14 |
660a0ebd | 296 | fcvt.xuf.s1 f9 = f9 |
c65ebc55 JW |
297 | ;; |
298 | // Compute the reciprocal approximation. | |
660a0ebd | 299 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 JW |
300 | ;; |
301 | // 3 Newton-Raphson iterations. | |
d8d7a286 RH |
302 | (p6) fmpy.s1 f12 = f8, f10 |
303 | (p6) fnma.s1 f11 = f9, f10, f1 | |
c65ebc55 | 304 | ;; |
d8d7a286 RH |
305 | (p6) fma.s1 f12 = f11, f12, f12 |
306 | (p6) fmpy.s1 f13 = f11, f11 | |
c65ebc55 | 307 | ;; |
d8d7a286 RH |
308 | (p6) fma.s1 f10 = f11, f10, f10 |
309 | (p6) fma.s1 f11 = f13, f12, f12 | |
c65ebc55 | 310 | ;; |
// The divisor is negated on the integer side in parallel with the FP
// iteration so the final xma.l can compute r = q * (-b) + a directly.
d8d7a286 RH |
311 | sub in1 = r0, in1 |
312 | (p6) fma.s1 f10 = f13, f10, f10 | |
c65ebc55 JW |
313 | (p6) fnma.s1 f12 = f9, f11, f8 |
314 | ;; | |
d8d7a286 | 315 | setf.sig f9 = in1 |
660a0ebd | 316 | (p6) fma.s1 f10 = f12, f10, f11 |
c65ebc55 JW |
317 | ;; |
318 | // Round quotient to an unsigned integer. | |
660a0ebd | 319 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 320 | ;; |
d8d7a286 RH |
321 | // r = q * (-b) + a |
322 | xma.l f10 = f10, f9, f14 | |
c65ebc55 JW |
323 | ;; |
324 | // Transfer result to GP registers. | |
d8d7a286 | 325 | getf.sig ret0 = f10 |
c65ebc55 JW |
326 | br.ret.sptk rp |
327 | ;; | |
328 | .endp __umoddi3 | |
329 | #endif | |
330 | ||
331 | #ifdef L__divsi3 | |
332 | // Compute a 32-bit integer quotient. | |
333 | // | |
d8d7a286 RH |
334 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
335 | // alternative. | |
c65ebc55 | 336 | // |
d8d7a286 | 337 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
338 | |
339 | .text | |
340 | .align 16 | |
341 | .global __divsi3 | |
342 | .proc __divsi3 | |
343 | __divsi3: | |
344 | .regstk 2,0,0,0 | |
// Sign-extend the 32-bit operands before converting, so the FP divide
// below sees their true signed values.
d8d7a286 RH |
345 | sxt4 in0 = in0 |
346 | sxt4 in1 = in1 | |
347 | ;; | |
c65ebc55 JW |
348 | setf.sig f8 = in0 |
349 | setf.sig f9 = in1 | |
350 | ;; | |
// NOTE(review): 0x0ffdd is a biased exponent (register-format bias
// 0xffff), so setf.exp below presumably builds f11 = 1.0 * 2^-34; it pads
// the correction term so this shortened iteration truncates exactly for
// 32-bit operands -- confirm against Intel's divide algorithm note.
d8d7a286 | 351 | mov r2 = 0x0ffdd |
c65ebc55 JW |
352 | fcvt.xf f8 = f8 |
353 | fcvt.xf f9 = f9 | |
354 | ;; | |
d8d7a286 | 355 | setf.exp f11 = r2 |
4287b5f1 | 356 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 357 | ;; |
d8d7a286 RH |
358 | (p6) fmpy.s1 f8 = f8, f10 |
359 | (p6) fnma.s1 f9 = f9, f10, f1 | |
c65ebc55 | 360 | ;; |
d8d7a286 RH |
361 | (p6) fma.s1 f8 = f9, f8, f8 |
362 | (p6) fma.s1 f9 = f9, f9, f11 | |
c65ebc55 | 363 | ;; |
d8d7a286 | 364 | (p6) fma.s1 f10 = f9, f8, f8 |
c65ebc55 | 365 | ;; |
366 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 367 | ;; |
d8d7a286 | 368 | getf.sig ret0 = f10 |
c65ebc55 JW |
369 | br.ret.sptk rp |
370 | ;; | |
371 | .endp __divsi3 | |
372 | #endif | |
373 | ||
374 | #ifdef L__modsi3 | |
375 | // Compute a 32-bit integer modulus. | |
376 | // | |
d8d7a286 RH |
377 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
378 | // alternative. | |
c65ebc55 | 379 | // |
d8d7a286 | 380 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
381 | |
382 | .text | |
383 | .align 16 | |
384 | .global __modsi3 | |
385 | .proc __modsi3 | |
386 | __modsi3: | |
387 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff); it pads the correction term of the short iteration below.
d8d7a286 RH |
388 | mov r2 = 0x0ffdd |
389 | sxt4 in0 = in0 | |
390 | sxt4 in1 = in1 | |
391 | ;; | |
// r32/r33 are in0/in1 under .regstk 2.  f13 keeps the integer dividend
// for the final xma.l; f9 is reloaded with the negated divisor below.
392 | setf.sig f13 = r32 | |
c65ebc55 JW |
393 | setf.sig f9 = r33 |
394 | ;; | |
d8d7a286 RH |
395 | sub in1 = r0, in1 |
396 | fcvt.xf f8 = f13 | |
c65ebc55 JW |
397 | fcvt.xf f9 = f9 |
398 | ;; | |
d8d7a286 | 399 | setf.exp f11 = r2 |
4287b5f1 | 400 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 401 | ;; |
d8d7a286 RH |
402 | (p6) fmpy.s1 f12 = f8, f10 |
403 | (p6) fnma.s1 f10 = f9, f10, f1 | |
c65ebc55 | 404 | ;; |
d8d7a286 RH |
405 | setf.sig f9 = in1 |
406 | (p6) fma.s1 f12 = f10, f12, f12 | |
407 | (p6) fma.s1 f10 = f10, f10, f11 | |
c65ebc55 | 408 | ;; |
d8d7a286 | 409 | (p6) fma.s1 f10 = f10, f12, f12 |
c65ebc55 | 410 | ;; |
411 | fcvt.fx.trunc.s1 f10 = f10 |
c65ebc55 | 412 | ;; |
// r = q * (-b) + a, all in the 64-bit integer multiply-add unit.
d8d7a286 | 413 | xma.l f10 = f10, f9, f13 |
c65ebc55 | 414 | ;; |
415 | getf.sig ret0 = f10 |
c65ebc55 JW |
416 | br.ret.sptk rp |
417 | ;; | |
418 | .endp __modsi3 | |
419 | #endif | |
420 | ||
421 | #ifdef L__udivsi3 | |
422 | // Compute a 32-bit unsigned integer quotient. | |
423 | // | |
d8d7a286 RH |
424 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
425 | // alternative. | |
c65ebc55 | 426 | // |
d8d7a286 | 427 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
428 | |
429 | .text | |
430 | .align 16 | |
431 | .global __udivsi3 | |
432 | .proc __udivsi3 | |
433 | __udivsi3: | |
434 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff), padding the correction term of the shortened iteration.
// zxt4 zero-extends the 32-bit unsigned operands first.
d8d7a286 RH |
435 | mov r2 = 0x0ffdd |
436 | zxt4 in0 = in0 | |
437 | zxt4 in1 = in1 | |
c65ebc55 | 438 | ;; |
d8d7a286 RH |
439 | setf.sig f8 = in0 |
440 | setf.sig f9 = in1 | |
c65ebc55 | 441 | ;; |
4287b5f1 RH |
442 | fcvt.xf f8 = f8 |
443 | fcvt.xf f9 = f9 | |
444 | ;; | |
d8d7a286 | 445 | setf.exp f11 = r2 |
4287b5f1 | 446 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 447 | ;; |
d8d7a286 RH |
448 | (p6) fmpy.s1 f8 = f8, f10 |
449 | (p6) fnma.s1 f9 = f9, f10, f1 | |
c65ebc55 | 450 | ;; |
d8d7a286 RH |
451 | (p6) fma.s1 f8 = f9, f8, f8 |
452 | (p6) fma.s1 f9 = f9, f9, f11 | |
c65ebc55 | 453 | ;; |
d8d7a286 | 454 | (p6) fma.s1 f10 = f9, f8, f8 |
c65ebc55 | 455 | ;; |
456 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 457 | ;; |
d8d7a286 | 458 | getf.sig ret0 = f10 |
c65ebc55 JW |
459 | br.ret.sptk rp |
460 | ;; | |
461 | .endp __udivsi3 | |
462 | #endif | |
463 | ||
464 | #ifdef L__umodsi3 | |
465 | // Compute a 32-bit unsigned integer modulus. | |
466 | // | |
d8d7a286 RH |
467 | // From the Intel IA-64 Optimization Guide, choose the minimum latency |
468 | // alternative. | |
c65ebc55 | 469 | // |
d8d7a286 | 470 | // in0 holds the dividend. in1 holds the divisor. |
c65ebc55 JW |
471 | |
472 | .text | |
473 | .align 16 | |
474 | .global __umodsi3 | |
475 | .proc __umodsi3 | |
476 | __umodsi3: | |
477 | .regstk 2,0,0,0 | |
// 0x0ffdd seeds f11 via setf.exp (presumably 2^-34, register-format bias
// 0xffff), padding the correction term of the shortened iteration.
d8d7a286 RH |
478 | mov r2 = 0x0ffdd |
479 | zxt4 in0 = in0 | |
480 | zxt4 in1 = in1 | |
c65ebc55 | 481 | ;; |
// f13 keeps the integer dividend for the final xma.l fixup.
d8d7a286 RH |
482 | setf.sig f13 = in0 |
483 | setf.sig f9 = in1 | |
c65ebc55 | 484 | ;; |
// Negate the divisor on the integer side so the final xma.l computes
// r = q * (-b) + a directly.
d8d7a286 RH |
485 | sub in1 = r0, in1 |
486 | fcvt.xf f8 = f13 | |
487 | fcvt.xf f9 = f9 | |
c65ebc55 | 488 | ;; |
d8d7a286 | 489 | setf.exp f11 = r2 |
4287b5f1 | 490 | frcpa.s1 f10, p6 = f8, f9 |
c65ebc55 | 491 | ;; |
d8d7a286 RH |
492 | (p6) fmpy.s1 f12 = f8, f10 |
493 | (p6) fnma.s1 f10 = f9, f10, f1 | |
c65ebc55 | 494 | ;; |
2a7ffc85 | 495 | setf.sig f9 = in1 |
d8d7a286 RH |
496 | (p6) fma.s1 f12 = f10, f12, f12 |
497 | (p6) fma.s1 f10 = f10, f10, f11 | |
c65ebc55 | 498 | ;; |
d8d7a286 | 499 | (p6) fma.s1 f10 = f10, f12, f12 |
c65ebc55 | 500 | ;; |
501 | fcvt.fxu.trunc.s1 f10 = f10 |
c65ebc55 | 502 | ;; |
d8d7a286 | 503 | xma.l f10 = f10, f9, f13 |
c65ebc55 | 504 | ;; |
505 | getf.sig ret0 = f10 |
c65ebc55 JW |
506 | br.ret.sptk rp |
507 | ;; | |
508 | .endp __umodsi3 | |
509 | #endif | |
510 | ||
511 | #ifdef L__save_stack_nonlocal | |
512 | // Notes on save/restore stack nonlocal: We read ar.bsp but write | |
513 | // ar.bspstore. This is because ar.bsp can be read at all times | |
514 | // (independent of the RSE mode) but since it's read-only we need to | |
515 | // restore the value via ar.bspstore. This is OK because | |
516 | // ar.bsp==ar.bspstore after executing "flushrs". | |
517 | ||
518 | // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) | |
519 | ||
520 | .text | |
521 | .align 16 | |
522 | .global __ia64_save_stack_nonlocal | |
523 | .proc __ia64_save_stack_nonlocal | |
524 | __ia64_save_stack_nonlocal: | |
// Save-area layout as written by the stores below:
//   [0]  = stack pointer (in1)      [8]  = ar.bsp
//   [16] = ar.rnat                  [24] = ar.pfs (from alloc)
// ar.rsc handling: the mode bits are masked off (and 0x1c) while ar.rnat
// is read, then 0x3 is or'ed back in before returning.
// NOTE(review): confirm the rsc bit layout against the architecture manual.
97e242b0 RH |
525 | { .mmf |
526 | alloc r18 = ar.pfs, 2, 0, 0, 0 | |
527 | mov r19 = ar.rsc | |
528 | ;; | |
529 | } | |
530 | { .mmi | |
531 | flushrs | |
532 | st8 [in0] = in1, 24 | |
533 | and r19 = 0x1c, r19 | |
534 | ;; | |
535 | } | |
536 | { .mmi | |
537 | st8 [in0] = r18, -16 | |
538 | mov ar.rsc = r19 | |
539 | or r19 = 0x3, r19 | |
540 | ;; | |
541 | } | |
542 | { .mmi | |
543 | mov r16 = ar.bsp | |
544 | mov r17 = ar.rnat | |
545 | adds r2 = 8, in0 | |
546 | ;; | |
547 | } | |
548 | { .mmi | |
549 | st8 [in0] = r16 | |
550 | st8 [r2] = r17 | |
551 | } | |
552 | { .mib | |
553 | mov ar.rsc = r19 | |
554 | br.ret.sptk.few rp | |
555 | ;; | |
556 | } | |
c65ebc55 JW |
557 | .endp __ia64_save_stack_nonlocal |
558 | #endif | |
559 | ||
560 | #ifdef L__nonlocal_goto | |
561 | // void __ia64_nonlocal_goto(void *target_label, void *save_area, |
c65ebc55 JW |
562 | // void *static_chain); |
563 | ||
564 | .text | |
565 | .align 16 | |
566 | .global __ia64_nonlocal_goto | |
567 | .proc __ia64_nonlocal_goto | |
568 | __ia64_nonlocal_goto: | |
// Reads the save area in the order written by __ia64_save_stack_nonlocal:
// sp (-> r12), bsp (-> ar.bspstore), rnat, ar.pfs.  rp is pre-loaded with
// the target label, so the final br.ret both unwinds the register stack
// and transfers control there.  r15 carries the static chain --
// NOTE(review): register choice must match the compiler's
// STATIC_CHAIN_REGNUM; confirm.
97e242b0 RH |
569 | { .mmi |
570 | alloc r20 = ar.pfs, 3, 0, 0, 0 | |
571 | ld8 r12 = [in1], 8 | |
572 | mov.ret.sptk rp = in0, .L0 | |
573 | ;; | |
574 | } | |
575 | { .mmf | |
576 | ld8 r16 = [in1], 8 | |
577 | mov r19 = ar.rsc | |
578 | ;; | |
579 | } | |
580 | { .mmi | |
581 | flushrs | |
582 | ld8 r17 = [in1], 8 | |
583 | and r19 = 0x1c, r19 | |
584 | ;; | |
585 | } | |
586 | { .mmi | |
587 | ld8 r18 = [in1] | |
588 | mov ar.rsc = r19 | |
589 | or r19 = 0x3, r19 | |
590 | ;; | |
591 | } | |
592 | { .mmi | |
593 | mov ar.bspstore = r16 | |
594 | ;; | |
595 | mov ar.rnat = r17 | |
596 | ;; | |
597 | } | |
598 | { .mmi | |
599 | loadrs | |
600 | invala | |
601 | mov r15 = in2 | |
602 | ;; | |
603 | } | |
604 | .L0: { .mib | |
605 | mov ar.rsc = r19 | |
606 | mov ar.pfs = r18 | |
607 | br.ret.sptk.few rp | |
608 | ;; | |
c65ebc55 | 609 | }
c65ebc55 JW |
610 | .endp __ia64_nonlocal_goto
611 | #endif | |
9525c690 JW |
612 | |
613 | #ifdef L__restore_stack_nonlocal | |
614 | // This is mostly the same as nonlocal_goto above. | |
615 | // ??? This has not been tested yet. | |
616 | ||
617 | // void __ia64_restore_stack_nonlocal(void *save_area) | |
618 | ||
619 | .text | |
620 | .align 16 | |
621 | .global __ia64_restore_stack_nonlocal | |
622 | .proc __ia64_restore_stack_nonlocal | |
623 | __ia64_restore_stack_nonlocal: | |
// Restores sp/bspstore/rnat/ar.pfs from the save area, then returns to
// the caller's rp (unlike nonlocal_goto, rp is not redirected).
// NOTE(review): alloc declares 4 inputs although the prototype above
// takes one; the extra input slots appear unused here -- verify.
97e242b0 RH |
624 | { .mmf |
625 | alloc r20 = ar.pfs, 4, 0, 0, 0 | |
626 | ld8 r12 = [in0], 8 | |
627 | ;; | |
628 | } | |
629 | { .mmb | |
630 | ld8 r16=[in0], 8 | |
631 | mov r19 = ar.rsc | |
632 | ;; | |
633 | } | |
634 | { .mmi | |
635 | flushrs | |
636 | ld8 r17 = [in0], 8 | |
637 | and r19 = 0x1c, r19 | |
638 | ;; | |
639 | } | |
640 | { .mmf | |
641 | ld8 r18 = [in0] | |
642 | mov ar.rsc = r19 | |
643 | ;; | |
644 | } | |
645 | { .mmi | |
646 | mov ar.bspstore = r16 | |
647 | ;; | |
648 | mov ar.rnat = r17 | |
649 | or r19 = 0x3, r19 | |
650 | ;; | |
651 | } | |
652 | { .mmf | |
653 | loadrs | |
654 | invala | |
655 | ;; | |
656 | } | |
657 | .L0: { .mib | |
658 | mov ar.rsc = r19 | |
659 | mov ar.pfs = r18 | |
660 | br.ret.sptk.few rp | |
661 | ;; | |
9525c690 | 662 | }
9525c690 JW |
663 | .endp __ia64_restore_stack_nonlocal
664 | #endif | |
97e242b0 RH |
665 | |
666 | #ifdef L__trampoline | |
667 | // Implement the nested function trampoline. This is out of line | |
668 | // so that we don't have to bother with flushing the icache, as | |
669 | // well as making the on-stack trampoline smaller. | |
670 | // | |
671 | // The trampoline has the following form: | |
672 | // | |
0024a804 | 673 | // +-------------------+ > |
97e242b0 RH |
674 | // TRAMP: | __ia64_trampoline | | |
675 | // +-------------------+ > fake function descriptor | |
676 | // | TRAMP+16 | | | |
0024a804 | 677 | // +-------------------+ > |
97e242b0 RH |
678 | // | target descriptor | |
679 | // +-------------------+ | |
680 | // | static link | | |
681 | // +-------------------+ | |
682 | ||
683 | .text | |
684 | .align 16 | |
685 | .global __ia64_trampoline | |
686 | .proc __ia64_trampoline | |
687 | __ia64_trampoline: | |
// On entry, the fake descriptor has set r1 (gp) = TRAMP+16, i.e. the
// trampoline's data words.  Load the target descriptor address and the
// static link (-> r15), then the real entry point (-> b6) and the real
// gp (-> r1) from the target descriptor, and tail-branch to the target.
688 | { .mmi | |
689 | ld8 r2 = [r1], 8 | |
690 | ;; | |
691 | ld8 r15 = [r1] | |
692 | } | |
693 | { .mmi | |
694 | ld8 r3 = [r2], 8 | |
695 | ;; | |
696 | ld8 r1 = [r2] | |
697 | mov b6 = r3 | |
698 | } | |
699 | { .bbb | |
700 | br.sptk.many b6 | |
701 | ;; | |
702 | } | |
703 | .endp __ia64_trampoline | |
704 | #endif |