From: Bob Wilson <bob.wilson@acm.org>
Date: Thu, 20 Dec 2007 22:35:59 +0000 (+0000)
Subject: xtensa.md (fix_return_addr): Remove.
X-Git-Tag: releases/gcc-4.3.0~857
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7f0ee69424f02f4c46bc13e6a3c77248b4d04bbe;p=thirdparty%2Fgcc.git

xtensa.md (fix_return_addr): Remove.

	* config/xtensa/xtensa.md (fix_return_addr): Remove.
	* config/xtensa/xtensa-protos.h (xtensa_initialize_trampoline): New.
	(xtensa_trampoline_template): New.
	* config/xtensa/xtensa.c (MIN_FRAME_SIZE): Moved here from xtensa.h.
	(xtensa_return_addr): Expand to standard Xtensa insns instead of
	fix_return_addr.  Get high bits from a local label.
	(xtensa_trampoline_template): New function with code moved from
	TRAMPOLINE_TEMPLATE in xtensa.h.  Use L32R instead of CALL0 except
	when using CONST16 or absolute-mode literals.
	(xtensa_initialize_trampoline): New function with code moved from
	INITIALIZE_TRAMPOLINE in xtensa.h.  Use different offsets depending
	on which trampoline version is used.
	* config/xtensa/lib2funcs.S (TRAMPOLINE_SIZE): Add comment.
	* config/xtensa/xtensa.h (TARGET_ABSOLUTE_LITERALS): Define.
	(MIN_FRAME_SIZE): Moved to xtensa.c.
	(TRAMPOLINE_TEMPLATE): Use xtensa_trampoline_template.
	(TRAMPOLINE_SIZE): Two versions of the trampoline have different sizes.
	(INITIALIZE_TRAMPOLINE): Use xtensa_initialize_trampoline.
	* config/xtensa/ieee754-df.S (XCHAL_NO_MUL): Define.
	(__muldf3): Use CALL12 instead of CALL0 to invoke .Lmul_mulsi3
	helper when not using the CALL0 ABI.  Change .Lmul_mulsi3 to match.
	* config/xtensa/lib1funcs.asm (__umulsidi3): Likewise.
	* config/xtensa/ieee754-sf.S (__mulsf3): Likewise.

From-SVN: r131108
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 33922e029067..9674a7c0877f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,29 @@
+2007-12-20  Bob Wilson  <bob.wilson@acm.org>
+
+	* config/xtensa/xtensa.md (fix_return_addr): Remove.
+	* config/xtensa/xtensa-protos.h (xtensa_initialize_trampoline): New.
+	(xtensa_trampoline_template): New.
+	* config/xtensa/xtensa.c (MIN_FRAME_SIZE): Moved here from xtensa.h.
+	(xtensa_return_addr): Expand to standard Xtensa insns instead of
+	fix_return_addr.  Get high bits from a local label.
+	(xtensa_trampoline_template): New function with code moved from
+	TRAMPOLINE_TEMPLATE in xtensa.h.  Use L32R instead of CALL0 except
+	when using CONST16 or absolute-mode literals.
+	(xtensa_initialize_trampoline): New function with code moved from
+	INITIALIZE_TRAMPOLINE in xtensa.h.  Use different offsets depending
+	on which trampoline version is used.
+	* config/xtensa/lib2funcs.S (TRAMPOLINE_SIZE): Add comment.
+	* config/xtensa/xtensa.h (TARGET_ABSOLUTE_LITERALS): Define.
+	(MIN_FRAME_SIZE): Moved to xtensa.c.
+	(TRAMPOLINE_TEMPLATE): Use xtensa_trampoline_template.
+	(TRAMPOLINE_SIZE): Two versions of the trampoline have different sizes.
+	(INITIALIZE_TRAMPOLINE): Use xtensa_initialize_trampoline.
+	* config/xtensa/ieee754-df.S (XCHAL_NO_MUL): Define.
+	(__muldf3): Use CALL12 instead of CALL0 to invoke .Lmul_mulsi3
+	helper when not using the CALL0 ABI.  Change .Lmul_mulsi3 to match.
+	* config/xtensa/lib1funcs.asm (__umulsidi3): Likewise.
+	* config/xtensa/ieee754-sf.S (__mulsf3): Likewise.
+	
 2007-12-20  Jakub Jelinek  <jakub@redhat.com>
 
 	PR c++/34459
diff --git a/gcc/config/xtensa/ieee754-df.S b/gcc/config/xtensa/ieee754-df.S
index 711b10c9df31..381e6ce31f34 100644
--- a/gcc/config/xtensa/ieee754-df.S
+++ b/gcc/config/xtensa/ieee754-df.S
@@ -1,5 +1,5 @@
 /* IEEE-754 double-precision functions for Xtensa
-   Copyright (C) 2006 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
    This file is part of GCC.
@@ -607,6 +607,10 @@ __subdf3:
 #ifdef L_muldf3
 
 	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 __muldf3_aux:
 
 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -728,13 +732,19 @@ __muldf3_aux:
 	.global	__muldf3
 	.type	__muldf3, @function
 __muldf3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
 #endif
 	movi	a6, 0x7ff00000
 
@@ -809,7 +819,7 @@ __muldf3:
 	muluh	xh, xh, yh
 	add	xh, xh, a9
 
-#else
+#else /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
 	   products.  These partial products are:
@@ -847,7 +857,7 @@ __muldf3:
 
 	/* Save a7 since it is needed to hold a temporary value.  */
 	s32i	a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Calling a separate multiply function will clobber a0 and requires
 	   use of a8 as a temporary, so save those values now.  (The function
 	   uses a custom ABI so nothing else needs to be saved.)  */
@@ -915,12 +925,21 @@ __muldf3:
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a10 with carry-out in a9.  */
 	do_mul(a10, xl, l, yl, h)	/* pp 1 */
@@ -1032,11 +1051,11 @@ __muldf3:
 
 	/* Restore values saved on the stack during the multiplication.  */
 	l32i	a7, sp, 4
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	l32i	a0, sp, 0
 	l32i	a8, sp, 8
 #endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Shift left by 12 bits, unless there was a carry-out from the
 	   multiply, in which case, shift by 11 bits and increment the
@@ -1157,38 +1176,47 @@ __muldf3:
 	movi	xl, 0
 	j	.Lmul_done
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 	
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 #endif /* L_muldf3 */
 
 #ifdef L_divdf3
diff --git a/gcc/config/xtensa/ieee754-sf.S b/gcc/config/xtensa/ieee754-sf.S
index a75e742898b7..abb641d9456b 100644
--- a/gcc/config/xtensa/ieee754-sf.S
+++ b/gcc/config/xtensa/ieee754-sf.S
@@ -1,5 +1,5 @@
 /* IEEE-754 single-precision functions for Xtensa
-   Copyright (C) 2006 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
    This file is part of GCC.
@@ -488,6 +488,10 @@ __subsf3:
 #ifdef L_mulsf3
 
 	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 __mulsf3_aux:
 
 	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
@@ -570,13 +574,19 @@ __mulsf3_aux:
 	.global	__mulsf3
 	.type	__mulsf3, @function
 __mulsf3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
 #endif
 	movi	a6, 0x7f800000
 
@@ -633,7 +643,7 @@ __mulsf3:
 	   chunks can be extracted when setting up the arguments to the
 	   separate multiply function.  */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Calling a separate multiply function will clobber a0 and requires
 	   use of a8 as a temporary, so save those values now.  (The function
 	   uses a custom ABI so nothing else needs to be saved.)  */
@@ -693,12 +703,21 @@ __mulsf3:
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
 	do_mul(a6, a2, l, a3, h)	/* pp 1 */
@@ -724,12 +743,12 @@ __mulsf3:
 	do_mul(a2, a2, h, a3, h)	/* pp 3 */
 	add	a2, a2, a9
 	
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Restore values saved on the stack during the multiplication.  */
 	l32i	a0, sp, 0
 	l32i	a8, sp, 4
 #endif
-#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
 
 	/* Shift left by 9 bits, unless there was a carry-out from the
 	   multiply, in which case, shift by 8 bits and increment the
@@ -825,38 +844,47 @@ __mulsf3:
 	slli	a2, a2, 31
 	j	.Lmul_done
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 	
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 #endif /* L_mulsf3 */
 
 #ifdef L_divsf3
diff --git a/gcc/config/xtensa/lib1funcs.asm b/gcc/config/xtensa/lib1funcs.asm
index 27b67c43d6fb..69162f036e99 100644
--- a/gcc/config/xtensa/lib1funcs.asm
+++ b/gcc/config/xtensa/lib1funcs.asm
@@ -201,17 +201,28 @@ __mulsi3:
 
 
 #ifdef L_umulsidi3
+
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
 	.align	4
 	.global	__umulsidi3
 	.type	__umulsidi3, @function
 __umulsidi3:
-	leaf_entry sp, 32
 #if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
 	addi	sp, sp, -32
 	s32i	a12, sp, 16
 	s32i	a13, sp, 20
 	s32i	a14, sp, 24
 	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 48
+#else
+	leaf_entry sp, 16
 #endif
 
 #ifdef __XTENSA_EB__
@@ -232,7 +243,7 @@ __umulsidi3:
 
 #else /* ! MUL32_HIGH */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* a0 and a8 will be clobbered by calling the multiply function
 	   but a8 is not used here and need not be saved.  */
 	s32i	a0, sp, 0
@@ -290,12 +301,21 @@ __umulsidi3:
 #define set_arg_h(dst, src) \
 	srli	dst, src, 16
 
+#if __XTENSA_CALL0_ABI__
 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
 	set_arg_ ## xhalf (a13, xreg); \
 	set_arg_ ## yhalf (a14, yreg); \
 	call0	.Lmul_mulsi3; \
 	mov	dst, a12
-#endif
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
 
 	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
 	do_mul(a6, a2, l, a3, h)	/* pp 1 */
@@ -324,7 +344,7 @@ __umulsidi3:
 
 #endif /* !MUL32_HIGH */
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
 	/* Restore the original return address.  */
 	l32i	a0, sp, 0
 #endif
@@ -337,38 +357,47 @@ __umulsidi3:
 #endif
 	leaf_return
 
-#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#if XCHAL_NO_MUL
 
 	/* For Xtensa processors with no multiply hardware, this simplified
 	   version of _mulsi3 is used for multiplying 16-bit chunks of
-	   the floating-point mantissas.  It uses a custom ABI:	the inputs
-	   are passed in a13 and a14, the result is returned in a12, and
-	   a8 and a15 are clobbered.  */
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
 	.align	4
 .Lmul_mulsi3:
-	movi	a12, 0
-.Lmul_mult_loop:
-	add	a15, a14, a12
-	extui	a8, a13, 0, 1
-	movnez	a12, a15, a8
-
-	do_addx2 a15, a14, a12, a15
-	extui	a8, a13, 1, 1
-	movnez	a12, a15, a8
-
-	do_addx4 a15, a14, a12, a15
-	extui	a8, a13, 2, 1
-	movnez	a12, a15, a8
-
-	do_addx8 a15, a14, a12, a15
-	extui	a8, a13, 3, 1
-	movnez	a12, a15, a8
-
-	srli	a13, a13, 4
-	slli	a14, a14, 4
-	bnez	a13, .Lmul_mult_loop
-	ret
-#endif /* !MUL16 && !MUL32 && !MAC16 */
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
 
 	.size	__umulsidi3, . - __umulsidi3
 
diff --git a/gcc/config/xtensa/lib2funcs.S b/gcc/config/xtensa/lib2funcs.S
index 7e01a6ea6e19..16d6734c2773 100644
--- a/gcc/config/xtensa/lib2funcs.S
+++ b/gcc/config/xtensa/lib2funcs.S
@@ -1,5 +1,5 @@
 /* Assembly functions for libgcc2.
-   Copyright (C) 2001, 2006 Free Software Foundation, Inc.
+   Copyright (C) 2001, 2006, 2007 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
 
 This file is part of GCC.
@@ -151,6 +151,7 @@ __xtensa_nonlocal_goto:
    make sure that the modified instructions are loaded into the instruction
    fetch buffer.  */
 
+/* Use the maximum trampoline size.  Flushing a bit extra is OK.  */
 #define TRAMPOLINE_SIZE 60
 
 	.text
diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h
index 5ea777ca2328..82d7262922de 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -69,6 +69,7 @@ extern enum reg_class xtensa_preferred_reload_class (rtx, enum reg_class, int);
 extern enum reg_class xtensa_secondary_reload_class (enum reg_class,
 						     enum machine_mode, rtx,
 						     int);
+extern void xtensa_initialize_trampoline (rtx, rtx, rtx);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
@@ -85,5 +86,6 @@ extern long compute_frame_size (int);
 extern int xtensa_frame_pointer_required (void);
 extern void xtensa_expand_prologue (void);
 extern void order_regs_for_local_alloc (void);
+extern void xtensa_trampoline_template (FILE *);
 
 #endif /* !__XTENSA_PROTOS_H__ */
diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c
index f98a66518692..62eadbb97982 100644
--- a/gcc/config/xtensa/xtensa.c
+++ b/gcc/config/xtensa/xtensa.c
@@ -2301,6 +2301,10 @@ xtensa_frame_pointer_required (void)
 }
 
 
+/* Minimum frame = reg save area (4 words) plus static chain (1 word),
+   rounded up so that the total size is a multiple of 128 bits.  */
+#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
+
 void
 xtensa_expand_prologue (void)
 {
@@ -2379,7 +2383,7 @@ xtensa_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
 rtx
 xtensa_return_addr (int count, rtx frame)
 {
-  rtx result, retaddr;
+  rtx result, retaddr, curaddr, label;
 
   if (count == -1)
     retaddr = gen_rtx_REG (Pmode, A0_REG);
@@ -2393,10 +2397,25 @@ xtensa_return_addr (int count, rtx frame)
 
   /* The 2 most-significant bits of the return address on Xtensa hold
      the register window size.  To get the real return address, these
-     bits must be replaced with the high bits from the current PC.  */
-
+     bits must be replaced with the high bits from some address in the
+     code.  */
+
+  /* Get the 2 high bits of a local label in the code.  */
+  curaddr = gen_reg_rtx (Pmode);
+  label = gen_label_rtx ();
+  emit_label (label);
+  LABEL_PRESERVE_P (label) = 1;
+  emit_move_insn (curaddr, gen_rtx_LABEL_REF (Pmode, label));
+  emit_insn (gen_lshrsi3 (curaddr, curaddr, GEN_INT (30)));
+  emit_insn (gen_ashlsi3 (curaddr, curaddr, GEN_INT (30)));
+
+  /* Clear the 2 high bits of the return address.  */
   result = gen_reg_rtx (Pmode);
-  emit_insn (gen_fix_return_addr (result, retaddr));
+  emit_insn (gen_ashlsi3 (result, retaddr, GEN_INT (2)));
+  emit_insn (gen_lshrsi3 (result, result, GEN_INT (2)));
+
+  /* Combine them to get the result.  */
+  emit_insn (gen_iorsi3 (result, result, curaddr));
   return result;
 }
 
@@ -3126,4 +3145,95 @@ xtensa_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
 	  > 4 * UNITS_PER_WORD);
 }
 
+
+/* TRAMPOLINE_TEMPLATE: For Xtensa, the trampoline must perform an ENTRY
+   instruction with a minimal stack frame in order to get some free
+   registers.  Once the actual call target is known, the proper stack frame
+   size is extracted from the ENTRY instruction at the target and the
+   current frame is adjusted to match.  The trampoline then transfers
+   control to the instruction following the ENTRY at the target.  Note:
+   this assumes that the target begins with an ENTRY instruction.  */
+
+void
+xtensa_trampoline_template (FILE *stream)
+{
+  bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+
+  fprintf (stream, "\t.begin no-transform\n");
+  fprintf (stream, "\tentry\tsp, %d\n", MIN_FRAME_SIZE);
+
+  if (use_call0)
+    {
+      /* Save the return address in a10; the CALL0 below overwrites a0.  */
+      fprintf (stream, "\tmov\ta10, a0\n");
+
+      /* Use a CALL0 instruction to skip past the constants and in the
+	 process get the PC into A0.  This allows PC-relative access to
+	 the constants without relying on L32R.  */
+      fprintf (stream, "\tcall0\t.Lskipconsts\n");
+    }
+  else
+    fprintf (stream, "\tj\t.Lskipconsts\n");
+
+  fprintf (stream, "\t.align\t4\n");
+  fprintf (stream, ".Lchainval:%s0\n", integer_asm_op (4, TRUE));
+  fprintf (stream, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE));
+  fprintf (stream, ".Lskipconsts:\n");
+
+  /* Load the static chain into a9 and the function address into a8.  */
+  if (use_call0)
+    {
+      fprintf (stream, "\taddi\ta0, a0, 3\n");
+      fprintf (stream, "\tl32i\ta9, a0, 0\n");
+      fprintf (stream, "\tl32i\ta8, a0, 4\n");
+    }
+  else
+    {
+      fprintf (stream, "\tl32r\ta9, .Lchainval\n");
+      fprintf (stream, "\tl32r\ta8, .Lfnaddr\n");
+    }
+
+  /* Store the static chain (a9) into the minimal frame.  */
+  fprintf (stream, "\ts32i\ta9, sp, %d\n", MIN_FRAME_SIZE - 20);
+
+  /* Adjust sp to the frame size read from the target's ENTRY insn.  */
+  fprintf (stream, "\tl32i\ta9, a8, 0\n");
+  fprintf (stream, "\textui\ta9, a9, %d, 12\n",
+	   TARGET_BIG_ENDIAN ? 8 : 12);
+  fprintf (stream, "\tslli\ta9, a9, 3\n");
+  fprintf (stream, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE);
+  fprintf (stream, "\tsub\ta9, sp, a9\n");
+  fprintf (stream, "\tmovsp\tsp, a9\n");
+
+  if (use_call0)
+    /* Restore the return address that was saved in a10.  */
+    fprintf (stream, "\tmov\ta0, a10\n");
+
+  /* Jump to the instruction following the target's ENTRY (3 bytes in).  */
+  fprintf (stream, "\taddi\ta8, a8, 3\n");
+  fprintf (stream, "\tjx\ta8\n");
+
+  /* Pad size to a multiple of TRAMPOLINE_ALIGNMENT.  */
+  if (use_call0)
+    fprintf (stream, "\t.byte\t0\n");
+  else
+    fprintf (stream, "\tnop\n");
+
+  fprintf (stream, "\t.end no-transform\n");
+}
+
+
+void
+xtensa_initialize_trampoline (rtx addr, rtx func, rtx chain)
+{
+  bool use_call0 = (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS);
+  int chain_off = use_call0 ? 12 : 8;  /* Offset of static chain slot.  */
+  int func_off = use_call0 ? 16 : 12;  /* Offset of function address slot.  */
+  emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, chain_off)), chain);
+  emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, func_off)), func);
+  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"),
+		     0, VOIDmode, 1, addr, Pmode);
+}
+
+
 #include "gt-xtensa.h"
diff --git a/gcc/config/xtensa/xtensa.h b/gcc/config/xtensa/xtensa.h
index 79cd05f7fb25..7e0e9400aadb 100644
--- a/gcc/config/xtensa/xtensa.h
+++ b/gcc/config/xtensa/xtensa.h
@@ -72,6 +72,7 @@ extern unsigned xtensa_current_frame_size;
 #define TARGET_ADDX		XCHAL_HAVE_ADDX
 #define TARGET_RELEASE_SYNC	XCHAL_HAVE_RELEASE_SYNC
 #define TARGET_S32C1I		XCHAL_HAVE_S32C1I
+#define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS
 
 #define TARGET_DEFAULT (						\
   (XCHAL_HAVE_L32R	? 0 : MASK_CONST16))
@@ -704,83 +705,19 @@ typedef struct xtensa_args
 /* Stack pointer value doesn't matter at exit.  */
 #define EXIT_IGNORE_STACK 1
 
-/* A C statement to output, on the stream FILE, assembler code for a
-   block of data that contains the constant parts of a trampoline. 
-   This code should not include a label--the label is taken care of
-   automatically.
-
-   For Xtensa, the trampoline must perform an entry instruction with a
-   minimal stack frame in order to get some free registers.  Once the
-   actual call target is known, the proper stack frame size is extracted
-   from the entry instruction at the target and the current frame is
-   adjusted to match.  The trampoline then transfers control to the
-   instruction following the entry at the target.  Note: this assumes
-   that the target begins with an entry instruction.  */
-
-/* minimum frame = reg save area (4 words) plus static chain (1 word)
-   and the total number of words must be a multiple of 128 bits */
-#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD)
-
-#define TRAMPOLINE_TEMPLATE(STREAM)					\
-  do {									\
-    fprintf (STREAM, "\t.begin no-transform\n");			\
-    fprintf (STREAM, "\tentry\tsp, %d\n", MIN_FRAME_SIZE);		\
-									\
-    /* save the return address */					\
-    fprintf (STREAM, "\tmov\ta10, a0\n");				\
-									\
-    /* Use a CALL0 instruction to skip past the constants and in the	\
-       process get the PC into A0.  This allows PC-relative access to	\
-       the constants without relying on L32R, which may not always be	\
-       available.  */							\
-									\
-    fprintf (STREAM, "\tcall0\t.Lskipconsts\n");			\
-    fprintf (STREAM, "\t.align\t4\n");					\
-    fprintf (STREAM, ".Lchainval:%s0\n", integer_asm_op (4, TRUE));	\
-    fprintf (STREAM, ".Lfnaddr:%s0\n", integer_asm_op (4, TRUE));	\
-    fprintf (STREAM, ".Lskipconsts:\n");				\
-									\
-    /* store the static chain */					\
-    fprintf (STREAM, "\taddi\ta0, a0, 3\n");				\
-    fprintf (STREAM, "\tl32i\ta8, a0, 0\n");				\
-    fprintf (STREAM, "\ts32i\ta8, sp, %d\n", MIN_FRAME_SIZE - 20);	\
-									\
-    /* set the proper stack pointer value */				\
-    fprintf (STREAM, "\tl32i\ta8, a0, 4\n");				\
-    fprintf (STREAM, "\tl32i\ta9, a8, 0\n");				\
-    fprintf (STREAM, "\textui\ta9, a9, %d, 12\n",			\
-	     TARGET_BIG_ENDIAN ? 8 : 12);				\
-    fprintf (STREAM, "\tslli\ta9, a9, 3\n");				\
-    fprintf (STREAM, "\taddi\ta9, a9, %d\n", -MIN_FRAME_SIZE);		\
-    fprintf (STREAM, "\tsub\ta9, sp, a9\n");				\
-    fprintf (STREAM, "\tmovsp\tsp, a9\n");				\
-									\
-    /* restore the return address */					\
-    fprintf (STREAM, "\tmov\ta0, a10\n");				\
-									\
-    /* jump to the instruction following the entry */			\
-    fprintf (STREAM, "\taddi\ta8, a8, 3\n");				\
-    fprintf (STREAM, "\tjx\ta8\n");					\
-    fprintf (STREAM, "\t.byte\t0\n");					\
-    fprintf (STREAM, "\t.end no-transform\n");				\
-  } while (0)
+#define TRAMPOLINE_TEMPLATE(STREAM) xtensa_trampoline_template (STREAM)
 
 /* Size in bytes of the trampoline, as an integer.  Make sure this is
    a multiple of TRAMPOLINE_ALIGNMENT to avoid -Wpadded warnings.  */
-#define TRAMPOLINE_SIZE 60
+#define TRAMPOLINE_SIZE (TARGET_CONST16 || TARGET_ABSOLUTE_LITERALS ? 60 : 52)
 
 /* Alignment required for trampolines, in bits.  */
-#define TRAMPOLINE_ALIGNMENT (32)
+#define TRAMPOLINE_ALIGNMENT 32
 
 /* A C statement to initialize the variable parts of a trampoline.  */
 #define INITIALIZE_TRAMPOLINE(ADDR, FUNC, CHAIN)			\
-  do {									\
-    rtx addr = ADDR;							\
-    emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 12)), CHAIN); \
-    emit_move_insn (gen_rtx_MEM (SImode, plus_constant (addr, 16)), FUNC); \
-    emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__xtensa_sync_caches"), \
-		       0, VOIDmode, 1, addr, Pmode);			\
-  } while (0)
+  xtensa_initialize_trampoline (ADDR, FUNC, CHAIN)
+
 
 /* If defined, a C expression that produces the machine-specific code
    to setup the stack so that arbitrary frames can be accessed.
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 37e29e700392..3774a61131a8 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1666,21 +1666,6 @@
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
-;; The fix_return_addr pattern sets the high 2 bits of an address in a
-;; register to match the high bits of the current PC.
-(define_insn "fix_return_addr"
-  [(set (match_operand:SI 0 "register_operand" "=a")
-	(unspec:SI [(match_operand:SI 1 "register_operand" "r")]
-		   UNSPEC_RET_ADDR))
-   (clobber (match_scratch:SI 2 "=r"))
-   (clobber (match_scratch:SI 3 "=r"))]
-  ""
-  "mov\t%2, a0\;call0\t0f\;.align\t4\;0:\;mov\t%3, a0\;mov\ta0, %2\;\
-srli\t%3, %3, 30\;slli\t%0, %1, 2\;ssai\t2\;src\t%0, %3, %0"
-  [(set_attr "type"	"multi")
-   (set_attr "mode"	"SI")
-   (set_attr "length"	"24")])
-
 
 ;; Instructions for the Xtensa "boolean" option.