From 975c8c4e22f73fb60996f6bcc2cf1a6f6af70928 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 2 Nov 2025 08:47:53 +0800
Subject: [PATCH] i386: Simplify powl computation for small integral y [BZ
 #33586]

On i386, tests added by

commit 1b657c53c21a100082b0855392e4cb40c9c43a87
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date:   Fri Oct 10 20:21:13 2025 -0400

    Simplify powl computation for small integral y [BZ #33411]

exposed the same bug in i386 e_powl.S:

FAIL: math/test-float64x-pow
original exit status 1
testing _Float64x (without inline functions)
Failure: pow (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_downward (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_towardzero (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_upward (0x1p+8192, 0x1p+0): Exception "Overflow" set

FAIL: math/test-ldouble-pow
original exit status 1
testing long double (without inline functions)
Failure: pow (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_downward (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_towardzero (0x1p+8192, 0x1p+0): Exception "Overflow" set
Failure: pow_upward (0x1p+8192, 0x1p+0): Exception "Overflow" set

Port x86-64 e_powl.S fix to i386 e_powl.S.  This fixes BZ #33586.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
---
 sysdeps/i386/fpu/e_powl.S | 45 +++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 21 deletions(-)
diff --git a/sysdeps/i386/fpu/e_powl.S b/sysdeps/i386/fpu/e_powl.S
index 9452e352a0..766d64c8b9 100644
--- a/sysdeps/i386/fpu/e_powl.S
+++ b/sysdeps/i386/fpu/e_powl.S
@@ -166,29 +166,32 @@ ENTRY(__ieee754_powl)
 	adcl	$0, %edx
 	negl	%edx
 4:	fldl	MO(one)		// 1 : x
-	fxch
 
-	/* If y is even, take the absolute value of x.  Otherwise,
-	   ensure all intermediate values that might overflow have the
-	   sign of x.  */
+	/* y range is further reduced to [0, 3].  Simply walk through the
+	   options.  First up, 0 and 1.  */
+	test	%eax, %eax
+	jz	6f
+	fxch			// x : 1
+	subl	$1, %eax
+	jz	6f
+
+	/* Finally, y == 2 and 3.  For y == 3 we compute |x| * x * |x| so that
+	   every intermediate product that might overflow keeps the sign of x.  */
+	fld	%st		// x : x : 1
+	fabs			// |x| : x : 1
+	fxch			// x : |x| : 1
+	fld	%st(1)		// |x| : x : |x| : 1
 	testb	$1, %al
-	jnz	6f
-	fabs
-
-6:	shrdl	$1, %edx, %eax
-	jnc	5f
-	fxch
-	fabs
-	fmul	%st(1)		// x : ST*x
-	fxch
-5:	fld	%st		// x : x : ST*x
-	fabs			// |x| : x : ST*x
-	fmulp			// |x|*x : ST*x
-	shrl	$1, %edx
-	movl	%eax, %ecx
-	orl	%edx, %ecx
-	jnz	6b
-	fstp	%st(0)		// ST*x
+	jz	7f
+	fmulp	%st(2)		// x : |x| * |x| : 1
+	fstp	%st(0)		// |x| * |x| : 1
+	jmp	6f
+7:	fmulp			// |x| * x : |x| : 1
+	fmulp			// |x| * x * |x| : 1
+
+	/* We come here with the stack as RES : <something>; store RES over
+	   <something> and pop, leaving only RES.  */
+6:	fstp	%st(1)
 #ifdef	PIC
 	LOAD_PIC_REG (cx)
 #endif
-- 
2.47.3