]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/fpu/e_sqrtf.c
Optimize libm
[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrtf.c
CommitLineData
ffdd5e50 1/* Single-precision floating point square root.
0ac5ae23 2 Copyright (C) 1997, 2003, 2004, 2008, 2011 Free Software Foundation, Inc.
ffdd5e50
UD
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
19
20#include <math.h>
21#include <math_private.h>
22#include <fenv_libc.h>
23#include <inttypes.h>
24
25#include <sysdep.h>
26#include <ldsodefs.h>
ffdd5e50
UD
27
28static const float almost_half = 0.50000006; /* 0.5 + 2^-24 */
29static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
30static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
31static const float two48 = 281474976710656.0;
32static const float twom24 = 5.9604644775390625e-8;
33extern const float __t_sqrt[1024];
34
/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.  The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */
ffdd5e50 48
ffdd5e50
UD
49float
50__slow_ieee754_sqrtf (float x)
ffdd5e50
UD
51{
52 const float inf = a_inf.value;
53
54 if (x > 0)
55 {
56 if (x != inf)
57 {
58 /* Variables named starting with 's' exist in the
59 argument-reduced space, so that 2 > sx >= 0.5,
60 1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
61 Variables named ending with 'i' are integer versions of
62 floating-point values. */
63 float sx; /* The value of which we're trying to find the square
64 root. */
65 float sg, g; /* Guess of the square root of x. */
66 float sd, d; /* Difference between the square of the guess and x. */
67 float sy; /* Estimate of 1/2g (overestimated by 1ulp). */
68 float sy2; /* 2*sy */
69 float e; /* Difference between y*g and 1/2 (note that e==se). */
70 float shx; /* == sx * fsg */
71 float fsg; /* sg*fsg == g. */
72 fenv_t fe; /* Saved floating-point environment (stores rounding
73 mode and whether the inexact exception is
74 enabled). */
75 uint32_t xi, sxi, fsgi;
76 const float *t_sqrt;
77
78 GET_FLOAT_WORD (xi, x);
79 fe = fegetenv_register ();
80 relax_fenv_state ();
81 sxi = (xi & 0x3fffffff) | 0x3f000000;
82 SET_FLOAT_WORD (sx, sxi);
83 t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
84 sg = t_sqrt[0];
85 sy = t_sqrt[1];
86
868f7a40 87 /* Here we have three Newton-Raphson iterations each of a
ffdd5e50
UD
88 division and a square root and the remainder of the
89 argument reduction, all interleaved. */
90 sd = -(sg * sg - sx);
91 fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
92 sy2 = sy + sy;
93 sg = sy * sd + sg; /* 16-bit approximation to sqrt(sx). */
94 e = -(sy * sg - almost_half);
95 SET_FLOAT_WORD (fsg, fsgi);
96 sd = -(sg * sg - sx);
97 sy = sy + e * sy2;
98 if ((xi & 0x7f800000) == 0)
99 goto denorm;
100 shx = sx * fsg;
101 sg = sg + sy * sd; /* 32-bit approximation to sqrt(sx),
102 but perhaps rounded incorrectly. */
103 sy2 = sy + sy;
104 g = sg * fsg;
105 e = -(sy * sg - almost_half);
106 d = -(g * sg - shx);
107 sy = sy + e * sy2;
108 fesetenv_register (fe);
109 return g + sy * d;
110 denorm:
111 /* For denormalised numbers, we normalise, calculate the
112 square root, and return an adjusted result. */
113 fesetenv_register (fe);
114 return __slow_ieee754_sqrtf (x * two48) * twom24;
115 }
116 }
117 else if (x < 0)
118 {
119 /* For some reason, some PowerPC32 processors don't implement
0ac5ae23 120 FE_INVALID_SQRT. */
ffdd5e50
UD
121#ifdef FE_INVALID_SQRT
122 feraiseexcept (FE_INVALID_SQRT);
c3a0ead4
UD
123
124 fenv_union_t u = { .fenv = fegetenv_register () };
125 if ((u.l[1] & FE_INVALID) == 0)
ffdd5e50
UD
126#endif
127 feraiseexcept (FE_INVALID);
128 x = a_nan.value;
129 }
130 return f_washf (x);
131}
132
133
ffdd5e50
UD
134float
135__ieee754_sqrtf (float x)
ffdd5e50
UD
136{
137 double z;
138
433f49c4
UD
139 /* If the CPU is 64-bit we can use the optional FP instructions. */
140 if (__CPU_HAS_FSQRT)
ffdd5e50 141 {
c3a0ead4 142 /* Volatile is required to prevent the compiler from moving the
0ac5ae23 143 fsqrt instruction above the branch. */
ffdd5e50
UD
144 __asm __volatile (" fsqrts %0,%1\n"
145 :"=f" (z):"f" (x));
146 }
147 else
148 z = __slow_ieee754_sqrtf (x);
149
150 return z;
151}
0ac5ae23 152strong_alias (__ieee754_sqrtf, __sqrtf_finite)