]> git.ipfire.org Git - thirdparty/glibc.git/blame - sysdeps/powerpc/fpu/e_sqrt.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrt.c
CommitLineData
/* Double-precision floating point square root.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

19#include <math.h>
20#include <math_private.h>
418d99e6 21#include <fenv.h>
ffdd5e50
UD
22#include <fenv_libc.h>
23#include <inttypes.h>
e054f494 24#include <stdint.h>
ffdd5e50
UD
25#include <sysdep.h>
26#include <ldsodefs.h>
ffdd5e50 27
08cee2a4 28#ifndef _ARCH_PPCSQ
ffdd5e50
UD
29static const double almost_half = 0.5000000000000001; /* 0.5 + 2^-53 */
30static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
31static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
32static const float two108 = 3.245185536584267269e+32;
33static const float twom54 = 5.551115123125782702e-17;
34extern const float __t_sqrt[1024];
35
/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.  The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */
ffdd5e50 49
ffdd5e50
UD
50double
51__slow_ieee754_sqrt (double x)
ffdd5e50
UD
52{
53 const float inf = a_inf.value;
54
55 if (x > 0)
56 {
57 /* schedule the EXTRACT_WORDS to get separation between the store
0ac5ae23 58 and the load. */
ffdd5e50
UD
59 ieee_double_shape_type ew_u;
60 ieee_double_shape_type iw_u;
61 ew_u.value = (x);
62 if (x != inf)
63 {
64 /* Variables named starting with 's' exist in the
65 argument-reduced space, so that 2 > sx >= 0.5,
66 1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
67 Variables named ending with 'i' are integer versions of
68 floating-point values. */
69 double sx; /* The value of which we're trying to find the
70 square root. */
71 double sg, g; /* Guess of the square root of x. */
72 double sd, d; /* Difference between the square of the guess and x. */
73 double sy; /* Estimate of 1/2g (overestimated by 1ulp). */
74 double sy2; /* 2*sy */
75 double e; /* Difference between y*g and 1/2 (se = e * fsy). */
76 double shx; /* == sx * fsg */
77 double fsg; /* sg*fsg == g. */
78 fenv_t fe; /* Saved floating-point environment (stores rounding
79 mode and whether the inexact exception is
80 enabled). */
81 uint32_t xi0, xi1, sxi, fsgi;
82 const float *t_sqrt;
83
84 fe = fegetenv_register ();
85 /* complete the EXTRACT_WORDS (xi0,xi1,x) operation. */
86 xi0 = ew_u.parts.msw;
87 xi1 = ew_u.parts.lsw;
88 relax_fenv_state ();
89 sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
90 /* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
91 between the store and the load. */
92 iw_u.parts.msw = sxi;
93 iw_u.parts.lsw = xi1;
94 t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
95 sg = t_sqrt[0];
96 sy = t_sqrt[1];
97 /* complete the INSERT_WORDS (sx, sxi, xi1) operation. */
98 sx = iw_u.value;
99
868f7a40 100 /* Here we have three Newton-Raphson iterations each of a
ffdd5e50
UD
101 division and a square root and the remainder of the
102 argument reduction, all interleaved. */
e8bd5286 103 sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50
UD
104 fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
105 sy2 = sy + sy;
e8bd5286
JM
106 sg = __builtin_fma (sy, sd, sg); /* 16-bit approximation to
107 sqrt(sx). */
ffdd5e50
UD
108
109 /* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
110 between the store and the load. */
111 INSERT_WORDS (fsg, fsgi, 0);
112 iw_u.parts.msw = fsgi;
113 iw_u.parts.lsw = (0);
e8bd5286
JM
114 e = -__builtin_fma (sy, sg, -almost_half);
115 sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50
UD
116 if ((xi0 & 0x7ff00000) == 0)
117 goto denorm;
e8bd5286
JM
118 sy = __builtin_fma (e, sy2, sy);
119 sg = __builtin_fma (sy, sd, sg); /* 32-bit approximation to
120 sqrt(sx). */
ffdd5e50
UD
121 sy2 = sy + sy;
122 /* complete the INSERT_WORDS (fsg, fsgi, 0) operation. */
123 fsg = iw_u.value;
e8bd5286
JM
124 e = -__builtin_fma (sy, sg, -almost_half);
125 sd = -__builtin_fma (sg, sg, -sx);
126 sy = __builtin_fma (e, sy2, sy);
ffdd5e50 127 shx = sx * fsg;
e8bd5286
JM
128 sg = __builtin_fma (sy, sd, sg); /* 64-bit approximation to
129 sqrt(sx), but perhaps
130 rounded incorrectly. */
ffdd5e50
UD
131 sy2 = sy + sy;
132 g = sg * fsg;
e8bd5286
JM
133 e = -__builtin_fma (sy, sg, -almost_half);
134 d = -__builtin_fma (g, sg, -shx);
135 sy = __builtin_fma (e, sy2, sy);
ffdd5e50 136 fesetenv_register (fe);
e8bd5286 137 return __builtin_fma (sy, d, g);
ffdd5e50
UD
138 denorm:
139 /* For denormalised numbers, we normalise, calculate the
140 square root, and return an adjusted result. */
141 fesetenv_register (fe);
142 return __slow_ieee754_sqrt (x * two108) * twom54;
143 }
144 }
145 else if (x < 0)
146 {
147 /* For some reason, some PowerPC32 processors don't implement
0ac5ae23 148 FE_INVALID_SQRT. */
ffdd5e50 149#ifdef FE_INVALID_SQRT
0747f818 150 __feraiseexcept (FE_INVALID_SQRT);
c3a0ead4
UD
151
152 fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca 153 if ((u.l & FE_INVALID) == 0)
ffdd5e50 154#endif
0747f818 155 __feraiseexcept (FE_INVALID);
ffdd5e50
UD
156 x = a_nan.value;
157 }
158 return f_wash (x);
159}
08cee2a4 160#endif /* _ARCH_PPCSQ */
ffdd5e50 161
8a6d5255 162#undef __ieee754_sqrt
ffdd5e50
UD
/* Return the square root of X.

   When _ARCH_PPCSQ is defined the result comes from a single fsqrt
   instruction; otherwise it comes from the software routine
   __slow_ieee754_sqrt.  */
double
__ieee754_sqrt (double x)
{
  double z;

#ifdef _ARCH_PPCSQ
  asm ("fsqrt %0,%1\n" : "=f" (z) : "f" (x));
#else
  z = __slow_ieee754_sqrt (x);
#endif

  return z;
}
0ac5ae23 176strong_alias (__ieee754_sqrt, __sqrt_finite)