[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrt.c

/* Double-precision floating point square root.
   Copyright (C) 1997-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <fenv_libc.h>
#include <libm-alias-finite.h>
#include <math-use-builtins.h>

/* Return the double-precision square root of X.

   When USE_SQRT_BUILTIN is nonzero the work is delegated to
   __builtin_sqrt.  Otherwise:
     - positive finite X is handled by the software Newton-Raphson scheme
       described below;
     - positive denormal X is rescaled by 2^108, the root is taken
       recursively, and the result is multiplied by 2^-54;
     - negative X raises the invalid exception and produces NaN;
     - zero, NaN and +Inf take neither arithmetic path and are returned
       through f_wash unchanged.  */
double
__ieee754_sqrt (double x)
{
#if USE_SQRT_BUILTIN
  return __builtin_sqrt (x);
#else
/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

  /* Table of initial guesses, defined elsewhere.  It is read below as
     consecutive pairs: an even index holds the starting value for sg,
     the following odd index the starting value for sy.  */
  extern const float __t_sqrt[1024];

  if (x > 0)
    {
      /* schedule the EXTRACT_WORDS to get separation between the store
	 and the load.  */
      ieee_double_shape_type ew_u;
      ieee_double_shape_type iw_u;
      ew_u.value = (x);
      /* +Inf skips the iteration entirely and is returned as-is by the
	 common return at the bottom of the function.  */
      if (x != INFINITY)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  double sx;	/* The value of which we're trying to find the
			   square root.  */
	  double sg, g;	/* Guess of the square root of x.  */
	  double sd, d;	/* Difference between the square of the guess and x.  */
	  double sy;	/* Estimate of 1/2g (overestimated by 1ulp).  */
	  double sy2;	/* 2*sy */
	  double e;	/* Difference between y*g and 1/2 (se = e * fsy).  */
	  double shx;	/* == sx * fsg */
	  double fsg;	/* sg*fsg == g.  */
	  fenv_t fe;	/* Saved floating-point environment (stores rounding
			   mode and whether the inexact exception is
			   enabled).  */
	  uint32_t xi0, xi1, sxi, fsgi;
	  const float *t_sqrt;

	  fe = fegetenv_register ();
	  /* complete the EXTRACT_WORDS (xi0,xi1,x) operation.  */
	  xi0 = ew_u.parts.msw;
	  xi1 = ew_u.parts.lsw;
	  relax_fenv_state ();
	  /* Build the high word of sx: drop the sign bit and the top
	     exponent bit, then force every exponent bit except the lowest
	     to the pattern 0x3fe.  The biased exponent of sx is therefore
	     0x3fe or 0x3ff, i.e. 0.5 <= sx < 2, and the mantissa of x is
	     kept unchanged.  */
	  sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
	  /* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
	     between the store and the load.  */
	  iw_u.parts.msw = sxi;
	  iw_u.parts.lsw = xi1;
	  /* Index the table with bits 20..12 of the high word (the lowest
	     exponent bit followed by the top eight mantissa bits).  The
	     shift leaves them in bits 9..1, so the index is always even
	     and selects one (sg, sy) pair.  */
	  t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];
	  /* complete the INSERT_WORDS (sx, sxi, xi1) operation.  */
	  sx = iw_u.value;

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -__builtin_fma (sg, sg, -sx);
	  /* High word of the scale factor fsg.  Only exponent bits survive
	     the mask, so fsg has an all-zero mantissa.  */
	  fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
	  sy2 = sy + sy;
	  sg = __builtin_fma (sy, sd, sg);	/* 16-bit approximation to
						   sqrt(sx). */

	  /* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
	     between the store and the load.  Only the stores are issued
	     here; fsg itself is loaded from iw_u further down, just before
	     its first use.  */
	  iw_u.parts.msw = fsgi;
	  iw_u.parts.lsw = (0);
	  e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
	  sd = -__builtin_fma (sg, sg, -sx);
	  /* A zero exponent field with x > 0 means x is denormal.  */
	  if ((xi0 & 0x7ff00000) == 0)
	    goto denorm;
	  sy = __builtin_fma (e, sy2, sy);
	  sg = __builtin_fma (sy, sd, sg);	/* 32-bit approximation to
						   sqrt(sx).  */
	  sy2 = sy + sy;
	  /* complete the INSERT_WORDS (fsg, fsgi, 0) operation.  */
	  fsg = iw_u.value;
	  e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
	  sd = -__builtin_fma (sg, sg, -sx);
	  sy = __builtin_fma (e, sy2, sy);
	  shx = sx * fsg;
	  sg = __builtin_fma (sy, sd, sg);	/* 64-bit approximation to
						   sqrt(sx), but perhaps
						   rounded incorrectly.  */
	  sy2 = sy + sy;
	  g = sg * fsg;
	  e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
	  d = -__builtin_fma (g, sg, -shx);
	  sy = __builtin_fma (e, sy2, sy);
	  /* Put back the caller's floating-point environment before the
	     final multiply-add that produces the returned value.  */
	  fesetenv_register (fe);
	  return __builtin_fma (sy, d, g);
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  The scale
	     factors are paired: sqrt (2^108) == 2^54.  */
	  fesetenv_register (fe);
	  return __ieee754_sqrt (x * 0x1p+108f) * 0x1p-54f;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
# ifdef FE_INVALID_SQRT
      __feraiseexcept (FE_INVALID_SQRT);

      /* Raise the generic FE_INVALID only if the raise above did not
	 already leave it set in the environment register.  When
	 FE_INVALID_SQRT is not defined, the raise below is
	 unconditional.  */
      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l & FE_INVALID) == 0)
# endif
	__feraiseexcept (FE_INVALID);
      x = NAN;
    }
  /* Reached for zero, NaN, +Inf, and for negative input (with x already
     replaced by NaN).  */
  return f_wash (x);
#endif /* USE_SQRT_BUILTIN  */
}

/* Apply the libm_alias_finite macro (presumably provided by
   <libm-alias-finite.h>, included above) to __ieee754_sqrt with the base
   name __sqrt.  NOTE(review): this looks like it publishes the
   __sqrt_finite compatibility symbol -- confirm against the macro's
   definition.  */
libm_alias_finite (__ieee754_sqrt, __sqrt)
Commit	Line	Data
ffdd5e50	1	/* Double-precision floating point square root.
2b778ceb	2	Copyright (C) 1997-2021 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
ffdd5e50 UD	18
	19	#include <math.h>
	20	#include <math_private.h>
	21	#include <fenv_libc.h>
220622dd	22	#include <libm-alias-finite.h>
169ea8f9	23	#include <math-use-builtins.h>
ffdd5e50	24
169ea8f9 AZ	25	double
	26	__ieee754_sqrt (double x)
	27	{
	28	#if USE_SQRT_BUILTIN
	29	return __builtin_sqrt (x);
	30	#else
ffdd5e50 UD	31	/* The method is based on a description in
	32	Computation of elementary functions on the IBM RISC System/6000 processor,
	33	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	34	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	35	one to find the actual square root, and one to find its reciprocal
	36	without the expense of a division operation. The tricky bit here
	37	is the use of the POWER/PowerPC multiply-add operation to get the
	38	required accuracy with high speed.
	39
	40	The argument reduction works by a combination of table lookup to
	41	obtain the initial guesses, and some careful modification of the
	42	generated guesses (which mostly runs on the integer unit, while the
868f7a40	43	Newton-Raphson is running on the FPU). */
ffdd5e50	44
169ea8f9	45	extern const float __t_sqrt[1024];
ffdd5e50 UD	46
	47	if (x > 0)
	48	{
	49	/* schedule the EXTRACT_WORDS to get separation between the store
0ac5ae23	50	and the load. */
ffdd5e50 UD	51	ieee_double_shape_type ew_u;
	52	ieee_double_shape_type iw_u;
	53	ew_u.value = (x);
169ea8f9	54	if (x != INFINITY)
ffdd5e50 UD	55	{
	56	/* Variables named starting with 's' exist in the
	57	argument-reduced space, so that 2 > sx >= 0.5,
	58	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	59	Variables named ending with 'i' are integer versions of
	60	floating-point values. */
	61	double sx; /* The value of which we're trying to find the
	62	square root. */
	63	double sg, g; /* Guess of the square root of x. */
	64	double sd, d; /* Difference between the square of the guess and x. */
	65	double sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	66	double sy2; /* 2*sy */
	67	double e; /* Difference between y*g and 1/2 (se = e * fsy). */
	68	double shx; /* == sx * fsg */
	69	double fsg; /* sg*fsg == g. */
	70	fenv_t fe; /* Saved floating-point environment (stores rounding
	71	mode and whether the inexact exception is
	72	enabled). */
	73	uint32_t xi0, xi1, sxi, fsgi;
	74	const float *t_sqrt;
	75
	76	fe = fegetenv_register ();
	77	/* complete the EXTRACT_WORDS (xi0,xi1,x) operation. */
	78	xi0 = ew_u.parts.msw;
	79	xi1 = ew_u.parts.lsw;
	80	relax_fenv_state ();
	81	sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
	82	/* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
	83	between the store and the load. */
	84	iw_u.parts.msw = sxi;
	85	iw_u.parts.lsw = xi1;
	86	t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
	87	sg = t_sqrt[0];
	88	sy = t_sqrt[1];
	89	/* complete the INSERT_WORDS (sx, sxi, xi1) operation. */
	90	sx = iw_u.value;
	91
868f7a40	92	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	93	division and a square root and the remainder of the
ffdd5e50 UD	94	argument reduction, all interleaved. */
e8bd5286	95	sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50 UD	96	fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
ffdd5e50 UD	97	sy2 = sy + sy;
e8bd5286 JM	98	sg = __builtin_fma (sy, sd, sg); /* 16-bit approximation to
e8bd5286 JM	99	sqrt(sx). */
ffdd5e50 UD	100
	101	/* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
	102	between the store and the load. */
	103	INSERT_WORDS (fsg, fsgi, 0);
	104	iw_u.parts.msw = fsgi;
	105	iw_u.parts.lsw = (0);
169ea8f9	106	e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
e8bd5286	107	sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50 UD	108	if ((xi0 & 0x7ff00000) == 0)
ffdd5e50 UD	109	goto denorm;
e8bd5286 JM	110	sy = __builtin_fma (e, sy2, sy);
	111	sg = __builtin_fma (sy, sd, sg); /* 32-bit approximation to
	112	sqrt(sx). */
ffdd5e50 UD	113	sy2 = sy + sy;
	114	/* complete the INSERT_WORDS (fsg, fsgi, 0) operation. */
	115	fsg = iw_u.value;
169ea8f9	116	e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
e8bd5286 JM	117	sd = -__builtin_fma (sg, sg, -sx);
e8bd5286 JM	118	sy = __builtin_fma (e, sy2, sy);
ffdd5e50	119	shx = sx * fsg;
e8bd5286 JM	120	sg = __builtin_fma (sy, sd, sg); /* 64-bit approximation to
	121	sqrt(sx), but perhaps
	122	rounded incorrectly. */
ffdd5e50 UD	123	sy2 = sy + sy;
ffdd5e50 UD	124	g = sg * fsg;
169ea8f9	125	e = -__builtin_fma (sy, sg, -0x1.0000000000001p-1);
e8bd5286 JM	126	d = -__builtin_fma (g, sg, -shx);
e8bd5286 JM	127	sy = __builtin_fma (e, sy2, sy);
ffdd5e50	128	fesetenv_register (fe);
e8bd5286	129	return __builtin_fma (sy, d, g);
ffdd5e50 UD	130	denorm:
	131	/* For denormalised numbers, we normalise, calculate the
	132	square root, and return an adjusted result. */
	133	fesetenv_register (fe);
169ea8f9	134	return __ieee754_sqrt (x * 0x1p+108f) * 0x1p-54f;
ffdd5e50 UD	135	}
	136	}
	137	else if (x < 0)
	138	{
	139	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	140	FE_INVALID_SQRT. */
169ea8f9	141	# ifdef FE_INVALID_SQRT
0747f818	142	__feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	143
c3a0ead4 UD	144	fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca	145	if ((u.l & FE_INVALID) == 0)
169ea8f9	146	# endif
0747f818	147	__feraiseexcept (FE_INVALID);
169ea8f9	148	x = NAN;
ffdd5e50 UD	149	}
ffdd5e50 UD	150	return f_wash (x);
169ea8f9	151	#endif /* USE_SQRT_BUILTIN */
ffdd5e50 UD	152	}
ffdd5e50 UD	153
220622dd	154	libm_alias_finite (__ieee754_sqrt, __sqrt)