[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrt.c

/* Double-precision floating point square root.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <fenv.h>
#include <fenv_libc.h>
#include <inttypes.h>
#include <stdint.h>
#include <sysdep.h>
#include <ldsodefs.h>

#ifndef _ARCH_PPCSQ
/* Constants used only by the software square root below.  */

/* Subtracted from sy*sg in the Newton-Raphson error term 'e'.  */
static const double almost_half = 0.5000000000000001;	/* 0.5 + 2^-53 */
/* Single-precision bit pattern 0x7fc00000 (a quiet NaN); its .value is
   what the function substitutes for a negative argument.  */
static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
/* Single-precision bit pattern 0x7f800000 (+infinity); its .value is
   compared against the argument to skip the iteration for +infinity.  */
static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
/* 2^108: a denormal argument is multiplied by this before recursing.  */
static const float two108 = 3.245185536584267269e+32;
/* 2^-54 == 1/sqrt(2^108): undoes the two108 scaling on the result.  */
static const float twom54 = 5.551115123125782702e-17;
/* Lookup table defined elsewhere.  It is read as consecutive pairs:
   the even entry seeds sg (the square-root guess) and the following
   odd entry seeds sy (the estimate of 1/(2*sg)).  */
extern const float __t_sqrt[1024];

/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

/* Software double-precision square root, called by __ieee754_sqrt when
   _ARCH_PPCSQ is not defined.

   For finite X > 0 the result is computed with the table-driven
   Newton-Raphson scheme described above; an X whose exponent field is
   zero (a denormal) is first multiplied by 2^108 and the result of the
   recursive call is multiplied by 2^-54.  For X < 0 an invalid-operation
   exception is raised and a NaN is passed to f_wash.  In every other
   case (zero, +infinity, NaN) X itself is passed to f_wash and that
   result is returned.

   The statement order in the main path is deliberate: integer work and
   union stores are interleaved with the FMAs, so do not reorder it
   casually.  */
double
__slow_ieee754_sqrt (double x)
{
  const float inf = a_inf.value;

  if (x > 0)
    {
      /* schedule the EXTRACT_WORDS to get separation between the store
	 and the load.  */
      ieee_double_shape_type ew_u;
      ieee_double_shape_type iw_u;
      ew_u.value = (x);
      /* +infinity skips the iteration and falls through to the common
	 return at the bottom.  */
      if (x != inf)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  double sx;	/* The value of which we're trying to find the
			   square root.  */
	  double sg, g;	/* Guess of the square root of x.  */
	  double sd, d;	/* Difference between the square of the guess and x.  */
	  double sy;	/* Estimate of 1/2g (overestimated by 1ulp).  */
	  double sy2;	/* 2*sy */
	  double e;	/* Difference between y*g and 1/2 (se = e * fsy).  */
	  double shx;	/* == sx * fsg */
	  double fsg;	/* sg*fsg == g.  */
	  fenv_t fe;	/* Saved floating-point environment (stores rounding
			   mode and whether the inexact exception is
			   enabled).  */
	  uint32_t xi0, xi1, sxi, fsgi;
	  const float *t_sqrt;

	  fe = fegetenv_register ();
	  /* complete the EXTRACT_WORDS (xi0,xi1,x) operation.  */
	  xi0 = ew_u.parts.msw;
	  xi1 = ew_u.parts.lsw;
	  /* NOTE(review): presumably relaxes rounding/exception state for
	     the intermediate steps; confirm against fenv_libc.h.  The
	     saved state is restored with fesetenv_register before each
	     return from this path.  */
	  relax_fenv_state ();
	  /* Keep the mantissa and the lowest exponent bit of x, and force
	     the remaining exponent bits so that the exponent field is
	     0x3fe or 0x3ff, i.e. 0.5 <= sx < 2.  */
	  sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
	  /* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
	     between the store and the load.  */
	  iw_u.parts.msw = sxi;
	  iw_u.parts.lsw = xi1;
	  /* Even table index built from bits 12..20 of the high word (the
	     lowest exponent bit and the top eight mantissa bits); each
	     index selects an (sg, sy) pair.  */
	  t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];
	  /* complete the INSERT_WORDS (sx, sxi, xi1) operation.  */
	  sx = iw_u.value;

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -__builtin_fma (sg, sg, -sx);
	  /* High word of the scale factor fsg: only exponent bits survive
	     the mask, so fsg is a power of two.  */
	  fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
	  sy2 = sy + sy;
	  sg = __builtin_fma (sy, sd, sg);	/* 16-bit approximation to
						   sqrt(sx). */

	  /* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
	     between the store and the load.  Only the stores are issued
	     here; fsg itself is loaded from iw_u further down.  (sx has
	     already been read out of iw_u, so the union can be reused.)  */
	  iw_u.parts.msw = fsgi;
	  iw_u.parts.lsw = (0);
	  e = -__builtin_fma (sy, sg, -almost_half);
	  sd = -__builtin_fma (sg, sg, -sx);
	  /* A zero exponent field with x > 0 means x is denormal.  */
	  if ((xi0 & 0x7ff00000) == 0)
	    goto denorm;
	  sy = __builtin_fma (e, sy2, sy);
	  sg = __builtin_fma (sy, sd, sg);	/* 32-bit approximation to
						   sqrt(sx).  */
	  sy2 = sy + sy;
	  /* complete the INSERT_WORDS (fsg, fsgi, 0) operation.  */
	  fsg = iw_u.value;
	  e = -__builtin_fma (sy, sg, -almost_half);
	  sd = -__builtin_fma (sg, sg, -sx);
	  sy = __builtin_fma (e, sy2, sy);
	  shx = sx * fsg;
	  sg = __builtin_fma (sy, sd, sg);	/* 64-bit approximation to
						   sqrt(sx), but perhaps
						   rounded incorrectly.  */
	  sy2 = sy + sy;
	  g = sg * fsg;
	  e = -__builtin_fma (sy, sg, -almost_half);
	  d = -__builtin_fma (g, sg, -shx);
	  sy = __builtin_fma (e, sy2, sy);
	  /* Restore the caller's environment before the last FMA, which
	     produces the returned value.  */
	  fesetenv_register (fe);
	  return __builtin_fma (sy, d, g);
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  */
	  fesetenv_register (fe);
	  return __slow_ieee754_sqrt (x * two108) * twom54;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
#ifdef FE_INVALID_SQRT
      __feraiseexcept (FE_INVALID_SQRT);

      /* Raise the generic FE_INVALID only if it is not already set in
	 the environment register after the call above.  When
	 FE_INVALID_SQRT is not defined, FE_INVALID is raised
	 unconditionally.  */
      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l & FE_INVALID) == 0)
#endif
	__feraiseexcept (FE_INVALID);
      x = a_nan.value;
    }
  /* Reached for zero, NaN, +infinity, and for the NaN substituted
     above.  */
  return f_wash (x);
}
#endif /* _ARCH_PPCSQ  */

#undef __ieee754_sqrt
/* Double-precision square root of X.  When _ARCH_PPCSQ is defined the
   result comes from a single fsqrt instruction; otherwise it is
   computed by __slow_ieee754_sqrt.  */
double
__ieee754_sqrt (double x)
{
#ifdef _ARCH_PPCSQ
  double root;

  asm ("fsqrt %0,%1\n" :"=f" (root):"f" (x));
  return root;
#else
  return __slow_ieee754_sqrt (x);
#endif
}
/* The same function is also made available as __sqrt_finite.  */
strong_alias (__ieee754_sqrt, __sqrt_finite)
Commit	Line	Data
ffdd5e50	1	/* Double-precision floating point square root.
04277e02	2	Copyright (C) 1997-2019 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
ffdd5e50 UD	18
	19	#include <math.h>
	20	#include <math_private.h>
418d99e6	21	#include <fenv.h>
ffdd5e50 UD	22	#include <fenv_libc.h>
ffdd5e50 UD	23	#include <inttypes.h>
e054f494	24	#include <stdint.h>
ffdd5e50 UD	25	#include <sysdep.h>
ffdd5e50 UD	26	#include <ldsodefs.h>
ffdd5e50	27
08cee2a4	28	#ifndef _ARCH_PPCSQ
ffdd5e50 UD	29	static const double almost_half = 0.5000000000000001; /* 0.5 + 2^-53 */
	30	static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
	31	static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
	32	static const float two108 = 3.245185536584267269e+32;
	33	static const float twom54 = 5.551115123125782702e-17;
	34	extern const float __t_sqrt[1024];
	35
	36	/* The method is based on a description in
	37	Computation of elementary functions on the IBM RISC System/6000 processor,
	38	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	39	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	40	one to find the actual square root, and one to find its reciprocal
	41	without the expense of a division operation. The tricky bit here
	42	is the use of the POWER/PowerPC multiply-add operation to get the
	43	required accuracy with high speed.
	44
	45	The argument reduction works by a combination of table lookup to
	46	obtain the initial guesses, and some careful modification of the
	47	generated guesses (which mostly runs on the integer unit, while the
868f7a40	48	Newton-Raphson is running on the FPU). */
ffdd5e50	49
ffdd5e50 UD	50	double
ffdd5e50 UD	51	__slow_ieee754_sqrt (double x)
ffdd5e50 UD	52	{
	53	const float inf = a_inf.value;
	54
	55	if (x > 0)
	56	{
	57	/* schedule the EXTRACT_WORDS to get separation between the store
0ac5ae23	58	and the load. */
ffdd5e50 UD	59	ieee_double_shape_type ew_u;
	60	ieee_double_shape_type iw_u;
	61	ew_u.value = (x);
	62	if (x != inf)
	63	{
	64	/* Variables named starting with 's' exist in the
	65	argument-reduced space, so that 2 > sx >= 0.5,
	66	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	67	Variables named ending with 'i' are integer versions of
	68	floating-point values. */
	69	double sx; /* The value of which we're trying to find the
	70	square root. */
	71	double sg, g; /* Guess of the square root of x. */
	72	double sd, d; /* Difference between the square of the guess and x. */
	73	double sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	74	double sy2; /* 2*sy */
	75	double e; /* Difference between y*g and 1/2 (se = e * fsy). */
	76	double shx; /* == sx * fsg */
	77	double fsg; /* sg*fsg == g. */
	78	fenv_t fe; /* Saved floating-point environment (stores rounding
	79	mode and whether the inexact exception is
	80	enabled). */
	81	uint32_t xi0, xi1, sxi, fsgi;
	82	const float *t_sqrt;
	83
	84	fe = fegetenv_register ();
	85	/* complete the EXTRACT_WORDS (xi0,xi1,x) operation. */
	86	xi0 = ew_u.parts.msw;
	87	xi1 = ew_u.parts.lsw;
	88	relax_fenv_state ();
	89	sxi = (xi0 & 0x3fffffff) | 0x3fe00000;
	90	/* schedule the INSERT_WORDS (sx, sxi, xi1) to get separation
	91	between the store and the load. */
	92	iw_u.parts.msw = sxi;
	93	iw_u.parts.lsw = xi1;
	94	t_sqrt = __t_sqrt + (xi0 >> (52 - 32 - 8 - 1) & 0x3fe);
	95	sg = t_sqrt[0];
	96	sy = t_sqrt[1];
	97	/* complete the INSERT_WORDS (sx, sxi, xi1) operation. */
	98	sx = iw_u.value;
	99
868f7a40	100	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	101	division and a square root and the remainder of the
ffdd5e50 UD	102	argument reduction, all interleaved. */
e8bd5286	103	sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50 UD	104	fsgi = (xi0 + 0x40000000) >> 1 & 0x7ff00000;
ffdd5e50 UD	105	sy2 = sy + sy;
e8bd5286 JM	106	sg = __builtin_fma (sy, sd, sg); /* 16-bit approximation to
e8bd5286 JM	107	sqrt(sx). */
ffdd5e50 UD	108
	109	/* schedule the INSERT_WORDS (fsg, fsgi, 0) to get separation
	110	between the store and the load. */
	111	INSERT_WORDS (fsg, fsgi, 0);
	112	iw_u.parts.msw = fsgi;
	113	iw_u.parts.lsw = (0);
e8bd5286 JM	114	e = -__builtin_fma (sy, sg, -almost_half);
e8bd5286 JM	115	sd = -__builtin_fma (sg, sg, -sx);
ffdd5e50 UD	116	if ((xi0 & 0x7ff00000) == 0)
ffdd5e50 UD	117	goto denorm;
e8bd5286 JM	118	sy = __builtin_fma (e, sy2, sy);
	119	sg = __builtin_fma (sy, sd, sg); /* 32-bit approximation to
	120	sqrt(sx). */
ffdd5e50 UD	121	sy2 = sy + sy;
	122	/* complete the INSERT_WORDS (fsg, fsgi, 0) operation. */
	123	fsg = iw_u.value;
e8bd5286 JM	124	e = -__builtin_fma (sy, sg, -almost_half);
	125	sd = -__builtin_fma (sg, sg, -sx);
	126	sy = __builtin_fma (e, sy2, sy);
ffdd5e50	127	shx = sx * fsg;
e8bd5286 JM	128	sg = __builtin_fma (sy, sd, sg); /* 64-bit approximation to
	129	sqrt(sx), but perhaps
	130	rounded incorrectly. */
ffdd5e50 UD	131	sy2 = sy + sy;
ffdd5e50 UD	132	g = sg * fsg;
e8bd5286 JM	133	e = -__builtin_fma (sy, sg, -almost_half);
	134	d = -__builtin_fma (g, sg, -shx);
	135	sy = __builtin_fma (e, sy2, sy);
ffdd5e50	136	fesetenv_register (fe);
e8bd5286	137	return __builtin_fma (sy, d, g);
ffdd5e50 UD	138	denorm:
	139	/* For denormalised numbers, we normalise, calculate the
	140	square root, and return an adjusted result. */
	141	fesetenv_register (fe);
	142	return __slow_ieee754_sqrt (x * two108) * twom54;
	143	}
	144	}
	145	else if (x < 0)
	146	{
	147	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	148	FE_INVALID_SQRT. */
ffdd5e50	149	#ifdef FE_INVALID_SQRT
0747f818	150	__feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	151
c3a0ead4 UD	152	fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca	153	if ((u.l & FE_INVALID) == 0)
ffdd5e50	154	#endif
0747f818	155	__feraiseexcept (FE_INVALID);
ffdd5e50 UD	156	x = a_nan.value;
	157	}
	158	return f_wash (x);
	159	}
08cee2a4	160	#endif /* _ARCH_PPCSQ */
ffdd5e50	161
8a6d5255	162	#undef __ieee754_sqrt
ffdd5e50 UD	163	double
ffdd5e50 UD	164	__ieee754_sqrt (double x)
ffdd5e50 UD	165	{
	166	double z;
	167
08cee2a4 AZ	168	#ifdef _ARCH_PPCSQ
	169	asm ("fsqrt %0,%1\n" :"=f" (z):"f" (x));
	170	#else
	171	z = __slow_ieee754_sqrt (x);
	172	#endif
ffdd5e50 UD	173
	174	return z;
	175	}
0ac5ae23	176	strong_alias (__ieee754_sqrt, __sqrt_finite)