[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrtf.c

/* Single-precision floating point square root.
   Copyright (C) 1997-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <fenv_libc.h>
#include <inttypes.h>
#include <stdint.h>
#include <sysdep.h>
#include <ldsodefs.h>

/* Constants used by __slow_ieee754_sqrtf below.  */

/* Target value for the reciprocal refinement: the error term is computed
   as almost_half - sy * sg rather than against an exact 0.5.  */
static const float almost_half = 0.50000006;	/* 0.5 + 2^-24 */
/* Bit pattern of the quiet NaN returned for negative arguments.  */
static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
/* Bit pattern of +Inf; positive infinity is excluded from the iteration.  */
static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
/* 2^48: scale applied to denormal inputs before recursing.  */
static const float two48 = 281474976710656.0;
/* 2^-24: undoes the 2^48 input scaling after the square root is taken
   (sqrt (2^48) == 2^24).  */
static const float twom24 = 5.9604644775390625e-8;
/* Lookup table of initial guesses, defined outside this file.  It is read
   in pairs: the even entry seeds sg (the square-root guess) and the
   following odd entry seeds sy (the estimate of 1/(2*sg)).  */
extern const float __t_sqrt[1024];

/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

/* Software single-precision square root (used by __ieee754_sqrtf when
   the hardware instruction is not available).

   X is the argument.  The result depends on the class of X:
     - finite, positive, normalised: computed by table lookup plus
       interleaved Newton-Raphson refinement in an argument-reduced range;
     - positive denormal: X is scaled by 2^48, the function recurses, and
       the result is scaled back by 2^-24;
     - negative (including -Inf): an invalid-operation exception is raised
       and the quiet NaN a_nan is passed to f_washf;
     - anything else (zeros, +Inf, NaN): X itself is passed to f_washf.
   NOTE(review): f_washf is defined outside this file; its exact effect
   on the returned value should be confirmed there.  */
float
__slow_ieee754_sqrtf (float x)
{
  const float inf = a_inf.value;

  if (x > 0)
    {
      if (x != inf)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  float sx;		/* The value of which we're trying to find the square
				   root.  */
	  float sg, g;		/* Guess of the square root of x.  */
	  float sd, d;		/* Difference between the square of the guess and x.  */
	  float sy;		/* Estimate of 1/2g (overestimated by 1ulp).  */
	  float sy2;		/* 2*sy */
	  float e;		/* Difference between y*g and 1/2 (note that e==se).  */
	  float shx;		/* == sx * fsg */
	  float fsg;		/* sg*fsg == g.  */
	  fenv_t fe;		/* Saved floating-point environment (stores rounding
				   mode and whether the inexact exception is
				   enabled).  */
	  uint32_t xi, sxi, fsgi;
	  const float *t_sqrt;

	  /* Fetch the raw bits of x and save the caller's environment.
	     NOTE(review): relax_fenv_state is defined outside this file;
	     presumably it relaxes rounding/exception state for the
	     intermediate steps -- confirm in fenv_libc.h.  */
	  GET_FLOAT_WORD (xi, x);
	  fe = fegetenv_register ();
	  relax_fenv_state ();
	  /* Keep the mantissa and the lowest exponent bit, and force the
	     remaining exponent bits so that sx lands in [0.5, 2).  */
	  sxi = (xi & 0x3fffffff) | 0x3f000000;
	  SET_FLOAT_WORD (sx, sxi);
	  /* Index the table with bits 15..23 of x (lowest exponent bit plus
	     the top eight mantissa bits), scaled to an even offset because
	     entries come in (sg, sy) pairs.  */
	  t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -(sg * sg - sx);
	  /* Build the scale factor fsg: roughly half of x's exponent with
	     a zero mantissa, i.e. a power of two (see 'sg*fsg == g').  */
	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	  sy2 = sy + sy;
	  sg = sy * sd + sg;	/* 16-bit approximation to sqrt(sx). */
	  e = -(sy * sg - almost_half);
	  SET_FLOAT_WORD (fsg, fsgi);
	  sd = -(sg * sg - sx);
	  sy = sy + e * sy2;
	  /* A zero exponent field with x > 0 means x is denormal; the
	     reduction above is not valid for it.  */
	  if ((xi & 0x7f800000) == 0)
	    goto denorm;
	  shx = sx * fsg;
	  sg = sg + sy * sd;	/* 32-bit approximation to sqrt(sx),
				   but perhaps rounded incorrectly.  */
	  sy2 = sy + sy;
	  g = sg * fsg;
	  e = -(sy * sg - almost_half);
	  d = -(g * sg - shx);
	  sy = sy + e * sy2;
	  /* Restore the caller's environment before the last correction
	     step; presumably so that the final operation rounds (and
	     signals) according to the caller's settings.  */
	  fesetenv_register (fe);
	  return g + sy * d;
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  */
	  fesetenv_register (fe);
	  /* sqrt (x * 2^48) * 2^-24 == sqrt (x).  */
	  return __slow_ieee754_sqrtf (x * two48) * twom24;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
      /* When FE_INVALID_SQRT exists it is raised first, and FE_INVALID
	 is raised explicitly only if it was not set as a consequence.
	 Otherwise FE_INVALID is raised unconditionally.  */
#ifdef FE_INVALID_SQRT
      feraiseexcept (FE_INVALID_SQRT);

      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l & FE_INVALID) == 0)
#endif
	feraiseexcept (FE_INVALID);
      x = a_nan.value;
    }
  /* Reached for zeros, +Inf, NaN, and (with x replaced by a NaN)
     negative arguments.  */
  return f_washf (x);
}

#undef __ieee754_sqrtf
/* Single-precision square root entry point.

   When __CPU_HAS_FSQRT is true the optional hardware instruction fsqrts
   computes the result; otherwise the software routine
   __slow_ieee754_sqrtf is called.  The function is also exported under
   the name __sqrtf_finite.  */
float
__ieee754_sqrtf (float x)
{
  double root;

  /* No optional FP square-root instruction on this CPU: use the
     software implementation.  */
  if (!(__CPU_HAS_FSQRT))
    return __slow_ieee754_sqrtf (x);

  /* The asm has to be volatile so that the compiler cannot hoist the
     fsqrts instruction above the feature test.  */
  __asm __volatile ("	fsqrts	%0,%1\n"
		    : "=f" (root)
		    : "f" (x));

  return root;
}
strong_alias (__ieee754_sqrtf, __sqrtf_finite)
Commit	Line	Data
ffdd5e50	1	/* Single-precision floating point square root.
b168057a	2	Copyright (C) 1997-2015 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
ffdd5e50 UD	18
	19	#include <math.h>
	20	#include <math_private.h>
	21	#include <fenv_libc.h>
	22	#include <inttypes.h>
e054f494	23	#include <stdint.h>
ffdd5e50 UD	24	#include <sysdep.h>
ffdd5e50 UD	25	#include <ldsodefs.h>
ffdd5e50 UD	26
	27	static const float almost_half = 0.50000006; /* 0.5 + 2^-24 */
	28	static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
	29	static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
	30	static const float two48 = 281474976710656.0;
	31	static const float twom24 = 5.9604644775390625e-8;
	32	extern const float __t_sqrt[1024];
	33
	34	/* The method is based on a description in
	35	Computation of elementary functions on the IBM RISC System/6000 processor,
	36	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	37	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	38	one to find the actual square root, and one to find its reciprocal
	39	without the expense of a division operation. The tricky bit here
	40	is the use of the POWER/PowerPC multiply-add operation to get the
	41	required accuracy with high speed.
	42
	43	The argument reduction works by a combination of table lookup to
	44	obtain the initial guesses, and some careful modification of the
	45	generated guesses (which mostly runs on the integer unit, while the
868f7a40	46	Newton-Raphson is running on the FPU). */
ffdd5e50	47
ffdd5e50 UD	48	float
ffdd5e50 UD	49	__slow_ieee754_sqrtf (float x)
ffdd5e50 UD	50	{
	51	const float inf = a_inf.value;
	52
	53	if (x > 0)
	54	{
	55	if (x != inf)
	56	{
	57	/* Variables named starting with 's' exist in the
	58	argument-reduced space, so that 2 > sx >= 0.5,
	59	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	60	Variables named ending with 'i' are integer versions of
	61	floating-point values. */
	62	float sx; /* The value of which we're trying to find the square
	63	root. */
	64	float sg, g; /* Guess of the square root of x. */
	65	float sd, d; /* Difference between the square of the guess and x. */
	66	float sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	67	float sy2; /* 2*sy */
	68	float e; /* Difference between y*g and 1/2 (note that e==se). */
	69	float shx; /* == sx * fsg */
	70	float fsg; /* sg*fsg == g. */
	71	fenv_t fe; /* Saved floating-point environment (stores rounding
	72	mode and whether the inexact exception is
	73	enabled). */
	74	uint32_t xi, sxi, fsgi;
	75	const float *t_sqrt;
	76
	77	GET_FLOAT_WORD (xi, x);
	78	fe = fegetenv_register ();
	79	relax_fenv_state ();
	80	sxi = (xi & 0x3fffffff) | 0x3f000000;
	81	SET_FLOAT_WORD (sx, sxi);
	82	t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	83	sg = t_sqrt[0];
	84	sy = t_sqrt[1];
	85
868f7a40	86	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	87	division and a square root and the remainder of the
	88	argument reduction, all interleaved. */
	89	sd = -(sg * sg - sx);
	90	fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	91	sy2 = sy + sy;
	92	sg = sy * sd + sg; /* 16-bit approximation to sqrt(sx). */
	93	e = -(sy * sg - almost_half);
	94	SET_FLOAT_WORD (fsg, fsgi);
	95	sd = -(sg * sg - sx);
	96	sy = sy + e * sy2;
	97	if ((xi & 0x7f800000) == 0)
	98	goto denorm;
	99	shx = sx * fsg;
	100	sg = sg + sy * sd; /* 32-bit approximation to sqrt(sx),
	101	but perhaps rounded incorrectly. */
	102	sy2 = sy + sy;
	103	g = sg * fsg;
	104	e = -(sy * sg - almost_half);
	105	d = -(g * sg - shx);
	106	sy = sy + e * sy2;
	107	fesetenv_register (fe);
	108	return g + sy * d;
	109	denorm:
	110	/* For denormalised numbers, we normalise, calculate the
	111	square root, and return an adjusted result. */
	112	fesetenv_register (fe);
	113	return __slow_ieee754_sqrtf (x * two48) * twom24;
	114	}
	115	}
	116	else if (x < 0)
	117	{
	118	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	119	FE_INVALID_SQRT. */
ffdd5e50 UD	120	#ifdef FE_INVALID_SQRT
ffdd5e50 UD	121	feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	122
c3a0ead4 UD	123	fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca	124	if ((u.l & FE_INVALID) == 0)
ffdd5e50 UD	125	#endif
	126	feraiseexcept (FE_INVALID);
	127	x = a_nan.value;
	128	}
	129	return f_washf (x);
	130	}
	131
8a6d5255	132	#undef __ieee754_sqrtf
ffdd5e50 UD	133	float
ffdd5e50 UD	134	__ieee754_sqrtf (float x)
ffdd5e50 UD	135	{
	136	double z;
	137
433f49c4 UD	138	/* If the CPU is 64-bit we can use the optional FP instructions. */
433f49c4 UD	139	if (__CPU_HAS_FSQRT)
ffdd5e50	140	{
c3a0ead4	141	/* Volatile is required to prevent the compiler from moving the
0ac5ae23	142	fsqrt instruction above the branch. */
ffdd5e50 UD	143	__asm __volatile (" fsqrts %0,%1\n"
	144	:"=f" (z):"f" (x));
	145	}
	146	else
	147	z = __slow_ieee754_sqrtf (x);
	148
	149	return z;
	150	}
0ac5ae23	151	strong_alias (__ieee754_sqrtf, __sqrtf_finite)