[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrtf.c

/* Single-precision floating point square root.
   Copyright (C) 1997, 2003, 2004, 2008, 2011 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <math.h>
#include <math_private.h>
#include <fenv_libc.h>
#include <inttypes.h>

#include <sysdep.h>
#include <ldsodefs.h>

/* Constants used by the software square root below.  */

/* Target value for the sy*sg product in the reciprocal refinement
   steps; deliberately a little above one half.  */
static const float almost_half = 0.50000006;	/* 0.5 + 2^-24 */
/* Bit pattern with all exponent bits and the top mantissa bit set
   (a quiet NaN); substituted for negative arguments.  */
static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
/* Bit pattern of +Infinity (exponent all ones, mantissa zero); used to
   recognise an infinite argument.  */
static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
/* 2^48 and 2^-24: a denormal argument is multiplied by two48 before its
   square root is taken, and the result is multiplied by twom24
   (sqrt (2^48) == 2^24) to undo the scaling.  */
static const float two48 = 281474976710656.0;
static const float twom24 = 5.9604644775390625e-8;
/* Lookup table defined elsewhere.  The code below reads it in pairs at
   even indices: entry [0] of a pair seeds the square-root guess (sg)
   and entry [1] seeds the 1/(2*sqrt) estimate (sy).  */
extern const float __t_sqrt[1024];

/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

/* Software single-precision square root; __ieee754_sqrtf calls this
   when it does not use the hardware instruction.

   - Finite X > 0: table-seeded Newton-Raphson as described in the
     comment preceding this function.  A denormal X is rescaled and
     handled by one recursive call.
   - X < 0 (including -Inf): raises the invalid exception and replaces
     X by the NaN stored in a_nan.
   - Any other X (+-0, +Inf, NaN) is left untouched, because it fails
     either both comparisons or the X != inf test.
   In the last two cases the value is returned through f_washf, which
   is defined elsewhere.  */
float
__slow_ieee754_sqrtf (float x)
{
  const float inf = a_inf.value;

  if (x > 0)
    {
      if (x != inf)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  float sx;		/* The value of which we're trying to find the square
				   root.  */
	  float sg, g;		/* Guess of the square root of x.  */
	  float sd, d;		/* Difference between the square of the guess and x.  */
	  float sy;		/* Estimate of 1/2g (overestimated by 1ulp).  */
	  float sy2;		/* 2*sy */
	  float e;		/* Difference between y*g and 1/2 (note that e==se).  */
	  float shx;		/* == sx * fsg */
	  float fsg;		/* sg*fsg == g.  */
	  fenv_t fe;		/* Saved floating-point environment (stores rounding
				   mode and whether the inexact exception is
				   enabled).  */
	  uint32_t xi, sxi, fsgi;
	  const float *t_sqrt;

	  GET_FLOAT_WORD (xi, x);
	  /* Save the caller's environment; it is put back with
	     fesetenv_register on both exits from this block.
	     relax_fenv_state is defined elsewhere -- presumably it
	     selects a state in which the intermediate steps neither
	     trap nor depend on the caller's rounding mode (confirm in
	     fenv_libc.h).  */
	  fe = fegetenv_register ();
	  relax_fenv_state ();
	  /* Clear the sign bit and the top exponent bit and force the
	     next six exponent bits to one, keeping only the lowest
	     exponent bit of x.  The biased exponent becomes 126 or 127,
	     which gives the 0.5 <= sx < 2 range stated above.  */
	  sxi = (xi & 0x3fffffff) | 0x3f000000;
	  SET_FLOAT_WORD (sx, sxi);
	  /* Index by the lowest exponent bit and the top eight mantissa
	     bits of x (bits 23..15), scaled to an even offset so that
	     each lookup yields an (sg, sy) pair.  */
	  t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -(sg * sg - sx);
	  /* Build the scale factor fsg: add 128 to the biased exponent,
	     halve it, and mask off every mantissa bit, leaving a power
	     of two whose exponent is half that of x.  */
	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	  sy2 = sy + sy;
	  sg = sy * sd + sg;	/* 16-bit approximation to sqrt(sx). */
	  e = -(sy * sg - almost_half);
	  SET_FLOAT_WORD (fsg, fsgi);
	  sd = -(sg * sg - sx);
	  sy = sy + e * sy2;
	  /* An all-zero exponent field means x is denormal (x > 0 here,
	     so it cannot be zero).  The work done so far is discarded
	     and the denorm path below recomputes from a scaled x.  */
	  if ((xi & 0x7f800000) == 0)
	    goto denorm;
	  shx = sx * fsg;
	  sg = sg + sy * sd;	/* 32-bit approximation to sqrt(sx),
				   but perhaps rounded incorrectly.  */
	  sy2 = sy + sy;
	  g = sg * fsg;
	  e = -(sy * sg - almost_half);
	  d = -(g * sg - shx);
	  sy = sy + e * sy2;
	  /* The caller's environment is restored before the final
	     correction step -- presumably so that this last operation
	     rounds in the caller's mode; confirm against fenv_libc.h.  */
	  fesetenv_register (fe);
	  return g + sy * d;
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  */
	  /* Scaling x by two48 (2^48) scales its square root by 2^24,
	     which the multiplication by twom24 (2^-24) removes.  */
	  fesetenv_register (fe);
	  return __slow_ieee754_sqrtf (x * two48) * twom24;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
      /* When FE_INVALID_SQRT is defined, raise it first and then raise
	 plain FE_INVALID only if the environment read back does not
	 already show FE_INVALID.  When it is not defined, the #ifdef
	 removes the test and FE_INVALID is raised unconditionally.
	 NOTE(review): u.l[1] is assumed to be the word of fenv_union_t
	 holding the status bits -- confirm against its definition.  */
#ifdef FE_INVALID_SQRT
      feraiseexcept (FE_INVALID_SQRT);

      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l[1] & FE_INVALID) == 0)
#endif
	feraiseexcept (FE_INVALID);
      x = a_nan.value;
    }
  /* Reached for +-0, +Inf, NaN, and for negative input after x has
     been replaced by a NaN.  */
  return f_washf (x);
}


/* Single-precision square root entry point.  Dispatches to the
   hardware fsqrts instruction when __CPU_HAS_FSQRT reports it as
   available, and to the software routine __slow_ieee754_sqrtf
   otherwise.  The result is held in a double and converted to float
   on return.  */
float
__ieee754_sqrtf (float x)
{
  double root;

  if (!(__CPU_HAS_FSQRT))
    /* No optional FP square-root instruction: take the software path.  */
    root = __slow_ieee754_sqrtf (x);
  else
    {
      /* The asm has to stay volatile so that the compiler cannot
	 hoist the fsqrts above the feature test that guards it.  */
      __asm __volatile ("	fsqrts	%0,%1\n"
				:"=f" (root):"f" (x));
    }

  return root;
}
strong_alias (__ieee754_sqrtf, __sqrtf_finite)
Commit	Line	Data
ffdd5e50	1	/* Single-precision floating point square root.
0ac5ae23	2	Copyright (C) 1997, 2003, 2004, 2008, 2011 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
	16	License along with the GNU C Library; if not, write to the Free
	17	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
	18	02111-1307 USA. */
	19
	20	#include <math.h>
	21	#include <math_private.h>
	22	#include <fenv_libc.h>
	23	#include <inttypes.h>
	24
	25	#include <sysdep.h>
	26	#include <ldsodefs.h>
ffdd5e50 UD	27
	28	static const float almost_half = 0.50000006; /* 0.5 + 2^-24 */
	29	static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
	30	static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
	31	static const float two48 = 281474976710656.0;
	32	static const float twom24 = 5.9604644775390625e-8;
	33	extern const float __t_sqrt[1024];
	34
	35	/* The method is based on a description in
	36	Computation of elementary functions on the IBM RISC System/6000 processor,
	37	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	38	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	39	one to find the actual square root, and one to find its reciprocal
	40	without the expense of a division operation. The tricky bit here
	41	is the use of the POWER/PowerPC multiply-add operation to get the
	42	required accuracy with high speed.
	43
	44	The argument reduction works by a combination of table lookup to
	45	obtain the initial guesses, and some careful modification of the
	46	generated guesses (which mostly runs on the integer unit, while the
868f7a40	47	Newton-Raphson is running on the FPU). */
ffdd5e50	48
ffdd5e50 UD	49	float
ffdd5e50 UD	50	__slow_ieee754_sqrtf (float x)
ffdd5e50 UD	51	{
	52	const float inf = a_inf.value;
	53
	54	if (x > 0)
	55	{
	56	if (x != inf)
	57	{
	58	/* Variables named starting with 's' exist in the
	59	argument-reduced space, so that 2 > sx >= 0.5,
	60	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	61	Variables named ending with 'i' are integer versions of
	62	floating-point values. */
	63	float sx; /* The value of which we're trying to find the square
	64	root. */
	65	float sg, g; /* Guess of the square root of x. */
	66	float sd, d; /* Difference between the square of the guess and x. */
	67	float sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	68	float sy2; /* 2*sy */
	69	float e; /* Difference between y*g and 1/2 (note that e==se). */
	70	float shx; /* == sx * fsg */
	71	float fsg; /* sg*fsg == g. */
	72	fenv_t fe; /* Saved floating-point environment (stores rounding
	73	mode and whether the inexact exception is
	74	enabled). */
	75	uint32_t xi, sxi, fsgi;
	76	const float *t_sqrt;
	77
	78	GET_FLOAT_WORD (xi, x);
	79	fe = fegetenv_register ();
	80	relax_fenv_state ();
	81	sxi = (xi & 0x3fffffff) | 0x3f000000;
	82	SET_FLOAT_WORD (sx, sxi);
	83	t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	84	sg = t_sqrt[0];
	85	sy = t_sqrt[1];
	86
868f7a40	87	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	88	division and a square root and the remainder of the
	89	argument reduction, all interleaved. */
	90	sd = -(sg * sg - sx);
	91	fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	92	sy2 = sy + sy;
	93	sg = sy * sd + sg; /* 16-bit approximation to sqrt(sx). */
	94	e = -(sy * sg - almost_half);
	95	SET_FLOAT_WORD (fsg, fsgi);
	96	sd = -(sg * sg - sx);
	97	sy = sy + e * sy2;
	98	if ((xi & 0x7f800000) == 0)
	99	goto denorm;
	100	shx = sx * fsg;
	101	sg = sg + sy * sd; /* 32-bit approximation to sqrt(sx),
	102	but perhaps rounded incorrectly. */
	103	sy2 = sy + sy;
	104	g = sg * fsg;
	105	e = -(sy * sg - almost_half);
	106	d = -(g * sg - shx);
	107	sy = sy + e * sy2;
	108	fesetenv_register (fe);
	109	return g + sy * d;
	110	denorm:
	111	/* For denormalised numbers, we normalise, calculate the
	112	square root, and return an adjusted result. */
	113	fesetenv_register (fe);
	114	return __slow_ieee754_sqrtf (x * two48) * twom24;
	115	}
	116	}
	117	else if (x < 0)
	118	{
	119	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	120	FE_INVALID_SQRT. */
ffdd5e50 UD	121	#ifdef FE_INVALID_SQRT
ffdd5e50 UD	122	feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	123
	124	fenv_union_t u = { .fenv = fegetenv_register () };
	125	if ((u.l[1] & FE_INVALID) == 0)
ffdd5e50 UD	126	#endif
	127	feraiseexcept (FE_INVALID);
	128	x = a_nan.value;
	129	}
	130	return f_washf (x);
	131	}
	132
	133
ffdd5e50 UD	134	float
ffdd5e50 UD	135	__ieee754_sqrtf (float x)
ffdd5e50 UD	136	{
	137	double z;
	138
433f49c4 UD	139	/* If the CPU is 64-bit we can use the optional FP instructions. */
433f49c4 UD	140	if (__CPU_HAS_FSQRT)
ffdd5e50	141	{
c3a0ead4	142	/* Volatile is required to prevent the compiler from moving the
0ac5ae23	143	fsqrt instruction above the branch. */
ffdd5e50 UD	144	__asm __volatile (" fsqrts %0,%1\n"
	145	:"=f" (z):"f" (x));
	146	}
	147	else
	148	z = __slow_ieee754_sqrtf (x);
	149
	150	return z;
	151	}
0ac5ae23	152	strong_alias (__ieee754_sqrtf, __sqrtf_finite)