[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrtf.c

/* Single-precision floating point square root.
   Copyright (C) 1997-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <fenv_libc.h>
#include <inttypes.h>
#include <stdint.h>
#include <sysdep.h>
#include <ldsodefs.h>

#ifndef _ARCH_PPCSQ
/* Constant from which sy*sg is subtracted to form the error term e in
   the Newton-Raphson steps of __slow_ieee754_sqrtf.  */
static const float almost_half = 0.50000006;	/* 0.5 + 2^-24 */
/* Raw single-precision bit patterns, read back as floats through the
   .value member: a_nan is the result returned for negative arguments,
   a_inf is compared against the argument to detect +Inf.  */
static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
/* 2^48 and 2^-24 (the reciprocal of sqrt (2^48)): used to scale a
   denormal argument up and its square root back down.  */
static const float two48 = 281474976710656.0;
static const float twom24 = 5.9604644775390625e-8;
/* Lookup table defined outside this file.  It is read in pairs: an
   initial guess for the square root (sg) followed by an initial guess
   for sy.  */
extern const float __t_sqrt[1024];

/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

/* Software square root of X, compiled only when _ARCH_PPCSQ is not
   defined.

   A positive finite X is handled by a table lookup followed by
   interleaved Newton-Raphson steps built on fused multiply-adds; a
   positive denormal X is first scaled into the normal range.  A
   negative X raises the invalid exception and yields a NaN.  Every
   other input (zeros, +Inf, NaNs) is passed unchanged to f_washf and
   that result is returned.  */
float
__slow_ieee754_sqrtf (float x)
{
  const float inf = a_inf.value;

  if (x > 0)
    {
      /* +Inf skips the computation and falls through to the final
	 return at the bottom of the function.  */
      if (x != inf)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  float sx;		/* The value of which we're trying to find the square
				   root.  */
	  float sg, g;		/* Guess of the square root of x.  */
	  float sd, d;		/* Difference between the square of the guess and x.  */
	  float sy;		/* Estimate of 1/2g (overestimated by 1ulp).  */
	  float sy2;		/* 2*sy */
	  float e;		/* Difference between y*g and 1/2 (note that e==se).  */
	  float shx;		/* == sx * fsg */
	  float fsg;		/* sg*fsg == g.  */
	  fenv_t fe;		/* Saved floating-point environment (stores rounding
				   mode and whether the inexact exception is
				   enabled).  */
	  uint32_t xi, sxi, fsgi;
	  const float *t_sqrt;

	  GET_FLOAT_WORD (xi, x);
	  /* Save the caller's environment; it is put back with
	     fesetenv_register on both exits from this block.
	     NOTE(review): relax_fenv_state is defined elsewhere;
	     presumably it selects a fixed rounding mode and masks
	     exceptions for the intermediate steps -- confirm.  */
	  fe = fegetenv_register ();
	  relax_fenv_state ();
	  /* Clear the sign bit and the top exponent bit and force the next
	     six exponent bits to one, keeping the lowest exponent bit and
	     the mantissa of x.  The biased exponent of sx is therefore
	     0x7e or 0x7f, which gives 2 > sx >= 0.5.  */
	  sxi = (xi & 0x3fffffff) | 0x3f000000;
	  SET_FLOAT_WORD (sx, sxi);
	  /* Index the table with bits 15..23 of x (the lowest exponent bit
	     plus the top eight mantissa bits).  The index is always even
	     because entries come in pairs: initial sg, then initial sy.  */
	  t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -__builtin_fmaf (sg, sg, -sx);
	  /* Exponent field only: (biased exponent of x + 128) / 2, rounded
	     down.  This becomes fsg, the power of two that maps the
	     reduced guess sg back to g.  */
	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	  sy2 = sy + sy;
	  sg = __builtin_fmaf (sy, sd, sg);	/* 16-bit approximation to
						   sqrt(sx). */
	  e = -__builtin_fmaf (sy, sg, -almost_half);
	  SET_FLOAT_WORD (fsg, fsgi);
	  sd = -__builtin_fmaf (sg, sg, -sx);
	  sy = __builtin_fmaf (e, sy2, sy);
	  /* A zero exponent field with x > 0 means x is denormal.  */
	  if ((xi & 0x7f800000) == 0)
	    goto denorm;
	  shx = sx * fsg;
	  sg = __builtin_fmaf (sy, sd, sg);	/* 32-bit approximation to
						   sqrt(sx), but perhaps
						   rounded incorrectly.  */
	  sy2 = sy + sy;
	  g = sg * fsg;
	  e = -__builtin_fmaf (sy, sg, -almost_half);
	  d = -__builtin_fmaf (g, sg, -shx);
	  sy = __builtin_fmaf (e, sy2, sy);
	  /* Restore the caller's environment before the last correction
	     step, so that the final fused multiply-add is evaluated under
	     it.  */
	  fesetenv_register (fe);
	  return __builtin_fmaf (sy, d, g);
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  two48 is 2^48
	     and twom24 is 2^-24, the reciprocal of sqrt (2^48), so the
	     two scalings cancel.  */
	  fesetenv_register (fe);
	  return __slow_ieee754_sqrtf (x * two48) * twom24;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
#ifdef FE_INVALID_SQRT
      feraiseexcept (FE_INVALID_SQRT);

      /* Raise the generic FE_INVALID as well only if raising
	 FE_INVALID_SQRT did not leave the FE_INVALID bit set in the
	 environment register.  When FE_INVALID_SQRT is not defined the
	 call below is unconditional.  */
      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l & FE_INVALID) == 0)
#endif
	feraiseexcept (FE_INVALID);
      x = a_nan.value;
    }
  /* Reached for zeros, +Inf, NaNs, and for the NaN stored above.
     NOTE(review): f_washf is defined outside this file; confirm what it
     does to its argument.  */
  return f_washf (x);
}
#endif /* _ARCH_PPCSQ  */

#undef __ieee754_sqrtf
/* Return the single-precision square root of X.

   When _ARCH_PPCSQ is defined the fsqrts instruction is issued
   directly; otherwise the work is delegated to __slow_ieee754_sqrtf.
   In both cases the result is held in a double and converted to float
   on return.  __sqrtf_finite is exported as an alias of this
   function.  */
float
__ieee754_sqrtf (float x)
{
#ifdef _ARCH_PPCSQ
  /* Hardware path: the instruction writes a floating-point register,
     which is bound here to a double.  */
  double root;
  asm ("fsqrts	%0,%1\n" :"=f" (root):"f" (x));
#else
  /* Software path.  */
  double root = __slow_ieee754_sqrtf (x);
#endif

  return root;
}
strong_alias (__ieee754_sqrtf, __sqrtf_finite)
Commit	Line	Data
ffdd5e50	1	/* Single-precision floating point square root.
688903eb	2	Copyright (C) 1997-2018 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6 PE	16	License along with the GNU C Library; if not, see
59ba27a6 PE	17	<http://www.gnu.org/licenses/>. */
ffdd5e50 UD	18
	19	#include <math.h>
	20	#include <math_private.h>
	21	#include <fenv_libc.h>
	22	#include <inttypes.h>
e054f494	23	#include <stdint.h>
ffdd5e50 UD	24	#include <sysdep.h>
ffdd5e50 UD	25	#include <ldsodefs.h>
ffdd5e50	26
08cee2a4	27	#ifndef _ARCH_PPCSQ
ffdd5e50 UD	28	static const float almost_half = 0.50000006; /* 0.5 + 2^-24 */
	29	static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
	30	static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
	31	static const float two48 = 281474976710656.0;
	32	static const float twom24 = 5.9604644775390625e-8;
	33	extern const float __t_sqrt[1024];
	34
	35	/* The method is based on a description in
	36	Computation of elementary functions on the IBM RISC System/6000 processor,
	37	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	38	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	39	one to find the actual square root, and one to find its reciprocal
	40	without the expense of a division operation. The tricky bit here
	41	is the use of the POWER/PowerPC multiply-add operation to get the
	42	required accuracy with high speed.
	43
	44	The argument reduction works by a combination of table lookup to
	45	obtain the initial guesses, and some careful modification of the
	46	generated guesses (which mostly runs on the integer unit, while the
868f7a40	47	Newton-Raphson is running on the FPU). */
ffdd5e50	48
ffdd5e50 UD	49	float
ffdd5e50 UD	50	__slow_ieee754_sqrtf (float x)
ffdd5e50 UD	51	{
	52	const float inf = a_inf.value;
	53
	54	if (x > 0)
	55	{
	56	if (x != inf)
	57	{
	58	/* Variables named starting with 's' exist in the
	59	argument-reduced space, so that 2 > sx >= 0.5,
	60	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	61	Variables named ending with 'i' are integer versions of
	62	floating-point values. */
	63	float sx; /* The value of which we're trying to find the square
	64	root. */
	65	float sg, g; /* Guess of the square root of x. */
	66	float sd, d; /* Difference between the square of the guess and x. */
	67	float sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	68	float sy2; /* 2*sy */
	69	float e; /* Difference between y*g and 1/2 (note that e==se). */
	70	float shx; /* == sx * fsg */
	71	float fsg; /* sg*fsg == g. */
	72	fenv_t fe; /* Saved floating-point environment (stores rounding
	73	mode and whether the inexact exception is
	74	enabled). */
	75	uint32_t xi, sxi, fsgi;
	76	const float *t_sqrt;
	77
	78	GET_FLOAT_WORD (xi, x);
	79	fe = fegetenv_register ();
	80	relax_fenv_state ();
	81	sxi = (xi & 0x3fffffff) | 0x3f000000;
	82	SET_FLOAT_WORD (sx, sxi);
	83	t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	84	sg = t_sqrt[0];
	85	sy = t_sqrt[1];
	86
868f7a40	87	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	88	division and a square root and the remainder of the
ffdd5e50 UD	89	argument reduction, all interleaved. */
95c26233	90	sd = -__builtin_fmaf (sg, sg, -sx);
ffdd5e50 UD	91	fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
ffdd5e50 UD	92	sy2 = sy + sy;
95c26233 JM	93	sg = __builtin_fmaf (sy, sd, sg); /* 16-bit approximation to
	94	sqrt(sx). */
	95	e = -__builtin_fmaf (sy, sg, -almost_half);
ffdd5e50	96	SET_FLOAT_WORD (fsg, fsgi);
95c26233 JM	97	sd = -__builtin_fmaf (sg, sg, -sx);
95c26233 JM	98	sy = __builtin_fmaf (e, sy2, sy);
ffdd5e50 UD	99	if ((xi & 0x7f800000) == 0)
	100	goto denorm;
	101	shx = sx * fsg;
95c26233 JM	102	sg = __builtin_fmaf (sy, sd, sg); /* 32-bit approximation to
	103	sqrt(sx), but perhaps
	104	rounded incorrectly. */
ffdd5e50 UD	105	sy2 = sy + sy;
ffdd5e50 UD	106	g = sg * fsg;
95c26233 JM	107	e = -__builtin_fmaf (sy, sg, -almost_half);
	108	d = -__builtin_fmaf (g, sg, -shx);
	109	sy = __builtin_fmaf (e, sy2, sy);
ffdd5e50	110	fesetenv_register (fe);
95c26233	111	return __builtin_fmaf (sy, d, g);
ffdd5e50 UD	112	denorm:
	113	/* For denormalised numbers, we normalise, calculate the
	114	square root, and return an adjusted result. */
	115	fesetenv_register (fe);
	116	return __slow_ieee754_sqrtf (x * two48) * twom24;
	117	}
	118	}
	119	else if (x < 0)
	120	{
	121	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	122	FE_INVALID_SQRT. */
ffdd5e50 UD	123	#ifdef FE_INVALID_SQRT
ffdd5e50 UD	124	feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	125
c3a0ead4 UD	126	fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca	127	if ((u.l & FE_INVALID) == 0)
ffdd5e50 UD	128	#endif
	129	feraiseexcept (FE_INVALID);
	130	x = a_nan.value;
	131	}
	132	return f_washf (x);
	133	}
08cee2a4	134	#endif /* _ARCH_PPCSQ */
ffdd5e50	135
8a6d5255	136	#undef __ieee754_sqrtf
ffdd5e50 UD	137	float
ffdd5e50 UD	138	__ieee754_sqrtf (float x)
ffdd5e50 UD	139	{
	140	double z;
	141
08cee2a4 AZ	142	#ifdef _ARCH_PPCSQ
	143	asm ("fsqrts %0,%1\n" :"=f" (z):"f" (x));
	144	#else
	145	z = __slow_ieee754_sqrtf (x);
	146	#endif
ffdd5e50 UD	147
	148	return z;
	149	}
0ac5ae23	150	strong_alias (__ieee754_sqrtf, __sqrtf_finite)