[thirdparty/glibc.git] / sysdeps / powerpc / fpu / e_sqrtf.c

/* Single-precision floating point square root.
   Copyright (C) 1997-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <fenv.h>
#include <fenv_libc.h>
#include <inttypes.h>
#include <stdint.h>
#include <sysdep.h>
#include <ldsodefs.h>

#ifndef _ARCH_PPCSQ
/* Constants used only by the software fallback below.  */

/* Slightly more than one half; used as the target value when refining
   the estimate of 1/(2*sqrt(sx)).  */
static const float almost_half = 0.50000006;	/* 0.5 + 2^-24 */
/* Value returned for negative arguments, built from the bit pattern
   0x7fc00000 (a single-precision quiet NaN in IEEE 754 encoding).  */
static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
/* Built from the bit pattern 0x7f800000 (+infinity in IEEE 754
   single-precision encoding); compared against the argument to skip
   the iteration for infinite input.  */
static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
/* 2^48: scale factor applied to denormalised arguments before
   recursing.  */
static const float two48 = 281474976710656.0;
/* 2^-24 == 1/sqrt(2^48): undoes the two48 scaling on the result.  */
static const float twom24 = 5.9604644775390625e-8;
/* Lookup table of initial guesses, defined in another translation
   unit.  It is read below as pairs: an even index holds a guess for
   the square root and the following odd index the matching guess for
   1/(2*sqrt).  */
extern const float __t_sqrt[1024];

/* The method is based on a description in
   Computation of elementary functions on the IBM RISC System/6000 processor,
   P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
   Basically, it consists of two interleaved Newton-Raphson approximations,
   one to find the actual square root, and one to find its reciprocal
   without the expense of a division operation.   The tricky bit here
   is the use of the POWER/PowerPC multiply-add operation to get the
   required accuracy with high speed.

   The argument reduction works by a combination of table lookup to
   obtain the initial guesses, and some careful modification of the
   generated guesses (which mostly runs on the integer unit, while the
   Newton-Raphson is running on the FPU).  */

/* Software single-precision square root, used when the hardware
   square-root instruction is not available (_ARCH_PPCSQ undefined).

   For finite X > 0 the result is computed from a table-driven initial
   guess refined with fused multiply-adds; denormalised X is first
   scaled up by two48 and the result scaled back by twom24.
   For X < 0 an invalid-operation exception is raised and a_nan is
   returned.  Every other input (zero, +infinity, NaN) is returned
   unchanged.  The last two cases pass their result through f_washf,
   which is defined outside this file.  */
float
__slow_ieee754_sqrtf (float x)
{
  const float inf = a_inf.value;

  if (x > 0)
    {
      if (x != inf)
	{
	  /* Variables named starting with 's' exist in the
	     argument-reduced space, so that 2 > sx >= 0.5,
	     1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	     Variables named ending with 'i' are integer versions of
	     floating-point values.  */
	  float sx;		/* The value of which we're trying to find the square
				   root.  */
	  float sg, g;		/* Guess of the square root of x.  */
	  float sd, d;		/* Difference between the square of the guess and x.  */
	  float sy;		/* Estimate of 1/2g (overestimated by 1ulp).  */
	  float sy2;		/* 2*sy */
	  float e;		/* Difference between y*g and 1/2 (note that e==se).  */
	  float shx;		/* == sx * fsg */
	  float fsg;		/* sg*fsg == g.  */
	  fenv_t fe;		/* Saved floating-point environment (stores rounding
				   mode and whether the inexact exception is
				   enabled).  */
	  uint32_t xi, sxi, fsgi;
	  const float *t_sqrt;

	  /* Fetch the bit pattern of x into xi.  */
	  GET_FLOAT_WORD (xi, x);
	  /* Save the environment so it can be restored before returning.
	     NOTE(review): relax_fenv_state comes from fenv_libc.h;
	     presumably it puts the FPU in a permissive state for the
	     intermediate arithmetic -- confirm against that header.  */
	  fe = fegetenv_register ();
	  relax_fenv_state ();
	  /* Argument reduction: clear the sign bit and the top exponent
	     bit, then force exponent bits 24..29 on.  Only the lowest
	     exponent bit of x survives, so the biased exponent of sx is
	     126 or 127 and sx lies in [0.5, 2).  */
	  sxi = (xi & 0x3fffffff) | 0x3f000000;
	  SET_FLOAT_WORD (sx, sxi);
	  /* Table index is (xi >> 14) & 0x3fe: the lowest exponent bit
	     followed by the top eight mantissa bits, times two, so it is
	     always even and selects an (sg, sy) pair.  */
	  t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	  sg = t_sqrt[0];
	  sy = t_sqrt[1];

	  /* Here we have three Newton-Raphson iterations each of a
	     division and a square root and the remainder of the
	     argument reduction, all interleaved.   */
	  sd = -__builtin_fmaf (sg, sg, -sx);
	  /* Exponent field of the scale factor fsg: the exponent of x is
	     re-biased and halved, and the mask drops everything but the
	     exponent bits so fsg is a power of two.  */
	  fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
	  sy2 = sy + sy;
	  sg = __builtin_fmaf (sy, sd, sg);	/* 16-bit approximation to
						   sqrt(sx). */
	  e = -__builtin_fmaf (sy, sg, -almost_half);
	  SET_FLOAT_WORD (fsg, fsgi);
	  sd = -__builtin_fmaf (sg, sg, -sx);
	  sy = __builtin_fmaf (e, sy2, sy);
	  /* A zero exponent field with x > 0 means x is denormalised;
	     the reduction above is not valid for it, so take the
	     rescaling path instead.  */
	  if ((xi & 0x7f800000) == 0)
	    goto denorm;
	  shx = sx * fsg;
	  sg = __builtin_fmaf (sy, sd, sg);	/* 32-bit approximation to
						   sqrt(sx), but perhaps
						   rounded incorrectly.  */
	  sy2 = sy + sy;
	  /* Leave the reduced space: g is the guess scaled by fsg.  */
	  g = sg * fsg;
	  e = -__builtin_fmaf (sy, sg, -almost_half);
	  d = -__builtin_fmaf (g, sg, -shx);
	  sy = __builtin_fmaf (e, sy2, sy);
	  /* Restore the caller's environment before the final
	     correction step, so that step runs under the saved
	     environment rather than the relaxed one.  */
	  fesetenv_register (fe);
	  return __builtin_fmaf (sy, d, g);
	denorm:
	  /* For denormalised numbers, we normalise, calculate the
	     square root, and return an adjusted result.  */
	  fesetenv_register (fe);
	  /* sqrt (x * 2^48) * 2^-24 == sqrt (x).  */
	  return __slow_ieee754_sqrtf (x * two48) * twom24;
	}
    }
  else if (x < 0)
    {
      /* For some reason, some PowerPC32 processors don't implement
	 FE_INVALID_SQRT.  */
#ifdef FE_INVALID_SQRT
      feraiseexcept (FE_INVALID_SQRT);

      /* Raise the generic FE_INVALID as well, but only if raising
	 FE_INVALID_SQRT did not already set it.  */
      fenv_union_t u = { .fenv = fegetenv_register () };
      if ((u.l & FE_INVALID) == 0)
#endif
	feraiseexcept (FE_INVALID);
      x = a_nan.value;
    }
  /* Reached for x == 0, x == +inf, NaN input, and the NaN produced
     just above for negative input.  */
  return f_washf (x);
}
#endif /* _ARCH_PPCSQ  */

#undef __ieee754_sqrtf
/* Return the single-precision square root of X.

   When the target provides a hardware square root (_ARCH_PPCSQ is
   defined) a single fsqrts instruction is emitted; otherwise the work
   is delegated to the software routine __slow_ieee754_sqrtf.  */
float
__ieee754_sqrtf (float x)
{
#ifdef _ARCH_PPCSQ
  float root;

  /* Hardware path: X in a floating-point register, result likewise.  */
  asm ("fsqrts	%0,%1\n" :"=f" (root):"f" (x));
  return root;
#else
  /* Software path.  */
  return __slow_ieee754_sqrtf (x);
#endif
}
strong_alias (__ieee754_sqrtf, __sqrtf_finite)
Commit	Line	Data
ffdd5e50	1	/* Single-precision floating point square root.
04277e02	2	Copyright (C) 1997-2019 Free Software Foundation, Inc.
ffdd5e50 UD	3	This file is part of the GNU C Library.
	4
	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Lesser General Public
	7	License as published by the Free Software Foundation; either
	8	version 2.1 of the License, or (at your option) any later version.
	9
	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Lesser General Public License for more details.
	14
	15	You should have received a copy of the GNU Lesser General Public
59ba27a6	16	License along with the GNU C Library; if not, see
5a82c748	17	<https://www.gnu.org/licenses/>. */
ffdd5e50 UD	18
	19	#include <math.h>
	20	#include <math_private.h>
418d99e6	21	#include <fenv.h>
ffdd5e50 UD	22	#include <fenv_libc.h>
ffdd5e50 UD	23	#include <inttypes.h>
e054f494	24	#include <stdint.h>
ffdd5e50 UD	25	#include <sysdep.h>
ffdd5e50 UD	26	#include <ldsodefs.h>
ffdd5e50	27
08cee2a4	28	#ifndef _ARCH_PPCSQ
ffdd5e50 UD	29	static const float almost_half = 0.50000006; /* 0.5 + 2^-24 */
	30	static const ieee_float_shape_type a_nan = {.word = 0x7fc00000 };
	31	static const ieee_float_shape_type a_inf = {.word = 0x7f800000 };
	32	static const float two48 = 281474976710656.0;
	33	static const float twom24 = 5.9604644775390625e-8;
	34	extern const float __t_sqrt[1024];
	35
	36	/* The method is based on a description in
	37	Computation of elementary functions on the IBM RISC System/6000 processor,
	38	P. W. Markstein, IBM J. Res. Develop, 34(1) 1990.
868f7a40	39	Basically, it consists of two interleaved Newton-Raphson approximations,
ffdd5e50 UD	40	one to find the actual square root, and one to find its reciprocal
	41	without the expense of a division operation. The tricky bit here
	42	is the use of the POWER/PowerPC multiply-add operation to get the
	43	required accuracy with high speed.
	44
	45	The argument reduction works by a combination of table lookup to
	46	obtain the initial guesses, and some careful modification of the
	47	generated guesses (which mostly runs on the integer unit, while the
868f7a40	48	Newton-Raphson is running on the FPU). */
ffdd5e50	49
ffdd5e50 UD	50	float
ffdd5e50 UD	51	__slow_ieee754_sqrtf (float x)
ffdd5e50 UD	52	{
	53	const float inf = a_inf.value;
	54
	55	if (x > 0)
	56	{
	57	if (x != inf)
	58	{
	59	/* Variables named starting with 's' exist in the
	60	argument-reduced space, so that 2 > sx >= 0.5,
	61	1.41... > sg >= 0.70.., 0.70.. >= sy > 0.35... .
	62	Variables named ending with 'i' are integer versions of
	63	floating-point values. */
	64	float sx; /* The value of which we're trying to find the square
	65	root. */
	66	float sg, g; /* Guess of the square root of x. */
	67	float sd, d; /* Difference between the square of the guess and x. */
	68	float sy; /* Estimate of 1/2g (overestimated by 1ulp). */
	69	float sy2; /* 2*sy */
	70	float e; /* Difference between y*g and 1/2 (note that e==se). */
	71	float shx; /* == sx * fsg */
	72	float fsg; /* sg*fsg == g. */
	73	fenv_t fe; /* Saved floating-point environment (stores rounding
	74	mode and whether the inexact exception is
	75	enabled). */
	76	uint32_t xi, sxi, fsgi;
	77	const float *t_sqrt;
	78
	79	GET_FLOAT_WORD (xi, x);
	80	fe = fegetenv_register ();
	81	relax_fenv_state ();
	82	sxi = (xi & 0x3fffffff) | 0x3f000000;
	83	SET_FLOAT_WORD (sx, sxi);
	84	t_sqrt = __t_sqrt + (xi >> (23 - 8 - 1) & 0x3fe);
	85	sg = t_sqrt[0];
	86	sy = t_sqrt[1];
	87
868f7a40	88	/* Here we have three Newton-Raphson iterations each of a
ffdd5e50 UD	89	division and a square root and the remainder of the
ffdd5e50 UD	90	argument reduction, all interleaved. */
95c26233	91	sd = -__builtin_fmaf (sg, sg, -sx);
ffdd5e50 UD	92	fsgi = (xi + 0x40000000) >> 1 & 0x7f800000;
ffdd5e50 UD	93	sy2 = sy + sy;
95c26233 JM	94	sg = __builtin_fmaf (sy, sd, sg); /* 16-bit approximation to
	95	sqrt(sx). */
	96	e = -__builtin_fmaf (sy, sg, -almost_half);
ffdd5e50	97	SET_FLOAT_WORD (fsg, fsgi);
95c26233 JM	98	sd = -__builtin_fmaf (sg, sg, -sx);
95c26233 JM	99	sy = __builtin_fmaf (e, sy2, sy);
ffdd5e50 UD	100	if ((xi & 0x7f800000) == 0)
	101	goto denorm;
	102	shx = sx * fsg;
95c26233 JM	103	sg = __builtin_fmaf (sy, sd, sg); /* 32-bit approximation to
	104	sqrt(sx), but perhaps
	105	rounded incorrectly. */
ffdd5e50 UD	106	sy2 = sy + sy;
ffdd5e50 UD	107	g = sg * fsg;
95c26233 JM	108	e = -__builtin_fmaf (sy, sg, -almost_half);
	109	d = -__builtin_fmaf (g, sg, -shx);
	110	sy = __builtin_fmaf (e, sy2, sy);
ffdd5e50	111	fesetenv_register (fe);
95c26233	112	return __builtin_fmaf (sy, d, g);
ffdd5e50 UD	113	denorm:
	114	/* For denormalised numbers, we normalise, calculate the
	115	square root, and return an adjusted result. */
	116	fesetenv_register (fe);
	117	return __slow_ieee754_sqrtf (x * two48) * twom24;
	118	}
	119	}
	120	else if (x < 0)
	121	{
	122	/* For some reason, some PowerPC32 processors don't implement
0ac5ae23	123	FE_INVALID_SQRT. */
ffdd5e50 UD	124	#ifdef FE_INVALID_SQRT
ffdd5e50 UD	125	feraiseexcept (FE_INVALID_SQRT);
c3a0ead4 UD	126
c3a0ead4 UD	127	fenv_union_t u = { .fenv = fegetenv_register () };
4a28b3ca	128	if ((u.l & FE_INVALID) == 0)
ffdd5e50 UD	129	#endif
	130	feraiseexcept (FE_INVALID);
	131	x = a_nan.value;
	132	}
	133	return f_washf (x);
	134	}
08cee2a4	135	#endif /* _ARCH_PPCSQ */
ffdd5e50	136
8a6d5255	137	#undef __ieee754_sqrtf
ffdd5e50 UD	138	float
ffdd5e50 UD	139	__ieee754_sqrtf (float x)
ffdd5e50	140	{
a51bc4fe	141	float z;
ffdd5e50	142
08cee2a4 AZ	143	#ifdef _ARCH_PPCSQ
	144	asm ("fsqrts %0,%1\n" :"=f" (z):"f" (x));
	145	#else
	146	z = __slow_ieee754_sqrtf (x);
	147	#endif
ffdd5e50 UD	148
	149	return z;
	150	}
0ac5ae23	151	strong_alias (__ieee754_sqrtf, __sqrtf_finite)