[thirdparty/gcc.git] / libgcc / config / libbid / bid64_sqrt.c

/* Copyright (C) 2007, 2009  Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/*****************************************************************************
 *    BID64 square root
 *****************************************************************************
 *
 *  Algorithm description:
 *
 *  if(exponent_x is odd)
 *     scale coefficient_x by 10, adjust exponent
 *  - get lower estimate for number of digits in coefficient_x
 *  - scale coefficient x to between 31 and 33 decimal digits
 *  - in parallel, check for exact case and return if true
 *  - get high part of result coefficient using double precision sqrt
 *  - compute remainder and refine coefficient in one iteration (which 
 *                                 modifies it by at most 1)
 *  - result exponent is easy to compute from the adjusted arg. exponent 
 *
 ****************************************************************************/

#include "bid_internal.h"
#include "bid_sqrt_macros.h"
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
#include <fenv.h>

#define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
#endif

extern double sqrt (double);

#if DECIMAL_CALL_BY_REFERENCE

void
bid64_sqrt (UINT64 * pres,
	    UINT64 *
	    px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	    _EXC_INFO_PARAM) {
  UINT64 x;
#else

UINT64
bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
	    _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
#endif
  UINT128 CA, CT;
  UINT64 sign_x, coefficient_x;
  UINT64 Q, Q2, A10, C4, R, R2, QE, res;
  SINT64 D;
  int_double t_scale;
  int_float tempx;
  double da, dq, da_h, da_l, dqe;
  int exponent_x, exponent_q, bin_expon_cx;
  int digits_x;
  int scale;
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
  fexcept_t binaryflags = 0;
#endif

#if DECIMAL_CALL_BY_REFERENCE
#if !DECIMAL_GLOBAL_ROUNDING
  _IDEC_round rnd_mode = *prnd_mode;
#endif
  x = *px;
#endif

  // unpack arguments, check for NaN or Infinity
  if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
    // x is Inf. or NaN or 0
    if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
      res = coefficient_x;
      if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64)	// -Infinity
      {
	res = NAN_MASK64;
#ifdef SET_STATUS_FLAGS
	__set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
      }
#ifdef SET_STATUS_FLAGS
      if ((x & SNAN_MASK64) == SNAN_MASK64)	// sNaN
	__set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
      BID_RETURN (res & QUIET_MASK64);
    }
    // x is 0
    exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
    res = sign_x | (((UINT64) exponent_x) << 53);
    BID_RETURN (res);
  }
  // x<0?
  if (sign_x && coefficient_x) {
    res = NAN_MASK64;
#ifdef SET_STATUS_FLAGS
    __set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
    BID_RETURN (res);
  }
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
  (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif
  //--- get number of bits in the coefficient of x ---
  tempx.d = (float) coefficient_x;
  bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
  digits_x = estimate_decimal_digits[bin_expon_cx];
  // add test for range
  if (coefficient_x >= power10_index_binexp[bin_expon_cx])
    digits_x++;

  A10 = coefficient_x;
  if (exponent_x & 1) {
    A10 = (A10 << 2) + A10;
    A10 += A10;
  }

  dqe = sqrt ((double) A10);
  QE = (UINT32) dqe;
  if (QE * QE == A10) {
    res =
      very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
			   QE);
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
    (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif
    BID_RETURN (res);
  }
  // if exponent is odd, scale coefficient by 10
  scale = 31 - digits_x;
  exponent_q = exponent_x - scale;
  scale += (exponent_q & 1);	// exp. bias is even

  CT = power10_table_128[scale];
  __mul_64x128_short (CA, coefficient_x, CT);

  // 2^64
  t_scale.i = 0x43f0000000000000ull;
  // convert CA to DP
  da_h = CA.w[1];
  da_l = CA.w[0];
  da = da_h * t_scale.d + da_l;

  dq = sqrt (da);

  Q = (UINT64) dq;

  // get sign(sqrt(CA)-Q)
  R = CA.w[0] - Q * Q;
  R = ((SINT64) R) >> 63;
  D = R + R + 1;

  exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;

#ifdef SET_STATUS_FLAGS
  __set_status_flags (pfpsf, INEXACT_EXCEPTION);
#endif

#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
  if (!((rnd_mode) & 3)) {
#endif
#endif

    // midpoint to check
    Q2 = Q + Q + D;
    C4 = CA.w[0] << 2;

    // get sign(-sqrt(CA)+Midpoint)
    R2 = Q2 * Q2 - C4;
    R2 = ((SINT64) R2) >> 63;

    // adjust Q if R!=R2
    Q += (D & (R ^ R2));
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
  } else {
    C4 = CA.w[0];
    Q += D;
    if ((SINT64) (Q * Q - C4) > 0)
      Q--;
    if (rnd_mode == ROUNDING_UP)
      Q++;
  }
#endif
#endif

  res = fast_get_BID64 (0, exponent_q, Q);
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
  (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif
  BID_RETURN (res);
}


TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)

     UINT256 M256, C4, C8;
     UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
       mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
SINT64 D;
int_float fx, f64;
int exponent_x, bin_expon_cx, done = 0;
int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
fexcept_t binaryflags = 0;
#endif

	// unpack arguments, check for NaN or Infinity
if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
  res = CX.w[1];
  // NaN ?
  if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
#ifdef SET_STATUS_FLAGS
    if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull)	// sNaN
      __set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
    Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
    Tmp.w[0] = CX.w[0];
    TP128 = reciprocals10_128[18];
    __mul_128x128_full (Qh, Ql, Tmp, TP128);
    amount = recip_scale[18];
    __shr_128 (Tmp, Qh, amount);
    res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
    BID_RETURN (res);
  }
  // x is Infinity?
  if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
    if (sign_x) {
      // -Inf, return NaN
      res = 0x7c00000000000000ull;
#ifdef SET_STATUS_FLAGS
      __set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
    }
    BID_RETURN (res);
  }
  // x is 0 otherwise

  exponent_x =
    ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
    DECIMAL_EXPONENT_BIAS;
  if (exponent_x < 0)
    exponent_x = 0;
  if (exponent_x > DECIMAL_MAX_EXPON_64)
    exponent_x = DECIMAL_MAX_EXPON_64;
  //res= sign_x | (((UINT64)exponent_x)<<53);
  res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
  BID_RETURN (res);
}
if (sign_x) {
  res = 0x7c00000000000000ull;
#ifdef SET_STATUS_FLAGS
  __set_status_flags (pfpsf, INVALID_EXCEPTION);
#endif
  BID_RETURN (res);
}
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
(void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif

	   // 2^64
f64.i = 0x5f800000;

	   // fx ~ CX
fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
digits = estimate_decimal_digits[bin_expon_cx];

A10 = CX;
if (exponent_x & 1) {
  A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
  A10.w[0] = CX.w[0] << 3;
  CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
  CX2.w[0] = CX.w[0] << 1;
  __add_128_128 (A10, A10, CX2);
}

C256.w[1] = A10.w[1];
C256.w[0] = A10.w[0];
CS.w[0] = short_sqrt128 (A10);
CS.w[1] = 0;
mul_factor = 0;
	   // check for exact result  
if (CS.w[0] < 10000000000000000ull) {
  if (CS.w[0] * CS.w[0] == A10.w[0]) {
    __sqr64_fast (S2, CS.w[0]);
    if (S2.w[1] == A10.w[1])	// && S2.w[0]==A10.w[0])
    {
      res =
	get_BID64 (0,
		   ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
		   DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
      (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif
      BID_RETURN (res);
    }
  }
  if (CS.w[0] >= 1000000000000000ull) {
    done = 1;
    exponent_q = exponent_x;
    C256.w[1] = A10.w[1];
    C256.w[0] = A10.w[0];
  }
#ifdef SET_STATUS_FLAGS
  __set_status_flags (pfpsf, INEXACT_EXCEPTION);
#endif
  exact = 0;
} else {
  B10 = 0x3333333333333334ull;
  __mul_64x64_to_128_full (CS2, CS.w[0], B10);
  CS0 = CS2.w[1] >> 1;
  if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
#ifdef SET_STATUS_FLAGS
    __set_status_flags (pfpsf, INEXACT_EXCEPTION);
#endif
    exact = 0;
  }
  done = 1;
  CS.w[0] = CS0;
  exponent_q = exponent_x + 2;
  mul_factor = 10;
  mul_factor2 = 100;
  if (CS.w[0] >= 10000000000000000ull) {
    __mul_64x64_to_128_full (CS2, CS.w[0], B10);
    CS0 = CS2.w[1] >> 1;
    if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
#ifdef SET_STATUS_FLAGS
      __set_status_flags (pfpsf, INEXACT_EXCEPTION);
#endif
      exact = 0;
    }
    exponent_q += 2;
    CS.w[0] = CS0;
    mul_factor = 100;
    mul_factor2 = 10000;
  }
  if (exact) {
    CS0 = CS.w[0] * mul_factor;
    __sqr64_fast (CS1, CS0)
      if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
#ifdef SET_STATUS_FLAGS
      __set_status_flags (pfpsf, INEXACT_EXCEPTION);
#endif
      exact = 0;
    }
  }
}

if (!done) {
  // get number of digits in CX
  D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
  if (D > 0
      || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
    digits++;

  // if exponent is odd, scale coefficient by 10
  scale = 31 - digits;
  exponent_q = exponent_x - scale;
  scale += (exponent_q & 1);	// exp. bias is even

  T128 = power10_table_128[scale];
  __mul_128x128_low (C256, CX, T128);


  CS.w[0] = short_sqrt128 (C256);
}
   //printf("CS=%016I64x\n",CS.w[0]);

exponent_q =
  ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
  DECIMAL_EXPONENT_BIAS;
if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
  extra_digits = -exponent_q;
  exponent_q = 0;

  // get coeff*(2^M[extra_digits])/10^extra_digits
  __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);

  // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
  amount = short_recip_scale[extra_digits];

  CS0 = QH.w[1] >> amount;

#ifdef SET_STATUS_FLAGS
  if (exact) {
    if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
      exact = 0;
  }
  if (!exact)
    __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
#endif

  CS.w[0] = CS0;
  if (!mul_factor)
    mul_factor = 1;
  mul_factor *= power10_table_128[extra_digits].w[0];
  __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
  if (mul_factor2_long.w[1])
    mul_factor2 = 0;
  else
    mul_factor2 = mul_factor2_long.w[1];
}
	   // 4*C256
C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
C4.w[0] = C256.w[0] << 2;

#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
if (!((rnd_mode) & 3)) {
#endif
#endif
  // compare to midpoints
  CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
  if (mul_factor)
    CSM.w[0] *= mul_factor;
  // CSM^2
  __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
  //__mul_128x128_to_256(M256, CSM, CSM);

  if (C4.w[1] > M256.w[1] ||
      (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
    // round up
    CS.w[0]++;
  } else {
    C8.w[0] = CS.w[0] << 3;
    C8.w[1] = 0;
    if (mul_factor) {
      if (mul_factor2) {
	__mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
      } else {
	__mul_64x128_low (C8, C8.w[0], mul_factor2_long);
      }
    }
    // M256 - 8*CSM
    __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
    M256.w[1] = M256.w[1] - C8.w[1] - Carry;

    // if CSM' > C256, round up
    if (M256.w[1] > C4.w[1] ||
	(M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
      // round down
      if (CS.w[0])
	CS.w[0]--;
    }
  }
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
} else {
  CS.w[0]++;
  CSM.w[0] = CS.w[0];
  C8.w[0] = CSM.w[0] << 1;
  if (mul_factor)
    CSM.w[0] *= mul_factor;
  __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
  C8.w[1] = 0;
  if (mul_factor) {
    if (mul_factor2) {
      __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
    } else {
      __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
    }
  }
  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);

  if (M256.w[1] > C256.w[1] ||
      (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
    __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
    M256.w[1] = M256.w[1] - Carry - C8.w[1];
    M256.w[0]++;
    if (!M256.w[0]) {
      M256.w[1]++;

    }

    if ((M256.w[1] > C256.w[1] ||
	 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
	&& (CS.w[0] > 1)) {

      CS.w[0]--;

      if (CS.w[0] > 1) {
	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
	M256.w[1] = M256.w[1] - Carry - C8.w[1];
	M256.w[0]++;
	if (!M256.w[0]) {
	  M256.w[1]++;
	}

	if (M256.w[1] > C256.w[1] ||
	    (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
	  CS.w[0]--;
      }
    }
  }

  else {
				/*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
				M256.w[1] = M256.w[1] + Carry + C8.w[1];
				M256.w[0]++;
				if(!M256.w[0]) 
				{
					M256.w[1]++;
				}
				CS.w[0]++;
			if(M256.w[1]<C256.w[1] ||
				(M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
			{
				CS.w[0]++;
			}*/
    CS.w[0]++;
  }
  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
  // RU?
  if (((rnd_mode) != ROUNDING_UP) || exact) {
    if (CS.w[0])
      CS.w[0]--;
  }

}
#endif
#endif
 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);

res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
#ifdef UNCHANGED_BINARY_STATUS_FLAGS
(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
#endif
BID_RETURN (res);


}
Commit	Line	Data
748086b7	1	/* Copyright (C) 2007, 2009 Free Software Foundation, Inc.
200359e8 L	2
	3	This file is part of GCC.
	4
	5	GCC is free software; you can redistribute it and/or modify it under
	6	the terms of the GNU General Public License as published by the Free
748086b7	7	Software Foundation; either version 3, or (at your option) any later
200359e8 L	8	version.
200359e8 L	9
200359e8 L	10	GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	11	WARRANTY; without even the implied warranty of MERCHANTABILITY or
	12	FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	13	for more details.
	14
748086b7 JJ	15	Under Section 7 of GPL version 3, you are granted additional
	16	permissions described in the GCC Runtime Library Exception, version
	17	3.1, as published by the Free Software Foundation.
	18
	19	You should have received a copy of the GNU General Public License and
	20	a copy of the GCC Runtime Library Exception along with this program;
	21	see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	22	<http://www.gnu.org/licenses/>. */
200359e8 L	23
	24	/*****************************************************************************
	25	* BID64 square root
	26	*****************************************************************************
	27	*
	28	* Algorithm description:
	29	*
	30	* if(exponent_x is odd)
	31	* scale coefficient_x by 10, adjust exponent
	32	* - get lower estimate for number of digits in coefficient_x
	33	* - scale coefficient x to between 31 and 33 decimal digits
	34	* - in parallel, check for exact case and return if true
	35	* - get high part of result coefficient using double precision sqrt
	36	* - compute remainder and refine coefficient in one iteration (which
	37	* modifies it by at most 1)
	38	* - result exponent is easy to compute from the adjusted arg. exponent
	39	*
	40	****************************************************************************/
	41
	42	#include "bid_internal.h"
b2a00c89 L	43	#include "bid_sqrt_macros.h"
	44	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	45	#include <fenv.h>
	46
	47	#define FE_ALL_FLAGS FE_INVALID\|FE_DIVBYZERO\|FE_OVERFLOW\|FE_UNDERFLOW\|FE_INEXACT
	48	#endif
200359e8 L	49
	50	extern double sqrt (double);
	51
	52	#if DECIMAL_CALL_BY_REFERENCE
	53
	54	void
b2a00c89	55	bid64_sqrt (UINT64 * pres,
200359e8 L	56	UINT64 *
	57	px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
	58	_EXC_INFO_PARAM) {
	59	UINT64 x;
	60	#else
	61
	62	UINT64
b2a00c89	63	bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
200359e8 L	64	_EXC_MASKS_PARAM _EXC_INFO_PARAM) {
	65	#endif
	66	UINT128 CA, CT;
	67	UINT64 sign_x, coefficient_x;
	68	UINT64 Q, Q2, A10, C4, R, R2, QE, res;
	69	SINT64 D;
	70	int_double t_scale;
	71	int_float tempx;
	72	double da, dq, da_h, da_l, dqe;
	73	int exponent_x, exponent_q, bin_expon_cx;
	74	int digits_x;
	75	int scale;
b2a00c89 L	76	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	77	fexcept_t binaryflags = 0;
	78	#endif
200359e8 L	79
	80	#if DECIMAL_CALL_BY_REFERENCE
	81	#if !DECIMAL_GLOBAL_ROUNDING
	82	_IDEC_round rnd_mode = *prnd_mode;
	83	#endif
	84	x = *px;
	85	#endif
	86
	87	// unpack arguments, check for NaN or Infinity
	88	if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
	89	// x is Inf. or NaN or 0
200359e8	90	if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
b2a00c89 L	91	res = coefficient_x;
b2a00c89 L	92	if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64) // -Infinity
200359e8 L	93	{
	94	res = NAN_MASK64;
	95	#ifdef SET_STATUS_FLAGS
	96	__set_status_flags (pfpsf, INVALID_EXCEPTION);
	97	#endif
	98	}
	99	#ifdef SET_STATUS_FLAGS
	100	if ((x & SNAN_MASK64) == SNAN_MASK64) // sNaN
	101	__set_status_flags (pfpsf, INVALID_EXCEPTION);
	102	#endif
	103	BID_RETURN (res & QUIET_MASK64);
	104	}
	105	// x is 0
	106	exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
	107	res = sign_x \| (((UINT64) exponent_x) << 53);
	108	BID_RETURN (res);
	109	}
	110	// x<0?
	111	if (sign_x && coefficient_x) {
	112	res = NAN_MASK64;
	113	#ifdef SET_STATUS_FLAGS
	114	__set_status_flags (pfpsf, INVALID_EXCEPTION);
	115	#endif
	116	BID_RETURN (res);
	117	}
b2a00c89 L	118	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	119	(void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
	120	#endif
200359e8 L	121	//--- get number of bits in the coefficient of x ---
	122	tempx.d = (float) coefficient_x;
	123	bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
b2a00c89	124	digits_x = estimate_decimal_digits[bin_expon_cx];
200359e8	125	// add test for range
b2a00c89	126	if (coefficient_x >= power10_index_binexp[bin_expon_cx])
200359e8 L	127	digits_x++;
	128
	129	A10 = coefficient_x;
	130	if (exponent_x & 1) {
	131	A10 = (A10 << 2) + A10;
	132	A10 += A10;
	133	}
	134
	135	dqe = sqrt ((double) A10);
	136	QE = (UINT32) dqe;
	137	if (QE * QE == A10) {
	138	res =
	139	very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
	140	QE);
b2a00c89 L	141	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	142	(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
	143	#endif
200359e8 L	144	BID_RETURN (res);
	145	}
	146	// if exponent is odd, scale coefficient by 10
	147	scale = 31 - digits_x;
	148	exponent_q = exponent_x - scale;
	149	scale += (exponent_q & 1); // exp. bias is even
	150
b2a00c89	151	CT = power10_table_128[scale];
200359e8 L	152	__mul_64x128_short (CA, coefficient_x, CT);
	153
	154	// 2^64
	155	t_scale.i = 0x43f0000000000000ull;
	156	// convert CA to DP
	157	da_h = CA.w[1];
	158	da_l = CA.w[0];
	159	da = da_h * t_scale.d + da_l;
	160
	161	dq = sqrt (da);
	162
	163	Q = (UINT64) dq;
	164
	165	// get sign(sqrt(CA)-Q)
	166	R = CA.w[0] - Q * Q;
	167	R = ((SINT64) R) >> 63;
	168	D = R + R + 1;
	169
	170	exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
	171
	172	#ifdef SET_STATUS_FLAGS
	173	__set_status_flags (pfpsf, INEXACT_EXCEPTION);
	174	#endif
	175
	176	#ifndef IEEE_ROUND_NEAREST
	177	#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
	178	if (!((rnd_mode) & 3)) {
	179	#endif
	180	#endif
	181
	182	// midpoint to check
	183	Q2 = Q + Q + D;
	184	C4 = CA.w[0] << 2;
	185
	186	// get sign(-sqrt(CA)+Midpoint)
	187	R2 = Q2 * Q2 - C4;
	188	R2 = ((SINT64) R2) >> 63;
	189
	190	// adjust Q if R!=R2
	191	Q += (D & (R ^ R2));
	192	#ifndef IEEE_ROUND_NEAREST
	193	#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
	194	} else {
	195	C4 = CA.w[0];
	196	Q += D;
	197	if ((SINT64) (Q * Q - C4) > 0)
	198	Q--;
	199	if (rnd_mode == ROUNDING_UP)
	200	Q++;
	201	}
	202	#endif
	203	#endif
	204
	205	res = fast_get_BID64 (0, exponent_q, Q);
b2a00c89 L	206	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	207	(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
	208	#endif
	209	BID_RETURN (res);
	210	}
	211
	212
	213	TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
	214
	215	UINT256 M256, C4, C8;
	216	UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
	217	mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
	218	UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
	219	SINT64 D;
	220	int_float fx, f64;
	221	int exponent_x, bin_expon_cx, done = 0;
	222	int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
	223	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	224	fexcept_t binaryflags = 0;
	225	#endif
	226
	227	// unpack arguments, check for NaN or Infinity
	228	if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
	229	res = CX.w[1];
	230	// NaN ?
	231	if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
	232	#ifdef SET_STATUS_FLAGS
	233	if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull) // sNaN
	234	__set_status_flags (pfpsf, INVALID_EXCEPTION);
	235	#endif
	236	Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
	237	Tmp.w[0] = CX.w[0];
	238	TP128 = reciprocals10_128[18];
	239	__mul_128x128_full (Qh, Ql, Tmp, TP128);
	240	amount = recip_scale[18];
	241	__shr_128 (Tmp, Qh, amount);
	242	res = (CX.w[1] & 0xfc00000000000000ull) \| Tmp.w[0];
	243	BID_RETURN (res);
	244	}
	245	// x is Infinity?
	246	if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
	247	if (sign_x) {
	248	// -Inf, return NaN
	249	res = 0x7c00000000000000ull;
	250	#ifdef SET_STATUS_FLAGS
	251	__set_status_flags (pfpsf, INVALID_EXCEPTION);
	252	#endif
	253	}
	254	BID_RETURN (res);
	255	}
	256	// x is 0 otherwise
	257
	258	exponent_x =
	259	((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
	260	DECIMAL_EXPONENT_BIAS;
	261	if (exponent_x < 0)
	262	exponent_x = 0;
	263	if (exponent_x > DECIMAL_MAX_EXPON_64)
	264	exponent_x = DECIMAL_MAX_EXPON_64;
	265	//res= sign_x \| (((UINT64)exponent_x)<<53);
	266	res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
	267	BID_RETURN (res);
	268	}
	269	if (sign_x) {
270	res = 0x7c00000000000000ull;
271	#ifdef SET_STATUS_FLAGS
272	__set_status_flags (pfpsf, INVALID_EXCEPTION);
273	#endif
200359e8 L	274	BID_RETURN (res);
200359e8 L	275	}
b2a00c89 L	276	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	277	(void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
	278	#endif
	279
	280	// 2^64
	281	f64.i = 0x5f800000;
	282
	283	// fx ~ CX
	284	fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
	285	bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
	286	digits = estimate_decimal_digits[bin_expon_cx];
	287
	288	A10 = CX;
	289	if (exponent_x & 1) {
	290	A10.w[1] = (CX.w[1] << 3) \| (CX.w[0] >> 61);
	291	A10.w[0] = CX.w[0] << 3;
	292	CX2.w[1] = (CX.w[1] << 1) \| (CX.w[0] >> 63);
	293	CX2.w[0] = CX.w[0] << 1;
	294	__add_128_128 (A10, A10, CX2);
	295	}
	296
	297	C256.w[1] = A10.w[1];
	298	C256.w[0] = A10.w[0];
	299	CS.w[0] = short_sqrt128 (A10);
	300	CS.w[1] = 0;
	301	mul_factor = 0;
	302	// check for exact result
	303	if (CS.w[0] < 10000000000000000ull) {
	304	if (CS.w[0] * CS.w[0] == A10.w[0]) {
	305	__sqr64_fast (S2, CS.w[0]);
	306	if (S2.w[1] == A10.w[1]) // && S2.w[0]==A10.w[0])
	307	{
	308	res =
	309	get_BID64 (0,
	310	((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
	311	DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
	312	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
	313	(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
	314	#endif
	315	BID_RETURN (res);
	316	}
	317	}
	318	if (CS.w[0] >= 1000000000000000ull) {
	319	done = 1;
	320	exponent_q = exponent_x;
	321	C256.w[1] = A10.w[1];
	322	C256.w[0] = A10.w[0];
	323	}
	324	#ifdef SET_STATUS_FLAGS
	325	__set_status_flags (pfpsf, INEXACT_EXCEPTION);
	326	#endif
	327	exact = 0;
	328	} else {
	329	B10 = 0x3333333333333334ull;
	330	__mul_64x64_to_128_full (CS2, CS.w[0], B10);
	331	CS0 = CS2.w[1] >> 1;
	332	if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
	333	#ifdef SET_STATUS_FLAGS
	334	__set_status_flags (pfpsf, INEXACT_EXCEPTION);
	335	#endif
	336	exact = 0;
	337	}
	338	done = 1;
	339	CS.w[0] = CS0;
340	exponent_q = exponent_x + 2;
341	mul_factor = 10;
342	mul_factor2 = 100;
343	if (CS.w[0] >= 10000000000000000ull) {
344	__mul_64x64_to_128_full (CS2, CS.w[0], B10);
345	CS0 = CS2.w[1] >> 1;
346	if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
347	#ifdef SET_STATUS_FLAGS
348	__set_status_flags (pfpsf, INEXACT_EXCEPTION);
349	#endif
350	exact = 0;
351	}
352	exponent_q += 2;
353	CS.w[0] = CS0;
354	mul_factor = 100;
355	mul_factor2 = 10000;
356	}
357	if (exact) {
358	CS0 = CS.w[0] * mul_factor;
359	__sqr64_fast (CS1, CS0)
360	if ((CS1.w[0] != A10.w[0]) \|\| (CS1.w[1] != A10.w[1])) {
361	#ifdef SET_STATUS_FLAGS
362	__set_status_flags (pfpsf, INEXACT_EXCEPTION);
363	#endif
364	exact = 0;
365	}
366	}
367	}
368
369	if (!done) {
370	// get number of digits in CX
371	D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
372	if (D > 0
373	\|\| (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
374	digits++;
375
376	// if exponent is odd, scale coefficient by 10
377	scale = 31 - digits;
378	exponent_q = exponent_x - scale;
379	scale += (exponent_q & 1); // exp. bias is even
380
381	T128 = power10_table_128[scale];
382	__mul_128x128_low (C256, CX, T128);
383
384
385	CS.w[0] = short_sqrt128 (C256);
386	}
387	//printf("CS=%016I64x\n",CS.w[0]);
388
389	exponent_q =
390	((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
391	DECIMAL_EXPONENT_BIAS;
392	if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
393	extra_digits = -exponent_q;
394	exponent_q = 0;
395
396	// get coeff*(2^M[extra_digits])/10^extra_digits
397	__mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
398
399	// now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
400	amount = short_recip_scale[extra_digits];
401
402	CS0 = QH.w[1] >> amount;
403
404	#ifdef SET_STATUS_FLAGS
405	if (exact) {
406	if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
407	exact = 0;
408	}
409	if (!exact)
410	__set_status_flags (pfpsf, UNDERFLOW_EXCEPTION \| INEXACT_EXCEPTION);
411	#endif
412
413	CS.w[0] = CS0;
414	if (!mul_factor)
415	mul_factor = 1;
416	mul_factor *= power10_table_128[extra_digits].w[0];
417	__mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
418	if (mul_factor2_long.w[1])
419	mul_factor2 = 0;
420	else
421	mul_factor2 = mul_factor2_long.w[1];
422	}
423	// 4*C256
424	C4.w[1] = (C256.w[1] << 2) \| (C256.w[0] >> 62);
425	C4.w[0] = C256.w[0] << 2;
426
427	#ifndef IEEE_ROUND_NEAREST
428	#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
429	if (!((rnd_mode) & 3)) {
430	#endif
431	#endif
432	// compare to midpoints
433	CSM.w[0] = (CS.w[0] + CS.w[0]) \| 1;
434	//printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
435	if (mul_factor)
436	CSM.w[0] *= mul_factor;
437	// CSM^2
438	__mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
439	//__mul_128x128_to_256(M256, CSM, CSM);
440
441	if (C4.w[1] > M256.w[1] \|\|
442	(C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
443	// round up
444	CS.w[0]++;
445	} else {
446	C8.w[0] = CS.w[0] << 3;
447	C8.w[1] = 0;
448	if (mul_factor) {
449	if (mul_factor2) {
450	__mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
451	} else {
452	__mul_64x128_low (C8, C8.w[0], mul_factor2_long);
453	}
454	}
455	// M256 - 8*CSM
456	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
457	M256.w[1] = M256.w[1] - C8.w[1] - Carry;
458
459	// if CSM' > C256, round up
460	if (M256.w[1] > C4.w[1] \|\|
461	(M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
462	// round down
463	if (CS.w[0])
464	CS.w[0]--;
465	}
466	}
467	#ifndef IEEE_ROUND_NEAREST
468	#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
469	} else {
470	CS.w[0]++;
471	CSM.w[0] = CS.w[0];
472	C8.w[0] = CSM.w[0] << 1;
473	if (mul_factor)
474	CSM.w[0] *= mul_factor;
475	__mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
476	C8.w[1] = 0;
477	if (mul_factor) {
478	if (mul_factor2) {
479	__mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
480	} else {
481	__mul_64x128_low (C8, C8.w[0], mul_factor2_long);
482	}
483	}
484	//printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
485
486	if (M256.w[1] > C256.w[1] \|\|
487	(M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
488	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
489	M256.w[1] = M256.w[1] - Carry - C8.w[1];
490	M256.w[0]++;
491	if (!M256.w[0]) {
492	M256.w[1]++;
493
494	}
495
496	if ((M256.w[1] > C256.w[1] \|\|
497	(M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
498	&& (CS.w[0] > 1)) {
499
500	CS.w[0]--;
501
502	if (CS.w[0] > 1) {
503	__sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
504	M256.w[1] = M256.w[1] - Carry - C8.w[1];
505	M256.w[0]++;
506	if (!M256.w[0]) {
507	M256.w[1]++;
508	}
509
510	if (M256.w[1] > C256.w[1] \|\|
511	(M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
512	CS.w[0]--;
513	}
514	}
515	}
516
517	else {
518	/*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
519	M256.w[1] = M256.w[1] + Carry + C8.w[1];
520	M256.w[0]++;
521	if(!M256.w[0])
522	{
523	M256.w[1]++;
524	}
525	CS.w[0]++;
526	if(M256.w[1]<C256.w[1] \|\|
527	(M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
528	{
529	CS.w[0]++;
530	}*/
531	CS.w[0]++;
532	}
533	//printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
534	// RU?
535	if (((rnd_mode) != ROUNDING_UP) \|\| exact) {
536	if (CS.w[0])
537	CS.w[0]--;
538	}
539
540	}
541	#endif
542	#endif
543	//printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
544
545	res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
546	#ifdef UNCHANGED_BINARY_STATUS_FLAGS
547	(void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
548	#endif
549	BID_RETURN (res);
550
551
552	}