[thirdparty/gcc.git] / libgcc / soft-fp / op-2.h

/* Software floating-point emulation.
   Basic two-word fraction declaration and manipulation.
   Copyright (C) 1997-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Richard Henderson (rth@cygnus.com),
		  Jakub Jelinek (jj@ultra.linux.cz),
		  David S. Miller (davem@redhat.com) and
		  Peter Maydell (pmaydell@chiark.greenend.org.uk).

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   In addition to the permissions in the GNU Lesser General Public
   License, the Free Software Foundation gives you unlimited
   permission to link the compiled version of this file into
   combinations with other programs, and to distribute those
   combinations without any restriction coming from the use of this
   file.  (The Lesser General Public License restrictions do apply in
   other respects; for example, they cover modification of the file,
   and distribution when not linked into a combine executable.)

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* A two-word fraction called X is kept in two _FP_W_TYPE objects:
   X_f0 holds the low word and X_f1 the high word.  */

/* Declare both words of X (no trailing semicolon).  */
#define _FP_FRAC_DECL_2(X) \
  _FP_W_TYPE X##_f0, X##_f1

/* Copy fraction S into fraction D, low word first.  */
#define _FP_FRAC_COPY_2(D, S) \
  ((D##_f0 = S##_f0), (D##_f1 = S##_f1))

/* Load X from I, where I expands to a "high, low" pair of words
   such as _FP_ZEROFRAC_2.  */
#define _FP_FRAC_SET_2(X, I) \
  __FP_FRAC_SET_2(X, I)

/* The high word, the low word, and word number w (0 or 1) of X.  */
#define _FP_FRAC_HIGH_2(X) \
  (X##_f1)
#define _FP_FRAC_LOW_2(X) \
  (X##_f0)
#define _FP_FRAC_WORD_2(X, w) \
  (X##_f##w)

/* Shift the two-word fraction X left by N bits, in place.  Bits pushed
   out of the high word are lost.  N is evaluated more than once.  */
#define _FP_FRAC_SLL_2(X, N)						\
  (void) (((N) >= _FP_W_TYPE_SIZE)					\
	  ? ({								\
	      /* A full word or more: the low word moves up into the	\
		 high word and the low word becomes zero.  */		\
	      X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);		\
	      X##_f0 = 0;						\
	    })								\
	  : ({								\
	      if (!__builtin_constant_p (N) || (N) != 1)		\
		{							\
		  X##_f1 = (X##_f1 << (N)				\
			    | X##_f0 >> (_FP_W_TYPE_SIZE - (N)));	\
		  X##_f0 <<= (N);					\
		}							\
	      else							\
		{							\
		  /* Compile-time shift by one: double each word and	\
		     carry the old top bit of the low word (seen as	\
		     the sign of the _FP_WS_TYPE view) upward.  */	\
		  X##_f1 = (X##_f1 + X##_f1				\
			    + (((_FP_WS_TYPE) (X##_f0)) < 0));		\
		  X##_f0 += X##_f0;					\
		}							\
	      0;							\
	    }))


/* Shift the two-word fraction X right by N bits, in place, filling with
   zeros.  Bits shifted out of the low word are discarded.  N is evaluated
   more than once.  */
#define _FP_FRAC_SRL_2(X, N)						\
  (void) (((N) >= _FP_W_TYPE_SIZE)					\
	  ? ({								\
	      /* A full word or more: only the high word survives.  */	\
	      X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);		\
	      X##_f1 = 0;						\
	    })								\
	  : ({								\
	      X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N))		\
			| X##_f0 >> (N));				\
	      X##_f1 >>= (N);						\
	    }))

/* Right shift with a separate sticky flag: shift the two-word fraction X
   right by N bits in place and set S to nonzero exactly when one of the
   bits shifted out was set.  S is overwritten, not accumulated.  The sz
   argument is not used.  N is evaluated more than once.  */
#define _FP_FRAC_SRST_2(X, S, N, sz)					\
  (void) (((N) >= _FP_W_TYPE_SIZE)					\
	  ? ({								\
	      /* The whole low word is lost, plus the low		\
		 N - _FP_W_TYPE_SIZE bits of the high word.  */		\
	      S = ((X##_f0						\
		    | ((N) != _FP_W_TYPE_SIZE				\
		       ? (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))		\
		       : 0)) != 0);					\
	      X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
	      X##_f1 = 0;						\
	    })								\
	  : ({								\
	      /* Only the low N bits of the low word are lost.  */	\
	      S = (!__builtin_constant_p (N) || (N) != 1		\
		   ? (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0		\
		   : X##_f0 & 1);					\
	      X##_f0 = (X##_f0 >> (N)					\
			| X##_f1 << (_FP_W_TYPE_SIZE - (N)));		\
	      X##_f1 >>= (N);						\
	    }))

/* Sticky right shift: shift the two-word fraction X right by N bits in
   place and OR a 1 into the least significant bit of the result when any
   of the bits shifted out was set.  The sz argument is not used.  N is
   evaluated more than once.  */
#define _FP_FRAC_SRS_2(X, N, sz)					\
  (void) (((N) >= _FP_W_TYPE_SIZE)					\
	  ? ({								\
	      /* Lost bits: all of the low word and the low		\
		 N - _FP_W_TYPE_SIZE bits of the high word.  */		\
	      X##_f0 = (((X##_f0					\
			  | ((N) != _FP_W_TYPE_SIZE			\
			     ? (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))	\
			     : 0)) != 0)				\
			| X##_f1 >> ((N) - _FP_W_TYPE_SIZE));		\
	      X##_f1 = 0;						\
	    })								\
	  : ({								\
	      /* Lost bits: the low N bits of the low word.  */		\
	      X##_f0 = ((!__builtin_constant_p (N) || (N) != 1		\
			 ? (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0	\
			 : X##_f0 & 1)					\
			| X##_f0 >> (N)					\
			| X##_f1 << (_FP_W_TYPE_SIZE - (N)));		\
	      X##_f1 >>= (N);						\
	    }))

/* Two-word arithmetic.  Each macro only splits its fraction arguments
   into (high word, low word) pairs and forwards them to the matching
   __FP_FRAC_*_2 primitive selected further down in this file.  Going by
   the reference definitions there, the operations are:
     ADDI: X += I   (I is a single word, carry goes into the high word)
     ADD:  R = X + Y
     SUB:  R = X - Y
     DEC:  X -= Y  */
#define _FP_FRAC_ADDI_2(X,I)	\
  __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)

#define _FP_FRAC_ADD_2(R,X,Y)	\
  __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)

#define _FP_FRAC_SUB_2(R,X,Y)	\
  __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)

#define _FP_FRAC_DEC_2(X,Y)	\
  __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)

/* Store in R the result of __FP_CLZ for the two-word fraction X: taken
   from the high word when that is nonzero, otherwise from the low word
   plus _FP_W_TYPE_SIZE.  __FP_CLZ is defined outside this file
   (presumably a count-leading-zeros primitive).  When both words are
   zero, __FP_CLZ is applied to a zero low word.  */
#define _FP_FRAC_CLZ_2(R, X)		\
  do					\
    {					\
      if (!X##_f1)			\
	{				\
	  __FP_CLZ (R, X##_f0);		\
	  R += _FP_W_TYPE_SIZE;		\
	}				\
      else				\
	__FP_CLZ (R, X##_f1);		\
    }					\
  while (0)

/* Predicates.  Each expands to a parenthesised expression that is
   nonzero when the condition holds.  */

/* The high word of X is negative when viewed as _FP_WS_TYPE.  */
#define _FP_FRAC_NEGP_2(X) \
  ((_FP_WS_TYPE) X##_f1 < 0)

/* Both words of X are zero.  */
#define _FP_FRAC_ZEROP_2(X) \
  (!(X##_f0 | X##_f1))

/* Test, and clear, the _FP_OVERFLOW_<fs> bit(s) in the high word of X.  */
#define _FP_FRAC_OVERP_2(fs, X) \
  (_FP_FRAC_HIGH_##fs (X) & _FP_OVERFLOW_##fs)
#define _FP_FRAC_CLEAR_OVERP_2(fs, X) \
  (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)

/* X == Y, X > Y and X >= Y, comparing the high words first and falling
   back to the low words only when the high words are equal.  */
#define _FP_FRAC_EQ_2(X, Y) \
  (X##_f0 == Y##_f0 && X##_f1 == Y##_f1)
#define _FP_FRAC_GT_2(X, Y) \
  (X##_f1 != Y##_f1 ? X##_f1 > Y##_f1 : X##_f0 > Y##_f0)
#define _FP_FRAC_GE_2(X, Y) \
  (X##_f1 != Y##_f1 ? X##_f1 > Y##_f1 : X##_f0 >= Y##_f0)

/* Word pairs meant to be passed as the I argument of _FP_FRAC_SET_2.
   Each expands to "high word, low word" (two macro arguments once
   expanded): all zero, only the lowest bit set, and all bits set.  */
#define _FP_ZEROFRAC_2		0, 0
#define _FP_MINFRAC_2		0, 1
#define _FP_MAXFRAC_2		(~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)

/*
 * Internals
 */

/* Load fraction X from two words: I1 becomes the high word and I0 the
   low word.  The low word is stored first; the value of the whole
   expression is the value stored in the high word.  */
#define __FP_FRAC_SET_2(X, I1, I0) \
  ((X##_f0 = (I0)), (X##_f1 = (I1)))

/* Same selection as _FP_FRAC_CLZ_2, but on an explicit (xh, xl) word
   pair: R gets __FP_CLZ of xh when xh is nonzero, otherwise __FP_CLZ of
   xl plus _FP_W_TYPE_SIZE.  xh is evaluated more than once.  */
#define __FP_CLZ_2(R, xh, xl)		\
  do					\
    {					\
      if (!(xh))			\
	{				\
	  __FP_CLZ (R, xl);		\
	  R += _FP_W_TYPE_SIZE;		\
	}				\
      else				\
	__FP_CLZ (R, xh);		\
    }					\
  while (0)

/* Selection of the double-word arithmetic primitives used by the
   _FP_FRAC_{ADDI,ADD,SUB,DEC}_2 wrappers.  All take (high, low) word
   pairs.

   The first arm is compiled out by the "#if 0".  It holds plain C
   reference versions, each guarded by #ifndef so that an earlier
   definition would win.  They show the intended arithmetic: the low
   words are combined first and the carry/borrow, detected by an unsigned
   comparison, is applied to the high words.

   The second arm is the one in effect.  It discards any earlier
   definition and maps every primitive onto add_ssaaaa / sub_ddmmss.
   NOTE(review): those two are not defined in this file -- presumably
   they come from longlong.h; confirm.  */
#if 0

#ifndef __FP_FRAC_ADDI_2
#define __FP_FRAC_ADDI_2(xh, xl, i)	\
  (xh += ((xl += i) < i))
#endif
#ifndef __FP_FRAC_ADD_2
#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)	\
  (rh = xh + yh + ((rl = xl + yl) < xl))
#endif
#ifndef __FP_FRAC_SUB_2
#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)	\
  (rh = xh - yh - ((rl = xl - yl) > xl))
#endif
#ifndef __FP_FRAC_DEC_2
#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	\
  do {					\
    UWtype _t = xl;			\
    xh -= yh + ((xl -= yl) > _t);	\
  } while (0)
#endif

#else

/* ADDI adds the single word i by passing 0 as the high word of the
   second operand; DEC subtracts in place by naming x as both the
   destination and the first operand.  */
#undef __FP_FRAC_ADDI_2
#define __FP_FRAC_ADDI_2(xh, xl, i)	add_ssaaaa(xh, xl, xh, xl, 0, i)
#undef __FP_FRAC_ADD_2
#define __FP_FRAC_ADD_2			add_ssaaaa
#undef __FP_FRAC_SUB_2
#define __FP_FRAC_SUB_2			sub_ddmmss
#undef __FP_FRAC_DEC_2
#define __FP_FRAC_DEC_2(xh, xl, yh, yl)	sub_ddmmss(xh, xl, xh, xl, yh, yl)

#endif

/*
 * Unpack the raw bits of the native fp value VAL of format fs into
 * X_f0, X_f1 (fraction words), X_e (exponent field) and X_s (sign
 * field), going through union _FP_UNION_<fs>.  The fields are copied
 * as stored: nothing is classified or normalized.
 */

#define _FP_UNPACK_RAW_2(fs, X, val)				\
  do								\
    {								\
      union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;		\
								\
      _FP_UNPACK_RAW_2_flo.flt = (val);				\
      X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;			\
      X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;			\
      X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;			\
      X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;			\
    }								\
  while (0)

/* Pointer flavour of _FP_UNPACK_RAW_2: VAL is the address of the native
   value, which is read in place through a union _FP_UNION_<fs> pointer
   rather than copied first.  */
#define _FP_UNPACK_RAW_2_P(fs, X, val)				\
  do								\
    {								\
      union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo		\
	= (union _FP_UNION_##fs *) (val);			\
								\
      X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;		\
      X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;		\
      X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;		\
      X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;		\
    }								\
  while (0)


/*
 * Repack the raw bits of a native fp value: store X_f0, X_f1, X_e and
 * X_s into the frac0, frac1, exp and sign fields of a union
 * _FP_UNION_<fs> and assign the resulting native value to VAL.
 */

#define _FP_PACK_RAW_2(fs, val, X)				\
  do								\
    {								\
      union _FP_UNION_##fs _FP_PACK_RAW_2_flo;			\
								\
      _FP_PACK_RAW_2_flo.bits.sign  = X##_s;			\
      _FP_PACK_RAW_2_flo.bits.exp   = X##_e;			\
      _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;			\
      _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;			\
								\
      (val) = _FP_PACK_RAW_2_flo.flt;				\
    }								\
  while (0)

/* Pointer flavour of _FP_PACK_RAW_2: VAL is the address of the native
   value, whose frac0, frac1, exp and sign fields are written in place
   through a union _FP_UNION_<fs> pointer.  */
#define _FP_PACK_RAW_2_P(fs, val, X)				\
  do								\
    {								\
      union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo		\
	= (union _FP_UNION_##fs *) (val);			\
								\
      _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;			\
      _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;			\
      _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;		\
      _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;		\
    }								\
  while (0)


/*
 * Multiplication algorithms:
 */

/* Given a 1W * 1W => 2W primitive, do the extended multiplication.

   doit (hi, lo, a, b) is that primitive.  The four word-by-word partial
   products of X and Y are formed: low*low goes into words 1:0 of the
   four-word temporary _z, high*high into words 3:2, and the two cross
   products into _b and _c.  Each cross product is then added into words
   3..1 of _z.  _z is finally shifted right (sticky) by wfracbits-1 and
   its two low words become R.  */

#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)			\
  do {									\
    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
									\
    doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);	\
    doit(_b_f1, _b_f0, X##_f0, Y##_f1);					\
    doit(_c_f1, _c_f0, X##_f1, Y##_f0);					\
    doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1);	\
									\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,		\
		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1));				\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,		\
		    _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1));				\
									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the product is		\
       at either 2B or 2B-1.  */					\
    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
  } while (0)

/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
   Do only 3 multiplications instead of four. This one is for machines
   where multiplication is much more expensive than subtraction.  */

#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)		\
  do {									\
    _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);	\
    _FP_W_TYPE _d;							\
    int _c1, _c2;							\
									\
    _b_f0 = X##_f0 + X##_f1;						\
    _c1 = _b_f0 < X##_f0;						\
    _b_f1 = Y##_f0 + Y##_f1;						\
    _c2 = _b_f1 < Y##_f0;						\
    doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);			\
    doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);	\
    doit(_c_f1, _c_f0, X##_f1, Y##_f1);					\
									\
    _b_f0 &= -_c2;							\
    _b_f1 &= -_c1;							\
    __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,		\
		    0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));	\
    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		     _b_f0);						\
    __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		     _b_f1);						\
    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1),				\
		    0, _d, _FP_FRAC_WORD_4(_z,0));			\
    __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),	\
		    _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);		\
    __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),	\
		    _c_f1, _c_f0,					\
		    _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));	\
									\
    /* Normalize since we know where the msb of the multiplicands	\
       were (bit B), we know that the msb of the of the product is	\
       at either 2B or 2B-1.  */					\
    _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);			\
    R##_f0 = _FP_FRAC_WORD_4(_z,0);					\
    R##_f1 = _FP_FRAC_WORD_4(_z,1);					\
  } while (0)

/* Extended multiplication through mpn_mul_n: X and Y are copied into
   two-element word arrays (low word at index 0), multiplied into the
   four-word temporary _z, and the product is shifted right (sticky) by
   wfracbits-1.  The two low words of _z become R.  */
#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)				\
  do									\
    {									\
      _FP_FRAC_DECL_4 (_z);						\
      _FP_W_TYPE _xw[2] = { X##_f0, X##_f1 };				\
      _FP_W_TYPE _yw[2] = { Y##_f0, Y##_f1 };				\
									\
      mpn_mul_n (_z_f, _xw, _yw, 2);					\
									\
      /* The msb of each multiplicand is at a known bit B, so the	\
	 msb of the product is at either 2B or 2B-1; shift it back	\
	 down accordingly.  */						\
      _FP_FRAC_SRS_4 (_z, wfracbits-1, 2*wfracbits);			\
      R##_f1 = _z_f[1];							\
      R##_f0 = _z_f[0];							\
    }									\
  while (0)

/* Do at most 120x120=240 bits multiplication using double floating
   point multiplication.  This is useful if floating point
   multiplication has much bigger throughput than integer multiply.
   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
   between 106 and 120 only.
   Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
   SETFETZ is a macro which will disable all FPU exceptions and set rounding
   towards zero,  RESETFE should optionally reset it back.  */

#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)	\
  do {										\
    static const double _const[] = {						\
      /* 2^-24 */ 5.9604644775390625e-08,					\
      /* 2^-48 */ 3.5527136788005009e-15,					\
      /* 2^-72 */ 2.1175823681357508e-22,					\
      /* 2^-96 */ 1.2621774483536189e-29,					\
      /* 2^28 */ 2.68435456e+08,						\
      /* 2^4 */ 1.600000e+01,							\
      /* 2^-20 */ 9.5367431640625e-07,						\
      /* 2^-44 */ 5.6843418860808015e-14,					\
      /* 2^-68 */ 3.3881317890172014e-21,					\
      /* 2^-92 */ 2.0194839173657902e-28,					\
      /* 2^-116 */ 1.2037062152420224e-35};					\
    double _a240, _b240, _c240, _d240, _e240, _f240, 				\
	   _g240, _h240, _i240, _j240, _k240;					\
    union { double d; UDItype i; } _l240, _m240, _n240, _o240,			\
				   _p240, _q240, _r240, _s240;			\
    UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;			\
										\
    if (wfracbits < 106 || wfracbits > 120)					\
      abort();									\
										\
    setfetz;									\
										\
    _e240 = (double)(long)(X##_f0 & 0xffffff);					\
    _j240 = (double)(long)(Y##_f0 & 0xffffff);					\
    _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);				\
    _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);				\
    _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));	\
    _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));	\
    _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);				\
    _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);				\
    _a240 = (double)(long)(X##_f1 >> 32);					\
    _f240 = (double)(long)(Y##_f1 >> 32);					\
    _e240 *= _const[3];								\
    _j240 *= _const[3];								\
    _d240 *= _const[2];								\
    _i240 *= _const[2];								\
    _c240 *= _const[1];								\
    _h240 *= _const[1];								\
    _b240 *= _const[0];								\
    _g240 *= _const[0];								\
    _s240.d =							      _e240*_j240;\
    _r240.d =						_d240*_j240 + _e240*_i240;\
    _q240.d =				  _c240*_j240 + _d240*_i240 + _e240*_h240;\
    _p240.d =		    _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
    _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
    _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;		\
    _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;				\
    _l240.d = _a240*_g240 + _b240*_f240;					\
    _k240 =   _a240*_f240;							\
    _r240.d += _s240.d;								\
    _q240.d += _r240.d;								\
    _p240.d += _q240.d;								\
    _o240.d += _p240.d;								\
    _n240.d += _o240.d;								\
    _m240.d += _n240.d;								\
    _l240.d += _m240.d;								\
    _k240 += _l240.d;								\
    _s240.d -= ((_const[10]+_s240.d)-_const[10]);				\
    _r240.d -= ((_const[9]+_r240.d)-_const[9]);					\
    _q240.d -= ((_const[8]+_q240.d)-_const[8]);					\
    _p240.d -= ((_const[7]+_p240.d)-_const[7]);					\
    _o240.d += _const[7];							\
    _n240.d += _const[6];							\
    _m240.d += _const[5];							\
    _l240.d += _const[4];							\
    if (_s240.d != 0.0) _y240 = 1;						\
    if (_r240.d != 0.0) _y240 = 1;						\
    if (_q240.d != 0.0) _y240 = 1;						\
    if (_p240.d != 0.0) _y240 = 1;						\
    _t240 = (DItype)_k240;							\
    _u240 = _l240.i;								\
    _v240 = _m240.i;								\
    _w240 = _n240.i;								\
    _x240 = _o240.i;								\
    R##_f1 = (_t240 << (128 - (wfracbits - 1)))					\
	     | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));			\
    R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))			\
	     | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))			\
	     | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))			\
	     | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))			\
	     | _y240;								\
    resetfe;									\
  } while (0)

/*
 * Division algorithms:
 */

/* Divide the two-word fraction X by the two-word fraction Y using the
   udiv_qrnnd and umul_ppmm primitives.  The two-word quotient is left
   in R, with _FP_WORK_STICKY ORed into its low word when the division
   is inexact, and R##_e is adjusted.  Y is modified: it is shifted left
   by _FP_WFRACXBITS_##fs so that its top bit is set.

   The numerator is first widened to three words.  When X >= Y it is X
   shifted right by one bit and R##_e is left alone; when X < Y it is X
   itself and R##_e is decremented.  X == Y has to take the first path:
   its quotient is exactly one, so treating it like X < Y (as a strict
   '>' test does) returns a quotient one bit too wide with an exponent
   one too small, and leaves udiv_qrnnd no headroom if the format has no
   spare high bits.
   NOTE(review): this assumes X and Y arrive normalised with their top
   set bit in the same position -- confirm against the callers.  */
#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)				\
  do {									\
    _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;		\
    if (_FP_FRAC_GE_2(X, Y))						\
      {									\
	/* Numerator = X >> 1, spread over three words.  */		\
	_n_f2 = X##_f1 >> 1;						\
	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;		\
	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);			\
      }									\
    else								\
      {									\
	/* Numerator = X moved up one whole word.  */			\
	R##_e--;							\
	_n_f2 = X##_f1;							\
	_n_f1 = X##_f0;							\
	_n_f0 = 0;							\
      }									\
									\
    /* Normalize, i.e. make the most significant bit of the		\
       denominator set. */						\
    _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);				\
									\
    /* High quotient word: estimate from the high denominator word,	\
       then correct the estimate downward at most twice.  */		\
    udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);			\
    umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);				\
    _r_f0 = _n_f0;							\
    if (_FP_FRAC_GT_2(_m, _r))						\
      {									\
	R##_f1--;							\
	_FP_FRAC_ADD_2(_r, Y, _r);					\
	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
	  {								\
	    R##_f1--;							\
	    _FP_FRAC_ADD_2(_r, Y, _r);					\
	  }								\
      }									\
    _FP_FRAC_DEC_2(_r, _m);						\
									\
    if (_r_f1 == Y##_f1)						\
      {									\
	/* This is a special case, not an optimization			\
	   (_r/Y##_f1 would not fit into UWtype).			\
	   As _r is guaranteed to be < Y,  R##_f0 can be either		\
	   (UWtype)-1 or (UWtype)-2.  But as we know what kind		\
	   of bits it is (sticky, guard, round),  we don't care.	\
	   We also don't care what the remainder is,  because the	\
	   guard bit will be set anyway.  -jj */			\
	R##_f0 = -1;							\
      }									\
    else								\
      {									\
	/* Low quotient word, same estimate-and-correct scheme.  */	\
	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);		\
	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);			\
	_r_f0 = 0;							\
	if (_FP_FRAC_GT_2(_m, _r))					\
	  {								\
	    R##_f0--;							\
	    _FP_FRAC_ADD_2(_r, Y, _r);					\
	    if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))		\
	      {								\
		R##_f0--;						\
		_FP_FRAC_ADD_2(_r, Y, _r);				\
	      }								\
	  }								\
	/* A nonzero final remainder makes the result inexact.  */	\
	if (!_FP_FRAC_EQ_2(_r, _m))					\
	  R##_f0 |= _FP_WORK_STICKY;					\
      }									\
  } while (0)


/* Divide the two-word fraction X by Y through mpn_divrem.  A four-word
   numerator _x is built with words 0 and 3 zero and X shifted into
   words 1 and 2; the shift is one bit smaller, and R##_e is
   incremented, when X > Y.  The two low words of the quotient _z become
   R, with a 1 ORed into the low word when _x[0] or _x[1] is nonzero
   after the call (presumably the remainder left there by mpn_divrem --
   confirm).

   NOTE(review): the exponent handling differs from _FP_DIV_MEAT_2_udiv,
   which leaves R##_e alone for the larger numerator and decrements it
   otherwise.  The composition of _x[1] and _x[2] also looks transposed
   for a two-word left shift: the bits carried between words are taken
   from X##_f1 and ORed into the lower word.  Verify both before relying
   on this macro.  */
#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)					\
  do {									\
    _FP_W_TYPE _x[4], _y[2], _z[4];					\
    _y[0] = Y##_f0; _y[1] = Y##_f1;					\
    _x[0] = _x[3] = 0;							\
    if (_FP_FRAC_GT_2(X, Y))						\
      {									\
	R##_e++;							\
	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |	\
		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
			    (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE)));	\
	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);	\
      }									\
    else								\
      {									\
	_x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |	\
		 X##_f1 >> (_FP_W_TYPE_SIZE -				\
			    (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));	\
	_x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);	\
      }									\
									\
    (void) mpn_divrem (_z, 0, _x, 4, _y, 2);				\
    R##_f1 = _z[1];							\
    R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);				\
  } while (0)


/*
 * Square root algorithms:
 * We have just one right now, maybe Newton approximation
 * should be added for those machines where division is fast.
 */

#define _FP_SQRT_MEAT_2(R, S, T, X, q)			\
  do {							\
    while (q)						\
      {							\
	T##_f1 = S##_f1 + q;				\
	if (T##_f1 <= X##_f1)				\
	  {						\
	    S##_f1 = T##_f1 + q;			\
	    X##_f1 -= T##_f1;				\
	    R##_f1 += q;				\
	  }						\
	_FP_FRAC_SLL_2(X, 1);				\
	q >>= 1;					\
      }							\
    q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);		\
    while (q != _FP_WORK_ROUND)				\
      {							\
	T##_f0 = S##_f0 + q;				\
	T##_f1 = S##_f1;				\
	if (T##_f1 < X##_f1 || 				\
	    (T##_f1 == X##_f1 && T##_f0 <= X##_f0))	\
	  {						\
	    S##_f0 = T##_f0 + q;			\
	    S##_f1 += (T##_f0 > S##_f0);		\
	    _FP_FRAC_DEC_2(X, T);			\
	    R##_f0 += q;				\
	  }						\
	_FP_FRAC_SLL_2(X, 1);				\
	q >>= 1;					\
      }							\
    if (X##_f0 | X##_f1)				\
      {							\
	if (S##_f1 < X##_f1 || 				\
	    (S##_f1 == X##_f1 && S##_f0 < X##_f0))	\
	  R##_f0 |= _FP_WORK_ROUND;			\
	R##_f0 |= _FP_WORK_STICKY;			\
      }							\
  } while (0)


/*
 * Assembly/disassembly for converting to/from integral types.
 * No shifting or overflow handled here.
 */

/* Build the integer r, which is rsize bits wide, from the two-word
   fraction X.  If r fits in one word only the low word of X is used;
   otherwise r = (high word << _FP_W_TYPE_SIZE) + low word.

   r and rsize are parenthesised so that expression arguments (for
   example "wide ? 64 : 32") parse as intended.  Both arms of the outer
   conditional are compiled whatever rsize is, so the shift in the wide
   arm is itself guarded: when r is only one word wide, no shift by the
   full word size is seen there.  r and rsize are evaluated more than
   once.  */
#define _FP_FRAC_ASSEMBLE_2(r, X, rsize)		\
  (void) (((rsize) <= _FP_W_TYPE_SIZE)			\
	  ? ({ (r) = X##_f0; })				\
	  : ({						\
	      (r) = X##_f1;				\
	      (r) = ((rsize) <= _FP_W_TYPE_SIZE		\
		     ? 0				\
		     : (r) << _FP_W_TYPE_SIZE);		\
	      (r) += X##_f0;				\
	    }))

/* Split the integer r, which is rsize bits wide, into the two-word
   fraction X: the low word takes r (truncated to a word by the
   assignment) and the high word takes r >> _FP_W_TYPE_SIZE, or zero
   when r fits in a single word.

   r and rsize are parenthesised so that expression arguments such as
   "a + b" are shifted and compared as a whole.  r is evaluated twice.  */
#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)				\
  do									\
    {									\
      X##_f0 = (r);							\
      X##_f1 = ((rsize) <= _FP_W_TYPE_SIZE				\
		? 0							\
		: (r) >> _FP_W_TYPE_SIZE);				\
    }									\
  while (0)

/*
 * Convert FP values between word sizes
 */

/* Two words -> one word: keep only the low word of S.  */
#define _FP_FRAC_COPY_1_2(D, S) \
  (D##_f = S##_f0)

/* One word -> two words: S becomes the low word, the high word is
   cleared.  */
#define _FP_FRAC_COPY_2_1(D, S) \
  (D##_f0 = S##_f, D##_f1 = 0)

/* Two words -> two words: a plain copy.  */
#define _FP_FRAC_COPY_2_2(D, S) \
  _FP_FRAC_COPY_2(D,S)
Commit	Line	Data
49721058 JM	1	/* Software floating-point emulation.
49721058 JM	2	Basic two-word fraction declaration and manipulation.
fa1e55b0	3	Copyright (C) 1997-2013 Free Software Foundation, Inc.
49721058 JM	4	This file is part of the GNU C Library.
	5	Contributed by Richard Henderson (rth@cygnus.com),
	6	Jakub Jelinek (jj@ultra.linux.cz),
	7	David S. Miller (davem@redhat.com) and
	8	Peter Maydell (pmaydell@chiark.greenend.org.uk).
	9
	10	The GNU C Library is free software; you can redistribute it and/or
	11	modify it under the terms of the GNU Lesser General Public
	12	License as published by the Free Software Foundation; either
	13	version 2.1 of the License, or (at your option) any later version.
	14
	15	In addition to the permissions in the GNU Lesser General Public
	16	License, the Free Software Foundation gives you unlimited
	17	permission to link the compiled version of this file into
	18	combinations with other programs, and to distribute those
	19	combinations without any restriction coming from the use of this
	20	file. (The Lesser General Public License restrictions do apply in
	21	other respects; for example, they cover modification of the file,
	22	and distribution when not linked into a combine executable.)
	23
	24	The GNU C Library is distributed in the hope that it will be useful,
	25	but WITHOUT ANY WARRANTY; without even the implied warranty of
	26	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	27	Lesser General Public License for more details.
	28
	29	You should have received a copy of the GNU Lesser General Public
b7633ee3 KT	30	License along with the GNU C Library; if not, see
b7633ee3 KT	31	<http://www.gnu.org/licenses/>. */
49721058 JM	32
	33	#define _FP_FRAC_DECL_2(X) _FP_W_TYPE X##_f0, X##_f1
	34	#define _FP_FRAC_COPY_2(D,S) (D##_f0 = S##_f0, D##_f1 = S##_f1)
	35	#define _FP_FRAC_SET_2(X,I) __FP_FRAC_SET_2(X, I)
	36	#define _FP_FRAC_HIGH_2(X) (X##_f1)
	37	#define _FP_FRAC_LOW_2(X) (X##_f0)
	38	#define _FP_FRAC_WORD_2(X,w) (X##_f##w)
	39
	40	#define _FP_FRAC_SLL_2(X,N) \
	41	(void)(((N) < _FP_W_TYPE_SIZE) \
	42	? ({ \
	43	if (__builtin_constant_p(N) && (N) == 1) \
	44	{ \
	45	X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \
	46	X##_f0 += X##_f0; \
	47	} \
	48	else \
	49	{ \
	50	X##_f1 = X##_f1 << (N) \| X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
	51	X##_f0 <<= (N); \
	52	} \
	53	0; \
	54	}) \
	55	: ({ \
	56	X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \
	57	X##_f0 = 0; \
	58	}))
	59
	60
	61	#define _FP_FRAC_SRL_2(X,N) \
	62	(void)(((N) < _FP_W_TYPE_SIZE) \
	63	? ({ \
	64	X##_f0 = X##_f0 >> (N) \| X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
	65	X##_f1 >>= (N); \
	66	}) \
	67	: ({ \
	68	X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \
	69	X##_f1 = 0; \
	70	}))
	71
	72	/* Right shift with sticky-lsb. */
	73	#define _FP_FRAC_SRST_2(X,S, N,sz) \
	74	(void)(((N) < _FP_W_TYPE_SIZE) \
	75	? ({ \
	76	S = (__builtin_constant_p(N) && (N) == 1 \
	77	? X##_f0 & 1 \
	78	: (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \
	79	X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) \| X##_f0 >> (N)); \
	80	X##_f1 >>= (N); \
	81	}) \
	82	: ({ \
	83	S = ((((N) == _FP_W_TYPE_SIZE \
	84	? 0 \
	85	: (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
	86	\| X##_f0) != 0); \
	87	X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \
	88	X##_f1 = 0; \
	89	}))
	90
	91	#define _FP_FRAC_SRS_2(X,N,sz) \
	92	(void)(((N) < _FP_W_TYPE_SIZE) \
	93	? ({ \
	94	X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) \| X##_f0 >> (N) \| \
	95	(__builtin_constant_p(N) && (N) == 1 \
96	? X##_f0 & 1 \
97	: (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
98	X##_f1 >>= (N); \
99	}) \
100	: ({ \
101	X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \| \
102	((((N) == _FP_W_TYPE_SIZE \
103	? 0 \
104	: (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \
105	\| X##_f0) != 0)); \
106	X##_f1 = 0; \
107	}))
108
109	#define _FP_FRAC_ADDI_2(X,I) \
110	__FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
111
112	#define _FP_FRAC_ADD_2(R,X,Y) \
113	__FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
114
115	#define _FP_FRAC_SUB_2(R,X,Y) \
116	__FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
117
118	#define _FP_FRAC_DEC_2(X,Y) \
119	__FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
120
121	#define _FP_FRAC_CLZ_2(R,X) \
122	do { \
123	if (X##_f1) \
124	__FP_CLZ(R,X##_f1); \
125	else \
126	{ \
127	__FP_CLZ(R,X##_f0); \
128	R += _FP_W_TYPE_SIZE; \
129	} \
130	} while(0)
131
132	/* Predicates */
133	#define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE)X##_f1 < 0)
134	#define _FP_FRAC_ZEROP_2(X) ((X##_f1 \| X##_f0) == 0)
135	#define _FP_FRAC_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
136	#define _FP_FRAC_CLEAR_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs)
137	#define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
138	#define _FP_FRAC_GT_2(X, Y) \
139	(X##_f1 > Y##_f1 \|\| (X##_f1 == Y##_f1 && X##_f0 > Y##_f0))
140	#define _FP_FRAC_GE_2(X, Y) \
141	(X##_f1 > Y##_f1 \|\| (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0))
142
143	#define _FP_ZEROFRAC_2 0, 0
144	#define _FP_MINFRAC_2 0, 1
145	#define _FP_MAXFRAC_2 (~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
146
147	/*
fa1e55b0	148	* Internals
49721058 JM	149	*/
	150
	151	#define __FP_FRAC_SET_2(X,I1,I0) (X##_f0 = I0, X##_f1 = I1)
	152
	153	#define __FP_CLZ_2(R, xh, xl) \
	154	do { \
	155	if (xh) \
	156	__FP_CLZ(R,xh); \
	157	else \
	158	{ \
	159	__FP_CLZ(R,xl); \
	160	R += _FP_W_TYPE_SIZE; \
	161	} \
	162	} while(0)
	163
	164	#if 0
	165
	166	#ifndef __FP_FRAC_ADDI_2
	167	#define __FP_FRAC_ADDI_2(xh, xl, i) \
	168	(xh += ((xl += i) < i))
	169	#endif
	170	#ifndef __FP_FRAC_ADD_2
	171	#define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \
	172	(rh = xh + yh + ((rl = xl + yl) < xl))
	173	#endif
	174	#ifndef __FP_FRAC_SUB_2
	175	#define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \
	176	(rh = xh - yh - ((rl = xl - yl) > xl))
	177	#endif
	178	#ifndef __FP_FRAC_DEC_2
	179	#define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
	180	do { \
	181	UWtype _t = xl; \
	182	xh -= yh + ((xl -= yl) > _t); \
	183	} while (0)
	184	#endif
	185
	186	#else
	187
	188	#undef __FP_FRAC_ADDI_2
	189	#define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa(xh, xl, xh, xl, 0, i)
	190	#undef __FP_FRAC_ADD_2
	191	#define __FP_FRAC_ADD_2 add_ssaaaa
	192	#undef __FP_FRAC_SUB_2
	193	#define __FP_FRAC_SUB_2 sub_ddmmss
	194	#undef __FP_FRAC_DEC_2
	195	#define __FP_FRAC_DEC_2(xh, xl, yh, yl) sub_ddmmss(xh, xl, xh, xl, yh, yl)
	196
	197	#endif
	198
	199	/*
	200	* Unpack the raw bits of a native fp value. Do not classify or
	201	* normalize the data.
	202	*/
	203
	204	#define _FP_UNPACK_RAW_2(fs, X, val) \
	205	do { \
	206	union _FP_UNION_##fs _flo; _flo.flt = (val); \
	207	\
	208	X##_f0 = _flo.bits.frac0; \
	209	X##_f1 = _flo.bits.frac1; \
	210	X##_e = _flo.bits.exp; \
	211	X##_s = _flo.bits.sign; \
	212	} while (0)
213
214	#define _FP_UNPACK_RAW_2_P(fs, X, val) \
215	do { \
216	union _FP_UNION_##fs *_flo = \
217	(union _FP_UNION_##fs *)(val); \
218	\
219	X##_f0 = _flo->bits.frac0; \
220	X##_f1 = _flo->bits.frac1; \
221	X##_e = _flo->bits.exp; \
222	X##_s = _flo->bits.sign; \
223	} while (0)
224
225
226	/*
227	* Repack the raw bits of a native fp value.
228	*/
229
230	#define _FP_PACK_RAW_2(fs, val, X) \
231	do { \
232	union _FP_UNION_##fs _flo; \
233	\
234	_flo.bits.frac0 = X##_f0; \
235	_flo.bits.frac1 = X##_f1; \
236	_flo.bits.exp = X##_e; \
237	_flo.bits.sign = X##_s; \
238	\
239	(val) = _flo.flt; \
240	} while (0)
241
242	#define _FP_PACK_RAW_2_P(fs, val, X) \
243	do { \
244	union _FP_UNION_##fs *_flo = \
245	(union _FP_UNION_##fs *)(val); \
246	\
247	_flo->bits.frac0 = X##_f0; \
248	_flo->bits.frac1 = X##_f1; \
249	_flo->bits.exp = X##_e; \
250	_flo->bits.sign = X##_s; \
251	} while (0)
252
253
254	/*
255	* Multiplication algorithms:
256	*/
257
258	/* Given a 1W * 1W => 2W primitive, do the extended multiplication. */
259
260	#define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \
261	do { \
262	_FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
263	\
264	doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \
265	doit(_b_f1, _b_f0, X##_f0, Y##_f1); \
266	doit(_c_f1, _c_f0, X##_f1, Y##_f0); \
267	doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1); \
268	\
269	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
270	_FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0, \
271	_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
272	_FP_FRAC_WORD_4(_z,1)); \
273	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
274	_FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0, \
275	_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
276	_FP_FRAC_WORD_4(_z,1)); \
277	\
278	/* Normalize since we know where the msb of the multiplicands \
279	were (bit B), we know that the msb of the of the product is \
280	at either 2B or 2B-1. */ \
281	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
282	R##_f0 = _FP_FRAC_WORD_4(_z,0); \
283	R##_f1 = _FP_FRAC_WORD_4(_z,1); \
284	} while (0)
285
286	/* Given a 1W * 1W => 2W primitive, do the extended multiplication.
287	Do only 3 multiplications instead of four. This one is for machines
288	where multiplication is much more expensive than subtraction. */
289
290	#define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \
291	do { \
292	_FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \
293	_FP_W_TYPE _d; \
294	int _c1, _c2; \
295	\
296	_b_f0 = X##_f0 + X##_f1; \
297	_c1 = _b_f0 < X##_f0; \
298	_b_f1 = Y##_f0 + Y##_f1; \
299	_c2 = _b_f1 < Y##_f0; \
300	doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \
301	doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1); \
302	doit(_c_f1, _c_f0, X##_f1, Y##_f1); \
303	\
304	_b_f0 &= -_c2; \
305	_b_f1 &= -_c1; \
306	__FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
307	_FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d, \
308	0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1)); \
309	__FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
310	_b_f0); \
311	__FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
312	_b_f1); \
313	__FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
314	_FP_FRAC_WORD_4(_z,1), \
315	0, _d, _FP_FRAC_WORD_4(_z,0)); \
316	__FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \
317	_FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0); \
318	__FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), \
319	_c_f1, _c_f0, \
320	_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2)); \
321	\
322	/* Normalize since we know where the msb of the multiplicands \
323	were (bit B), we know that the msb of the of the product is \
324	at either 2B or 2B-1. */ \
325	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
326	R##_f0 = _FP_FRAC_WORD_4(_z,0); \
327	R##_f1 = _FP_FRAC_WORD_4(_z,1); \
328	} while (0)
329
330	#define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \
331	do { \
332	_FP_FRAC_DECL_4(_z); \
333	_FP_W_TYPE _x[2], _y[2]; \
334	_x[0] = X##_f0; _x[1] = X##_f1; \
335	_y[0] = Y##_f0; _y[1] = Y##_f1; \
336	\
337	mpn_mul_n(_z_f, _x, _y, 2); \
338	\
339	/* Normalize since we know where the msb of the multiplicands \
340	were (bit B), we know that the msb of the of the product is \
341	at either 2B or 2B-1. */ \
342	_FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \
343	R##_f0 = _z_f[0]; \
344	R##_f1 = _z_f[1]; \
345	} while (0)
346
/* Do at most 120x120=240 bits multiplication using double floating
   point multiplication.  This is useful if floating point
   multiplication has much bigger throughput than integer multiply.
   It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
   between 106 and 120 only; abort is called for any other wfracbits.
   Caller guarantees that X and Y have bit (wfracbits - 1) set.
   SETFETZ is a macro which will disable all FPU exceptions and set rounding
   towards zero, RESETFE should optionally reset it back.  */

#define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
  do { \
    static const double _const[] = { \
      /* 2^-24 */ 5.9604644775390625e-08, \
      /* 2^-48 */ 3.5527136788005009e-15, \
      /* 2^-72 */ 2.1175823681357508e-22, \
      /* 2^-96 */ 1.2621774483536189e-29, \
      /* 2^28 */ 2.68435456e+08, \
      /* 2^4 */ 1.600000e+01, \
      /* 2^-20 */ 9.5367431640625e-07, \
      /* 2^-44 */ 5.6843418860808015e-14, \
      /* 2^-68 */ 3.3881317890172014e-21, \
      /* 2^-92 */ 2.0194839173657902e-28, \
      /* 2^-116 */ 1.2037062152420224e-35}; \
    double _a240, _b240, _c240, _d240, _e240, _f240, \
           _g240, _h240, _i240, _j240, _k240; \
    union { double d; UDItype i; } _l240, _m240, _n240, _o240, \
                                   _p240, _q240, _r240, _s240; \
    UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \
    \
    if ((wfracbits) < 106 || (wfracbits) > 120) \
      abort(); \
    \
    setfetz; \
    \
    /* Split X into the pieces _a240 (top) .. _e240 (bottom) and Y into \
       _f240 (top) .. _j240 (bottom); all but the top piece are 24 bits \
       wide.  */ \
    _e240 = (double)(long)(X##_f0 & 0xffffff); \
    _j240 = (double)(long)(Y##_f0 & 0xffffff); \
    _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff); \
    _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff); \
    _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
    _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
    _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff); \
    _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff); \
    _a240 = (double)(long)(X##_f1 >> 32); \
    _f240 = (double)(long)(Y##_f1 >> 32); \
    /* Scale each piece by the power of two matching its position.  */ \
    _e240 *= _const[3]; \
    _j240 *= _const[3]; \
    _d240 *= _const[2]; \
    _i240 *= _const[2]; \
    _c240 *= _const[1]; \
    _h240 *= _const[1]; \
    _b240 *= _const[0]; \
    _g240 *= _const[0]; \
    /* Column sums of the partial products, lowest column first.  */ \
    _s240.d = _e240*_j240; \
    _r240.d = _d240*_j240 + _e240*_i240; \
    _q240.d = _c240*_j240 + _d240*_i240 + _e240*_h240; \
    _p240.d = _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
    _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
    _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240; \
    _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240; \
    _l240.d = _a240*_g240 + _b240*_f240; \
    _k240 = _a240*_f240; \
    /* Accumulate every column into the next higher one.  */ \
    _r240.d += _s240.d; \
    _q240.d += _r240.d; \
    _p240.d += _q240.d; \
    _o240.d += _p240.d; \
    _n240.d += _o240.d; \
    _m240.d += _n240.d; \
    _l240.d += _m240.d; \
    _k240 += _l240.d; \
    _s240.d -= ((_const[10]+_s240.d)-_const[10]); \
    _r240.d -= ((_const[9]+_r240.d)-_const[9]); \
    _q240.d -= ((_const[8]+_q240.d)-_const[8]); \
    _p240.d -= ((_const[7]+_p240.d)-_const[7]); \
    /* Adding these constants positions the wanted 24 bits of each column \
       in the low bits of the double's representation, read through the \
       union below.  */ \
    _o240.d += _const[7]; \
    _n240.d += _const[6]; \
    _m240.d += _const[5]; \
    _l240.d += _const[4]; \
    /* Anything left in the four lowest columns only sets the sticky \
       bit _y240.  */ \
    if (_s240.d != 0.0) _y240 = 1; \
    if (_r240.d != 0.0) _y240 = 1; \
    if (_q240.d != 0.0) _y240 = 1; \
    if (_p240.d != 0.0) _y240 = 1; \
    _t240 = (DItype)_k240; \
    _u240 = _l240.i; \
    _v240 = _m240.i; \
    _w240 = _n240.i; \
    _x240 = _o240.i; \
    R##_f1 = (_t240 << (128 - ((wfracbits) - 1))) \
             | ((_u240 & 0xffffff) >> (((wfracbits) - 1) - 104)); \
    R##_f0 = ((_u240 & 0xffffff) << (168 - ((wfracbits) - 1))) \
             | ((_v240 & 0xffffff) << (144 - ((wfracbits) - 1))) \
             | ((_w240 & 0xffffff) << (120 - ((wfracbits) - 1))) \
             | ((_x240 & 0xffffff) >> (((wfracbits) - 1) - 96)) \
             | _y240; \
    resetfe; \
  } while (0)
	442
/*
 * Division algorithms:
 */
	446
	447	#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \
	448	do { \
	449	_FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0; \
	450	if (_FP_FRAC_GT_2(X, Y)) \
	451	{ \
	452	_n_f2 = X##_f1 >> 1; \
	453	_n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) \| X##_f0 >> 1; \
	454	_n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \
	455	} \
	456	else \
	457	{ \
	458	R##_e--; \
	459	_n_f2 = X##_f1; \
	460	_n_f1 = X##_f0; \
	461	_n_f0 = 0; \
	462	} \
	463	\
	464	/* Normalize, i.e. make the most significant bit of the \
	465	denominator set. */ \
	466	_FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs); \
	467	\
	468	udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1); \
	469	umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0); \
	470	_r_f0 = _n_f0; \
	471	if (_FP_FRAC_GT_2(_m, _r)) \
	472	{ \
	473	R##_f1--; \
	474	_FP_FRAC_ADD_2(_r, Y, _r); \
	475	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
	476	{ \
	477	R##_f1--; \
	478	_FP_FRAC_ADD_2(_r, Y, _r); \
	479	} \
	480	} \
	481	_FP_FRAC_DEC_2(_r, _m); \
	482	\
	483	if (_r_f1 == Y##_f1) \
	484	{ \
	485	/* This is a special case, not an optimization \
	486	(_r/Y##_f1 would not fit into UWtype). \
	487	As _r is guaranteed to be < Y, R##_f0 can be either \
	488	(UWtype)-1 or (UWtype)-2. But as we know what kind \
	489	of bits it is (sticky, guard, round), we don't care. \
	490	We also don't care what the reminder is, because the \
	491	guard bit will be set anyway. -jj */ \
	492	R##_f0 = -1; \
	493	} \
	494	else \
	495	{ \
	496	udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1); \
	497	umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0); \
	498	_r_f0 = 0; \
	499	if (_FP_FRAC_GT_2(_m, _r)) \
	500	{ \
	501	R##_f0--; \
	502	_FP_FRAC_ADD_2(_r, Y, _r); \
	503	if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \
504	{ \
505	R##_f0--; \
506	_FP_FRAC_ADD_2(_r, Y, _r); \
507	} \
508	} \
509	if (!_FP_FRAC_EQ_2(_r, _m)) \
510	R##_f0 \|= _FP_WORK_STICKY; \
511	} \
512	} while (0)
513
514
/* Divide the two-word fraction X by Y using mpn_divrem and store the
   two-word quotient in R.  X is placed in words 1 and 2 of a four-word
   dividend _x, shifted left by an amount derived from _FP_WFRACBITS_##fs
   (one bit less when X is greater than Y, in which case R##_e is also
   incremented).  After the call, R##_f0 gets its low bit set when
   _x[0] or _x[1] is nonzero.
   NOTE(review): this relies on mpn_divrem leaving the remainder in _x;
   confirm against the GMP documentation.  */
#define _FP_DIV_MEAT_2_gmp(fs, R, X, Y) \
  do { \
    _FP_W_TYPE _x[4], _y[2], _z[4]; \
    _y[0] = Y##_f0; _y[1] = Y##_f1; \
    _x[0] = _x[3] = 0; \
    if (_FP_FRAC_GT_2(X, Y)) \
      { \
        R##_e++; \
        _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) | \
                 X##_f1 >> (_FP_W_TYPE_SIZE - \
                            (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE))); \
        _x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE); \
      } \
    else \
      { \
        _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) | \
                 X##_f1 >> (_FP_W_TYPE_SIZE - \
                            (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE))); \
        _x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE); \
      } \
    \
    (void) mpn_divrem (_z, 0, _x, 4, _y, 2); \
    R##_f1 = _z[1]; \
    R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0); \
  } while (0)
540
541
/*
 * Square root algorithms:
 * We have just one right now, maybe Newton approximation
 * should be added for those machines where division is fast.
 */
fa1e55b0	547
/* Bit-by-bit square root on two-word fractions.  R accumulates the
   result, S and T are scratch fractions, X is the operand (consumed: it
   is reduced and shifted left by one on every step) and q is the
   current trial bit (an lvalue, modified).
   The first loop runs until q has been shifted down to zero and works on
   the high words only.  q is then reset to the top bit of a word and the
   second loop works on both words until q reaches _FP_WORK_ROUND.
   Finally, if X is still nonzero, _FP_WORK_STICKY is set in R##_f0, and
   _FP_WORK_ROUND as well when S is less than X.  */
#define _FP_SQRT_MEAT_2(R, S, T, X, q) \
  do { \
    while (q) \
      { \
        T##_f1 = S##_f1 + q; \
        if (T##_f1 <= X##_f1) \
          { \
            S##_f1 = T##_f1 + q; \
            X##_f1 -= T##_f1; \
            R##_f1 += q; \
          } \
        _FP_FRAC_SLL_2(X, 1); \
        q >>= 1; \
      } \
    q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1); \
    while (q != _FP_WORK_ROUND) \
      { \
        T##_f0 = S##_f0 + q; \
        T##_f1 = S##_f1; \
        if (T##_f1 < X##_f1 || \
            (T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \
          { \
            S##_f0 = T##_f0 + q; \
            /* Carry into the high word when the low word wrapped.  */ \
            S##_f1 += (T##_f0 > S##_f0); \
            _FP_FRAC_DEC_2(X, T); \
            R##_f0 += q; \
          } \
        _FP_FRAC_SLL_2(X, 1); \
        q >>= 1; \
      } \
    if (X##_f0 | X##_f1) \
      { \
        if (S##_f1 < X##_f1 || \
            (S##_f1 == X##_f1 && S##_f0 < X##_f0)) \
          R##_f0 |= _FP_WORK_ROUND; \
        R##_f0 |= _FP_WORK_STICKY; \
      } \
  } while (0)
	586
	587
/*
 * Assembly/disassembly for converting to/from integral types.
 * No shifting or overflow handled here.
 */
	592
	593	#define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \
	594	(void)((rsize <= _FP_W_TYPE_SIZE) \
	595	? ({ r = X##_f0; }) \
	596	: ({ \
	597	r = X##_f1; \
	598	r <<= _FP_W_TYPE_SIZE; \
	599	r += X##_f0; \
	600	}))
	601
	602	#define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \
	603	do { \
	604	X##_f0 = r; \
	605	X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \
	606	} while (0)
	607
/*
 * Convert FP values between word sizes
 */
	611
	612	#define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0)
	613
	614	#define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0))
13cc6d1b JM	615
/* Copy between two two-word fractions; forwards to _FP_FRAC_COPY_2.  */
#define _FP_FRAC_COPY_2_2(D,S) _FP_FRAC_COPY_2(D,S)