fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Returns the fraction bits of the half-precision floating-point value `a'.
 100 *----------------------------------------------------------------------------*/
 101
 102 static inline uint32_t extractFloat16Frac(float16 a)
 103 {
 104     return float16_val(a) & 0x3ff;
 105 }
 106
 107 /*----------------------------------------------------------------------------
 108 | Returns the exponent bits of the half-precision floating-point value `a'.
 109 *----------------------------------------------------------------------------*/
 110
 111 static inline int extractFloat16Exp(float16 a)
 112 {
 113     return (float16_val(a) >> 10) & 0x1f;
 114 }
 115
 116 /*----------------------------------------------------------------------------
 117 | Returns the sign bit of the single-precision floating-point value `a'.
 118 *----------------------------------------------------------------------------*/
 119
 120 static inline flag extractFloat16Sign(float16 a)
 121 {
 122     return float16_val(a)>>15;
 123 }
 124
 125 /*----------------------------------------------------------------------------
 126 | Returns the fraction bits of the single-precision floating-point value `a'.
 127 *----------------------------------------------------------------------------*/
 128
 129 static inline uint32_t extractFloat32Frac(float32 a)
 130 {
 131     return float32_val(a) & 0x007FFFFF;
 132 }
 133
 134 /*----------------------------------------------------------------------------
 135 | Returns the exponent bits of the single-precision floating-point value `a'.
 136 *----------------------------------------------------------------------------*/
 137
 138 static inline int extractFloat32Exp(float32 a)
 139 {
 140     return (float32_val(a) >> 23) & 0xFF;
 141 }
 142
 143 /*----------------------------------------------------------------------------
 144 | Returns the sign bit of the single-precision floating-point value `a'.
 145 *----------------------------------------------------------------------------*/
 146
 147 static inline flag extractFloat32Sign(float32 a)
 148 {
 149     return float32_val(a) >> 31;
 150 }
 151
 152 /*----------------------------------------------------------------------------
 153 | Returns the fraction bits of the double-precision floating-point value `a'.
 154 *----------------------------------------------------------------------------*/
 155
 156 static inline uint64_t extractFloat64Frac(float64 a)
 157 {
 158     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 159 }
 160
 161 /*----------------------------------------------------------------------------
 162 | Returns the exponent bits of the double-precision floating-point value `a'.
 163 *----------------------------------------------------------------------------*/
 164
 165 static inline int extractFloat64Exp(float64 a)
 166 {
 167     return (float64_val(a) >> 52) & 0x7FF;
 168 }
 169
 170 /*----------------------------------------------------------------------------
 171 | Returns the sign bit of the double-precision floating-point value `a'.
 172 *----------------------------------------------------------------------------*/
 173
 174 static inline flag extractFloat64Sign(float64 a)
 175 {
 176     return float64_val(a) >> 63;
 177 }
 178
 179 /*
 180  * Classify a floating point number. Everything above float_class_qnan
 181  * is a NaN so cls >= float_class_qnan is any NaN.
 182  */
 183
 184 typedef enum __attribute__ ((__packed__)) {
 185     float_class_unclassified,
 186     float_class_zero,
 187     float_class_normal,
 188     float_class_inf,
 189     float_class_qnan,  /* all NaNs from here */
 190     float_class_snan,
 191     float_class_dnan,
 192     float_class_msnan, /* maybe silenced */
 193 } FloatClass;
 194
 195 /*
 196  * Structure holding all of the decomposed parts of a float. The
 197  * exponent is unbiased and the fraction is normalized. All
 198  * calculations are done with a 64 bit fraction and then rounded as
 199  * appropriate for the final format.
 200  *
 201  * Thanks to the packed FloatClass a decent compiler should be able to
 202  * fit the whole structure into registers and avoid using the stack
 203  * for parameter passing.
 204  */
 205
 206 typedef struct {
 207     uint64_t frac;
 208     int32_t  exp;
 209     FloatClass cls;
 210     bool sign;
 211 } FloatParts;
 212
 213 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 214 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 215 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 216
 217 /* Structure holding all of the relevant parameters for a format.
 218  *   exp_size: the size of the exponent field
 219  *   exp_bias: the offset applied to the exponent field
 220  *   exp_max: the maximum normalised exponent
 221  *   frac_size: the size of the fraction field
 222  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 223  * The following are computed based the size of fraction
 224  *   frac_lsb: least significant bit of fraction
 225  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 226  *   round_mask/roundeven_mask: masks used for rounding
 227  */
 228 typedef struct {
 229     int exp_size;
 230     int exp_bias;
 231     int exp_max;
 232     int frac_size;
 233     int frac_shift;
 234     uint64_t frac_lsb;
 235     uint64_t frac_lsbm1;
 236     uint64_t round_mask;
 237     uint64_t roundeven_mask;
 238 } FloatFmt;
 239
 240 /* Expand fields based on the size of exponent and fraction */
 241 #define FLOAT_PARAMS(E, F)                                           \
 242     .exp_size       = E,                                             \
 243     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 244     .exp_max        = (1 << E) - 1,                                  \
 245     .frac_size      = F,                                             \
 246     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 247     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 248     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 249     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 250     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 251
 252 static const FloatFmt float16_params = {
 253     FLOAT_PARAMS(5, 10)
 254 };
 255
 256 static const FloatFmt float32_params = {
 257     FLOAT_PARAMS(8, 23)
 258 };
 259
 260 static const FloatFmt float64_params = {
 261     FLOAT_PARAMS(11, 52)
 262 };
 263
 264 /* Unpack a float to parts, but do not canonicalize.  */
 265 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 266 {
 267     const int sign_pos = fmt.frac_size + fmt.exp_size;
 268
 269     return (FloatParts) {
 270         .cls = float_class_unclassified,
 271         .sign = extract64(raw, sign_pos, 1),
 272         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 273         .frac = extract64(raw, 0, fmt.frac_size),
 274     };
 275 }
 276
 277 static inline FloatParts float16_unpack_raw(float16 f)
 278 {
 279     return unpack_raw(float16_params, f);
 280 }
 281
 282 static inline FloatParts float32_unpack_raw(float32 f)
 283 {
 284     return unpack_raw(float32_params, f);
 285 }
 286
 287 static inline FloatParts float64_unpack_raw(float64 f)
 288 {
 289     return unpack_raw(float64_params, f);
 290 }
 291
 292 /* Pack a float from parts, but do not canonicalize.  */
 293 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 294 {
 295     const int sign_pos = fmt.frac_size + fmt.exp_size;
 296     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 297     return deposit64(ret, sign_pos, 1, p.sign);
 298 }
 299
 300 static inline float16 float16_pack_raw(FloatParts p)
 301 {
 302     return make_float16(pack_raw(float16_params, p));
 303 }
 304
 305 static inline float32 float32_pack_raw(FloatParts p)
 306 {
 307     return make_float32(pack_raw(float32_params, p));
 308 }
 309
 310 static inline float64 float64_pack_raw(FloatParts p)
 311 {
 312     return make_float64(pack_raw(float64_params, p));
 313 }
 314
 315 /*----------------------------------------------------------------------------
 316 | Functions and definitions to determine:  (1) whether tininess for underflow
 317 | is detected before or after rounding by default, (2) what (if anything)
 318 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 319 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 320 | are propagated from function inputs to output.  These details are target-
 321 | specific.
 322 *----------------------------------------------------------------------------*/
 323 #include "softfloat-specialize.h"
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333             part.frac <<= parm->frac_shift;
 334             part.cls = (parts_is_snan_frac(part.frac, status)
 335                         ? float_class_snan : float_class_qnan);
 336         }
 337     } else if (part.exp == 0) {
 338         if (likely(part.frac == 0)) {
 339             part.cls = float_class_zero;
 340         } else if (status->flush_inputs_to_zero) {
 341             float_raise(float_flag_input_denormal, status);
 342             part.cls = float_class_zero;
 343             part.frac = 0;
 344         } else {
 345             int shift = clz64(part.frac) - 1;
 346             part.cls = float_class_normal;
 347             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 348             part.frac <<= shift;
 349         }
 350     } else {
 351         part.cls = float_class_normal;
 352         part.exp -= parm->exp_bias;
 353         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 354     }
 355     return part;
 356 }
 357
 358 /* Round and uncanonicalize a floating-point number by parts. There
 359  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 360  * fraction; these bits will be removed. The exponent will be biased
 361  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 362  */
 363
 364 static FloatParts round_canonical(FloatParts p, float_status *s,
 365                                   const FloatFmt *parm)
 366 {
 367     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 368     const uint64_t round_mask = parm->round_mask;
 369     const uint64_t roundeven_mask = parm->roundeven_mask;
 370     const int exp_max = parm->exp_max;
 371     const int frac_shift = parm->frac_shift;
 372     uint64_t frac, inc;
 373     int exp, flags = 0;
 374     bool overflow_norm;
 375
 376     frac = p.frac;
 377     exp = p.exp;
 378
 379     switch (p.cls) {
 380     case float_class_normal:
 381         switch (s->float_rounding_mode) {
 382         case float_round_nearest_even:
 383             overflow_norm = false;
 384             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 385             break;
 386         case float_round_ties_away:
 387             overflow_norm = false;
 388             inc = frac_lsbm1;
 389             break;
 390         case float_round_to_zero:
 391             overflow_norm = true;
 392             inc = 0;
 393             break;
 394         case float_round_up:
 395             inc = p.sign ? 0 : round_mask;
 396             overflow_norm = p.sign;
 397             break;
 398         case float_round_down:
 399             inc = p.sign ? round_mask : 0;
 400             overflow_norm = !p.sign;
 401             break;
 402         default:
 403             g_assert_not_reached();
 404         }
 405
 406         exp += parm->exp_bias;
 407         if (likely(exp > 0)) {
 408             if (frac & round_mask) {
 409                 flags |= float_flag_inexact;
 410                 frac += inc;
 411                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 412                     frac >>= 1;
 413                     exp++;
 414                 }
 415             }
 416             frac >>= frac_shift;
 417
 418             if (unlikely(exp >= exp_max)) {
 419                 flags |= float_flag_overflow | float_flag_inexact;
 420                 if (overflow_norm) {
 421                     exp = exp_max - 1;
 422                     frac = -1;
 423                 } else {
 424                     p.cls = float_class_inf;
 425                     goto do_inf;
 426                 }
 427             }
 428         } else if (s->flush_to_zero) {
 429             flags |= float_flag_output_denormal;
 430             p.cls = float_class_zero;
 431             goto do_zero;
 432         } else {
 433             bool is_tiny = (s->float_detect_tininess
 434                             == float_tininess_before_rounding)
 435                         || (exp < 0)
 436                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 437
 438             shift64RightJamming(frac, 1 - exp, &frac);
 439             if (frac & round_mask) {
 440                 /* Need to recompute round-to-even.  */
 441                 if (s->float_rounding_mode == float_round_nearest_even) {
 442                     inc = ((frac & roundeven_mask) != frac_lsbm1
 443                            ? frac_lsbm1 : 0);
 444                 }
 445                 flags |= float_flag_inexact;
 446                 frac += inc;
 447             }
 448
 449             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 450             frac >>= frac_shift;
 451
 452             if (is_tiny && (flags & float_flag_inexact)) {
 453                 flags |= float_flag_underflow;
 454             }
 455             if (exp == 0 && frac == 0) {
 456                 p.cls = float_class_zero;
 457             }
 458         }
 459         break;
 460
 461     case float_class_zero:
 462     do_zero:
 463         exp = 0;
 464         frac = 0;
 465         break;
 466
 467     case float_class_inf:
 468     do_inf:
 469         exp = exp_max;
 470         frac = 0;
 471         break;
 472
 473     case float_class_qnan:
 474     case float_class_snan:
 475         exp = exp_max;
 476         frac >>= parm->frac_shift;
 477         break;
 478
 479     default:
 480         g_assert_not_reached();
 481     }
 482
 483     float_raise(flags, s);
 484     p.exp = exp;
 485     p.frac = frac;
 486     return p;
 487 }
 488
 489 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 490 {
 491     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 492 }
 493
 494 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 495 {
 496     switch (p.cls) {
 497     case float_class_dnan:
 498         return float16_default_nan(s);
 499     case float_class_msnan:
 500         p.frac >>= float16_params.frac_shift;
 501         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 502     default:
 503         p = round_canonical(p, s, &float16_params);
 504         return float16_pack_raw(p);
 505     }
 506 }
 507
 508 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 509 {
 510     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 511 }
 512
 513 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 514 {
 515     switch (p.cls) {
 516     case float_class_dnan:
 517         return float32_default_nan(s);
 518     case float_class_msnan:
 519         p.frac >>= float32_params.frac_shift;
 520         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 521     default:
 522         p = round_canonical(p, s, &float32_params);
 523         return float32_pack_raw(p);
 524     }
 525 }
 526
 527 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 528 {
 529     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 530 }
 531
 532 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 533 {
 534     switch (p.cls) {
 535     case float_class_dnan:
 536         return float64_default_nan(s);
 537     case float_class_msnan:
 538         p.frac >>= float64_params.frac_shift;
 539         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 540     default:
 541         p = round_canonical(p, s, &float64_params);
 542         return float64_pack_raw(p);
 543     }
 544 }
 545
 546 /* Simple helpers for checking if what NaN we have */
 547 static bool is_nan(FloatClass c)
 548 {
 549     return unlikely(c >= float_class_qnan);
 550 }
 551 static bool is_snan(FloatClass c)
 552 {
 553     return c == float_class_snan;
 554 }
 555 static bool is_qnan(FloatClass c)
 556 {
 557     return c == float_class_qnan;
 558 }
 559
 560 static FloatParts return_nan(FloatParts a, float_status *s)
 561 {
 562     switch (a.cls) {
 563     case float_class_snan:
 564         s->float_exception_flags |= float_flag_invalid;
 565         a.cls = float_class_msnan;
 566         /* fall through */
 567     case float_class_qnan:
 568         if (s->default_nan_mode) {
 569             a.cls = float_class_dnan;
 570         }
 571         break;
 572
 573     default:
 574         g_assert_not_reached();
 575     }
 576     return a;
 577 }
 578
 579 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 580 {
 581     if (is_snan(a.cls) || is_snan(b.cls)) {
 582         s->float_exception_flags |= float_flag_invalid;
 583     }
 584
 585     if (s->default_nan_mode) {
 586         a.cls = float_class_dnan;
 587     } else {
 588         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 589                     is_qnan(b.cls), is_snan(b.cls),
 590                     a.frac > b.frac ||
 591                     (a.frac == b.frac && a.sign < b.sign))) {
 592             a = b;
 593         }
 594         a.cls = float_class_msnan;
 595     }
 596     return a;
 597 }
 598
 599 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 600                                   bool inf_zero, float_status *s)
 601 {
 602     int which;
 603
 604     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 605         s->float_exception_flags |= float_flag_invalid;
 606     }
 607
 608     which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 609                           is_qnan(b.cls), is_snan(b.cls),
 610                           is_qnan(c.cls), is_snan(c.cls),
 611                           inf_zero, s);
 612
 613     if (s->default_nan_mode) {
 614         /* Note that this check is after pickNaNMulAdd so that function
 615          * has an opportunity to set the Invalid flag.
 616          */
 617         a.cls = float_class_dnan;
 618         return a;
 619     }
 620
 621     switch (which) {
 622     case 0:
 623         break;
 624     case 1:
 625         a = b;
 626         break;
 627     case 2:
 628         a = c;
 629         break;
 630     case 3:
 631         a.cls = float_class_dnan;
 632         return a;
 633     default:
 634         g_assert_not_reached();
 635     }
 636     a.cls = float_class_msnan;
 637
 638     return a;
 639 }
 640
 641 /*
 642  * Returns the result of adding or subtracting the values of the
 643  * floating-point values `a' and `b'. The operation is performed
 644  * according to the IEC/IEEE Standard for Binary Floating-Point
 645  * Arithmetic.
 646  */
 647
 648 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 649                                 float_status *s)
 650 {
 651     bool a_sign = a.sign;
 652     bool b_sign = b.sign ^ subtract;
 653
 654     if (a_sign != b_sign) {
 655         /* Subtraction */
 656
 657         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 658             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 659                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 660                 a.frac = a.frac - b.frac;
 661             } else {
 662                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 663                 a.frac = b.frac - a.frac;
 664                 a.exp = b.exp;
 665                 a_sign ^= 1;
 666             }
 667
 668             if (a.frac == 0) {
 669                 a.cls = float_class_zero;
 670                 a.sign = s->float_rounding_mode == float_round_down;
 671             } else {
 672                 int shift = clz64(a.frac) - 1;
 673                 a.frac = a.frac << shift;
 674                 a.exp = a.exp - shift;
 675                 a.sign = a_sign;
 676             }
 677             return a;
 678         }
 679         if (is_nan(a.cls) || is_nan(b.cls)) {
 680             return pick_nan(a, b, s);
 681         }
 682         if (a.cls == float_class_inf) {
 683             if (b.cls == float_class_inf) {
 684                 float_raise(float_flag_invalid, s);
 685                 a.cls = float_class_dnan;
 686             }
 687             return a;
 688         }
 689         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 690             a.sign = s->float_rounding_mode == float_round_down;
 691             return a;
 692         }
 693         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 694             b.sign = a_sign ^ 1;
 695             return b;
 696         }
 697         if (b.cls == float_class_zero) {
 698             return a;
 699         }
 700     } else {
 701         /* Addition */
 702         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 703             if (a.exp > b.exp) {
 704                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 705             } else if (a.exp < b.exp) {
 706                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 707                 a.exp = b.exp;
 708             }
 709             a.frac += b.frac;
 710             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 711                 a.frac >>= 1;
 712                 a.exp += 1;
 713             }
 714             return a;
 715         }
 716         if (is_nan(a.cls) || is_nan(b.cls)) {
 717             return pick_nan(a, b, s);
 718         }
 719         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 720             return a;
 721         }
 722         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 723             b.sign = b_sign;
 724             return b;
 725         }
 726     }
 727     g_assert_not_reached();
 728 }
 729
 730 /*
 731  * Returns the result of adding or subtracting the floating-point
 732  * values `a' and `b'. The operation is performed according to the
 733  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 734  */
 735
 736 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 737                                               float_status *status)
 738 {
 739     FloatParts pa = float16_unpack_canonical(a, status);
 740     FloatParts pb = float16_unpack_canonical(b, status);
 741     FloatParts pr = addsub_floats(pa, pb, false, status);
 742
 743     return float16_round_pack_canonical(pr, status);
 744 }
 745
 746 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 747                                              float_status *status)
 748 {
 749     FloatParts pa = float32_unpack_canonical(a, status);
 750     FloatParts pb = float32_unpack_canonical(b, status);
 751     FloatParts pr = addsub_floats(pa, pb, false, status);
 752
 753     return float32_round_pack_canonical(pr, status);
 754 }
 755
 756 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 757                                              float_status *status)
 758 {
 759     FloatParts pa = float64_unpack_canonical(a, status);
 760     FloatParts pb = float64_unpack_canonical(b, status);
 761     FloatParts pr = addsub_floats(pa, pb, false, status);
 762
 763     return float64_round_pack_canonical(pr, status);
 764 }
 765
 766 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 767                                              float_status *status)
 768 {
 769     FloatParts pa = float16_unpack_canonical(a, status);
 770     FloatParts pb = float16_unpack_canonical(b, status);
 771     FloatParts pr = addsub_floats(pa, pb, true, status);
 772
 773     return float16_round_pack_canonical(pr, status);
 774 }
 775
 776 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 777                                              float_status *status)
 778 {
 779     FloatParts pa = float32_unpack_canonical(a, status);
 780     FloatParts pb = float32_unpack_canonical(b, status);
 781     FloatParts pr = addsub_floats(pa, pb, true, status);
 782
 783     return float32_round_pack_canonical(pr, status);
 784 }
 785
 786 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 787                                              float_status *status)
 788 {
 789     FloatParts pa = float64_unpack_canonical(a, status);
 790     FloatParts pb = float64_unpack_canonical(b, status);
 791     FloatParts pr = addsub_floats(pa, pb, true, status);
 792
 793     return float64_round_pack_canonical(pr, status);
 794 }
 795
 796 /*
 797  * Returns the result of multiplying the floating-point values `a' and
 798  * `b'. The operation is performed according to the IEC/IEEE Standard
 799  * for Binary Floating-Point Arithmetic.
 800  */
 801
 802 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 803 {
 804     bool sign = a.sign ^ b.sign;
 805
 806     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 807         uint64_t hi, lo;
 808         int exp = a.exp + b.exp;
 809
 810         mul64To128(a.frac, b.frac, &hi, &lo);
 811         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 812         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 813             shift64RightJamming(lo, 1, &lo);
 814             exp += 1;
 815         }
 816
 817         /* Re-use a */
 818         a.exp = exp;
 819         a.sign = sign;
 820         a.frac = lo;
 821         return a;
 822     }
 823     /* handle all the NaN cases */
 824     if (is_nan(a.cls) || is_nan(b.cls)) {
 825         return pick_nan(a, b, s);
 826     }
 827     /* Inf * Zero == NaN */
 828     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 829         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 830         s->float_exception_flags |= float_flag_invalid;
 831         a.cls = float_class_dnan;
 832         a.sign = sign;
 833         return a;
 834     }
 835     /* Multiply by 0 or Inf */
 836     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 837         a.sign = sign;
 838         return a;
 839     }
 840     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 841         b.sign = sign;
 842         return b;
 843     }
 844     g_assert_not_reached();
 845 }
 846
 847 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 848                                              float_status *status)
 849 {
 850     FloatParts pa = float16_unpack_canonical(a, status);
 851     FloatParts pb = float16_unpack_canonical(b, status);
 852     FloatParts pr = mul_floats(pa, pb, status);
 853
 854     return float16_round_pack_canonical(pr, status);
 855 }
 856
 857 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 858                                              float_status *status)
 859 {
 860     FloatParts pa = float32_unpack_canonical(a, status);
 861     FloatParts pb = float32_unpack_canonical(b, status);
 862     FloatParts pr = mul_floats(pa, pb, status);
 863
 864     return float32_round_pack_canonical(pr, status);
 865 }
 866
 867 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 868                                              float_status *status)
 869 {
 870     FloatParts pa = float64_unpack_canonical(a, status);
 871     FloatParts pb = float64_unpack_canonical(b, status);
 872     FloatParts pr = mul_floats(pa, pb, status);
 873
 874     return float64_round_pack_canonical(pr, status);
 875 }
 876
 877 /*
 878  * Returns the result of multiplying the floating-point values `a' and
 879  * `b' then adding 'c', with no intermediate rounding step after the
 880  * multiplication. The operation is performed according to the
 881  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 882  * The flags argument allows the caller to select negation of the
 883  * addend, the intermediate product, or the final result. (The
 884  * difference between this and having the caller do a separate
 885  * negation is that negating externally will flip the sign bit on
 886  * NaNs.)
 887  */
 888
 889 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 890                                 int flags, float_status *s)
 891 {
 892     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 893                     ((1 << float_class_inf) | (1 << float_class_zero));
 894     bool p_sign;
 895     bool sign_flip = flags & float_muladd_negate_result;
 896     FloatClass p_class;
 897     uint64_t hi, lo;
 898     int p_exp;
 899
 900     /* It is implementation-defined whether the cases of (0,inf,qnan)
 901      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 902      * they return if they do), so we have to hand this information
 903      * off to the target-specific pick-a-NaN routine.
 904      */
 905     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 906         return pick_nan_muladd(a, b, c, inf_zero, s);
 907     }
 908
 909     if (inf_zero) {
 910         s->float_exception_flags |= float_flag_invalid;
 911         a.cls = float_class_dnan;
 912         return a;
 913     }
 914
 915     if (flags & float_muladd_negate_c) {
 916         c.sign ^= 1;
 917     }
 918
 919     p_sign = a.sign ^ b.sign;
 920
 921     if (flags & float_muladd_negate_product) {
 922         p_sign ^= 1;
 923     }
 924
 925     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 926         p_class = float_class_inf;
 927     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 928         p_class = float_class_zero;
 929     } else {
 930         p_class = float_class_normal;
 931     }
 932
 933     if (c.cls == float_class_inf) {
 934         if (p_class == float_class_inf && p_sign != c.sign) {
 935             s->float_exception_flags |= float_flag_invalid;
 936             a.cls = float_class_dnan;
 937         } else {
 938             a.cls = float_class_inf;
 939             a.sign = c.sign ^ sign_flip;
 940         }
 941         return a;
 942     }
 943
 944     if (p_class == float_class_inf) {
 945         a.cls = float_class_inf;
 946         a.sign = p_sign ^ sign_flip;
 947         return a;
 948     }
 949
 950     if (p_class == float_class_zero) {
 951         if (c.cls == float_class_zero) {
 952             if (p_sign != c.sign) {
 953                 p_sign = s->float_rounding_mode == float_round_down;
 954             }
 955             c.sign = p_sign;
 956         } else if (flags & float_muladd_halve_result) {
 957             c.exp -= 1;
 958         }
 959         c.sign ^= sign_flip;
 960         return c;
 961     }
 962
 963     /* a & b should be normals now... */
 964     assert(a.cls == float_class_normal &&
 965            b.cls == float_class_normal);
 966
 967     p_exp = a.exp + b.exp;
 968
 969     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 970      * result.
 971      */
 972     mul64To128(a.frac, b.frac, &hi, &lo);
 973     /* binary point now at bit 124 */
 974
 975     /* check for overflow */
 976     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 977         shift128RightJamming(hi, lo, 1, &hi, &lo);
 978         p_exp += 1;
 979     }
 980
 981     /* + add/sub */
 982     if (c.cls == float_class_zero) {
 983         /* move binary point back to 62 */
 984         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 985     } else {
 986         int exp_diff = p_exp - c.exp;
 987         if (p_sign == c.sign) {
 988             /* Addition */
 989             if (exp_diff <= 0) {
 990                 shift128RightJamming(hi, lo,
 991                                      DECOMPOSED_BINARY_POINT - exp_diff,
 992                                      &hi, &lo);
 993                 lo += c.frac;
 994                 p_exp = c.exp;
 995             } else {
 996                 uint64_t c_hi, c_lo;
 997                 /* shift c to the same binary point as the product (124) */
 998                 c_hi = c.frac >> 2;
 999                 c_lo = 0;
1000                 shift128RightJamming(c_hi, c_lo,
1001                                      exp_diff,
1002                                      &c_hi, &c_lo);
1003                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1004                 /* move binary point back to 62 */
1005                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1006             }
1007
1008             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1009                 shift64RightJamming(lo, 1, &lo);
1010                 p_exp += 1;
1011             }
1012
1013         } else {
1014             /* Subtraction */
1015             uint64_t c_hi, c_lo;
1016             /* make C binary point match product at bit 124 */
1017             c_hi = c.frac >> 2;
1018             c_lo = 0;
1019
1020             if (exp_diff <= 0) {
1021                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1022                 if (exp_diff == 0
1023                     &&
1024                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1025                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026                 } else {
1027                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1028                     p_sign ^= 1;
1029                     p_exp = c.exp;
1030                 }
1031             } else {
1032                 shift128RightJamming(c_hi, c_lo,
1033                                      exp_diff,
1034                                      &c_hi, &c_lo);
1035                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1036             }
1037
1038             if (hi == 0 && lo == 0) {
1039                 a.cls = float_class_zero;
1040                 a.sign = s->float_rounding_mode == float_round_down;
1041                 a.sign ^= sign_flip;
1042                 return a;
1043             } else {
1044                 int shift;
1045                 if (hi != 0) {
1046                     shift = clz64(hi);
1047                 } else {
1048                     shift = clz64(lo) + 64;
1049                 }
1050                 /* Normalizing to a binary point of 124 is the
1051                    correct adjust for the exponent.  However since we're
1052                    shifting, we might as well put the binary point back
1053                    at 62 where we really want it.  Therefore shift as
1054                    if we're leaving 1 bit at the top of the word, but
1055                    adjust the exponent as if we're leaving 3 bits.  */
1056                 shift -= 1;
1057                 if (shift >= 64) {
1058                     lo = lo << (shift - 64);
1059                 } else {
1060                     hi = (hi << shift) | (lo >> (64 - shift));
1061                     lo = hi | ((lo << shift) != 0);
1062                 }
1063                 p_exp -= shift - 2;
1064             }
1065         }
1066     }
1067
1068     if (flags & float_muladd_halve_result) {
1069         p_exp -= 1;
1070     }
1071
1072     /* finally prepare our result */
1073     a.cls = float_class_normal;
1074     a.sign = p_sign ^ sign_flip;
1075     a.exp = p_exp;
1076     a.frac = lo;
1077
1078     return a;
1079 }
1080
1081 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1082                                                 int flags, float_status *status)
1083 {
1084     FloatParts pa = float16_unpack_canonical(a, status);
1085     FloatParts pb = float16_unpack_canonical(b, status);
1086     FloatParts pc = float16_unpack_canonical(c, status);
1087     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1088
1089     return float16_round_pack_canonical(pr, status);
1090 }
1091
1092 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1093                                                 int flags, float_status *status)
1094 {
1095     FloatParts pa = float32_unpack_canonical(a, status);
1096     FloatParts pb = float32_unpack_canonical(b, status);
1097     FloatParts pc = float32_unpack_canonical(c, status);
1098     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1099
1100     return float32_round_pack_canonical(pr, status);
1101 }
1102
1103 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1104                                                 int flags, float_status *status)
1105 {
1106     FloatParts pa = float64_unpack_canonical(a, status);
1107     FloatParts pb = float64_unpack_canonical(b, status);
1108     FloatParts pc = float64_unpack_canonical(c, status);
1109     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1110
1111     return float64_round_pack_canonical(pr, status);
1112 }
1113
1114 /*
1115  * Returns the result of dividing the floating-point value `a' by the
1116  * corresponding value `b'. The operation is performed according to
1117  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118  */
1119
1120 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1121 {
1122     bool sign = a.sign ^ b.sign;
1123
1124     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1125         uint64_t temp_lo, temp_hi;
1126         int exp = a.exp - b.exp;
1127         if (a.frac < b.frac) {
1128             exp -= 1;
1129             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1130                               &temp_hi, &temp_lo);
1131         } else {
1132             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1133                               &temp_hi, &temp_lo);
1134         }
1135         /* LSB of quot is set if inexact which roundandpack will use
1136          * to set flags. Yet again we re-use a for the result */
1137         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1138         a.sign = sign;
1139         a.exp = exp;
1140         return a;
1141     }
1142     /* handle all the NaN cases */
1143     if (is_nan(a.cls) || is_nan(b.cls)) {
1144         return pick_nan(a, b, s);
1145     }
1146     /* 0/0 or Inf/Inf */
1147     if (a.cls == b.cls
1148         &&
1149         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1150         s->float_exception_flags |= float_flag_invalid;
1151         a.cls = float_class_dnan;
1152         return a;
1153     }
1154     /* Inf / x or 0 / x */
1155     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1156         a.sign = sign;
1157         return a;
1158     }
1159     /* Div 0 => Inf */
1160     if (b.cls == float_class_zero) {
1161         s->float_exception_flags |= float_flag_divbyzero;
1162         a.cls = float_class_inf;
1163         a.sign = sign;
1164         return a;
1165     }
1166     /* Div by Inf */
1167     if (b.cls == float_class_inf) {
1168         a.cls = float_class_zero;
1169         a.sign = sign;
1170         return a;
1171     }
1172     g_assert_not_reached();
1173 }
1174
1175 float16 float16_div(float16 a, float16 b, float_status *status)
1176 {
1177     FloatParts pa = float16_unpack_canonical(a, status);
1178     FloatParts pb = float16_unpack_canonical(b, status);
1179     FloatParts pr = div_floats(pa, pb, status);
1180
1181     return float16_round_pack_canonical(pr, status);
1182 }
1183
1184 float32 float32_div(float32 a, float32 b, float_status *status)
1185 {
1186     FloatParts pa = float32_unpack_canonical(a, status);
1187     FloatParts pb = float32_unpack_canonical(b, status);
1188     FloatParts pr = div_floats(pa, pb, status);
1189
1190     return float32_round_pack_canonical(pr, status);
1191 }
1192
1193 float64 float64_div(float64 a, float64 b, float_status *status)
1194 {
1195     FloatParts pa = float64_unpack_canonical(a, status);
1196     FloatParts pb = float64_unpack_canonical(b, status);
1197     FloatParts pr = div_floats(pa, pb, status);
1198
1199     return float64_round_pack_canonical(pr, status);
1200 }
1201
1202 /*
1203  * Rounds the floating-point value `a' to an integer, and returns the
1204  * result as a floating-point value. The operation is performed
1205  * according to the IEC/IEEE Standard for Binary Floating-Point
1206  * Arithmetic.
1207  */
1208
1209 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1210 {
1211     if (is_nan(a.cls)) {
1212         return return_nan(a, s);
1213     }
1214
1215     switch (a.cls) {
1216     case float_class_zero:
1217     case float_class_inf:
1218     case float_class_qnan:
1219         /* already "integral" */
1220         break;
1221     case float_class_normal:
1222         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1223             /* already integral */
1224             break;
1225         }
1226         if (a.exp < 0) {
1227             bool one;
1228             /* all fractional */
1229             s->float_exception_flags |= float_flag_inexact;
1230             switch (rounding_mode) {
1231             case float_round_nearest_even:
1232                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1233                 break;
1234             case float_round_ties_away:
1235                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1236                 break;
1237             case float_round_to_zero:
1238                 one = false;
1239                 break;
1240             case float_round_up:
1241                 one = !a.sign;
1242                 break;
1243             case float_round_down:
1244                 one = a.sign;
1245                 break;
1246             default:
1247                 g_assert_not_reached();
1248             }
1249
1250             if (one) {
1251                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1252                 a.exp = 0;
1253             } else {
1254                 a.cls = float_class_zero;
1255             }
1256         } else {
1257             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1258             uint64_t frac_lsbm1 = frac_lsb >> 1;
1259             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1260             uint64_t rnd_mask = rnd_even_mask >> 1;
1261             uint64_t inc;
1262
1263             switch (rounding_mode) {
1264             case float_round_nearest_even:
1265                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1266                 break;
1267             case float_round_ties_away:
1268                 inc = frac_lsbm1;
1269                 break;
1270             case float_round_to_zero:
1271                 inc = 0;
1272                 break;
1273             case float_round_up:
1274                 inc = a.sign ? 0 : rnd_mask;
1275                 break;
1276             case float_round_down:
1277                 inc = a.sign ? rnd_mask : 0;
1278                 break;
1279             default:
1280                 g_assert_not_reached();
1281             }
1282
1283             if (a.frac & rnd_mask) {
1284                 s->float_exception_flags |= float_flag_inexact;
1285                 a.frac += inc;
1286                 a.frac &= ~rnd_mask;
1287                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1288                     a.frac >>= 1;
1289                     a.exp++;
1290                 }
1291             }
1292         }
1293         break;
1294     default:
1295         g_assert_not_reached();
1296     }
1297     return a;
1298 }
1299
1300 float16 float16_round_to_int(float16 a, float_status *s)
1301 {
1302     FloatParts pa = float16_unpack_canonical(a, s);
1303     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1304     return float16_round_pack_canonical(pr, s);
1305 }
1306
1307 float32 float32_round_to_int(float32 a, float_status *s)
1308 {
1309     FloatParts pa = float32_unpack_canonical(a, s);
1310     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1311     return float32_round_pack_canonical(pr, s);
1312 }
1313
1314 float64 float64_round_to_int(float64 a, float_status *s)
1315 {
1316     FloatParts pa = float64_unpack_canonical(a, s);
1317     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1318     return float64_round_pack_canonical(pr, s);
1319 }
1320
1321 float64 float64_trunc_to_int(float64 a, float_status *s)
1322 {
1323     FloatParts pa = float64_unpack_canonical(a, s);
1324     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1325     return float64_round_pack_canonical(pr, s);
1326 }
1327
1328 /*
1329  * Returns the result of converting the floating-point value `a' to
1330  * the two's complement integer format. The conversion is performed
1331  * according to the IEC/IEEE Standard for Binary Floating-Point
1332  * Arithmetic---which means in particular that the conversion is
1333  * rounded according to the current rounding mode. If `a' is a NaN,
1334  * the largest positive integer is returned. Otherwise, if the
1335  * conversion overflows, the largest integer with the same sign as `a'
1336  * is returned.
1337 */
1338
1339 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1340                                      int64_t min, int64_t max,
1341                                      float_status *s)
1342 {
1343     uint64_t r;
1344     int orig_flags = get_float_exception_flags(s);
1345     FloatParts p = round_to_int(in, rmode, s);
1346
1347     switch (p.cls) {
1348     case float_class_snan:
1349     case float_class_qnan:
1350     case float_class_dnan:
1351     case float_class_msnan:
1352         s->float_exception_flags = orig_flags | float_flag_invalid;
1353         return max;
1354     case float_class_inf:
1355         s->float_exception_flags = orig_flags | float_flag_invalid;
1356         return p.sign ? min : max;
1357     case float_class_zero:
1358         return 0;
1359     case float_class_normal:
1360         if (p.exp < DECOMPOSED_BINARY_POINT) {
1361             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1362         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1363             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1364         } else {
1365             r = UINT64_MAX;
1366         }
1367         if (p.sign) {
1368             if (r <= -(uint64_t) min) {
1369                 return -r;
1370             } else {
1371                 s->float_exception_flags = orig_flags | float_flag_invalid;
1372                 return min;
1373             }
1374         } else {
1375             if (r <= max) {
1376                 return r;
1377             } else {
1378                 s->float_exception_flags = orig_flags | float_flag_invalid;
1379                 return max;
1380             }
1381         }
1382     default:
1383         g_assert_not_reached();
1384     }
1385 }
1386
1387 #define FLOAT_TO_INT(fsz, isz)                                          \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1389                                                 float_status *s)        \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }                                                                       \
1396                                                                         \
1397 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1398  (float ## fsz a, float_status *s)                                      \
1399 {                                                                       \
1400     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1401     return round_to_int_and_pack(p, float_round_to_zero,                \
1402                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1403                                  s);                                    \
1404 }
1405
1406 FLOAT_TO_INT(16, 16)
1407 FLOAT_TO_INT(16, 32)
1408 FLOAT_TO_INT(16, 64)
1409
1410 FLOAT_TO_INT(32, 16)
1411 FLOAT_TO_INT(32, 32)
1412 FLOAT_TO_INT(32, 64)
1413
1414 FLOAT_TO_INT(64, 16)
1415 FLOAT_TO_INT(64, 32)
1416 FLOAT_TO_INT(64, 64)
1417
1418 #undef FLOAT_TO_INT
1419
1420 /*
1421  *  Returns the result of converting the floating-point value `a' to
1422  *  the unsigned integer format. The conversion is performed according
1423  *  to the IEC/IEEE Standard for Binary Floating-Point
1424  *  Arithmetic---which means in particular that the conversion is
1425  *  rounded according to the current rounding mode. If `a' is a NaN,
1426  *  the largest unsigned integer is returned. Otherwise, if the
1427  *  conversion overflows, the largest unsigned integer is returned. If
1428  *  the 'a' is negative, the result is rounded and zero is returned;
1429  *  values that do not round to zero will raise the inexact exception
1430  *  flag.
1431  */
1432
1433 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1434                                        float_status *s)
1435 {
1436     int orig_flags = get_float_exception_flags(s);
1437     FloatParts p = round_to_int(in, rmode, s);
1438
1439     switch (p.cls) {
1440     case float_class_snan:
1441     case float_class_qnan:
1442     case float_class_dnan:
1443     case float_class_msnan:
1444         s->float_exception_flags = orig_flags | float_flag_invalid;
1445         return max;
1446     case float_class_inf:
1447         s->float_exception_flags = orig_flags | float_flag_invalid;
1448         return p.sign ? 0 : max;
1449     case float_class_zero:
1450         return 0;
1451     case float_class_normal:
1452     {
1453         uint64_t r;
1454         if (p.sign) {
1455             s->float_exception_flags = orig_flags | float_flag_invalid;
1456             return 0;
1457         }
1458
1459         if (p.exp < DECOMPOSED_BINARY_POINT) {
1460             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1461         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1462             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1463         } else {
1464             s->float_exception_flags = orig_flags | float_flag_invalid;
1465             return max;
1466         }
1467
1468         /* For uint64 this will never trip, but if p.exp is too large
1469          * to shift a decomposed fraction we shall have exited via the
1470          * 3rd leg above.
1471          */
1472         if (r > max) {
1473             s->float_exception_flags = orig_flags | float_flag_invalid;
1474             return max;
1475         } else {
1476             return r;
1477         }
1478     }
1479     default:
1480         g_assert_not_reached();
1481     }
1482 }
1483
1484 #define FLOAT_TO_UINT(fsz, isz) \
1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1486                                                   float_status *s)      \
1487 {                                                                       \
1488     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1489     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1490                                  UINT ## isz ## _MAX, s);               \
1491 }                                                                       \
1492                                                                         \
1493 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1494  (float ## fsz a, float_status *s)                                      \
1495 {                                                                       \
1496     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1497     return round_to_uint_and_pack(p, float_round_to_zero,               \
1498                                   UINT ## isz ## _MAX, s);              \
1499 }
1500
1501 FLOAT_TO_UINT(16, 16)
1502 FLOAT_TO_UINT(16, 32)
1503 FLOAT_TO_UINT(16, 64)
1504
1505 FLOAT_TO_UINT(32, 16)
1506 FLOAT_TO_UINT(32, 32)
1507 FLOAT_TO_UINT(32, 64)
1508
1509 FLOAT_TO_UINT(64, 16)
1510 FLOAT_TO_UINT(64, 32)
1511 FLOAT_TO_UINT(64, 64)
1512
1513 #undef FLOAT_TO_UINT
1514
1515 /*
1516  * Integer to float conversions
1517  *
1518  * Returns the result of converting the two's complement integer `a'
1519  * to the floating-point format. The conversion is performed according
1520  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1521  */
1522
1523 static FloatParts int_to_float(int64_t a, float_status *status)
1524 {
1525     FloatParts r = {};
1526     if (a == 0) {
1527         r.cls = float_class_zero;
1528         r.sign = false;
1529     } else if (a == (1ULL << 63)) {
1530         r.cls = float_class_normal;
1531         r.sign = true;
1532         r.frac = DECOMPOSED_IMPLICIT_BIT;
1533         r.exp = 63;
1534     } else {
1535         uint64_t f;
1536         if (a < 0) {
1537             f = -a;
1538             r.sign = true;
1539         } else {
1540             f = a;
1541             r.sign = false;
1542         }
1543         int shift = clz64(f) - 1;
1544         r.cls = float_class_normal;
1545         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1546         r.frac = f << shift;
1547     }
1548
1549     return r;
1550 }
1551
1552 float16 int64_to_float16(int64_t a, float_status *status)
1553 {
1554     FloatParts pa = int_to_float(a, status);
1555     return float16_round_pack_canonical(pa, status);
1556 }
1557
1558 float16 int32_to_float16(int32_t a, float_status *status)
1559 {
1560     return int64_to_float16(a, status);
1561 }
1562
1563 float16 int16_to_float16(int16_t a, float_status *status)
1564 {
1565     return int64_to_float16(a, status);
1566 }
1567
1568 float32 int64_to_float32(int64_t a, float_status *status)
1569 {
1570     FloatParts pa = int_to_float(a, status);
1571     return float32_round_pack_canonical(pa, status);
1572 }
1573
1574 float32 int32_to_float32(int32_t a, float_status *status)
1575 {
1576     return int64_to_float32(a, status);
1577 }
1578
1579 float32 int16_to_float32(int16_t a, float_status *status)
1580 {
1581     return int64_to_float32(a, status);
1582 }
1583
1584 float64 int64_to_float64(int64_t a, float_status *status)
1585 {
1586     FloatParts pa = int_to_float(a, status);
1587     return float64_round_pack_canonical(pa, status);
1588 }
1589
1590 float64 int32_to_float64(int32_t a, float_status *status)
1591 {
1592     return int64_to_float64(a, status);
1593 }
1594
1595 float64 int16_to_float64(int16_t a, float_status *status)
1596 {
1597     return int64_to_float64(a, status);
1598 }
1599
1600
1601 /*
1602  * Unsigned Integer to float conversions
1603  *
1604  * Returns the result of converting the unsigned integer `a' to the
1605  * floating-point format. The conversion is performed according to the
1606  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1607  */
1608
1609 static FloatParts uint_to_float(uint64_t a, float_status *status)
1610 {
1611     FloatParts r = { .sign = false};
1612
1613     if (a == 0) {
1614         r.cls = float_class_zero;
1615     } else {
1616         int spare_bits = clz64(a) - 1;
1617         r.cls = float_class_normal;
1618         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1619         if (spare_bits < 0) {
1620             shift64RightJamming(a, -spare_bits, &a);
1621             r.frac = a;
1622         } else {
1623             r.frac = a << spare_bits;
1624         }
1625     }
1626
1627     return r;
1628 }
1629
1630 float16 uint64_to_float16(uint64_t a, float_status *status)
1631 {
1632     FloatParts pa = uint_to_float(a, status);
1633     return float16_round_pack_canonical(pa, status);
1634 }
1635
1636 float16 uint32_to_float16(uint32_t a, float_status *status)
1637 {
1638     return uint64_to_float16(a, status);
1639 }
1640
1641 float16 uint16_to_float16(uint16_t a, float_status *status)
1642 {
1643     return uint64_to_float16(a, status);
1644 }
1645
1646 float32 uint64_to_float32(uint64_t a, float_status *status)
1647 {
1648     FloatParts pa = uint_to_float(a, status);
1649     return float32_round_pack_canonical(pa, status);
1650 }
1651
1652 float32 uint32_to_float32(uint32_t a, float_status *status)
1653 {
1654     return uint64_to_float32(a, status);
1655 }
1656
1657 float32 uint16_to_float32(uint16_t a, float_status *status)
1658 {
1659     return uint64_to_float32(a, status);
1660 }
1661
1662 float64 uint64_to_float64(uint64_t a, float_status *status)
1663 {
1664     FloatParts pa = uint_to_float(a, status);
1665     return float64_round_pack_canonical(pa, status);
1666 }
1667
1668 float64 uint32_to_float64(uint32_t a, float_status *status)
1669 {
1670     return uint64_to_float64(a, status);
1671 }
1672
1673 float64 uint16_to_float64(uint16_t a, float_status *status)
1674 {
1675     return uint64_to_float64(a, status);
1676 }
1677
1678 /* Float Min/Max */
1679 /* min() and max() functions. These can't be implemented as
1680  * 'compare and pick one input' because that would mishandle
1681  * NaNs and +0 vs -0.
1682  *
1683  * minnum() and maxnum() functions. These are similar to the min()
1684  * and max() functions but if one of the arguments is a QNaN and
1685  * the other is numerical then the numerical argument is returned.
1686  * SNaNs will get quietened before being returned.
1687  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1688  * and maxNum() operations. min() and max() are the typical min/max
1689  * semantics provided by many CPUs which predate that specification.
1690  *
1691  * minnummag() and maxnummag() functions correspond to minNumMag()
1692  * and minNumMag() from the IEEE-754 2008.
1693  */
1694 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1695                                 bool ieee, bool ismag, float_status *s)
1696 {
1697     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1698         if (ieee) {
1699             /* Takes two floating-point values `a' and `b', one of
1700              * which is a NaN, and returns the appropriate NaN
1701              * result. If either `a' or `b' is a signaling NaN,
1702              * the invalid exception is raised.
1703              */
1704             if (is_snan(a.cls) || is_snan(b.cls)) {
1705                 return pick_nan(a, b, s);
1706             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1707                 return b;
1708             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1709                 return a;
1710             }
1711         }
1712         return pick_nan(a, b, s);
1713     } else {
1714         int a_exp, b_exp;
1715
1716         switch (a.cls) {
1717         case float_class_normal:
1718             a_exp = a.exp;
1719             break;
1720         case float_class_inf:
1721             a_exp = INT_MAX;
1722             break;
1723         case float_class_zero:
1724             a_exp = INT_MIN;
1725             break;
1726         default:
1727             g_assert_not_reached();
1728             break;
1729         }
1730         switch (b.cls) {
1731         case float_class_normal:
1732             b_exp = b.exp;
1733             break;
1734         case float_class_inf:
1735             b_exp = INT_MAX;
1736             break;
1737         case float_class_zero:
1738             b_exp = INT_MIN;
1739             break;
1740         default:
1741             g_assert_not_reached();
1742             break;
1743         }
1744
1745         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1746             bool a_less = a_exp < b_exp;
1747             if (a_exp == b_exp) {
1748                 a_less = a.frac < b.frac;
1749             }
1750             return a_less ^ ismin ? b : a;
1751         }
1752
1753         if (a.sign == b.sign) {
1754             bool a_less = a_exp < b_exp;
1755             if (a_exp == b_exp) {
1756                 a_less = a.frac < b.frac;
1757             }
1758             return a.sign ^ a_less ^ ismin ? b : a;
1759         } else {
1760             return a.sign ^ ismin ? b : a;
1761         }
1762     }
1763 }
1764
1765 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1766 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1767                                      float_status *s)                   \
1768 {                                                                       \
1769     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1770     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1771     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1772                                                                         \
1773     return float ## sz ## _round_pack_canonical(pr, s);                 \
1774 }
1775
1776 MINMAX(16, min, true, false, false)
1777 MINMAX(16, minnum, true, true, false)
1778 MINMAX(16, minnummag, true, true, true)
1779 MINMAX(16, max, false, false, false)
1780 MINMAX(16, maxnum, false, true, false)
1781 MINMAX(16, maxnummag, false, true, true)
1782
1783 MINMAX(32, min, true, false, false)
1784 MINMAX(32, minnum, true, true, false)
1785 MINMAX(32, minnummag, true, true, true)
1786 MINMAX(32, max, false, false, false)
1787 MINMAX(32, maxnum, false, true, false)
1788 MINMAX(32, maxnummag, false, true, true)
1789
1790 MINMAX(64, min, true, false, false)
1791 MINMAX(64, minnum, true, true, false)
1792 MINMAX(64, minnummag, true, true, true)
1793 MINMAX(64, max, false, false, false)
1794 MINMAX(64, maxnum, false, true, false)
1795 MINMAX(64, maxnummag, false, true, true)
1796
1797 #undef MINMAX
1798
1799 /* Floating point compare */
1800 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1801                           float_status *s)
1802 {
1803     if (is_nan(a.cls) || is_nan(b.cls)) {
1804         if (!is_quiet ||
1805             a.cls == float_class_snan ||
1806             b.cls == float_class_snan) {
1807             s->float_exception_flags |= float_flag_invalid;
1808         }
1809         return float_relation_unordered;
1810     }
1811
1812     if (a.cls == float_class_zero) {
1813         if (b.cls == float_class_zero) {
1814             return float_relation_equal;
1815         }
1816         return b.sign ? float_relation_greater : float_relation_less;
1817     } else if (b.cls == float_class_zero) {
1818         return a.sign ? float_relation_less : float_relation_greater;
1819     }
1820
1821     /* The only really important thing about infinity is its sign. If
1822      * both are infinities the sign marks the smallest of the two.
1823      */
1824     if (a.cls == float_class_inf) {
1825         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1826             return float_relation_equal;
1827         }
1828         return a.sign ? float_relation_less : float_relation_greater;
1829     } else if (b.cls == float_class_inf) {
1830         return b.sign ? float_relation_greater : float_relation_less;
1831     }
1832
1833     if (a.sign != b.sign) {
1834         return a.sign ? float_relation_less : float_relation_greater;
1835     }
1836
1837     if (a.exp == b.exp) {
1838         if (a.frac == b.frac) {
1839             return float_relation_equal;
1840         }
1841         if (a.sign) {
1842             return a.frac > b.frac ?
1843                 float_relation_less : float_relation_greater;
1844         } else {
1845             return a.frac > b.frac ?
1846                 float_relation_greater : float_relation_less;
1847         }
1848     } else {
1849         if (a.sign) {
1850             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1851         } else {
1852             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1853         }
1854     }
1855 }
1856
1857 #define COMPARE(sz)                                                     \
1858 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1859                             float_status *s)                            \
1860 {                                                                       \
1861     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1862     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1863     return compare_floats(pa, pb, false, s);                            \
1864 }                                                                       \
1865 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1866                                   float_status *s)                      \
1867 {                                                                       \
1868     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1869     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1870     return compare_floats(pa, pb, true, s);                             \
1871 }
1872
1873 COMPARE(16)
1874 COMPARE(32)
1875 COMPARE(64)
1876
1877 #undef COMPARE
1878
1879 /* Multiply A by 2 raised to the power N.  */
1880 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1881 {
1882     if (unlikely(is_nan(a.cls))) {
1883         return return_nan(a, s);
1884     }
1885     if (a.cls == float_class_normal) {
1886         /* The largest float type (even though not supported by FloatParts)
1887          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1888          * still allows rounding to infinity, without allowing overflow
1889          * within the int32_t that backs FloatParts.exp.
1890          */
1891         n = MIN(MAX(n, -0x10000), 0x10000);
1892         a.exp += n;
1893     }
1894     return a;
1895 }
1896
1897 float16 float16_scalbn(float16 a, int n, float_status *status)
1898 {
1899     FloatParts pa = float16_unpack_canonical(a, status);
1900     FloatParts pr = scalbn_decomposed(pa, n, status);
1901     return float16_round_pack_canonical(pr, status);
1902 }
1903
1904 float32 float32_scalbn(float32 a, int n, float_status *status)
1905 {
1906     FloatParts pa = float32_unpack_canonical(a, status);
1907     FloatParts pr = scalbn_decomposed(pa, n, status);
1908     return float32_round_pack_canonical(pr, status);
1909 }
1910
1911 float64 float64_scalbn(float64 a, int n, float_status *status)
1912 {
1913     FloatParts pa = float64_unpack_canonical(a, status);
1914     FloatParts pr = scalbn_decomposed(pa, n, status);
1915     return float64_round_pack_canonical(pr, status);
1916 }
1917
1918 /*
1919  * Square Root
1920  *
1921  * The old softfloat code did an approximation step before zeroing in
1922  * on the final result. However for simpleness we just compute the
1923  * square root by iterating down from the implicit bit to enough extra
1924  * bits to ensure we get a correctly rounded result.
1925  *
1926  * This does mean however the calculation is slower than before,
1927  * especially for 64 bit floats.
1928  */
1929
1930 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1931 {
1932     uint64_t a_frac, r_frac, s_frac;
1933     int bit, last_bit;
1934
1935     if (is_nan(a.cls)) {
1936         return return_nan(a, s);
1937     }
1938     if (a.cls == float_class_zero) {
1939         return a;  /* sqrt(+-0) = +-0 */
1940     }
1941     if (a.sign) {
1942         s->float_exception_flags |= float_flag_invalid;
1943         a.cls = float_class_dnan;
1944         return a;
1945     }
1946     if (a.cls == float_class_inf) {
1947         return a;  /* sqrt(+inf) = +inf */
1948     }
1949
1950     assert(a.cls == float_class_normal);
1951
1952     /* We need two overflow bits at the top. Adding room for that is a
1953      * right shift. If the exponent is odd, we can discard the low bit
1954      * by multiplying the fraction by 2; that's a left shift. Combine
1955      * those and we shift right if the exponent is even.
1956      */
1957     a_frac = a.frac;
1958     if (!(a.exp & 1)) {
1959         a_frac >>= 1;
1960     }
1961     a.exp >>= 1;
1962
1963     /* Bit-by-bit computation of sqrt.  */
1964     r_frac = 0;
1965     s_frac = 0;
1966
1967     /* Iterate from implicit bit down to the 3 extra bits to compute a
1968      * properly rounded result. Remember we've inserted one more bit
1969      * at the top, so these positions are one less.
1970      */
1971     bit = DECOMPOSED_BINARY_POINT - 1;
1972     last_bit = MAX(p->frac_shift - 4, 0);
1973     do {
1974         uint64_t q = 1ULL << bit;
1975         uint64_t t_frac = s_frac + q;
1976         if (t_frac <= a_frac) {
1977             s_frac = t_frac + q;
1978             a_frac -= t_frac;
1979             r_frac += q;
1980         }
1981         a_frac <<= 1;
1982     } while (--bit >= last_bit);
1983
1984     /* Undo the right shift done above. If there is any remaining
1985      * fraction, the result is inexact. Set the sticky bit.
1986      */
1987     a.frac = (r_frac << 1) + (a_frac != 0);
1988
1989     return a;
1990 }
1991
1992 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1993 {
1994     FloatParts pa = float16_unpack_canonical(a, status);
1995     FloatParts pr = sqrt_float(pa, status, &float16_params);
1996     return float16_round_pack_canonical(pr, status);
1997 }
1998
1999 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2000 {
2001     FloatParts pa = float32_unpack_canonical(a, status);
2002     FloatParts pr = sqrt_float(pa, status, &float32_params);
2003     return float32_round_pack_canonical(pr, status);
2004 }
2005
2006 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2007 {
2008     FloatParts pa = float64_unpack_canonical(a, status);
2009     FloatParts pr = sqrt_float(pa, status, &float64_params);
2010     return float64_round_pack_canonical(pr, status);
2011 }
2012
2013
2014 /*----------------------------------------------------------------------------
2015 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2016 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2017 | input.  If `zSign' is 1, the input is negated before being converted to an
2018 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2019 | is simply rounded to an integer, with the inexact exception raised if the
2020 | input cannot be represented exactly as an integer.  However, if the fixed-
2021 | point input is too large, the invalid exception is raised and the largest
2022 | positive or negative integer is returned.
2023 *----------------------------------------------------------------------------*/
2024
2025 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2026 {
2027     int8_t roundingMode;
2028     flag roundNearestEven;
2029     int8_t roundIncrement, roundBits;
2030     int32_t z;
2031
2032     roundingMode = status->float_rounding_mode;
2033     roundNearestEven = ( roundingMode == float_round_nearest_even );
2034     switch (roundingMode) {
2035     case float_round_nearest_even:
2036     case float_round_ties_away:
2037         roundIncrement = 0x40;
2038         break;
2039     case float_round_to_zero:
2040         roundIncrement = 0;
2041         break;
2042     case float_round_up:
2043         roundIncrement = zSign ? 0 : 0x7f;
2044         break;
2045     case float_round_down:
2046         roundIncrement = zSign ? 0x7f : 0;
2047         break;
2048     default:
2049         abort();
2050     }
2051     roundBits = absZ & 0x7F;
2052     absZ = ( absZ + roundIncrement )>>7;
2053     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2054     z = absZ;
2055     if ( zSign ) z = - z;
2056     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2057         float_raise(float_flag_invalid, status);
2058         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2059     }
2060     if (roundBits) {
2061         status->float_exception_flags |= float_flag_inexact;
2062     }
2063     return z;
2064
2065 }
2066
2067 /*----------------------------------------------------------------------------
2068 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2069 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2070 | and returns the properly rounded 64-bit integer corresponding to the input.
2071 | If `zSign' is 1, the input is negated before being converted to an integer.
2072 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2073 | the inexact exception raised if the input cannot be represented exactly as
2074 | an integer.  However, if the fixed-point input is too large, the invalid
2075 | exception is raised and the largest positive or negative integer is
2076 | returned.
2077 *----------------------------------------------------------------------------*/
2078
2079 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2080                                float_status *status)
2081 {
2082     int8_t roundingMode;
2083     flag roundNearestEven, increment;
2084     int64_t z;
2085
2086     roundingMode = status->float_rounding_mode;
2087     roundNearestEven = ( roundingMode == float_round_nearest_even );
2088     switch (roundingMode) {
2089     case float_round_nearest_even:
2090     case float_round_ties_away:
2091         increment = ((int64_t) absZ1 < 0);
2092         break;
2093     case float_round_to_zero:
2094         increment = 0;
2095         break;
2096     case float_round_up:
2097         increment = !zSign && absZ1;
2098         break;
2099     case float_round_down:
2100         increment = zSign && absZ1;
2101         break;
2102     default:
2103         abort();
2104     }
2105     if ( increment ) {
2106         ++absZ0;
2107         if ( absZ0 == 0 ) goto overflow;
2108         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2109     }
2110     z = absZ0;
2111     if ( zSign ) z = - z;
2112     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2113  overflow:
2114         float_raise(float_flag_invalid, status);
2115         return
2116               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2117             : LIT64( 0x7FFFFFFFFFFFFFFF );
2118     }
2119     if (absZ1) {
2120         status->float_exception_flags |= float_flag_inexact;
2121     }
2122     return z;
2123
2124 }
2125
2126 /*----------------------------------------------------------------------------
2127 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2128 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2129 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2130 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2131 | with the inexact exception raised if the input cannot be represented exactly
2132 | as an integer.  However, if the fixed-point input is too large, the invalid
2133 | exception is raised and the largest unsigned integer is returned.
2134 *----------------------------------------------------------------------------*/
2135
2136 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2137                                 uint64_t absZ1, float_status *status)
2138 {
2139     int8_t roundingMode;
2140     flag roundNearestEven, increment;
2141
2142     roundingMode = status->float_rounding_mode;
2143     roundNearestEven = (roundingMode == float_round_nearest_even);
2144     switch (roundingMode) {
2145     case float_round_nearest_even:
2146     case float_round_ties_away:
2147         increment = ((int64_t)absZ1 < 0);
2148         break;
2149     case float_round_to_zero:
2150         increment = 0;
2151         break;
2152     case float_round_up:
2153         increment = !zSign && absZ1;
2154         break;
2155     case float_round_down:
2156         increment = zSign && absZ1;
2157         break;
2158     default:
2159         abort();
2160     }
2161     if (increment) {
2162         ++absZ0;
2163         if (absZ0 == 0) {
2164             float_raise(float_flag_invalid, status);
2165             return LIT64(0xFFFFFFFFFFFFFFFF);
2166         }
2167         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2168     }
2169
2170     if (zSign && absZ0) {
2171         float_raise(float_flag_invalid, status);
2172         return 0;
2173     }
2174
2175     if (absZ1) {
2176         status->float_exception_flags |= float_flag_inexact;
2177     }
2178     return absZ0;
2179 }
2180
2181 /*----------------------------------------------------------------------------
2182 | If `a' is denormal and we are in flush-to-zero mode then set the
2183 | input-denormal exception and return zero. Otherwise just return the value.
2184 *----------------------------------------------------------------------------*/
2185 float32 float32_squash_input_denormal(float32 a, float_status *status)
2186 {
2187     if (status->flush_inputs_to_zero) {
2188         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2189             float_raise(float_flag_input_denormal, status);
2190             return make_float32(float32_val(a) & 0x80000000);
2191         }
2192     }
2193     return a;
2194 }
2195
2196 /*----------------------------------------------------------------------------
2197 | Normalizes the subnormal single-precision floating-point value represented
2198 | by the denormalized significand `aSig'.  The normalized exponent and
2199 | significand are stored at the locations pointed to by `zExpPtr' and
2200 | `zSigPtr', respectively.
2201 *----------------------------------------------------------------------------*/
2202
2203 static void
2204  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2205 {
2206     int8_t shiftCount;
2207
2208     shiftCount = countLeadingZeros32( aSig ) - 8;
2209     *zSigPtr = aSig<<shiftCount;
2210     *zExpPtr = 1 - shiftCount;
2211
2212 }
2213
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input.  Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly.  However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned.  If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 |     The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location.  This shifted
2228 | significand must be normalized or smaller.  If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding.  In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2235
2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2237                                    float_status *status)
2238 {
2239     int8_t roundingMode;
2240     flag roundNearestEven;
2241     int8_t roundIncrement, roundBits;
2242     flag isTiny;
2243
2244     roundingMode = status->float_rounding_mode;
2245     roundNearestEven = ( roundingMode == float_round_nearest_even );
2246     switch (roundingMode) {
2247     case float_round_nearest_even:
2248     case float_round_ties_away:
2249         roundIncrement = 0x40;
2250         break;
2251     case float_round_to_zero:
2252         roundIncrement = 0;
2253         break;
2254     case float_round_up:
2255         roundIncrement = zSign ? 0 : 0x7f;
2256         break;
2257     case float_round_down:
2258         roundIncrement = zSign ? 0x7f : 0;
2259         break;
2260     default:
2261         abort();
2262         break;
2263     }
2264     roundBits = zSig & 0x7F;
2265     if ( 0xFD <= (uint16_t) zExp ) {
2266         if (    ( 0xFD < zExp )
2267              || (    ( zExp == 0xFD )
2268                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2269            ) {
2270             float_raise(float_flag_overflow | float_flag_inexact, status);
2271             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2272         }
2273         if ( zExp < 0 ) {
2274             if (status->flush_to_zero) {
2275                 float_raise(float_flag_output_denormal, status);
2276                 return packFloat32(zSign, 0, 0);
2277             }
2278             isTiny =
2279                 (status->float_detect_tininess
2280                  == float_tininess_before_rounding)
2281                 || ( zExp < -1 )
2282                 || ( zSig + roundIncrement < 0x80000000 );
2283             shift32RightJamming( zSig, - zExp, &zSig );
2284             zExp = 0;
2285             roundBits = zSig & 0x7F;
2286             if (isTiny && roundBits) {
2287                 float_raise(float_flag_underflow, status);
2288             }
2289         }
2290     }
2291     if (roundBits) {
2292         status->float_exception_flags |= float_flag_inexact;
2293     }
2294     zSig = ( zSig + roundIncrement )>>7;
2295     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296     if ( zSig == 0 ) zExp = 0;
2297     return packFloat32( zSign, zExp, zSig );
2298
2299 }
2300
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input.  This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2309
2310 static float32
2311  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2312                               float_status *status)
2313 {
2314     int8_t shiftCount;
2315
2316     shiftCount = countLeadingZeros32( zSig ) - 1;
2317     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2318                                status);
2319
2320 }
2321
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64 float64_squash_input_denormal(float64 a, float_status *status)
2327 {
2328     if (status->flush_inputs_to_zero) {
2329         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2330             float_raise(float_flag_input_denormal, status);
2331             return make_float64(float64_val(a) & (1ULL << 63));
2332         }
2333     }
2334     return a;
2335 }
2336
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'.  The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2343
2344 static void
2345  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2346 {
2347     int8_t shiftCount;
2348
2349     shiftCount = countLeadingZeros64( aSig ) - 11;
2350     *zSigPtr = aSig<<shiftCount;
2351     *zExpPtr = 1 - shiftCount;
2352
2353 }
2354
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result.  After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result.  This means that any integer portion of `zSig'
2360 | will be added into the exponent.  Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2363 | significand.
2364 *----------------------------------------------------------------------------*/
2365
2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2367 {
2368
2369     return make_float64(
2370         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2371
2372 }
2373
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input.  Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly.  However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned.  If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 |     The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location.  This shifted
2388 | significand must be normalized or smaller.  If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding.  In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2395
2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2397                                    float_status *status)
2398 {
2399     int8_t roundingMode;
2400     flag roundNearestEven;
2401     int roundIncrement, roundBits;
2402     flag isTiny;
2403
2404     roundingMode = status->float_rounding_mode;
2405     roundNearestEven = ( roundingMode == float_round_nearest_even );
2406     switch (roundingMode) {
2407     case float_round_nearest_even:
2408     case float_round_ties_away:
2409         roundIncrement = 0x200;
2410         break;
2411     case float_round_to_zero:
2412         roundIncrement = 0;
2413         break;
2414     case float_round_up:
2415         roundIncrement = zSign ? 0 : 0x3ff;
2416         break;
2417     case float_round_down:
2418         roundIncrement = zSign ? 0x3ff : 0;
2419         break;
2420     case float_round_to_odd:
2421         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2422         break;
2423     default:
2424         abort();
2425     }
2426     roundBits = zSig & 0x3FF;
2427     if ( 0x7FD <= (uint16_t) zExp ) {
2428         if (    ( 0x7FD < zExp )
2429              || (    ( zExp == 0x7FD )
2430                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2431            ) {
2432             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2433                                    roundIncrement != 0;
2434             float_raise(float_flag_overflow | float_flag_inexact, status);
2435             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2436         }
2437         if ( zExp < 0 ) {
2438             if (status->flush_to_zero) {
2439                 float_raise(float_flag_output_denormal, status);
2440                 return packFloat64(zSign, 0, 0);
2441             }
2442             isTiny =
2443                    (status->float_detect_tininess
2444                     == float_tininess_before_rounding)
2445                 || ( zExp < -1 )
2446                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2447             shift64RightJamming( zSig, - zExp, &zSig );
2448             zExp = 0;
2449             roundBits = zSig & 0x3FF;
2450             if (isTiny && roundBits) {
2451                 float_raise(float_flag_underflow, status);
2452             }
2453             if (roundingMode == float_round_to_odd) {
2454                 /*
2455                  * For round-to-odd case, the roundIncrement depends on
2456                  * zSig which just changed.
2457                  */
2458                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2459             }
2460         }
2461     }
2462     if (roundBits) {
2463         status->float_exception_flags |= float_flag_inexact;
2464     }
2465     zSig = ( zSig + roundIncrement )>>10;
2466     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2467     if ( zSig == 0 ) zExp = 0;
2468     return packFloat64( zSign, zExp, zSig );
2469
2470 }
2471
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input.  This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2480
2481 static float64
2482  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2483                               float_status *status)
2484 {
2485     int8_t shiftCount;
2486
2487     shiftCount = countLeadingZeros64( zSig ) - 1;
2488     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2489                                status);
2490
2491 }
2492
2493 /*----------------------------------------------------------------------------
2494 | Normalizes the subnormal extended double-precision floating-point value
2495 | represented by the denormalized significand `aSig'.  The normalized exponent
2496 | and significand are stored at the locations pointed to by `zExpPtr' and
2497 | `zSigPtr', respectively.
2498 *----------------------------------------------------------------------------*/
2499
2500 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2501                                 uint64_t *zSigPtr)
2502 {
2503     int8_t shiftCount;
2504
2505     shiftCount = countLeadingZeros64( aSig );
2506     *zSigPtr = aSig<<shiftCount;
2507     *zExpPtr = 1 - shiftCount;
2508 }
2509
2510 /*----------------------------------------------------------------------------
2511 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2512 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2513 | and returns the proper extended double-precision floating-point value
2514 | corresponding to the abstract input.  Ordinarily, the abstract value is
2515 | rounded and packed into the extended double-precision format, with the
2516 | inexact exception raised if the abstract input cannot be represented
2517 | exactly.  However, if the abstract value is too large, the overflow and
2518 | inexact exceptions are raised and an infinity or maximal finite value is
2519 | returned.  If the abstract value is too small, the input value is rounded to
2520 | a subnormal number, and the underflow and inexact exceptions are raised if
2521 | the abstract input cannot be represented exactly as a subnormal extended
2522 | double-precision floating-point number.
2523 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2524 | number of bits as single or double precision, respectively.  Otherwise, the
2525 | result is rounded to the full precision of the extended double-precision
2526 | format.
2527 |     The input significand must be normalized or smaller.  If the input
2528 | significand is not normalized, `zExp' must be 0; in that case, the result
2529 | returned is a subnormal number, and it must not require rounding.  The
2530 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2531 | Floating-Point Arithmetic.
2532 *----------------------------------------------------------------------------*/
2533
2534 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2535                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2536                               float_status *status)
2537 {
2538     int8_t roundingMode;
2539     flag roundNearestEven, increment, isTiny;
2540     int64_t roundIncrement, roundMask, roundBits;
2541
2542     roundingMode = status->float_rounding_mode;
2543     roundNearestEven = ( roundingMode == float_round_nearest_even );
2544     if ( roundingPrecision == 80 ) goto precision80;
2545     if ( roundingPrecision == 64 ) {
2546         roundIncrement = LIT64( 0x0000000000000400 );
2547         roundMask = LIT64( 0x00000000000007FF );
2548     }
2549     else if ( roundingPrecision == 32 ) {
2550         roundIncrement = LIT64( 0x0000008000000000 );
2551         roundMask = LIT64( 0x000000FFFFFFFFFF );
2552     }
2553     else {
2554         goto precision80;
2555     }
2556     zSig0 |= ( zSig1 != 0 );
2557     switch (roundingMode) {
2558     case float_round_nearest_even:
2559     case float_round_ties_away:
2560         break;
2561     case float_round_to_zero:
2562         roundIncrement = 0;
2563         break;
2564     case float_round_up:
2565         roundIncrement = zSign ? 0 : roundMask;
2566         break;
2567     case float_round_down:
2568         roundIncrement = zSign ? roundMask : 0;
2569         break;
2570     default:
2571         abort();
2572     }
2573     roundBits = zSig0 & roundMask;
2574     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2575         if (    ( 0x7FFE < zExp )
2576              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2577            ) {
2578             goto overflow;
2579         }
2580         if ( zExp <= 0 ) {
2581             if (status->flush_to_zero) {
2582                 float_raise(float_flag_output_denormal, status);
2583                 return packFloatx80(zSign, 0, 0);
2584             }
2585             isTiny =
2586                    (status->float_detect_tininess
2587                     == float_tininess_before_rounding)
2588                 || ( zExp < 0 )
2589                 || ( zSig0 <= zSig0 + roundIncrement );
2590             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2591             zExp = 0;
2592             roundBits = zSig0 & roundMask;
2593             if (isTiny && roundBits) {
2594                 float_raise(float_flag_underflow, status);
2595             }
2596             if (roundBits) {
2597                 status->float_exception_flags |= float_flag_inexact;
2598             }
2599             zSig0 += roundIncrement;
2600             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2601             roundIncrement = roundMask + 1;
2602             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2603                 roundMask |= roundIncrement;
2604             }
2605             zSig0 &= ~ roundMask;
2606             return packFloatx80( zSign, zExp, zSig0 );
2607         }
2608     }
2609     if (roundBits) {
2610         status->float_exception_flags |= float_flag_inexact;
2611     }
2612     zSig0 += roundIncrement;
2613     if ( zSig0 < roundIncrement ) {
2614         ++zExp;
2615         zSig0 = LIT64( 0x8000000000000000 );
2616     }
2617     roundIncrement = roundMask + 1;
2618     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2619         roundMask |= roundIncrement;
2620     }
2621     zSig0 &= ~ roundMask;
2622     if ( zSig0 == 0 ) zExp = 0;
2623     return packFloatx80( zSign, zExp, zSig0 );
2624  precision80:
2625     switch (roundingMode) {
2626     case float_round_nearest_even:
2627     case float_round_ties_away:
2628         increment = ((int64_t)zSig1 < 0);
2629         break;
2630     case float_round_to_zero:
2631         increment = 0;
2632         break;
2633     case float_round_up:
2634         increment = !zSign && zSig1;
2635         break;
2636     case float_round_down:
2637         increment = zSign && zSig1;
2638         break;
2639     default:
2640         abort();
2641     }
2642     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2643         if (    ( 0x7FFE < zExp )
2644              || (    ( zExp == 0x7FFE )
2645                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2646                   && increment
2647                 )
2648            ) {
2649             roundMask = 0;
2650  overflow:
2651             float_raise(float_flag_overflow | float_flag_inexact, status);
2652             if (    ( roundingMode == float_round_to_zero )
2653                  || ( zSign && ( roundingMode == float_round_up ) )
2654                  || ( ! zSign && ( roundingMode == float_round_down ) )
2655                ) {
2656                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2657             }
2658             return packFloatx80(zSign,
2659                                 floatx80_infinity_high,
2660                                 floatx80_infinity_low);
2661         }
2662         if ( zExp <= 0 ) {
2663             isTiny =
2664                    (status->float_detect_tininess
2665                     == float_tininess_before_rounding)
2666                 || ( zExp < 0 )
2667                 || ! increment
2668                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2669             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2670             zExp = 0;
2671             if (isTiny && zSig1) {
2672                 float_raise(float_flag_underflow, status);
2673             }
2674             if (zSig1) {
2675                 status->float_exception_flags |= float_flag_inexact;
2676             }
2677             switch (roundingMode) {
2678             case float_round_nearest_even:
2679             case float_round_ties_away:
2680                 increment = ((int64_t)zSig1 < 0);
2681                 break;
2682             case float_round_to_zero:
2683                 increment = 0;
2684                 break;
2685             case float_round_up:
2686                 increment = !zSign && zSig1;
2687                 break;
2688             case float_round_down:
2689                 increment = zSign && zSig1;
2690                 break;
2691             default:
2692                 abort();
2693             }
2694             if ( increment ) {
2695                 ++zSig0;
2696                 zSig0 &=
2697                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2698                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2699             }
2700             return packFloatx80( zSign, zExp, zSig0 );
2701         }
2702     }
2703     if (zSig1) {
2704         status->float_exception_flags |= float_flag_inexact;
2705     }
2706     if ( increment ) {
2707         ++zSig0;
2708         if ( zSig0 == 0 ) {
2709             ++zExp;
2710             zSig0 = LIT64( 0x8000000000000000 );
2711         }
2712         else {
2713             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2714         }
2715     }
2716     else {
2717         if ( zSig0 == 0 ) zExp = 0;
2718     }
2719     return packFloatx80( zSign, zExp, zSig0 );
2720
2721 }
2722
2723 /*----------------------------------------------------------------------------
2724 | Takes an abstract floating-point value having sign `zSign', exponent
2725 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2726 | and returns the proper extended double-precision floating-point value
2727 | corresponding to the abstract input.  This routine is just like
2728 | `roundAndPackFloatx80' except that the input significand does not have to be
2729 | normalized.
2730 *----------------------------------------------------------------------------*/
2731
2732 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2733                                        flag zSign, int32_t zExp,
2734                                        uint64_t zSig0, uint64_t zSig1,
2735                                        float_status *status)
2736 {
2737     int8_t shiftCount;
2738
2739     if ( zSig0 == 0 ) {
2740         zSig0 = zSig1;
2741         zSig1 = 0;
2742         zExp -= 64;
2743     }
2744     shiftCount = countLeadingZeros64( zSig0 );
2745     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2746     zExp -= shiftCount;
2747     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2748                                 zSig0, zSig1, status);
2749
2750 }
2751
2752 /*----------------------------------------------------------------------------
2753 | Returns the least-significant 64 fraction bits of the quadruple-precision
2754 | floating-point value `a'.
2755 *----------------------------------------------------------------------------*/
2756
2757 static inline uint64_t extractFloat128Frac1( float128 a )
2758 {
2759
2760     return a.low;
2761
2762 }
2763
2764 /*----------------------------------------------------------------------------
2765 | Returns the most-significant 48 fraction bits of the quadruple-precision
2766 | floating-point value `a'.
2767 *----------------------------------------------------------------------------*/
2768
2769 static inline uint64_t extractFloat128Frac0( float128 a )
2770 {
2771
2772     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2773
2774 }
2775
2776 /*----------------------------------------------------------------------------
2777 | Returns the exponent bits of the quadruple-precision floating-point value
2778 | `a'.
2779 *----------------------------------------------------------------------------*/
2780
2781 static inline int32_t extractFloat128Exp( float128 a )
2782 {
2783
2784     return ( a.high>>48 ) & 0x7FFF;
2785
2786 }
2787
2788 /*----------------------------------------------------------------------------
2789 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2790 *----------------------------------------------------------------------------*/
2791
2792 static inline flag extractFloat128Sign( float128 a )
2793 {
2794
2795     return a.high>>63;
2796
2797 }
2798
2799 /*----------------------------------------------------------------------------
2800 | Normalizes the subnormal quadruple-precision floating-point value
2801 | represented by the denormalized significand formed by the concatenation of
2802 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2803 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2804 | significand are stored at the location pointed to by `zSig0Ptr', and the
2805 | least significant 64 bits of the normalized significand are stored at the
2806 | location pointed to by `zSig1Ptr'.
2807 *----------------------------------------------------------------------------*/
2808
2809 static void
2810  normalizeFloat128Subnormal(
2811      uint64_t aSig0,
2812      uint64_t aSig1,
2813      int32_t *zExpPtr,
2814      uint64_t *zSig0Ptr,
2815      uint64_t *zSig1Ptr
2816  )
2817 {
2818     int8_t shiftCount;
2819
2820     if ( aSig0 == 0 ) {
2821         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2822         if ( shiftCount < 0 ) {
2823             *zSig0Ptr = aSig1>>( - shiftCount );
2824             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2825         }
2826         else {
2827             *zSig0Ptr = aSig1<<shiftCount;
2828             *zSig1Ptr = 0;
2829         }
2830         *zExpPtr = - shiftCount - 63;
2831     }
2832     else {
2833         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2834         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2835         *zExpPtr = 1 - shiftCount;
2836     }
2837
2838 }
2839
2840 /*----------------------------------------------------------------------------
2841 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2842 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2843 | floating-point value, returning the result.  After being shifted into the
2844 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2845 | added together to form the most significant 32 bits of the result.  This
2846 | means that any integer portion of `zSig0' will be added into the exponent.
2847 | Since a properly normalized significand will have an integer portion equal
2848 | to 1, the `zExp' input should be 1 less than the desired result exponent
2849 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2850 | significand.
2851 *----------------------------------------------------------------------------*/
2852
2853 static inline float128
2854  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2855 {
2856     float128 z;
2857
2858     z.low = zSig1;
2859     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2860     return z;
2861
2862 }
2863
2864 /*----------------------------------------------------------------------------
2865 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2866 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2867 | and `zSig2', and returns the proper quadruple-precision floating-point value
2868 | corresponding to the abstract input.  Ordinarily, the abstract value is
2869 | simply rounded and packed into the quadruple-precision format, with the
2870 | inexact exception raised if the abstract input cannot be represented
2871 | exactly.  However, if the abstract value is too large, the overflow and
2872 | inexact exceptions are raised and an infinity or maximal finite value is
2873 | returned.  If the abstract value is too small, the input value is rounded to
2874 | a subnormal number, and the underflow and inexact exceptions are raised if
2875 | the abstract input cannot be represented exactly as a subnormal quadruple-
2876 | precision floating-point number.
2877 |     The input significand must be normalized or smaller.  If the input
2878 | significand is not normalized, `zExp' must be 0; in that case, the result
2879 | returned is a subnormal number, and it must not require rounding.  In the
2880 | usual case that the input significand is normalized, `zExp' must be 1 less
2881 | than the ``true'' floating-point exponent.  The handling of underflow and
2882 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2886                                      uint64_t zSig0, uint64_t zSig1,
2887                                      uint64_t zSig2, float_status *status)
2888 {
2889     int8_t roundingMode;
2890     flag roundNearestEven, increment, isTiny;
2891
2892     roundingMode = status->float_rounding_mode;
2893     roundNearestEven = ( roundingMode == float_round_nearest_even );
2894     switch (roundingMode) {
2895     case float_round_nearest_even:
2896     case float_round_ties_away:
2897         increment = ((int64_t)zSig2 < 0);
2898         break;
2899     case float_round_to_zero:
2900         increment = 0;
2901         break;
2902     case float_round_up:
2903         increment = !zSign && zSig2;
2904         break;
2905     case float_round_down:
2906         increment = zSign && zSig2;
2907         break;
2908     case float_round_to_odd:
2909         increment = !(zSig1 & 0x1) && zSig2;
2910         break;
2911     default:
2912         abort();
2913     }
2914     if ( 0x7FFD <= (uint32_t) zExp ) {
2915         if (    ( 0x7FFD < zExp )
2916              || (    ( zExp == 0x7FFD )
2917                   && eq128(
2918                          LIT64( 0x0001FFFFFFFFFFFF ),
2919                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2920                          zSig0,
2921                          zSig1
2922                      )
2923                   && increment
2924                 )
2925            ) {
2926             float_raise(float_flag_overflow | float_flag_inexact, status);
2927             if (    ( roundingMode == float_round_to_zero )
2928                  || ( zSign && ( roundingMode == float_round_up ) )
2929                  || ( ! zSign && ( roundingMode == float_round_down ) )
2930                  || (roundingMode == float_round_to_odd)
2931                ) {
2932                 return
2933                     packFloat128(
2934                         zSign,
2935                         0x7FFE,
2936                         LIT64( 0x0000FFFFFFFFFFFF ),
2937                         LIT64( 0xFFFFFFFFFFFFFFFF )
2938                     );
2939             }
2940             return packFloat128( zSign, 0x7FFF, 0, 0 );
2941         }
2942         if ( zExp < 0 ) {
2943             if (status->flush_to_zero) {
2944                 float_raise(float_flag_output_denormal, status);
2945                 return packFloat128(zSign, 0, 0, 0);
2946             }
2947             isTiny =
2948                    (status->float_detect_tininess
2949                     == float_tininess_before_rounding)
2950                 || ( zExp < -1 )
2951                 || ! increment
2952                 || lt128(
2953                        zSig0,
2954                        zSig1,
2955                        LIT64( 0x0001FFFFFFFFFFFF ),
2956                        LIT64( 0xFFFFFFFFFFFFFFFF )
2957                    );
2958             shift128ExtraRightJamming(
2959                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2960             zExp = 0;
2961             if (isTiny && zSig2) {
2962                 float_raise(float_flag_underflow, status);
2963             }
2964             switch (roundingMode) {
2965             case float_round_nearest_even:
2966             case float_round_ties_away:
2967                 increment = ((int64_t)zSig2 < 0);
2968                 break;
2969             case float_round_to_zero:
2970                 increment = 0;
2971                 break;
2972             case float_round_up:
2973                 increment = !zSign && zSig2;
2974                 break;
2975             case float_round_down:
2976                 increment = zSign && zSig2;
2977                 break;
2978             case float_round_to_odd:
2979                 increment = !(zSig1 & 0x1) && zSig2;
2980                 break;
2981             default:
2982                 abort();
2983             }
2984         }
2985     }
2986     if (zSig2) {
2987         status->float_exception_flags |= float_flag_inexact;
2988     }
2989     if ( increment ) {
2990         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2991         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2992     }
2993     else {
2994         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2995     }
2996     return packFloat128( zSign, zExp, zSig0, zSig1 );
2997
2998 }
2999
3000 /*----------------------------------------------------------------------------
3001 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3002 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3003 | returns the proper quadruple-precision floating-point value corresponding
3004 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3005 | except that the input significand has fewer bits and does not have to be
3006 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3007 | point exponent.
3008 *----------------------------------------------------------------------------*/
3009
3010 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3011                                               uint64_t zSig0, uint64_t zSig1,
3012                                               float_status *status)
3013 {
3014     int8_t shiftCount;
3015     uint64_t zSig2;
3016
3017     if ( zSig0 == 0 ) {
3018         zSig0 = zSig1;
3019         zSig1 = 0;
3020         zExp -= 64;
3021     }
3022     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3023     if ( 0 <= shiftCount ) {
3024         zSig2 = 0;
3025         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3026     }
3027     else {
3028         shift128ExtraRightJamming(
3029             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3030     }
3031     zExp -= shiftCount;
3032     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3033
3034 }
3035
3036
3037 /*----------------------------------------------------------------------------
3038 | Returns the result of converting the 32-bit two's complement integer `a'
3039 | to the extended double-precision floating-point format.  The conversion
3040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3041 | Arithmetic.
3042 *----------------------------------------------------------------------------*/
3043
3044 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3045 {
3046     flag zSign;
3047     uint32_t absA;
3048     int8_t shiftCount;
3049     uint64_t zSig;
3050
3051     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3052     zSign = ( a < 0 );
3053     absA = zSign ? - a : a;
3054     shiftCount = countLeadingZeros32( absA ) + 32;
3055     zSig = absA;
3056     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3057
3058 }
3059
3060 /*----------------------------------------------------------------------------
3061 | Returns the result of converting the 32-bit two's complement integer `a' to
3062 | the quadruple-precision floating-point format.  The conversion is performed
3063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3064 *----------------------------------------------------------------------------*/
3065
3066 float128 int32_to_float128(int32_t a, float_status *status)
3067 {
3068     flag zSign;
3069     uint32_t absA;
3070     int8_t shiftCount;
3071     uint64_t zSig0;
3072
3073     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3074     zSign = ( a < 0 );
3075     absA = zSign ? - a : a;
3076     shiftCount = countLeadingZeros32( absA ) + 17;
3077     zSig0 = absA;
3078     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3079
3080 }
3081
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the 64-bit two's complement integer `a'
3084 | to the extended double-precision floating-point format.  The conversion
3085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic.
3087 *----------------------------------------------------------------------------*/
3088
3089 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3090 {
3091     flag zSign;
3092     uint64_t absA;
3093     int8_t shiftCount;
3094
3095     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3096     zSign = ( a < 0 );
3097     absA = zSign ? - a : a;
3098     shiftCount = countLeadingZeros64( absA );
3099     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3100
3101 }
3102
3103 /*----------------------------------------------------------------------------
3104 | Returns the result of converting the 64-bit two's complement integer `a' to
3105 | the quadruple-precision floating-point format.  The conversion is performed
3106 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3107 *----------------------------------------------------------------------------*/
3108
3109 float128 int64_to_float128(int64_t a, float_status *status)
3110 {
3111     flag zSign;
3112     uint64_t absA;
3113     int8_t shiftCount;
3114     int32_t zExp;
3115     uint64_t zSig0, zSig1;
3116
3117     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3118     zSign = ( a < 0 );
3119     absA = zSign ? - a : a;
3120     shiftCount = countLeadingZeros64( absA ) + 49;
3121     zExp = 0x406E - shiftCount;
3122     if ( 64 <= shiftCount ) {
3123         zSig1 = 0;
3124         zSig0 = absA;
3125         shiftCount -= 64;
3126     }
3127     else {
3128         zSig1 = absA;
3129         zSig0 = 0;
3130     }
3131     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3132     return packFloat128( zSign, zExp, zSig0, zSig1 );
3133
3134 }
3135
3136 /*----------------------------------------------------------------------------
3137 | Returns the result of converting the 64-bit unsigned integer `a'
3138 | to the quadruple-precision floating-point format.  The conversion is performed
3139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 float128 uint64_to_float128(uint64_t a, float_status *status)
3143 {
3144     if (a == 0) {
3145         return float128_zero;
3146     }
3147     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3148 }
3149
3150
3151
3152
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the single-precision floating-point value
3155 | `a' to the double-precision floating-point format.  The conversion is
3156 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 | Arithmetic.
3158 *----------------------------------------------------------------------------*/
3159
3160 float64 float32_to_float64(float32 a, float_status *status)
3161 {
3162     flag aSign;
3163     int aExp;
3164     uint32_t aSig;
3165     a = float32_squash_input_denormal(a, status);
3166
3167     aSig = extractFloat32Frac( a );
3168     aExp = extractFloat32Exp( a );
3169     aSign = extractFloat32Sign( a );
3170     if ( aExp == 0xFF ) {
3171         if (aSig) {
3172             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3173         }
3174         return packFloat64( aSign, 0x7FF, 0 );
3175     }
3176     if ( aExp == 0 ) {
3177         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3178         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3179         --aExp;
3180     }
3181     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3182
3183 }
3184
3185 /*----------------------------------------------------------------------------
3186 | Returns the result of converting the single-precision floating-point value
3187 | `a' to the extended double-precision floating-point format.  The conversion
3188 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3189 | Arithmetic.
3190 *----------------------------------------------------------------------------*/
3191
3192 floatx80 float32_to_floatx80(float32 a, float_status *status)
3193 {
3194     flag aSign;
3195     int aExp;
3196     uint32_t aSig;
3197
3198     a = float32_squash_input_denormal(a, status);
3199     aSig = extractFloat32Frac( a );
3200     aExp = extractFloat32Exp( a );
3201     aSign = extractFloat32Sign( a );
3202     if ( aExp == 0xFF ) {
3203         if (aSig) {
3204             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3205         }
3206         return packFloatx80(aSign,
3207                             floatx80_infinity_high,
3208                             floatx80_infinity_low);
3209     }
3210     if ( aExp == 0 ) {
3211         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3212         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3213     }
3214     aSig |= 0x00800000;
3215     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3216
3217 }
3218
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of converting the single-precision floating-point value
3221 | `a' to the double-precision floating-point format.  The conversion is
3222 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3223 | Arithmetic.
3224 *----------------------------------------------------------------------------*/
3225
3226 float128 float32_to_float128(float32 a, float_status *status)
3227 {
3228     flag aSign;
3229     int aExp;
3230     uint32_t aSig;
3231
3232     a = float32_squash_input_denormal(a, status);
3233     aSig = extractFloat32Frac( a );
3234     aExp = extractFloat32Exp( a );
3235     aSign = extractFloat32Sign( a );
3236     if ( aExp == 0xFF ) {
3237         if (aSig) {
3238             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3239         }
3240         return packFloat128( aSign, 0x7FFF, 0, 0 );
3241     }
3242     if ( aExp == 0 ) {
3243         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3244         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3245         --aExp;
3246     }
3247     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3248
3249 }
3250
3251 /*----------------------------------------------------------------------------
3252 | Returns the remainder of the single-precision floating-point value `a'
3253 | with respect to the corresponding value `b'.  The operation is performed
3254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3255 *----------------------------------------------------------------------------*/
3256
3257 float32 float32_rem(float32 a, float32 b, float_status *status)
3258 {
3259     flag aSign, zSign;
3260     int aExp, bExp, expDiff;
3261     uint32_t aSig, bSig;
3262     uint32_t q;
3263     uint64_t aSig64, bSig64, q64;
3264     uint32_t alternateASig;
3265     int32_t sigMean;
3266     a = float32_squash_input_denormal(a, status);
3267     b = float32_squash_input_denormal(b, status);
3268
3269     aSig = extractFloat32Frac( a );
3270     aExp = extractFloat32Exp( a );
3271     aSign = extractFloat32Sign( a );
3272     bSig = extractFloat32Frac( b );
3273     bExp = extractFloat32Exp( b );
3274     if ( aExp == 0xFF ) {
3275         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3276             return propagateFloat32NaN(a, b, status);
3277         }
3278         float_raise(float_flag_invalid, status);
3279         return float32_default_nan(status);
3280     }
3281     if ( bExp == 0xFF ) {
3282         if (bSig) {
3283             return propagateFloat32NaN(a, b, status);
3284         }
3285         return a;
3286     }
3287     if ( bExp == 0 ) {
3288         if ( bSig == 0 ) {
3289             float_raise(float_flag_invalid, status);
3290             return float32_default_nan(status);
3291         }
3292         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3293     }
3294     if ( aExp == 0 ) {
3295         if ( aSig == 0 ) return a;
3296         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3297     }
3298     expDiff = aExp - bExp;
3299     aSig |= 0x00800000;
3300     bSig |= 0x00800000;
3301     if ( expDiff < 32 ) {
3302         aSig <<= 8;
3303         bSig <<= 8;
3304         if ( expDiff < 0 ) {
3305             if ( expDiff < -1 ) return a;
3306             aSig >>= 1;
3307         }
3308         q = ( bSig <= aSig );
3309         if ( q ) aSig -= bSig;
3310         if ( 0 < expDiff ) {
3311             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3312             q >>= 32 - expDiff;
3313             bSig >>= 2;
3314             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3315         }
3316         else {
3317             aSig >>= 2;
3318             bSig >>= 2;
3319         }
3320     }
3321     else {
3322         if ( bSig <= aSig ) aSig -= bSig;
3323         aSig64 = ( (uint64_t) aSig )<<40;
3324         bSig64 = ( (uint64_t) bSig )<<40;
3325         expDiff -= 64;
3326         while ( 0 < expDiff ) {
3327             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3328             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3329             aSig64 = - ( ( bSig * q64 )<<38 );
3330             expDiff -= 62;
3331         }
3332         expDiff += 64;
3333         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3334         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3335         q = q64>>( 64 - expDiff );
3336         bSig <<= 6;
3337         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3338     }
3339     do {
3340         alternateASig = aSig;
3341         ++q;
3342         aSig -= bSig;
3343     } while ( 0 <= (int32_t) aSig );
3344     sigMean = aSig + alternateASig;
3345     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3346         aSig = alternateASig;
3347     }
3348     zSign = ( (int32_t) aSig < 0 );
3349     if ( zSign ) aSig = - aSig;
3350     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3351 }
3352
3353
3354
3355 /*----------------------------------------------------------------------------
3356 | Returns the binary exponential of the single-precision floating-point value
3357 | `a'. The operation is performed according to the IEC/IEEE Standard for
3358 | Binary Floating-Point Arithmetic.
3359 |
3360 | Uses the following identities:
3361 |
3362 | 1. -------------------------------------------------------------------------
3363 |      x    x*ln(2)
3364 |     2  = e
3365 |
3366 | 2. -------------------------------------------------------------------------
3367 |                      2     3     4     5           n
3368 |      x        x     x     x     x     x           x
3369 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3370 |               1!    2!    3!    4!    5!          n!
3371 *----------------------------------------------------------------------------*/
3372
3373 static const float64 float32_exp2_coefficients[15] =
3374 {
3375     const_float64( 0x3ff0000000000000ll ), /*  1 */
3376     const_float64( 0x3fe0000000000000ll ), /*  2 */
3377     const_float64( 0x3fc5555555555555ll ), /*  3 */
3378     const_float64( 0x3fa5555555555555ll ), /*  4 */
3379     const_float64( 0x3f81111111111111ll ), /*  5 */
3380     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3381     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3382     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3383     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3384     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3385     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3386     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3387     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3388     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3389     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3390 };
3391
3392 float32 float32_exp2(float32 a, float_status *status)
3393 {
3394     flag aSign;
3395     int aExp;
3396     uint32_t aSig;
3397     float64 r, x, xn;
3398     int i;
3399     a = float32_squash_input_denormal(a, status);
3400
3401     aSig = extractFloat32Frac( a );
3402     aExp = extractFloat32Exp( a );
3403     aSign = extractFloat32Sign( a );
3404
3405     if ( aExp == 0xFF) {
3406         if (aSig) {
3407             return propagateFloat32NaN(a, float32_zero, status);
3408         }
3409         return (aSign) ? float32_zero : a;
3410     }
3411     if (aExp == 0) {
3412         if (aSig == 0) return float32_one;
3413     }
3414
3415     float_raise(float_flag_inexact, status);
3416
3417     /* ******************************* */
3418     /* using float64 for approximation */
3419     /* ******************************* */
3420     x = float32_to_float64(a, status);
3421     x = float64_mul(x, float64_ln2, status);
3422
3423     xn = x;
3424     r = float64_one;
3425     for (i = 0 ; i < 15 ; i++) {
3426         float64 f;
3427
3428         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3429         r = float64_add(r, f, status);
3430
3431         xn = float64_mul(xn, x, status);
3432     }
3433
3434     return float64_to_float32(r, status);
3435 }
3436
3437 /*----------------------------------------------------------------------------
3438 | Returns the binary log of the single-precision floating-point value `a'.
3439 | The operation is performed according to the IEC/IEEE Standard for Binary
3440 | Floating-Point Arithmetic.
3441 *----------------------------------------------------------------------------*/
3442 float32 float32_log2(float32 a, float_status *status)
3443 {
3444     flag aSign, zSign;
3445     int aExp;
3446     uint32_t aSig, zSig, i;
3447
3448     a = float32_squash_input_denormal(a, status);
3449     aSig = extractFloat32Frac( a );
3450     aExp = extractFloat32Exp( a );
3451     aSign = extractFloat32Sign( a );
3452
3453     if ( aExp == 0 ) {
3454         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3455         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3456     }
3457     if ( aSign ) {
3458         float_raise(float_flag_invalid, status);
3459         return float32_default_nan(status);
3460     }
3461     if ( aExp == 0xFF ) {
3462         if (aSig) {
3463             return propagateFloat32NaN(a, float32_zero, status);
3464         }
3465         return a;
3466     }
3467
3468     aExp -= 0x7F;
3469     aSig |= 0x00800000;
3470     zSign = aExp < 0;
3471     zSig = aExp << 23;
3472
3473     for (i = 1 << 22; i > 0; i >>= 1) {
3474         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3475         if ( aSig & 0x01000000 ) {
3476             aSig >>= 1;
3477             zSig |= i;
3478         }
3479     }
3480
3481     if ( zSign )
3482         zSig = -zSig;
3483
3484     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3485 }
3486
3487 /*----------------------------------------------------------------------------
3488 | Returns 1 if the single-precision floating-point value `a' is equal to
3489 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3490 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3491 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3492 *----------------------------------------------------------------------------*/
3493
3494 int float32_eq(float32 a, float32 b, float_status *status)
3495 {
3496     uint32_t av, bv;
3497     a = float32_squash_input_denormal(a, status);
3498     b = float32_squash_input_denormal(b, status);
3499
3500     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3501          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3502        ) {
3503         float_raise(float_flag_invalid, status);
3504         return 0;
3505     }
3506     av = float32_val(a);
3507     bv = float32_val(b);
3508     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3509 }
3510
3511 /*----------------------------------------------------------------------------
3512 | Returns 1 if the single-precision floating-point value `a' is less than
3513 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3514 | exception is raised if either operand is a NaN.  The comparison is performed
3515 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3516 *----------------------------------------------------------------------------*/
3517
3518 int float32_le(float32 a, float32 b, float_status *status)
3519 {
3520     flag aSign, bSign;
3521     uint32_t av, bv;
3522     a = float32_squash_input_denormal(a, status);
3523     b = float32_squash_input_denormal(b, status);
3524
3525     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3526          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3527        ) {
3528         float_raise(float_flag_invalid, status);
3529         return 0;
3530     }
3531     aSign = extractFloat32Sign( a );
3532     bSign = extractFloat32Sign( b );
3533     av = float32_val(a);
3534     bv = float32_val(b);
3535     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3536     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3537
3538 }
3539
3540 /*----------------------------------------------------------------------------
3541 | Returns 1 if the single-precision floating-point value `a' is less than
3542 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3543 | raised if either operand is a NaN.  The comparison is performed according
3544 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3545 *----------------------------------------------------------------------------*/
3546
3547 int float32_lt(float32 a, float32 b, float_status *status)
3548 {
3549     flag aSign, bSign;
3550     uint32_t av, bv;
3551     a = float32_squash_input_denormal(a, status);
3552     b = float32_squash_input_denormal(b, status);
3553
3554     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3555          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3556        ) {
3557         float_raise(float_flag_invalid, status);
3558         return 0;
3559     }
3560     aSign = extractFloat32Sign( a );
3561     bSign = extractFloat32Sign( b );
3562     av = float32_val(a);
3563     bv = float32_val(b);
3564     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3565     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3566
3567 }
3568
3569 /*----------------------------------------------------------------------------
3570 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3571 | be compared, and 0 otherwise.  The invalid exception is raised if either
3572 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3573 | Standard for Binary Floating-Point Arithmetic.
3574 *----------------------------------------------------------------------------*/
3575
3576 int float32_unordered(float32 a, float32 b, float_status *status)
3577 {
3578     a = float32_squash_input_denormal(a, status);
3579     b = float32_squash_input_denormal(b, status);
3580
3581     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3582          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3583        ) {
3584         float_raise(float_flag_invalid, status);
3585         return 1;
3586     }
3587     return 0;
3588 }
3589
3590 /*----------------------------------------------------------------------------
3591 | Returns 1 if the single-precision floating-point value `a' is equal to
3592 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3593 | exception.  The comparison is performed according to the IEC/IEEE Standard
3594 | for Binary Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3596
3597 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3598 {
3599     a = float32_squash_input_denormal(a, status);
3600     b = float32_squash_input_denormal(b, status);
3601
3602     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3603          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3604        ) {
3605         if (float32_is_signaling_nan(a, status)
3606          || float32_is_signaling_nan(b, status)) {
3607             float_raise(float_flag_invalid, status);
3608         }
3609         return 0;
3610     }
3611     return ( float32_val(a) == float32_val(b) ) ||
3612             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the single-precision floating-point value `a' is less than or
3617 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3618 | cause an exception.  Otherwise, the comparison is performed according to the
3619 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3621
3622 int float32_le_quiet(float32 a, float32 b, float_status *status)
3623 {
3624     flag aSign, bSign;
3625     uint32_t av, bv;
3626     a = float32_squash_input_denormal(a, status);
3627     b = float32_squash_input_denormal(b, status);
3628
3629     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3630          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3631        ) {
3632         if (float32_is_signaling_nan(a, status)
3633          || float32_is_signaling_nan(b, status)) {
3634             float_raise(float_flag_invalid, status);
3635         }
3636         return 0;
3637     }
3638     aSign = extractFloat32Sign( a );
3639     bSign = extractFloat32Sign( b );
3640     av = float32_val(a);
3641     bv = float32_val(b);
3642     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3643     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3644
3645 }
3646
3647 /*----------------------------------------------------------------------------
3648 | Returns 1 if the single-precision floating-point value `a' is less than
3649 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3650 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3651 | Standard for Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653
3654 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3655 {
3656     flag aSign, bSign;
3657     uint32_t av, bv;
3658     a = float32_squash_input_denormal(a, status);
3659     b = float32_squash_input_denormal(b, status);
3660
3661     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3662          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3663        ) {
3664         if (float32_is_signaling_nan(a, status)
3665          || float32_is_signaling_nan(b, status)) {
3666             float_raise(float_flag_invalid, status);
3667         }
3668         return 0;
3669     }
3670     aSign = extractFloat32Sign( a );
3671     bSign = extractFloat32Sign( b );
3672     av = float32_val(a);
3673     bv = float32_val(b);
3674     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3675     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3676
3677 }
3678
3679 /*----------------------------------------------------------------------------
3680 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3681 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3682 | comparison is performed according to the IEC/IEEE Standard for Binary
3683 | Floating-Point Arithmetic.
3684 *----------------------------------------------------------------------------*/
3685
3686 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3687 {
3688     a = float32_squash_input_denormal(a, status);
3689     b = float32_squash_input_denormal(b, status);
3690
3691     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3692          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3693        ) {
3694         if (float32_is_signaling_nan(a, status)
3695          || float32_is_signaling_nan(b, status)) {
3696             float_raise(float_flag_invalid, status);
3697         }
3698         return 1;
3699     }
3700     return 0;
3701 }
3702
3703
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the double-precision floating-point value
3706 | `a' to the single-precision floating-point format.  The conversion is
3707 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3708 | Arithmetic.
3709 *----------------------------------------------------------------------------*/
3710
3711 float32 float64_to_float32(float64 a, float_status *status)
3712 {
3713     flag aSign;
3714     int aExp;
3715     uint64_t aSig;
3716     uint32_t zSig;
3717     a = float64_squash_input_denormal(a, status);
3718
3719     aSig = extractFloat64Frac( a );
3720     aExp = extractFloat64Exp( a );
3721     aSign = extractFloat64Sign( a );
3722     if ( aExp == 0x7FF ) {
3723         if (aSig) {
3724             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3725         }
3726         return packFloat32( aSign, 0xFF, 0 );
3727     }
3728     shift64RightJamming( aSig, 22, &aSig );
3729     zSig = aSig;
3730     if ( aExp || zSig ) {
3731         zSig |= 0x40000000;
3732         aExp -= 0x381;
3733     }
3734     return roundAndPackFloat32(aSign, aExp, zSig, status);
3735
3736 }
3737
3738
3739 /*----------------------------------------------------------------------------
3740 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3741 | half-precision floating-point value, returning the result.  After being
3742 | shifted into the proper positions, the three fields are simply added
3743 | together to form the result.  This means that any integer portion of `zSig'
3744 | will be added into the exponent.  Since a properly normalized significand
3745 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3746 | than the desired result exponent whenever `zSig' is a complete, normalized
3747 | significand.
3748 *----------------------------------------------------------------------------*/
3749 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3750 {
3751     return make_float16(
3752         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3753 }
3754
3755 /*----------------------------------------------------------------------------
3756 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3757 | and significand `zSig', and returns the proper half-precision floating-
3758 | point value corresponding to the abstract input.  Ordinarily, the abstract
3759 | value is simply rounded and packed into the half-precision format, with
3760 | the inexact exception raised if the abstract input cannot be represented
3761 | exactly.  However, if the abstract value is too large, the overflow and
3762 | inexact exceptions are raised and an infinity or maximal finite value is
3763 | returned.  If the abstract value is too small, the input value is rounded to
3764 | a subnormal number, and the underflow and inexact exceptions are raised if
3765 | the abstract input cannot be represented exactly as a subnormal half-
3766 | precision floating-point number.
3767 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3768 | ARM-style "alternative representation", which omits the NaN and Inf
3769 | encodings in order to raise the maximum representable exponent by one.
3770 |     The input significand `zSig' has its binary point between bits 22
3771 | and 23, which is 13 bits to the left of the usual location.  This shifted
3772 | significand must be normalized or smaller.  If `zSig' is not normalized,
3773 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3774 | and it must not require rounding.  In the usual case that `zSig' is
3775 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3776 | Note the slightly odd position of the binary point in zSig compared with the
3777 | other roundAndPackFloat functions. This should probably be fixed if we
3778 | need to implement more float16 routines than just conversion.
3779 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3780 | Binary Floating-Point Arithmetic.
3781 *----------------------------------------------------------------------------*/
3782
3783 static float16 roundAndPackFloat16(flag zSign, int zExp,
3784                                    uint32_t zSig, flag ieee,
3785                                    float_status *status)
3786 {
3787     int maxexp = ieee ? 29 : 30;
3788     uint32_t mask;
3789     uint32_t increment;
3790     bool rounding_bumps_exp;
3791     bool is_tiny = false;
3792
3793     /* Calculate the mask of bits of the mantissa which are not
3794      * representable in half-precision and will be lost.
3795      */
3796     if (zExp < 1) {
3797         /* Will be denormal in halfprec */
3798         mask = 0x00ffffff;
3799         if (zExp >= -11) {
3800             mask >>= 11 + zExp;
3801         }
3802     } else {
3803         /* Normal number in halfprec */
3804         mask = 0x00001fff;
3805     }
3806
3807     switch (status->float_rounding_mode) {
3808     case float_round_nearest_even:
3809         increment = (mask + 1) >> 1;
3810         if ((zSig & mask) == increment) {
3811             increment = zSig & (increment << 1);
3812         }
3813         break;
3814     case float_round_ties_away:
3815         increment = (mask + 1) >> 1;
3816         break;
3817     case float_round_up:
3818         increment = zSign ? 0 : mask;
3819         break;
3820     case float_round_down:
3821         increment = zSign ? mask : 0;
3822         break;
3823     default: /* round_to_zero */
3824         increment = 0;
3825         break;
3826     }
3827
3828     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3829
3830     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3831         if (ieee) {
3832             float_raise(float_flag_overflow | float_flag_inexact, status);
3833             return packFloat16(zSign, 0x1f, 0);
3834         } else {
3835             float_raise(float_flag_invalid, status);
3836             return packFloat16(zSign, 0x1f, 0x3ff);
3837         }
3838     }
3839
3840     if (zExp < 0) {
3841         /* Note that flush-to-zero does not affect half-precision results */
3842         is_tiny =
3843             (status->float_detect_tininess == float_tininess_before_rounding)
3844             || (zExp < -1)
3845             || (!rounding_bumps_exp);
3846     }
3847     if (zSig & mask) {
3848         float_raise(float_flag_inexact, status);
3849         if (is_tiny) {
3850             float_raise(float_flag_underflow, status);
3851         }
3852     }
3853
3854     zSig += increment;
3855     if (rounding_bumps_exp) {
3856         zSig >>= 1;
3857         zExp++;
3858     }
3859
3860     if (zExp < -10) {
3861         return packFloat16(zSign, 0, 0);
3862     }
3863     if (zExp < 0) {
3864         zSig >>= -zExp;
3865         zExp = 0;
3866     }
3867     return packFloat16(zSign, zExp, zSig >> 13);
3868 }
3869
3870 /*----------------------------------------------------------------------------
3871 | If `a' is denormal and we are in flush-to-zero mode then set the
3872 | input-denormal exception and return zero. Otherwise just return the value.
3873 *----------------------------------------------------------------------------*/
3874 float16 float16_squash_input_denormal(float16 a, float_status *status)
3875 {
3876     if (status->flush_inputs_to_zero) {
3877         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3878             float_raise(float_flag_input_denormal, status);
3879             return make_float16(float16_val(a) & 0x8000);
3880         }
3881     }
3882     return a;
3883 }
3884
3885 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3886                                       uint32_t *zSigPtr)
3887 {
3888     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3889     *zSigPtr = aSig << shiftCount;
3890     *zExpPtr = 1 - shiftCount;
3891 }
3892
3893 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3894    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3895
3896 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3897 {
3898     flag aSign;
3899     int aExp;
3900     uint32_t aSig;
3901
3902     aSign = extractFloat16Sign(a);
3903     aExp = extractFloat16Exp(a);
3904     aSig = extractFloat16Frac(a);
3905
3906     if (aExp == 0x1f && ieee) {
3907         if (aSig) {
3908             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3909         }
3910         return packFloat32(aSign, 0xff, 0);
3911     }
3912     if (aExp == 0) {
3913         if (aSig == 0) {
3914             return packFloat32(aSign, 0, 0);
3915         }
3916
3917         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3918         aExp--;
3919     }
3920     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3921 }
3922
3923 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3924 {
3925     flag aSign;
3926     int aExp;
3927     uint32_t aSig;
3928
3929     a = float32_squash_input_denormal(a, status);
3930
3931     aSig = extractFloat32Frac( a );
3932     aExp = extractFloat32Exp( a );
3933     aSign = extractFloat32Sign( a );
3934     if ( aExp == 0xFF ) {
3935         if (aSig) {
3936             /* Input is a NaN */
3937             if (!ieee) {
3938                 float_raise(float_flag_invalid, status);
3939                 return packFloat16(aSign, 0, 0);
3940             }
3941             return commonNaNToFloat16(
3942                 float32ToCommonNaN(a, status), status);
3943         }
3944         /* Infinity */
3945         if (!ieee) {
3946             float_raise(float_flag_invalid, status);
3947             return packFloat16(aSign, 0x1f, 0x3ff);
3948         }
3949         return packFloat16(aSign, 0x1f, 0);
3950     }
3951     if (aExp == 0 && aSig == 0) {
3952         return packFloat16(aSign, 0, 0);
3953     }
3954     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3955      * even if the input is denormal; however this is harmless because
3956      * the largest possible single-precision denormal is still smaller
3957      * than the smallest representable half-precision denormal, and so we
3958      * will end up ignoring aSig and returning via the "always return zero"
3959      * codepath.
3960      */
3961     aSig |= 0x00800000;
3962     aExp -= 0x71;
3963
3964     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3965 }
3966
3967 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3968 {
3969     flag aSign;
3970     int aExp;
3971     uint32_t aSig;
3972
3973     aSign = extractFloat16Sign(a);
3974     aExp = extractFloat16Exp(a);
3975     aSig = extractFloat16Frac(a);
3976
3977     if (aExp == 0x1f && ieee) {
3978         if (aSig) {
3979             return commonNaNToFloat64(
3980                 float16ToCommonNaN(a, status), status);
3981         }
3982         return packFloat64(aSign, 0x7ff, 0);
3983     }
3984     if (aExp == 0) {
3985         if (aSig == 0) {
3986             return packFloat64(aSign, 0, 0);
3987         }
3988
3989         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3990         aExp--;
3991     }
3992     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3993 }
3994
3995 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3996 {
3997     flag aSign;
3998     int aExp;
3999     uint64_t aSig;
4000     uint32_t zSig;
4001
4002     a = float64_squash_input_denormal(a, status);
4003
4004     aSig = extractFloat64Frac(a);
4005     aExp = extractFloat64Exp(a);
4006     aSign = extractFloat64Sign(a);
4007     if (aExp == 0x7FF) {
4008         if (aSig) {
4009             /* Input is a NaN */
4010             if (!ieee) {
4011                 float_raise(float_flag_invalid, status);
4012                 return packFloat16(aSign, 0, 0);
4013             }
4014             return commonNaNToFloat16(
4015                 float64ToCommonNaN(a, status), status);
4016         }
4017         /* Infinity */
4018         if (!ieee) {
4019             float_raise(float_flag_invalid, status);
4020             return packFloat16(aSign, 0x1f, 0x3ff);
4021         }
4022         return packFloat16(aSign, 0x1f, 0);
4023     }
4024     shift64RightJamming(aSig, 29, &aSig);
4025     zSig = aSig;
4026     if (aExp == 0 && zSig == 0) {
4027         return packFloat16(aSign, 0, 0);
4028     }
4029     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4030      * even if the input is denormal; however this is harmless because
4031      * the largest possible single-precision denormal is still smaller
4032      * than the smallest representable half-precision denormal, and so we
4033      * will end up ignoring aSig and returning via the "always return zero"
4034      * codepath.
4035      */
4036     zSig |= 0x00800000;
4037     aExp -= 0x3F1;
4038
4039     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4040 }
4041
4042 /*----------------------------------------------------------------------------
4043 | Returns the result of converting the double-precision floating-point value
4044 | `a' to the extended double-precision floating-point format.  The conversion
4045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4046 | Arithmetic.
4047 *----------------------------------------------------------------------------*/
4048
4049 floatx80 float64_to_floatx80(float64 a, float_status *status)
4050 {
4051     flag aSign;
4052     int aExp;
4053     uint64_t aSig;
4054
4055     a = float64_squash_input_denormal(a, status);
4056     aSig = extractFloat64Frac( a );
4057     aExp = extractFloat64Exp( a );
4058     aSign = extractFloat64Sign( a );
4059     if ( aExp == 0x7FF ) {
4060         if (aSig) {
4061             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4062         }
4063         return packFloatx80(aSign,
4064                             floatx80_infinity_high,
4065                             floatx80_infinity_low);
4066     }
4067     if ( aExp == 0 ) {
4068         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4069         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4070     }
4071     return
4072         packFloatx80(
4073             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4074
4075 }
4076
4077 /*----------------------------------------------------------------------------
4078 | Returns the result of converting the double-precision floating-point value
4079 | `a' to the quadruple-precision floating-point format.  The conversion is
4080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4081 | Arithmetic.
4082 *----------------------------------------------------------------------------*/
4083
4084 float128 float64_to_float128(float64 a, float_status *status)
4085 {
4086     flag aSign;
4087     int aExp;
4088     uint64_t aSig, zSig0, zSig1;
4089
4090     a = float64_squash_input_denormal(a, status);
4091     aSig = extractFloat64Frac( a );
4092     aExp = extractFloat64Exp( a );
4093     aSign = extractFloat64Sign( a );
4094     if ( aExp == 0x7FF ) {
4095         if (aSig) {
4096             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4097         }
4098         return packFloat128( aSign, 0x7FFF, 0, 0 );
4099     }
4100     if ( aExp == 0 ) {
4101         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4102         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4103         --aExp;
4104     }
4105     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4106     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4107
4108 }
4109
4110
4111 /*----------------------------------------------------------------------------
4112 | Returns the remainder of the double-precision floating-point value `a'
4113 | with respect to the corresponding value `b'.  The operation is performed
4114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4116
4117 float64 float64_rem(float64 a, float64 b, float_status *status)
4118 {
4119     flag aSign, zSign;
4120     int aExp, bExp, expDiff;
4121     uint64_t aSig, bSig;
4122     uint64_t q, alternateASig;
4123     int64_t sigMean;
4124
4125     a = float64_squash_input_denormal(a, status);
4126     b = float64_squash_input_denormal(b, status);
4127     aSig = extractFloat64Frac( a );
4128     aExp = extractFloat64Exp( a );
4129     aSign = extractFloat64Sign( a );
4130     bSig = extractFloat64Frac( b );
4131     bExp = extractFloat64Exp( b );
4132     if ( aExp == 0x7FF ) {
4133         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4134             return propagateFloat64NaN(a, b, status);
4135         }
4136         float_raise(float_flag_invalid, status);
4137         return float64_default_nan(status);
4138     }
4139     if ( bExp == 0x7FF ) {
4140         if (bSig) {
4141             return propagateFloat64NaN(a, b, status);
4142         }
4143         return a;
4144     }
4145     if ( bExp == 0 ) {
4146         if ( bSig == 0 ) {
4147             float_raise(float_flag_invalid, status);
4148             return float64_default_nan(status);
4149         }
4150         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4151     }
4152     if ( aExp == 0 ) {
4153         if ( aSig == 0 ) return a;
4154         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4155     }
4156     expDiff = aExp - bExp;
4157     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4158     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4159     if ( expDiff < 0 ) {
4160         if ( expDiff < -1 ) return a;
4161         aSig >>= 1;
4162     }
4163     q = ( bSig <= aSig );
4164     if ( q ) aSig -= bSig;
4165     expDiff -= 64;
4166     while ( 0 < expDiff ) {
4167         q = estimateDiv128To64( aSig, 0, bSig );
4168         q = ( 2 < q ) ? q - 2 : 0;
4169         aSig = - ( ( bSig>>2 ) * q );
4170         expDiff -= 62;
4171     }
4172     expDiff += 64;
4173     if ( 0 < expDiff ) {
4174         q = estimateDiv128To64( aSig, 0, bSig );
4175         q = ( 2 < q ) ? q - 2 : 0;
4176         q >>= 64 - expDiff;
4177         bSig >>= 2;
4178         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4179     }
4180     else {
4181         aSig >>= 2;
4182         bSig >>= 2;
4183     }
4184     do {
4185         alternateASig = aSig;
4186         ++q;
4187         aSig -= bSig;
4188     } while ( 0 <= (int64_t) aSig );
4189     sigMean = aSig + alternateASig;
4190     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4191         aSig = alternateASig;
4192     }
4193     zSign = ( (int64_t) aSig < 0 );
4194     if ( zSign ) aSig = - aSig;
4195     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4196
4197 }
4198
4199 /*----------------------------------------------------------------------------
4200 | Returns the binary log of the double-precision floating-point value `a'.
4201 | The operation is performed according to the IEC/IEEE Standard for Binary
4202 | Floating-Point Arithmetic.
4203 *----------------------------------------------------------------------------*/
4204 float64 float64_log2(float64 a, float_status *status)
4205 {
4206     flag aSign, zSign;
4207     int aExp;
4208     uint64_t aSig, aSig0, aSig1, zSig, i;
4209     a = float64_squash_input_denormal(a, status);
4210
4211     aSig = extractFloat64Frac( a );
4212     aExp = extractFloat64Exp( a );
4213     aSign = extractFloat64Sign( a );
4214
4215     if ( aExp == 0 ) {
4216         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4217         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4218     }
4219     if ( aSign ) {
4220         float_raise(float_flag_invalid, status);
4221         return float64_default_nan(status);
4222     }
4223     if ( aExp == 0x7FF ) {
4224         if (aSig) {
4225             return propagateFloat64NaN(a, float64_zero, status);
4226         }
4227         return a;
4228     }
4229
4230     aExp -= 0x3FF;
4231     aSig |= LIT64( 0x0010000000000000 );
4232     zSign = aExp < 0;
4233     zSig = (uint64_t)aExp << 52;
4234     for (i = 1LL << 51; i > 0; i >>= 1) {
4235         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4236         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4237         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4238             aSig >>= 1;
4239             zSig |= i;
4240         }
4241     }
4242
4243     if ( zSign )
4244         zSig = -zSig;
4245     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4246 }
4247
4248 /*----------------------------------------------------------------------------
4249 | Returns 1 if the double-precision floating-point value `a' is equal to the
4250 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4251 | if either operand is a NaN.  Otherwise, the comparison is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4254
4255 int float64_eq(float64 a, float64 b, float_status *status)
4256 {
4257     uint64_t av, bv;
4258     a = float64_squash_input_denormal(a, status);
4259     b = float64_squash_input_denormal(b, status);
4260
4261     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4262          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4263        ) {
4264         float_raise(float_flag_invalid, status);
4265         return 0;
4266     }
4267     av = float64_val(a);
4268     bv = float64_val(b);
4269     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4270
4271 }
4272
4273 /*----------------------------------------------------------------------------
4274 | Returns 1 if the double-precision floating-point value `a' is less than or
4275 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4276 | exception is raised if either operand is a NaN.  The comparison is performed
4277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4279
4280 int float64_le(float64 a, float64 b, float_status *status)
4281 {
4282     flag aSign, bSign;
4283     uint64_t av, bv;
4284     a = float64_squash_input_denormal(a, status);
4285     b = float64_squash_input_denormal(b, status);
4286
4287     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4288          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4289        ) {
4290         float_raise(float_flag_invalid, status);
4291         return 0;
4292     }
4293     aSign = extractFloat64Sign( a );
4294     bSign = extractFloat64Sign( b );
4295     av = float64_val(a);
4296     bv = float64_val(b);
4297     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4298     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4299
4300 }
4301
4302 /*----------------------------------------------------------------------------
4303 | Returns 1 if the double-precision floating-point value `a' is less than
4304 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4305 | raised if either operand is a NaN.  The comparison is performed according
4306 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4307 *----------------------------------------------------------------------------*/
4308
4309 int float64_lt(float64 a, float64 b, float_status *status)
4310 {
4311     flag aSign, bSign;
4312     uint64_t av, bv;
4313
4314     a = float64_squash_input_denormal(a, status);
4315     b = float64_squash_input_denormal(b, status);
4316     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4317          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4318        ) {
4319         float_raise(float_flag_invalid, status);
4320         return 0;
4321     }
4322     aSign = extractFloat64Sign( a );
4323     bSign = extractFloat64Sign( b );
4324     av = float64_val(a);
4325     bv = float64_val(b);
4326     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4327     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4328
4329 }
4330
4331 /*----------------------------------------------------------------------------
4332 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4333 | be compared, and 0 otherwise.  The invalid exception is raised if either
4334 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4335 | Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4337
4338 int float64_unordered(float64 a, float64 b, float_status *status)
4339 {
4340     a = float64_squash_input_denormal(a, status);
4341     b = float64_squash_input_denormal(b, status);
4342
4343     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4344          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4345        ) {
4346         float_raise(float_flag_invalid, status);
4347         return 1;
4348     }
4349     return 0;
4350 }
4351
4352 /*----------------------------------------------------------------------------
4353 | Returns 1 if the double-precision floating-point value `a' is equal to the
4354 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4355 | exception.The comparison is performed according to the IEC/IEEE Standard
4356 | for Binary Floating-Point Arithmetic.
4357 *----------------------------------------------------------------------------*/
4358
4359 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4360 {
4361     uint64_t av, bv;
4362     a = float64_squash_input_denormal(a, status);
4363     b = float64_squash_input_denormal(b, status);
4364
4365     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4366          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4367        ) {
4368         if (float64_is_signaling_nan(a, status)
4369          || float64_is_signaling_nan(b, status)) {
4370             float_raise(float_flag_invalid, status);
4371         }
4372         return 0;
4373     }
4374     av = float64_val(a);
4375     bv = float64_val(b);
4376     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4377
4378 }
4379
4380 /*----------------------------------------------------------------------------
4381 | Returns 1 if the double-precision floating-point value `a' is less than or
4382 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4383 | cause an exception.  Otherwise, the comparison is performed according to the
4384 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
4387 int float64_le_quiet(float64 a, float64 b, float_status *status)
4388 {
4389     flag aSign, bSign;
4390     uint64_t av, bv;
4391     a = float64_squash_input_denormal(a, status);
4392     b = float64_squash_input_denormal(b, status);
4393
4394     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4395          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4396        ) {
4397         if (float64_is_signaling_nan(a, status)
4398          || float64_is_signaling_nan(b, status)) {
4399             float_raise(float_flag_invalid, status);
4400         }
4401         return 0;
4402     }
4403     aSign = extractFloat64Sign( a );
4404     bSign = extractFloat64Sign( b );
4405     av = float64_val(a);
4406     bv = float64_val(b);
4407     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4408     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4409
4410 }
4411
4412 /*----------------------------------------------------------------------------
4413 | Returns 1 if the double-precision floating-point value `a' is less than
4414 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4415 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4416 | Standard for Binary Floating-Point Arithmetic.
4417 *----------------------------------------------------------------------------*/
4418
4419 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4420 {
4421     flag aSign, bSign;
4422     uint64_t av, bv;
4423     a = float64_squash_input_denormal(a, status);
4424     b = float64_squash_input_denormal(b, status);
4425
4426     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4427          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4428        ) {
4429         if (float64_is_signaling_nan(a, status)
4430          || float64_is_signaling_nan(b, status)) {
4431             float_raise(float_flag_invalid, status);
4432         }
4433         return 0;
4434     }
4435     aSign = extractFloat64Sign( a );
4436     bSign = extractFloat64Sign( b );
4437     av = float64_val(a);
4438     bv = float64_val(b);
4439     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4440     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4441
4442 }
4443
4444 /*----------------------------------------------------------------------------
4445 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4446 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4447 | comparison is performed according to the IEC/IEEE Standard for Binary
4448 | Floating-Point Arithmetic.
4449 *----------------------------------------------------------------------------*/
4450
4451 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4452 {
4453     a = float64_squash_input_denormal(a, status);
4454     b = float64_squash_input_denormal(b, status);
4455
4456     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4457          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4458        ) {
4459         if (float64_is_signaling_nan(a, status)
4460          || float64_is_signaling_nan(b, status)) {
4461             float_raise(float_flag_invalid, status);
4462         }
4463         return 1;
4464     }
4465     return 0;
4466 }
4467
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the extended double-precision floating-
4470 | point value `a' to the 32-bit two's complement integer format.  The
4471 | conversion is performed according to the IEC/IEEE Standard for Binary
4472 | Floating-Point Arithmetic---which means in particular that the conversion
4473 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4474 | largest positive integer is returned.  Otherwise, if the conversion
4475 | overflows, the largest integer with the same sign as `a' is returned.
4476 *----------------------------------------------------------------------------*/
4477
4478 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4479 {
4480     flag aSign;
4481     int32_t aExp, shiftCount;
4482     uint64_t aSig;
4483
4484     if (floatx80_invalid_encoding(a)) {
4485         float_raise(float_flag_invalid, status);
4486         return 1 << 31;
4487     }
4488     aSig = extractFloatx80Frac( a );
4489     aExp = extractFloatx80Exp( a );
4490     aSign = extractFloatx80Sign( a );
4491     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4492     shiftCount = 0x4037 - aExp;
4493     if ( shiftCount <= 0 ) shiftCount = 1;
4494     shift64RightJamming( aSig, shiftCount, &aSig );
4495     return roundAndPackInt32(aSign, aSig, status);
4496
4497 }
4498
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the extended double-precision floating-
4501 | point value `a' to the 32-bit two's complement integer format.  The
4502 | conversion is performed according to the IEC/IEEE Standard for Binary
4503 | Floating-Point Arithmetic, except that the conversion is always rounded
4504 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4505 | Otherwise, if the conversion overflows, the largest integer with the same
4506 | sign as `a' is returned.
4507 *----------------------------------------------------------------------------*/
4508
4509 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4510 {
4511     flag aSign;
4512     int32_t aExp, shiftCount;
4513     uint64_t aSig, savedASig;
4514     int32_t z;
4515
4516     if (floatx80_invalid_encoding(a)) {
4517         float_raise(float_flag_invalid, status);
4518         return 1 << 31;
4519     }
4520     aSig = extractFloatx80Frac( a );
4521     aExp = extractFloatx80Exp( a );
4522     aSign = extractFloatx80Sign( a );
4523     if ( 0x401E < aExp ) {
4524         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4525         goto invalid;
4526     }
4527     else if ( aExp < 0x3FFF ) {
4528         if (aExp || aSig) {
4529             status->float_exception_flags |= float_flag_inexact;
4530         }
4531         return 0;
4532     }
4533     shiftCount = 0x403E - aExp;
4534     savedASig = aSig;
4535     aSig >>= shiftCount;
4536     z = aSig;
4537     if ( aSign ) z = - z;
4538     if ( ( z < 0 ) ^ aSign ) {
4539  invalid:
4540         float_raise(float_flag_invalid, status);
4541         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4542     }
4543     if ( ( aSig<<shiftCount ) != savedASig ) {
4544         status->float_exception_flags |= float_flag_inexact;
4545     }
4546     return z;
4547
4548 }
4549
4550 /*----------------------------------------------------------------------------
4551 | Returns the result of converting the extended double-precision floating-
4552 | point value `a' to the 64-bit two's complement integer format.  The
4553 | conversion is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic---which means in particular that the conversion
4555 | is rounded according to the current rounding mode.  If `a' is a NaN,
4556 | the largest positive integer is returned.  Otherwise, if the conversion
4557 | overflows, the largest integer with the same sign as `a' is returned.
4558 *----------------------------------------------------------------------------*/
4559
4560 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4561 {
4562     flag aSign;
4563     int32_t aExp, shiftCount;
4564     uint64_t aSig, aSigExtra;
4565
4566     if (floatx80_invalid_encoding(a)) {
4567         float_raise(float_flag_invalid, status);
4568         return 1ULL << 63;
4569     }
4570     aSig = extractFloatx80Frac( a );
4571     aExp = extractFloatx80Exp( a );
4572     aSign = extractFloatx80Sign( a );
4573     shiftCount = 0x403E - aExp;
4574     if ( shiftCount <= 0 ) {
4575         if ( shiftCount ) {
4576             float_raise(float_flag_invalid, status);
4577             if (!aSign || floatx80_is_any_nan(a)) {
4578                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4579             }
4580             return (int64_t) LIT64( 0x8000000000000000 );
4581         }
4582         aSigExtra = 0;
4583     }
4584     else {
4585         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4586     }
4587     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4588
4589 }
4590
4591 /*----------------------------------------------------------------------------
4592 | Returns the result of converting the extended double-precision floating-
4593 | point value `a' to the 64-bit two's complement integer format.  The
4594 | conversion is performed according to the IEC/IEEE Standard for Binary
4595 | Floating-Point Arithmetic, except that the conversion is always rounded
4596 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4597 | Otherwise, if the conversion overflows, the largest integer with the same
4598 | sign as `a' is returned.
4599 *----------------------------------------------------------------------------*/
4600
4601 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4602 {
4603     flag aSign;
4604     int32_t aExp, shiftCount;
4605     uint64_t aSig;
4606     int64_t z;
4607
4608     if (floatx80_invalid_encoding(a)) {
4609         float_raise(float_flag_invalid, status);
4610         return 1ULL << 63;
4611     }
4612     aSig = extractFloatx80Frac( a );
4613     aExp = extractFloatx80Exp( a );
4614     aSign = extractFloatx80Sign( a );
4615     shiftCount = aExp - 0x403E;
4616     if ( 0 <= shiftCount ) {
4617         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4618         if ( ( a.high != 0xC03E ) || aSig ) {
4619             float_raise(float_flag_invalid, status);
4620             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4621                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4622             }
4623         }
4624         return (int64_t) LIT64( 0x8000000000000000 );
4625     }
4626     else if ( aExp < 0x3FFF ) {
4627         if (aExp | aSig) {
4628             status->float_exception_flags |= float_flag_inexact;
4629         }
4630         return 0;
4631     }
4632     z = aSig>>( - shiftCount );
4633     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4634         status->float_exception_flags |= float_flag_inexact;
4635     }
4636     if ( aSign ) z = - z;
4637     return z;
4638
4639 }
4640
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the extended double-precision floating-
4643 | point value `a' to the single-precision floating-point format.  The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4647
4648 float32 floatx80_to_float32(floatx80 a, float_status *status)
4649 {
4650     flag aSign;
4651     int32_t aExp;
4652     uint64_t aSig;
4653
4654     if (floatx80_invalid_encoding(a)) {
4655         float_raise(float_flag_invalid, status);
4656         return float32_default_nan(status);
4657     }
4658     aSig = extractFloatx80Frac( a );
4659     aExp = extractFloatx80Exp( a );
4660     aSign = extractFloatx80Sign( a );
4661     if ( aExp == 0x7FFF ) {
4662         if ( (uint64_t) ( aSig<<1 ) ) {
4663             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4664         }
4665         return packFloat32( aSign, 0xFF, 0 );
4666     }
4667     shift64RightJamming( aSig, 33, &aSig );
4668     if ( aExp || aSig ) aExp -= 0x3F81;
4669     return roundAndPackFloat32(aSign, aExp, aSig, status);
4670
4671 }
4672
4673 /*----------------------------------------------------------------------------
4674 | Returns the result of converting the extended double-precision floating-
4675 | point value `a' to the double-precision floating-point format.  The
4676 | conversion is performed according to the IEC/IEEE Standard for Binary
4677 | Floating-Point Arithmetic.
4678 *----------------------------------------------------------------------------*/
4679
4680 float64 floatx80_to_float64(floatx80 a, float_status *status)
4681 {
4682     flag aSign;
4683     int32_t aExp;
4684     uint64_t aSig, zSig;
4685
4686     if (floatx80_invalid_encoding(a)) {
4687         float_raise(float_flag_invalid, status);
4688         return float64_default_nan(status);
4689     }
4690     aSig = extractFloatx80Frac( a );
4691     aExp = extractFloatx80Exp( a );
4692     aSign = extractFloatx80Sign( a );
4693     if ( aExp == 0x7FFF ) {
4694         if ( (uint64_t) ( aSig<<1 ) ) {
4695             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4696         }
4697         return packFloat64( aSign, 0x7FF, 0 );
4698     }
4699     shift64RightJamming( aSig, 1, &zSig );
4700     if ( aExp || aSig ) aExp -= 0x3C01;
4701     return roundAndPackFloat64(aSign, aExp, zSig, status);
4702
4703 }
4704
4705 /*----------------------------------------------------------------------------
4706 | Returns the result of converting the extended double-precision floating-
4707 | point value `a' to the quadruple-precision floating-point format.  The
4708 | conversion is performed according to the IEC/IEEE Standard for Binary
4709 | Floating-Point Arithmetic.
4710 *----------------------------------------------------------------------------*/
4711
4712 float128 floatx80_to_float128(floatx80 a, float_status *status)
4713 {
4714     flag aSign;
4715     int aExp;
4716     uint64_t aSig, zSig0, zSig1;
4717
4718     if (floatx80_invalid_encoding(a)) {
4719         float_raise(float_flag_invalid, status);
4720         return float128_default_nan(status);
4721     }
4722     aSig = extractFloatx80Frac( a );
4723     aExp = extractFloatx80Exp( a );
4724     aSign = extractFloatx80Sign( a );
4725     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4726         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4727     }
4728     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4729     return packFloat128( aSign, aExp, zSig0, zSig1 );
4730
4731 }
4732
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a'
4735 | to the precision provided by floatx80_rounding_precision and returns the
4736 | result as an extended double-precision floating-point value.
4737 | The operation is performed according to the IEC/IEEE Standard for Binary
4738 | Floating-Point Arithmetic.
4739 *----------------------------------------------------------------------------*/
4740
4741 floatx80 floatx80_round(floatx80 a, float_status *status)
4742 {
4743     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4744                                 extractFloatx80Sign(a),
4745                                 extractFloatx80Exp(a),
4746                                 extractFloatx80Frac(a), 0, status);
4747 }
4748
4749 /*----------------------------------------------------------------------------
4750 | Rounds the extended double-precision floating-point value `a' to an integer,
4751 | and returns the result as an extended quadruple-precision floating-point
4752 | value.  The operation is performed according to the IEC/IEEE Standard for
4753 | Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755
4756 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4757 {
4758     flag aSign;
4759     int32_t aExp;
4760     uint64_t lastBitMask, roundBitsMask;
4761     floatx80 z;
4762
4763     if (floatx80_invalid_encoding(a)) {
4764         float_raise(float_flag_invalid, status);
4765         return floatx80_default_nan(status);
4766     }
4767     aExp = extractFloatx80Exp( a );
4768     if ( 0x403E <= aExp ) {
4769         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4770             return propagateFloatx80NaN(a, a, status);
4771         }
4772         return a;
4773     }
4774     if ( aExp < 0x3FFF ) {
4775         if (    ( aExp == 0 )
4776              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4777             return a;
4778         }
4779         status->float_exception_flags |= float_flag_inexact;
4780         aSign = extractFloatx80Sign( a );
4781         switch (status->float_rounding_mode) {
4782          case float_round_nearest_even:
4783             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4784                ) {
4785                 return
4786                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4787             }
4788             break;
4789         case float_round_ties_away:
4790             if (aExp == 0x3FFE) {
4791                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4792             }
4793             break;
4794          case float_round_down:
4795             return
4796                   aSign ?
4797                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4798                 : packFloatx80( 0, 0, 0 );
4799          case float_round_up:
4800             return
4801                   aSign ? packFloatx80( 1, 0, 0 )
4802                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4803         }
4804         return packFloatx80( aSign, 0, 0 );
4805     }
4806     lastBitMask = 1;
4807     lastBitMask <<= 0x403E - aExp;
4808     roundBitsMask = lastBitMask - 1;
4809     z = a;
4810     switch (status->float_rounding_mode) {
4811     case float_round_nearest_even:
4812         z.low += lastBitMask>>1;
4813         if ((z.low & roundBitsMask) == 0) {
4814             z.low &= ~lastBitMask;
4815         }
4816         break;
4817     case float_round_ties_away:
4818         z.low += lastBitMask >> 1;
4819         break;
4820     case float_round_to_zero:
4821         break;
4822     case float_round_up:
4823         if (!extractFloatx80Sign(z)) {
4824             z.low += roundBitsMask;
4825         }
4826         break;
4827     case float_round_down:
4828         if (extractFloatx80Sign(z)) {
4829             z.low += roundBitsMask;
4830         }
4831         break;
4832     default:
4833         abort();
4834     }
4835     z.low &= ~ roundBitsMask;
4836     if ( z.low == 0 ) {
4837         ++z.high;
4838         z.low = LIT64( 0x8000000000000000 );
4839     }
4840     if (z.low != a.low) {
4841         status->float_exception_flags |= float_flag_inexact;
4842     }
4843     return z;
4844
4845 }
4846
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of adding the absolute values of the extended double-
4849 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4850 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4851 | The addition is performed according to the IEC/IEEE Standard for Binary
4852 | Floating-Point Arithmetic.
4853 *----------------------------------------------------------------------------*/
4854
4855 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4856                                 float_status *status)
4857 {
4858     int32_t aExp, bExp, zExp;
4859     uint64_t aSig, bSig, zSig0, zSig1;
4860     int32_t expDiff;
4861
4862     aSig = extractFloatx80Frac( a );
4863     aExp = extractFloatx80Exp( a );
4864     bSig = extractFloatx80Frac( b );
4865     bExp = extractFloatx80Exp( b );
4866     expDiff = aExp - bExp;
4867     if ( 0 < expDiff ) {
4868         if ( aExp == 0x7FFF ) {
4869             if ((uint64_t)(aSig << 1)) {
4870                 return propagateFloatx80NaN(a, b, status);
4871             }
4872             return a;
4873         }
4874         if ( bExp == 0 ) --expDiff;
4875         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4876         zExp = aExp;
4877     }
4878     else if ( expDiff < 0 ) {
4879         if ( bExp == 0x7FFF ) {
4880             if ((uint64_t)(bSig << 1)) {
4881                 return propagateFloatx80NaN(a, b, status);
4882             }
4883             return packFloatx80(zSign,
4884                                 floatx80_infinity_high,
4885                                 floatx80_infinity_low);
4886         }
4887         if ( aExp == 0 ) ++expDiff;
4888         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4889         zExp = bExp;
4890     }
4891     else {
4892         if ( aExp == 0x7FFF ) {
4893             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4894                 return propagateFloatx80NaN(a, b, status);
4895             }
4896             return a;
4897         }
4898         zSig1 = 0;
4899         zSig0 = aSig + bSig;
4900         if ( aExp == 0 ) {
4901             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4902             goto roundAndPack;
4903         }
4904         zExp = aExp;
4905         goto shiftRight1;
4906     }
4907     zSig0 = aSig + bSig;
4908     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4909  shiftRight1:
4910     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4911     zSig0 |= LIT64( 0x8000000000000000 );
4912     ++zExp;
4913  roundAndPack:
4914     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4915                                 zSign, zExp, zSig0, zSig1, status);
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the absolute values of the extended
4920 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4921 | difference is negated before being returned.  `zSign' is ignored if the
4922 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4923 | Standard for Binary Floating-Point Arithmetic.
4924 *----------------------------------------------------------------------------*/
4925
4926 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4927                                 float_status *status)
4928 {
4929     int32_t aExp, bExp, zExp;
4930     uint64_t aSig, bSig, zSig0, zSig1;
4931     int32_t expDiff;
4932
4933     aSig = extractFloatx80Frac( a );
4934     aExp = extractFloatx80Exp( a );
4935     bSig = extractFloatx80Frac( b );
4936     bExp = extractFloatx80Exp( b );
4937     expDiff = aExp - bExp;
4938     if ( 0 < expDiff ) goto aExpBigger;
4939     if ( expDiff < 0 ) goto bExpBigger;
4940     if ( aExp == 0x7FFF ) {
4941         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4942             return propagateFloatx80NaN(a, b, status);
4943         }
4944         float_raise(float_flag_invalid, status);
4945         return floatx80_default_nan(status);
4946     }
4947     if ( aExp == 0 ) {
4948         aExp = 1;
4949         bExp = 1;
4950     }
4951     zSig1 = 0;
4952     if ( bSig < aSig ) goto aBigger;
4953     if ( aSig < bSig ) goto bBigger;
4954     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4955  bExpBigger:
4956     if ( bExp == 0x7FFF ) {
4957         if ((uint64_t)(bSig << 1)) {
4958             return propagateFloatx80NaN(a, b, status);
4959         }
4960         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4961                             floatx80_infinity_low);
4962     }
4963     if ( aExp == 0 ) ++expDiff;
4964     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4965  bBigger:
4966     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4967     zExp = bExp;
4968     zSign ^= 1;
4969     goto normalizeRoundAndPack;
4970  aExpBigger:
4971     if ( aExp == 0x7FFF ) {
4972         if ((uint64_t)(aSig << 1)) {
4973             return propagateFloatx80NaN(a, b, status);
4974         }
4975         return a;
4976     }
4977     if ( bExp == 0 ) --expDiff;
4978     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4979  aBigger:
4980     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4981     zExp = aExp;
4982  normalizeRoundAndPack:
4983     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4984                                          zSign, zExp, zSig0, zSig1, status);
4985 }
4986
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of adding the extended double-precision floating-point
4989 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4990 | Standard for Binary Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4992
4993 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4994 {
4995     flag aSign, bSign;
4996
4997     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4998         float_raise(float_flag_invalid, status);
4999         return floatx80_default_nan(status);
5000     }
5001     aSign = extractFloatx80Sign( a );
5002     bSign = extractFloatx80Sign( b );
5003     if ( aSign == bSign ) {
5004         return addFloatx80Sigs(a, b, aSign, status);
5005     }
5006     else {
5007         return subFloatx80Sigs(a, b, aSign, status);
5008     }
5009
5010 }
5011
5012 /*----------------------------------------------------------------------------
5013 | Returns the result of subtracting the extended double-precision floating-
5014 | point values `a' and `b'.  The operation is performed according to the
5015 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5016 *----------------------------------------------------------------------------*/
5017
5018 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5019 {
5020     flag aSign, bSign;
5021
5022     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5023         float_raise(float_flag_invalid, status);
5024         return floatx80_default_nan(status);
5025     }
5026     aSign = extractFloatx80Sign( a );
5027     bSign = extractFloatx80Sign( b );
5028     if ( aSign == bSign ) {
5029         return subFloatx80Sigs(a, b, aSign, status);
5030     }
5031     else {
5032         return addFloatx80Sigs(a, b, aSign, status);
5033     }
5034
5035 }
5036
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of multiplying the extended double-precision floating-
5039 | point values `a' and `b'.  The operation is performed according to the
5040 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5041 *----------------------------------------------------------------------------*/
5042
5043 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5044 {
5045     flag aSign, bSign, zSign;
5046     int32_t aExp, bExp, zExp;
5047     uint64_t aSig, bSig, zSig0, zSig1;
5048
5049     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5050         float_raise(float_flag_invalid, status);
5051         return floatx80_default_nan(status);
5052     }
5053     aSig = extractFloatx80Frac( a );
5054     aExp = extractFloatx80Exp( a );
5055     aSign = extractFloatx80Sign( a );
5056     bSig = extractFloatx80Frac( b );
5057     bExp = extractFloatx80Exp( b );
5058     bSign = extractFloatx80Sign( b );
5059     zSign = aSign ^ bSign;
5060     if ( aExp == 0x7FFF ) {
5061         if (    (uint64_t) ( aSig<<1 )
5062              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5063             return propagateFloatx80NaN(a, b, status);
5064         }
5065         if ( ( bExp | bSig ) == 0 ) goto invalid;
5066         return packFloatx80(zSign, floatx80_infinity_high,
5067                                    floatx80_infinity_low);
5068     }
5069     if ( bExp == 0x7FFF ) {
5070         if ((uint64_t)(bSig << 1)) {
5071             return propagateFloatx80NaN(a, b, status);
5072         }
5073         if ( ( aExp | aSig ) == 0 ) {
5074  invalid:
5075             float_raise(float_flag_invalid, status);
5076             return floatx80_default_nan(status);
5077         }
5078         return packFloatx80(zSign, floatx80_infinity_high,
5079                                    floatx80_infinity_low);
5080     }
5081     if ( aExp == 0 ) {
5082         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5083         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5084     }
5085     if ( bExp == 0 ) {
5086         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5087         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5088     }
5089     zExp = aExp + bExp - 0x3FFE;
5090     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5091     if ( 0 < (int64_t) zSig0 ) {
5092         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5093         --zExp;
5094     }
5095     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5096                                 zSign, zExp, zSig0, zSig1, status);
5097 }
5098
5099 /*----------------------------------------------------------------------------
5100 | Returns the result of dividing the extended double-precision floating-point
5101 | value `a' by the corresponding value `b'.  The operation is performed
5102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5103 *----------------------------------------------------------------------------*/
5104
5105 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5106 {
5107     flag aSign, bSign, zSign;
5108     int32_t aExp, bExp, zExp;
5109     uint64_t aSig, bSig, zSig0, zSig1;
5110     uint64_t rem0, rem1, rem2, term0, term1, term2;
5111
5112     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5113         float_raise(float_flag_invalid, status);
5114         return floatx80_default_nan(status);
5115     }
5116     aSig = extractFloatx80Frac( a );
5117     aExp = extractFloatx80Exp( a );
5118     aSign = extractFloatx80Sign( a );
5119     bSig = extractFloatx80Frac( b );
5120     bExp = extractFloatx80Exp( b );
5121     bSign = extractFloatx80Sign( b );
5122     zSign = aSign ^ bSign;
5123     if ( aExp == 0x7FFF ) {
5124         if ((uint64_t)(aSig << 1)) {
5125             return propagateFloatx80NaN(a, b, status);
5126         }
5127         if ( bExp == 0x7FFF ) {
5128             if ((uint64_t)(bSig << 1)) {
5129                 return propagateFloatx80NaN(a, b, status);
5130             }
5131             goto invalid;
5132         }
5133         return packFloatx80(zSign, floatx80_infinity_high,
5134                                    floatx80_infinity_low);
5135     }
5136     if ( bExp == 0x7FFF ) {
5137         if ((uint64_t)(bSig << 1)) {
5138             return propagateFloatx80NaN(a, b, status);
5139         }
5140         return packFloatx80( zSign, 0, 0 );
5141     }
5142     if ( bExp == 0 ) {
5143         if ( bSig == 0 ) {
5144             if ( ( aExp | aSig ) == 0 ) {
5145  invalid:
5146                 float_raise(float_flag_invalid, status);
5147                 return floatx80_default_nan(status);
5148             }
5149             float_raise(float_flag_divbyzero, status);
5150             return packFloatx80(zSign, floatx80_infinity_high,
5151                                        floatx80_infinity_low);
5152         }
5153         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5154     }
5155     if ( aExp == 0 ) {
5156         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5157         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5158     }
5159     zExp = aExp - bExp + 0x3FFE;
5160     rem1 = 0;
5161     if ( bSig <= aSig ) {
5162         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5163         ++zExp;
5164     }
5165     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5166     mul64To128( bSig, zSig0, &term0, &term1 );
5167     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5168     while ( (int64_t) rem0 < 0 ) {
5169         --zSig0;
5170         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5171     }
5172     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5173     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5174         mul64To128( bSig, zSig1, &term1, &term2 );
5175         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5176         while ( (int64_t) rem1 < 0 ) {
5177             --zSig1;
5178             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5179         }
5180         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5181     }
5182     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183                                 zSign, zExp, zSig0, zSig1, status);
5184 }
5185
5186 /*----------------------------------------------------------------------------
5187 | Returns the remainder of the extended double-precision floating-point value
5188 | `a' with respect to the corresponding value `b'.  The operation is performed
5189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5190 *----------------------------------------------------------------------------*/
5191
5192 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5193 {
5194     flag aSign, zSign;
5195     int32_t aExp, bExp, expDiff;
5196     uint64_t aSig0, aSig1, bSig;
5197     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5198
5199     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5200         float_raise(float_flag_invalid, status);
5201         return floatx80_default_nan(status);
5202     }
5203     aSig0 = extractFloatx80Frac( a );
5204     aExp = extractFloatx80Exp( a );
5205     aSign = extractFloatx80Sign( a );
5206     bSig = extractFloatx80Frac( b );
5207     bExp = extractFloatx80Exp( b );
5208     if ( aExp == 0x7FFF ) {
5209         if (    (uint64_t) ( aSig0<<1 )
5210              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5211             return propagateFloatx80NaN(a, b, status);
5212         }
5213         goto invalid;
5214     }
5215     if ( bExp == 0x7FFF ) {
5216         if ((uint64_t)(bSig << 1)) {
5217             return propagateFloatx80NaN(a, b, status);
5218         }
5219         return a;
5220     }
5221     if ( bExp == 0 ) {
5222         if ( bSig == 0 ) {
5223  invalid:
5224             float_raise(float_flag_invalid, status);
5225             return floatx80_default_nan(status);
5226         }
5227         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5228     }
5229     if ( aExp == 0 ) {
5230         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5231         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5232     }
5233     bSig |= LIT64( 0x8000000000000000 );
5234     zSign = aSign;
5235     expDiff = aExp - bExp;
5236     aSig1 = 0;
5237     if ( expDiff < 0 ) {
5238         if ( expDiff < -1 ) return a;
5239         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5240         expDiff = 0;
5241     }
5242     q = ( bSig <= aSig0 );
5243     if ( q ) aSig0 -= bSig;
5244     expDiff -= 64;
5245     while ( 0 < expDiff ) {
5246         q = estimateDiv128To64( aSig0, aSig1, bSig );
5247         q = ( 2 < q ) ? q - 2 : 0;
5248         mul64To128( bSig, q, &term0, &term1 );
5249         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5250         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5251         expDiff -= 62;
5252     }
5253     expDiff += 64;
5254     if ( 0 < expDiff ) {
5255         q = estimateDiv128To64( aSig0, aSig1, bSig );
5256         q = ( 2 < q ) ? q - 2 : 0;
5257         q >>= 64 - expDiff;
5258         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5259         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5260         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5261         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5262             ++q;
5263             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5264         }
5265     }
5266     else {
5267         term1 = 0;
5268         term0 = bSig;
5269     }
5270     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5271     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5272          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5273               && ( q & 1 ) )
5274        ) {
5275         aSig0 = alternateASig0;
5276         aSig1 = alternateASig1;
5277         zSign = ! zSign;
5278     }
5279     return
5280         normalizeRoundAndPackFloatx80(
5281             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5282
5283 }
5284
5285 /*----------------------------------------------------------------------------
5286 | Returns the square root of the extended double-precision floating-point
5287 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5288 | for Binary Floating-Point Arithmetic.
5289 *----------------------------------------------------------------------------*/
5290
5291 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5292 {
5293     flag aSign;
5294     int32_t aExp, zExp;
5295     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5296     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5297
5298     if (floatx80_invalid_encoding(a)) {
5299         float_raise(float_flag_invalid, status);
5300         return floatx80_default_nan(status);
5301     }
5302     aSig0 = extractFloatx80Frac( a );
5303     aExp = extractFloatx80Exp( a );
5304     aSign = extractFloatx80Sign( a );
5305     if ( aExp == 0x7FFF ) {
5306         if ((uint64_t)(aSig0 << 1)) {
5307             return propagateFloatx80NaN(a, a, status);
5308         }
5309         if ( ! aSign ) return a;
5310         goto invalid;
5311     }
5312     if ( aSign ) {
5313         if ( ( aExp | aSig0 ) == 0 ) return a;
5314  invalid:
5315         float_raise(float_flag_invalid, status);
5316         return floatx80_default_nan(status);
5317     }
5318     if ( aExp == 0 ) {
5319         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5320         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5321     }
5322     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5323     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5324     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5325     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5326     doubleZSig0 = zSig0<<1;
5327     mul64To128( zSig0, zSig0, &term0, &term1 );
5328     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5329     while ( (int64_t) rem0 < 0 ) {
5330         --zSig0;
5331         doubleZSig0 -= 2;
5332         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5333     }
5334     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5335     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5336         if ( zSig1 == 0 ) zSig1 = 1;
5337         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5338         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5339         mul64To128( zSig1, zSig1, &term2, &term3 );
5340         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5341         while ( (int64_t) rem1 < 0 ) {
5342             --zSig1;
5343             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5344             term3 |= 1;
5345             term2 |= doubleZSig0;
5346             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5347         }
5348         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5349     }
5350     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5351     zSig0 |= doubleZSig0;
5352     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5353                                 0, zExp, zSig0, zSig1, status);
5354 }
5355
5356 /*----------------------------------------------------------------------------
5357 | Returns 1 if the extended double-precision floating-point value `a' is equal
5358 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5359 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362
5363 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5364 {
5365
5366     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5367         || (extractFloatx80Exp(a) == 0x7FFF
5368             && (uint64_t) (extractFloatx80Frac(a) << 1))
5369         || (extractFloatx80Exp(b) == 0x7FFF
5370             && (uint64_t) (extractFloatx80Frac(b) << 1))
5371        ) {
5372         float_raise(float_flag_invalid, status);
5373         return 0;
5374     }
5375     return
5376            ( a.low == b.low )
5377         && (    ( a.high == b.high )
5378              || (    ( a.low == 0 )
5379                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5380            );
5381
5382 }
5383
5384 /*----------------------------------------------------------------------------
5385 | Returns 1 if the extended double-precision floating-point value `a' is
5386 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5387 | invalid exception is raised if either operand is a NaN.  The comparison is
5388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5389 | Arithmetic.
5390 *----------------------------------------------------------------------------*/
5391
5392 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5393 {
5394     flag aSign, bSign;
5395
5396     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5397         || (extractFloatx80Exp(a) == 0x7FFF
5398             && (uint64_t) (extractFloatx80Frac(a) << 1))
5399         || (extractFloatx80Exp(b) == 0x7FFF
5400             && (uint64_t) (extractFloatx80Frac(b) << 1))
5401        ) {
5402         float_raise(float_flag_invalid, status);
5403         return 0;
5404     }
5405     aSign = extractFloatx80Sign( a );
5406     bSign = extractFloatx80Sign( b );
5407     if ( aSign != bSign ) {
5408         return
5409                aSign
5410             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5411                  == 0 );
5412     }
5413     return
5414           aSign ? le128( b.high, b.low, a.high, a.low )
5415         : le128( a.high, a.low, b.high, b.low );
5416
5417 }
5418
5419 /*----------------------------------------------------------------------------
5420 | Returns 1 if the extended double-precision floating-point value `a' is
5421 | less than the corresponding value `b', and 0 otherwise.  The invalid
5422 | exception is raised if either operand is a NaN.  The comparison is performed
5423 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5424 *----------------------------------------------------------------------------*/
5425
5426 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5427 {
5428     flag aSign, bSign;
5429
5430     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5431         || (extractFloatx80Exp(a) == 0x7FFF
5432             && (uint64_t) (extractFloatx80Frac(a) << 1))
5433         || (extractFloatx80Exp(b) == 0x7FFF
5434             && (uint64_t) (extractFloatx80Frac(b) << 1))
5435        ) {
5436         float_raise(float_flag_invalid, status);
5437         return 0;
5438     }
5439     aSign = extractFloatx80Sign( a );
5440     bSign = extractFloatx80Sign( b );
5441     if ( aSign != bSign ) {
5442         return
5443                aSign
5444             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5445                  != 0 );
5446     }
5447     return
5448           aSign ? lt128( b.high, b.low, a.high, a.low )
5449         : lt128( a.high, a.low, b.high, b.low );
5450
5451 }
5452
5453 /*----------------------------------------------------------------------------
5454 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5455 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5456 | either operand is a NaN.   The comparison is performed according to the
5457 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5458 *----------------------------------------------------------------------------*/
5459 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5460 {
5461     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5462         || (extractFloatx80Exp(a) == 0x7FFF
5463             && (uint64_t) (extractFloatx80Frac(a) << 1))
5464         || (extractFloatx80Exp(b) == 0x7FFF
5465             && (uint64_t) (extractFloatx80Frac(b) << 1))
5466        ) {
5467         float_raise(float_flag_invalid, status);
5468         return 1;
5469     }
5470     return 0;
5471 }
5472
5473 /*----------------------------------------------------------------------------
5474 | Returns 1 if the extended double-precision floating-point value `a' is
5475 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5476 | cause an exception.  The comparison is performed according to the IEC/IEEE
5477 | Standard for Binary Floating-Point Arithmetic.
5478 *----------------------------------------------------------------------------*/
5479
5480 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5481 {
5482
5483     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5484         float_raise(float_flag_invalid, status);
5485         return 0;
5486     }
5487     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5488               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5489          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5490               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5491        ) {
5492         if (floatx80_is_signaling_nan(a, status)
5493          || floatx80_is_signaling_nan(b, status)) {
5494             float_raise(float_flag_invalid, status);
5495         }
5496         return 0;
5497     }
5498     return
5499            ( a.low == b.low )
5500         && (    ( a.high == b.high )
5501              || (    ( a.low == 0 )
5502                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5503            );
5504
5505 }
5506
5507 /*----------------------------------------------------------------------------
5508 | Returns 1 if the extended double-precision floating-point value `a' is less
5509 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5510 | do not cause an exception.  Otherwise, the comparison is performed according
5511 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5512 *----------------------------------------------------------------------------*/
5513
5514 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5515 {
5516     flag aSign, bSign;
5517
5518     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5519         float_raise(float_flag_invalid, status);
5520         return 0;
5521     }
5522     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5523               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5524          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5525               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5526        ) {
5527         if (floatx80_is_signaling_nan(a, status)
5528          || floatx80_is_signaling_nan(b, status)) {
5529             float_raise(float_flag_invalid, status);
5530         }
5531         return 0;
5532     }
5533     aSign = extractFloatx80Sign( a );
5534     bSign = extractFloatx80Sign( b );
5535     if ( aSign != bSign ) {
5536         return
5537                aSign
5538             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5539                  == 0 );
5540     }
5541     return
5542           aSign ? le128( b.high, b.low, a.high, a.low )
5543         : le128( a.high, a.low, b.high, b.low );
5544
5545 }
5546
5547 /*----------------------------------------------------------------------------
5548 | Returns 1 if the extended double-precision floating-point value `a' is less
5549 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5550 | an exception.  Otherwise, the comparison is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553
5554 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5555 {
5556     flag aSign, bSign;
5557
5558     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5559         float_raise(float_flag_invalid, status);
5560         return 0;
5561     }
5562     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5563               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5564          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5565               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5566        ) {
5567         if (floatx80_is_signaling_nan(a, status)
5568          || floatx80_is_signaling_nan(b, status)) {
5569             float_raise(float_flag_invalid, status);
5570         }
5571         return 0;
5572     }
5573     aSign = extractFloatx80Sign( a );
5574     bSign = extractFloatx80Sign( b );
5575     if ( aSign != bSign ) {
5576         return
5577                aSign
5578             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5579                  != 0 );
5580     }
5581     return
5582           aSign ? lt128( b.high, b.low, a.high, a.low )
5583         : lt128( a.high, a.low, b.high, b.low );
5584
5585 }
5586
5587 /*----------------------------------------------------------------------------
5588 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5589 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5590 | The comparison is performed according to the IEC/IEEE Standard for Binary
5591 | Floating-Point Arithmetic.
5592 *----------------------------------------------------------------------------*/
5593 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5594 {
5595     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5596         float_raise(float_flag_invalid, status);
5597         return 1;
5598     }
5599     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5600               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5601          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5602               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5603        ) {
5604         if (floatx80_is_signaling_nan(a, status)
5605          || floatx80_is_signaling_nan(b, status)) {
5606             float_raise(float_flag_invalid, status);
5607         }
5608         return 1;
5609     }
5610     return 0;
5611 }
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the quadruple-precision floating-point
5615 | value `a' to the 32-bit two's complement integer format.  The conversion
5616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5617 | Arithmetic---which means in particular that the conversion is rounded
5618 | according to the current rounding mode.  If `a' is a NaN, the largest
5619 | positive integer is returned.  Otherwise, if the conversion overflows, the
5620 | largest integer with the same sign as `a' is returned.
5621 *----------------------------------------------------------------------------*/
5622
5623 int32_t float128_to_int32(float128 a, float_status *status)
5624 {
5625     flag aSign;
5626     int32_t aExp, shiftCount;
5627     uint64_t aSig0, aSig1;
5628
5629     aSig1 = extractFloat128Frac1( a );
5630     aSig0 = extractFloat128Frac0( a );
5631     aExp = extractFloat128Exp( a );
5632     aSign = extractFloat128Sign( a );
5633     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5634     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5635     aSig0 |= ( aSig1 != 0 );
5636     shiftCount = 0x4028 - aExp;
5637     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5638     return roundAndPackInt32(aSign, aSig0, status);
5639
5640 }
5641
5642 /*----------------------------------------------------------------------------
5643 | Returns the result of converting the quadruple-precision floating-point
5644 | value `a' to the 32-bit two's complement integer format.  The conversion
5645 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5646 | Arithmetic, except that the conversion is always rounded toward zero.  If
5647 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5648 | conversion overflows, the largest integer with the same sign as `a' is
5649 | returned.
5650 *----------------------------------------------------------------------------*/
5651
5652 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5653 {
5654     flag aSign;
5655     int32_t aExp, shiftCount;
5656     uint64_t aSig0, aSig1, savedASig;
5657     int32_t z;
5658
5659     aSig1 = extractFloat128Frac1( a );
5660     aSig0 = extractFloat128Frac0( a );
5661     aExp = extractFloat128Exp( a );
5662     aSign = extractFloat128Sign( a );
5663     aSig0 |= ( aSig1 != 0 );
5664     if ( 0x401E < aExp ) {
5665         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5666         goto invalid;
5667     }
5668     else if ( aExp < 0x3FFF ) {
5669         if (aExp || aSig0) {
5670             status->float_exception_flags |= float_flag_inexact;
5671         }
5672         return 0;
5673     }
5674     aSig0 |= LIT64( 0x0001000000000000 );
5675     shiftCount = 0x402F - aExp;
5676     savedASig = aSig0;
5677     aSig0 >>= shiftCount;
5678     z = aSig0;
5679     if ( aSign ) z = - z;
5680     if ( ( z < 0 ) ^ aSign ) {
5681  invalid:
5682         float_raise(float_flag_invalid, status);
5683         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5684     }
5685     if ( ( aSig0<<shiftCount ) != savedASig ) {
5686         status->float_exception_flags |= float_flag_inexact;
5687     }
5688     return z;
5689
5690 }
5691
5692 /*----------------------------------------------------------------------------
5693 | Returns the result of converting the quadruple-precision floating-point
5694 | value `a' to the 64-bit two's complement integer format.  The conversion
5695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696 | Arithmetic---which means in particular that the conversion is rounded
5697 | according to the current rounding mode.  If `a' is a NaN, the largest
5698 | positive integer is returned.  Otherwise, if the conversion overflows, the
5699 | largest integer with the same sign as `a' is returned.
5700 *----------------------------------------------------------------------------*/
5701
5702 int64_t float128_to_int64(float128 a, float_status *status)
5703 {
5704     flag aSign;
5705     int32_t aExp, shiftCount;
5706     uint64_t aSig0, aSig1;
5707
5708     aSig1 = extractFloat128Frac1( a );
5709     aSig0 = extractFloat128Frac0( a );
5710     aExp = extractFloat128Exp( a );
5711     aSign = extractFloat128Sign( a );
5712     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5713     shiftCount = 0x402F - aExp;
5714     if ( shiftCount <= 0 ) {
5715         if ( 0x403E < aExp ) {
5716             float_raise(float_flag_invalid, status);
5717             if (    ! aSign
5718                  || (    ( aExp == 0x7FFF )
5719                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5720                     )
5721                ) {
5722                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5723             }
5724             return (int64_t) LIT64( 0x8000000000000000 );
5725         }
5726         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5727     }
5728     else {
5729         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5730     }
5731     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5732
5733 }
5734
5735 /*----------------------------------------------------------------------------
5736 | Returns the result of converting the quadruple-precision floating-point
5737 | value `a' to the 64-bit two's complement integer format.  The conversion
5738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5739 | Arithmetic, except that the conversion is always rounded toward zero.
5740 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5741 | the conversion overflows, the largest integer with the same sign as `a' is
5742 | returned.
5743 *----------------------------------------------------------------------------*/
5744
5745 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5746 {
5747     flag aSign;
5748     int32_t aExp, shiftCount;
5749     uint64_t aSig0, aSig1;
5750     int64_t z;
5751
5752     aSig1 = extractFloat128Frac1( a );
5753     aSig0 = extractFloat128Frac0( a );
5754     aExp = extractFloat128Exp( a );
5755     aSign = extractFloat128Sign( a );
5756     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5757     shiftCount = aExp - 0x402F;
5758     if ( 0 < shiftCount ) {
5759         if ( 0x403E <= aExp ) {
5760             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5761             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5762                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5763                 if (aSig1) {
5764                     status->float_exception_flags |= float_flag_inexact;
5765                 }
5766             }
5767             else {
5768                 float_raise(float_flag_invalid, status);
5769                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5770                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5771                 }
5772             }
5773             return (int64_t) LIT64( 0x8000000000000000 );
5774         }
5775         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5776         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5777             status->float_exception_flags |= float_flag_inexact;
5778         }
5779     }
5780     else {
5781         if ( aExp < 0x3FFF ) {
5782             if ( aExp | aSig0 | aSig1 ) {
5783                 status->float_exception_flags |= float_flag_inexact;
5784             }
5785             return 0;
5786         }
5787         z = aSig0>>( - shiftCount );
5788         if (    aSig1
5789              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5790             status->float_exception_flags |= float_flag_inexact;
5791         }
5792     }
5793     if ( aSign ) z = - z;
5794     return z;
5795
5796 }
5797
5798 /*----------------------------------------------------------------------------
5799 | Returns the result of converting the quadruple-precision floating-point value
5800 | `a' to the 64-bit unsigned integer format.  The conversion is
5801 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5802 | Arithmetic---which means in particular that the conversion is rounded
5803 | according to the current rounding mode.  If `a' is a NaN, the largest
5804 | positive integer is returned.  If the conversion overflows, the
5805 | largest unsigned integer is returned.  If 'a' is negative, the value is
5806 | rounded and zero is returned; negative values that do not round to zero
5807 | will raise the inexact exception.
5808 *----------------------------------------------------------------------------*/
5809
5810 uint64_t float128_to_uint64(float128 a, float_status *status)
5811 {
5812     flag aSign;
5813     int aExp;
5814     int shiftCount;
5815     uint64_t aSig0, aSig1;
5816
5817     aSig0 = extractFloat128Frac0(a);
5818     aSig1 = extractFloat128Frac1(a);
5819     aExp = extractFloat128Exp(a);
5820     aSign = extractFloat128Sign(a);
5821     if (aSign && (aExp > 0x3FFE)) {
5822         float_raise(float_flag_invalid, status);
5823         if (float128_is_any_nan(a)) {
5824             return LIT64(0xFFFFFFFFFFFFFFFF);
5825         } else {
5826             return 0;
5827         }
5828     }
5829     if (aExp) {
5830         aSig0 |= LIT64(0x0001000000000000);
5831     }
5832     shiftCount = 0x402F - aExp;
5833     if (shiftCount <= 0) {
5834         if (0x403E < aExp) {
5835             float_raise(float_flag_invalid, status);
5836             return LIT64(0xFFFFFFFFFFFFFFFF);
5837         }
5838         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5839     } else {
5840         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5841     }
5842     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5843 }
5844
5845 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5846 {
5847     uint64_t v;
5848     signed char current_rounding_mode = status->float_rounding_mode;
5849
5850     set_float_rounding_mode(float_round_to_zero, status);
5851     v = float128_to_uint64(a, status);
5852     set_float_rounding_mode(current_rounding_mode, status);
5853
5854     return v;
5855 }
5856
5857 /*----------------------------------------------------------------------------
5858 | Returns the result of converting the quadruple-precision floating-point
5859 | value `a' to the 32-bit unsigned integer format.  The conversion
5860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5861 | Arithmetic except that the conversion is always rounded toward zero.
5862 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5863 | if the conversion overflows, the largest unsigned integer is returned.
5864 | If 'a' is negative, the value is rounded and zero is returned; negative
5865 | values that do not round to zero will raise the inexact exception.
5866 *----------------------------------------------------------------------------*/
5867
5868 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5869 {
5870     uint64_t v;
5871     uint32_t res;
5872     int old_exc_flags = get_float_exception_flags(status);
5873
5874     v = float128_to_uint64_round_to_zero(a, status);
5875     if (v > 0xffffffff) {
5876         res = 0xffffffff;
5877     } else {
5878         return v;
5879     }
5880     set_float_exception_flags(old_exc_flags, status);
5881     float_raise(float_flag_invalid, status);
5882     return res;
5883 }
5884
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of converting the quadruple-precision floating-point
5887 | value `a' to the single-precision floating-point format.  The conversion
5888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5889 | Arithmetic.
5890 *----------------------------------------------------------------------------*/
5891
5892 float32 float128_to_float32(float128 a, float_status *status)
5893 {
5894     flag aSign;
5895     int32_t aExp;
5896     uint64_t aSig0, aSig1;
5897     uint32_t zSig;
5898
5899     aSig1 = extractFloat128Frac1( a );
5900     aSig0 = extractFloat128Frac0( a );
5901     aExp = extractFloat128Exp( a );
5902     aSign = extractFloat128Sign( a );
5903     if ( aExp == 0x7FFF ) {
5904         if ( aSig0 | aSig1 ) {
5905             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5906         }
5907         return packFloat32( aSign, 0xFF, 0 );
5908     }
5909     aSig0 |= ( aSig1 != 0 );
5910     shift64RightJamming( aSig0, 18, &aSig0 );
5911     zSig = aSig0;
5912     if ( aExp || zSig ) {
5913         zSig |= 0x40000000;
5914         aExp -= 0x3F81;
5915     }
5916     return roundAndPackFloat32(aSign, aExp, zSig, status);
5917
5918 }
5919
5920 /*----------------------------------------------------------------------------
5921 | Returns the result of converting the quadruple-precision floating-point
5922 | value `a' to the double-precision floating-point format.  The conversion
5923 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5924 | Arithmetic.
5925 *----------------------------------------------------------------------------*/
5926
5927 float64 float128_to_float64(float128 a, float_status *status)
5928 {
5929     flag aSign;
5930     int32_t aExp;
5931     uint64_t aSig0, aSig1;
5932
5933     aSig1 = extractFloat128Frac1( a );
5934     aSig0 = extractFloat128Frac0( a );
5935     aExp = extractFloat128Exp( a );
5936     aSign = extractFloat128Sign( a );
5937     if ( aExp == 0x7FFF ) {
5938         if ( aSig0 | aSig1 ) {
5939             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5940         }
5941         return packFloat64( aSign, 0x7FF, 0 );
5942     }
5943     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5944     aSig0 |= ( aSig1 != 0 );
5945     if ( aExp || aSig0 ) {
5946         aSig0 |= LIT64( 0x4000000000000000 );
5947         aExp -= 0x3C01;
5948     }
5949     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5950
5951 }
5952
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of converting the quadruple-precision floating-point
5955 | value `a' to the extended double-precision floating-point format.  The
5956 | conversion is performed according to the IEC/IEEE Standard for Binary
5957 | Floating-Point Arithmetic.
5958 *----------------------------------------------------------------------------*/
5959
5960 floatx80 float128_to_floatx80(float128 a, float_status *status)
5961 {
5962     flag aSign;
5963     int32_t aExp;
5964     uint64_t aSig0, aSig1;
5965
5966     aSig1 = extractFloat128Frac1( a );
5967     aSig0 = extractFloat128Frac0( a );
5968     aExp = extractFloat128Exp( a );
5969     aSign = extractFloat128Sign( a );
5970     if ( aExp == 0x7FFF ) {
5971         if ( aSig0 | aSig1 ) {
5972             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5973         }
5974         return packFloatx80(aSign, floatx80_infinity_high,
5975                                    floatx80_infinity_low);
5976     }
5977     if ( aExp == 0 ) {
5978         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5979         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5980     }
5981     else {
5982         aSig0 |= LIT64( 0x0001000000000000 );
5983     }
5984     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5985     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5986
5987 }
5988
5989 /*----------------------------------------------------------------------------
5990 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5991 | returns the result as a quadruple-precision floating-point value.  The
5992 | operation is performed according to the IEC/IEEE Standard for Binary
5993 | Floating-Point Arithmetic.
5994 *----------------------------------------------------------------------------*/
5995
5996 float128 float128_round_to_int(float128 a, float_status *status)
5997 {
5998     flag aSign;
5999     int32_t aExp;
6000     uint64_t lastBitMask, roundBitsMask;
6001     float128 z;
6002
6003     aExp = extractFloat128Exp( a );
6004     if ( 0x402F <= aExp ) {
6005         if ( 0x406F <= aExp ) {
6006             if (    ( aExp == 0x7FFF )
6007                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6008                ) {
6009                 return propagateFloat128NaN(a, a, status);
6010             }
6011             return a;
6012         }
6013         lastBitMask = 1;
6014         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6015         roundBitsMask = lastBitMask - 1;
6016         z = a;
6017         switch (status->float_rounding_mode) {
6018         case float_round_nearest_even:
6019             if ( lastBitMask ) {
6020                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6021                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6022             }
6023             else {
6024                 if ( (int64_t) z.low < 0 ) {
6025                     ++z.high;
6026                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6027                 }
6028             }
6029             break;
6030         case float_round_ties_away:
6031             if (lastBitMask) {
6032                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6033             } else {
6034                 if ((int64_t) z.low < 0) {
6035                     ++z.high;
6036                 }
6037             }
6038             break;
6039         case float_round_to_zero:
6040             break;
6041         case float_round_up:
6042             if (!extractFloat128Sign(z)) {
6043                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6044             }
6045             break;
6046         case float_round_down:
6047             if (extractFloat128Sign(z)) {
6048                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6049             }
6050             break;
6051         default:
6052             abort();
6053         }
6054         z.low &= ~ roundBitsMask;
6055     }
6056     else {
6057         if ( aExp < 0x3FFF ) {
6058             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6059             status->float_exception_flags |= float_flag_inexact;
6060             aSign = extractFloat128Sign( a );
6061             switch (status->float_rounding_mode) {
6062              case float_round_nearest_even:
6063                 if (    ( aExp == 0x3FFE )
6064                      && (   extractFloat128Frac0( a )
6065                           | extractFloat128Frac1( a ) )
6066                    ) {
6067                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6068                 }
6069                 break;
6070             case float_round_ties_away:
6071                 if (aExp == 0x3FFE) {
6072                     return packFloat128(aSign, 0x3FFF, 0, 0);
6073                 }
6074                 break;
6075              case float_round_down:
6076                 return
6077                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6078                     : packFloat128( 0, 0, 0, 0 );
6079              case float_round_up:
6080                 return
6081                       aSign ? packFloat128( 1, 0, 0, 0 )
6082                     : packFloat128( 0, 0x3FFF, 0, 0 );
6083             }
6084             return packFloat128( aSign, 0, 0, 0 );
6085         }
6086         lastBitMask = 1;
6087         lastBitMask <<= 0x402F - aExp;
6088         roundBitsMask = lastBitMask - 1;
6089         z.low = 0;
6090         z.high = a.high;
6091         switch (status->float_rounding_mode) {
6092         case float_round_nearest_even:
6093             z.high += lastBitMask>>1;
6094             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6095                 z.high &= ~ lastBitMask;
6096             }
6097             break;
6098         case float_round_ties_away:
6099             z.high += lastBitMask>>1;
6100             break;
6101         case float_round_to_zero:
6102             break;
6103         case float_round_up:
6104             if (!extractFloat128Sign(z)) {
6105                 z.high |= ( a.low != 0 );
6106                 z.high += roundBitsMask;
6107             }
6108             break;
6109         case float_round_down:
6110             if (extractFloat128Sign(z)) {
6111                 z.high |= (a.low != 0);
6112                 z.high += roundBitsMask;
6113             }
6114             break;
6115         default:
6116             abort();
6117         }
6118         z.high &= ~ roundBitsMask;
6119     }
6120     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6121         status->float_exception_flags |= float_flag_inexact;
6122     }
6123     return z;
6124
6125 }
6126
6127 /*----------------------------------------------------------------------------
6128 | Returns the result of adding the absolute values of the quadruple-precision
6129 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6130 | before being returned.  `zSign' is ignored if the result is a NaN.
6131 | The addition is performed according to the IEC/IEEE Standard for Binary
6132 | Floating-Point Arithmetic.
6133 *----------------------------------------------------------------------------*/
6134
6135 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6136                                 float_status *status)
6137 {
6138     int32_t aExp, bExp, zExp;
6139     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6140     int32_t expDiff;
6141
6142     aSig1 = extractFloat128Frac1( a );
6143     aSig0 = extractFloat128Frac0( a );
6144     aExp = extractFloat128Exp( a );
6145     bSig1 = extractFloat128Frac1( b );
6146     bSig0 = extractFloat128Frac0( b );
6147     bExp = extractFloat128Exp( b );
6148     expDiff = aExp - bExp;
6149     if ( 0 < expDiff ) {
6150         if ( aExp == 0x7FFF ) {
6151             if (aSig0 | aSig1) {
6152                 return propagateFloat128NaN(a, b, status);
6153             }
6154             return a;
6155         }
6156         if ( bExp == 0 ) {
6157             --expDiff;
6158         }
6159         else {
6160             bSig0 |= LIT64( 0x0001000000000000 );
6161         }
6162         shift128ExtraRightJamming(
6163             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6164         zExp = aExp;
6165     }
6166     else if ( expDiff < 0 ) {
6167         if ( bExp == 0x7FFF ) {
6168             if (bSig0 | bSig1) {
6169                 return propagateFloat128NaN(a, b, status);
6170             }
6171             return packFloat128( zSign, 0x7FFF, 0, 0 );
6172         }
6173         if ( aExp == 0 ) {
6174             ++expDiff;
6175         }
6176         else {
6177             aSig0 |= LIT64( 0x0001000000000000 );
6178         }
6179         shift128ExtraRightJamming(
6180             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6181         zExp = bExp;
6182     }
6183     else {
6184         if ( aExp == 0x7FFF ) {
6185             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6186                 return propagateFloat128NaN(a, b, status);
6187             }
6188             return a;
6189         }
6190         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6191         if ( aExp == 0 ) {
6192             if (status->flush_to_zero) {
6193                 if (zSig0 | zSig1) {
6194                     float_raise(float_flag_output_denormal, status);
6195                 }
6196                 return packFloat128(zSign, 0, 0, 0);
6197             }
6198             return packFloat128( zSign, 0, zSig0, zSig1 );
6199         }
6200         zSig2 = 0;
6201         zSig0 |= LIT64( 0x0002000000000000 );
6202         zExp = aExp;
6203         goto shiftRight1;
6204     }
6205     aSig0 |= LIT64( 0x0001000000000000 );
6206     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6207     --zExp;
6208     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6209     ++zExp;
6210  shiftRight1:
6211     shift128ExtraRightJamming(
6212         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6213  roundAndPack:
6214     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6215
6216 }
6217
6218 /*----------------------------------------------------------------------------
6219 | Returns the result of subtracting the absolute values of the quadruple-
6220 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6221 | difference is negated before being returned.  `zSign' is ignored if the
6222 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6223 | Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6225
6226 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6227                                 float_status *status)
6228 {
6229     int32_t aExp, bExp, zExp;
6230     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6231     int32_t expDiff;
6232
6233     aSig1 = extractFloat128Frac1( a );
6234     aSig0 = extractFloat128Frac0( a );
6235     aExp = extractFloat128Exp( a );
6236     bSig1 = extractFloat128Frac1( b );
6237     bSig0 = extractFloat128Frac0( b );
6238     bExp = extractFloat128Exp( b );
6239     expDiff = aExp - bExp;
6240     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6241     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6242     if ( 0 < expDiff ) goto aExpBigger;
6243     if ( expDiff < 0 ) goto bExpBigger;
6244     if ( aExp == 0x7FFF ) {
6245         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6246             return propagateFloat128NaN(a, b, status);
6247         }
6248         float_raise(float_flag_invalid, status);
6249         return float128_default_nan(status);
6250     }
6251     if ( aExp == 0 ) {
6252         aExp = 1;
6253         bExp = 1;
6254     }
6255     if ( bSig0 < aSig0 ) goto aBigger;
6256     if ( aSig0 < bSig0 ) goto bBigger;
6257     if ( bSig1 < aSig1 ) goto aBigger;
6258     if ( aSig1 < bSig1 ) goto bBigger;
6259     return packFloat128(status->float_rounding_mode == float_round_down,
6260                         0, 0, 0);
6261  bExpBigger:
6262     if ( bExp == 0x7FFF ) {
6263         if (bSig0 | bSig1) {
6264             return propagateFloat128NaN(a, b, status);
6265         }
6266         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6267     }
6268     if ( aExp == 0 ) {
6269         ++expDiff;
6270     }
6271     else {
6272         aSig0 |= LIT64( 0x4000000000000000 );
6273     }
6274     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6275     bSig0 |= LIT64( 0x4000000000000000 );
6276  bBigger:
6277     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6278     zExp = bExp;
6279     zSign ^= 1;
6280     goto normalizeRoundAndPack;
6281  aExpBigger:
6282     if ( aExp == 0x7FFF ) {
6283         if (aSig0 | aSig1) {
6284             return propagateFloat128NaN(a, b, status);
6285         }
6286         return a;
6287     }
6288     if ( bExp == 0 ) {
6289         --expDiff;
6290     }
6291     else {
6292         bSig0 |= LIT64( 0x4000000000000000 );
6293     }
6294     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6295     aSig0 |= LIT64( 0x4000000000000000 );
6296  aBigger:
6297     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6298     zExp = aExp;
6299  normalizeRoundAndPack:
6300     --zExp;
6301     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6302                                          status);
6303
6304 }
6305
6306 /*----------------------------------------------------------------------------
6307 | Returns the result of adding the quadruple-precision floating-point values
6308 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6309 | for Binary Floating-Point Arithmetic.
6310 *----------------------------------------------------------------------------*/
6311
6312 float128 float128_add(float128 a, float128 b, float_status *status)
6313 {
6314     flag aSign, bSign;
6315
6316     aSign = extractFloat128Sign( a );
6317     bSign = extractFloat128Sign( b );
6318     if ( aSign == bSign ) {
6319         return addFloat128Sigs(a, b, aSign, status);
6320     }
6321     else {
6322         return subFloat128Sigs(a, b, aSign, status);
6323     }
6324
6325 }
6326
6327 /*----------------------------------------------------------------------------
6328 | Returns the result of subtracting the quadruple-precision floating-point
6329 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6330 | Standard for Binary Floating-Point Arithmetic.
6331 *----------------------------------------------------------------------------*/
6332
6333 float128 float128_sub(float128 a, float128 b, float_status *status)
6334 {
6335     flag aSign, bSign;
6336
6337     aSign = extractFloat128Sign( a );
6338     bSign = extractFloat128Sign( b );
6339     if ( aSign == bSign ) {
6340         return subFloat128Sigs(a, b, aSign, status);
6341     }
6342     else {
6343         return addFloat128Sigs(a, b, aSign, status);
6344     }
6345
6346 }
6347
6348 /*----------------------------------------------------------------------------
6349 | Returns the result of multiplying the quadruple-precision floating-point
6350 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6351 | Standard for Binary Floating-Point Arithmetic.
6352 *----------------------------------------------------------------------------*/
6353
6354 float128 float128_mul(float128 a, float128 b, float_status *status)
6355 {
6356     flag aSign, bSign, zSign;
6357     int32_t aExp, bExp, zExp;
6358     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6359
6360     aSig1 = extractFloat128Frac1( a );
6361     aSig0 = extractFloat128Frac0( a );
6362     aExp = extractFloat128Exp( a );
6363     aSign = extractFloat128Sign( a );
6364     bSig1 = extractFloat128Frac1( b );
6365     bSig0 = extractFloat128Frac0( b );
6366     bExp = extractFloat128Exp( b );
6367     bSign = extractFloat128Sign( b );
6368     zSign = aSign ^ bSign;
6369     if ( aExp == 0x7FFF ) {
6370         if (    ( aSig0 | aSig1 )
6371              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6372             return propagateFloat128NaN(a, b, status);
6373         }
6374         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6375         return packFloat128( zSign, 0x7FFF, 0, 0 );
6376     }
6377     if ( bExp == 0x7FFF ) {
6378         if (bSig0 | bSig1) {
6379             return propagateFloat128NaN(a, b, status);
6380         }
6381         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6382  invalid:
6383             float_raise(float_flag_invalid, status);
6384             return float128_default_nan(status);
6385         }
6386         return packFloat128( zSign, 0x7FFF, 0, 0 );
6387     }
6388     if ( aExp == 0 ) {
6389         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6390         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6391     }
6392     if ( bExp == 0 ) {
6393         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6394         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6395     }
6396     zExp = aExp + bExp - 0x4000;
6397     aSig0 |= LIT64( 0x0001000000000000 );
6398     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6399     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6400     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6401     zSig2 |= ( zSig3 != 0 );
6402     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6403         shift128ExtraRightJamming(
6404             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6405         ++zExp;
6406     }
6407     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6408
6409 }
6410
6411 /*----------------------------------------------------------------------------
6412 | Returns the result of dividing the quadruple-precision floating-point value
6413 | `a' by the corresponding value `b'.  The operation is performed according to
6414 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415 *----------------------------------------------------------------------------*/
6416
6417 float128 float128_div(float128 a, float128 b, float_status *status)
6418 {
6419     flag aSign, bSign, zSign;
6420     int32_t aExp, bExp, zExp;
6421     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6422     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6423
6424     aSig1 = extractFloat128Frac1( a );
6425     aSig0 = extractFloat128Frac0( a );
6426     aExp = extractFloat128Exp( a );
6427     aSign = extractFloat128Sign( a );
6428     bSig1 = extractFloat128Frac1( b );
6429     bSig0 = extractFloat128Frac0( b );
6430     bExp = extractFloat128Exp( b );
6431     bSign = extractFloat128Sign( b );
6432     zSign = aSign ^ bSign;
6433     if ( aExp == 0x7FFF ) {
6434         if (aSig0 | aSig1) {
6435             return propagateFloat128NaN(a, b, status);
6436         }
6437         if ( bExp == 0x7FFF ) {
6438             if (bSig0 | bSig1) {
6439                 return propagateFloat128NaN(a, b, status);
6440             }
6441             goto invalid;
6442         }
6443         return packFloat128( zSign, 0x7FFF, 0, 0 );
6444     }
6445     if ( bExp == 0x7FFF ) {
6446         if (bSig0 | bSig1) {
6447             return propagateFloat128NaN(a, b, status);
6448         }
6449         return packFloat128( zSign, 0, 0, 0 );
6450     }
6451     if ( bExp == 0 ) {
6452         if ( ( bSig0 | bSig1 ) == 0 ) {
6453             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6454  invalid:
6455                 float_raise(float_flag_invalid, status);
6456                 return float128_default_nan(status);
6457             }
6458             float_raise(float_flag_divbyzero, status);
6459             return packFloat128( zSign, 0x7FFF, 0, 0 );
6460         }
6461         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6462     }
6463     if ( aExp == 0 ) {
6464         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6465         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6466     }
6467     zExp = aExp - bExp + 0x3FFD;
6468     shortShift128Left(
6469         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6470     shortShift128Left(
6471         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6472     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6473         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6474         ++zExp;
6475     }
6476     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6477     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6478     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6479     while ( (int64_t) rem0 < 0 ) {
6480         --zSig0;
6481         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6482     }
6483     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6484     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6485         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6486         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6487         while ( (int64_t) rem1 < 0 ) {
6488             --zSig1;
6489             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6490         }
6491         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6492     }
6493     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6494     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6495
6496 }
6497
6498 /*----------------------------------------------------------------------------
6499 | Returns the remainder of the quadruple-precision floating-point value `a'
6500 | with respect to the corresponding value `b'.  The operation is performed
6501 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503
6504 float128 float128_rem(float128 a, float128 b, float_status *status)
6505 {
6506     flag aSign, zSign;
6507     int32_t aExp, bExp, expDiff;
6508     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6509     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6510     int64_t sigMean0;
6511
6512     aSig1 = extractFloat128Frac1( a );
6513     aSig0 = extractFloat128Frac0( a );
6514     aExp = extractFloat128Exp( a );
6515     aSign = extractFloat128Sign( a );
6516     bSig1 = extractFloat128Frac1( b );
6517     bSig0 = extractFloat128Frac0( b );
6518     bExp = extractFloat128Exp( b );
6519     if ( aExp == 0x7FFF ) {
6520         if (    ( aSig0 | aSig1 )
6521              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522             return propagateFloat128NaN(a, b, status);
6523         }
6524         goto invalid;
6525     }
6526     if ( bExp == 0x7FFF ) {
6527         if (bSig0 | bSig1) {
6528             return propagateFloat128NaN(a, b, status);
6529         }
6530         return a;
6531     }
6532     if ( bExp == 0 ) {
6533         if ( ( bSig0 | bSig1 ) == 0 ) {
6534  invalid:
6535             float_raise(float_flag_invalid, status);
6536             return float128_default_nan(status);
6537         }
6538         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6539     }
6540     if ( aExp == 0 ) {
6541         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6542         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543     }
6544     expDiff = aExp - bExp;
6545     if ( expDiff < -1 ) return a;
6546     shortShift128Left(
6547         aSig0 | LIT64( 0x0001000000000000 ),
6548         aSig1,
6549         15 - ( expDiff < 0 ),
6550         &aSig0,
6551         &aSig1
6552     );
6553     shortShift128Left(
6554         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6555     q = le128( bSig0, bSig1, aSig0, aSig1 );
6556     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6557     expDiff -= 64;
6558     while ( 0 < expDiff ) {
6559         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6560         q = ( 4 < q ) ? q - 4 : 0;
6561         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6562         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6563         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6564         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6565         expDiff -= 61;
6566     }
6567     if ( -64 < expDiff ) {
6568         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6569         q = ( 4 < q ) ? q - 4 : 0;
6570         q >>= - expDiff;
6571         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6572         expDiff += 52;
6573         if ( expDiff < 0 ) {
6574             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6575         }
6576         else {
6577             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6578         }
6579         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6580         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6581     }
6582     else {
6583         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6584         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6585     }
6586     do {
6587         alternateASig0 = aSig0;
6588         alternateASig1 = aSig1;
6589         ++q;
6590         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6591     } while ( 0 <= (int64_t) aSig0 );
6592     add128(
6593         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6594     if (    ( sigMean0 < 0 )
6595          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6596         aSig0 = alternateASig0;
6597         aSig1 = alternateASig1;
6598     }
6599     zSign = ( (int64_t) aSig0 < 0 );
6600     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6601     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6602                                          status);
6603 }
6604
6605 /*----------------------------------------------------------------------------
6606 | Returns the square root of the quadruple-precision floating-point value `a'.
6607 | The operation is performed according to the IEC/IEEE Standard for Binary
6608 | Floating-Point Arithmetic.
6609 *----------------------------------------------------------------------------*/
6610
6611 float128 float128_sqrt(float128 a, float_status *status)
6612 {
6613     flag aSign;
6614     int32_t aExp, zExp;
6615     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6616     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6617
6618     aSig1 = extractFloat128Frac1( a );
6619     aSig0 = extractFloat128Frac0( a );
6620     aExp = extractFloat128Exp( a );
6621     aSign = extractFloat128Sign( a );
6622     if ( aExp == 0x7FFF ) {
6623         if (aSig0 | aSig1) {
6624             return propagateFloat128NaN(a, a, status);
6625         }
6626         if ( ! aSign ) return a;
6627         goto invalid;
6628     }
6629     if ( aSign ) {
6630         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6631  invalid:
6632         float_raise(float_flag_invalid, status);
6633         return float128_default_nan(status);
6634     }
6635     if ( aExp == 0 ) {
6636         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6637         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6638     }
6639     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6640     aSig0 |= LIT64( 0x0001000000000000 );
6641     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6642     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6643     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6644     doubleZSig0 = zSig0<<1;
6645     mul64To128( zSig0, zSig0, &term0, &term1 );
6646     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6647     while ( (int64_t) rem0 < 0 ) {
6648         --zSig0;
6649         doubleZSig0 -= 2;
6650         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6651     }
6652     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6653     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6654         if ( zSig1 == 0 ) zSig1 = 1;
6655         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6656         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6657         mul64To128( zSig1, zSig1, &term2, &term3 );
6658         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6659         while ( (int64_t) rem1 < 0 ) {
6660             --zSig1;
6661             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6662             term3 |= 1;
6663             term2 |= doubleZSig0;
6664             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6665         }
6666         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6667     }
6668     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6669     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6670
6671 }
6672
6673 /*----------------------------------------------------------------------------
6674 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6675 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6676 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6677 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6678 *----------------------------------------------------------------------------*/
6679
6680 int float128_eq(float128 a, float128 b, float_status *status)
6681 {
6682
6683     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6684               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6685          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6686               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6687        ) {
6688         float_raise(float_flag_invalid, status);
6689         return 0;
6690     }
6691     return
6692            ( a.low == b.low )
6693         && (    ( a.high == b.high )
6694              || (    ( a.low == 0 )
6695                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6696            );
6697
6698 }
6699
6700 /*----------------------------------------------------------------------------
6701 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6702 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6703 | exception is raised if either operand is a NaN.  The comparison is performed
6704 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6705 *----------------------------------------------------------------------------*/
6706
6707 int float128_le(float128 a, float128 b, float_status *status)
6708 {
6709     flag aSign, bSign;
6710
6711     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6712               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6713          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6714               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6715        ) {
6716         float_raise(float_flag_invalid, status);
6717         return 0;
6718     }
6719     aSign = extractFloat128Sign( a );
6720     bSign = extractFloat128Sign( b );
6721     if ( aSign != bSign ) {
6722         return
6723                aSign
6724             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6725                  == 0 );
6726     }
6727     return
6728           aSign ? le128( b.high, b.low, a.high, a.low )
6729         : le128( a.high, a.low, b.high, b.low );
6730
6731 }
6732
6733 /*----------------------------------------------------------------------------
6734 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6735 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6736 | raised if either operand is a NaN.  The comparison is performed according
6737 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6738 *----------------------------------------------------------------------------*/
6739
6740 int float128_lt(float128 a, float128 b, float_status *status)
6741 {
6742     flag aSign, bSign;
6743
6744     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6745               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6746          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6747               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6748        ) {
6749         float_raise(float_flag_invalid, status);
6750         return 0;
6751     }
6752     aSign = extractFloat128Sign( a );
6753     bSign = extractFloat128Sign( b );
6754     if ( aSign != bSign ) {
6755         return
6756                aSign
6757             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6758                  != 0 );
6759     }
6760     return
6761           aSign ? lt128( b.high, b.low, a.high, a.low )
6762         : lt128( a.high, a.low, b.high, b.low );
6763
6764 }
6765
6766 /*----------------------------------------------------------------------------
6767 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6768 | be compared, and 0 otherwise.  The invalid exception is raised if either
6769 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6770 | Standard for Binary Floating-Point Arithmetic.
6771 *----------------------------------------------------------------------------*/
6772
6773 int float128_unordered(float128 a, float128 b, float_status *status)
6774 {
6775     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6776               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6777          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6778               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6779        ) {
6780         float_raise(float_flag_invalid, status);
6781         return 1;
6782     }
6783     return 0;
6784 }
6785
6786 /*----------------------------------------------------------------------------
6787 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6788 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6789 | exception.  The comparison is performed according to the IEC/IEEE Standard
6790 | for Binary Floating-Point Arithmetic.
6791 *----------------------------------------------------------------------------*/
6792
6793 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6794 {
6795
6796     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6797               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6798          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6799               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6800        ) {
6801         if (float128_is_signaling_nan(a, status)
6802          || float128_is_signaling_nan(b, status)) {
6803             float_raise(float_flag_invalid, status);
6804         }
6805         return 0;
6806     }
6807     return
6808            ( a.low == b.low )
6809         && (    ( a.high == b.high )
6810              || (    ( a.low == 0 )
6811                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6812            );
6813
6814 }
6815
6816 /*----------------------------------------------------------------------------
6817 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6818 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6819 | cause an exception.  Otherwise, the comparison is performed according to the
6820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6821 *----------------------------------------------------------------------------*/
6822
6823 int float128_le_quiet(float128 a, float128 b, float_status *status)
6824 {
6825     flag aSign, bSign;
6826
6827     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6828               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6829          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6830               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6831        ) {
6832         if (float128_is_signaling_nan(a, status)
6833          || float128_is_signaling_nan(b, status)) {
6834             float_raise(float_flag_invalid, status);
6835         }
6836         return 0;
6837     }
6838     aSign = extractFloat128Sign( a );
6839     bSign = extractFloat128Sign( b );
6840     if ( aSign != bSign ) {
6841         return
6842                aSign
6843             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6844                  == 0 );
6845     }
6846     return
6847           aSign ? le128( b.high, b.low, a.high, a.low )
6848         : le128( a.high, a.low, b.high, b.low );
6849
6850 }
6851
6852 /*----------------------------------------------------------------------------
6853 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6854 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6855 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6856 | Standard for Binary Floating-Point Arithmetic.
6857 *----------------------------------------------------------------------------*/
6858
6859 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6860 {
6861     flag aSign, bSign;
6862
6863     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6864               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6865          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6866               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6867        ) {
6868         if (float128_is_signaling_nan(a, status)
6869          || float128_is_signaling_nan(b, status)) {
6870             float_raise(float_flag_invalid, status);
6871         }
6872         return 0;
6873     }
6874     aSign = extractFloat128Sign( a );
6875     bSign = extractFloat128Sign( b );
6876     if ( aSign != bSign ) {
6877         return
6878                aSign
6879             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6880                  != 0 );
6881     }
6882     return
6883           aSign ? lt128( b.high, b.low, a.high, a.low )
6884         : lt128( a.high, a.low, b.high, b.low );
6885
6886 }
6887
6888 /*----------------------------------------------------------------------------
6889 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6890 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6891 | comparison is performed according to the IEC/IEEE Standard for Binary
6892 | Floating-Point Arithmetic.
6893 *----------------------------------------------------------------------------*/
6894
6895 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6896 {
6897     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6898               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6899          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6900               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6901        ) {
6902         if (float128_is_signaling_nan(a, status)
6903          || float128_is_signaling_nan(b, status)) {
6904             float_raise(float_flag_invalid, status);
6905         }
6906         return 1;
6907     }
6908     return 0;
6909 }
6910
6911 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6912                                             int is_quiet, float_status *status)
6913 {
6914     flag aSign, bSign;
6915
6916     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6917         float_raise(float_flag_invalid, status);
6918         return float_relation_unordered;
6919     }
6920     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6921           ( extractFloatx80Frac( a )<<1 ) ) ||
6922         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6923           ( extractFloatx80Frac( b )<<1 ) )) {
6924         if (!is_quiet ||
6925             floatx80_is_signaling_nan(a, status) ||
6926             floatx80_is_signaling_nan(b, status)) {
6927             float_raise(float_flag_invalid, status);
6928         }
6929         return float_relation_unordered;
6930     }
6931     aSign = extractFloatx80Sign( a );
6932     bSign = extractFloatx80Sign( b );
6933     if ( aSign != bSign ) {
6934
6935         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6936              ( ( a.low | b.low ) == 0 ) ) {
6937             /* zero case */
6938             return float_relation_equal;
6939         } else {
6940             return 1 - (2 * aSign);
6941         }
6942     } else {
6943         if (a.low == b.low && a.high == b.high) {
6944             return float_relation_equal;
6945         } else {
6946             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6947         }
6948     }
6949 }
6950
6951 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6952 {
6953     return floatx80_compare_internal(a, b, 0, status);
6954 }
6955
6956 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6957 {
6958     return floatx80_compare_internal(a, b, 1, status);
6959 }
6960
6961 static inline int float128_compare_internal(float128 a, float128 b,
6962                                             int is_quiet, float_status *status)
6963 {
6964     flag aSign, bSign;
6965
6966     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6967           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6968         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6969           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6970         if (!is_quiet ||
6971             float128_is_signaling_nan(a, status) ||
6972             float128_is_signaling_nan(b, status)) {
6973             float_raise(float_flag_invalid, status);
6974         }
6975         return float_relation_unordered;
6976     }
6977     aSign = extractFloat128Sign( a );
6978     bSign = extractFloat128Sign( b );
6979     if ( aSign != bSign ) {
6980         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6981             /* zero case */
6982             return float_relation_equal;
6983         } else {
6984             return 1 - (2 * aSign);
6985         }
6986     } else {
6987         if (a.low == b.low && a.high == b.high) {
6988             return float_relation_equal;
6989         } else {
6990             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6991         }
6992     }
6993 }
6994
6995 int float128_compare(float128 a, float128 b, float_status *status)
6996 {
6997     return float128_compare_internal(a, b, 0, status);
6998 }
6999
7000 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7001 {
7002     return float128_compare_internal(a, b, 1, status);
7003 }
7004
7005 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7006 {
7007     flag aSign;
7008     int32_t aExp;
7009     uint64_t aSig;
7010
7011     if (floatx80_invalid_encoding(a)) {
7012         float_raise(float_flag_invalid, status);
7013         return floatx80_default_nan(status);
7014     }
7015     aSig = extractFloatx80Frac( a );
7016     aExp = extractFloatx80Exp( a );
7017     aSign = extractFloatx80Sign( a );
7018
7019     if ( aExp == 0x7FFF ) {
7020         if ( aSig<<1 ) {
7021             return propagateFloatx80NaN(a, a, status);
7022         }
7023         return a;
7024     }
7025
7026     if (aExp == 0) {
7027         if (aSig == 0) {
7028             return a;
7029         }
7030         aExp++;
7031     }
7032
7033     if (n > 0x10000) {
7034         n = 0x10000;
7035     } else if (n < -0x10000) {
7036         n = -0x10000;
7037     }
7038
7039     aExp += n;
7040     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7041                                          aSign, aExp, aSig, 0, status);
7042 }
7043
7044 float128 float128_scalbn(float128 a, int n, float_status *status)
7045 {
7046     flag aSign;
7047     int32_t aExp;
7048     uint64_t aSig0, aSig1;
7049
7050     aSig1 = extractFloat128Frac1( a );
7051     aSig0 = extractFloat128Frac0( a );
7052     aExp = extractFloat128Exp( a );
7053     aSign = extractFloat128Sign( a );
7054     if ( aExp == 0x7FFF ) {
7055         if ( aSig0 | aSig1 ) {
7056             return propagateFloat128NaN(a, a, status);
7057         }
7058         return a;
7059     }
7060     if (aExp != 0) {
7061         aSig0 |= LIT64( 0x0001000000000000 );
7062     } else if (aSig0 == 0 && aSig1 == 0) {
7063         return a;
7064     } else {
7065         aExp++;
7066     }
7067
7068     if (n > 0x10000) {
7069         n = 0x10000;
7070     } else if (n < -0x10000) {
7071         n = -0x10000;
7072     }
7073
7074     aExp += n - 1;
7075     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7076                                          , status);
7077
7078 }