4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
102 static inline uint32_t extractFloat16Frac(float16 a
)
104 return float16_val(a
) & 0x3ff;
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
111 static inline int extractFloat16Exp(float16 a
)
113 return (float16_val(a
) >> 10) & 0x1f;
116 /*----------------------------------------------------------------------------
117 | Returns the sign bit of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
120 static inline flag
extractFloat16Sign(float16 a
)
122 return float16_val(a
)>>15;
125 /*----------------------------------------------------------------------------
126 | Returns the fraction bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
129 static inline uint32_t extractFloat32Frac(float32 a
)
131 return float32_val(a
) & 0x007FFFFF;
134 /*----------------------------------------------------------------------------
135 | Returns the exponent bits of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
138 static inline int extractFloat32Exp(float32 a
)
140 return (float32_val(a
) >> 23) & 0xFF;
143 /*----------------------------------------------------------------------------
144 | Returns the sign bit of the single-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
147 static inline flag
extractFloat32Sign(float32 a
)
149 return float32_val(a
) >> 31;
152 /*----------------------------------------------------------------------------
153 | Returns the fraction bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
156 static inline uint64_t extractFloat64Frac(float64 a
)
158 return float64_val(a
) & LIT64(0x000FFFFFFFFFFFFF);
161 /*----------------------------------------------------------------------------
162 | Returns the exponent bits of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
165 static inline int extractFloat64Exp(float64 a
)
167 return (float64_val(a
) >> 52) & 0x7FF;
170 /*----------------------------------------------------------------------------
171 | Returns the sign bit of the double-precision floating-point value `a'.
172 *----------------------------------------------------------------------------*/
174 static inline flag
extractFloat64Sign(float64 a
)
176 return float64_val(a
) >> 63;
180 * Classify a floating point number. Everything above float_class_qnan
181 * is a NaN so cls >= float_class_qnan is any NaN.
184 typedef enum __attribute__ ((__packed__
)) {
185 float_class_unclassified
,
189 float_class_qnan
, /* all NaNs from here */
192 float_class_msnan
, /* maybe silenced */
196 * Structure holding all of the decomposed parts of a float. The
197 * exponent is unbiased and the fraction is normalized. All
198 * calculations are done with a 64 bit fraction and then rounded as
199 * appropriate for the final format.
201 * Thanks to the packed FloatClass a decent compiler should be able to
202 * fit the whole structure into registers and avoid using the stack
203 * for parameter passing.
213 #define DECOMPOSED_BINARY_POINT (64 - 2)
214 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
215 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
217 /* Structure holding all of the relevant parameters for a format.
218 * exp_size: the size of the exponent field
219 * exp_bias: the offset applied to the exponent field
220 * exp_max: the maximum normalised exponent
221 * frac_size: the size of the fraction field
222 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
223 * The following are computed based the size of fraction
224 * frac_lsb: least significant bit of fraction
225 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
226 * round_mask/roundeven_mask: masks used for rounding
237 uint64_t roundeven_mask
;
240 /* Expand fields based on the size of exponent and fraction */
241 #define FLOAT_PARAMS(E, F) \
243 .exp_bias = ((1 << E) - 1) >> 1, \
244 .exp_max = (1 << E) - 1, \
246 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
247 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
248 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
249 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
250 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
252 static const FloatFmt float16_params
= {
256 static const FloatFmt float32_params
= {
260 static const FloatFmt float64_params
= {
264 /* Unpack a float to parts, but do not canonicalize. */
265 static inline FloatParts
unpack_raw(FloatFmt fmt
, uint64_t raw
)
267 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
269 return (FloatParts
) {
270 .cls
= float_class_unclassified
,
271 .sign
= extract64(raw
, sign_pos
, 1),
272 .exp
= extract64(raw
, fmt
.frac_size
, fmt
.exp_size
),
273 .frac
= extract64(raw
, 0, fmt
.frac_size
),
277 static inline FloatParts
float16_unpack_raw(float16 f
)
279 return unpack_raw(float16_params
, f
);
282 static inline FloatParts
float32_unpack_raw(float32 f
)
284 return unpack_raw(float32_params
, f
);
287 static inline FloatParts
float64_unpack_raw(float64 f
)
289 return unpack_raw(float64_params
, f
);
292 /* Pack a float from parts, but do not canonicalize. */
293 static inline uint64_t pack_raw(FloatFmt fmt
, FloatParts p
)
295 const int sign_pos
= fmt
.frac_size
+ fmt
.exp_size
;
296 uint64_t ret
= deposit64(p
.frac
, fmt
.frac_size
, fmt
.exp_size
, p
.exp
);
297 return deposit64(ret
, sign_pos
, 1, p
.sign
);
300 static inline float16
float16_pack_raw(FloatParts p
)
302 return make_float16(pack_raw(float16_params
, p
));
305 static inline float32
float32_pack_raw(FloatParts p
)
307 return make_float32(pack_raw(float32_params
, p
));
310 static inline float64
float64_pack_raw(FloatParts p
)
312 return make_float64(pack_raw(float64_params
, p
));
315 /*----------------------------------------------------------------------------
316 | Functions and definitions to determine: (1) whether tininess for underflow
317 | is detected before or after rounding by default, (2) what (if anything)
318 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
319 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
320 | are propagated from function inputs to output. These details are target-
322 *----------------------------------------------------------------------------*/
323 #include "softfloat-specialize.h"
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts
canonicalize(FloatParts part
, const FloatFmt
*parm
,
327 float_status
*status
)
329 if (part
.exp
== parm
->exp_max
) {
330 if (part
.frac
== 0) {
331 part
.cls
= float_class_inf
;
333 part
.frac
<<= parm
->frac_shift
;
334 part
.cls
= (parts_is_snan_frac(part
.frac
, status
)
335 ? float_class_snan
: float_class_qnan
);
337 } else if (part
.exp
== 0) {
338 if (likely(part
.frac
== 0)) {
339 part
.cls
= float_class_zero
;
340 } else if (status
->flush_inputs_to_zero
) {
341 float_raise(float_flag_input_denormal
, status
);
342 part
.cls
= float_class_zero
;
345 int shift
= clz64(part
.frac
) - 1;
346 part
.cls
= float_class_normal
;
347 part
.exp
= parm
->frac_shift
- parm
->exp_bias
- shift
+ 1;
351 part
.cls
= float_class_normal
;
352 part
.exp
-= parm
->exp_bias
;
353 part
.frac
= DECOMPOSED_IMPLICIT_BIT
+ (part
.frac
<< parm
->frac_shift
);
358 /* Round and uncanonicalize a floating-point number by parts. There
359 * are FRAC_SHIFT bits that may require rounding at the bottom of the
360 * fraction; these bits will be removed. The exponent will be biased
361 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
364 static FloatParts
round_canonical(FloatParts p
, float_status
*s
,
365 const FloatFmt
*parm
)
367 const uint64_t frac_lsbm1
= parm
->frac_lsbm1
;
368 const uint64_t round_mask
= parm
->round_mask
;
369 const uint64_t roundeven_mask
= parm
->roundeven_mask
;
370 const int exp_max
= parm
->exp_max
;
371 const int frac_shift
= parm
->frac_shift
;
380 case float_class_normal
:
381 switch (s
->float_rounding_mode
) {
382 case float_round_nearest_even
:
383 overflow_norm
= false;
384 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
386 case float_round_ties_away
:
387 overflow_norm
= false;
390 case float_round_to_zero
:
391 overflow_norm
= true;
395 inc
= p
.sign
? 0 : round_mask
;
396 overflow_norm
= p
.sign
;
398 case float_round_down
:
399 inc
= p
.sign
? round_mask
: 0;
400 overflow_norm
= !p
.sign
;
403 g_assert_not_reached();
406 exp
+= parm
->exp_bias
;
407 if (likely(exp
> 0)) {
408 if (frac
& round_mask
) {
409 flags
|= float_flag_inexact
;
411 if (frac
& DECOMPOSED_OVERFLOW_BIT
) {
418 if (unlikely(exp
>= exp_max
)) {
419 flags
|= float_flag_overflow
| float_flag_inexact
;
424 p
.cls
= float_class_inf
;
428 } else if (s
->flush_to_zero
) {
429 flags
|= float_flag_output_denormal
;
430 p
.cls
= float_class_zero
;
433 bool is_tiny
= (s
->float_detect_tininess
434 == float_tininess_before_rounding
)
436 || !((frac
+ inc
) & DECOMPOSED_OVERFLOW_BIT
);
438 shift64RightJamming(frac
, 1 - exp
, &frac
);
439 if (frac
& round_mask
) {
440 /* Need to recompute round-to-even. */
441 if (s
->float_rounding_mode
== float_round_nearest_even
) {
442 inc
= ((frac
& roundeven_mask
) != frac_lsbm1
445 flags
|= float_flag_inexact
;
449 exp
= (frac
& DECOMPOSED_IMPLICIT_BIT
? 1 : 0);
452 if (is_tiny
&& (flags
& float_flag_inexact
)) {
453 flags
|= float_flag_underflow
;
455 if (exp
== 0 && frac
== 0) {
456 p
.cls
= float_class_zero
;
461 case float_class_zero
:
467 case float_class_inf
:
473 case float_class_qnan
:
474 case float_class_snan
:
476 frac
>>= parm
->frac_shift
;
480 g_assert_not_reached();
483 float_raise(flags
, s
);
489 static FloatParts
float16_unpack_canonical(float16 f
, float_status
*s
)
491 return canonicalize(float16_unpack_raw(f
), &float16_params
, s
);
494 static float16
float16_round_pack_canonical(FloatParts p
, float_status
*s
)
497 case float_class_dnan
:
498 return float16_default_nan(s
);
499 case float_class_msnan
:
500 p
.frac
>>= float16_params
.frac_shift
;
501 return float16_maybe_silence_nan(float16_pack_raw(p
), s
);
503 p
= round_canonical(p
, s
, &float16_params
);
504 return float16_pack_raw(p
);
508 static FloatParts
float32_unpack_canonical(float32 f
, float_status
*s
)
510 return canonicalize(float32_unpack_raw(f
), &float32_params
, s
);
513 static float32
float32_round_pack_canonical(FloatParts p
, float_status
*s
)
516 case float_class_dnan
:
517 return float32_default_nan(s
);
518 case float_class_msnan
:
519 p
.frac
>>= float32_params
.frac_shift
;
520 return float32_maybe_silence_nan(float32_pack_raw(p
), s
);
522 p
= round_canonical(p
, s
, &float32_params
);
523 return float32_pack_raw(p
);
527 static FloatParts
float64_unpack_canonical(float64 f
, float_status
*s
)
529 return canonicalize(float64_unpack_raw(f
), &float64_params
, s
);
532 static float64
float64_round_pack_canonical(FloatParts p
, float_status
*s
)
535 case float_class_dnan
:
536 return float64_default_nan(s
);
537 case float_class_msnan
:
538 p
.frac
>>= float64_params
.frac_shift
;
539 return float64_maybe_silence_nan(float64_pack_raw(p
), s
);
541 p
= round_canonical(p
, s
, &float64_params
);
542 return float64_pack_raw(p
);
546 /* Simple helpers for checking if what NaN we have */
547 static bool is_nan(FloatClass c
)
549 return unlikely(c
>= float_class_qnan
);
551 static bool is_snan(FloatClass c
)
553 return c
== float_class_snan
;
555 static bool is_qnan(FloatClass c
)
557 return c
== float_class_qnan
;
560 static FloatParts
return_nan(FloatParts a
, float_status
*s
)
563 case float_class_snan
:
564 s
->float_exception_flags
|= float_flag_invalid
;
565 a
.cls
= float_class_msnan
;
567 case float_class_qnan
:
568 if (s
->default_nan_mode
) {
569 a
.cls
= float_class_dnan
;
574 g_assert_not_reached();
579 static FloatParts
pick_nan(FloatParts a
, FloatParts b
, float_status
*s
)
581 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
582 s
->float_exception_flags
|= float_flag_invalid
;
585 if (s
->default_nan_mode
) {
586 a
.cls
= float_class_dnan
;
588 if (pickNaN(is_qnan(a
.cls
), is_snan(a
.cls
),
589 is_qnan(b
.cls
), is_snan(b
.cls
),
591 (a
.frac
== b
.frac
&& a
.sign
< b
.sign
))) {
594 a
.cls
= float_class_msnan
;
599 static FloatParts
pick_nan_muladd(FloatParts a
, FloatParts b
, FloatParts c
,
600 bool inf_zero
, float_status
*s
)
604 if (is_snan(a
.cls
) || is_snan(b
.cls
) || is_snan(c
.cls
)) {
605 s
->float_exception_flags
|= float_flag_invalid
;
608 which
= pickNaNMulAdd(is_qnan(a
.cls
), is_snan(a
.cls
),
609 is_qnan(b
.cls
), is_snan(b
.cls
),
610 is_qnan(c
.cls
), is_snan(c
.cls
),
613 if (s
->default_nan_mode
) {
614 /* Note that this check is after pickNaNMulAdd so that function
615 * has an opportunity to set the Invalid flag.
617 a
.cls
= float_class_dnan
;
631 a
.cls
= float_class_dnan
;
634 g_assert_not_reached();
636 a
.cls
= float_class_msnan
;
642 * Returns the result of adding or subtracting the values of the
643 * floating-point values `a' and `b'. The operation is performed
644 * according to the IEC/IEEE Standard for Binary Floating-Point
648 static FloatParts
addsub_floats(FloatParts a
, FloatParts b
, bool subtract
,
651 bool a_sign
= a
.sign
;
652 bool b_sign
= b
.sign
^ subtract
;
654 if (a_sign
!= b_sign
) {
657 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
658 if (a
.exp
> b
.exp
|| (a
.exp
== b
.exp
&& a
.frac
>= b
.frac
)) {
659 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
660 a
.frac
= a
.frac
- b
.frac
;
662 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
663 a
.frac
= b
.frac
- a
.frac
;
669 a
.cls
= float_class_zero
;
670 a
.sign
= s
->float_rounding_mode
== float_round_down
;
672 int shift
= clz64(a
.frac
) - 1;
673 a
.frac
= a
.frac
<< shift
;
674 a
.exp
= a
.exp
- shift
;
679 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
680 return pick_nan(a
, b
, s
);
682 if (a
.cls
== float_class_inf
) {
683 if (b
.cls
== float_class_inf
) {
684 float_raise(float_flag_invalid
, s
);
685 a
.cls
= float_class_dnan
;
689 if (a
.cls
== float_class_zero
&& b
.cls
== float_class_zero
) {
690 a
.sign
= s
->float_rounding_mode
== float_round_down
;
693 if (a
.cls
== float_class_zero
|| b
.cls
== float_class_inf
) {
697 if (b
.cls
== float_class_zero
) {
702 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
704 shift64RightJamming(b
.frac
, a
.exp
- b
.exp
, &b
.frac
);
705 } else if (a
.exp
< b
.exp
) {
706 shift64RightJamming(a
.frac
, b
.exp
- a
.exp
, &a
.frac
);
710 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
716 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
717 return pick_nan(a
, b
, s
);
719 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
722 if (b
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
727 g_assert_not_reached();
731 * Returns the result of adding or subtracting the floating-point
732 * values `a' and `b'. The operation is performed according to the
733 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
736 float16
__attribute__((flatten
)) float16_add(float16 a
, float16 b
,
737 float_status
*status
)
739 FloatParts pa
= float16_unpack_canonical(a
, status
);
740 FloatParts pb
= float16_unpack_canonical(b
, status
);
741 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
743 return float16_round_pack_canonical(pr
, status
);
746 float32
__attribute__((flatten
)) float32_add(float32 a
, float32 b
,
747 float_status
*status
)
749 FloatParts pa
= float32_unpack_canonical(a
, status
);
750 FloatParts pb
= float32_unpack_canonical(b
, status
);
751 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
753 return float32_round_pack_canonical(pr
, status
);
756 float64
__attribute__((flatten
)) float64_add(float64 a
, float64 b
,
757 float_status
*status
)
759 FloatParts pa
= float64_unpack_canonical(a
, status
);
760 FloatParts pb
= float64_unpack_canonical(b
, status
);
761 FloatParts pr
= addsub_floats(pa
, pb
, false, status
);
763 return float64_round_pack_canonical(pr
, status
);
766 float16
__attribute__((flatten
)) float16_sub(float16 a
, float16 b
,
767 float_status
*status
)
769 FloatParts pa
= float16_unpack_canonical(a
, status
);
770 FloatParts pb
= float16_unpack_canonical(b
, status
);
771 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
773 return float16_round_pack_canonical(pr
, status
);
776 float32
__attribute__((flatten
)) float32_sub(float32 a
, float32 b
,
777 float_status
*status
)
779 FloatParts pa
= float32_unpack_canonical(a
, status
);
780 FloatParts pb
= float32_unpack_canonical(b
, status
);
781 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
783 return float32_round_pack_canonical(pr
, status
);
786 float64
__attribute__((flatten
)) float64_sub(float64 a
, float64 b
,
787 float_status
*status
)
789 FloatParts pa
= float64_unpack_canonical(a
, status
);
790 FloatParts pb
= float64_unpack_canonical(b
, status
);
791 FloatParts pr
= addsub_floats(pa
, pb
, true, status
);
793 return float64_round_pack_canonical(pr
, status
);
797 * Returns the result of multiplying the floating-point values `a' and
798 * `b'. The operation is performed according to the IEC/IEEE Standard
799 * for Binary Floating-Point Arithmetic.
802 static FloatParts
mul_floats(FloatParts a
, FloatParts b
, float_status
*s
)
804 bool sign
= a
.sign
^ b
.sign
;
806 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
808 int exp
= a
.exp
+ b
.exp
;
810 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
811 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
812 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
813 shift64RightJamming(lo
, 1, &lo
);
823 /* handle all the NaN cases */
824 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
825 return pick_nan(a
, b
, s
);
827 /* Inf * Zero == NaN */
828 if ((a
.cls
== float_class_inf
&& b
.cls
== float_class_zero
) ||
829 (a
.cls
== float_class_zero
&& b
.cls
== float_class_inf
)) {
830 s
->float_exception_flags
|= float_flag_invalid
;
831 a
.cls
= float_class_dnan
;
835 /* Multiply by 0 or Inf */
836 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
840 if (b
.cls
== float_class_inf
|| b
.cls
== float_class_zero
) {
844 g_assert_not_reached();
847 float16
__attribute__((flatten
)) float16_mul(float16 a
, float16 b
,
848 float_status
*status
)
850 FloatParts pa
= float16_unpack_canonical(a
, status
);
851 FloatParts pb
= float16_unpack_canonical(b
, status
);
852 FloatParts pr
= mul_floats(pa
, pb
, status
);
854 return float16_round_pack_canonical(pr
, status
);
857 float32
__attribute__((flatten
)) float32_mul(float32 a
, float32 b
,
858 float_status
*status
)
860 FloatParts pa
= float32_unpack_canonical(a
, status
);
861 FloatParts pb
= float32_unpack_canonical(b
, status
);
862 FloatParts pr
= mul_floats(pa
, pb
, status
);
864 return float32_round_pack_canonical(pr
, status
);
867 float64
__attribute__((flatten
)) float64_mul(float64 a
, float64 b
,
868 float_status
*status
)
870 FloatParts pa
= float64_unpack_canonical(a
, status
);
871 FloatParts pb
= float64_unpack_canonical(b
, status
);
872 FloatParts pr
= mul_floats(pa
, pb
, status
);
874 return float64_round_pack_canonical(pr
, status
);
878 * Returns the result of multiplying the floating-point values `a' and
879 * `b' then adding 'c', with no intermediate rounding step after the
880 * multiplication. The operation is performed according to the
881 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
882 * The flags argument allows the caller to select negation of the
883 * addend, the intermediate product, or the final result. (The
884 * difference between this and having the caller do a separate
885 * negation is that negating externally will flip the sign bit on
889 static FloatParts
muladd_floats(FloatParts a
, FloatParts b
, FloatParts c
,
890 int flags
, float_status
*s
)
892 bool inf_zero
= ((1 << a
.cls
) | (1 << b
.cls
)) ==
893 ((1 << float_class_inf
) | (1 << float_class_zero
));
895 bool sign_flip
= flags
& float_muladd_negate_result
;
900 /* It is implementation-defined whether the cases of (0,inf,qnan)
901 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
902 * they return if they do), so we have to hand this information
903 * off to the target-specific pick-a-NaN routine.
905 if (is_nan(a
.cls
) || is_nan(b
.cls
) || is_nan(c
.cls
)) {
906 return pick_nan_muladd(a
, b
, c
, inf_zero
, s
);
910 s
->float_exception_flags
|= float_flag_invalid
;
911 a
.cls
= float_class_dnan
;
915 if (flags
& float_muladd_negate_c
) {
919 p_sign
= a
.sign
^ b
.sign
;
921 if (flags
& float_muladd_negate_product
) {
925 if (a
.cls
== float_class_inf
|| b
.cls
== float_class_inf
) {
926 p_class
= float_class_inf
;
927 } else if (a
.cls
== float_class_zero
|| b
.cls
== float_class_zero
) {
928 p_class
= float_class_zero
;
930 p_class
= float_class_normal
;
933 if (c
.cls
== float_class_inf
) {
934 if (p_class
== float_class_inf
&& p_sign
!= c
.sign
) {
935 s
->float_exception_flags
|= float_flag_invalid
;
936 a
.cls
= float_class_dnan
;
938 a
.cls
= float_class_inf
;
939 a
.sign
= c
.sign
^ sign_flip
;
944 if (p_class
== float_class_inf
) {
945 a
.cls
= float_class_inf
;
946 a
.sign
= p_sign
^ sign_flip
;
950 if (p_class
== float_class_zero
) {
951 if (c
.cls
== float_class_zero
) {
952 if (p_sign
!= c
.sign
) {
953 p_sign
= s
->float_rounding_mode
== float_round_down
;
956 } else if (flags
& float_muladd_halve_result
) {
963 /* a & b should be normals now... */
964 assert(a
.cls
== float_class_normal
&&
965 b
.cls
== float_class_normal
);
967 p_exp
= a
.exp
+ b
.exp
;
969 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
972 mul64To128(a
.frac
, b
.frac
, &hi
, &lo
);
973 /* binary point now at bit 124 */
975 /* check for overflow */
976 if (hi
& (1ULL << (DECOMPOSED_BINARY_POINT
* 2 + 1 - 64))) {
977 shift128RightJamming(hi
, lo
, 1, &hi
, &lo
);
982 if (c
.cls
== float_class_zero
) {
983 /* move binary point back to 62 */
984 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
986 int exp_diff
= p_exp
- c
.exp
;
987 if (p_sign
== c
.sign
) {
990 shift128RightJamming(hi
, lo
,
991 DECOMPOSED_BINARY_POINT
- exp_diff
,
997 /* shift c to the same binary point as the product (124) */
1000 shift128RightJamming(c_hi
, c_lo
,
1003 add128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1004 /* move binary point back to 62 */
1005 shift128RightJamming(hi
, lo
, DECOMPOSED_BINARY_POINT
, &hi
, &lo
);
1008 if (lo
& DECOMPOSED_OVERFLOW_BIT
) {
1009 shift64RightJamming(lo
, 1, &lo
);
1015 uint64_t c_hi
, c_lo
;
1016 /* make C binary point match product at bit 124 */
1020 if (exp_diff
<= 0) {
1021 shift128RightJamming(hi
, lo
, -exp_diff
, &hi
, &lo
);
1024 (hi
> c_hi
|| (hi
== c_hi
&& lo
>= c_lo
))) {
1025 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1027 sub128(c_hi
, c_lo
, hi
, lo
, &hi
, &lo
);
1032 shift128RightJamming(c_hi
, c_lo
,
1035 sub128(hi
, lo
, c_hi
, c_lo
, &hi
, &lo
);
1038 if (hi
== 0 && lo
== 0) {
1039 a
.cls
= float_class_zero
;
1040 a
.sign
= s
->float_rounding_mode
== float_round_down
;
1041 a
.sign
^= sign_flip
;
1048 shift
= clz64(lo
) + 64;
1050 /* Normalizing to a binary point of 124 is the
1051 correct adjust for the exponent. However since we're
1052 shifting, we might as well put the binary point back
1053 at 62 where we really want it. Therefore shift as
1054 if we're leaving 1 bit at the top of the word, but
1055 adjust the exponent as if we're leaving 3 bits. */
1058 lo
= lo
<< (shift
- 64);
1060 hi
= (hi
<< shift
) | (lo
>> (64 - shift
));
1061 lo
= hi
| ((lo
<< shift
) != 0);
1068 if (flags
& float_muladd_halve_result
) {
1072 /* finally prepare our result */
1073 a
.cls
= float_class_normal
;
1074 a
.sign
= p_sign
^ sign_flip
;
1081 float16
__attribute__((flatten
)) float16_muladd(float16 a
, float16 b
, float16 c
,
1082 int flags
, float_status
*status
)
1084 FloatParts pa
= float16_unpack_canonical(a
, status
);
1085 FloatParts pb
= float16_unpack_canonical(b
, status
);
1086 FloatParts pc
= float16_unpack_canonical(c
, status
);
1087 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1089 return float16_round_pack_canonical(pr
, status
);
1092 float32
__attribute__((flatten
)) float32_muladd(float32 a
, float32 b
, float32 c
,
1093 int flags
, float_status
*status
)
1095 FloatParts pa
= float32_unpack_canonical(a
, status
);
1096 FloatParts pb
= float32_unpack_canonical(b
, status
);
1097 FloatParts pc
= float32_unpack_canonical(c
, status
);
1098 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1100 return float32_round_pack_canonical(pr
, status
);
1103 float64
__attribute__((flatten
)) float64_muladd(float64 a
, float64 b
, float64 c
,
1104 int flags
, float_status
*status
)
1106 FloatParts pa
= float64_unpack_canonical(a
, status
);
1107 FloatParts pb
= float64_unpack_canonical(b
, status
);
1108 FloatParts pc
= float64_unpack_canonical(c
, status
);
1109 FloatParts pr
= muladd_floats(pa
, pb
, pc
, flags
, status
);
1111 return float64_round_pack_canonical(pr
, status
);
1115 * Returns the result of dividing the floating-point value `a' by the
1116 * corresponding value `b'. The operation is performed according to
1117 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1120 static FloatParts
div_floats(FloatParts a
, FloatParts b
, float_status
*s
)
1122 bool sign
= a
.sign
^ b
.sign
;
1124 if (a
.cls
== float_class_normal
&& b
.cls
== float_class_normal
) {
1125 uint64_t temp_lo
, temp_hi
;
1126 int exp
= a
.exp
- b
.exp
;
1127 if (a
.frac
< b
.frac
) {
1129 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
+ 1,
1130 &temp_hi
, &temp_lo
);
1132 shortShift128Left(0, a
.frac
, DECOMPOSED_BINARY_POINT
,
1133 &temp_hi
, &temp_lo
);
1135 /* LSB of quot is set if inexact which roundandpack will use
1136 * to set flags. Yet again we re-use a for the result */
1137 a
.frac
= div128To64(temp_lo
, temp_hi
, b
.frac
);
1142 /* handle all the NaN cases */
1143 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1144 return pick_nan(a
, b
, s
);
1146 /* 0/0 or Inf/Inf */
1149 (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
)) {
1150 s
->float_exception_flags
|= float_flag_invalid
;
1151 a
.cls
= float_class_dnan
;
1154 /* Inf / x or 0 / x */
1155 if (a
.cls
== float_class_inf
|| a
.cls
== float_class_zero
) {
1160 if (b
.cls
== float_class_zero
) {
1161 s
->float_exception_flags
|= float_flag_divbyzero
;
1162 a
.cls
= float_class_inf
;
1167 if (b
.cls
== float_class_inf
) {
1168 a
.cls
= float_class_zero
;
1172 g_assert_not_reached();
1175 float16
float16_div(float16 a
, float16 b
, float_status
*status
)
1177 FloatParts pa
= float16_unpack_canonical(a
, status
);
1178 FloatParts pb
= float16_unpack_canonical(b
, status
);
1179 FloatParts pr
= div_floats(pa
, pb
, status
);
1181 return float16_round_pack_canonical(pr
, status
);
1184 float32
float32_div(float32 a
, float32 b
, float_status
*status
)
1186 FloatParts pa
= float32_unpack_canonical(a
, status
);
1187 FloatParts pb
= float32_unpack_canonical(b
, status
);
1188 FloatParts pr
= div_floats(pa
, pb
, status
);
1190 return float32_round_pack_canonical(pr
, status
);
1193 float64
float64_div(float64 a
, float64 b
, float_status
*status
)
1195 FloatParts pa
= float64_unpack_canonical(a
, status
);
1196 FloatParts pb
= float64_unpack_canonical(b
, status
);
1197 FloatParts pr
= div_floats(pa
, pb
, status
);
1199 return float64_round_pack_canonical(pr
, status
);
1203 * Rounds the floating-point value `a' to an integer, and returns the
1204 * result as a floating-point value. The operation is performed
1205 * according to the IEC/IEEE Standard for Binary Floating-Point
1209 static FloatParts
round_to_int(FloatParts a
, int rounding_mode
, float_status
*s
)
1211 if (is_nan(a
.cls
)) {
1212 return return_nan(a
, s
);
1216 case float_class_zero
:
1217 case float_class_inf
:
1218 case float_class_qnan
:
1219 /* already "integral" */
1221 case float_class_normal
:
1222 if (a
.exp
>= DECOMPOSED_BINARY_POINT
) {
1223 /* already integral */
1228 /* all fractional */
1229 s
->float_exception_flags
|= float_flag_inexact
;
1230 switch (rounding_mode
) {
1231 case float_round_nearest_even
:
1232 one
= a
.exp
== -1 && a
.frac
> DECOMPOSED_IMPLICIT_BIT
;
1234 case float_round_ties_away
:
1235 one
= a
.exp
== -1 && a
.frac
>= DECOMPOSED_IMPLICIT_BIT
;
1237 case float_round_to_zero
:
1240 case float_round_up
:
1243 case float_round_down
:
1247 g_assert_not_reached();
1251 a
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1254 a
.cls
= float_class_zero
;
1257 uint64_t frac_lsb
= DECOMPOSED_IMPLICIT_BIT
>> a
.exp
;
1258 uint64_t frac_lsbm1
= frac_lsb
>> 1;
1259 uint64_t rnd_even_mask
= (frac_lsb
- 1) | frac_lsb
;
1260 uint64_t rnd_mask
= rnd_even_mask
>> 1;
1263 switch (rounding_mode
) {
1264 case float_round_nearest_even
:
1265 inc
= ((a
.frac
& rnd_even_mask
) != frac_lsbm1
? frac_lsbm1
: 0);
1267 case float_round_ties_away
:
1270 case float_round_to_zero
:
1273 case float_round_up
:
1274 inc
= a
.sign
? 0 : rnd_mask
;
1276 case float_round_down
:
1277 inc
= a
.sign
? rnd_mask
: 0;
1280 g_assert_not_reached();
1283 if (a
.frac
& rnd_mask
) {
1284 s
->float_exception_flags
|= float_flag_inexact
;
1286 a
.frac
&= ~rnd_mask
;
1287 if (a
.frac
& DECOMPOSED_OVERFLOW_BIT
) {
1295 g_assert_not_reached();
1300 float16
float16_round_to_int(float16 a
, float_status
*s
)
1302 FloatParts pa
= float16_unpack_canonical(a
, s
);
1303 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1304 return float16_round_pack_canonical(pr
, s
);
1307 float32
float32_round_to_int(float32 a
, float_status
*s
)
1309 FloatParts pa
= float32_unpack_canonical(a
, s
);
1310 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1311 return float32_round_pack_canonical(pr
, s
);
1314 float64
float64_round_to_int(float64 a
, float_status
*s
)
1316 FloatParts pa
= float64_unpack_canonical(a
, s
);
1317 FloatParts pr
= round_to_int(pa
, s
->float_rounding_mode
, s
);
1318 return float64_round_pack_canonical(pr
, s
);
1321 float64
float64_trunc_to_int(float64 a
, float_status
*s
)
1323 FloatParts pa
= float64_unpack_canonical(a
, s
);
1324 FloatParts pr
= round_to_int(pa
, float_round_to_zero
, s
);
1325 return float64_round_pack_canonical(pr
, s
);
1329 * Returns the result of converting the floating-point value `a' to
1330 * the two's complement integer format. The conversion is performed
1331 * according to the IEC/IEEE Standard for Binary Floating-Point
1332 * Arithmetic---which means in particular that the conversion is
1333 * rounded according to the current rounding mode. If `a' is a NaN,
1334 * the largest positive integer is returned. Otherwise, if the
1335 * conversion overflows, the largest integer with the same sign as `a'
1339 static int64_t round_to_int_and_pack(FloatParts in
, int rmode
,
1340 int64_t min
, int64_t max
,
1344 int orig_flags
= get_float_exception_flags(s
);
1345 FloatParts p
= round_to_int(in
, rmode
, s
);
1348 case float_class_snan
:
1349 case float_class_qnan
:
1350 case float_class_dnan
:
1351 case float_class_msnan
:
1352 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1354 case float_class_inf
:
1355 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1356 return p
.sign
? min
: max
;
1357 case float_class_zero
:
1359 case float_class_normal
:
1360 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1361 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1362 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1363 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1368 if (r
<= -(uint64_t) min
) {
1371 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1378 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1383 g_assert_not_reached();
1387 #define FLOAT_TO_INT(fsz, isz) \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, s->float_rounding_mode, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1398 (float ## fsz a, float_status *s) \
1400 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1401 return round_to_int_and_pack(p, float_round_to_zero, \
1402 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1406 FLOAT_TO_INT(16, 16)
1407 FLOAT_TO_INT(16, 32)
1408 FLOAT_TO_INT(16, 64)
1410 FLOAT_TO_INT(32, 16)
1411 FLOAT_TO_INT(32, 32)
1412 FLOAT_TO_INT(32, 64)
1414 FLOAT_TO_INT(64, 16)
1415 FLOAT_TO_INT(64, 32)
1416 FLOAT_TO_INT(64, 64)
1421 * Returns the result of converting the floating-point value `a' to
1422 * the unsigned integer format. The conversion is performed according
1423 * to the IEC/IEEE Standard for Binary Floating-Point
1424 * Arithmetic---which means in particular that the conversion is
1425 * rounded according to the current rounding mode. If `a' is a NaN,
1426 * the largest unsigned integer is returned. Otherwise, if the
1427 * conversion overflows, the largest unsigned integer is returned. If
1428 * the 'a' is negative, the result is rounded and zero is returned;
1429 * values that do not round to zero will raise the inexact exception
1433 static uint64_t round_to_uint_and_pack(FloatParts in
, int rmode
, uint64_t max
,
1436 int orig_flags
= get_float_exception_flags(s
);
1437 FloatParts p
= round_to_int(in
, rmode
, s
);
1440 case float_class_snan
:
1441 case float_class_qnan
:
1442 case float_class_dnan
:
1443 case float_class_msnan
:
1444 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1446 case float_class_inf
:
1447 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1448 return p
.sign
? 0 : max
;
1449 case float_class_zero
:
1451 case float_class_normal
:
1455 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1459 if (p
.exp
< DECOMPOSED_BINARY_POINT
) {
1460 r
= p
.frac
>> (DECOMPOSED_BINARY_POINT
- p
.exp
);
1461 } else if (p
.exp
- DECOMPOSED_BINARY_POINT
< 2) {
1462 r
= p
.frac
<< (p
.exp
- DECOMPOSED_BINARY_POINT
);
1464 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1468 /* For uint64 this will never trip, but if p.exp is too large
1469 * to shift a decomposed fraction we shall have exited via the
1473 s
->float_exception_flags
= orig_flags
| float_flag_invalid
;
1480 g_assert_not_reached();
1484 #define FLOAT_TO_UINT(fsz, isz) \
1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1488 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1489 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1490 UINT ## isz ## _MAX, s); \
1493 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1494 (float ## fsz a, float_status *s) \
1496 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1497 return round_to_uint_and_pack(p, float_round_to_zero, \
1498 UINT ## isz ## _MAX, s); \
1501 FLOAT_TO_UINT(16, 16)
1502 FLOAT_TO_UINT(16, 32)
1503 FLOAT_TO_UINT(16, 64)
1505 FLOAT_TO_UINT(32, 16)
1506 FLOAT_TO_UINT(32, 32)
1507 FLOAT_TO_UINT(32, 64)
1509 FLOAT_TO_UINT(64, 16)
1510 FLOAT_TO_UINT(64, 32)
1511 FLOAT_TO_UINT(64, 64)
1513 #undef FLOAT_TO_UINT
1516 * Integer to float conversions
1518 * Returns the result of converting the two's complement integer `a'
1519 * to the floating-point format. The conversion is performed according
1520 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1523 static FloatParts
int_to_float(int64_t a
, float_status
*status
)
1527 r
.cls
= float_class_zero
;
1529 } else if (a
== (1ULL << 63)) {
1530 r
.cls
= float_class_normal
;
1532 r
.frac
= DECOMPOSED_IMPLICIT_BIT
;
1543 int shift
= clz64(f
) - 1;
1544 r
.cls
= float_class_normal
;
1545 r
.exp
= (DECOMPOSED_BINARY_POINT
- shift
);
1546 r
.frac
= f
<< shift
;
1552 float16
int64_to_float16(int64_t a
, float_status
*status
)
1554 FloatParts pa
= int_to_float(a
, status
);
1555 return float16_round_pack_canonical(pa
, status
);
1558 float16
int32_to_float16(int32_t a
, float_status
*status
)
1560 return int64_to_float16(a
, status
);
1563 float16
int16_to_float16(int16_t a
, float_status
*status
)
1565 return int64_to_float16(a
, status
);
1568 float32
int64_to_float32(int64_t a
, float_status
*status
)
1570 FloatParts pa
= int_to_float(a
, status
);
1571 return float32_round_pack_canonical(pa
, status
);
1574 float32
int32_to_float32(int32_t a
, float_status
*status
)
1576 return int64_to_float32(a
, status
);
1579 float32
int16_to_float32(int16_t a
, float_status
*status
)
1581 return int64_to_float32(a
, status
);
1584 float64
int64_to_float64(int64_t a
, float_status
*status
)
1586 FloatParts pa
= int_to_float(a
, status
);
1587 return float64_round_pack_canonical(pa
, status
);
1590 float64
int32_to_float64(int32_t a
, float_status
*status
)
1592 return int64_to_float64(a
, status
);
1595 float64
int16_to_float64(int16_t a
, float_status
*status
)
1597 return int64_to_float64(a
, status
);
1602 * Unsigned Integer to float conversions
1604 * Returns the result of converting the unsigned integer `a' to the
1605 * floating-point format. The conversion is performed according to the
1606 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1609 static FloatParts
uint_to_float(uint64_t a
, float_status
*status
)
1611 FloatParts r
= { .sign
= false};
1614 r
.cls
= float_class_zero
;
1616 int spare_bits
= clz64(a
) - 1;
1617 r
.cls
= float_class_normal
;
1618 r
.exp
= DECOMPOSED_BINARY_POINT
- spare_bits
;
1619 if (spare_bits
< 0) {
1620 shift64RightJamming(a
, -spare_bits
, &a
);
1623 r
.frac
= a
<< spare_bits
;
1630 float16
uint64_to_float16(uint64_t a
, float_status
*status
)
1632 FloatParts pa
= uint_to_float(a
, status
);
1633 return float16_round_pack_canonical(pa
, status
);
1636 float16
uint32_to_float16(uint32_t a
, float_status
*status
)
1638 return uint64_to_float16(a
, status
);
1641 float16
uint16_to_float16(uint16_t a
, float_status
*status
)
1643 return uint64_to_float16(a
, status
);
1646 float32
uint64_to_float32(uint64_t a
, float_status
*status
)
1648 FloatParts pa
= uint_to_float(a
, status
);
1649 return float32_round_pack_canonical(pa
, status
);
1652 float32
uint32_to_float32(uint32_t a
, float_status
*status
)
1654 return uint64_to_float32(a
, status
);
1657 float32
uint16_to_float32(uint16_t a
, float_status
*status
)
1659 return uint64_to_float32(a
, status
);
1662 float64
uint64_to_float64(uint64_t a
, float_status
*status
)
1664 FloatParts pa
= uint_to_float(a
, status
);
1665 return float64_round_pack_canonical(pa
, status
);
1668 float64
uint32_to_float64(uint32_t a
, float_status
*status
)
1670 return uint64_to_float64(a
, status
);
1673 float64
uint16_to_float64(uint16_t a
, float_status
*status
)
1675 return uint64_to_float64(a
, status
);
1679 /* min() and max() functions. These can't be implemented as
1680 * 'compare and pick one input' because that would mishandle
1681 * NaNs and +0 vs -0.
1683 * minnum() and maxnum() functions. These are similar to the min()
1684 * and max() functions but if one of the arguments is a QNaN and
1685 * the other is numerical then the numerical argument is returned.
1686 * SNaNs will get quietened before being returned.
1687 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1688 * and maxNum() operations. min() and max() are the typical min/max
1689 * semantics provided by many CPUs which predate that specification.
1691 * minnummag() and maxnummag() functions correspond to minNumMag()
1692 * and minNumMag() from the IEEE-754 2008.
1694 static FloatParts
minmax_floats(FloatParts a
, FloatParts b
, bool ismin
,
1695 bool ieee
, bool ismag
, float_status
*s
)
1697 if (unlikely(is_nan(a
.cls
) || is_nan(b
.cls
))) {
1699 /* Takes two floating-point values `a' and `b', one of
1700 * which is a NaN, and returns the appropriate NaN
1701 * result. If either `a' or `b' is a signaling NaN,
1702 * the invalid exception is raised.
1704 if (is_snan(a
.cls
) || is_snan(b
.cls
)) {
1705 return pick_nan(a
, b
, s
);
1706 } else if (is_nan(a
.cls
) && !is_nan(b
.cls
)) {
1708 } else if (is_nan(b
.cls
) && !is_nan(a
.cls
)) {
1712 return pick_nan(a
, b
, s
);
1717 case float_class_normal
:
1720 case float_class_inf
:
1723 case float_class_zero
:
1727 g_assert_not_reached();
1731 case float_class_normal
:
1734 case float_class_inf
:
1737 case float_class_zero
:
1741 g_assert_not_reached();
1745 if (ismag
&& (a_exp
!= b_exp
|| a
.frac
!= b
.frac
)) {
1746 bool a_less
= a_exp
< b_exp
;
1747 if (a_exp
== b_exp
) {
1748 a_less
= a
.frac
< b
.frac
;
1750 return a_less
^ ismin
? b
: a
;
1753 if (a
.sign
== b
.sign
) {
1754 bool a_less
= a_exp
< b_exp
;
1755 if (a_exp
== b_exp
) {
1756 a_less
= a
.frac
< b
.frac
;
1758 return a
.sign
^ a_less
^ ismin
? b
: a
;
1760 return a
.sign
^ ismin
? b
: a
;
1765 #define MINMAX(sz, name, ismin, isiee, ismag) \
1766 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1769 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1770 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1771 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1773 return float ## sz ## _round_pack_canonical(pr, s); \
1776 MINMAX(16, min
, true, false, false)
1777 MINMAX(16, minnum
, true, true, false)
1778 MINMAX(16, minnummag
, true, true, true)
1779 MINMAX(16, max
, false, false, false)
1780 MINMAX(16, maxnum
, false, true, false)
1781 MINMAX(16, maxnummag
, false, true, true)
1783 MINMAX(32, min
, true, false, false)
1784 MINMAX(32, minnum
, true, true, false)
1785 MINMAX(32, minnummag
, true, true, true)
1786 MINMAX(32, max
, false, false, false)
1787 MINMAX(32, maxnum
, false, true, false)
1788 MINMAX(32, maxnummag
, false, true, true)
1790 MINMAX(64, min
, true, false, false)
1791 MINMAX(64, minnum
, true, true, false)
1792 MINMAX(64, minnummag
, true, true, true)
1793 MINMAX(64, max
, false, false, false)
1794 MINMAX(64, maxnum
, false, true, false)
1795 MINMAX(64, maxnummag
, false, true, true)
1799 /* Floating point compare */
1800 static int compare_floats(FloatParts a
, FloatParts b
, bool is_quiet
,
1803 if (is_nan(a
.cls
) || is_nan(b
.cls
)) {
1805 a
.cls
== float_class_snan
||
1806 b
.cls
== float_class_snan
) {
1807 s
->float_exception_flags
|= float_flag_invalid
;
1809 return float_relation_unordered
;
1812 if (a
.cls
== float_class_zero
) {
1813 if (b
.cls
== float_class_zero
) {
1814 return float_relation_equal
;
1816 return b
.sign
? float_relation_greater
: float_relation_less
;
1817 } else if (b
.cls
== float_class_zero
) {
1818 return a
.sign
? float_relation_less
: float_relation_greater
;
1821 /* The only really important thing about infinity is its sign. If
1822 * both are infinities the sign marks the smallest of the two.
1824 if (a
.cls
== float_class_inf
) {
1825 if ((b
.cls
== float_class_inf
) && (a
.sign
== b
.sign
)) {
1826 return float_relation_equal
;
1828 return a
.sign
? float_relation_less
: float_relation_greater
;
1829 } else if (b
.cls
== float_class_inf
) {
1830 return b
.sign
? float_relation_greater
: float_relation_less
;
1833 if (a
.sign
!= b
.sign
) {
1834 return a
.sign
? float_relation_less
: float_relation_greater
;
1837 if (a
.exp
== b
.exp
) {
1838 if (a
.frac
== b
.frac
) {
1839 return float_relation_equal
;
1842 return a
.frac
> b
.frac
?
1843 float_relation_less
: float_relation_greater
;
1845 return a
.frac
> b
.frac
?
1846 float_relation_greater
: float_relation_less
;
1850 return a
.exp
> b
.exp
? float_relation_less
: float_relation_greater
;
1852 return a
.exp
> b
.exp
? float_relation_greater
: float_relation_less
;
1857 #define COMPARE(sz) \
1858 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1861 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1862 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1863 return compare_floats(pa, pb, false, s); \
1865 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1868 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1869 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1870 return compare_floats(pa, pb, true, s); \
1879 /* Multiply A by 2 raised to the power N. */
1880 static FloatParts
scalbn_decomposed(FloatParts a
, int n
, float_status
*s
)
1882 if (unlikely(is_nan(a
.cls
))) {
1883 return return_nan(a
, s
);
1885 if (a
.cls
== float_class_normal
) {
1886 /* The largest float type (even though not supported by FloatParts)
1887 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1888 * still allows rounding to infinity, without allowing overflow
1889 * within the int32_t that backs FloatParts.exp.
1891 n
= MIN(MAX(n
, -0x10000), 0x10000);
1897 float16
float16_scalbn(float16 a
, int n
, float_status
*status
)
1899 FloatParts pa
= float16_unpack_canonical(a
, status
);
1900 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1901 return float16_round_pack_canonical(pr
, status
);
1904 float32
float32_scalbn(float32 a
, int n
, float_status
*status
)
1906 FloatParts pa
= float32_unpack_canonical(a
, status
);
1907 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1908 return float32_round_pack_canonical(pr
, status
);
1911 float64
float64_scalbn(float64 a
, int n
, float_status
*status
)
1913 FloatParts pa
= float64_unpack_canonical(a
, status
);
1914 FloatParts pr
= scalbn_decomposed(pa
, n
, status
);
1915 return float64_round_pack_canonical(pr
, status
);
1921 * The old softfloat code did an approximation step before zeroing in
1922 * on the final result. However for simpleness we just compute the
1923 * square root by iterating down from the implicit bit to enough extra
1924 * bits to ensure we get a correctly rounded result.
1926 * This does mean however the calculation is slower than before,
1927 * especially for 64 bit floats.
1930 static FloatParts
sqrt_float(FloatParts a
, float_status
*s
, const FloatFmt
*p
)
1932 uint64_t a_frac
, r_frac
, s_frac
;
1935 if (is_nan(a
.cls
)) {
1936 return return_nan(a
, s
);
1938 if (a
.cls
== float_class_zero
) {
1939 return a
; /* sqrt(+-0) = +-0 */
1942 s
->float_exception_flags
|= float_flag_invalid
;
1943 a
.cls
= float_class_dnan
;
1946 if (a
.cls
== float_class_inf
) {
1947 return a
; /* sqrt(+inf) = +inf */
1950 assert(a
.cls
== float_class_normal
);
1952 /* We need two overflow bits at the top. Adding room for that is a
1953 * right shift. If the exponent is odd, we can discard the low bit
1954 * by multiplying the fraction by 2; that's a left shift. Combine
1955 * those and we shift right if the exponent is even.
1963 /* Bit-by-bit computation of sqrt. */
1967 /* Iterate from implicit bit down to the 3 extra bits to compute a
1968 * properly rounded result. Remember we've inserted one more bit
1969 * at the top, so these positions are one less.
1971 bit
= DECOMPOSED_BINARY_POINT
- 1;
1972 last_bit
= MAX(p
->frac_shift
- 4, 0);
1974 uint64_t q
= 1ULL << bit
;
1975 uint64_t t_frac
= s_frac
+ q
;
1976 if (t_frac
<= a_frac
) {
1977 s_frac
= t_frac
+ q
;
1982 } while (--bit
>= last_bit
);
1984 /* Undo the right shift done above. If there is any remaining
1985 * fraction, the result is inexact. Set the sticky bit.
1987 a
.frac
= (r_frac
<< 1) + (a_frac
!= 0);
1992 float16
__attribute__((flatten
)) float16_sqrt(float16 a
, float_status
*status
)
1994 FloatParts pa
= float16_unpack_canonical(a
, status
);
1995 FloatParts pr
= sqrt_float(pa
, status
, &float16_params
);
1996 return float16_round_pack_canonical(pr
, status
);
1999 float32
__attribute__((flatten
)) float32_sqrt(float32 a
, float_status
*status
)
2001 FloatParts pa
= float32_unpack_canonical(a
, status
);
2002 FloatParts pr
= sqrt_float(pa
, status
, &float32_params
);
2003 return float32_round_pack_canonical(pr
, status
);
2006 float64
__attribute__((flatten
)) float64_sqrt(float64 a
, float_status
*status
)
2008 FloatParts pa
= float64_unpack_canonical(a
, status
);
2009 FloatParts pr
= sqrt_float(pa
, status
, &float64_params
);
2010 return float64_round_pack_canonical(pr
, status
);
2014 /*----------------------------------------------------------------------------
2015 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2016 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2017 | input. If `zSign' is 1, the input is negated before being converted to an
2018 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2019 | is simply rounded to an integer, with the inexact exception raised if the
2020 | input cannot be represented exactly as an integer. However, if the fixed-
2021 | point input is too large, the invalid exception is raised and the largest
2022 | positive or negative integer is returned.
2023 *----------------------------------------------------------------------------*/
2025 static int32_t roundAndPackInt32(flag zSign
, uint64_t absZ
, float_status
*status
)
2027 int8_t roundingMode
;
2028 flag roundNearestEven
;
2029 int8_t roundIncrement
, roundBits
;
2032 roundingMode
= status
->float_rounding_mode
;
2033 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2034 switch (roundingMode
) {
2035 case float_round_nearest_even
:
2036 case float_round_ties_away
:
2037 roundIncrement
= 0x40;
2039 case float_round_to_zero
:
2042 case float_round_up
:
2043 roundIncrement
= zSign
? 0 : 0x7f;
2045 case float_round_down
:
2046 roundIncrement
= zSign
? 0x7f : 0;
2051 roundBits
= absZ
& 0x7F;
2052 absZ
= ( absZ
+ roundIncrement
)>>7;
2053 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2055 if ( zSign
) z
= - z
;
2056 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
2057 float_raise(float_flag_invalid
, status
);
2058 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2061 status
->float_exception_flags
|= float_flag_inexact
;
2067 /*----------------------------------------------------------------------------
2068 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2069 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2070 | and returns the properly rounded 64-bit integer corresponding to the input.
2071 | If `zSign' is 1, the input is negated before being converted to an integer.
2072 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2073 | the inexact exception raised if the input cannot be represented exactly as
2074 | an integer. However, if the fixed-point input is too large, the invalid
2075 | exception is raised and the largest positive or negative integer is
2077 *----------------------------------------------------------------------------*/
2079 static int64_t roundAndPackInt64(flag zSign
, uint64_t absZ0
, uint64_t absZ1
,
2080 float_status
*status
)
2082 int8_t roundingMode
;
2083 flag roundNearestEven
, increment
;
2086 roundingMode
= status
->float_rounding_mode
;
2087 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2088 switch (roundingMode
) {
2089 case float_round_nearest_even
:
2090 case float_round_ties_away
:
2091 increment
= ((int64_t) absZ1
< 0);
2093 case float_round_to_zero
:
2096 case float_round_up
:
2097 increment
= !zSign
&& absZ1
;
2099 case float_round_down
:
2100 increment
= zSign
&& absZ1
;
2107 if ( absZ0
== 0 ) goto overflow
;
2108 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
2111 if ( zSign
) z
= - z
;
2112 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
2114 float_raise(float_flag_invalid
, status
);
2116 zSign
? (int64_t) LIT64( 0x8000000000000000 )
2117 : LIT64( 0x7FFFFFFFFFFFFFFF );
2120 status
->float_exception_flags
|= float_flag_inexact
;
2126 /*----------------------------------------------------------------------------
2127 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2128 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2129 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2130 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2131 | with the inexact exception raised if the input cannot be represented exactly
2132 | as an integer. However, if the fixed-point input is too large, the invalid
2133 | exception is raised and the largest unsigned integer is returned.
2134 *----------------------------------------------------------------------------*/
2136 static int64_t roundAndPackUint64(flag zSign
, uint64_t absZ0
,
2137 uint64_t absZ1
, float_status
*status
)
2139 int8_t roundingMode
;
2140 flag roundNearestEven
, increment
;
2142 roundingMode
= status
->float_rounding_mode
;
2143 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
2144 switch (roundingMode
) {
2145 case float_round_nearest_even
:
2146 case float_round_ties_away
:
2147 increment
= ((int64_t)absZ1
< 0);
2149 case float_round_to_zero
:
2152 case float_round_up
:
2153 increment
= !zSign
&& absZ1
;
2155 case float_round_down
:
2156 increment
= zSign
&& absZ1
;
2164 float_raise(float_flag_invalid
, status
);
2165 return LIT64(0xFFFFFFFFFFFFFFFF);
2167 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
2170 if (zSign
&& absZ0
) {
2171 float_raise(float_flag_invalid
, status
);
2176 status
->float_exception_flags
|= float_flag_inexact
;
2181 /*----------------------------------------------------------------------------
2182 | If `a' is denormal and we are in flush-to-zero mode then set the
2183 | input-denormal exception and return zero. Otherwise just return the value.
2184 *----------------------------------------------------------------------------*/
2185 float32
float32_squash_input_denormal(float32 a
, float_status
*status
)
2187 if (status
->flush_inputs_to_zero
) {
2188 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
2189 float_raise(float_flag_input_denormal
, status
);
2190 return make_float32(float32_val(a
) & 0x80000000);
2196 /*----------------------------------------------------------------------------
2197 | Normalizes the subnormal single-precision floating-point value represented
2198 | by the denormalized significand `aSig'. The normalized exponent and
2199 | significand are stored at the locations pointed to by `zExpPtr' and
2200 | `zSigPtr', respectively.
2201 *----------------------------------------------------------------------------*/
2204 normalizeFloat32Subnormal(uint32_t aSig
, int *zExpPtr
, uint32_t *zSigPtr
)
2208 shiftCount
= countLeadingZeros32( aSig
) - 8;
2209 *zSigPtr
= aSig
<<shiftCount
;
2210 *zExpPtr
= 1 - shiftCount
;
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input. Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly. However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned. If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 | The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location. This shifted
2228 | significand must be normalized or smaller. If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding. In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2236 static float32
roundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2237 float_status
*status
)
2239 int8_t roundingMode
;
2240 flag roundNearestEven
;
2241 int8_t roundIncrement
, roundBits
;
2244 roundingMode
= status
->float_rounding_mode
;
2245 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2246 switch (roundingMode
) {
2247 case float_round_nearest_even
:
2248 case float_round_ties_away
:
2249 roundIncrement
= 0x40;
2251 case float_round_to_zero
:
2254 case float_round_up
:
2255 roundIncrement
= zSign
? 0 : 0x7f;
2257 case float_round_down
:
2258 roundIncrement
= zSign
? 0x7f : 0;
2264 roundBits
= zSig
& 0x7F;
2265 if ( 0xFD <= (uint16_t) zExp
) {
2266 if ( ( 0xFD < zExp
)
2267 || ( ( zExp
== 0xFD )
2268 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
2270 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2271 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
2274 if (status
->flush_to_zero
) {
2275 float_raise(float_flag_output_denormal
, status
);
2276 return packFloat32(zSign
, 0, 0);
2279 (status
->float_detect_tininess
2280 == float_tininess_before_rounding
)
2282 || ( zSig
+ roundIncrement
< 0x80000000 );
2283 shift32RightJamming( zSig
, - zExp
, &zSig
);
2285 roundBits
= zSig
& 0x7F;
2286 if (isTiny
&& roundBits
) {
2287 float_raise(float_flag_underflow
, status
);
2292 status
->float_exception_flags
|= float_flag_inexact
;
2294 zSig
= ( zSig
+ roundIncrement
)>>7;
2295 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
2296 if ( zSig
== 0 ) zExp
= 0;
2297 return packFloat32( zSign
, zExp
, zSig
);
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input. This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2311 normalizeRoundAndPackFloat32(flag zSign
, int zExp
, uint32_t zSig
,
2312 float_status
*status
)
2316 shiftCount
= countLeadingZeros32( zSig
) - 1;
2317 return roundAndPackFloat32(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64
float64_squash_input_denormal(float64 a
, float_status
*status
)
2328 if (status
->flush_inputs_to_zero
) {
2329 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
2330 float_raise(float_flag_input_denormal
, status
);
2331 return make_float64(float64_val(a
) & (1ULL << 63));
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'. The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2345 normalizeFloat64Subnormal(uint64_t aSig
, int *zExpPtr
, uint64_t *zSigPtr
)
2349 shiftCount
= countLeadingZeros64( aSig
) - 11;
2350 *zSigPtr
= aSig
<<shiftCount
;
2351 *zExpPtr
= 1 - shiftCount
;
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result. After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result. This means that any integer portion of `zSig'
2360 | will be added into the exponent. Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2364 *----------------------------------------------------------------------------*/
2366 static inline float64
packFloat64(flag zSign
, int zExp
, uint64_t zSig
)
2369 return make_float64(
2370 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input. Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly. However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned. If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 | The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location. This shifted
2388 | significand must be normalized or smaller. If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding. In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2396 static float64
roundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2397 float_status
*status
)
2399 int8_t roundingMode
;
2400 flag roundNearestEven
;
2401 int roundIncrement
, roundBits
;
2404 roundingMode
= status
->float_rounding_mode
;
2405 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2406 switch (roundingMode
) {
2407 case float_round_nearest_even
:
2408 case float_round_ties_away
:
2409 roundIncrement
= 0x200;
2411 case float_round_to_zero
:
2414 case float_round_up
:
2415 roundIncrement
= zSign
? 0 : 0x3ff;
2417 case float_round_down
:
2418 roundIncrement
= zSign
? 0x3ff : 0;
2420 case float_round_to_odd
:
2421 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2426 roundBits
= zSig
& 0x3FF;
2427 if ( 0x7FD <= (uint16_t) zExp
) {
2428 if ( ( 0x7FD < zExp
)
2429 || ( ( zExp
== 0x7FD )
2430 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
2432 bool overflow_to_inf
= roundingMode
!= float_round_to_odd
&&
2433 roundIncrement
!= 0;
2434 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2435 return packFloat64(zSign
, 0x7FF, -(!overflow_to_inf
));
2438 if (status
->flush_to_zero
) {
2439 float_raise(float_flag_output_denormal
, status
);
2440 return packFloat64(zSign
, 0, 0);
2443 (status
->float_detect_tininess
2444 == float_tininess_before_rounding
)
2446 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
2447 shift64RightJamming( zSig
, - zExp
, &zSig
);
2449 roundBits
= zSig
& 0x3FF;
2450 if (isTiny
&& roundBits
) {
2451 float_raise(float_flag_underflow
, status
);
2453 if (roundingMode
== float_round_to_odd
) {
2455 * For round-to-odd case, the roundIncrement depends on
2456 * zSig which just changed.
2458 roundIncrement
= (zSig
& 0x400) ? 0 : 0x3ff;
2463 status
->float_exception_flags
|= float_flag_inexact
;
2465 zSig
= ( zSig
+ roundIncrement
)>>10;
2466 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
2467 if ( zSig
== 0 ) zExp
= 0;
2468 return packFloat64( zSign
, zExp
, zSig
);
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input. This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2482 normalizeRoundAndPackFloat64(flag zSign
, int zExp
, uint64_t zSig
,
2483 float_status
*status
)
2487 shiftCount
= countLeadingZeros64( zSig
) - 1;
2488 return roundAndPackFloat64(zSign
, zExp
- shiftCount
, zSig
<<shiftCount
,
2493 /*----------------------------------------------------------------------------
2494 | Normalizes the subnormal extended double-precision floating-point value
2495 | represented by the denormalized significand `aSig'. The normalized exponent
2496 | and significand are stored at the locations pointed to by `zExpPtr' and
2497 | `zSigPtr', respectively.
2498 *----------------------------------------------------------------------------*/
2500 void normalizeFloatx80Subnormal(uint64_t aSig
, int32_t *zExpPtr
,
2505 shiftCount
= countLeadingZeros64( aSig
);
2506 *zSigPtr
= aSig
<<shiftCount
;
2507 *zExpPtr
= 1 - shiftCount
;
2510 /*----------------------------------------------------------------------------
2511 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2512 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2513 | and returns the proper extended double-precision floating-point value
2514 | corresponding to the abstract input. Ordinarily, the abstract value is
2515 | rounded and packed into the extended double-precision format, with the
2516 | inexact exception raised if the abstract input cannot be represented
2517 | exactly. However, if the abstract value is too large, the overflow and
2518 | inexact exceptions are raised and an infinity or maximal finite value is
2519 | returned. If the abstract value is too small, the input value is rounded to
2520 | a subnormal number, and the underflow and inexact exceptions are raised if
2521 | the abstract input cannot be represented exactly as a subnormal extended
2522 | double-precision floating-point number.
2523 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2524 | number of bits as single or double precision, respectively. Otherwise, the
2525 | result is rounded to the full precision of the extended double-precision
2527 | The input significand must be normalized or smaller. If the input
2528 | significand is not normalized, `zExp' must be 0; in that case, the result
2529 | returned is a subnormal number, and it must not require rounding. The
2530 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2531 | Floating-Point Arithmetic.
2532 *----------------------------------------------------------------------------*/
2534 floatx80
roundAndPackFloatx80(int8_t roundingPrecision
, flag zSign
,
2535 int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
,
2536 float_status
*status
)
2538 int8_t roundingMode
;
2539 flag roundNearestEven
, increment
, isTiny
;
2540 int64_t roundIncrement
, roundMask
, roundBits
;
2542 roundingMode
= status
->float_rounding_mode
;
2543 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2544 if ( roundingPrecision
== 80 ) goto precision80
;
2545 if ( roundingPrecision
== 64 ) {
2546 roundIncrement
= LIT64( 0x0000000000000400 );
2547 roundMask
= LIT64( 0x00000000000007FF );
2549 else if ( roundingPrecision
== 32 ) {
2550 roundIncrement
= LIT64( 0x0000008000000000 );
2551 roundMask
= LIT64( 0x000000FFFFFFFFFF );
2556 zSig0
|= ( zSig1
!= 0 );
2557 switch (roundingMode
) {
2558 case float_round_nearest_even
:
2559 case float_round_ties_away
:
2561 case float_round_to_zero
:
2564 case float_round_up
:
2565 roundIncrement
= zSign
? 0 : roundMask
;
2567 case float_round_down
:
2568 roundIncrement
= zSign
? roundMask
: 0;
2573 roundBits
= zSig0
& roundMask
;
2574 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2575 if ( ( 0x7FFE < zExp
)
2576 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
2581 if (status
->flush_to_zero
) {
2582 float_raise(float_flag_output_denormal
, status
);
2583 return packFloatx80(zSign
, 0, 0);
2586 (status
->float_detect_tininess
2587 == float_tininess_before_rounding
)
2589 || ( zSig0
<= zSig0
+ roundIncrement
);
2590 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
2592 roundBits
= zSig0
& roundMask
;
2593 if (isTiny
&& roundBits
) {
2594 float_raise(float_flag_underflow
, status
);
2597 status
->float_exception_flags
|= float_flag_inexact
;
2599 zSig0
+= roundIncrement
;
2600 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2601 roundIncrement
= roundMask
+ 1;
2602 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2603 roundMask
|= roundIncrement
;
2605 zSig0
&= ~ roundMask
;
2606 return packFloatx80( zSign
, zExp
, zSig0
);
2610 status
->float_exception_flags
|= float_flag_inexact
;
2612 zSig0
+= roundIncrement
;
2613 if ( zSig0
< roundIncrement
) {
2615 zSig0
= LIT64( 0x8000000000000000 );
2617 roundIncrement
= roundMask
+ 1;
2618 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
2619 roundMask
|= roundIncrement
;
2621 zSig0
&= ~ roundMask
;
2622 if ( zSig0
== 0 ) zExp
= 0;
2623 return packFloatx80( zSign
, zExp
, zSig0
);
2625 switch (roundingMode
) {
2626 case float_round_nearest_even
:
2627 case float_round_ties_away
:
2628 increment
= ((int64_t)zSig1
< 0);
2630 case float_round_to_zero
:
2633 case float_round_up
:
2634 increment
= !zSign
&& zSig1
;
2636 case float_round_down
:
2637 increment
= zSign
&& zSig1
;
2642 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
2643 if ( ( 0x7FFE < zExp
)
2644 || ( ( zExp
== 0x7FFE )
2645 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
2651 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2652 if ( ( roundingMode
== float_round_to_zero
)
2653 || ( zSign
&& ( roundingMode
== float_round_up
) )
2654 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2656 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
2658 return packFloatx80(zSign
,
2659 floatx80_infinity_high
,
2660 floatx80_infinity_low
);
2664 (status
->float_detect_tininess
2665 == float_tininess_before_rounding
)
2668 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
2669 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
2671 if (isTiny
&& zSig1
) {
2672 float_raise(float_flag_underflow
, status
);
2675 status
->float_exception_flags
|= float_flag_inexact
;
2677 switch (roundingMode
) {
2678 case float_round_nearest_even
:
2679 case float_round_ties_away
:
2680 increment
= ((int64_t)zSig1
< 0);
2682 case float_round_to_zero
:
2685 case float_round_up
:
2686 increment
= !zSign
&& zSig1
;
2688 case float_round_down
:
2689 increment
= zSign
&& zSig1
;
2697 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2698 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
2700 return packFloatx80( zSign
, zExp
, zSig0
);
2704 status
->float_exception_flags
|= float_flag_inexact
;
2710 zSig0
= LIT64( 0x8000000000000000 );
2713 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
2717 if ( zSig0
== 0 ) zExp
= 0;
2719 return packFloatx80( zSign
, zExp
, zSig0
);
2723 /*----------------------------------------------------------------------------
2724 | Takes an abstract floating-point value having sign `zSign', exponent
2725 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2726 | and returns the proper extended double-precision floating-point value
2727 | corresponding to the abstract input. This routine is just like
2728 | `roundAndPackFloatx80' except that the input significand does not have to be
2730 *----------------------------------------------------------------------------*/
2732 floatx80
normalizeRoundAndPackFloatx80(int8_t roundingPrecision
,
2733 flag zSign
, int32_t zExp
,
2734 uint64_t zSig0
, uint64_t zSig1
,
2735 float_status
*status
)
2744 shiftCount
= countLeadingZeros64( zSig0
);
2745 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
2747 return roundAndPackFloatx80(roundingPrecision
, zSign
, zExp
,
2748 zSig0
, zSig1
, status
);
2752 /*----------------------------------------------------------------------------
2753 | Returns the least-significant 64 fraction bits of the quadruple-precision
2754 | floating-point value `a'.
2755 *----------------------------------------------------------------------------*/
2757 static inline uint64_t extractFloat128Frac1( float128 a
)
2764 /*----------------------------------------------------------------------------
2765 | Returns the most-significant 48 fraction bits of the quadruple-precision
2766 | floating-point value `a'.
2767 *----------------------------------------------------------------------------*/
2769 static inline uint64_t extractFloat128Frac0( float128 a
)
2772 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
2776 /*----------------------------------------------------------------------------
2777 | Returns the exponent bits of the quadruple-precision floating-point value
2779 *----------------------------------------------------------------------------*/
2781 static inline int32_t extractFloat128Exp( float128 a
)
2784 return ( a
.high
>>48 ) & 0x7FFF;
2788 /*----------------------------------------------------------------------------
2789 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2790 *----------------------------------------------------------------------------*/
2792 static inline flag
extractFloat128Sign( float128 a
)
2799 /*----------------------------------------------------------------------------
2800 | Normalizes the subnormal quadruple-precision floating-point value
2801 | represented by the denormalized significand formed by the concatenation of
2802 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2803 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2804 | significand are stored at the location pointed to by `zSig0Ptr', and the
2805 | least significant 64 bits of the normalized significand are stored at the
2806 | location pointed to by `zSig1Ptr'.
2807 *----------------------------------------------------------------------------*/
2810 normalizeFloat128Subnormal(
2821 shiftCount
= countLeadingZeros64( aSig1
) - 15;
2822 if ( shiftCount
< 0 ) {
2823 *zSig0Ptr
= aSig1
>>( - shiftCount
);
2824 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
2827 *zSig0Ptr
= aSig1
<<shiftCount
;
2830 *zExpPtr
= - shiftCount
- 63;
2833 shiftCount
= countLeadingZeros64( aSig0
) - 15;
2834 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
2835 *zExpPtr
= 1 - shiftCount
;
2840 /*----------------------------------------------------------------------------
2841 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2842 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2843 | floating-point value, returning the result. After being shifted into the
2844 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2845 | added together to form the most significant 32 bits of the result. This
2846 | means that any integer portion of `zSig0' will be added into the exponent.
2847 | Since a properly normalized significand will have an integer portion equal
2848 | to 1, the `zExp' input should be 1 less than the desired result exponent
2849 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2851 *----------------------------------------------------------------------------*/
2853 static inline float128
2854 packFloat128( flag zSign
, int32_t zExp
, uint64_t zSig0
, uint64_t zSig1
)
2859 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
2864 /*----------------------------------------------------------------------------
2865 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2866 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2867 | and `zSig2', and returns the proper quadruple-precision floating-point value
2868 | corresponding to the abstract input. Ordinarily, the abstract value is
2869 | simply rounded and packed into the quadruple-precision format, with the
2870 | inexact exception raised if the abstract input cannot be represented
2871 | exactly. However, if the abstract value is too large, the overflow and
2872 | inexact exceptions are raised and an infinity or maximal finite value is
2873 | returned. If the abstract value is too small, the input value is rounded to
2874 | a subnormal number, and the underflow and inexact exceptions are raised if
2875 | the abstract input cannot be represented exactly as a subnormal quadruple-
2876 | precision floating-point number.
2877 | The input significand must be normalized or smaller. If the input
2878 | significand is not normalized, `zExp' must be 0; in that case, the result
2879 | returned is a subnormal number, and it must not require rounding. In the
2880 | usual case that the input significand is normalized, `zExp' must be 1 less
2881 | than the ``true'' floating-point exponent. The handling of underflow and
2882 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 static float128
roundAndPackFloat128(flag zSign
, int32_t zExp
,
2886 uint64_t zSig0
, uint64_t zSig1
,
2887 uint64_t zSig2
, float_status
*status
)
2889 int8_t roundingMode
;
2890 flag roundNearestEven
, increment
, isTiny
;
2892 roundingMode
= status
->float_rounding_mode
;
2893 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
2894 switch (roundingMode
) {
2895 case float_round_nearest_even
:
2896 case float_round_ties_away
:
2897 increment
= ((int64_t)zSig2
< 0);
2899 case float_round_to_zero
:
2902 case float_round_up
:
2903 increment
= !zSign
&& zSig2
;
2905 case float_round_down
:
2906 increment
= zSign
&& zSig2
;
2908 case float_round_to_odd
:
2909 increment
= !(zSig1
& 0x1) && zSig2
;
2914 if ( 0x7FFD <= (uint32_t) zExp
) {
2915 if ( ( 0x7FFD < zExp
)
2916 || ( ( zExp
== 0x7FFD )
2918 LIT64( 0x0001FFFFFFFFFFFF ),
2919 LIT64( 0xFFFFFFFFFFFFFFFF ),
2926 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
2927 if ( ( roundingMode
== float_round_to_zero
)
2928 || ( zSign
&& ( roundingMode
== float_round_up
) )
2929 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
2930 || (roundingMode
== float_round_to_odd
)
2936 LIT64( 0x0000FFFFFFFFFFFF ),
2937 LIT64( 0xFFFFFFFFFFFFFFFF )
2940 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2943 if (status
->flush_to_zero
) {
2944 float_raise(float_flag_output_denormal
, status
);
2945 return packFloat128(zSign
, 0, 0, 0);
2948 (status
->float_detect_tininess
2949 == float_tininess_before_rounding
)
2955 LIT64( 0x0001FFFFFFFFFFFF ),
2956 LIT64( 0xFFFFFFFFFFFFFFFF )
2958 shift128ExtraRightJamming(
2959 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
2961 if (isTiny
&& zSig2
) {
2962 float_raise(float_flag_underflow
, status
);
2964 switch (roundingMode
) {
2965 case float_round_nearest_even
:
2966 case float_round_ties_away
:
2967 increment
= ((int64_t)zSig2
< 0);
2969 case float_round_to_zero
:
2972 case float_round_up
:
2973 increment
= !zSign
&& zSig2
;
2975 case float_round_down
:
2976 increment
= zSign
&& zSig2
;
2978 case float_round_to_odd
:
2979 increment
= !(zSig1
& 0x1) && zSig2
;
2987 status
->float_exception_flags
|= float_flag_inexact
;
2990 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
2991 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
2994 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
2996 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3000 /*----------------------------------------------------------------------------
3001 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3002 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3003 | returns the proper quadruple-precision floating-point value corresponding
3004 | to the abstract input. This routine is just like `roundAndPackFloat128'
3005 | except that the input significand has fewer bits and does not have to be
3006 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3008 *----------------------------------------------------------------------------*/
3010 static float128
normalizeRoundAndPackFloat128(flag zSign
, int32_t zExp
,
3011 uint64_t zSig0
, uint64_t zSig1
,
3012 float_status
*status
)
3022 shiftCount
= countLeadingZeros64( zSig0
) - 15;
3023 if ( 0 <= shiftCount
) {
3025 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3028 shift128ExtraRightJamming(
3029 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
3032 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
3037 /*----------------------------------------------------------------------------
3038 | Returns the result of converting the 32-bit two's complement integer `a'
3039 | to the extended double-precision floating-point format. The conversion
3040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3042 *----------------------------------------------------------------------------*/
3044 floatx80
int32_to_floatx80(int32_t a
, float_status
*status
)
3051 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3053 absA
= zSign
? - a
: a
;
3054 shiftCount
= countLeadingZeros32( absA
) + 32;
3056 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
3060 /*----------------------------------------------------------------------------
3061 | Returns the result of converting the 32-bit two's complement integer `a' to
3062 | the quadruple-precision floating-point format. The conversion is performed
3063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3064 *----------------------------------------------------------------------------*/
3066 float128
int32_to_float128(int32_t a
, float_status
*status
)
3073 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3075 absA
= zSign
? - a
: a
;
3076 shiftCount
= countLeadingZeros32( absA
) + 17;
3078 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the 64-bit two's complement integer `a'
3084 | to the extended double-precision floating-point format. The conversion
3085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3087 *----------------------------------------------------------------------------*/
3089 floatx80
int64_to_floatx80(int64_t a
, float_status
*status
)
3095 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
3097 absA
= zSign
? - a
: a
;
3098 shiftCount
= countLeadingZeros64( absA
);
3099 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
3103 /*----------------------------------------------------------------------------
3104 | Returns the result of converting the 64-bit two's complement integer `a' to
3105 | the quadruple-precision floating-point format. The conversion is performed
3106 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3107 *----------------------------------------------------------------------------*/
3109 float128
int64_to_float128(int64_t a
, float_status
*status
)
3115 uint64_t zSig0
, zSig1
;
3117 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
3119 absA
= zSign
? - a
: a
;
3120 shiftCount
= countLeadingZeros64( absA
) + 49;
3121 zExp
= 0x406E - shiftCount
;
3122 if ( 64 <= shiftCount
) {
3131 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
3132 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
3136 /*----------------------------------------------------------------------------
3137 | Returns the result of converting the 64-bit unsigned integer `a'
3138 | to the quadruple-precision floating-point format. The conversion is performed
3139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 *----------------------------------------------------------------------------*/
3142 float128
uint64_to_float128(uint64_t a
, float_status
*status
)
3145 return float128_zero
;
3147 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a
, status
);
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the single-precision floating-point value
3155 | `a' to the double-precision floating-point format. The conversion is
3156 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 *----------------------------------------------------------------------------*/
3160 float64
float32_to_float64(float32 a
, float_status
*status
)
3165 a
= float32_squash_input_denormal(a
, status
);
3167 aSig
= extractFloat32Frac( a
);
3168 aExp
= extractFloat32Exp( a
);
3169 aSign
= extractFloat32Sign( a
);
3170 if ( aExp
== 0xFF ) {
3172 return commonNaNToFloat64(float32ToCommonNaN(a
, status
), status
);
3174 return packFloat64( aSign
, 0x7FF, 0 );
3177 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
3178 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3181 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
3185 /*----------------------------------------------------------------------------
3186 | Returns the result of converting the single-precision floating-point value
3187 | `a' to the extended double-precision floating-point format. The conversion
3188 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3190 *----------------------------------------------------------------------------*/
3192 floatx80
float32_to_floatx80(float32 a
, float_status
*status
)
3198 a
= float32_squash_input_denormal(a
, status
);
3199 aSig
= extractFloat32Frac( a
);
3200 aExp
= extractFloat32Exp( a
);
3201 aSign
= extractFloat32Sign( a
);
3202 if ( aExp
== 0xFF ) {
3204 return commonNaNToFloatx80(float32ToCommonNaN(a
, status
), status
);
3206 return packFloatx80(aSign
,
3207 floatx80_infinity_high
,
3208 floatx80_infinity_low
);
3211 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3212 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3215 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of converting the single-precision floating-point value
3221 | `a' to the double-precision floating-point format. The conversion is
3222 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3224 *----------------------------------------------------------------------------*/
3226 float128
float32_to_float128(float32 a
, float_status
*status
)
3232 a
= float32_squash_input_denormal(a
, status
);
3233 aSig
= extractFloat32Frac( a
);
3234 aExp
= extractFloat32Exp( a
);
3235 aSign
= extractFloat32Sign( a
);
3236 if ( aExp
== 0xFF ) {
3238 return commonNaNToFloat128(float32ToCommonNaN(a
, status
), status
);
3240 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3243 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3244 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3247 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
3251 /*----------------------------------------------------------------------------
3252 | Returns the remainder of the single-precision floating-point value `a'
3253 | with respect to the corresponding value `b'. The operation is performed
3254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3255 *----------------------------------------------------------------------------*/
3257 float32
float32_rem(float32 a
, float32 b
, float_status
*status
)
3260 int aExp
, bExp
, expDiff
;
3261 uint32_t aSig
, bSig
;
3263 uint64_t aSig64
, bSig64
, q64
;
3264 uint32_t alternateASig
;
3266 a
= float32_squash_input_denormal(a
, status
);
3267 b
= float32_squash_input_denormal(b
, status
);
3269 aSig
= extractFloat32Frac( a
);
3270 aExp
= extractFloat32Exp( a
);
3271 aSign
= extractFloat32Sign( a
);
3272 bSig
= extractFloat32Frac( b
);
3273 bExp
= extractFloat32Exp( b
);
3274 if ( aExp
== 0xFF ) {
3275 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
3276 return propagateFloat32NaN(a
, b
, status
);
3278 float_raise(float_flag_invalid
, status
);
3279 return float32_default_nan(status
);
3281 if ( bExp
== 0xFF ) {
3283 return propagateFloat32NaN(a
, b
, status
);
3289 float_raise(float_flag_invalid
, status
);
3290 return float32_default_nan(status
);
3292 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
3295 if ( aSig
== 0 ) return a
;
3296 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3298 expDiff
= aExp
- bExp
;
3301 if ( expDiff
< 32 ) {
3304 if ( expDiff
< 0 ) {
3305 if ( expDiff
< -1 ) return a
;
3308 q
= ( bSig
<= aSig
);
3309 if ( q
) aSig
-= bSig
;
3310 if ( 0 < expDiff
) {
3311 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
3314 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3322 if ( bSig
<= aSig
) aSig
-= bSig
;
3323 aSig64
= ( (uint64_t) aSig
)<<40;
3324 bSig64
= ( (uint64_t) bSig
)<<40;
3326 while ( 0 < expDiff
) {
3327 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3328 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3329 aSig64
= - ( ( bSig
* q64
)<<38 );
3333 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
3334 q64
= ( 2 < q64
) ? q64
- 2 : 0;
3335 q
= q64
>>( 64 - expDiff
);
3337 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
3340 alternateASig
= aSig
;
3343 } while ( 0 <= (int32_t) aSig
);
3344 sigMean
= aSig
+ alternateASig
;
3345 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3346 aSig
= alternateASig
;
3348 zSign
= ( (int32_t) aSig
< 0 );
3349 if ( zSign
) aSig
= - aSig
;
3350 return normalizeRoundAndPackFloat32(aSign
^ zSign
, bExp
, aSig
, status
);
3355 /*----------------------------------------------------------------------------
3356 | Returns the binary exponential of the single-precision floating-point value
3357 | `a'. The operation is performed according to the IEC/IEEE Standard for
3358 | Binary Floating-Point Arithmetic.
3360 | Uses the following identities:
3362 | 1. -------------------------------------------------------------------------
3366 | 2. -------------------------------------------------------------------------
3369 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3371 *----------------------------------------------------------------------------*/
3373 static const float64 float32_exp2_coefficients
[15] =
3375 const_float64( 0x3ff0000000000000ll
), /* 1 */
3376 const_float64( 0x3fe0000000000000ll
), /* 2 */
3377 const_float64( 0x3fc5555555555555ll
), /* 3 */
3378 const_float64( 0x3fa5555555555555ll
), /* 4 */
3379 const_float64( 0x3f81111111111111ll
), /* 5 */
3380 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
3381 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
3382 const_float64( 0x3efa01a01a01a01all
), /* 8 */
3383 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
3384 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
3385 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
3386 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
3387 const_float64( 0x3de6124613a86d09ll
), /* 13 */
3388 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
3389 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
3392 float32
float32_exp2(float32 a
, float_status
*status
)
3399 a
= float32_squash_input_denormal(a
, status
);
3401 aSig
= extractFloat32Frac( a
);
3402 aExp
= extractFloat32Exp( a
);
3403 aSign
= extractFloat32Sign( a
);
3405 if ( aExp
== 0xFF) {
3407 return propagateFloat32NaN(a
, float32_zero
, status
);
3409 return (aSign
) ? float32_zero
: a
;
3412 if (aSig
== 0) return float32_one
;
3415 float_raise(float_flag_inexact
, status
);
3417 /* ******************************* */
3418 /* using float64 for approximation */
3419 /* ******************************* */
3420 x
= float32_to_float64(a
, status
);
3421 x
= float64_mul(x
, float64_ln2
, status
);
3425 for (i
= 0 ; i
< 15 ; i
++) {
3428 f
= float64_mul(xn
, float32_exp2_coefficients
[i
], status
);
3429 r
= float64_add(r
, f
, status
);
3431 xn
= float64_mul(xn
, x
, status
);
3434 return float64_to_float32(r
, status
);
3437 /*----------------------------------------------------------------------------
3438 | Returns the binary log of the single-precision floating-point value `a'.
3439 | The operation is performed according to the IEC/IEEE Standard for Binary
3440 | Floating-Point Arithmetic.
3441 *----------------------------------------------------------------------------*/
3442 float32
float32_log2(float32 a
, float_status
*status
)
3446 uint32_t aSig
, zSig
, i
;
3448 a
= float32_squash_input_denormal(a
, status
);
3449 aSig
= extractFloat32Frac( a
);
3450 aExp
= extractFloat32Exp( a
);
3451 aSign
= extractFloat32Sign( a
);
3454 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
3455 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
3458 float_raise(float_flag_invalid
, status
);
3459 return float32_default_nan(status
);
3461 if ( aExp
== 0xFF ) {
3463 return propagateFloat32NaN(a
, float32_zero
, status
);
3473 for (i
= 1 << 22; i
> 0; i
>>= 1) {
3474 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
3475 if ( aSig
& 0x01000000 ) {
3484 return normalizeRoundAndPackFloat32(zSign
, 0x85, zSig
, status
);
3487 /*----------------------------------------------------------------------------
3488 | Returns 1 if the single-precision floating-point value `a' is equal to
3489 | the corresponding value `b', and 0 otherwise. The invalid exception is
3490 | raised if either operand is a NaN. Otherwise, the comparison is performed
3491 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3492 *----------------------------------------------------------------------------*/
3494 int float32_eq(float32 a
, float32 b
, float_status
*status
)
3497 a
= float32_squash_input_denormal(a
, status
);
3498 b
= float32_squash_input_denormal(b
, status
);
3500 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3501 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3503 float_raise(float_flag_invalid
, status
);
3506 av
= float32_val(a
);
3507 bv
= float32_val(b
);
3508 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3511 /*----------------------------------------------------------------------------
3512 | Returns 1 if the single-precision floating-point value `a' is less than
3513 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3514 | exception is raised if either operand is a NaN. The comparison is performed
3515 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3516 *----------------------------------------------------------------------------*/
3518 int float32_le(float32 a
, float32 b
, float_status
*status
)
3522 a
= float32_squash_input_denormal(a
, status
);
3523 b
= float32_squash_input_denormal(b
, status
);
3525 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3526 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3528 float_raise(float_flag_invalid
, status
);
3531 aSign
= extractFloat32Sign( a
);
3532 bSign
= extractFloat32Sign( b
);
3533 av
= float32_val(a
);
3534 bv
= float32_val(b
);
3535 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3536 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3540 /*----------------------------------------------------------------------------
3541 | Returns 1 if the single-precision floating-point value `a' is less than
3542 | the corresponding value `b', and 0 otherwise. The invalid exception is
3543 | raised if either operand is a NaN. The comparison is performed according
3544 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3545 *----------------------------------------------------------------------------*/
3547 int float32_lt(float32 a
, float32 b
, float_status
*status
)
3551 a
= float32_squash_input_denormal(a
, status
);
3552 b
= float32_squash_input_denormal(b
, status
);
3554 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3555 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3557 float_raise(float_flag_invalid
, status
);
3560 aSign
= extractFloat32Sign( a
);
3561 bSign
= extractFloat32Sign( b
);
3562 av
= float32_val(a
);
3563 bv
= float32_val(b
);
3564 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3565 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3569 /*----------------------------------------------------------------------------
3570 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3571 | be compared, and 0 otherwise. The invalid exception is raised if either
3572 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3573 | Standard for Binary Floating-Point Arithmetic.
3574 *----------------------------------------------------------------------------*/
3576 int float32_unordered(float32 a
, float32 b
, float_status
*status
)
3578 a
= float32_squash_input_denormal(a
, status
);
3579 b
= float32_squash_input_denormal(b
, status
);
3581 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3582 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3584 float_raise(float_flag_invalid
, status
);
3590 /*----------------------------------------------------------------------------
3591 | Returns 1 if the single-precision floating-point value `a' is equal to
3592 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3593 | exception. The comparison is performed according to the IEC/IEEE Standard
3594 | for Binary Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3597 int float32_eq_quiet(float32 a
, float32 b
, float_status
*status
)
3599 a
= float32_squash_input_denormal(a
, status
);
3600 b
= float32_squash_input_denormal(b
, status
);
3602 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3603 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3605 if (float32_is_signaling_nan(a
, status
)
3606 || float32_is_signaling_nan(b
, status
)) {
3607 float_raise(float_flag_invalid
, status
);
3611 return ( float32_val(a
) == float32_val(b
) ) ||
3612 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the single-precision floating-point value `a' is less than or
3617 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3618 | cause an exception. Otherwise, the comparison is performed according to the
3619 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 int float32_le_quiet(float32 a
, float32 b
, float_status
*status
)
3626 a
= float32_squash_input_denormal(a
, status
);
3627 b
= float32_squash_input_denormal(b
, status
);
3629 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3630 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3632 if (float32_is_signaling_nan(a
, status
)
3633 || float32_is_signaling_nan(b
, status
)) {
3634 float_raise(float_flag_invalid
, status
);
3638 aSign
= extractFloat32Sign( a
);
3639 bSign
= extractFloat32Sign( b
);
3640 av
= float32_val(a
);
3641 bv
= float32_val(b
);
3642 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
3643 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3647 /*----------------------------------------------------------------------------
3648 | Returns 1 if the single-precision floating-point value `a' is less than
3649 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3650 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3651 | Standard for Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3654 int float32_lt_quiet(float32 a
, float32 b
, float_status
*status
)
3658 a
= float32_squash_input_denormal(a
, status
);
3659 b
= float32_squash_input_denormal(b
, status
);
3661 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3662 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3664 if (float32_is_signaling_nan(a
, status
)
3665 || float32_is_signaling_nan(b
, status
)) {
3666 float_raise(float_flag_invalid
, status
);
3670 aSign
= extractFloat32Sign( a
);
3671 bSign
= extractFloat32Sign( b
);
3672 av
= float32_val(a
);
3673 bv
= float32_val(b
);
3674 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
3675 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3679 /*----------------------------------------------------------------------------
3680 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3681 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3682 | comparison is performed according to the IEC/IEEE Standard for Binary
3683 | Floating-Point Arithmetic.
3684 *----------------------------------------------------------------------------*/
3686 int float32_unordered_quiet(float32 a
, float32 b
, float_status
*status
)
3688 a
= float32_squash_input_denormal(a
, status
);
3689 b
= float32_squash_input_denormal(b
, status
);
3691 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
3692 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
3694 if (float32_is_signaling_nan(a
, status
)
3695 || float32_is_signaling_nan(b
, status
)) {
3696 float_raise(float_flag_invalid
, status
);
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the double-precision floating-point value
3706 | `a' to the single-precision floating-point format. The conversion is
3707 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3709 *----------------------------------------------------------------------------*/
3711 float32
float64_to_float32(float64 a
, float_status
*status
)
3717 a
= float64_squash_input_denormal(a
, status
);
3719 aSig
= extractFloat64Frac( a
);
3720 aExp
= extractFloat64Exp( a
);
3721 aSign
= extractFloat64Sign( a
);
3722 if ( aExp
== 0x7FF ) {
3724 return commonNaNToFloat32(float64ToCommonNaN(a
, status
), status
);
3726 return packFloat32( aSign
, 0xFF, 0 );
3728 shift64RightJamming( aSig
, 22, &aSig
);
3730 if ( aExp
|| zSig
) {
3734 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
3739 /*----------------------------------------------------------------------------
3740 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3741 | half-precision floating-point value, returning the result. After being
3742 | shifted into the proper positions, the three fields are simply added
3743 | together to form the result. This means that any integer portion of `zSig'
3744 | will be added into the exponent. Since a properly normalized significand
3745 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3746 | than the desired result exponent whenever `zSig' is a complete, normalized
3748 *----------------------------------------------------------------------------*/
3749 static float16
packFloat16(flag zSign
, int zExp
, uint16_t zSig
)
3751 return make_float16(
3752 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3755 /*----------------------------------------------------------------------------
3756 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3757 | and significand `zSig', and returns the proper half-precision floating-
3758 | point value corresponding to the abstract input. Ordinarily, the abstract
3759 | value is simply rounded and packed into the half-precision format, with
3760 | the inexact exception raised if the abstract input cannot be represented
3761 | exactly. However, if the abstract value is too large, the overflow and
3762 | inexact exceptions are raised and an infinity or maximal finite value is
3763 | returned. If the abstract value is too small, the input value is rounded to
3764 | a subnormal number, and the underflow and inexact exceptions are raised if
3765 | the abstract input cannot be represented exactly as a subnormal half-
3766 | precision floating-point number.
3767 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3768 | ARM-style "alternative representation", which omits the NaN and Inf
3769 | encodings in order to raise the maximum representable exponent by one.
3770 | The input significand `zSig' has its binary point between bits 22
3771 | and 23, which is 13 bits to the left of the usual location. This shifted
3772 | significand must be normalized or smaller. If `zSig' is not normalized,
3773 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3774 | and it must not require rounding. In the usual case that `zSig' is
3775 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3776 | Note the slightly odd position of the binary point in zSig compared with the
3777 | other roundAndPackFloat functions. This should probably be fixed if we
3778 | need to implement more float16 routines than just conversion.
3779 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3780 | Binary Floating-Point Arithmetic.
3781 *----------------------------------------------------------------------------*/
3783 static float16
roundAndPackFloat16(flag zSign
, int zExp
,
3784 uint32_t zSig
, flag ieee
,
3785 float_status
*status
)
3787 int maxexp
= ieee
? 29 : 30;
3790 bool rounding_bumps_exp
;
3791 bool is_tiny
= false;
3793 /* Calculate the mask of bits of the mantissa which are not
3794 * representable in half-precision and will be lost.
3797 /* Will be denormal in halfprec */
3803 /* Normal number in halfprec */
3807 switch (status
->float_rounding_mode
) {
3808 case float_round_nearest_even
:
3809 increment
= (mask
+ 1) >> 1;
3810 if ((zSig
& mask
) == increment
) {
3811 increment
= zSig
& (increment
<< 1);
3814 case float_round_ties_away
:
3815 increment
= (mask
+ 1) >> 1;
3817 case float_round_up
:
3818 increment
= zSign
? 0 : mask
;
3820 case float_round_down
:
3821 increment
= zSign
? mask
: 0;
3823 default: /* round_to_zero */
3828 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3830 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3832 float_raise(float_flag_overflow
| float_flag_inexact
, status
);
3833 return packFloat16(zSign
, 0x1f, 0);
3835 float_raise(float_flag_invalid
, status
);
3836 return packFloat16(zSign
, 0x1f, 0x3ff);
3841 /* Note that flush-to-zero does not affect half-precision results */
3843 (status
->float_detect_tininess
== float_tininess_before_rounding
)
3845 || (!rounding_bumps_exp
);
3848 float_raise(float_flag_inexact
, status
);
3850 float_raise(float_flag_underflow
, status
);
3855 if (rounding_bumps_exp
) {
3861 return packFloat16(zSign
, 0, 0);
3867 return packFloat16(zSign
, zExp
, zSig
>> 13);
3870 /*----------------------------------------------------------------------------
3871 | If `a' is denormal and we are in flush-to-zero mode then set the
3872 | input-denormal exception and return zero. Otherwise just return the value.
3873 *----------------------------------------------------------------------------*/
3874 float16
float16_squash_input_denormal(float16 a
, float_status
*status
)
3876 if (status
->flush_inputs_to_zero
) {
3877 if (extractFloat16Exp(a
) == 0 && extractFloat16Frac(a
) != 0) {
3878 float_raise(float_flag_input_denormal
, status
);
3879 return make_float16(float16_val(a
) & 0x8000);
3885 static void normalizeFloat16Subnormal(uint32_t aSig
, int *zExpPtr
,
3888 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3889 *zSigPtr
= aSig
<< shiftCount
;
3890 *zExpPtr
= 1 - shiftCount
;
3893 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3894 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3896 float32
float16_to_float32(float16 a
, flag ieee
, float_status
*status
)
3902 aSign
= extractFloat16Sign(a
);
3903 aExp
= extractFloat16Exp(a
);
3904 aSig
= extractFloat16Frac(a
);
3906 if (aExp
== 0x1f && ieee
) {
3908 return commonNaNToFloat32(float16ToCommonNaN(a
, status
), status
);
3910 return packFloat32(aSign
, 0xff, 0);
3914 return packFloat32(aSign
, 0, 0);
3917 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3920 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3923 float16
float32_to_float16(float32 a
, flag ieee
, float_status
*status
)
3929 a
= float32_squash_input_denormal(a
, status
);
3931 aSig
= extractFloat32Frac( a
);
3932 aExp
= extractFloat32Exp( a
);
3933 aSign
= extractFloat32Sign( a
);
3934 if ( aExp
== 0xFF ) {
3936 /* Input is a NaN */
3938 float_raise(float_flag_invalid
, status
);
3939 return packFloat16(aSign
, 0, 0);
3941 return commonNaNToFloat16(
3942 float32ToCommonNaN(a
, status
), status
);
3946 float_raise(float_flag_invalid
, status
);
3947 return packFloat16(aSign
, 0x1f, 0x3ff);
3949 return packFloat16(aSign
, 0x1f, 0);
3951 if (aExp
== 0 && aSig
== 0) {
3952 return packFloat16(aSign
, 0, 0);
3954 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3955 * even if the input is denormal; however this is harmless because
3956 * the largest possible single-precision denormal is still smaller
3957 * than the smallest representable half-precision denormal, and so we
3958 * will end up ignoring aSig and returning via the "always return zero"
3964 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee
, status
);
3967 float64
float16_to_float64(float16 a
, flag ieee
, float_status
*status
)
3973 aSign
= extractFloat16Sign(a
);
3974 aExp
= extractFloat16Exp(a
);
3975 aSig
= extractFloat16Frac(a
);
3977 if (aExp
== 0x1f && ieee
) {
3979 return commonNaNToFloat64(
3980 float16ToCommonNaN(a
, status
), status
);
3982 return packFloat64(aSign
, 0x7ff, 0);
3986 return packFloat64(aSign
, 0, 0);
3989 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3992 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3995 float16
float64_to_float16(float64 a
, flag ieee
, float_status
*status
)
4002 a
= float64_squash_input_denormal(a
, status
);
4004 aSig
= extractFloat64Frac(a
);
4005 aExp
= extractFloat64Exp(a
);
4006 aSign
= extractFloat64Sign(a
);
4007 if (aExp
== 0x7FF) {
4009 /* Input is a NaN */
4011 float_raise(float_flag_invalid
, status
);
4012 return packFloat16(aSign
, 0, 0);
4014 return commonNaNToFloat16(
4015 float64ToCommonNaN(a
, status
), status
);
4019 float_raise(float_flag_invalid
, status
);
4020 return packFloat16(aSign
, 0x1f, 0x3ff);
4022 return packFloat16(aSign
, 0x1f, 0);
4024 shift64RightJamming(aSig
, 29, &aSig
);
4026 if (aExp
== 0 && zSig
== 0) {
4027 return packFloat16(aSign
, 0, 0);
4029 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4030 * even if the input is denormal; however this is harmless because
4031 * the largest possible single-precision denormal is still smaller
4032 * than the smallest representable half-precision denormal, and so we
4033 * will end up ignoring aSig and returning via the "always return zero"
4039 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee
, status
);
4042 /*----------------------------------------------------------------------------
4043 | Returns the result of converting the double-precision floating-point value
4044 | `a' to the extended double-precision floating-point format. The conversion
4045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4047 *----------------------------------------------------------------------------*/
4049 floatx80
float64_to_floatx80(float64 a
, float_status
*status
)
4055 a
= float64_squash_input_denormal(a
, status
);
4056 aSig
= extractFloat64Frac( a
);
4057 aExp
= extractFloat64Exp( a
);
4058 aSign
= extractFloat64Sign( a
);
4059 if ( aExp
== 0x7FF ) {
4061 return commonNaNToFloatx80(float64ToCommonNaN(a
, status
), status
);
4063 return packFloatx80(aSign
,
4064 floatx80_infinity_high
,
4065 floatx80_infinity_low
);
4068 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
4069 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4073 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
4077 /*----------------------------------------------------------------------------
4078 | Returns the result of converting the double-precision floating-point value
4079 | `a' to the quadruple-precision floating-point format. The conversion is
4080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4082 *----------------------------------------------------------------------------*/
4084 float128
float64_to_float128(float64 a
, float_status
*status
)
4088 uint64_t aSig
, zSig0
, zSig1
;
4090 a
= float64_squash_input_denormal(a
, status
);
4091 aSig
= extractFloat64Frac( a
);
4092 aExp
= extractFloat64Exp( a
);
4093 aSign
= extractFloat64Sign( a
);
4094 if ( aExp
== 0x7FF ) {
4096 return commonNaNToFloat128(float64ToCommonNaN(a
, status
), status
);
4098 return packFloat128( aSign
, 0x7FFF, 0, 0 );
4101 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
4102 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4105 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
4106 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
4111 /*----------------------------------------------------------------------------
4112 | Returns the remainder of the double-precision floating-point value `a'
4113 | with respect to the corresponding value `b'. The operation is performed
4114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4117 float64
float64_rem(float64 a
, float64 b
, float_status
*status
)
4120 int aExp
, bExp
, expDiff
;
4121 uint64_t aSig
, bSig
;
4122 uint64_t q
, alternateASig
;
4125 a
= float64_squash_input_denormal(a
, status
);
4126 b
= float64_squash_input_denormal(b
, status
);
4127 aSig
= extractFloat64Frac( a
);
4128 aExp
= extractFloat64Exp( a
);
4129 aSign
= extractFloat64Sign( a
);
4130 bSig
= extractFloat64Frac( b
);
4131 bExp
= extractFloat64Exp( b
);
4132 if ( aExp
== 0x7FF ) {
4133 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4134 return propagateFloat64NaN(a
, b
, status
);
4136 float_raise(float_flag_invalid
, status
);
4137 return float64_default_nan(status
);
4139 if ( bExp
== 0x7FF ) {
4141 return propagateFloat64NaN(a
, b
, status
);
4147 float_raise(float_flag_invalid
, status
);
4148 return float64_default_nan(status
);
4150 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4153 if ( aSig
== 0 ) return a
;
4154 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4156 expDiff
= aExp
- bExp
;
4157 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4158 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4159 if ( expDiff
< 0 ) {
4160 if ( expDiff
< -1 ) return a
;
4163 q
= ( bSig
<= aSig
);
4164 if ( q
) aSig
-= bSig
;
4166 while ( 0 < expDiff
) {
4167 q
= estimateDiv128To64( aSig
, 0, bSig
);
4168 q
= ( 2 < q
) ? q
- 2 : 0;
4169 aSig
= - ( ( bSig
>>2 ) * q
);
4173 if ( 0 < expDiff
) {
4174 q
= estimateDiv128To64( aSig
, 0, bSig
);
4175 q
= ( 2 < q
) ? q
- 2 : 0;
4178 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4185 alternateASig
= aSig
;
4188 } while ( 0 <= (int64_t) aSig
);
4189 sigMean
= aSig
+ alternateASig
;
4190 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4191 aSig
= alternateASig
;
4193 zSign
= ( (int64_t) aSig
< 0 );
4194 if ( zSign
) aSig
= - aSig
;
4195 return normalizeRoundAndPackFloat64(aSign
^ zSign
, bExp
, aSig
, status
);
4199 /*----------------------------------------------------------------------------
4200 | Returns the binary log of the double-precision floating-point value `a'.
4201 | The operation is performed according to the IEC/IEEE Standard for Binary
4202 | Floating-Point Arithmetic.
4203 *----------------------------------------------------------------------------*/
4204 float64
float64_log2(float64 a
, float_status
*status
)
4208 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4209 a
= float64_squash_input_denormal(a
, status
);
4211 aSig
= extractFloat64Frac( a
);
4212 aExp
= extractFloat64Exp( a
);
4213 aSign
= extractFloat64Sign( a
);
4216 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4217 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4220 float_raise(float_flag_invalid
, status
);
4221 return float64_default_nan(status
);
4223 if ( aExp
== 0x7FF ) {
4225 return propagateFloat64NaN(a
, float64_zero
, status
);
4231 aSig
|= LIT64( 0x0010000000000000 );
4233 zSig
= (uint64_t)aExp
<< 52;
4234 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4235 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4236 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4237 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4245 return normalizeRoundAndPackFloat64(zSign
, 0x408, zSig
, status
);
4248 /*----------------------------------------------------------------------------
4249 | Returns 1 if the double-precision floating-point value `a' is equal to the
4250 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4251 | if either operand is a NaN. Otherwise, the comparison is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4255 int float64_eq(float64 a
, float64 b
, float_status
*status
)
4258 a
= float64_squash_input_denormal(a
, status
);
4259 b
= float64_squash_input_denormal(b
, status
);
4261 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4262 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4264 float_raise(float_flag_invalid
, status
);
4267 av
= float64_val(a
);
4268 bv
= float64_val(b
);
4269 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4273 /*----------------------------------------------------------------------------
4274 | Returns 1 if the double-precision floating-point value `a' is less than or
4275 | equal to the corresponding value `b', and 0 otherwise. The invalid
4276 | exception is raised if either operand is a NaN. The comparison is performed
4277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4280 int float64_le(float64 a
, float64 b
, float_status
*status
)
4284 a
= float64_squash_input_denormal(a
, status
);
4285 b
= float64_squash_input_denormal(b
, status
);
4287 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4288 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4290 float_raise(float_flag_invalid
, status
);
4293 aSign
= extractFloat64Sign( a
);
4294 bSign
= extractFloat64Sign( b
);
4295 av
= float64_val(a
);
4296 bv
= float64_val(b
);
4297 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4298 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4302 /*----------------------------------------------------------------------------
4303 | Returns 1 if the double-precision floating-point value `a' is less than
4304 | the corresponding value `b', and 0 otherwise. The invalid exception is
4305 | raised if either operand is a NaN. The comparison is performed according
4306 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4307 *----------------------------------------------------------------------------*/
4309 int float64_lt(float64 a
, float64 b
, float_status
*status
)
4314 a
= float64_squash_input_denormal(a
, status
);
4315 b
= float64_squash_input_denormal(b
, status
);
4316 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4317 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4319 float_raise(float_flag_invalid
, status
);
4322 aSign
= extractFloat64Sign( a
);
4323 bSign
= extractFloat64Sign( b
);
4324 av
= float64_val(a
);
4325 bv
= float64_val(b
);
4326 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4327 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4331 /*----------------------------------------------------------------------------
4332 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4333 | be compared, and 0 otherwise. The invalid exception is raised if either
4334 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4335 | Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4338 int float64_unordered(float64 a
, float64 b
, float_status
*status
)
4340 a
= float64_squash_input_denormal(a
, status
);
4341 b
= float64_squash_input_denormal(b
, status
);
4343 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4344 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4346 float_raise(float_flag_invalid
, status
);
4352 /*----------------------------------------------------------------------------
4353 | Returns 1 if the double-precision floating-point value `a' is equal to the
4354 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4355 | exception.The comparison is performed according to the IEC/IEEE Standard
4356 | for Binary Floating-Point Arithmetic.
4357 *----------------------------------------------------------------------------*/
4359 int float64_eq_quiet(float64 a
, float64 b
, float_status
*status
)
4362 a
= float64_squash_input_denormal(a
, status
);
4363 b
= float64_squash_input_denormal(b
, status
);
4365 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4366 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4368 if (float64_is_signaling_nan(a
, status
)
4369 || float64_is_signaling_nan(b
, status
)) {
4370 float_raise(float_flag_invalid
, status
);
4374 av
= float64_val(a
);
4375 bv
= float64_val(b
);
4376 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4380 /*----------------------------------------------------------------------------
4381 | Returns 1 if the double-precision floating-point value `a' is less than or
4382 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4383 | cause an exception. Otherwise, the comparison is performed according to the
4384 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4387 int float64_le_quiet(float64 a
, float64 b
, float_status
*status
)
4391 a
= float64_squash_input_denormal(a
, status
);
4392 b
= float64_squash_input_denormal(b
, status
);
4394 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4395 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4397 if (float64_is_signaling_nan(a
, status
)
4398 || float64_is_signaling_nan(b
, status
)) {
4399 float_raise(float_flag_invalid
, status
);
4403 aSign
= extractFloat64Sign( a
);
4404 bSign
= extractFloat64Sign( b
);
4405 av
= float64_val(a
);
4406 bv
= float64_val(b
);
4407 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4408 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4412 /*----------------------------------------------------------------------------
4413 | Returns 1 if the double-precision floating-point value `a' is less than
4414 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4415 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4416 | Standard for Binary Floating-Point Arithmetic.
4417 *----------------------------------------------------------------------------*/
4419 int float64_lt_quiet(float64 a
, float64 b
, float_status
*status
)
4423 a
= float64_squash_input_denormal(a
, status
);
4424 b
= float64_squash_input_denormal(b
, status
);
4426 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4427 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4429 if (float64_is_signaling_nan(a
, status
)
4430 || float64_is_signaling_nan(b
, status
)) {
4431 float_raise(float_flag_invalid
, status
);
4435 aSign
= extractFloat64Sign( a
);
4436 bSign
= extractFloat64Sign( b
);
4437 av
= float64_val(a
);
4438 bv
= float64_val(b
);
4439 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4440 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4444 /*----------------------------------------------------------------------------
4445 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4446 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4447 | comparison is performed according to the IEC/IEEE Standard for Binary
4448 | Floating-Point Arithmetic.
4449 *----------------------------------------------------------------------------*/
4451 int float64_unordered_quiet(float64 a
, float64 b
, float_status
*status
)
4453 a
= float64_squash_input_denormal(a
, status
);
4454 b
= float64_squash_input_denormal(b
, status
);
4456 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4457 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4459 if (float64_is_signaling_nan(a
, status
)
4460 || float64_is_signaling_nan(b
, status
)) {
4461 float_raise(float_flag_invalid
, status
);
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the extended double-precision floating-
4470 | point value `a' to the 32-bit two's complement integer format. The
4471 | conversion is performed according to the IEC/IEEE Standard for Binary
4472 | Floating-Point Arithmetic---which means in particular that the conversion
4473 | is rounded according to the current rounding mode. If `a' is a NaN, the
4474 | largest positive integer is returned. Otherwise, if the conversion
4475 | overflows, the largest integer with the same sign as `a' is returned.
4476 *----------------------------------------------------------------------------*/
4478 int32_t floatx80_to_int32(floatx80 a
, float_status
*status
)
4481 int32_t aExp
, shiftCount
;
4484 if (floatx80_invalid_encoding(a
)) {
4485 float_raise(float_flag_invalid
, status
);
4488 aSig
= extractFloatx80Frac( a
);
4489 aExp
= extractFloatx80Exp( a
);
4490 aSign
= extractFloatx80Sign( a
);
4491 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4492 shiftCount
= 0x4037 - aExp
;
4493 if ( shiftCount
<= 0 ) shiftCount
= 1;
4494 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4495 return roundAndPackInt32(aSign
, aSig
, status
);
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the extended double-precision floating-
4501 | point value `a' to the 32-bit two's complement integer format. The
4502 | conversion is performed according to the IEC/IEEE Standard for Binary
4503 | Floating-Point Arithmetic, except that the conversion is always rounded
4504 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4505 | Otherwise, if the conversion overflows, the largest integer with the same
4506 | sign as `a' is returned.
4507 *----------------------------------------------------------------------------*/
4509 int32_t floatx80_to_int32_round_to_zero(floatx80 a
, float_status
*status
)
4512 int32_t aExp
, shiftCount
;
4513 uint64_t aSig
, savedASig
;
4516 if (floatx80_invalid_encoding(a
)) {
4517 float_raise(float_flag_invalid
, status
);
4520 aSig
= extractFloatx80Frac( a
);
4521 aExp
= extractFloatx80Exp( a
);
4522 aSign
= extractFloatx80Sign( a
);
4523 if ( 0x401E < aExp
) {
4524 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4527 else if ( aExp
< 0x3FFF ) {
4529 status
->float_exception_flags
|= float_flag_inexact
;
4533 shiftCount
= 0x403E - aExp
;
4535 aSig
>>= shiftCount
;
4537 if ( aSign
) z
= - z
;
4538 if ( ( z
< 0 ) ^ aSign
) {
4540 float_raise(float_flag_invalid
, status
);
4541 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4543 if ( ( aSig
<<shiftCount
) != savedASig
) {
4544 status
->float_exception_flags
|= float_flag_inexact
;
4550 /*----------------------------------------------------------------------------
4551 | Returns the result of converting the extended double-precision floating-
4552 | point value `a' to the 64-bit two's complement integer format. The
4553 | conversion is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic---which means in particular that the conversion
4555 | is rounded according to the current rounding mode. If `a' is a NaN,
4556 | the largest positive integer is returned. Otherwise, if the conversion
4557 | overflows, the largest integer with the same sign as `a' is returned.
4558 *----------------------------------------------------------------------------*/
4560 int64_t floatx80_to_int64(floatx80 a
, float_status
*status
)
4563 int32_t aExp
, shiftCount
;
4564 uint64_t aSig
, aSigExtra
;
4566 if (floatx80_invalid_encoding(a
)) {
4567 float_raise(float_flag_invalid
, status
);
4570 aSig
= extractFloatx80Frac( a
);
4571 aExp
= extractFloatx80Exp( a
);
4572 aSign
= extractFloatx80Sign( a
);
4573 shiftCount
= 0x403E - aExp
;
4574 if ( shiftCount
<= 0 ) {
4576 float_raise(float_flag_invalid
, status
);
4577 if (!aSign
|| floatx80_is_any_nan(a
)) {
4578 return LIT64( 0x7FFFFFFFFFFFFFFF );
4580 return (int64_t) LIT64( 0x8000000000000000 );
4585 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4587 return roundAndPackInt64(aSign
, aSig
, aSigExtra
, status
);
4591 /*----------------------------------------------------------------------------
4592 | Returns the result of converting the extended double-precision floating-
4593 | point value `a' to the 64-bit two's complement integer format. The
4594 | conversion is performed according to the IEC/IEEE Standard for Binary
4595 | Floating-Point Arithmetic, except that the conversion is always rounded
4596 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4597 | Otherwise, if the conversion overflows, the largest integer with the same
4598 | sign as `a' is returned.
4599 *----------------------------------------------------------------------------*/
4601 int64_t floatx80_to_int64_round_to_zero(floatx80 a
, float_status
*status
)
4604 int32_t aExp
, shiftCount
;
4608 if (floatx80_invalid_encoding(a
)) {
4609 float_raise(float_flag_invalid
, status
);
4612 aSig
= extractFloatx80Frac( a
);
4613 aExp
= extractFloatx80Exp( a
);
4614 aSign
= extractFloatx80Sign( a
);
4615 shiftCount
= aExp
- 0x403E;
4616 if ( 0 <= shiftCount
) {
4617 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4618 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4619 float_raise(float_flag_invalid
, status
);
4620 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4621 return LIT64( 0x7FFFFFFFFFFFFFFF );
4624 return (int64_t) LIT64( 0x8000000000000000 );
4626 else if ( aExp
< 0x3FFF ) {
4628 status
->float_exception_flags
|= float_flag_inexact
;
4632 z
= aSig
>>( - shiftCount
);
4633 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4634 status
->float_exception_flags
|= float_flag_inexact
;
4636 if ( aSign
) z
= - z
;
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the extended double-precision floating-
4643 | point value `a' to the single-precision floating-point format. The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4648 float32
floatx80_to_float32(floatx80 a
, float_status
*status
)
4654 if (floatx80_invalid_encoding(a
)) {
4655 float_raise(float_flag_invalid
, status
);
4656 return float32_default_nan(status
);
4658 aSig
= extractFloatx80Frac( a
);
4659 aExp
= extractFloatx80Exp( a
);
4660 aSign
= extractFloatx80Sign( a
);
4661 if ( aExp
== 0x7FFF ) {
4662 if ( (uint64_t) ( aSig
<<1 ) ) {
4663 return commonNaNToFloat32(floatx80ToCommonNaN(a
, status
), status
);
4665 return packFloat32( aSign
, 0xFF, 0 );
4667 shift64RightJamming( aSig
, 33, &aSig
);
4668 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4669 return roundAndPackFloat32(aSign
, aExp
, aSig
, status
);
4673 /*----------------------------------------------------------------------------
4674 | Returns the result of converting the extended double-precision floating-
4675 | point value `a' to the double-precision floating-point format. The
4676 | conversion is performed according to the IEC/IEEE Standard for Binary
4677 | Floating-Point Arithmetic.
4678 *----------------------------------------------------------------------------*/
4680 float64
floatx80_to_float64(floatx80 a
, float_status
*status
)
4684 uint64_t aSig
, zSig
;
4686 if (floatx80_invalid_encoding(a
)) {
4687 float_raise(float_flag_invalid
, status
);
4688 return float64_default_nan(status
);
4690 aSig
= extractFloatx80Frac( a
);
4691 aExp
= extractFloatx80Exp( a
);
4692 aSign
= extractFloatx80Sign( a
);
4693 if ( aExp
== 0x7FFF ) {
4694 if ( (uint64_t) ( aSig
<<1 ) ) {
4695 return commonNaNToFloat64(floatx80ToCommonNaN(a
, status
), status
);
4697 return packFloat64( aSign
, 0x7FF, 0 );
4699 shift64RightJamming( aSig
, 1, &zSig
);
4700 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4701 return roundAndPackFloat64(aSign
, aExp
, zSig
, status
);
4705 /*----------------------------------------------------------------------------
4706 | Returns the result of converting the extended double-precision floating-
4707 | point value `a' to the quadruple-precision floating-point format. The
4708 | conversion is performed according to the IEC/IEEE Standard for Binary
4709 | Floating-Point Arithmetic.
4710 *----------------------------------------------------------------------------*/
4712 float128
floatx80_to_float128(floatx80 a
, float_status
*status
)
4716 uint64_t aSig
, zSig0
, zSig1
;
4718 if (floatx80_invalid_encoding(a
)) {
4719 float_raise(float_flag_invalid
, status
);
4720 return float128_default_nan(status
);
4722 aSig
= extractFloatx80Frac( a
);
4723 aExp
= extractFloatx80Exp( a
);
4724 aSign
= extractFloatx80Sign( a
);
4725 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4726 return commonNaNToFloat128(floatx80ToCommonNaN(a
, status
), status
);
4728 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4729 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a'
4735 | to the precision provided by floatx80_rounding_precision and returns the
4736 | result as an extended double-precision floating-point value.
4737 | The operation is performed according to the IEC/IEEE Standard for Binary
4738 | Floating-Point Arithmetic.
4739 *----------------------------------------------------------------------------*/
4741 floatx80
floatx80_round(floatx80 a
, float_status
*status
)
4743 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4744 extractFloatx80Sign(a
),
4745 extractFloatx80Exp(a
),
4746 extractFloatx80Frac(a
), 0, status
);
4749 /*----------------------------------------------------------------------------
4750 | Rounds the extended double-precision floating-point value `a' to an integer,
4751 | and returns the result as an extended quadruple-precision floating-point
4752 | value. The operation is performed according to the IEC/IEEE Standard for
4753 | Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4756 floatx80
floatx80_round_to_int(floatx80 a
, float_status
*status
)
4760 uint64_t lastBitMask
, roundBitsMask
;
4763 if (floatx80_invalid_encoding(a
)) {
4764 float_raise(float_flag_invalid
, status
);
4765 return floatx80_default_nan(status
);
4767 aExp
= extractFloatx80Exp( a
);
4768 if ( 0x403E <= aExp
) {
4769 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4770 return propagateFloatx80NaN(a
, a
, status
);
4774 if ( aExp
< 0x3FFF ) {
4776 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4779 status
->float_exception_flags
|= float_flag_inexact
;
4780 aSign
= extractFloatx80Sign( a
);
4781 switch (status
->float_rounding_mode
) {
4782 case float_round_nearest_even
:
4783 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4786 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4789 case float_round_ties_away
:
4790 if (aExp
== 0x3FFE) {
4791 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4794 case float_round_down
:
4797 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4798 : packFloatx80( 0, 0, 0 );
4799 case float_round_up
:
4801 aSign
? packFloatx80( 1, 0, 0 )
4802 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4804 return packFloatx80( aSign
, 0, 0 );
4807 lastBitMask
<<= 0x403E - aExp
;
4808 roundBitsMask
= lastBitMask
- 1;
4810 switch (status
->float_rounding_mode
) {
4811 case float_round_nearest_even
:
4812 z
.low
+= lastBitMask
>>1;
4813 if ((z
.low
& roundBitsMask
) == 0) {
4814 z
.low
&= ~lastBitMask
;
4817 case float_round_ties_away
:
4818 z
.low
+= lastBitMask
>> 1;
4820 case float_round_to_zero
:
4822 case float_round_up
:
4823 if (!extractFloatx80Sign(z
)) {
4824 z
.low
+= roundBitsMask
;
4827 case float_round_down
:
4828 if (extractFloatx80Sign(z
)) {
4829 z
.low
+= roundBitsMask
;
4835 z
.low
&= ~ roundBitsMask
;
4838 z
.low
= LIT64( 0x8000000000000000 );
4840 if (z
.low
!= a
.low
) {
4841 status
->float_exception_flags
|= float_flag_inexact
;
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of adding the absolute values of the extended double-
4849 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4850 | negated before being returned. `zSign' is ignored if the result is a NaN.
4851 | The addition is performed according to the IEC/IEEE Standard for Binary
4852 | Floating-Point Arithmetic.
4853 *----------------------------------------------------------------------------*/
4855 static floatx80
addFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4856 float_status
*status
)
4858 int32_t aExp
, bExp
, zExp
;
4859 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4862 aSig
= extractFloatx80Frac( a
);
4863 aExp
= extractFloatx80Exp( a
);
4864 bSig
= extractFloatx80Frac( b
);
4865 bExp
= extractFloatx80Exp( b
);
4866 expDiff
= aExp
- bExp
;
4867 if ( 0 < expDiff
) {
4868 if ( aExp
== 0x7FFF ) {
4869 if ((uint64_t)(aSig
<< 1)) {
4870 return propagateFloatx80NaN(a
, b
, status
);
4874 if ( bExp
== 0 ) --expDiff
;
4875 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4878 else if ( expDiff
< 0 ) {
4879 if ( bExp
== 0x7FFF ) {
4880 if ((uint64_t)(bSig
<< 1)) {
4881 return propagateFloatx80NaN(a
, b
, status
);
4883 return packFloatx80(zSign
,
4884 floatx80_infinity_high
,
4885 floatx80_infinity_low
);
4887 if ( aExp
== 0 ) ++expDiff
;
4888 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4892 if ( aExp
== 0x7FFF ) {
4893 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4894 return propagateFloatx80NaN(a
, b
, status
);
4899 zSig0
= aSig
+ bSig
;
4901 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4907 zSig0
= aSig
+ bSig
;
4908 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4910 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4911 zSig0
|= LIT64( 0x8000000000000000 );
4914 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
4915 zSign
, zExp
, zSig0
, zSig1
, status
);
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the absolute values of the extended
4920 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4921 | difference is negated before being returned. `zSign' is ignored if the
4922 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4923 | Standard for Binary Floating-Point Arithmetic.
4924 *----------------------------------------------------------------------------*/
4926 static floatx80
subFloatx80Sigs(floatx80 a
, floatx80 b
, flag zSign
,
4927 float_status
*status
)
4929 int32_t aExp
, bExp
, zExp
;
4930 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4933 aSig
= extractFloatx80Frac( a
);
4934 aExp
= extractFloatx80Exp( a
);
4935 bSig
= extractFloatx80Frac( b
);
4936 bExp
= extractFloatx80Exp( b
);
4937 expDiff
= aExp
- bExp
;
4938 if ( 0 < expDiff
) goto aExpBigger
;
4939 if ( expDiff
< 0 ) goto bExpBigger
;
4940 if ( aExp
== 0x7FFF ) {
4941 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4942 return propagateFloatx80NaN(a
, b
, status
);
4944 float_raise(float_flag_invalid
, status
);
4945 return floatx80_default_nan(status
);
4952 if ( bSig
< aSig
) goto aBigger
;
4953 if ( aSig
< bSig
) goto bBigger
;
4954 return packFloatx80(status
->float_rounding_mode
== float_round_down
, 0, 0);
4956 if ( bExp
== 0x7FFF ) {
4957 if ((uint64_t)(bSig
<< 1)) {
4958 return propagateFloatx80NaN(a
, b
, status
);
4960 return packFloatx80(zSign
^ 1, floatx80_infinity_high
,
4961 floatx80_infinity_low
);
4963 if ( aExp
== 0 ) ++expDiff
;
4964 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4966 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4969 goto normalizeRoundAndPack
;
4971 if ( aExp
== 0x7FFF ) {
4972 if ((uint64_t)(aSig
<< 1)) {
4973 return propagateFloatx80NaN(a
, b
, status
);
4977 if ( bExp
== 0 ) --expDiff
;
4978 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4980 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4982 normalizeRoundAndPack
:
4983 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
4984 zSign
, zExp
, zSig0
, zSig1
, status
);
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of adding the extended double-precision floating-point
4989 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4990 | Standard for Binary Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4993 floatx80
floatx80_add(floatx80 a
, floatx80 b
, float_status
*status
)
4997 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
4998 float_raise(float_flag_invalid
, status
);
4999 return floatx80_default_nan(status
);
5001 aSign
= extractFloatx80Sign( a
);
5002 bSign
= extractFloatx80Sign( b
);
5003 if ( aSign
== bSign
) {
5004 return addFloatx80Sigs(a
, b
, aSign
, status
);
5007 return subFloatx80Sigs(a
, b
, aSign
, status
);
5012 /*----------------------------------------------------------------------------
5013 | Returns the result of subtracting the extended double-precision floating-
5014 | point values `a' and `b'. The operation is performed according to the
5015 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5016 *----------------------------------------------------------------------------*/
5018 floatx80
floatx80_sub(floatx80 a
, floatx80 b
, float_status
*status
)
5022 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5023 float_raise(float_flag_invalid
, status
);
5024 return floatx80_default_nan(status
);
5026 aSign
= extractFloatx80Sign( a
);
5027 bSign
= extractFloatx80Sign( b
);
5028 if ( aSign
== bSign
) {
5029 return subFloatx80Sigs(a
, b
, aSign
, status
);
5032 return addFloatx80Sigs(a
, b
, aSign
, status
);
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of multiplying the extended double-precision floating-
5039 | point values `a' and `b'. The operation is performed according to the
5040 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5041 *----------------------------------------------------------------------------*/
5043 floatx80
floatx80_mul(floatx80 a
, floatx80 b
, float_status
*status
)
5045 flag aSign
, bSign
, zSign
;
5046 int32_t aExp
, bExp
, zExp
;
5047 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5049 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5050 float_raise(float_flag_invalid
, status
);
5051 return floatx80_default_nan(status
);
5053 aSig
= extractFloatx80Frac( a
);
5054 aExp
= extractFloatx80Exp( a
);
5055 aSign
= extractFloatx80Sign( a
);
5056 bSig
= extractFloatx80Frac( b
);
5057 bExp
= extractFloatx80Exp( b
);
5058 bSign
= extractFloatx80Sign( b
);
5059 zSign
= aSign
^ bSign
;
5060 if ( aExp
== 0x7FFF ) {
5061 if ( (uint64_t) ( aSig
<<1 )
5062 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5063 return propagateFloatx80NaN(a
, b
, status
);
5065 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5066 return packFloatx80(zSign
, floatx80_infinity_high
,
5067 floatx80_infinity_low
);
5069 if ( bExp
== 0x7FFF ) {
5070 if ((uint64_t)(bSig
<< 1)) {
5071 return propagateFloatx80NaN(a
, b
, status
);
5073 if ( ( aExp
| aSig
) == 0 ) {
5075 float_raise(float_flag_invalid
, status
);
5076 return floatx80_default_nan(status
);
5078 return packFloatx80(zSign
, floatx80_infinity_high
,
5079 floatx80_infinity_low
);
5082 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5083 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5086 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5087 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5089 zExp
= aExp
+ bExp
- 0x3FFE;
5090 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5091 if ( 0 < (int64_t) zSig0
) {
5092 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5095 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5096 zSign
, zExp
, zSig0
, zSig1
, status
);
5099 /*----------------------------------------------------------------------------
5100 | Returns the result of dividing the extended double-precision floating-point
5101 | value `a' by the corresponding value `b'. The operation is performed
5102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5103 *----------------------------------------------------------------------------*/
5105 floatx80
floatx80_div(floatx80 a
, floatx80 b
, float_status
*status
)
5107 flag aSign
, bSign
, zSign
;
5108 int32_t aExp
, bExp
, zExp
;
5109 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5110 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5112 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5113 float_raise(float_flag_invalid
, status
);
5114 return floatx80_default_nan(status
);
5116 aSig
= extractFloatx80Frac( a
);
5117 aExp
= extractFloatx80Exp( a
);
5118 aSign
= extractFloatx80Sign( a
);
5119 bSig
= extractFloatx80Frac( b
);
5120 bExp
= extractFloatx80Exp( b
);
5121 bSign
= extractFloatx80Sign( b
);
5122 zSign
= aSign
^ bSign
;
5123 if ( aExp
== 0x7FFF ) {
5124 if ((uint64_t)(aSig
<< 1)) {
5125 return propagateFloatx80NaN(a
, b
, status
);
5127 if ( bExp
== 0x7FFF ) {
5128 if ((uint64_t)(bSig
<< 1)) {
5129 return propagateFloatx80NaN(a
, b
, status
);
5133 return packFloatx80(zSign
, floatx80_infinity_high
,
5134 floatx80_infinity_low
);
5136 if ( bExp
== 0x7FFF ) {
5137 if ((uint64_t)(bSig
<< 1)) {
5138 return propagateFloatx80NaN(a
, b
, status
);
5140 return packFloatx80( zSign
, 0, 0 );
5144 if ( ( aExp
| aSig
) == 0 ) {
5146 float_raise(float_flag_invalid
, status
);
5147 return floatx80_default_nan(status
);
5149 float_raise(float_flag_divbyzero
, status
);
5150 return packFloatx80(zSign
, floatx80_infinity_high
,
5151 floatx80_infinity_low
);
5153 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5156 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5157 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5159 zExp
= aExp
- bExp
+ 0x3FFE;
5161 if ( bSig
<= aSig
) {
5162 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5165 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5166 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5167 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5168 while ( (int64_t) rem0
< 0 ) {
5170 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5172 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5173 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5174 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5175 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5176 while ( (int64_t) rem1
< 0 ) {
5178 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5180 zSig1
|= ( ( rem1
| rem2
) != 0 );
5182 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5183 zSign
, zExp
, zSig0
, zSig1
, status
);
5186 /*----------------------------------------------------------------------------
5187 | Returns the remainder of the extended double-precision floating-point value
5188 | `a' with respect to the corresponding value `b'. The operation is performed
5189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5190 *----------------------------------------------------------------------------*/
5192 floatx80
floatx80_rem(floatx80 a
, floatx80 b
, float_status
*status
)
5195 int32_t aExp
, bExp
, expDiff
;
5196 uint64_t aSig0
, aSig1
, bSig
;
5197 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5199 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5200 float_raise(float_flag_invalid
, status
);
5201 return floatx80_default_nan(status
);
5203 aSig0
= extractFloatx80Frac( a
);
5204 aExp
= extractFloatx80Exp( a
);
5205 aSign
= extractFloatx80Sign( a
);
5206 bSig
= extractFloatx80Frac( b
);
5207 bExp
= extractFloatx80Exp( b
);
5208 if ( aExp
== 0x7FFF ) {
5209 if ( (uint64_t) ( aSig0
<<1 )
5210 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5211 return propagateFloatx80NaN(a
, b
, status
);
5215 if ( bExp
== 0x7FFF ) {
5216 if ((uint64_t)(bSig
<< 1)) {
5217 return propagateFloatx80NaN(a
, b
, status
);
5224 float_raise(float_flag_invalid
, status
);
5225 return floatx80_default_nan(status
);
5227 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5230 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5231 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5233 bSig
|= LIT64( 0x8000000000000000 );
5235 expDiff
= aExp
- bExp
;
5237 if ( expDiff
< 0 ) {
5238 if ( expDiff
< -1 ) return a
;
5239 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5242 q
= ( bSig
<= aSig0
);
5243 if ( q
) aSig0
-= bSig
;
5245 while ( 0 < expDiff
) {
5246 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5247 q
= ( 2 < q
) ? q
- 2 : 0;
5248 mul64To128( bSig
, q
, &term0
, &term1
);
5249 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5250 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5254 if ( 0 < expDiff
) {
5255 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5256 q
= ( 2 < q
) ? q
- 2 : 0;
5258 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5259 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5260 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5261 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5263 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5270 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5271 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5272 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5275 aSig0
= alternateASig0
;
5276 aSig1
= alternateASig1
;
5280 normalizeRoundAndPackFloatx80(
5281 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
, status
);
5285 /*----------------------------------------------------------------------------
5286 | Returns the square root of the extended double-precision floating-point
5287 | value `a'. The operation is performed according to the IEC/IEEE Standard
5288 | for Binary Floating-Point Arithmetic.
5289 *----------------------------------------------------------------------------*/
5291 floatx80
floatx80_sqrt(floatx80 a
, float_status
*status
)
5295 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5296 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5298 if (floatx80_invalid_encoding(a
)) {
5299 float_raise(float_flag_invalid
, status
);
5300 return floatx80_default_nan(status
);
5302 aSig0
= extractFloatx80Frac( a
);
5303 aExp
= extractFloatx80Exp( a
);
5304 aSign
= extractFloatx80Sign( a
);
5305 if ( aExp
== 0x7FFF ) {
5306 if ((uint64_t)(aSig0
<< 1)) {
5307 return propagateFloatx80NaN(a
, a
, status
);
5309 if ( ! aSign
) return a
;
5313 if ( ( aExp
| aSig0
) == 0 ) return a
;
5315 float_raise(float_flag_invalid
, status
);
5316 return floatx80_default_nan(status
);
5319 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5320 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5322 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5323 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5324 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5325 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5326 doubleZSig0
= zSig0
<<1;
5327 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5328 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5329 while ( (int64_t) rem0
< 0 ) {
5332 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5334 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5335 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5336 if ( zSig1
== 0 ) zSig1
= 1;
5337 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5338 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5339 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5340 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5341 while ( (int64_t) rem1
< 0 ) {
5343 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5345 term2
|= doubleZSig0
;
5346 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5348 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5350 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5351 zSig0
|= doubleZSig0
;
5352 return roundAndPackFloatx80(status
->floatx80_rounding_precision
,
5353 0, zExp
, zSig0
, zSig1
, status
);
5356 /*----------------------------------------------------------------------------
5357 | Returns 1 if the extended double-precision floating-point value `a' is equal
5358 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5359 | raised if either operand is a NaN. Otherwise, the comparison is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5363 int floatx80_eq(floatx80 a
, floatx80 b
, float_status
*status
)
5366 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5367 || (extractFloatx80Exp(a
) == 0x7FFF
5368 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5369 || (extractFloatx80Exp(b
) == 0x7FFF
5370 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5372 float_raise(float_flag_invalid
, status
);
5377 && ( ( a
.high
== b
.high
)
5379 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5384 /*----------------------------------------------------------------------------
5385 | Returns 1 if the extended double-precision floating-point value `a' is
5386 | less than or equal to the corresponding value `b', and 0 otherwise. The
5387 | invalid exception is raised if either operand is a NaN. The comparison is
5388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5390 *----------------------------------------------------------------------------*/
5392 int floatx80_le(floatx80 a
, floatx80 b
, float_status
*status
)
5396 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5397 || (extractFloatx80Exp(a
) == 0x7FFF
5398 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5399 || (extractFloatx80Exp(b
) == 0x7FFF
5400 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5402 float_raise(float_flag_invalid
, status
);
5405 aSign
= extractFloatx80Sign( a
);
5406 bSign
= extractFloatx80Sign( b
);
5407 if ( aSign
!= bSign
) {
5410 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5414 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5415 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5419 /*----------------------------------------------------------------------------
5420 | Returns 1 if the extended double-precision floating-point value `a' is
5421 | less than the corresponding value `b', and 0 otherwise. The invalid
5422 | exception is raised if either operand is a NaN. The comparison is performed
5423 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5424 *----------------------------------------------------------------------------*/
5426 int floatx80_lt(floatx80 a
, floatx80 b
, float_status
*status
)
5430 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5431 || (extractFloatx80Exp(a
) == 0x7FFF
5432 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5433 || (extractFloatx80Exp(b
) == 0x7FFF
5434 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5436 float_raise(float_flag_invalid
, status
);
5439 aSign
= extractFloatx80Sign( a
);
5440 bSign
= extractFloatx80Sign( b
);
5441 if ( aSign
!= bSign
) {
5444 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5448 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5449 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5453 /*----------------------------------------------------------------------------
5454 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5455 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5456 | either operand is a NaN. The comparison is performed according to the
5457 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5458 *----------------------------------------------------------------------------*/
5459 int floatx80_unordered(floatx80 a
, floatx80 b
, float_status
*status
)
5461 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)
5462 || (extractFloatx80Exp(a
) == 0x7FFF
5463 && (uint64_t) (extractFloatx80Frac(a
) << 1))
5464 || (extractFloatx80Exp(b
) == 0x7FFF
5465 && (uint64_t) (extractFloatx80Frac(b
) << 1))
5467 float_raise(float_flag_invalid
, status
);
5473 /*----------------------------------------------------------------------------
5474 | Returns 1 if the extended double-precision floating-point value `a' is
5475 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5476 | cause an exception. The comparison is performed according to the IEC/IEEE
5477 | Standard for Binary Floating-Point Arithmetic.
5478 *----------------------------------------------------------------------------*/
5480 int floatx80_eq_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5483 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5484 float_raise(float_flag_invalid
, status
);
5487 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5488 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5489 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5490 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5492 if (floatx80_is_signaling_nan(a
, status
)
5493 || floatx80_is_signaling_nan(b
, status
)) {
5494 float_raise(float_flag_invalid
, status
);
5500 && ( ( a
.high
== b
.high
)
5502 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5507 /*----------------------------------------------------------------------------
5508 | Returns 1 if the extended double-precision floating-point value `a' is less
5509 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5510 | do not cause an exception. Otherwise, the comparison is performed according
5511 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5512 *----------------------------------------------------------------------------*/
5514 int floatx80_le_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5518 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5519 float_raise(float_flag_invalid
, status
);
5522 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5523 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5524 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5525 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5527 if (floatx80_is_signaling_nan(a
, status
)
5528 || floatx80_is_signaling_nan(b
, status
)) {
5529 float_raise(float_flag_invalid
, status
);
5533 aSign
= extractFloatx80Sign( a
);
5534 bSign
= extractFloatx80Sign( b
);
5535 if ( aSign
!= bSign
) {
5538 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5542 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5543 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5547 /*----------------------------------------------------------------------------
5548 | Returns 1 if the extended double-precision floating-point value `a' is less
5549 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5550 | an exception. Otherwise, the comparison is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5554 int floatx80_lt_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5558 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5559 float_raise(float_flag_invalid
, status
);
5562 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5563 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5564 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5565 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5567 if (floatx80_is_signaling_nan(a
, status
)
5568 || floatx80_is_signaling_nan(b
, status
)) {
5569 float_raise(float_flag_invalid
, status
);
5573 aSign
= extractFloatx80Sign( a
);
5574 bSign
= extractFloatx80Sign( b
);
5575 if ( aSign
!= bSign
) {
5578 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5582 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5583 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5587 /*----------------------------------------------------------------------------
5588 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5589 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5590 | The comparison is performed according to the IEC/IEEE Standard for Binary
5591 | Floating-Point Arithmetic.
5592 *----------------------------------------------------------------------------*/
5593 int floatx80_unordered_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
5595 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
5596 float_raise(float_flag_invalid
, status
);
5599 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5600 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5601 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5602 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5604 if (floatx80_is_signaling_nan(a
, status
)
5605 || floatx80_is_signaling_nan(b
, status
)) {
5606 float_raise(float_flag_invalid
, status
);
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the quadruple-precision floating-point
5615 | value `a' to the 32-bit two's complement integer format. The conversion
5616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5617 | Arithmetic---which means in particular that the conversion is rounded
5618 | according to the current rounding mode. If `a' is a NaN, the largest
5619 | positive integer is returned. Otherwise, if the conversion overflows, the
5620 | largest integer with the same sign as `a' is returned.
5621 *----------------------------------------------------------------------------*/
5623 int32_t float128_to_int32(float128 a
, float_status
*status
)
5626 int32_t aExp
, shiftCount
;
5627 uint64_t aSig0
, aSig1
;
5629 aSig1
= extractFloat128Frac1( a
);
5630 aSig0
= extractFloat128Frac0( a
);
5631 aExp
= extractFloat128Exp( a
);
5632 aSign
= extractFloat128Sign( a
);
5633 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5634 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5635 aSig0
|= ( aSig1
!= 0 );
5636 shiftCount
= 0x4028 - aExp
;
5637 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5638 return roundAndPackInt32(aSign
, aSig0
, status
);
5642 /*----------------------------------------------------------------------------
5643 | Returns the result of converting the quadruple-precision floating-point
5644 | value `a' to the 32-bit two's complement integer format. The conversion
5645 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5646 | Arithmetic, except that the conversion is always rounded toward zero. If
5647 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5648 | conversion overflows, the largest integer with the same sign as `a' is
5650 *----------------------------------------------------------------------------*/
5652 int32_t float128_to_int32_round_to_zero(float128 a
, float_status
*status
)
5655 int32_t aExp
, shiftCount
;
5656 uint64_t aSig0
, aSig1
, savedASig
;
5659 aSig1
= extractFloat128Frac1( a
);
5660 aSig0
= extractFloat128Frac0( a
);
5661 aExp
= extractFloat128Exp( a
);
5662 aSign
= extractFloat128Sign( a
);
5663 aSig0
|= ( aSig1
!= 0 );
5664 if ( 0x401E < aExp
) {
5665 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5668 else if ( aExp
< 0x3FFF ) {
5669 if (aExp
|| aSig0
) {
5670 status
->float_exception_flags
|= float_flag_inexact
;
5674 aSig0
|= LIT64( 0x0001000000000000 );
5675 shiftCount
= 0x402F - aExp
;
5677 aSig0
>>= shiftCount
;
5679 if ( aSign
) z
= - z
;
5680 if ( ( z
< 0 ) ^ aSign
) {
5682 float_raise(float_flag_invalid
, status
);
5683 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5685 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5686 status
->float_exception_flags
|= float_flag_inexact
;
5692 /*----------------------------------------------------------------------------
5693 | Returns the result of converting the quadruple-precision floating-point
5694 | value `a' to the 64-bit two's complement integer format. The conversion
5695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696 | Arithmetic---which means in particular that the conversion is rounded
5697 | according to the current rounding mode. If `a' is a NaN, the largest
5698 | positive integer is returned. Otherwise, if the conversion overflows, the
5699 | largest integer with the same sign as `a' is returned.
5700 *----------------------------------------------------------------------------*/
5702 int64_t float128_to_int64(float128 a
, float_status
*status
)
5705 int32_t aExp
, shiftCount
;
5706 uint64_t aSig0
, aSig1
;
5708 aSig1
= extractFloat128Frac1( a
);
5709 aSig0
= extractFloat128Frac0( a
);
5710 aExp
= extractFloat128Exp( a
);
5711 aSign
= extractFloat128Sign( a
);
5712 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5713 shiftCount
= 0x402F - aExp
;
5714 if ( shiftCount
<= 0 ) {
5715 if ( 0x403E < aExp
) {
5716 float_raise(float_flag_invalid
, status
);
5718 || ( ( aExp
== 0x7FFF )
5719 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5722 return LIT64( 0x7FFFFFFFFFFFFFFF );
5724 return (int64_t) LIT64( 0x8000000000000000 );
5726 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5729 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5731 return roundAndPackInt64(aSign
, aSig0
, aSig1
, status
);
5735 /*----------------------------------------------------------------------------
5736 | Returns the result of converting the quadruple-precision floating-point
5737 | value `a' to the 64-bit two's complement integer format. The conversion
5738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5739 | Arithmetic, except that the conversion is always rounded toward zero.
5740 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5741 | the conversion overflows, the largest integer with the same sign as `a' is
5743 *----------------------------------------------------------------------------*/
5745 int64_t float128_to_int64_round_to_zero(float128 a
, float_status
*status
)
5748 int32_t aExp
, shiftCount
;
5749 uint64_t aSig0
, aSig1
;
5752 aSig1
= extractFloat128Frac1( a
);
5753 aSig0
= extractFloat128Frac0( a
);
5754 aExp
= extractFloat128Exp( a
);
5755 aSign
= extractFloat128Sign( a
);
5756 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5757 shiftCount
= aExp
- 0x402F;
5758 if ( 0 < shiftCount
) {
5759 if ( 0x403E <= aExp
) {
5760 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5761 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5762 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5764 status
->float_exception_flags
|= float_flag_inexact
;
5768 float_raise(float_flag_invalid
, status
);
5769 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5770 return LIT64( 0x7FFFFFFFFFFFFFFF );
5773 return (int64_t) LIT64( 0x8000000000000000 );
5775 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5776 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5777 status
->float_exception_flags
|= float_flag_inexact
;
5781 if ( aExp
< 0x3FFF ) {
5782 if ( aExp
| aSig0
| aSig1
) {
5783 status
->float_exception_flags
|= float_flag_inexact
;
5787 z
= aSig0
>>( - shiftCount
);
5789 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5790 status
->float_exception_flags
|= float_flag_inexact
;
5793 if ( aSign
) z
= - z
;
5798 /*----------------------------------------------------------------------------
5799 | Returns the result of converting the quadruple-precision floating-point value
5800 | `a' to the 64-bit unsigned integer format. The conversion is
5801 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5802 | Arithmetic---which means in particular that the conversion is rounded
5803 | according to the current rounding mode. If `a' is a NaN, the largest
5804 | positive integer is returned. If the conversion overflows, the
5805 | largest unsigned integer is returned. If 'a' is negative, the value is
5806 | rounded and zero is returned; negative values that do not round to zero
5807 | will raise the inexact exception.
5808 *----------------------------------------------------------------------------*/
5810 uint64_t float128_to_uint64(float128 a
, float_status
*status
)
5815 uint64_t aSig0
, aSig1
;
5817 aSig0
= extractFloat128Frac0(a
);
5818 aSig1
= extractFloat128Frac1(a
);
5819 aExp
= extractFloat128Exp(a
);
5820 aSign
= extractFloat128Sign(a
);
5821 if (aSign
&& (aExp
> 0x3FFE)) {
5822 float_raise(float_flag_invalid
, status
);
5823 if (float128_is_any_nan(a
)) {
5824 return LIT64(0xFFFFFFFFFFFFFFFF);
5830 aSig0
|= LIT64(0x0001000000000000);
5832 shiftCount
= 0x402F - aExp
;
5833 if (shiftCount
<= 0) {
5834 if (0x403E < aExp
) {
5835 float_raise(float_flag_invalid
, status
);
5836 return LIT64(0xFFFFFFFFFFFFFFFF);
5838 shortShift128Left(aSig0
, aSig1
, -shiftCount
, &aSig0
, &aSig1
);
5840 shift64ExtraRightJamming(aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5842 return roundAndPackUint64(aSign
, aSig0
, aSig1
, status
);
5845 uint64_t float128_to_uint64_round_to_zero(float128 a
, float_status
*status
)
5848 signed char current_rounding_mode
= status
->float_rounding_mode
;
5850 set_float_rounding_mode(float_round_to_zero
, status
);
5851 v
= float128_to_uint64(a
, status
);
5852 set_float_rounding_mode(current_rounding_mode
, status
);
5857 /*----------------------------------------------------------------------------
5858 | Returns the result of converting the quadruple-precision floating-point
5859 | value `a' to the 32-bit unsigned integer format. The conversion
5860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5861 | Arithmetic except that the conversion is always rounded toward zero.
5862 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5863 | if the conversion overflows, the largest unsigned integer is returned.
5864 | If 'a' is negative, the value is rounded and zero is returned; negative
5865 | values that do not round to zero will raise the inexact exception.
5866 *----------------------------------------------------------------------------*/
5868 uint32_t float128_to_uint32_round_to_zero(float128 a
, float_status
*status
)
5872 int old_exc_flags
= get_float_exception_flags(status
);
5874 v
= float128_to_uint64_round_to_zero(a
, status
);
5875 if (v
> 0xffffffff) {
5880 set_float_exception_flags(old_exc_flags
, status
);
5881 float_raise(float_flag_invalid
, status
);
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of converting the quadruple-precision floating-point
5887 | value `a' to the single-precision floating-point format. The conversion
5888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5890 *----------------------------------------------------------------------------*/
5892 float32
float128_to_float32(float128 a
, float_status
*status
)
5896 uint64_t aSig0
, aSig1
;
5899 aSig1
= extractFloat128Frac1( a
);
5900 aSig0
= extractFloat128Frac0( a
);
5901 aExp
= extractFloat128Exp( a
);
5902 aSign
= extractFloat128Sign( a
);
5903 if ( aExp
== 0x7FFF ) {
5904 if ( aSig0
| aSig1
) {
5905 return commonNaNToFloat32(float128ToCommonNaN(a
, status
), status
);
5907 return packFloat32( aSign
, 0xFF, 0 );
5909 aSig0
|= ( aSig1
!= 0 );
5910 shift64RightJamming( aSig0
, 18, &aSig0
);
5912 if ( aExp
|| zSig
) {
5916 return roundAndPackFloat32(aSign
, aExp
, zSig
, status
);
5920 /*----------------------------------------------------------------------------
5921 | Returns the result of converting the quadruple-precision floating-point
5922 | value `a' to the double-precision floating-point format. The conversion
5923 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5925 *----------------------------------------------------------------------------*/
5927 float64
float128_to_float64(float128 a
, float_status
*status
)
5931 uint64_t aSig0
, aSig1
;
5933 aSig1
= extractFloat128Frac1( a
);
5934 aSig0
= extractFloat128Frac0( a
);
5935 aExp
= extractFloat128Exp( a
);
5936 aSign
= extractFloat128Sign( a
);
5937 if ( aExp
== 0x7FFF ) {
5938 if ( aSig0
| aSig1
) {
5939 return commonNaNToFloat64(float128ToCommonNaN(a
, status
), status
);
5941 return packFloat64( aSign
, 0x7FF, 0 );
5943 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5944 aSig0
|= ( aSig1
!= 0 );
5945 if ( aExp
|| aSig0
) {
5946 aSig0
|= LIT64( 0x4000000000000000 );
5949 return roundAndPackFloat64(aSign
, aExp
, aSig0
, status
);
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of converting the quadruple-precision floating-point
5955 | value `a' to the extended double-precision floating-point format. The
5956 | conversion is performed according to the IEC/IEEE Standard for Binary
5957 | Floating-Point Arithmetic.
5958 *----------------------------------------------------------------------------*/
5960 floatx80
float128_to_floatx80(float128 a
, float_status
*status
)
5964 uint64_t aSig0
, aSig1
;
5966 aSig1
= extractFloat128Frac1( a
);
5967 aSig0
= extractFloat128Frac0( a
);
5968 aExp
= extractFloat128Exp( a
);
5969 aSign
= extractFloat128Sign( a
);
5970 if ( aExp
== 0x7FFF ) {
5971 if ( aSig0
| aSig1
) {
5972 return commonNaNToFloatx80(float128ToCommonNaN(a
, status
), status
);
5974 return packFloatx80(aSign
, floatx80_infinity_high
,
5975 floatx80_infinity_low
);
5978 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5979 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5982 aSig0
|= LIT64( 0x0001000000000000 );
5984 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5985 return roundAndPackFloatx80(80, aSign
, aExp
, aSig0
, aSig1
, status
);
5989 /*----------------------------------------------------------------------------
5990 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5991 | returns the result as a quadruple-precision floating-point value. The
5992 | operation is performed according to the IEC/IEEE Standard for Binary
5993 | Floating-Point Arithmetic.
5994 *----------------------------------------------------------------------------*/
5996 float128
float128_round_to_int(float128 a
, float_status
*status
)
6000 uint64_t lastBitMask
, roundBitsMask
;
6003 aExp
= extractFloat128Exp( a
);
6004 if ( 0x402F <= aExp
) {
6005 if ( 0x406F <= aExp
) {
6006 if ( ( aExp
== 0x7FFF )
6007 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6009 return propagateFloat128NaN(a
, a
, status
);
6014 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6015 roundBitsMask
= lastBitMask
- 1;
6017 switch (status
->float_rounding_mode
) {
6018 case float_round_nearest_even
:
6019 if ( lastBitMask
) {
6020 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6021 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6024 if ( (int64_t) z
.low
< 0 ) {
6026 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6030 case float_round_ties_away
:
6032 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6034 if ((int64_t) z
.low
< 0) {
6039 case float_round_to_zero
:
6041 case float_round_up
:
6042 if (!extractFloat128Sign(z
)) {
6043 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6046 case float_round_down
:
6047 if (extractFloat128Sign(z
)) {
6048 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6054 z
.low
&= ~ roundBitsMask
;
6057 if ( aExp
< 0x3FFF ) {
6058 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6059 status
->float_exception_flags
|= float_flag_inexact
;
6060 aSign
= extractFloat128Sign( a
);
6061 switch (status
->float_rounding_mode
) {
6062 case float_round_nearest_even
:
6063 if ( ( aExp
== 0x3FFE )
6064 && ( extractFloat128Frac0( a
)
6065 | extractFloat128Frac1( a
) )
6067 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6070 case float_round_ties_away
:
6071 if (aExp
== 0x3FFE) {
6072 return packFloat128(aSign
, 0x3FFF, 0, 0);
6075 case float_round_down
:
6077 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6078 : packFloat128( 0, 0, 0, 0 );
6079 case float_round_up
:
6081 aSign
? packFloat128( 1, 0, 0, 0 )
6082 : packFloat128( 0, 0x3FFF, 0, 0 );
6084 return packFloat128( aSign
, 0, 0, 0 );
6087 lastBitMask
<<= 0x402F - aExp
;
6088 roundBitsMask
= lastBitMask
- 1;
6091 switch (status
->float_rounding_mode
) {
6092 case float_round_nearest_even
:
6093 z
.high
+= lastBitMask
>>1;
6094 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6095 z
.high
&= ~ lastBitMask
;
6098 case float_round_ties_away
:
6099 z
.high
+= lastBitMask
>>1;
6101 case float_round_to_zero
:
6103 case float_round_up
:
6104 if (!extractFloat128Sign(z
)) {
6105 z
.high
|= ( a
.low
!= 0 );
6106 z
.high
+= roundBitsMask
;
6109 case float_round_down
:
6110 if (extractFloat128Sign(z
)) {
6111 z
.high
|= (a
.low
!= 0);
6112 z
.high
+= roundBitsMask
;
6118 z
.high
&= ~ roundBitsMask
;
6120 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6121 status
->float_exception_flags
|= float_flag_inexact
;
6127 /*----------------------------------------------------------------------------
6128 | Returns the result of adding the absolute values of the quadruple-precision
6129 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6130 | before being returned. `zSign' is ignored if the result is a NaN.
6131 | The addition is performed according to the IEC/IEEE Standard for Binary
6132 | Floating-Point Arithmetic.
6133 *----------------------------------------------------------------------------*/
6135 static float128
addFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6136 float_status
*status
)
6138 int32_t aExp
, bExp
, zExp
;
6139 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6142 aSig1
= extractFloat128Frac1( a
);
6143 aSig0
= extractFloat128Frac0( a
);
6144 aExp
= extractFloat128Exp( a
);
6145 bSig1
= extractFloat128Frac1( b
);
6146 bSig0
= extractFloat128Frac0( b
);
6147 bExp
= extractFloat128Exp( b
);
6148 expDiff
= aExp
- bExp
;
6149 if ( 0 < expDiff
) {
6150 if ( aExp
== 0x7FFF ) {
6151 if (aSig0
| aSig1
) {
6152 return propagateFloat128NaN(a
, b
, status
);
6160 bSig0
|= LIT64( 0x0001000000000000 );
6162 shift128ExtraRightJamming(
6163 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6166 else if ( expDiff
< 0 ) {
6167 if ( bExp
== 0x7FFF ) {
6168 if (bSig0
| bSig1
) {
6169 return propagateFloat128NaN(a
, b
, status
);
6171 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6177 aSig0
|= LIT64( 0x0001000000000000 );
6179 shift128ExtraRightJamming(
6180 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6184 if ( aExp
== 0x7FFF ) {
6185 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6186 return propagateFloat128NaN(a
, b
, status
);
6190 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6192 if (status
->flush_to_zero
) {
6193 if (zSig0
| zSig1
) {
6194 float_raise(float_flag_output_denormal
, status
);
6196 return packFloat128(zSign
, 0, 0, 0);
6198 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6201 zSig0
|= LIT64( 0x0002000000000000 );
6205 aSig0
|= LIT64( 0x0001000000000000 );
6206 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6208 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6211 shift128ExtraRightJamming(
6212 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6214 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6218 /*----------------------------------------------------------------------------
6219 | Returns the result of subtracting the absolute values of the quadruple-
6220 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6221 | difference is negated before being returned. `zSign' is ignored if the
6222 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6223 | Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6226 static float128
subFloat128Sigs(float128 a
, float128 b
, flag zSign
,
6227 float_status
*status
)
6229 int32_t aExp
, bExp
, zExp
;
6230 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6233 aSig1
= extractFloat128Frac1( a
);
6234 aSig0
= extractFloat128Frac0( a
);
6235 aExp
= extractFloat128Exp( a
);
6236 bSig1
= extractFloat128Frac1( b
);
6237 bSig0
= extractFloat128Frac0( b
);
6238 bExp
= extractFloat128Exp( b
);
6239 expDiff
= aExp
- bExp
;
6240 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6241 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6242 if ( 0 < expDiff
) goto aExpBigger
;
6243 if ( expDiff
< 0 ) goto bExpBigger
;
6244 if ( aExp
== 0x7FFF ) {
6245 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6246 return propagateFloat128NaN(a
, b
, status
);
6248 float_raise(float_flag_invalid
, status
);
6249 return float128_default_nan(status
);
6255 if ( bSig0
< aSig0
) goto aBigger
;
6256 if ( aSig0
< bSig0
) goto bBigger
;
6257 if ( bSig1
< aSig1
) goto aBigger
;
6258 if ( aSig1
< bSig1
) goto bBigger
;
6259 return packFloat128(status
->float_rounding_mode
== float_round_down
,
6262 if ( bExp
== 0x7FFF ) {
6263 if (bSig0
| bSig1
) {
6264 return propagateFloat128NaN(a
, b
, status
);
6266 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6272 aSig0
|= LIT64( 0x4000000000000000 );
6274 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6275 bSig0
|= LIT64( 0x4000000000000000 );
6277 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6280 goto normalizeRoundAndPack
;
6282 if ( aExp
== 0x7FFF ) {
6283 if (aSig0
| aSig1
) {
6284 return propagateFloat128NaN(a
, b
, status
);
6292 bSig0
|= LIT64( 0x4000000000000000 );
6294 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6295 aSig0
|= LIT64( 0x4000000000000000 );
6297 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6299 normalizeRoundAndPack
:
6301 return normalizeRoundAndPackFloat128(zSign
, zExp
- 14, zSig0
, zSig1
,
6306 /*----------------------------------------------------------------------------
6307 | Returns the result of adding the quadruple-precision floating-point values
6308 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6309 | for Binary Floating-Point Arithmetic.
6310 *----------------------------------------------------------------------------*/
6312 float128
float128_add(float128 a
, float128 b
, float_status
*status
)
6316 aSign
= extractFloat128Sign( a
);
6317 bSign
= extractFloat128Sign( b
);
6318 if ( aSign
== bSign
) {
6319 return addFloat128Sigs(a
, b
, aSign
, status
);
6322 return subFloat128Sigs(a
, b
, aSign
, status
);
6327 /*----------------------------------------------------------------------------
6328 | Returns the result of subtracting the quadruple-precision floating-point
6329 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6330 | Standard for Binary Floating-Point Arithmetic.
6331 *----------------------------------------------------------------------------*/
6333 float128
float128_sub(float128 a
, float128 b
, float_status
*status
)
6337 aSign
= extractFloat128Sign( a
);
6338 bSign
= extractFloat128Sign( b
);
6339 if ( aSign
== bSign
) {
6340 return subFloat128Sigs(a
, b
, aSign
, status
);
6343 return addFloat128Sigs(a
, b
, aSign
, status
);
6348 /*----------------------------------------------------------------------------
6349 | Returns the result of multiplying the quadruple-precision floating-point
6350 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6351 | Standard for Binary Floating-Point Arithmetic.
6352 *----------------------------------------------------------------------------*/
6354 float128
float128_mul(float128 a
, float128 b
, float_status
*status
)
6356 flag aSign
, bSign
, zSign
;
6357 int32_t aExp
, bExp
, zExp
;
6358 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6360 aSig1
= extractFloat128Frac1( a
);
6361 aSig0
= extractFloat128Frac0( a
);
6362 aExp
= extractFloat128Exp( a
);
6363 aSign
= extractFloat128Sign( a
);
6364 bSig1
= extractFloat128Frac1( b
);
6365 bSig0
= extractFloat128Frac0( b
);
6366 bExp
= extractFloat128Exp( b
);
6367 bSign
= extractFloat128Sign( b
);
6368 zSign
= aSign
^ bSign
;
6369 if ( aExp
== 0x7FFF ) {
6370 if ( ( aSig0
| aSig1
)
6371 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6372 return propagateFloat128NaN(a
, b
, status
);
6374 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6375 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6377 if ( bExp
== 0x7FFF ) {
6378 if (bSig0
| bSig1
) {
6379 return propagateFloat128NaN(a
, b
, status
);
6381 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6383 float_raise(float_flag_invalid
, status
);
6384 return float128_default_nan(status
);
6386 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6389 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6390 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6393 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6394 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6396 zExp
= aExp
+ bExp
- 0x4000;
6397 aSig0
|= LIT64( 0x0001000000000000 );
6398 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6399 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6400 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6401 zSig2
|= ( zSig3
!= 0 );
6402 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6403 shift128ExtraRightJamming(
6404 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6407 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6411 /*----------------------------------------------------------------------------
6412 | Returns the result of dividing the quadruple-precision floating-point value
6413 | `a' by the corresponding value `b'. The operation is performed according to
6414 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415 *----------------------------------------------------------------------------*/
6417 float128
float128_div(float128 a
, float128 b
, float_status
*status
)
6419 flag aSign
, bSign
, zSign
;
6420 int32_t aExp
, bExp
, zExp
;
6421 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6422 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6424 aSig1
= extractFloat128Frac1( a
);
6425 aSig0
= extractFloat128Frac0( a
);
6426 aExp
= extractFloat128Exp( a
);
6427 aSign
= extractFloat128Sign( a
);
6428 bSig1
= extractFloat128Frac1( b
);
6429 bSig0
= extractFloat128Frac0( b
);
6430 bExp
= extractFloat128Exp( b
);
6431 bSign
= extractFloat128Sign( b
);
6432 zSign
= aSign
^ bSign
;
6433 if ( aExp
== 0x7FFF ) {
6434 if (aSig0
| aSig1
) {
6435 return propagateFloat128NaN(a
, b
, status
);
6437 if ( bExp
== 0x7FFF ) {
6438 if (bSig0
| bSig1
) {
6439 return propagateFloat128NaN(a
, b
, status
);
6443 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6445 if ( bExp
== 0x7FFF ) {
6446 if (bSig0
| bSig1
) {
6447 return propagateFloat128NaN(a
, b
, status
);
6449 return packFloat128( zSign
, 0, 0, 0 );
6452 if ( ( bSig0
| bSig1
) == 0 ) {
6453 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6455 float_raise(float_flag_invalid
, status
);
6456 return float128_default_nan(status
);
6458 float_raise(float_flag_divbyzero
, status
);
6459 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6461 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6464 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6465 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6467 zExp
= aExp
- bExp
+ 0x3FFD;
6469 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6471 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6472 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6473 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6476 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6477 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6478 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6479 while ( (int64_t) rem0
< 0 ) {
6481 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6483 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6484 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6485 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6486 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6487 while ( (int64_t) rem1
< 0 ) {
6489 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6491 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6493 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6494 return roundAndPackFloat128(zSign
, zExp
, zSig0
, zSig1
, zSig2
, status
);
6498 /*----------------------------------------------------------------------------
6499 | Returns the remainder of the quadruple-precision floating-point value `a'
6500 | with respect to the corresponding value `b'. The operation is performed
6501 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6504 float128
float128_rem(float128 a
, float128 b
, float_status
*status
)
6507 int32_t aExp
, bExp
, expDiff
;
6508 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6509 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6512 aSig1
= extractFloat128Frac1( a
);
6513 aSig0
= extractFloat128Frac0( a
);
6514 aExp
= extractFloat128Exp( a
);
6515 aSign
= extractFloat128Sign( a
);
6516 bSig1
= extractFloat128Frac1( b
);
6517 bSig0
= extractFloat128Frac0( b
);
6518 bExp
= extractFloat128Exp( b
);
6519 if ( aExp
== 0x7FFF ) {
6520 if ( ( aSig0
| aSig1
)
6521 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6522 return propagateFloat128NaN(a
, b
, status
);
6526 if ( bExp
== 0x7FFF ) {
6527 if (bSig0
| bSig1
) {
6528 return propagateFloat128NaN(a
, b
, status
);
6533 if ( ( bSig0
| bSig1
) == 0 ) {
6535 float_raise(float_flag_invalid
, status
);
6536 return float128_default_nan(status
);
6538 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6541 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6542 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6544 expDiff
= aExp
- bExp
;
6545 if ( expDiff
< -1 ) return a
;
6547 aSig0
| LIT64( 0x0001000000000000 ),
6549 15 - ( expDiff
< 0 ),
6554 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6555 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6556 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6558 while ( 0 < expDiff
) {
6559 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6560 q
= ( 4 < q
) ? q
- 4 : 0;
6561 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6562 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6563 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6564 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6567 if ( -64 < expDiff
) {
6568 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6569 q
= ( 4 < q
) ? q
- 4 : 0;
6571 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6573 if ( expDiff
< 0 ) {
6574 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6577 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6579 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6580 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6583 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6584 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6587 alternateASig0
= aSig0
;
6588 alternateASig1
= aSig1
;
6590 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6591 } while ( 0 <= (int64_t) aSig0
);
6593 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6594 if ( ( sigMean0
< 0 )
6595 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6596 aSig0
= alternateASig0
;
6597 aSig1
= alternateASig1
;
6599 zSign
= ( (int64_t) aSig0
< 0 );
6600 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6601 return normalizeRoundAndPackFloat128(aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
,
6605 /*----------------------------------------------------------------------------
6606 | Returns the square root of the quadruple-precision floating-point value `a'.
6607 | The operation is performed according to the IEC/IEEE Standard for Binary
6608 | Floating-Point Arithmetic.
6609 *----------------------------------------------------------------------------*/
6611 float128
float128_sqrt(float128 a
, float_status
*status
)
6615 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6616 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6618 aSig1
= extractFloat128Frac1( a
);
6619 aSig0
= extractFloat128Frac0( a
);
6620 aExp
= extractFloat128Exp( a
);
6621 aSign
= extractFloat128Sign( a
);
6622 if ( aExp
== 0x7FFF ) {
6623 if (aSig0
| aSig1
) {
6624 return propagateFloat128NaN(a
, a
, status
);
6626 if ( ! aSign
) return a
;
6630 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6632 float_raise(float_flag_invalid
, status
);
6633 return float128_default_nan(status
);
6636 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6637 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6639 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6640 aSig0
|= LIT64( 0x0001000000000000 );
6641 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6642 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6643 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6644 doubleZSig0
= zSig0
<<1;
6645 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6646 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6647 while ( (int64_t) rem0
< 0 ) {
6650 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6652 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6653 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6654 if ( zSig1
== 0 ) zSig1
= 1;
6655 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6656 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6657 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6658 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6659 while ( (int64_t) rem1
< 0 ) {
6661 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6663 term2
|= doubleZSig0
;
6664 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6666 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6668 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6669 return roundAndPackFloat128(0, zExp
, zSig0
, zSig1
, zSig2
, status
);
6673 /*----------------------------------------------------------------------------
6674 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6675 | the corresponding value `b', and 0 otherwise. The invalid exception is
6676 | raised if either operand is a NaN. Otherwise, the comparison is performed
6677 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6678 *----------------------------------------------------------------------------*/
6680 int float128_eq(float128 a
, float128 b
, float_status
*status
)
6683 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6684 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6685 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6686 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6688 float_raise(float_flag_invalid
, status
);
6693 && ( ( a
.high
== b
.high
)
6695 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6700 /*----------------------------------------------------------------------------
6701 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6702 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6703 | exception is raised if either operand is a NaN. The comparison is performed
6704 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6705 *----------------------------------------------------------------------------*/
6707 int float128_le(float128 a
, float128 b
, float_status
*status
)
6711 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6712 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6713 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6714 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6716 float_raise(float_flag_invalid
, status
);
6719 aSign
= extractFloat128Sign( a
);
6720 bSign
= extractFloat128Sign( b
);
6721 if ( aSign
!= bSign
) {
6724 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6728 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6729 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6733 /*----------------------------------------------------------------------------
6734 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6735 | the corresponding value `b', and 0 otherwise. The invalid exception is
6736 | raised if either operand is a NaN. The comparison is performed according
6737 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6738 *----------------------------------------------------------------------------*/
6740 int float128_lt(float128 a
, float128 b
, float_status
*status
)
6744 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6745 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6746 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6747 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6749 float_raise(float_flag_invalid
, status
);
6752 aSign
= extractFloat128Sign( a
);
6753 bSign
= extractFloat128Sign( b
);
6754 if ( aSign
!= bSign
) {
6757 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6761 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6762 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6766 /*----------------------------------------------------------------------------
6767 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6768 | be compared, and 0 otherwise. The invalid exception is raised if either
6769 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6770 | Standard for Binary Floating-Point Arithmetic.
6771 *----------------------------------------------------------------------------*/
6773 int float128_unordered(float128 a
, float128 b
, float_status
*status
)
6775 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6776 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6777 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6778 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6780 float_raise(float_flag_invalid
, status
);
6786 /*----------------------------------------------------------------------------
6787 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6788 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6789 | exception. The comparison is performed according to the IEC/IEEE Standard
6790 | for Binary Floating-Point Arithmetic.
6791 *----------------------------------------------------------------------------*/
6793 int float128_eq_quiet(float128 a
, float128 b
, float_status
*status
)
6796 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6797 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6798 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6799 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6801 if (float128_is_signaling_nan(a
, status
)
6802 || float128_is_signaling_nan(b
, status
)) {
6803 float_raise(float_flag_invalid
, status
);
6809 && ( ( a
.high
== b
.high
)
6811 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6816 /*----------------------------------------------------------------------------
6817 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6818 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6819 | cause an exception. Otherwise, the comparison is performed according to the
6820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6821 *----------------------------------------------------------------------------*/
6823 int float128_le_quiet(float128 a
, float128 b
, float_status
*status
)
6827 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6828 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6829 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6830 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6832 if (float128_is_signaling_nan(a
, status
)
6833 || float128_is_signaling_nan(b
, status
)) {
6834 float_raise(float_flag_invalid
, status
);
6838 aSign
= extractFloat128Sign( a
);
6839 bSign
= extractFloat128Sign( b
);
6840 if ( aSign
!= bSign
) {
6843 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6847 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6848 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6852 /*----------------------------------------------------------------------------
6853 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6854 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6855 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6856 | Standard for Binary Floating-Point Arithmetic.
6857 *----------------------------------------------------------------------------*/
6859 int float128_lt_quiet(float128 a
, float128 b
, float_status
*status
)
6863 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6864 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6865 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6866 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6868 if (float128_is_signaling_nan(a
, status
)
6869 || float128_is_signaling_nan(b
, status
)) {
6870 float_raise(float_flag_invalid
, status
);
6874 aSign
= extractFloat128Sign( a
);
6875 bSign
= extractFloat128Sign( b
);
6876 if ( aSign
!= bSign
) {
6879 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6883 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6884 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6888 /*----------------------------------------------------------------------------
6889 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6890 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6891 | comparison is performed according to the IEC/IEEE Standard for Binary
6892 | Floating-Point Arithmetic.
6893 *----------------------------------------------------------------------------*/
6895 int float128_unordered_quiet(float128 a
, float128 b
, float_status
*status
)
6897 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6898 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6899 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6900 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6902 if (float128_is_signaling_nan(a
, status
)
6903 || float128_is_signaling_nan(b
, status
)) {
6904 float_raise(float_flag_invalid
, status
);
6911 static inline int floatx80_compare_internal(floatx80 a
, floatx80 b
,
6912 int is_quiet
, float_status
*status
)
6916 if (floatx80_invalid_encoding(a
) || floatx80_invalid_encoding(b
)) {
6917 float_raise(float_flag_invalid
, status
);
6918 return float_relation_unordered
;
6920 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6921 ( extractFloatx80Frac( a
)<<1 ) ) ||
6922 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6923 ( extractFloatx80Frac( b
)<<1 ) )) {
6925 floatx80_is_signaling_nan(a
, status
) ||
6926 floatx80_is_signaling_nan(b
, status
)) {
6927 float_raise(float_flag_invalid
, status
);
6929 return float_relation_unordered
;
6931 aSign
= extractFloatx80Sign( a
);
6932 bSign
= extractFloatx80Sign( b
);
6933 if ( aSign
!= bSign
) {
6935 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6936 ( ( a
.low
| b
.low
) == 0 ) ) {
6938 return float_relation_equal
;
6940 return 1 - (2 * aSign
);
6943 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6944 return float_relation_equal
;
6946 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6951 int floatx80_compare(floatx80 a
, floatx80 b
, float_status
*status
)
6953 return floatx80_compare_internal(a
, b
, 0, status
);
6956 int floatx80_compare_quiet(floatx80 a
, floatx80 b
, float_status
*status
)
6958 return floatx80_compare_internal(a
, b
, 1, status
);
6961 static inline int float128_compare_internal(float128 a
, float128 b
,
6962 int is_quiet
, float_status
*status
)
6966 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6967 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6968 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6969 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6971 float128_is_signaling_nan(a
, status
) ||
6972 float128_is_signaling_nan(b
, status
)) {
6973 float_raise(float_flag_invalid
, status
);
6975 return float_relation_unordered
;
6977 aSign
= extractFloat128Sign( a
);
6978 bSign
= extractFloat128Sign( b
);
6979 if ( aSign
!= bSign
) {
6980 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6982 return float_relation_equal
;
6984 return 1 - (2 * aSign
);
6987 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6988 return float_relation_equal
;
6990 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6995 int float128_compare(float128 a
, float128 b
, float_status
*status
)
6997 return float128_compare_internal(a
, b
, 0, status
);
7000 int float128_compare_quiet(float128 a
, float128 b
, float_status
*status
)
7002 return float128_compare_internal(a
, b
, 1, status
);
7005 floatx80
floatx80_scalbn(floatx80 a
, int n
, float_status
*status
)
7011 if (floatx80_invalid_encoding(a
)) {
7012 float_raise(float_flag_invalid
, status
);
7013 return floatx80_default_nan(status
);
7015 aSig
= extractFloatx80Frac( a
);
7016 aExp
= extractFloatx80Exp( a
);
7017 aSign
= extractFloatx80Sign( a
);
7019 if ( aExp
== 0x7FFF ) {
7021 return propagateFloatx80NaN(a
, a
, status
);
7035 } else if (n
< -0x10000) {
7040 return normalizeRoundAndPackFloatx80(status
->floatx80_rounding_precision
,
7041 aSign
, aExp
, aSig
, 0, status
);
7044 float128
float128_scalbn(float128 a
, int n
, float_status
*status
)
7048 uint64_t aSig0
, aSig1
;
7050 aSig1
= extractFloat128Frac1( a
);
7051 aSig0
= extractFloat128Frac0( a
);
7052 aExp
= extractFloat128Exp( a
);
7053 aSign
= extractFloat128Sign( a
);
7054 if ( aExp
== 0x7FFF ) {
7055 if ( aSig0
| aSig1
) {
7056 return propagateFloat128NaN(a
, a
, status
);
7061 aSig0
|= LIT64( 0x0001000000000000 );
7062 } else if (aSig0
== 0 && aSig1
== 0) {
7070 } else if (n
< -0x10000) {
7075 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1