]> git.ipfire.org Git - thirdparty/qemu.git/blob - fpu/softfloat.c
19f40d69322030b4f60c01d65cd1c04da819dcd4
[thirdparty/qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88
89 /* We only need stdlib for abort() */
90
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations. (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
101
102 static inline uint32_t extractFloat16Frac(float16 a)
103 {
104 return float16_val(a) & 0x3ff;
105 }
106
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110
111 static inline int extractFloat16Exp(float16 a)
112 {
113 return (float16_val(a) >> 10) & 0x1f;
114 }
115
116 /*----------------------------------------------------------------------------
117 | Returns the sign bit of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119
120 static inline flag extractFloat16Sign(float16 a)
121 {
122 return float16_val(a)>>15;
123 }
124
125 /*----------------------------------------------------------------------------
126 | Returns the fraction bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128
129 static inline uint32_t extractFloat32Frac(float32 a)
130 {
131 return float32_val(a) & 0x007FFFFF;
132 }
133
134 /*----------------------------------------------------------------------------
135 | Returns the exponent bits of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137
138 static inline int extractFloat32Exp(float32 a)
139 {
140 return (float32_val(a) >> 23) & 0xFF;
141 }
142
143 /*----------------------------------------------------------------------------
144 | Returns the sign bit of the single-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146
147 static inline flag extractFloat32Sign(float32 a)
148 {
149 return float32_val(a) >> 31;
150 }
151
152 /*----------------------------------------------------------------------------
153 | Returns the fraction bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155
156 static inline uint64_t extractFloat64Frac(float64 a)
157 {
158 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
159 }
160
161 /*----------------------------------------------------------------------------
162 | Returns the exponent bits of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164
165 static inline int extractFloat64Exp(float64 a)
166 {
167 return (float64_val(a) >> 52) & 0x7FF;
168 }
169
170 /*----------------------------------------------------------------------------
171 | Returns the sign bit of the double-precision floating-point value `a'.
172 *----------------------------------------------------------------------------*/
173
174 static inline flag extractFloat64Sign(float64 a)
175 {
176 return float64_val(a) >> 63;
177 }
178
179 /*
180 * Classify a floating point number. Everything above float_class_qnan
181 * is a NaN so cls >= float_class_qnan is any NaN.
182 */
183
184 typedef enum __attribute__ ((__packed__)) {
185 float_class_unclassified,
186 float_class_zero,
187 float_class_normal,
188 float_class_inf,
189 float_class_qnan, /* all NaNs from here */
190 float_class_snan,
191 float_class_dnan,
192 float_class_msnan, /* maybe silenced */
193 } FloatClass;
194
195 /*
196 * Structure holding all of the decomposed parts of a float. The
197 * exponent is unbiased and the fraction is normalized. All
198 * calculations are done with a 64 bit fraction and then rounded as
199 * appropriate for the final format.
200 *
201 * Thanks to the packed FloatClass a decent compiler should be able to
202 * fit the whole structure into registers and avoid using the stack
203 * for parameter passing.
204 */
205
206 typedef struct {
207 uint64_t frac;
208 int32_t exp;
209 FloatClass cls;
210 bool sign;
211 } FloatParts;
212
213 #define DECOMPOSED_BINARY_POINT (64 - 2)
214 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
215 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
216
217 /* Structure holding all of the relevant parameters for a format.
218 * exp_size: the size of the exponent field
219 * exp_bias: the offset applied to the exponent field
220 * exp_max: the maximum normalised exponent
221 * frac_size: the size of the fraction field
222 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
223 * The following are computed based the size of fraction
224 * frac_lsb: least significant bit of fraction
225 * fram_lsbm1: the bit bellow the least significant bit (for rounding)
226 * round_mask/roundeven_mask: masks used for rounding
227 */
228 typedef struct {
229 int exp_size;
230 int exp_bias;
231 int exp_max;
232 int frac_size;
233 int frac_shift;
234 uint64_t frac_lsb;
235 uint64_t frac_lsbm1;
236 uint64_t round_mask;
237 uint64_t roundeven_mask;
238 } FloatFmt;
239
240 /* Expand fields based on the size of exponent and fraction */
241 #define FLOAT_PARAMS(E, F) \
242 .exp_size = E, \
243 .exp_bias = ((1 << E) - 1) >> 1, \
244 .exp_max = (1 << E) - 1, \
245 .frac_size = F, \
246 .frac_shift = DECOMPOSED_BINARY_POINT - F, \
247 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \
248 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \
249 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \
250 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
251
252 static const FloatFmt float16_params = {
253 FLOAT_PARAMS(5, 10)
254 };
255
256 static const FloatFmt float32_params = {
257 FLOAT_PARAMS(8, 23)
258 };
259
260 static const FloatFmt float64_params = {
261 FLOAT_PARAMS(11, 52)
262 };
263
264 /* Unpack a float to parts, but do not canonicalize. */
265 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
266 {
267 const int sign_pos = fmt.frac_size + fmt.exp_size;
268
269 return (FloatParts) {
270 .cls = float_class_unclassified,
271 .sign = extract64(raw, sign_pos, 1),
272 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
273 .frac = extract64(raw, 0, fmt.frac_size),
274 };
275 }
276
277 static inline FloatParts float16_unpack_raw(float16 f)
278 {
279 return unpack_raw(float16_params, f);
280 }
281
282 static inline FloatParts float32_unpack_raw(float32 f)
283 {
284 return unpack_raw(float32_params, f);
285 }
286
287 static inline FloatParts float64_unpack_raw(float64 f)
288 {
289 return unpack_raw(float64_params, f);
290 }
291
292 /* Pack a float from parts, but do not canonicalize. */
293 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
294 {
295 const int sign_pos = fmt.frac_size + fmt.exp_size;
296 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
297 return deposit64(ret, sign_pos, 1, p.sign);
298 }
299
300 static inline float16 float16_pack_raw(FloatParts p)
301 {
302 return make_float16(pack_raw(float16_params, p));
303 }
304
305 static inline float32 float32_pack_raw(FloatParts p)
306 {
307 return make_float32(pack_raw(float32_params, p));
308 }
309
310 static inline float64 float64_pack_raw(FloatParts p)
311 {
312 return make_float64(pack_raw(float64_params, p));
313 }
314
315 /*----------------------------------------------------------------------------
316 | Functions and definitions to determine: (1) whether tininess for underflow
317 | is detected before or after rounding by default, (2) what (if anything)
318 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
319 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
320 | are propagated from function inputs to output. These details are target-
321 | specific.
322 *----------------------------------------------------------------------------*/
323 #include "softfloat-specialize.h"
324
325 /* Canonicalize EXP and FRAC, setting CLS. */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327 float_status *status)
328 {
329 if (part.exp == parm->exp_max) {
330 if (part.frac == 0) {
331 part.cls = float_class_inf;
332 } else {
333 part.frac <<= parm->frac_shift;
334 part.cls = (parts_is_snan_frac(part.frac, status)
335 ? float_class_snan : float_class_qnan);
336 }
337 } else if (part.exp == 0) {
338 if (likely(part.frac == 0)) {
339 part.cls = float_class_zero;
340 } else if (status->flush_inputs_to_zero) {
341 float_raise(float_flag_input_denormal, status);
342 part.cls = float_class_zero;
343 part.frac = 0;
344 } else {
345 int shift = clz64(part.frac) - 1;
346 part.cls = float_class_normal;
347 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
348 part.frac <<= shift;
349 }
350 } else {
351 part.cls = float_class_normal;
352 part.exp -= parm->exp_bias;
353 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
354 }
355 return part;
356 }
357
358 /* Round and uncanonicalize a floating-point number by parts. There
359 * are FRAC_SHIFT bits that may require rounding at the bottom of the
360 * fraction; these bits will be removed. The exponent will be biased
361 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
362 */
363
364 static FloatParts round_canonical(FloatParts p, float_status *s,
365 const FloatFmt *parm)
366 {
367 const uint64_t frac_lsbm1 = parm->frac_lsbm1;
368 const uint64_t round_mask = parm->round_mask;
369 const uint64_t roundeven_mask = parm->roundeven_mask;
370 const int exp_max = parm->exp_max;
371 const int frac_shift = parm->frac_shift;
372 uint64_t frac, inc;
373 int exp, flags = 0;
374 bool overflow_norm;
375
376 frac = p.frac;
377 exp = p.exp;
378
379 switch (p.cls) {
380 case float_class_normal:
381 switch (s->float_rounding_mode) {
382 case float_round_nearest_even:
383 overflow_norm = false;
384 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
385 break;
386 case float_round_ties_away:
387 overflow_norm = false;
388 inc = frac_lsbm1;
389 break;
390 case float_round_to_zero:
391 overflow_norm = true;
392 inc = 0;
393 break;
394 case float_round_up:
395 inc = p.sign ? 0 : round_mask;
396 overflow_norm = p.sign;
397 break;
398 case float_round_down:
399 inc = p.sign ? round_mask : 0;
400 overflow_norm = !p.sign;
401 break;
402 default:
403 g_assert_not_reached();
404 }
405
406 exp += parm->exp_bias;
407 if (likely(exp > 0)) {
408 if (frac & round_mask) {
409 flags |= float_flag_inexact;
410 frac += inc;
411 if (frac & DECOMPOSED_OVERFLOW_BIT) {
412 frac >>= 1;
413 exp++;
414 }
415 }
416 frac >>= frac_shift;
417
418 if (unlikely(exp >= exp_max)) {
419 flags |= float_flag_overflow | float_flag_inexact;
420 if (overflow_norm) {
421 exp = exp_max - 1;
422 frac = -1;
423 } else {
424 p.cls = float_class_inf;
425 goto do_inf;
426 }
427 }
428 } else if (s->flush_to_zero) {
429 flags |= float_flag_output_denormal;
430 p.cls = float_class_zero;
431 goto do_zero;
432 } else {
433 bool is_tiny = (s->float_detect_tininess
434 == float_tininess_before_rounding)
435 || (exp < 0)
436 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
437
438 shift64RightJamming(frac, 1 - exp, &frac);
439 if (frac & round_mask) {
440 /* Need to recompute round-to-even. */
441 if (s->float_rounding_mode == float_round_nearest_even) {
442 inc = ((frac & roundeven_mask) != frac_lsbm1
443 ? frac_lsbm1 : 0);
444 }
445 flags |= float_flag_inexact;
446 frac += inc;
447 }
448
449 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
450 frac >>= frac_shift;
451
452 if (is_tiny && (flags & float_flag_inexact)) {
453 flags |= float_flag_underflow;
454 }
455 if (exp == 0 && frac == 0) {
456 p.cls = float_class_zero;
457 }
458 }
459 break;
460
461 case float_class_zero:
462 do_zero:
463 exp = 0;
464 frac = 0;
465 break;
466
467 case float_class_inf:
468 do_inf:
469 exp = exp_max;
470 frac = 0;
471 break;
472
473 case float_class_qnan:
474 case float_class_snan:
475 exp = exp_max;
476 frac >>= parm->frac_shift;
477 break;
478
479 default:
480 g_assert_not_reached();
481 }
482
483 float_raise(flags, s);
484 p.exp = exp;
485 p.frac = frac;
486 return p;
487 }
488
489 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
490 {
491 return canonicalize(float16_unpack_raw(f), &float16_params, s);
492 }
493
494 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
495 {
496 switch (p.cls) {
497 case float_class_dnan:
498 return float16_default_nan(s);
499 case float_class_msnan:
500 p.frac >>= float16_params.frac_shift;
501 return float16_maybe_silence_nan(float16_pack_raw(p), s);
502 default:
503 p = round_canonical(p, s, &float16_params);
504 return float16_pack_raw(p);
505 }
506 }
507
508 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
509 {
510 return canonicalize(float32_unpack_raw(f), &float32_params, s);
511 }
512
513 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
514 {
515 switch (p.cls) {
516 case float_class_dnan:
517 return float32_default_nan(s);
518 case float_class_msnan:
519 p.frac >>= float32_params.frac_shift;
520 return float32_maybe_silence_nan(float32_pack_raw(p), s);
521 default:
522 p = round_canonical(p, s, &float32_params);
523 return float32_pack_raw(p);
524 }
525 }
526
527 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
528 {
529 return canonicalize(float64_unpack_raw(f), &float64_params, s);
530 }
531
532 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
533 {
534 switch (p.cls) {
535 case float_class_dnan:
536 return float64_default_nan(s);
537 case float_class_msnan:
538 p.frac >>= float64_params.frac_shift;
539 return float64_maybe_silence_nan(float64_pack_raw(p), s);
540 default:
541 p = round_canonical(p, s, &float64_params);
542 return float64_pack_raw(p);
543 }
544 }
545
546 /* Simple helpers for checking if what NaN we have */
547 static bool is_nan(FloatClass c)
548 {
549 return unlikely(c >= float_class_qnan);
550 }
551 static bool is_snan(FloatClass c)
552 {
553 return c == float_class_snan;
554 }
555 static bool is_qnan(FloatClass c)
556 {
557 return c == float_class_qnan;
558 }
559
560 static FloatParts return_nan(FloatParts a, float_status *s)
561 {
562 switch (a.cls) {
563 case float_class_snan:
564 s->float_exception_flags |= float_flag_invalid;
565 a.cls = float_class_msnan;
566 /* fall through */
567 case float_class_qnan:
568 if (s->default_nan_mode) {
569 a.cls = float_class_dnan;
570 }
571 break;
572
573 default:
574 g_assert_not_reached();
575 }
576 return a;
577 }
578
579 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
580 {
581 if (is_snan(a.cls) || is_snan(b.cls)) {
582 s->float_exception_flags |= float_flag_invalid;
583 }
584
585 if (s->default_nan_mode) {
586 a.cls = float_class_dnan;
587 } else {
588 if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
589 is_qnan(b.cls), is_snan(b.cls),
590 a.frac > b.frac ||
591 (a.frac == b.frac && a.sign < b.sign))) {
592 a = b;
593 }
594 a.cls = float_class_msnan;
595 }
596 return a;
597 }
598
599 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
600 bool inf_zero, float_status *s)
601 {
602 int which;
603
604 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
605 s->float_exception_flags |= float_flag_invalid;
606 }
607
608 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
609 is_qnan(b.cls), is_snan(b.cls),
610 is_qnan(c.cls), is_snan(c.cls),
611 inf_zero, s);
612
613 if (s->default_nan_mode) {
614 /* Note that this check is after pickNaNMulAdd so that function
615 * has an opportunity to set the Invalid flag.
616 */
617 a.cls = float_class_dnan;
618 return a;
619 }
620
621 switch (which) {
622 case 0:
623 break;
624 case 1:
625 a = b;
626 break;
627 case 2:
628 a = c;
629 break;
630 case 3:
631 a.cls = float_class_dnan;
632 return a;
633 default:
634 g_assert_not_reached();
635 }
636 a.cls = float_class_msnan;
637
638 return a;
639 }
640
641 /*
642 * Returns the result of adding or subtracting the values of the
643 * floating-point values `a' and `b'. The operation is performed
644 * according to the IEC/IEEE Standard for Binary Floating-Point
645 * Arithmetic.
646 */
647
648 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
649 float_status *s)
650 {
651 bool a_sign = a.sign;
652 bool b_sign = b.sign ^ subtract;
653
654 if (a_sign != b_sign) {
655 /* Subtraction */
656
657 if (a.cls == float_class_normal && b.cls == float_class_normal) {
658 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
659 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
660 a.frac = a.frac - b.frac;
661 } else {
662 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
663 a.frac = b.frac - a.frac;
664 a.exp = b.exp;
665 a_sign ^= 1;
666 }
667
668 if (a.frac == 0) {
669 a.cls = float_class_zero;
670 a.sign = s->float_rounding_mode == float_round_down;
671 } else {
672 int shift = clz64(a.frac) - 1;
673 a.frac = a.frac << shift;
674 a.exp = a.exp - shift;
675 a.sign = a_sign;
676 }
677 return a;
678 }
679 if (is_nan(a.cls) || is_nan(b.cls)) {
680 return pick_nan(a, b, s);
681 }
682 if (a.cls == float_class_inf) {
683 if (b.cls == float_class_inf) {
684 float_raise(float_flag_invalid, s);
685 a.cls = float_class_dnan;
686 }
687 return a;
688 }
689 if (a.cls == float_class_zero && b.cls == float_class_zero) {
690 a.sign = s->float_rounding_mode == float_round_down;
691 return a;
692 }
693 if (a.cls == float_class_zero || b.cls == float_class_inf) {
694 b.sign = a_sign ^ 1;
695 return b;
696 }
697 if (b.cls == float_class_zero) {
698 return a;
699 }
700 } else {
701 /* Addition */
702 if (a.cls == float_class_normal && b.cls == float_class_normal) {
703 if (a.exp > b.exp) {
704 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
705 } else if (a.exp < b.exp) {
706 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
707 a.exp = b.exp;
708 }
709 a.frac += b.frac;
710 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
711 a.frac >>= 1;
712 a.exp += 1;
713 }
714 return a;
715 }
716 if (is_nan(a.cls) || is_nan(b.cls)) {
717 return pick_nan(a, b, s);
718 }
719 if (a.cls == float_class_inf || b.cls == float_class_zero) {
720 return a;
721 }
722 if (b.cls == float_class_inf || a.cls == float_class_zero) {
723 b.sign = b_sign;
724 return b;
725 }
726 }
727 g_assert_not_reached();
728 }
729
730 /*
731 * Returns the result of adding or subtracting the floating-point
732 * values `a' and `b'. The operation is performed according to the
733 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
734 */
735
736 float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
737 float_status *status)
738 {
739 FloatParts pa = float16_unpack_canonical(a, status);
740 FloatParts pb = float16_unpack_canonical(b, status);
741 FloatParts pr = addsub_floats(pa, pb, false, status);
742
743 return float16_round_pack_canonical(pr, status);
744 }
745
746 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
747 float_status *status)
748 {
749 FloatParts pa = float32_unpack_canonical(a, status);
750 FloatParts pb = float32_unpack_canonical(b, status);
751 FloatParts pr = addsub_floats(pa, pb, false, status);
752
753 return float32_round_pack_canonical(pr, status);
754 }
755
756 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
757 float_status *status)
758 {
759 FloatParts pa = float64_unpack_canonical(a, status);
760 FloatParts pb = float64_unpack_canonical(b, status);
761 FloatParts pr = addsub_floats(pa, pb, false, status);
762
763 return float64_round_pack_canonical(pr, status);
764 }
765
766 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
767 float_status *status)
768 {
769 FloatParts pa = float16_unpack_canonical(a, status);
770 FloatParts pb = float16_unpack_canonical(b, status);
771 FloatParts pr = addsub_floats(pa, pb, true, status);
772
773 return float16_round_pack_canonical(pr, status);
774 }
775
776 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
777 float_status *status)
778 {
779 FloatParts pa = float32_unpack_canonical(a, status);
780 FloatParts pb = float32_unpack_canonical(b, status);
781 FloatParts pr = addsub_floats(pa, pb, true, status);
782
783 return float32_round_pack_canonical(pr, status);
784 }
785
786 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
787 float_status *status)
788 {
789 FloatParts pa = float64_unpack_canonical(a, status);
790 FloatParts pb = float64_unpack_canonical(b, status);
791 FloatParts pr = addsub_floats(pa, pb, true, status);
792
793 return float64_round_pack_canonical(pr, status);
794 }
795
796 /*
797 * Returns the result of multiplying the floating-point values `a' and
798 * `b'. The operation is performed according to the IEC/IEEE Standard
799 * for Binary Floating-Point Arithmetic.
800 */
801
802 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
803 {
804 bool sign = a.sign ^ b.sign;
805
806 if (a.cls == float_class_normal && b.cls == float_class_normal) {
807 uint64_t hi, lo;
808 int exp = a.exp + b.exp;
809
810 mul64To128(a.frac, b.frac, &hi, &lo);
811 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
812 if (lo & DECOMPOSED_OVERFLOW_BIT) {
813 shift64RightJamming(lo, 1, &lo);
814 exp += 1;
815 }
816
817 /* Re-use a */
818 a.exp = exp;
819 a.sign = sign;
820 a.frac = lo;
821 return a;
822 }
823 /* handle all the NaN cases */
824 if (is_nan(a.cls) || is_nan(b.cls)) {
825 return pick_nan(a, b, s);
826 }
827 /* Inf * Zero == NaN */
828 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
829 (a.cls == float_class_zero && b.cls == float_class_inf)) {
830 s->float_exception_flags |= float_flag_invalid;
831 a.cls = float_class_dnan;
832 a.sign = sign;
833 return a;
834 }
835 /* Multiply by 0 or Inf */
836 if (a.cls == float_class_inf || a.cls == float_class_zero) {
837 a.sign = sign;
838 return a;
839 }
840 if (b.cls == float_class_inf || b.cls == float_class_zero) {
841 b.sign = sign;
842 return b;
843 }
844 g_assert_not_reached();
845 }
846
847 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
848 float_status *status)
849 {
850 FloatParts pa = float16_unpack_canonical(a, status);
851 FloatParts pb = float16_unpack_canonical(b, status);
852 FloatParts pr = mul_floats(pa, pb, status);
853
854 return float16_round_pack_canonical(pr, status);
855 }
856
857 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
858 float_status *status)
859 {
860 FloatParts pa = float32_unpack_canonical(a, status);
861 FloatParts pb = float32_unpack_canonical(b, status);
862 FloatParts pr = mul_floats(pa, pb, status);
863
864 return float32_round_pack_canonical(pr, status);
865 }
866
867 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
868 float_status *status)
869 {
870 FloatParts pa = float64_unpack_canonical(a, status);
871 FloatParts pb = float64_unpack_canonical(b, status);
872 FloatParts pr = mul_floats(pa, pb, status);
873
874 return float64_round_pack_canonical(pr, status);
875 }
876
877 /*
878 * Returns the result of multiplying the floating-point values `a' and
879 * `b' then adding 'c', with no intermediate rounding step after the
880 * multiplication. The operation is performed according to the
881 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
882 * The flags argument allows the caller to select negation of the
883 * addend, the intermediate product, or the final result. (The
884 * difference between this and having the caller do a separate
885 * negation is that negating externally will flip the sign bit on
886 * NaNs.)
887 */
888
889 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
890 int flags, float_status *s)
891 {
892 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
893 ((1 << float_class_inf) | (1 << float_class_zero));
894 bool p_sign;
895 bool sign_flip = flags & float_muladd_negate_result;
896 FloatClass p_class;
897 uint64_t hi, lo;
898 int p_exp;
899
900 /* It is implementation-defined whether the cases of (0,inf,qnan)
901 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
902 * they return if they do), so we have to hand this information
903 * off to the target-specific pick-a-NaN routine.
904 */
905 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
906 return pick_nan_muladd(a, b, c, inf_zero, s);
907 }
908
909 if (inf_zero) {
910 s->float_exception_flags |= float_flag_invalid;
911 a.cls = float_class_dnan;
912 return a;
913 }
914
915 if (flags & float_muladd_negate_c) {
916 c.sign ^= 1;
917 }
918
919 p_sign = a.sign ^ b.sign;
920
921 if (flags & float_muladd_negate_product) {
922 p_sign ^= 1;
923 }
924
925 if (a.cls == float_class_inf || b.cls == float_class_inf) {
926 p_class = float_class_inf;
927 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
928 p_class = float_class_zero;
929 } else {
930 p_class = float_class_normal;
931 }
932
933 if (c.cls == float_class_inf) {
934 if (p_class == float_class_inf && p_sign != c.sign) {
935 s->float_exception_flags |= float_flag_invalid;
936 a.cls = float_class_dnan;
937 } else {
938 a.cls = float_class_inf;
939 a.sign = c.sign ^ sign_flip;
940 }
941 return a;
942 }
943
944 if (p_class == float_class_inf) {
945 a.cls = float_class_inf;
946 a.sign = p_sign ^ sign_flip;
947 return a;
948 }
949
950 if (p_class == float_class_zero) {
951 if (c.cls == float_class_zero) {
952 if (p_sign != c.sign) {
953 p_sign = s->float_rounding_mode == float_round_down;
954 }
955 c.sign = p_sign;
956 } else if (flags & float_muladd_halve_result) {
957 c.exp -= 1;
958 }
959 c.sign ^= sign_flip;
960 return c;
961 }
962
963 /* a & b should be normals now... */
964 assert(a.cls == float_class_normal &&
965 b.cls == float_class_normal);
966
967 p_exp = a.exp + b.exp;
968
969 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
970 * result.
971 */
972 mul64To128(a.frac, b.frac, &hi, &lo);
973 /* binary point now at bit 124 */
974
975 /* check for overflow */
976 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
977 shift128RightJamming(hi, lo, 1, &hi, &lo);
978 p_exp += 1;
979 }
980
981 /* + add/sub */
982 if (c.cls == float_class_zero) {
983 /* move binary point back to 62 */
984 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
985 } else {
986 int exp_diff = p_exp - c.exp;
987 if (p_sign == c.sign) {
988 /* Addition */
989 if (exp_diff <= 0) {
990 shift128RightJamming(hi, lo,
991 DECOMPOSED_BINARY_POINT - exp_diff,
992 &hi, &lo);
993 lo += c.frac;
994 p_exp = c.exp;
995 } else {
996 uint64_t c_hi, c_lo;
997 /* shift c to the same binary point as the product (124) */
998 c_hi = c.frac >> 2;
999 c_lo = 0;
1000 shift128RightJamming(c_hi, c_lo,
1001 exp_diff,
1002 &c_hi, &c_lo);
1003 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1004 /* move binary point back to 62 */
1005 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1006 }
1007
1008 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1009 shift64RightJamming(lo, 1, &lo);
1010 p_exp += 1;
1011 }
1012
1013 } else {
1014 /* Subtraction */
1015 uint64_t c_hi, c_lo;
1016 /* make C binary point match product at bit 124 */
1017 c_hi = c.frac >> 2;
1018 c_lo = 0;
1019
1020 if (exp_diff <= 0) {
1021 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1022 if (exp_diff == 0
1023 &&
1024 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026 } else {
1027 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1028 p_sign ^= 1;
1029 p_exp = c.exp;
1030 }
1031 } else {
1032 shift128RightJamming(c_hi, c_lo,
1033 exp_diff,
1034 &c_hi, &c_lo);
1035 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1036 }
1037
1038 if (hi == 0 && lo == 0) {
1039 a.cls = float_class_zero;
1040 a.sign = s->float_rounding_mode == float_round_down;
1041 a.sign ^= sign_flip;
1042 return a;
1043 } else {
1044 int shift;
1045 if (hi != 0) {
1046 shift = clz64(hi);
1047 } else {
1048 shift = clz64(lo) + 64;
1049 }
1050 /* Normalizing to a binary point of 124 is the
1051 correct adjust for the exponent. However since we're
1052 shifting, we might as well put the binary point back
1053 at 62 where we really want it. Therefore shift as
1054 if we're leaving 1 bit at the top of the word, but
1055 adjust the exponent as if we're leaving 3 bits. */
1056 shift -= 1;
1057 if (shift >= 64) {
1058 lo = lo << (shift - 64);
1059 } else {
1060 hi = (hi << shift) | (lo >> (64 - shift));
1061 lo = hi | ((lo << shift) != 0);
1062 }
1063 p_exp -= shift - 2;
1064 }
1065 }
1066 }
1067
1068 if (flags & float_muladd_halve_result) {
1069 p_exp -= 1;
1070 }
1071
1072 /* finally prepare our result */
1073 a.cls = float_class_normal;
1074 a.sign = p_sign ^ sign_flip;
1075 a.exp = p_exp;
1076 a.frac = lo;
1077
1078 return a;
1079 }
1080
1081 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1082 int flags, float_status *status)
1083 {
1084 FloatParts pa = float16_unpack_canonical(a, status);
1085 FloatParts pb = float16_unpack_canonical(b, status);
1086 FloatParts pc = float16_unpack_canonical(c, status);
1087 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1088
1089 return float16_round_pack_canonical(pr, status);
1090 }
1091
1092 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1093 int flags, float_status *status)
1094 {
1095 FloatParts pa = float32_unpack_canonical(a, status);
1096 FloatParts pb = float32_unpack_canonical(b, status);
1097 FloatParts pc = float32_unpack_canonical(c, status);
1098 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1099
1100 return float32_round_pack_canonical(pr, status);
1101 }
1102
1103 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1104 int flags, float_status *status)
1105 {
1106 FloatParts pa = float64_unpack_canonical(a, status);
1107 FloatParts pb = float64_unpack_canonical(b, status);
1108 FloatParts pc = float64_unpack_canonical(c, status);
1109 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1110
1111 return float64_round_pack_canonical(pr, status);
1112 }
1113
1114 /*
1115 * Returns the result of dividing the floating-point value `a' by the
1116 * corresponding value `b'. The operation is performed according to
1117 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 */
1119
1120 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1121 {
1122 bool sign = a.sign ^ b.sign;
1123
1124 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1125 uint64_t temp_lo, temp_hi;
1126 int exp = a.exp - b.exp;
1127 if (a.frac < b.frac) {
1128 exp -= 1;
1129 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1130 &temp_hi, &temp_lo);
1131 } else {
1132 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1133 &temp_hi, &temp_lo);
1134 }
1135 /* LSB of quot is set if inexact which roundandpack will use
1136 * to set flags. Yet again we re-use a for the result */
1137 a.frac = div128To64(temp_lo, temp_hi, b.frac);
1138 a.sign = sign;
1139 a.exp = exp;
1140 return a;
1141 }
1142 /* handle all the NaN cases */
1143 if (is_nan(a.cls) || is_nan(b.cls)) {
1144 return pick_nan(a, b, s);
1145 }
1146 /* 0/0 or Inf/Inf */
1147 if (a.cls == b.cls
1148 &&
1149 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1150 s->float_exception_flags |= float_flag_invalid;
1151 a.cls = float_class_dnan;
1152 return a;
1153 }
1154 /* Inf / x or 0 / x */
1155 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1156 a.sign = sign;
1157 return a;
1158 }
1159 /* Div 0 => Inf */
1160 if (b.cls == float_class_zero) {
1161 s->float_exception_flags |= float_flag_divbyzero;
1162 a.cls = float_class_inf;
1163 a.sign = sign;
1164 return a;
1165 }
1166 /* Div by Inf */
1167 if (b.cls == float_class_inf) {
1168 a.cls = float_class_zero;
1169 a.sign = sign;
1170 return a;
1171 }
1172 g_assert_not_reached();
1173 }
1174
1175 float16 float16_div(float16 a, float16 b, float_status *status)
1176 {
1177 FloatParts pa = float16_unpack_canonical(a, status);
1178 FloatParts pb = float16_unpack_canonical(b, status);
1179 FloatParts pr = div_floats(pa, pb, status);
1180
1181 return float16_round_pack_canonical(pr, status);
1182 }
1183
1184 float32 float32_div(float32 a, float32 b, float_status *status)
1185 {
1186 FloatParts pa = float32_unpack_canonical(a, status);
1187 FloatParts pb = float32_unpack_canonical(b, status);
1188 FloatParts pr = div_floats(pa, pb, status);
1189
1190 return float32_round_pack_canonical(pr, status);
1191 }
1192
1193 float64 float64_div(float64 a, float64 b, float_status *status)
1194 {
1195 FloatParts pa = float64_unpack_canonical(a, status);
1196 FloatParts pb = float64_unpack_canonical(b, status);
1197 FloatParts pr = div_floats(pa, pb, status);
1198
1199 return float64_round_pack_canonical(pr, status);
1200 }
1201
1202 /*
1203 * Rounds the floating-point value `a' to an integer, and returns the
1204 * result as a floating-point value. The operation is performed
1205 * according to the IEC/IEEE Standard for Binary Floating-Point
1206 * Arithmetic.
1207 */
1208
1209 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1210 {
1211 if (is_nan(a.cls)) {
1212 return return_nan(a, s);
1213 }
1214
1215 switch (a.cls) {
1216 case float_class_zero:
1217 case float_class_inf:
1218 case float_class_qnan:
1219 /* already "integral" */
1220 break;
1221 case float_class_normal:
1222 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1223 /* already integral */
1224 break;
1225 }
1226 if (a.exp < 0) {
1227 bool one;
1228 /* all fractional */
1229 s->float_exception_flags |= float_flag_inexact;
1230 switch (rounding_mode) {
1231 case float_round_nearest_even:
1232 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1233 break;
1234 case float_round_ties_away:
1235 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1236 break;
1237 case float_round_to_zero:
1238 one = false;
1239 break;
1240 case float_round_up:
1241 one = !a.sign;
1242 break;
1243 case float_round_down:
1244 one = a.sign;
1245 break;
1246 default:
1247 g_assert_not_reached();
1248 }
1249
1250 if (one) {
1251 a.frac = DECOMPOSED_IMPLICIT_BIT;
1252 a.exp = 0;
1253 } else {
1254 a.cls = float_class_zero;
1255 }
1256 } else {
1257 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1258 uint64_t frac_lsbm1 = frac_lsb >> 1;
1259 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1260 uint64_t rnd_mask = rnd_even_mask >> 1;
1261 uint64_t inc;
1262
1263 switch (rounding_mode) {
1264 case float_round_nearest_even:
1265 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1266 break;
1267 case float_round_ties_away:
1268 inc = frac_lsbm1;
1269 break;
1270 case float_round_to_zero:
1271 inc = 0;
1272 break;
1273 case float_round_up:
1274 inc = a.sign ? 0 : rnd_mask;
1275 break;
1276 case float_round_down:
1277 inc = a.sign ? rnd_mask : 0;
1278 break;
1279 default:
1280 g_assert_not_reached();
1281 }
1282
1283 if (a.frac & rnd_mask) {
1284 s->float_exception_flags |= float_flag_inexact;
1285 a.frac += inc;
1286 a.frac &= ~rnd_mask;
1287 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1288 a.frac >>= 1;
1289 a.exp++;
1290 }
1291 }
1292 }
1293 break;
1294 default:
1295 g_assert_not_reached();
1296 }
1297 return a;
1298 }
1299
1300 float16 float16_round_to_int(float16 a, float_status *s)
1301 {
1302 FloatParts pa = float16_unpack_canonical(a, s);
1303 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1304 return float16_round_pack_canonical(pr, s);
1305 }
1306
1307 float32 float32_round_to_int(float32 a, float_status *s)
1308 {
1309 FloatParts pa = float32_unpack_canonical(a, s);
1310 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1311 return float32_round_pack_canonical(pr, s);
1312 }
1313
1314 float64 float64_round_to_int(float64 a, float_status *s)
1315 {
1316 FloatParts pa = float64_unpack_canonical(a, s);
1317 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1318 return float64_round_pack_canonical(pr, s);
1319 }
1320
1321 float64 float64_trunc_to_int(float64 a, float_status *s)
1322 {
1323 FloatParts pa = float64_unpack_canonical(a, s);
1324 FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1325 return float64_round_pack_canonical(pr, s);
1326 }
1327
1328 /*
1329 * Returns the result of converting the floating-point value `a' to
1330 * the two's complement integer format. The conversion is performed
1331 * according to the IEC/IEEE Standard for Binary Floating-Point
1332 * Arithmetic---which means in particular that the conversion is
1333 * rounded according to the current rounding mode. If `a' is a NaN,
1334 * the largest positive integer is returned. Otherwise, if the
1335 * conversion overflows, the largest integer with the same sign as `a'
1336 * is returned.
1337 */
1338
1339 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1340 int64_t min, int64_t max,
1341 float_status *s)
1342 {
1343 uint64_t r;
1344 int orig_flags = get_float_exception_flags(s);
1345 FloatParts p = round_to_int(in, rmode, s);
1346
1347 switch (p.cls) {
1348 case float_class_snan:
1349 case float_class_qnan:
1350 case float_class_dnan:
1351 case float_class_msnan:
1352 s->float_exception_flags = orig_flags | float_flag_invalid;
1353 return max;
1354 case float_class_inf:
1355 s->float_exception_flags = orig_flags | float_flag_invalid;
1356 return p.sign ? min : max;
1357 case float_class_zero:
1358 return 0;
1359 case float_class_normal:
1360 if (p.exp < DECOMPOSED_BINARY_POINT) {
1361 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1362 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1363 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1364 } else {
1365 r = UINT64_MAX;
1366 }
1367 if (p.sign) {
1368 if (r <= -(uint64_t) min) {
1369 return -r;
1370 } else {
1371 s->float_exception_flags = orig_flags | float_flag_invalid;
1372 return min;
1373 }
1374 } else {
1375 if (r <= max) {
1376 return r;
1377 } else {
1378 s->float_exception_flags = orig_flags | float_flag_invalid;
1379 return max;
1380 }
1381 }
1382 default:
1383 g_assert_not_reached();
1384 }
1385 }
1386
1387 #define FLOAT_TO_INT(fsz, isz) \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \
1389 float_status *s) \
1390 { \
1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1392 return round_to_int_and_pack(p, s->float_rounding_mode, \
1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394 s); \
1395 } \
1396 \
1397 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \
1398 (float ## fsz a, float_status *s) \
1399 { \
1400 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1401 return round_to_int_and_pack(p, float_round_to_zero, \
1402 INT ## isz ## _MIN, INT ## isz ## _MAX,\
1403 s); \
1404 }
1405
1406 FLOAT_TO_INT(16, 16)
1407 FLOAT_TO_INT(16, 32)
1408 FLOAT_TO_INT(16, 64)
1409
1410 FLOAT_TO_INT(32, 16)
1411 FLOAT_TO_INT(32, 32)
1412 FLOAT_TO_INT(32, 64)
1413
1414 FLOAT_TO_INT(64, 16)
1415 FLOAT_TO_INT(64, 32)
1416 FLOAT_TO_INT(64, 64)
1417
1418 #undef FLOAT_TO_INT
1419
1420 /*
1421 * Returns the result of converting the floating-point value `a' to
1422 * the unsigned integer format. The conversion is performed according
1423 * to the IEC/IEEE Standard for Binary Floating-Point
1424 * Arithmetic---which means in particular that the conversion is
1425 * rounded according to the current rounding mode. If `a' is a NaN,
1426 * the largest unsigned integer is returned. Otherwise, if the
1427 * conversion overflows, the largest unsigned integer is returned. If
1428 * the 'a' is negative, the result is rounded and zero is returned;
1429 * values that do not round to zero will raise the inexact exception
1430 * flag.
1431 */
1432
1433 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1434 float_status *s)
1435 {
1436 int orig_flags = get_float_exception_flags(s);
1437 FloatParts p = round_to_int(in, rmode, s);
1438
1439 switch (p.cls) {
1440 case float_class_snan:
1441 case float_class_qnan:
1442 case float_class_dnan:
1443 case float_class_msnan:
1444 s->float_exception_flags = orig_flags | float_flag_invalid;
1445 return max;
1446 case float_class_inf:
1447 s->float_exception_flags = orig_flags | float_flag_invalid;
1448 return p.sign ? 0 : max;
1449 case float_class_zero:
1450 return 0;
1451 case float_class_normal:
1452 {
1453 uint64_t r;
1454 if (p.sign) {
1455 s->float_exception_flags = orig_flags | float_flag_invalid;
1456 return 0;
1457 }
1458
1459 if (p.exp < DECOMPOSED_BINARY_POINT) {
1460 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1461 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1462 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1463 } else {
1464 s->float_exception_flags = orig_flags | float_flag_invalid;
1465 return max;
1466 }
1467
1468 /* For uint64 this will never trip, but if p.exp is too large
1469 * to shift a decomposed fraction we shall have exited via the
1470 * 3rd leg above.
1471 */
1472 if (r > max) {
1473 s->float_exception_flags = orig_flags | float_flag_invalid;
1474 return max;
1475 } else {
1476 return r;
1477 }
1478 }
1479 default:
1480 g_assert_not_reached();
1481 }
1482 }
1483
1484 #define FLOAT_TO_UINT(fsz, isz) \
1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \
1486 float_status *s) \
1487 { \
1488 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1489 return round_to_uint_and_pack(p, s->float_rounding_mode, \
1490 UINT ## isz ## _MAX, s); \
1491 } \
1492 \
1493 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \
1494 (float ## fsz a, float_status *s) \
1495 { \
1496 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \
1497 return round_to_uint_and_pack(p, float_round_to_zero, \
1498 UINT ## isz ## _MAX, s); \
1499 }
1500
1501 FLOAT_TO_UINT(16, 16)
1502 FLOAT_TO_UINT(16, 32)
1503 FLOAT_TO_UINT(16, 64)
1504
1505 FLOAT_TO_UINT(32, 16)
1506 FLOAT_TO_UINT(32, 32)
1507 FLOAT_TO_UINT(32, 64)
1508
1509 FLOAT_TO_UINT(64, 16)
1510 FLOAT_TO_UINT(64, 32)
1511 FLOAT_TO_UINT(64, 64)
1512
1513 #undef FLOAT_TO_UINT
1514
1515 /*
1516 * Integer to float conversions
1517 *
1518 * Returns the result of converting the two's complement integer `a'
1519 * to the floating-point format. The conversion is performed according
1520 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1521 */
1522
1523 static FloatParts int_to_float(int64_t a, float_status *status)
1524 {
1525 FloatParts r = {};
1526 if (a == 0) {
1527 r.cls = float_class_zero;
1528 r.sign = false;
1529 } else if (a == (1ULL << 63)) {
1530 r.cls = float_class_normal;
1531 r.sign = true;
1532 r.frac = DECOMPOSED_IMPLICIT_BIT;
1533 r.exp = 63;
1534 } else {
1535 uint64_t f;
1536 if (a < 0) {
1537 f = -a;
1538 r.sign = true;
1539 } else {
1540 f = a;
1541 r.sign = false;
1542 }
1543 int shift = clz64(f) - 1;
1544 r.cls = float_class_normal;
1545 r.exp = (DECOMPOSED_BINARY_POINT - shift);
1546 r.frac = f << shift;
1547 }
1548
1549 return r;
1550 }
1551
1552 float16 int64_to_float16(int64_t a, float_status *status)
1553 {
1554 FloatParts pa = int_to_float(a, status);
1555 return float16_round_pack_canonical(pa, status);
1556 }
1557
1558 float16 int32_to_float16(int32_t a, float_status *status)
1559 {
1560 return int64_to_float16(a, status);
1561 }
1562
1563 float16 int16_to_float16(int16_t a, float_status *status)
1564 {
1565 return int64_to_float16(a, status);
1566 }
1567
1568 float32 int64_to_float32(int64_t a, float_status *status)
1569 {
1570 FloatParts pa = int_to_float(a, status);
1571 return float32_round_pack_canonical(pa, status);
1572 }
1573
1574 float32 int32_to_float32(int32_t a, float_status *status)
1575 {
1576 return int64_to_float32(a, status);
1577 }
1578
1579 float32 int16_to_float32(int16_t a, float_status *status)
1580 {
1581 return int64_to_float32(a, status);
1582 }
1583
1584 float64 int64_to_float64(int64_t a, float_status *status)
1585 {
1586 FloatParts pa = int_to_float(a, status);
1587 return float64_round_pack_canonical(pa, status);
1588 }
1589
1590 float64 int32_to_float64(int32_t a, float_status *status)
1591 {
1592 return int64_to_float64(a, status);
1593 }
1594
1595 float64 int16_to_float64(int16_t a, float_status *status)
1596 {
1597 return int64_to_float64(a, status);
1598 }
1599
1600
1601 /*
1602 * Unsigned Integer to float conversions
1603 *
1604 * Returns the result of converting the unsigned integer `a' to the
1605 * floating-point format. The conversion is performed according to the
1606 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1607 */
1608
1609 static FloatParts uint_to_float(uint64_t a, float_status *status)
1610 {
1611 FloatParts r = { .sign = false};
1612
1613 if (a == 0) {
1614 r.cls = float_class_zero;
1615 } else {
1616 int spare_bits = clz64(a) - 1;
1617 r.cls = float_class_normal;
1618 r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1619 if (spare_bits < 0) {
1620 shift64RightJamming(a, -spare_bits, &a);
1621 r.frac = a;
1622 } else {
1623 r.frac = a << spare_bits;
1624 }
1625 }
1626
1627 return r;
1628 }
1629
1630 float16 uint64_to_float16(uint64_t a, float_status *status)
1631 {
1632 FloatParts pa = uint_to_float(a, status);
1633 return float16_round_pack_canonical(pa, status);
1634 }
1635
1636 float16 uint32_to_float16(uint32_t a, float_status *status)
1637 {
1638 return uint64_to_float16(a, status);
1639 }
1640
1641 float16 uint16_to_float16(uint16_t a, float_status *status)
1642 {
1643 return uint64_to_float16(a, status);
1644 }
1645
1646 float32 uint64_to_float32(uint64_t a, float_status *status)
1647 {
1648 FloatParts pa = uint_to_float(a, status);
1649 return float32_round_pack_canonical(pa, status);
1650 }
1651
1652 float32 uint32_to_float32(uint32_t a, float_status *status)
1653 {
1654 return uint64_to_float32(a, status);
1655 }
1656
1657 float32 uint16_to_float32(uint16_t a, float_status *status)
1658 {
1659 return uint64_to_float32(a, status);
1660 }
1661
1662 float64 uint64_to_float64(uint64_t a, float_status *status)
1663 {
1664 FloatParts pa = uint_to_float(a, status);
1665 return float64_round_pack_canonical(pa, status);
1666 }
1667
1668 float64 uint32_to_float64(uint32_t a, float_status *status)
1669 {
1670 return uint64_to_float64(a, status);
1671 }
1672
1673 float64 uint16_to_float64(uint16_t a, float_status *status)
1674 {
1675 return uint64_to_float64(a, status);
1676 }
1677
1678 /* Float Min/Max */
1679 /* min() and max() functions. These can't be implemented as
1680 * 'compare and pick one input' because that would mishandle
1681 * NaNs and +0 vs -0.
1682 *
1683 * minnum() and maxnum() functions. These are similar to the min()
1684 * and max() functions but if one of the arguments is a QNaN and
1685 * the other is numerical then the numerical argument is returned.
1686 * SNaNs will get quietened before being returned.
1687 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1688 * and maxNum() operations. min() and max() are the typical min/max
1689 * semantics provided by many CPUs which predate that specification.
1690 *
1691 * minnummag() and maxnummag() functions correspond to minNumMag()
1692 * and minNumMag() from the IEEE-754 2008.
1693 */
1694 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1695 bool ieee, bool ismag, float_status *s)
1696 {
1697 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1698 if (ieee) {
1699 /* Takes two floating-point values `a' and `b', one of
1700 * which is a NaN, and returns the appropriate NaN
1701 * result. If either `a' or `b' is a signaling NaN,
1702 * the invalid exception is raised.
1703 */
1704 if (is_snan(a.cls) || is_snan(b.cls)) {
1705 return pick_nan(a, b, s);
1706 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1707 return b;
1708 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1709 return a;
1710 }
1711 }
1712 return pick_nan(a, b, s);
1713 } else {
1714 int a_exp, b_exp;
1715
1716 switch (a.cls) {
1717 case float_class_normal:
1718 a_exp = a.exp;
1719 break;
1720 case float_class_inf:
1721 a_exp = INT_MAX;
1722 break;
1723 case float_class_zero:
1724 a_exp = INT_MIN;
1725 break;
1726 default:
1727 g_assert_not_reached();
1728 break;
1729 }
1730 switch (b.cls) {
1731 case float_class_normal:
1732 b_exp = b.exp;
1733 break;
1734 case float_class_inf:
1735 b_exp = INT_MAX;
1736 break;
1737 case float_class_zero:
1738 b_exp = INT_MIN;
1739 break;
1740 default:
1741 g_assert_not_reached();
1742 break;
1743 }
1744
1745 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1746 bool a_less = a_exp < b_exp;
1747 if (a_exp == b_exp) {
1748 a_less = a.frac < b.frac;
1749 }
1750 return a_less ^ ismin ? b : a;
1751 }
1752
1753 if (a.sign == b.sign) {
1754 bool a_less = a_exp < b_exp;
1755 if (a_exp == b_exp) {
1756 a_less = a.frac < b.frac;
1757 }
1758 return a.sign ^ a_less ^ ismin ? b : a;
1759 } else {
1760 return a.sign ^ ismin ? b : a;
1761 }
1762 }
1763 }
1764
1765 #define MINMAX(sz, name, ismin, isiee, ismag) \
1766 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
1767 float_status *s) \
1768 { \
1769 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1770 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1771 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
1772 \
1773 return float ## sz ## _round_pack_canonical(pr, s); \
1774 }
1775
1776 MINMAX(16, min, true, false, false)
1777 MINMAX(16, minnum, true, true, false)
1778 MINMAX(16, minnummag, true, true, true)
1779 MINMAX(16, max, false, false, false)
1780 MINMAX(16, maxnum, false, true, false)
1781 MINMAX(16, maxnummag, false, true, true)
1782
1783 MINMAX(32, min, true, false, false)
1784 MINMAX(32, minnum, true, true, false)
1785 MINMAX(32, minnummag, true, true, true)
1786 MINMAX(32, max, false, false, false)
1787 MINMAX(32, maxnum, false, true, false)
1788 MINMAX(32, maxnummag, false, true, true)
1789
1790 MINMAX(64, min, true, false, false)
1791 MINMAX(64, minnum, true, true, false)
1792 MINMAX(64, minnummag, true, true, true)
1793 MINMAX(64, max, false, false, false)
1794 MINMAX(64, maxnum, false, true, false)
1795 MINMAX(64, maxnummag, false, true, true)
1796
1797 #undef MINMAX
1798
1799 /* Floating point compare */
1800 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1801 float_status *s)
1802 {
1803 if (is_nan(a.cls) || is_nan(b.cls)) {
1804 if (!is_quiet ||
1805 a.cls == float_class_snan ||
1806 b.cls == float_class_snan) {
1807 s->float_exception_flags |= float_flag_invalid;
1808 }
1809 return float_relation_unordered;
1810 }
1811
1812 if (a.cls == float_class_zero) {
1813 if (b.cls == float_class_zero) {
1814 return float_relation_equal;
1815 }
1816 return b.sign ? float_relation_greater : float_relation_less;
1817 } else if (b.cls == float_class_zero) {
1818 return a.sign ? float_relation_less : float_relation_greater;
1819 }
1820
1821 /* The only really important thing about infinity is its sign. If
1822 * both are infinities the sign marks the smallest of the two.
1823 */
1824 if (a.cls == float_class_inf) {
1825 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1826 return float_relation_equal;
1827 }
1828 return a.sign ? float_relation_less : float_relation_greater;
1829 } else if (b.cls == float_class_inf) {
1830 return b.sign ? float_relation_greater : float_relation_less;
1831 }
1832
1833 if (a.sign != b.sign) {
1834 return a.sign ? float_relation_less : float_relation_greater;
1835 }
1836
1837 if (a.exp == b.exp) {
1838 if (a.frac == b.frac) {
1839 return float_relation_equal;
1840 }
1841 if (a.sign) {
1842 return a.frac > b.frac ?
1843 float_relation_less : float_relation_greater;
1844 } else {
1845 return a.frac > b.frac ?
1846 float_relation_greater : float_relation_less;
1847 }
1848 } else {
1849 if (a.sign) {
1850 return a.exp > b.exp ? float_relation_less : float_relation_greater;
1851 } else {
1852 return a.exp > b.exp ? float_relation_greater : float_relation_less;
1853 }
1854 }
1855 }
1856
1857 #define COMPARE(sz) \
1858 int float ## sz ## _compare(float ## sz a, float ## sz b, \
1859 float_status *s) \
1860 { \
1861 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1862 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1863 return compare_floats(pa, pb, false, s); \
1864 } \
1865 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \
1866 float_status *s) \
1867 { \
1868 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
1869 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
1870 return compare_floats(pa, pb, true, s); \
1871 }
1872
1873 COMPARE(16)
1874 COMPARE(32)
1875 COMPARE(64)
1876
1877 #undef COMPARE
1878
1879 /* Multiply A by 2 raised to the power N. */
1880 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1881 {
1882 if (unlikely(is_nan(a.cls))) {
1883 return return_nan(a, s);
1884 }
1885 if (a.cls == float_class_normal) {
1886 /* The largest float type (even though not supported by FloatParts)
1887 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
1888 * still allows rounding to infinity, without allowing overflow
1889 * within the int32_t that backs FloatParts.exp.
1890 */
1891 n = MIN(MAX(n, -0x10000), 0x10000);
1892 a.exp += n;
1893 }
1894 return a;
1895 }
1896
1897 float16 float16_scalbn(float16 a, int n, float_status *status)
1898 {
1899 FloatParts pa = float16_unpack_canonical(a, status);
1900 FloatParts pr = scalbn_decomposed(pa, n, status);
1901 return float16_round_pack_canonical(pr, status);
1902 }
1903
1904 float32 float32_scalbn(float32 a, int n, float_status *status)
1905 {
1906 FloatParts pa = float32_unpack_canonical(a, status);
1907 FloatParts pr = scalbn_decomposed(pa, n, status);
1908 return float32_round_pack_canonical(pr, status);
1909 }
1910
1911 float64 float64_scalbn(float64 a, int n, float_status *status)
1912 {
1913 FloatParts pa = float64_unpack_canonical(a, status);
1914 FloatParts pr = scalbn_decomposed(pa, n, status);
1915 return float64_round_pack_canonical(pr, status);
1916 }
1917
1918 /*
1919 * Square Root
1920 *
1921 * The old softfloat code did an approximation step before zeroing in
1922 * on the final result. However for simpleness we just compute the
1923 * square root by iterating down from the implicit bit to enough extra
1924 * bits to ensure we get a correctly rounded result.
1925 *
1926 * This does mean however the calculation is slower than before,
1927 * especially for 64 bit floats.
1928 */
1929
1930 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1931 {
1932 uint64_t a_frac, r_frac, s_frac;
1933 int bit, last_bit;
1934
1935 if (is_nan(a.cls)) {
1936 return return_nan(a, s);
1937 }
1938 if (a.cls == float_class_zero) {
1939 return a; /* sqrt(+-0) = +-0 */
1940 }
1941 if (a.sign) {
1942 s->float_exception_flags |= float_flag_invalid;
1943 a.cls = float_class_dnan;
1944 return a;
1945 }
1946 if (a.cls == float_class_inf) {
1947 return a; /* sqrt(+inf) = +inf */
1948 }
1949
1950 assert(a.cls == float_class_normal);
1951
1952 /* We need two overflow bits at the top. Adding room for that is a
1953 * right shift. If the exponent is odd, we can discard the low bit
1954 * by multiplying the fraction by 2; that's a left shift. Combine
1955 * those and we shift right if the exponent is even.
1956 */
1957 a_frac = a.frac;
1958 if (!(a.exp & 1)) {
1959 a_frac >>= 1;
1960 }
1961 a.exp >>= 1;
1962
1963 /* Bit-by-bit computation of sqrt. */
1964 r_frac = 0;
1965 s_frac = 0;
1966
1967 /* Iterate from implicit bit down to the 3 extra bits to compute a
1968 * properly rounded result. Remember we've inserted one more bit
1969 * at the top, so these positions are one less.
1970 */
1971 bit = DECOMPOSED_BINARY_POINT - 1;
1972 last_bit = MAX(p->frac_shift - 4, 0);
1973 do {
1974 uint64_t q = 1ULL << bit;
1975 uint64_t t_frac = s_frac + q;
1976 if (t_frac <= a_frac) {
1977 s_frac = t_frac + q;
1978 a_frac -= t_frac;
1979 r_frac += q;
1980 }
1981 a_frac <<= 1;
1982 } while (--bit >= last_bit);
1983
1984 /* Undo the right shift done above. If there is any remaining
1985 * fraction, the result is inexact. Set the sticky bit.
1986 */
1987 a.frac = (r_frac << 1) + (a_frac != 0);
1988
1989 return a;
1990 }
1991
1992 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1993 {
1994 FloatParts pa = float16_unpack_canonical(a, status);
1995 FloatParts pr = sqrt_float(pa, status, &float16_params);
1996 return float16_round_pack_canonical(pr, status);
1997 }
1998
1999 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2000 {
2001 FloatParts pa = float32_unpack_canonical(a, status);
2002 FloatParts pr = sqrt_float(pa, status, &float32_params);
2003 return float32_round_pack_canonical(pr, status);
2004 }
2005
2006 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2007 {
2008 FloatParts pa = float64_unpack_canonical(a, status);
2009 FloatParts pr = sqrt_float(pa, status, &float64_params);
2010 return float64_round_pack_canonical(pr, status);
2011 }
2012
2013
2014 /*----------------------------------------------------------------------------
2015 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2016 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2017 | input. If `zSign' is 1, the input is negated before being converted to an
2018 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
2019 | is simply rounded to an integer, with the inexact exception raised if the
2020 | input cannot be represented exactly as an integer. However, if the fixed-
2021 | point input is too large, the invalid exception is raised and the largest
2022 | positive or negative integer is returned.
2023 *----------------------------------------------------------------------------*/
2024
2025 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2026 {
2027 int8_t roundingMode;
2028 flag roundNearestEven;
2029 int8_t roundIncrement, roundBits;
2030 int32_t z;
2031
2032 roundingMode = status->float_rounding_mode;
2033 roundNearestEven = ( roundingMode == float_round_nearest_even );
2034 switch (roundingMode) {
2035 case float_round_nearest_even:
2036 case float_round_ties_away:
2037 roundIncrement = 0x40;
2038 break;
2039 case float_round_to_zero:
2040 roundIncrement = 0;
2041 break;
2042 case float_round_up:
2043 roundIncrement = zSign ? 0 : 0x7f;
2044 break;
2045 case float_round_down:
2046 roundIncrement = zSign ? 0x7f : 0;
2047 break;
2048 default:
2049 abort();
2050 }
2051 roundBits = absZ & 0x7F;
2052 absZ = ( absZ + roundIncrement )>>7;
2053 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2054 z = absZ;
2055 if ( zSign ) z = - z;
2056 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2057 float_raise(float_flag_invalid, status);
2058 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2059 }
2060 if (roundBits) {
2061 status->float_exception_flags |= float_flag_inexact;
2062 }
2063 return z;
2064
2065 }
2066
2067 /*----------------------------------------------------------------------------
2068 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2069 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2070 | and returns the properly rounded 64-bit integer corresponding to the input.
2071 | If `zSign' is 1, the input is negated before being converted to an integer.
2072 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2073 | the inexact exception raised if the input cannot be represented exactly as
2074 | an integer. However, if the fixed-point input is too large, the invalid
2075 | exception is raised and the largest positive or negative integer is
2076 | returned.
2077 *----------------------------------------------------------------------------*/
2078
2079 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2080 float_status *status)
2081 {
2082 int8_t roundingMode;
2083 flag roundNearestEven, increment;
2084 int64_t z;
2085
2086 roundingMode = status->float_rounding_mode;
2087 roundNearestEven = ( roundingMode == float_round_nearest_even );
2088 switch (roundingMode) {
2089 case float_round_nearest_even:
2090 case float_round_ties_away:
2091 increment = ((int64_t) absZ1 < 0);
2092 break;
2093 case float_round_to_zero:
2094 increment = 0;
2095 break;
2096 case float_round_up:
2097 increment = !zSign && absZ1;
2098 break;
2099 case float_round_down:
2100 increment = zSign && absZ1;
2101 break;
2102 default:
2103 abort();
2104 }
2105 if ( increment ) {
2106 ++absZ0;
2107 if ( absZ0 == 0 ) goto overflow;
2108 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2109 }
2110 z = absZ0;
2111 if ( zSign ) z = - z;
2112 if ( z && ( ( z < 0 ) ^ zSign ) ) {
2113 overflow:
2114 float_raise(float_flag_invalid, status);
2115 return
2116 zSign ? (int64_t) LIT64( 0x8000000000000000 )
2117 : LIT64( 0x7FFFFFFFFFFFFFFF );
2118 }
2119 if (absZ1) {
2120 status->float_exception_flags |= float_flag_inexact;
2121 }
2122 return z;
2123
2124 }
2125
2126 /*----------------------------------------------------------------------------
2127 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2128 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2129 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2130 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
2131 | with the inexact exception raised if the input cannot be represented exactly
2132 | as an integer. However, if the fixed-point input is too large, the invalid
2133 | exception is raised and the largest unsigned integer is returned.
2134 *----------------------------------------------------------------------------*/
2135
2136 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2137 uint64_t absZ1, float_status *status)
2138 {
2139 int8_t roundingMode;
2140 flag roundNearestEven, increment;
2141
2142 roundingMode = status->float_rounding_mode;
2143 roundNearestEven = (roundingMode == float_round_nearest_even);
2144 switch (roundingMode) {
2145 case float_round_nearest_even:
2146 case float_round_ties_away:
2147 increment = ((int64_t)absZ1 < 0);
2148 break;
2149 case float_round_to_zero:
2150 increment = 0;
2151 break;
2152 case float_round_up:
2153 increment = !zSign && absZ1;
2154 break;
2155 case float_round_down:
2156 increment = zSign && absZ1;
2157 break;
2158 default:
2159 abort();
2160 }
2161 if (increment) {
2162 ++absZ0;
2163 if (absZ0 == 0) {
2164 float_raise(float_flag_invalid, status);
2165 return LIT64(0xFFFFFFFFFFFFFFFF);
2166 }
2167 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2168 }
2169
2170 if (zSign && absZ0) {
2171 float_raise(float_flag_invalid, status);
2172 return 0;
2173 }
2174
2175 if (absZ1) {
2176 status->float_exception_flags |= float_flag_inexact;
2177 }
2178 return absZ0;
2179 }
2180
2181 /*----------------------------------------------------------------------------
2182 | If `a' is denormal and we are in flush-to-zero mode then set the
2183 | input-denormal exception and return zero. Otherwise just return the value.
2184 *----------------------------------------------------------------------------*/
2185 float32 float32_squash_input_denormal(float32 a, float_status *status)
2186 {
2187 if (status->flush_inputs_to_zero) {
2188 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2189 float_raise(float_flag_input_denormal, status);
2190 return make_float32(float32_val(a) & 0x80000000);
2191 }
2192 }
2193 return a;
2194 }
2195
2196 /*----------------------------------------------------------------------------
2197 | Normalizes the subnormal single-precision floating-point value represented
2198 | by the denormalized significand `aSig'. The normalized exponent and
2199 | significand are stored at the locations pointed to by `zExpPtr' and
2200 | `zSigPtr', respectively.
2201 *----------------------------------------------------------------------------*/
2202
2203 static void
2204 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2205 {
2206 int8_t shiftCount;
2207
2208 shiftCount = countLeadingZeros32( aSig ) - 8;
2209 *zSigPtr = aSig<<shiftCount;
2210 *zExpPtr = 1 - shiftCount;
2211
2212 }
2213
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input. Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly. However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned. If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 | The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location. This shifted
2228 | significand must be normalized or smaller. If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding. In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2235
2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2237 float_status *status)
2238 {
2239 int8_t roundingMode;
2240 flag roundNearestEven;
2241 int8_t roundIncrement, roundBits;
2242 flag isTiny;
2243
2244 roundingMode = status->float_rounding_mode;
2245 roundNearestEven = ( roundingMode == float_round_nearest_even );
2246 switch (roundingMode) {
2247 case float_round_nearest_even:
2248 case float_round_ties_away:
2249 roundIncrement = 0x40;
2250 break;
2251 case float_round_to_zero:
2252 roundIncrement = 0;
2253 break;
2254 case float_round_up:
2255 roundIncrement = zSign ? 0 : 0x7f;
2256 break;
2257 case float_round_down:
2258 roundIncrement = zSign ? 0x7f : 0;
2259 break;
2260 default:
2261 abort();
2262 break;
2263 }
2264 roundBits = zSig & 0x7F;
2265 if ( 0xFD <= (uint16_t) zExp ) {
2266 if ( ( 0xFD < zExp )
2267 || ( ( zExp == 0xFD )
2268 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2269 ) {
2270 float_raise(float_flag_overflow | float_flag_inexact, status);
2271 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2272 }
2273 if ( zExp < 0 ) {
2274 if (status->flush_to_zero) {
2275 float_raise(float_flag_output_denormal, status);
2276 return packFloat32(zSign, 0, 0);
2277 }
2278 isTiny =
2279 (status->float_detect_tininess
2280 == float_tininess_before_rounding)
2281 || ( zExp < -1 )
2282 || ( zSig + roundIncrement < 0x80000000 );
2283 shift32RightJamming( zSig, - zExp, &zSig );
2284 zExp = 0;
2285 roundBits = zSig & 0x7F;
2286 if (isTiny && roundBits) {
2287 float_raise(float_flag_underflow, status);
2288 }
2289 }
2290 }
2291 if (roundBits) {
2292 status->float_exception_flags |= float_flag_inexact;
2293 }
2294 zSig = ( zSig + roundIncrement )>>7;
2295 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296 if ( zSig == 0 ) zExp = 0;
2297 return packFloat32( zSign, zExp, zSig );
2298
2299 }
2300
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input. This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2309
2310 static float32
2311 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2312 float_status *status)
2313 {
2314 int8_t shiftCount;
2315
2316 shiftCount = countLeadingZeros32( zSig ) - 1;
2317 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2318 status);
2319
2320 }
2321
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64 float64_squash_input_denormal(float64 a, float_status *status)
2327 {
2328 if (status->flush_inputs_to_zero) {
2329 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2330 float_raise(float_flag_input_denormal, status);
2331 return make_float64(float64_val(a) & (1ULL << 63));
2332 }
2333 }
2334 return a;
2335 }
2336
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'. The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2343
2344 static void
2345 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2346 {
2347 int8_t shiftCount;
2348
2349 shiftCount = countLeadingZeros64( aSig ) - 11;
2350 *zSigPtr = aSig<<shiftCount;
2351 *zExpPtr = 1 - shiftCount;
2352
2353 }
2354
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result. After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result. This means that any integer portion of `zSig'
2360 | will be added into the exponent. Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2363 | significand.
2364 *----------------------------------------------------------------------------*/
2365
2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2367 {
2368
2369 return make_float64(
2370 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2371
2372 }
2373
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input. Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly. However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned. If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 | The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location. This shifted
2388 | significand must be normalized or smaller. If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding. In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2395
2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2397 float_status *status)
2398 {
2399 int8_t roundingMode;
2400 flag roundNearestEven;
2401 int roundIncrement, roundBits;
2402 flag isTiny;
2403
2404 roundingMode = status->float_rounding_mode;
2405 roundNearestEven = ( roundingMode == float_round_nearest_even );
2406 switch (roundingMode) {
2407 case float_round_nearest_even:
2408 case float_round_ties_away:
2409 roundIncrement = 0x200;
2410 break;
2411 case float_round_to_zero:
2412 roundIncrement = 0;
2413 break;
2414 case float_round_up:
2415 roundIncrement = zSign ? 0 : 0x3ff;
2416 break;
2417 case float_round_down:
2418 roundIncrement = zSign ? 0x3ff : 0;
2419 break;
2420 case float_round_to_odd:
2421 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2422 break;
2423 default:
2424 abort();
2425 }
2426 roundBits = zSig & 0x3FF;
2427 if ( 0x7FD <= (uint16_t) zExp ) {
2428 if ( ( 0x7FD < zExp )
2429 || ( ( zExp == 0x7FD )
2430 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2431 ) {
2432 bool overflow_to_inf = roundingMode != float_round_to_odd &&
2433 roundIncrement != 0;
2434 float_raise(float_flag_overflow | float_flag_inexact, status);
2435 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2436 }
2437 if ( zExp < 0 ) {
2438 if (status->flush_to_zero) {
2439 float_raise(float_flag_output_denormal, status);
2440 return packFloat64(zSign, 0, 0);
2441 }
2442 isTiny =
2443 (status->float_detect_tininess
2444 == float_tininess_before_rounding)
2445 || ( zExp < -1 )
2446 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2447 shift64RightJamming( zSig, - zExp, &zSig );
2448 zExp = 0;
2449 roundBits = zSig & 0x3FF;
2450 if (isTiny && roundBits) {
2451 float_raise(float_flag_underflow, status);
2452 }
2453 if (roundingMode == float_round_to_odd) {
2454 /*
2455 * For round-to-odd case, the roundIncrement depends on
2456 * zSig which just changed.
2457 */
2458 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2459 }
2460 }
2461 }
2462 if (roundBits) {
2463 status->float_exception_flags |= float_flag_inexact;
2464 }
2465 zSig = ( zSig + roundIncrement )>>10;
2466 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2467 if ( zSig == 0 ) zExp = 0;
2468 return packFloat64( zSign, zExp, zSig );
2469
2470 }
2471
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input. This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2480
2481 static float64
2482 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2483 float_status *status)
2484 {
2485 int8_t shiftCount;
2486
2487 shiftCount = countLeadingZeros64( zSig ) - 1;
2488 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2489 status);
2490
2491 }
2492
2493 /*----------------------------------------------------------------------------
2494 | Normalizes the subnormal extended double-precision floating-point value
2495 | represented by the denormalized significand `aSig'. The normalized exponent
2496 | and significand are stored at the locations pointed to by `zExpPtr' and
2497 | `zSigPtr', respectively.
2498 *----------------------------------------------------------------------------*/
2499
2500 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2501 uint64_t *zSigPtr)
2502 {
2503 int8_t shiftCount;
2504
2505 shiftCount = countLeadingZeros64( aSig );
2506 *zSigPtr = aSig<<shiftCount;
2507 *zExpPtr = 1 - shiftCount;
2508 }
2509
2510 /*----------------------------------------------------------------------------
2511 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2512 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2513 | and returns the proper extended double-precision floating-point value
2514 | corresponding to the abstract input. Ordinarily, the abstract value is
2515 | rounded and packed into the extended double-precision format, with the
2516 | inexact exception raised if the abstract input cannot be represented
2517 | exactly. However, if the abstract value is too large, the overflow and
2518 | inexact exceptions are raised and an infinity or maximal finite value is
2519 | returned. If the abstract value is too small, the input value is rounded to
2520 | a subnormal number, and the underflow and inexact exceptions are raised if
2521 | the abstract input cannot be represented exactly as a subnormal extended
2522 | double-precision floating-point number.
2523 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
2524 | number of bits as single or double precision, respectively. Otherwise, the
2525 | result is rounded to the full precision of the extended double-precision
2526 | format.
2527 | The input significand must be normalized or smaller. If the input
2528 | significand is not normalized, `zExp' must be 0; in that case, the result
2529 | returned is a subnormal number, and it must not require rounding. The
2530 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2531 | Floating-Point Arithmetic.
2532 *----------------------------------------------------------------------------*/
2533
2534 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2535 int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2536 float_status *status)
2537 {
2538 int8_t roundingMode;
2539 flag roundNearestEven, increment, isTiny;
2540 int64_t roundIncrement, roundMask, roundBits;
2541
2542 roundingMode = status->float_rounding_mode;
2543 roundNearestEven = ( roundingMode == float_round_nearest_even );
2544 if ( roundingPrecision == 80 ) goto precision80;
2545 if ( roundingPrecision == 64 ) {
2546 roundIncrement = LIT64( 0x0000000000000400 );
2547 roundMask = LIT64( 0x00000000000007FF );
2548 }
2549 else if ( roundingPrecision == 32 ) {
2550 roundIncrement = LIT64( 0x0000008000000000 );
2551 roundMask = LIT64( 0x000000FFFFFFFFFF );
2552 }
2553 else {
2554 goto precision80;
2555 }
2556 zSig0 |= ( zSig1 != 0 );
2557 switch (roundingMode) {
2558 case float_round_nearest_even:
2559 case float_round_ties_away:
2560 break;
2561 case float_round_to_zero:
2562 roundIncrement = 0;
2563 break;
2564 case float_round_up:
2565 roundIncrement = zSign ? 0 : roundMask;
2566 break;
2567 case float_round_down:
2568 roundIncrement = zSign ? roundMask : 0;
2569 break;
2570 default:
2571 abort();
2572 }
2573 roundBits = zSig0 & roundMask;
2574 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2575 if ( ( 0x7FFE < zExp )
2576 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2577 ) {
2578 goto overflow;
2579 }
2580 if ( zExp <= 0 ) {
2581 if (status->flush_to_zero) {
2582 float_raise(float_flag_output_denormal, status);
2583 return packFloatx80(zSign, 0, 0);
2584 }
2585 isTiny =
2586 (status->float_detect_tininess
2587 == float_tininess_before_rounding)
2588 || ( zExp < 0 )
2589 || ( zSig0 <= zSig0 + roundIncrement );
2590 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2591 zExp = 0;
2592 roundBits = zSig0 & roundMask;
2593 if (isTiny && roundBits) {
2594 float_raise(float_flag_underflow, status);
2595 }
2596 if (roundBits) {
2597 status->float_exception_flags |= float_flag_inexact;
2598 }
2599 zSig0 += roundIncrement;
2600 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2601 roundIncrement = roundMask + 1;
2602 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2603 roundMask |= roundIncrement;
2604 }
2605 zSig0 &= ~ roundMask;
2606 return packFloatx80( zSign, zExp, zSig0 );
2607 }
2608 }
2609 if (roundBits) {
2610 status->float_exception_flags |= float_flag_inexact;
2611 }
2612 zSig0 += roundIncrement;
2613 if ( zSig0 < roundIncrement ) {
2614 ++zExp;
2615 zSig0 = LIT64( 0x8000000000000000 );
2616 }
2617 roundIncrement = roundMask + 1;
2618 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2619 roundMask |= roundIncrement;
2620 }
2621 zSig0 &= ~ roundMask;
2622 if ( zSig0 == 0 ) zExp = 0;
2623 return packFloatx80( zSign, zExp, zSig0 );
2624 precision80:
2625 switch (roundingMode) {
2626 case float_round_nearest_even:
2627 case float_round_ties_away:
2628 increment = ((int64_t)zSig1 < 0);
2629 break;
2630 case float_round_to_zero:
2631 increment = 0;
2632 break;
2633 case float_round_up:
2634 increment = !zSign && zSig1;
2635 break;
2636 case float_round_down:
2637 increment = zSign && zSig1;
2638 break;
2639 default:
2640 abort();
2641 }
2642 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2643 if ( ( 0x7FFE < zExp )
2644 || ( ( zExp == 0x7FFE )
2645 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2646 && increment
2647 )
2648 ) {
2649 roundMask = 0;
2650 overflow:
2651 float_raise(float_flag_overflow | float_flag_inexact, status);
2652 if ( ( roundingMode == float_round_to_zero )
2653 || ( zSign && ( roundingMode == float_round_up ) )
2654 || ( ! zSign && ( roundingMode == float_round_down ) )
2655 ) {
2656 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2657 }
2658 return packFloatx80(zSign,
2659 floatx80_infinity_high,
2660 floatx80_infinity_low);
2661 }
2662 if ( zExp <= 0 ) {
2663 isTiny =
2664 (status->float_detect_tininess
2665 == float_tininess_before_rounding)
2666 || ( zExp < 0 )
2667 || ! increment
2668 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2669 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2670 zExp = 0;
2671 if (isTiny && zSig1) {
2672 float_raise(float_flag_underflow, status);
2673 }
2674 if (zSig1) {
2675 status->float_exception_flags |= float_flag_inexact;
2676 }
2677 switch (roundingMode) {
2678 case float_round_nearest_even:
2679 case float_round_ties_away:
2680 increment = ((int64_t)zSig1 < 0);
2681 break;
2682 case float_round_to_zero:
2683 increment = 0;
2684 break;
2685 case float_round_up:
2686 increment = !zSign && zSig1;
2687 break;
2688 case float_round_down:
2689 increment = zSign && zSig1;
2690 break;
2691 default:
2692 abort();
2693 }
2694 if ( increment ) {
2695 ++zSig0;
2696 zSig0 &=
2697 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2698 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2699 }
2700 return packFloatx80( zSign, zExp, zSig0 );
2701 }
2702 }
2703 if (zSig1) {
2704 status->float_exception_flags |= float_flag_inexact;
2705 }
2706 if ( increment ) {
2707 ++zSig0;
2708 if ( zSig0 == 0 ) {
2709 ++zExp;
2710 zSig0 = LIT64( 0x8000000000000000 );
2711 }
2712 else {
2713 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2714 }
2715 }
2716 else {
2717 if ( zSig0 == 0 ) zExp = 0;
2718 }
2719 return packFloatx80( zSign, zExp, zSig0 );
2720
2721 }
2722
2723 /*----------------------------------------------------------------------------
2724 | Takes an abstract floating-point value having sign `zSign', exponent
2725 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2726 | and returns the proper extended double-precision floating-point value
2727 | corresponding to the abstract input. This routine is just like
2728 | `roundAndPackFloatx80' except that the input significand does not have to be
2729 | normalized.
2730 *----------------------------------------------------------------------------*/
2731
2732 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2733 flag zSign, int32_t zExp,
2734 uint64_t zSig0, uint64_t zSig1,
2735 float_status *status)
2736 {
2737 int8_t shiftCount;
2738
2739 if ( zSig0 == 0 ) {
2740 zSig0 = zSig1;
2741 zSig1 = 0;
2742 zExp -= 64;
2743 }
2744 shiftCount = countLeadingZeros64( zSig0 );
2745 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2746 zExp -= shiftCount;
2747 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2748 zSig0, zSig1, status);
2749
2750 }
2751
2752 /*----------------------------------------------------------------------------
2753 | Returns the least-significant 64 fraction bits of the quadruple-precision
2754 | floating-point value `a'.
2755 *----------------------------------------------------------------------------*/
2756
2757 static inline uint64_t extractFloat128Frac1( float128 a )
2758 {
2759
2760 return a.low;
2761
2762 }
2763
2764 /*----------------------------------------------------------------------------
2765 | Returns the most-significant 48 fraction bits of the quadruple-precision
2766 | floating-point value `a'.
2767 *----------------------------------------------------------------------------*/
2768
2769 static inline uint64_t extractFloat128Frac0( float128 a )
2770 {
2771
2772 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2773
2774 }
2775
2776 /*----------------------------------------------------------------------------
2777 | Returns the exponent bits of the quadruple-precision floating-point value
2778 | `a'.
2779 *----------------------------------------------------------------------------*/
2780
2781 static inline int32_t extractFloat128Exp( float128 a )
2782 {
2783
2784 return ( a.high>>48 ) & 0x7FFF;
2785
2786 }
2787
2788 /*----------------------------------------------------------------------------
2789 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2790 *----------------------------------------------------------------------------*/
2791
2792 static inline flag extractFloat128Sign( float128 a )
2793 {
2794
2795 return a.high>>63;
2796
2797 }
2798
2799 /*----------------------------------------------------------------------------
2800 | Normalizes the subnormal quadruple-precision floating-point value
2801 | represented by the denormalized significand formed by the concatenation of
2802 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
2803 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
2804 | significand are stored at the location pointed to by `zSig0Ptr', and the
2805 | least significant 64 bits of the normalized significand are stored at the
2806 | location pointed to by `zSig1Ptr'.
2807 *----------------------------------------------------------------------------*/
2808
2809 static void
2810 normalizeFloat128Subnormal(
2811 uint64_t aSig0,
2812 uint64_t aSig1,
2813 int32_t *zExpPtr,
2814 uint64_t *zSig0Ptr,
2815 uint64_t *zSig1Ptr
2816 )
2817 {
2818 int8_t shiftCount;
2819
2820 if ( aSig0 == 0 ) {
2821 shiftCount = countLeadingZeros64( aSig1 ) - 15;
2822 if ( shiftCount < 0 ) {
2823 *zSig0Ptr = aSig1>>( - shiftCount );
2824 *zSig1Ptr = aSig1<<( shiftCount & 63 );
2825 }
2826 else {
2827 *zSig0Ptr = aSig1<<shiftCount;
2828 *zSig1Ptr = 0;
2829 }
2830 *zExpPtr = - shiftCount - 63;
2831 }
2832 else {
2833 shiftCount = countLeadingZeros64( aSig0 ) - 15;
2834 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2835 *zExpPtr = 1 - shiftCount;
2836 }
2837
2838 }
2839
2840 /*----------------------------------------------------------------------------
2841 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2842 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2843 | floating-point value, returning the result. After being shifted into the
2844 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2845 | added together to form the most significant 32 bits of the result. This
2846 | means that any integer portion of `zSig0' will be added into the exponent.
2847 | Since a properly normalized significand will have an integer portion equal
2848 | to 1, the `zExp' input should be 1 less than the desired result exponent
2849 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2850 | significand.
2851 *----------------------------------------------------------------------------*/
2852
2853 static inline float128
2854 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2855 {
2856 float128 z;
2857
2858 z.low = zSig1;
2859 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2860 return z;
2861
2862 }
2863
2864 /*----------------------------------------------------------------------------
2865 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2866 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2867 | and `zSig2', and returns the proper quadruple-precision floating-point value
2868 | corresponding to the abstract input. Ordinarily, the abstract value is
2869 | simply rounded and packed into the quadruple-precision format, with the
2870 | inexact exception raised if the abstract input cannot be represented
2871 | exactly. However, if the abstract value is too large, the overflow and
2872 | inexact exceptions are raised and an infinity or maximal finite value is
2873 | returned. If the abstract value is too small, the input value is rounded to
2874 | a subnormal number, and the underflow and inexact exceptions are raised if
2875 | the abstract input cannot be represented exactly as a subnormal quadruple-
2876 | precision floating-point number.
2877 | The input significand must be normalized or smaller. If the input
2878 | significand is not normalized, `zExp' must be 0; in that case, the result
2879 | returned is a subnormal number, and it must not require rounding. In the
2880 | usual case that the input significand is normalized, `zExp' must be 1 less
2881 | than the ``true'' floating-point exponent. The handling of underflow and
2882 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2886 uint64_t zSig0, uint64_t zSig1,
2887 uint64_t zSig2, float_status *status)
2888 {
2889 int8_t roundingMode;
2890 flag roundNearestEven, increment, isTiny;
2891
2892 roundingMode = status->float_rounding_mode;
2893 roundNearestEven = ( roundingMode == float_round_nearest_even );
2894 switch (roundingMode) {
2895 case float_round_nearest_even:
2896 case float_round_ties_away:
2897 increment = ((int64_t)zSig2 < 0);
2898 break;
2899 case float_round_to_zero:
2900 increment = 0;
2901 break;
2902 case float_round_up:
2903 increment = !zSign && zSig2;
2904 break;
2905 case float_round_down:
2906 increment = zSign && zSig2;
2907 break;
2908 case float_round_to_odd:
2909 increment = !(zSig1 & 0x1) && zSig2;
2910 break;
2911 default:
2912 abort();
2913 }
2914 if ( 0x7FFD <= (uint32_t) zExp ) {
2915 if ( ( 0x7FFD < zExp )
2916 || ( ( zExp == 0x7FFD )
2917 && eq128(
2918 LIT64( 0x0001FFFFFFFFFFFF ),
2919 LIT64( 0xFFFFFFFFFFFFFFFF ),
2920 zSig0,
2921 zSig1
2922 )
2923 && increment
2924 )
2925 ) {
2926 float_raise(float_flag_overflow | float_flag_inexact, status);
2927 if ( ( roundingMode == float_round_to_zero )
2928 || ( zSign && ( roundingMode == float_round_up ) )
2929 || ( ! zSign && ( roundingMode == float_round_down ) )
2930 || (roundingMode == float_round_to_odd)
2931 ) {
2932 return
2933 packFloat128(
2934 zSign,
2935 0x7FFE,
2936 LIT64( 0x0000FFFFFFFFFFFF ),
2937 LIT64( 0xFFFFFFFFFFFFFFFF )
2938 );
2939 }
2940 return packFloat128( zSign, 0x7FFF, 0, 0 );
2941 }
2942 if ( zExp < 0 ) {
2943 if (status->flush_to_zero) {
2944 float_raise(float_flag_output_denormal, status);
2945 return packFloat128(zSign, 0, 0, 0);
2946 }
2947 isTiny =
2948 (status->float_detect_tininess
2949 == float_tininess_before_rounding)
2950 || ( zExp < -1 )
2951 || ! increment
2952 || lt128(
2953 zSig0,
2954 zSig1,
2955 LIT64( 0x0001FFFFFFFFFFFF ),
2956 LIT64( 0xFFFFFFFFFFFFFFFF )
2957 );
2958 shift128ExtraRightJamming(
2959 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2960 zExp = 0;
2961 if (isTiny && zSig2) {
2962 float_raise(float_flag_underflow, status);
2963 }
2964 switch (roundingMode) {
2965 case float_round_nearest_even:
2966 case float_round_ties_away:
2967 increment = ((int64_t)zSig2 < 0);
2968 break;
2969 case float_round_to_zero:
2970 increment = 0;
2971 break;
2972 case float_round_up:
2973 increment = !zSign && zSig2;
2974 break;
2975 case float_round_down:
2976 increment = zSign && zSig2;
2977 break;
2978 case float_round_to_odd:
2979 increment = !(zSig1 & 0x1) && zSig2;
2980 break;
2981 default:
2982 abort();
2983 }
2984 }
2985 }
2986 if (zSig2) {
2987 status->float_exception_flags |= float_flag_inexact;
2988 }
2989 if ( increment ) {
2990 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2991 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2992 }
2993 else {
2994 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2995 }
2996 return packFloat128( zSign, zExp, zSig0, zSig1 );
2997
2998 }
2999
3000 /*----------------------------------------------------------------------------
3001 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3002 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3003 | returns the proper quadruple-precision floating-point value corresponding
3004 | to the abstract input. This routine is just like `roundAndPackFloat128'
3005 | except that the input significand has fewer bits and does not have to be
3006 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
3007 | point exponent.
3008 *----------------------------------------------------------------------------*/
3009
3010 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3011 uint64_t zSig0, uint64_t zSig1,
3012 float_status *status)
3013 {
3014 int8_t shiftCount;
3015 uint64_t zSig2;
3016
3017 if ( zSig0 == 0 ) {
3018 zSig0 = zSig1;
3019 zSig1 = 0;
3020 zExp -= 64;
3021 }
3022 shiftCount = countLeadingZeros64( zSig0 ) - 15;
3023 if ( 0 <= shiftCount ) {
3024 zSig2 = 0;
3025 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3026 }
3027 else {
3028 shift128ExtraRightJamming(
3029 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3030 }
3031 zExp -= shiftCount;
3032 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3033
3034 }
3035
3036
3037 /*----------------------------------------------------------------------------
3038 | Returns the result of converting the 32-bit two's complement integer `a'
3039 | to the extended double-precision floating-point format. The conversion
3040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3041 | Arithmetic.
3042 *----------------------------------------------------------------------------*/
3043
3044 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3045 {
3046 flag zSign;
3047 uint32_t absA;
3048 int8_t shiftCount;
3049 uint64_t zSig;
3050
3051 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3052 zSign = ( a < 0 );
3053 absA = zSign ? - a : a;
3054 shiftCount = countLeadingZeros32( absA ) + 32;
3055 zSig = absA;
3056 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3057
3058 }
3059
3060 /*----------------------------------------------------------------------------
3061 | Returns the result of converting the 32-bit two's complement integer `a' to
3062 | the quadruple-precision floating-point format. The conversion is performed
3063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3064 *----------------------------------------------------------------------------*/
3065
3066 float128 int32_to_float128(int32_t a, float_status *status)
3067 {
3068 flag zSign;
3069 uint32_t absA;
3070 int8_t shiftCount;
3071 uint64_t zSig0;
3072
3073 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3074 zSign = ( a < 0 );
3075 absA = zSign ? - a : a;
3076 shiftCount = countLeadingZeros32( absA ) + 17;
3077 zSig0 = absA;
3078 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3079
3080 }
3081
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the 64-bit two's complement integer `a'
3084 | to the extended double-precision floating-point format. The conversion
3085 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic.
3087 *----------------------------------------------------------------------------*/
3088
3089 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3090 {
3091 flag zSign;
3092 uint64_t absA;
3093 int8_t shiftCount;
3094
3095 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3096 zSign = ( a < 0 );
3097 absA = zSign ? - a : a;
3098 shiftCount = countLeadingZeros64( absA );
3099 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3100
3101 }
3102
3103 /*----------------------------------------------------------------------------
3104 | Returns the result of converting the 64-bit two's complement integer `a' to
3105 | the quadruple-precision floating-point format. The conversion is performed
3106 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3107 *----------------------------------------------------------------------------*/
3108
3109 float128 int64_to_float128(int64_t a, float_status *status)
3110 {
3111 flag zSign;
3112 uint64_t absA;
3113 int8_t shiftCount;
3114 int32_t zExp;
3115 uint64_t zSig0, zSig1;
3116
3117 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3118 zSign = ( a < 0 );
3119 absA = zSign ? - a : a;
3120 shiftCount = countLeadingZeros64( absA ) + 49;
3121 zExp = 0x406E - shiftCount;
3122 if ( 64 <= shiftCount ) {
3123 zSig1 = 0;
3124 zSig0 = absA;
3125 shiftCount -= 64;
3126 }
3127 else {
3128 zSig1 = absA;
3129 zSig0 = 0;
3130 }
3131 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3132 return packFloat128( zSign, zExp, zSig0, zSig1 );
3133
3134 }
3135
3136 /*----------------------------------------------------------------------------
3137 | Returns the result of converting the 64-bit unsigned integer `a'
3138 | to the quadruple-precision floating-point format. The conversion is performed
3139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 float128 uint64_to_float128(uint64_t a, float_status *status)
3143 {
3144 if (a == 0) {
3145 return float128_zero;
3146 }
3147 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3148 }
3149
3150
3151
3152
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the single-precision floating-point value
3155 | `a' to the double-precision floating-point format. The conversion is
3156 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 | Arithmetic.
3158 *----------------------------------------------------------------------------*/
3159
3160 float64 float32_to_float64(float32 a, float_status *status)
3161 {
3162 flag aSign;
3163 int aExp;
3164 uint32_t aSig;
3165 a = float32_squash_input_denormal(a, status);
3166
3167 aSig = extractFloat32Frac( a );
3168 aExp = extractFloat32Exp( a );
3169 aSign = extractFloat32Sign( a );
3170 if ( aExp == 0xFF ) {
3171 if (aSig) {
3172 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3173 }
3174 return packFloat64( aSign, 0x7FF, 0 );
3175 }
3176 if ( aExp == 0 ) {
3177 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3178 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3179 --aExp;
3180 }
3181 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3182
3183 }
3184
3185 /*----------------------------------------------------------------------------
3186 | Returns the result of converting the single-precision floating-point value
3187 | `a' to the extended double-precision floating-point format. The conversion
3188 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3189 | Arithmetic.
3190 *----------------------------------------------------------------------------*/
3191
3192 floatx80 float32_to_floatx80(float32 a, float_status *status)
3193 {
3194 flag aSign;
3195 int aExp;
3196 uint32_t aSig;
3197
3198 a = float32_squash_input_denormal(a, status);
3199 aSig = extractFloat32Frac( a );
3200 aExp = extractFloat32Exp( a );
3201 aSign = extractFloat32Sign( a );
3202 if ( aExp == 0xFF ) {
3203 if (aSig) {
3204 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3205 }
3206 return packFloatx80(aSign,
3207 floatx80_infinity_high,
3208 floatx80_infinity_low);
3209 }
3210 if ( aExp == 0 ) {
3211 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3212 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3213 }
3214 aSig |= 0x00800000;
3215 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3216
3217 }
3218
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of converting the single-precision floating-point value
3221 | `a' to the double-precision floating-point format. The conversion is
3222 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3223 | Arithmetic.
3224 *----------------------------------------------------------------------------*/
3225
3226 float128 float32_to_float128(float32 a, float_status *status)
3227 {
3228 flag aSign;
3229 int aExp;
3230 uint32_t aSig;
3231
3232 a = float32_squash_input_denormal(a, status);
3233 aSig = extractFloat32Frac( a );
3234 aExp = extractFloat32Exp( a );
3235 aSign = extractFloat32Sign( a );
3236 if ( aExp == 0xFF ) {
3237 if (aSig) {
3238 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3239 }
3240 return packFloat128( aSign, 0x7FFF, 0, 0 );
3241 }
3242 if ( aExp == 0 ) {
3243 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3244 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3245 --aExp;
3246 }
3247 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3248
3249 }
3250
3251 /*----------------------------------------------------------------------------
3252 | Returns the remainder of the single-precision floating-point value `a'
3253 | with respect to the corresponding value `b'. The operation is performed
3254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3255 *----------------------------------------------------------------------------*/
3256
3257 float32 float32_rem(float32 a, float32 b, float_status *status)
3258 {
3259 flag aSign, zSign;
3260 int aExp, bExp, expDiff;
3261 uint32_t aSig, bSig;
3262 uint32_t q;
3263 uint64_t aSig64, bSig64, q64;
3264 uint32_t alternateASig;
3265 int32_t sigMean;
3266 a = float32_squash_input_denormal(a, status);
3267 b = float32_squash_input_denormal(b, status);
3268
3269 aSig = extractFloat32Frac( a );
3270 aExp = extractFloat32Exp( a );
3271 aSign = extractFloat32Sign( a );
3272 bSig = extractFloat32Frac( b );
3273 bExp = extractFloat32Exp( b );
3274 if ( aExp == 0xFF ) {
3275 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3276 return propagateFloat32NaN(a, b, status);
3277 }
3278 float_raise(float_flag_invalid, status);
3279 return float32_default_nan(status);
3280 }
3281 if ( bExp == 0xFF ) {
3282 if (bSig) {
3283 return propagateFloat32NaN(a, b, status);
3284 }
3285 return a;
3286 }
3287 if ( bExp == 0 ) {
3288 if ( bSig == 0 ) {
3289 float_raise(float_flag_invalid, status);
3290 return float32_default_nan(status);
3291 }
3292 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3293 }
3294 if ( aExp == 0 ) {
3295 if ( aSig == 0 ) return a;
3296 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3297 }
3298 expDiff = aExp - bExp;
3299 aSig |= 0x00800000;
3300 bSig |= 0x00800000;
3301 if ( expDiff < 32 ) {
3302 aSig <<= 8;
3303 bSig <<= 8;
3304 if ( expDiff < 0 ) {
3305 if ( expDiff < -1 ) return a;
3306 aSig >>= 1;
3307 }
3308 q = ( bSig <= aSig );
3309 if ( q ) aSig -= bSig;
3310 if ( 0 < expDiff ) {
3311 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3312 q >>= 32 - expDiff;
3313 bSig >>= 2;
3314 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3315 }
3316 else {
3317 aSig >>= 2;
3318 bSig >>= 2;
3319 }
3320 }
3321 else {
3322 if ( bSig <= aSig ) aSig -= bSig;
3323 aSig64 = ( (uint64_t) aSig )<<40;
3324 bSig64 = ( (uint64_t) bSig )<<40;
3325 expDiff -= 64;
3326 while ( 0 < expDiff ) {
3327 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3328 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3329 aSig64 = - ( ( bSig * q64 )<<38 );
3330 expDiff -= 62;
3331 }
3332 expDiff += 64;
3333 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3334 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3335 q = q64>>( 64 - expDiff );
3336 bSig <<= 6;
3337 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3338 }
3339 do {
3340 alternateASig = aSig;
3341 ++q;
3342 aSig -= bSig;
3343 } while ( 0 <= (int32_t) aSig );
3344 sigMean = aSig + alternateASig;
3345 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3346 aSig = alternateASig;
3347 }
3348 zSign = ( (int32_t) aSig < 0 );
3349 if ( zSign ) aSig = - aSig;
3350 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3351 }
3352
3353
3354
3355 /*----------------------------------------------------------------------------
3356 | Returns the binary exponential of the single-precision floating-point value
3357 | `a'. The operation is performed according to the IEC/IEEE Standard for
3358 | Binary Floating-Point Arithmetic.
3359 |
3360 | Uses the following identities:
3361 |
3362 | 1. -------------------------------------------------------------------------
3363 | x x*ln(2)
3364 | 2 = e
3365 |
3366 | 2. -------------------------------------------------------------------------
3367 | 2 3 4 5 n
3368 | x x x x x x x
3369 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3370 | 1! 2! 3! 4! 5! n!
3371 *----------------------------------------------------------------------------*/
3372
3373 static const float64 float32_exp2_coefficients[15] =
3374 {
3375 const_float64( 0x3ff0000000000000ll ), /* 1 */
3376 const_float64( 0x3fe0000000000000ll ), /* 2 */
3377 const_float64( 0x3fc5555555555555ll ), /* 3 */
3378 const_float64( 0x3fa5555555555555ll ), /* 4 */
3379 const_float64( 0x3f81111111111111ll ), /* 5 */
3380 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
3381 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
3382 const_float64( 0x3efa01a01a01a01all ), /* 8 */
3383 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
3384 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3385 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3386 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3387 const_float64( 0x3de6124613a86d09ll ), /* 13 */
3388 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3389 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3390 };
3391
3392 float32 float32_exp2(float32 a, float_status *status)
3393 {
3394 flag aSign;
3395 int aExp;
3396 uint32_t aSig;
3397 float64 r, x, xn;
3398 int i;
3399 a = float32_squash_input_denormal(a, status);
3400
3401 aSig = extractFloat32Frac( a );
3402 aExp = extractFloat32Exp( a );
3403 aSign = extractFloat32Sign( a );
3404
3405 if ( aExp == 0xFF) {
3406 if (aSig) {
3407 return propagateFloat32NaN(a, float32_zero, status);
3408 }
3409 return (aSign) ? float32_zero : a;
3410 }
3411 if (aExp == 0) {
3412 if (aSig == 0) return float32_one;
3413 }
3414
3415 float_raise(float_flag_inexact, status);
3416
3417 /* ******************************* */
3418 /* using float64 for approximation */
3419 /* ******************************* */
3420 x = float32_to_float64(a, status);
3421 x = float64_mul(x, float64_ln2, status);
3422
3423 xn = x;
3424 r = float64_one;
3425 for (i = 0 ; i < 15 ; i++) {
3426 float64 f;
3427
3428 f = float64_mul(xn, float32_exp2_coefficients[i], status);
3429 r = float64_add(r, f, status);
3430
3431 xn = float64_mul(xn, x, status);
3432 }
3433
3434 return float64_to_float32(r, status);
3435 }
3436
3437 /*----------------------------------------------------------------------------
3438 | Returns the binary log of the single-precision floating-point value `a'.
3439 | The operation is performed according to the IEC/IEEE Standard for Binary
3440 | Floating-Point Arithmetic.
3441 *----------------------------------------------------------------------------*/
3442 float32 float32_log2(float32 a, float_status *status)
3443 {
3444 flag aSign, zSign;
3445 int aExp;
3446 uint32_t aSig, zSig, i;
3447
3448 a = float32_squash_input_denormal(a, status);
3449 aSig = extractFloat32Frac( a );
3450 aExp = extractFloat32Exp( a );
3451 aSign = extractFloat32Sign( a );
3452
3453 if ( aExp == 0 ) {
3454 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3455 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3456 }
3457 if ( aSign ) {
3458 float_raise(float_flag_invalid, status);
3459 return float32_default_nan(status);
3460 }
3461 if ( aExp == 0xFF ) {
3462 if (aSig) {
3463 return propagateFloat32NaN(a, float32_zero, status);
3464 }
3465 return a;
3466 }
3467
3468 aExp -= 0x7F;
3469 aSig |= 0x00800000;
3470 zSign = aExp < 0;
3471 zSig = aExp << 23;
3472
3473 for (i = 1 << 22; i > 0; i >>= 1) {
3474 aSig = ( (uint64_t)aSig * aSig ) >> 23;
3475 if ( aSig & 0x01000000 ) {
3476 aSig >>= 1;
3477 zSig |= i;
3478 }
3479 }
3480
3481 if ( zSign )
3482 zSig = -zSig;
3483
3484 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3485 }
3486
3487 /*----------------------------------------------------------------------------
3488 | Returns 1 if the single-precision floating-point value `a' is equal to
3489 | the corresponding value `b', and 0 otherwise. The invalid exception is
3490 | raised if either operand is a NaN. Otherwise, the comparison is performed
3491 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3492 *----------------------------------------------------------------------------*/
3493
3494 int float32_eq(float32 a, float32 b, float_status *status)
3495 {
3496 uint32_t av, bv;
3497 a = float32_squash_input_denormal(a, status);
3498 b = float32_squash_input_denormal(b, status);
3499
3500 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3501 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3502 ) {
3503 float_raise(float_flag_invalid, status);
3504 return 0;
3505 }
3506 av = float32_val(a);
3507 bv = float32_val(b);
3508 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3509 }
3510
3511 /*----------------------------------------------------------------------------
3512 | Returns 1 if the single-precision floating-point value `a' is less than
3513 | or equal to the corresponding value `b', and 0 otherwise. The invalid
3514 | exception is raised if either operand is a NaN. The comparison is performed
3515 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3516 *----------------------------------------------------------------------------*/
3517
3518 int float32_le(float32 a, float32 b, float_status *status)
3519 {
3520 flag aSign, bSign;
3521 uint32_t av, bv;
3522 a = float32_squash_input_denormal(a, status);
3523 b = float32_squash_input_denormal(b, status);
3524
3525 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3526 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3527 ) {
3528 float_raise(float_flag_invalid, status);
3529 return 0;
3530 }
3531 aSign = extractFloat32Sign( a );
3532 bSign = extractFloat32Sign( b );
3533 av = float32_val(a);
3534 bv = float32_val(b);
3535 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3536 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3537
3538 }
3539
3540 /*----------------------------------------------------------------------------
3541 | Returns 1 if the single-precision floating-point value `a' is less than
3542 | the corresponding value `b', and 0 otherwise. The invalid exception is
3543 | raised if either operand is a NaN. The comparison is performed according
3544 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3545 *----------------------------------------------------------------------------*/
3546
3547 int float32_lt(float32 a, float32 b, float_status *status)
3548 {
3549 flag aSign, bSign;
3550 uint32_t av, bv;
3551 a = float32_squash_input_denormal(a, status);
3552 b = float32_squash_input_denormal(b, status);
3553
3554 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3555 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3556 ) {
3557 float_raise(float_flag_invalid, status);
3558 return 0;
3559 }
3560 aSign = extractFloat32Sign( a );
3561 bSign = extractFloat32Sign( b );
3562 av = float32_val(a);
3563 bv = float32_val(b);
3564 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3565 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3566
3567 }
3568
3569 /*----------------------------------------------------------------------------
3570 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3571 | be compared, and 0 otherwise. The invalid exception is raised if either
3572 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3573 | Standard for Binary Floating-Point Arithmetic.
3574 *----------------------------------------------------------------------------*/
3575
3576 int float32_unordered(float32 a, float32 b, float_status *status)
3577 {
3578 a = float32_squash_input_denormal(a, status);
3579 b = float32_squash_input_denormal(b, status);
3580
3581 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3582 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3583 ) {
3584 float_raise(float_flag_invalid, status);
3585 return 1;
3586 }
3587 return 0;
3588 }
3589
3590 /*----------------------------------------------------------------------------
3591 | Returns 1 if the single-precision floating-point value `a' is equal to
3592 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3593 | exception. The comparison is performed according to the IEC/IEEE Standard
3594 | for Binary Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3596
3597 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3598 {
3599 a = float32_squash_input_denormal(a, status);
3600 b = float32_squash_input_denormal(b, status);
3601
3602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3604 ) {
3605 if (float32_is_signaling_nan(a, status)
3606 || float32_is_signaling_nan(b, status)) {
3607 float_raise(float_flag_invalid, status);
3608 }
3609 return 0;
3610 }
3611 return ( float32_val(a) == float32_val(b) ) ||
3612 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3613 }
3614
3615 /*----------------------------------------------------------------------------
3616 | Returns 1 if the single-precision floating-point value `a' is less than or
3617 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3618 | cause an exception. Otherwise, the comparison is performed according to the
3619 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620 *----------------------------------------------------------------------------*/
3621
3622 int float32_le_quiet(float32 a, float32 b, float_status *status)
3623 {
3624 flag aSign, bSign;
3625 uint32_t av, bv;
3626 a = float32_squash_input_denormal(a, status);
3627 b = float32_squash_input_denormal(b, status);
3628
3629 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3630 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3631 ) {
3632 if (float32_is_signaling_nan(a, status)
3633 || float32_is_signaling_nan(b, status)) {
3634 float_raise(float_flag_invalid, status);
3635 }
3636 return 0;
3637 }
3638 aSign = extractFloat32Sign( a );
3639 bSign = extractFloat32Sign( b );
3640 av = float32_val(a);
3641 bv = float32_val(b);
3642 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3643 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3644
3645 }
3646
3647 /*----------------------------------------------------------------------------
3648 | Returns 1 if the single-precision floating-point value `a' is less than
3649 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3650 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3651 | Standard for Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653
3654 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3655 {
3656 flag aSign, bSign;
3657 uint32_t av, bv;
3658 a = float32_squash_input_denormal(a, status);
3659 b = float32_squash_input_denormal(b, status);
3660
3661 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3662 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3663 ) {
3664 if (float32_is_signaling_nan(a, status)
3665 || float32_is_signaling_nan(b, status)) {
3666 float_raise(float_flag_invalid, status);
3667 }
3668 return 0;
3669 }
3670 aSign = extractFloat32Sign( a );
3671 bSign = extractFloat32Sign( b );
3672 av = float32_val(a);
3673 bv = float32_val(b);
3674 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3675 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3676
3677 }
3678
3679 /*----------------------------------------------------------------------------
3680 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3681 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3682 | comparison is performed according to the IEC/IEEE Standard for Binary
3683 | Floating-Point Arithmetic.
3684 *----------------------------------------------------------------------------*/
3685
3686 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3687 {
3688 a = float32_squash_input_denormal(a, status);
3689 b = float32_squash_input_denormal(b, status);
3690
3691 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3692 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3693 ) {
3694 if (float32_is_signaling_nan(a, status)
3695 || float32_is_signaling_nan(b, status)) {
3696 float_raise(float_flag_invalid, status);
3697 }
3698 return 1;
3699 }
3700 return 0;
3701 }
3702
3703
3704 /*----------------------------------------------------------------------------
3705 | Returns the result of converting the double-precision floating-point value
3706 | `a' to the single-precision floating-point format. The conversion is
3707 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3708 | Arithmetic.
3709 *----------------------------------------------------------------------------*/
3710
3711 float32 float64_to_float32(float64 a, float_status *status)
3712 {
3713 flag aSign;
3714 int aExp;
3715 uint64_t aSig;
3716 uint32_t zSig;
3717 a = float64_squash_input_denormal(a, status);
3718
3719 aSig = extractFloat64Frac( a );
3720 aExp = extractFloat64Exp( a );
3721 aSign = extractFloat64Sign( a );
3722 if ( aExp == 0x7FF ) {
3723 if (aSig) {
3724 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3725 }
3726 return packFloat32( aSign, 0xFF, 0 );
3727 }
3728 shift64RightJamming( aSig, 22, &aSig );
3729 zSig = aSig;
3730 if ( aExp || zSig ) {
3731 zSig |= 0x40000000;
3732 aExp -= 0x381;
3733 }
3734 return roundAndPackFloat32(aSign, aExp, zSig, status);
3735
3736 }
3737
3738
3739 /*----------------------------------------------------------------------------
3740 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3741 | half-precision floating-point value, returning the result. After being
3742 | shifted into the proper positions, the three fields are simply added
3743 | together to form the result. This means that any integer portion of `zSig'
3744 | will be added into the exponent. Since a properly normalized significand
3745 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3746 | than the desired result exponent whenever `zSig' is a complete, normalized
3747 | significand.
3748 *----------------------------------------------------------------------------*/
3749 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3750 {
3751 return make_float16(
3752 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3753 }
3754
3755 /*----------------------------------------------------------------------------
3756 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3757 | and significand `zSig', and returns the proper half-precision floating-
3758 | point value corresponding to the abstract input. Ordinarily, the abstract
3759 | value is simply rounded and packed into the half-precision format, with
3760 | the inexact exception raised if the abstract input cannot be represented
3761 | exactly. However, if the abstract value is too large, the overflow and
3762 | inexact exceptions are raised and an infinity or maximal finite value is
3763 | returned. If the abstract value is too small, the input value is rounded to
3764 | a subnormal number, and the underflow and inexact exceptions are raised if
3765 | the abstract input cannot be represented exactly as a subnormal half-
3766 | precision floating-point number.
3767 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3768 | ARM-style "alternative representation", which omits the NaN and Inf
3769 | encodings in order to raise the maximum representable exponent by one.
3770 | The input significand `zSig' has its binary point between bits 22
3771 | and 23, which is 13 bits to the left of the usual location. This shifted
3772 | significand must be normalized or smaller. If `zSig' is not normalized,
3773 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3774 | and it must not require rounding. In the usual case that `zSig' is
3775 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3776 | Note the slightly odd position of the binary point in zSig compared with the
3777 | other roundAndPackFloat functions. This should probably be fixed if we
3778 | need to implement more float16 routines than just conversion.
3779 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3780 | Binary Floating-Point Arithmetic.
3781 *----------------------------------------------------------------------------*/
3782
3783 static float16 roundAndPackFloat16(flag zSign, int zExp,
3784 uint32_t zSig, flag ieee,
3785 float_status *status)
3786 {
3787 int maxexp = ieee ? 29 : 30;
3788 uint32_t mask;
3789 uint32_t increment;
3790 bool rounding_bumps_exp;
3791 bool is_tiny = false;
3792
3793 /* Calculate the mask of bits of the mantissa which are not
3794 * representable in half-precision and will be lost.
3795 */
3796 if (zExp < 1) {
3797 /* Will be denormal in halfprec */
3798 mask = 0x00ffffff;
3799 if (zExp >= -11) {
3800 mask >>= 11 + zExp;
3801 }
3802 } else {
3803 /* Normal number in halfprec */
3804 mask = 0x00001fff;
3805 }
3806
3807 switch (status->float_rounding_mode) {
3808 case float_round_nearest_even:
3809 increment = (mask + 1) >> 1;
3810 if ((zSig & mask) == increment) {
3811 increment = zSig & (increment << 1);
3812 }
3813 break;
3814 case float_round_ties_away:
3815 increment = (mask + 1) >> 1;
3816 break;
3817 case float_round_up:
3818 increment = zSign ? 0 : mask;
3819 break;
3820 case float_round_down:
3821 increment = zSign ? mask : 0;
3822 break;
3823 default: /* round_to_zero */
3824 increment = 0;
3825 break;
3826 }
3827
3828 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3829
3830 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3831 if (ieee) {
3832 float_raise(float_flag_overflow | float_flag_inexact, status);
3833 return packFloat16(zSign, 0x1f, 0);
3834 } else {
3835 float_raise(float_flag_invalid, status);
3836 return packFloat16(zSign, 0x1f, 0x3ff);
3837 }
3838 }
3839
3840 if (zExp < 0) {
3841 /* Note that flush-to-zero does not affect half-precision results */
3842 is_tiny =
3843 (status->float_detect_tininess == float_tininess_before_rounding)
3844 || (zExp < -1)
3845 || (!rounding_bumps_exp);
3846 }
3847 if (zSig & mask) {
3848 float_raise(float_flag_inexact, status);
3849 if (is_tiny) {
3850 float_raise(float_flag_underflow, status);
3851 }
3852 }
3853
3854 zSig += increment;
3855 if (rounding_bumps_exp) {
3856 zSig >>= 1;
3857 zExp++;
3858 }
3859
3860 if (zExp < -10) {
3861 return packFloat16(zSign, 0, 0);
3862 }
3863 if (zExp < 0) {
3864 zSig >>= -zExp;
3865 zExp = 0;
3866 }
3867 return packFloat16(zSign, zExp, zSig >> 13);
3868 }
3869
3870 /*----------------------------------------------------------------------------
3871 | If `a' is denormal and we are in flush-to-zero mode then set the
3872 | input-denormal exception and return zero. Otherwise just return the value.
3873 *----------------------------------------------------------------------------*/
3874 float16 float16_squash_input_denormal(float16 a, float_status *status)
3875 {
3876 if (status->flush_inputs_to_zero) {
3877 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3878 float_raise(float_flag_input_denormal, status);
3879 return make_float16(float16_val(a) & 0x8000);
3880 }
3881 }
3882 return a;
3883 }
3884
3885 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3886 uint32_t *zSigPtr)
3887 {
3888 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3889 *zSigPtr = aSig << shiftCount;
3890 *zExpPtr = 1 - shiftCount;
3891 }
3892
3893 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3894 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3895
3896 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3897 {
3898 flag aSign;
3899 int aExp;
3900 uint32_t aSig;
3901
3902 aSign = extractFloat16Sign(a);
3903 aExp = extractFloat16Exp(a);
3904 aSig = extractFloat16Frac(a);
3905
3906 if (aExp == 0x1f && ieee) {
3907 if (aSig) {
3908 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3909 }
3910 return packFloat32(aSign, 0xff, 0);
3911 }
3912 if (aExp == 0) {
3913 if (aSig == 0) {
3914 return packFloat32(aSign, 0, 0);
3915 }
3916
3917 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3918 aExp--;
3919 }
3920 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3921 }
3922
3923 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3924 {
3925 flag aSign;
3926 int aExp;
3927 uint32_t aSig;
3928
3929 a = float32_squash_input_denormal(a, status);
3930
3931 aSig = extractFloat32Frac( a );
3932 aExp = extractFloat32Exp( a );
3933 aSign = extractFloat32Sign( a );
3934 if ( aExp == 0xFF ) {
3935 if (aSig) {
3936 /* Input is a NaN */
3937 if (!ieee) {
3938 float_raise(float_flag_invalid, status);
3939 return packFloat16(aSign, 0, 0);
3940 }
3941 return commonNaNToFloat16(
3942 float32ToCommonNaN(a, status), status);
3943 }
3944 /* Infinity */
3945 if (!ieee) {
3946 float_raise(float_flag_invalid, status);
3947 return packFloat16(aSign, 0x1f, 0x3ff);
3948 }
3949 return packFloat16(aSign, 0x1f, 0);
3950 }
3951 if (aExp == 0 && aSig == 0) {
3952 return packFloat16(aSign, 0, 0);
3953 }
3954 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3955 * even if the input is denormal; however this is harmless because
3956 * the largest possible single-precision denormal is still smaller
3957 * than the smallest representable half-precision denormal, and so we
3958 * will end up ignoring aSig and returning via the "always return zero"
3959 * codepath.
3960 */
3961 aSig |= 0x00800000;
3962 aExp -= 0x71;
3963
3964 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3965 }
3966
3967 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3968 {
3969 flag aSign;
3970 int aExp;
3971 uint32_t aSig;
3972
3973 aSign = extractFloat16Sign(a);
3974 aExp = extractFloat16Exp(a);
3975 aSig = extractFloat16Frac(a);
3976
3977 if (aExp == 0x1f && ieee) {
3978 if (aSig) {
3979 return commonNaNToFloat64(
3980 float16ToCommonNaN(a, status), status);
3981 }
3982 return packFloat64(aSign, 0x7ff, 0);
3983 }
3984 if (aExp == 0) {
3985 if (aSig == 0) {
3986 return packFloat64(aSign, 0, 0);
3987 }
3988
3989 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3990 aExp--;
3991 }
3992 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3993 }
3994
3995 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3996 {
3997 flag aSign;
3998 int aExp;
3999 uint64_t aSig;
4000 uint32_t zSig;
4001
4002 a = float64_squash_input_denormal(a, status);
4003
4004 aSig = extractFloat64Frac(a);
4005 aExp = extractFloat64Exp(a);
4006 aSign = extractFloat64Sign(a);
4007 if (aExp == 0x7FF) {
4008 if (aSig) {
4009 /* Input is a NaN */
4010 if (!ieee) {
4011 float_raise(float_flag_invalid, status);
4012 return packFloat16(aSign, 0, 0);
4013 }
4014 return commonNaNToFloat16(
4015 float64ToCommonNaN(a, status), status);
4016 }
4017 /* Infinity */
4018 if (!ieee) {
4019 float_raise(float_flag_invalid, status);
4020 return packFloat16(aSign, 0x1f, 0x3ff);
4021 }
4022 return packFloat16(aSign, 0x1f, 0);
4023 }
4024 shift64RightJamming(aSig, 29, &aSig);
4025 zSig = aSig;
4026 if (aExp == 0 && zSig == 0) {
4027 return packFloat16(aSign, 0, 0);
4028 }
4029 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4030 * even if the input is denormal; however this is harmless because
4031 * the largest possible single-precision denormal is still smaller
4032 * than the smallest representable half-precision denormal, and so we
4033 * will end up ignoring aSig and returning via the "always return zero"
4034 * codepath.
4035 */
4036 zSig |= 0x00800000;
4037 aExp -= 0x3F1;
4038
4039 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4040 }
4041
4042 /*----------------------------------------------------------------------------
4043 | Returns the result of converting the double-precision floating-point value
4044 | `a' to the extended double-precision floating-point format. The conversion
4045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4046 | Arithmetic.
4047 *----------------------------------------------------------------------------*/
4048
4049 floatx80 float64_to_floatx80(float64 a, float_status *status)
4050 {
4051 flag aSign;
4052 int aExp;
4053 uint64_t aSig;
4054
4055 a = float64_squash_input_denormal(a, status);
4056 aSig = extractFloat64Frac( a );
4057 aExp = extractFloat64Exp( a );
4058 aSign = extractFloat64Sign( a );
4059 if ( aExp == 0x7FF ) {
4060 if (aSig) {
4061 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4062 }
4063 return packFloatx80(aSign,
4064 floatx80_infinity_high,
4065 floatx80_infinity_low);
4066 }
4067 if ( aExp == 0 ) {
4068 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4069 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4070 }
4071 return
4072 packFloatx80(
4073 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4074
4075 }
4076
4077 /*----------------------------------------------------------------------------
4078 | Returns the result of converting the double-precision floating-point value
4079 | `a' to the quadruple-precision floating-point format. The conversion is
4080 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4081 | Arithmetic.
4082 *----------------------------------------------------------------------------*/
4083
4084 float128 float64_to_float128(float64 a, float_status *status)
4085 {
4086 flag aSign;
4087 int aExp;
4088 uint64_t aSig, zSig0, zSig1;
4089
4090 a = float64_squash_input_denormal(a, status);
4091 aSig = extractFloat64Frac( a );
4092 aExp = extractFloat64Exp( a );
4093 aSign = extractFloat64Sign( a );
4094 if ( aExp == 0x7FF ) {
4095 if (aSig) {
4096 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4097 }
4098 return packFloat128( aSign, 0x7FFF, 0, 0 );
4099 }
4100 if ( aExp == 0 ) {
4101 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4102 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4103 --aExp;
4104 }
4105 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4106 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4107
4108 }
4109
4110
4111 /*----------------------------------------------------------------------------
4112 | Returns the remainder of the double-precision floating-point value `a'
4113 | with respect to the corresponding value `b'. The operation is performed
4114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115 *----------------------------------------------------------------------------*/
4116
4117 float64 float64_rem(float64 a, float64 b, float_status *status)
4118 {
4119 flag aSign, zSign;
4120 int aExp, bExp, expDiff;
4121 uint64_t aSig, bSig;
4122 uint64_t q, alternateASig;
4123 int64_t sigMean;
4124
4125 a = float64_squash_input_denormal(a, status);
4126 b = float64_squash_input_denormal(b, status);
4127 aSig = extractFloat64Frac( a );
4128 aExp = extractFloat64Exp( a );
4129 aSign = extractFloat64Sign( a );
4130 bSig = extractFloat64Frac( b );
4131 bExp = extractFloat64Exp( b );
4132 if ( aExp == 0x7FF ) {
4133 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4134 return propagateFloat64NaN(a, b, status);
4135 }
4136 float_raise(float_flag_invalid, status);
4137 return float64_default_nan(status);
4138 }
4139 if ( bExp == 0x7FF ) {
4140 if (bSig) {
4141 return propagateFloat64NaN(a, b, status);
4142 }
4143 return a;
4144 }
4145 if ( bExp == 0 ) {
4146 if ( bSig == 0 ) {
4147 float_raise(float_flag_invalid, status);
4148 return float64_default_nan(status);
4149 }
4150 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4151 }
4152 if ( aExp == 0 ) {
4153 if ( aSig == 0 ) return a;
4154 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4155 }
4156 expDiff = aExp - bExp;
4157 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4158 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4159 if ( expDiff < 0 ) {
4160 if ( expDiff < -1 ) return a;
4161 aSig >>= 1;
4162 }
4163 q = ( bSig <= aSig );
4164 if ( q ) aSig -= bSig;
4165 expDiff -= 64;
4166 while ( 0 < expDiff ) {
4167 q = estimateDiv128To64( aSig, 0, bSig );
4168 q = ( 2 < q ) ? q - 2 : 0;
4169 aSig = - ( ( bSig>>2 ) * q );
4170 expDiff -= 62;
4171 }
4172 expDiff += 64;
4173 if ( 0 < expDiff ) {
4174 q = estimateDiv128To64( aSig, 0, bSig );
4175 q = ( 2 < q ) ? q - 2 : 0;
4176 q >>= 64 - expDiff;
4177 bSig >>= 2;
4178 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4179 }
4180 else {
4181 aSig >>= 2;
4182 bSig >>= 2;
4183 }
4184 do {
4185 alternateASig = aSig;
4186 ++q;
4187 aSig -= bSig;
4188 } while ( 0 <= (int64_t) aSig );
4189 sigMean = aSig + alternateASig;
4190 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4191 aSig = alternateASig;
4192 }
4193 zSign = ( (int64_t) aSig < 0 );
4194 if ( zSign ) aSig = - aSig;
4195 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4196
4197 }
4198
4199 /*----------------------------------------------------------------------------
4200 | Returns the binary log of the double-precision floating-point value `a'.
4201 | The operation is performed according to the IEC/IEEE Standard for Binary
4202 | Floating-Point Arithmetic.
4203 *----------------------------------------------------------------------------*/
4204 float64 float64_log2(float64 a, float_status *status)
4205 {
4206 flag aSign, zSign;
4207 int aExp;
4208 uint64_t aSig, aSig0, aSig1, zSig, i;
4209 a = float64_squash_input_denormal(a, status);
4210
4211 aSig = extractFloat64Frac( a );
4212 aExp = extractFloat64Exp( a );
4213 aSign = extractFloat64Sign( a );
4214
4215 if ( aExp == 0 ) {
4216 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4217 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4218 }
4219 if ( aSign ) {
4220 float_raise(float_flag_invalid, status);
4221 return float64_default_nan(status);
4222 }
4223 if ( aExp == 0x7FF ) {
4224 if (aSig) {
4225 return propagateFloat64NaN(a, float64_zero, status);
4226 }
4227 return a;
4228 }
4229
4230 aExp -= 0x3FF;
4231 aSig |= LIT64( 0x0010000000000000 );
4232 zSign = aExp < 0;
4233 zSig = (uint64_t)aExp << 52;
4234 for (i = 1LL << 51; i > 0; i >>= 1) {
4235 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4236 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4237 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4238 aSig >>= 1;
4239 zSig |= i;
4240 }
4241 }
4242
4243 if ( zSign )
4244 zSig = -zSig;
4245 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4246 }
4247
4248 /*----------------------------------------------------------------------------
4249 | Returns 1 if the double-precision floating-point value `a' is equal to the
4250 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4251 | if either operand is a NaN. Otherwise, the comparison is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4254
4255 int float64_eq(float64 a, float64 b, float_status *status)
4256 {
4257 uint64_t av, bv;
4258 a = float64_squash_input_denormal(a, status);
4259 b = float64_squash_input_denormal(b, status);
4260
4261 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4262 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4263 ) {
4264 float_raise(float_flag_invalid, status);
4265 return 0;
4266 }
4267 av = float64_val(a);
4268 bv = float64_val(b);
4269 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4270
4271 }
4272
4273 /*----------------------------------------------------------------------------
4274 | Returns 1 if the double-precision floating-point value `a' is less than or
4275 | equal to the corresponding value `b', and 0 otherwise. The invalid
4276 | exception is raised if either operand is a NaN. The comparison is performed
4277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4279
4280 int float64_le(float64 a, float64 b, float_status *status)
4281 {
4282 flag aSign, bSign;
4283 uint64_t av, bv;
4284 a = float64_squash_input_denormal(a, status);
4285 b = float64_squash_input_denormal(b, status);
4286
4287 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4288 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4289 ) {
4290 float_raise(float_flag_invalid, status);
4291 return 0;
4292 }
4293 aSign = extractFloat64Sign( a );
4294 bSign = extractFloat64Sign( b );
4295 av = float64_val(a);
4296 bv = float64_val(b);
4297 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4298 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4299
4300 }
4301
4302 /*----------------------------------------------------------------------------
4303 | Returns 1 if the double-precision floating-point value `a' is less than
4304 | the corresponding value `b', and 0 otherwise. The invalid exception is
4305 | raised if either operand is a NaN. The comparison is performed according
4306 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4307 *----------------------------------------------------------------------------*/
4308
4309 int float64_lt(float64 a, float64 b, float_status *status)
4310 {
4311 flag aSign, bSign;
4312 uint64_t av, bv;
4313
4314 a = float64_squash_input_denormal(a, status);
4315 b = float64_squash_input_denormal(b, status);
4316 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4317 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4318 ) {
4319 float_raise(float_flag_invalid, status);
4320 return 0;
4321 }
4322 aSign = extractFloat64Sign( a );
4323 bSign = extractFloat64Sign( b );
4324 av = float64_val(a);
4325 bv = float64_val(b);
4326 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4327 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4328
4329 }
4330
4331 /*----------------------------------------------------------------------------
4332 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4333 | be compared, and 0 otherwise. The invalid exception is raised if either
4334 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4335 | Standard for Binary Floating-Point Arithmetic.
4336 *----------------------------------------------------------------------------*/
4337
4338 int float64_unordered(float64 a, float64 b, float_status *status)
4339 {
4340 a = float64_squash_input_denormal(a, status);
4341 b = float64_squash_input_denormal(b, status);
4342
4343 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4344 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4345 ) {
4346 float_raise(float_flag_invalid, status);
4347 return 1;
4348 }
4349 return 0;
4350 }
4351
4352 /*----------------------------------------------------------------------------
4353 | Returns 1 if the double-precision floating-point value `a' is equal to the
4354 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4355 | exception.The comparison is performed according to the IEC/IEEE Standard
4356 | for Binary Floating-Point Arithmetic.
4357 *----------------------------------------------------------------------------*/
4358
4359 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4360 {
4361 uint64_t av, bv;
4362 a = float64_squash_input_denormal(a, status);
4363 b = float64_squash_input_denormal(b, status);
4364
4365 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4366 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4367 ) {
4368 if (float64_is_signaling_nan(a, status)
4369 || float64_is_signaling_nan(b, status)) {
4370 float_raise(float_flag_invalid, status);
4371 }
4372 return 0;
4373 }
4374 av = float64_val(a);
4375 bv = float64_val(b);
4376 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4377
4378 }
4379
4380 /*----------------------------------------------------------------------------
4381 | Returns 1 if the double-precision floating-point value `a' is less than or
4382 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4383 | cause an exception. Otherwise, the comparison is performed according to the
4384 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
4387 int float64_le_quiet(float64 a, float64 b, float_status *status)
4388 {
4389 flag aSign, bSign;
4390 uint64_t av, bv;
4391 a = float64_squash_input_denormal(a, status);
4392 b = float64_squash_input_denormal(b, status);
4393
4394 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4395 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4396 ) {
4397 if (float64_is_signaling_nan(a, status)
4398 || float64_is_signaling_nan(b, status)) {
4399 float_raise(float_flag_invalid, status);
4400 }
4401 return 0;
4402 }
4403 aSign = extractFloat64Sign( a );
4404 bSign = extractFloat64Sign( b );
4405 av = float64_val(a);
4406 bv = float64_val(b);
4407 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4408 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4409
4410 }
4411
4412 /*----------------------------------------------------------------------------
4413 | Returns 1 if the double-precision floating-point value `a' is less than
4414 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4415 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4416 | Standard for Binary Floating-Point Arithmetic.
4417 *----------------------------------------------------------------------------*/
4418
4419 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4420 {
4421 flag aSign, bSign;
4422 uint64_t av, bv;
4423 a = float64_squash_input_denormal(a, status);
4424 b = float64_squash_input_denormal(b, status);
4425
4426 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4427 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4428 ) {
4429 if (float64_is_signaling_nan(a, status)
4430 || float64_is_signaling_nan(b, status)) {
4431 float_raise(float_flag_invalid, status);
4432 }
4433 return 0;
4434 }
4435 aSign = extractFloat64Sign( a );
4436 bSign = extractFloat64Sign( b );
4437 av = float64_val(a);
4438 bv = float64_val(b);
4439 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4440 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4441
4442 }
4443
4444 /*----------------------------------------------------------------------------
4445 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4446 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4447 | comparison is performed according to the IEC/IEEE Standard for Binary
4448 | Floating-Point Arithmetic.
4449 *----------------------------------------------------------------------------*/
4450
4451 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4452 {
4453 a = float64_squash_input_denormal(a, status);
4454 b = float64_squash_input_denormal(b, status);
4455
4456 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4457 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4458 ) {
4459 if (float64_is_signaling_nan(a, status)
4460 || float64_is_signaling_nan(b, status)) {
4461 float_raise(float_flag_invalid, status);
4462 }
4463 return 1;
4464 }
4465 return 0;
4466 }
4467
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the extended double-precision floating-
4470 | point value `a' to the 32-bit two's complement integer format. The
4471 | conversion is performed according to the IEC/IEEE Standard for Binary
4472 | Floating-Point Arithmetic---which means in particular that the conversion
4473 | is rounded according to the current rounding mode. If `a' is a NaN, the
4474 | largest positive integer is returned. Otherwise, if the conversion
4475 | overflows, the largest integer with the same sign as `a' is returned.
4476 *----------------------------------------------------------------------------*/
4477
4478 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4479 {
4480 flag aSign;
4481 int32_t aExp, shiftCount;
4482 uint64_t aSig;
4483
4484 if (floatx80_invalid_encoding(a)) {
4485 float_raise(float_flag_invalid, status);
4486 return 1 << 31;
4487 }
4488 aSig = extractFloatx80Frac( a );
4489 aExp = extractFloatx80Exp( a );
4490 aSign = extractFloatx80Sign( a );
4491 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4492 shiftCount = 0x4037 - aExp;
4493 if ( shiftCount <= 0 ) shiftCount = 1;
4494 shift64RightJamming( aSig, shiftCount, &aSig );
4495 return roundAndPackInt32(aSign, aSig, status);
4496
4497 }
4498
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the extended double-precision floating-
4501 | point value `a' to the 32-bit two's complement integer format. The
4502 | conversion is performed according to the IEC/IEEE Standard for Binary
4503 | Floating-Point Arithmetic, except that the conversion is always rounded
4504 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4505 | Otherwise, if the conversion overflows, the largest integer with the same
4506 | sign as `a' is returned.
4507 *----------------------------------------------------------------------------*/
4508
4509 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4510 {
4511 flag aSign;
4512 int32_t aExp, shiftCount;
4513 uint64_t aSig, savedASig;
4514 int32_t z;
4515
4516 if (floatx80_invalid_encoding(a)) {
4517 float_raise(float_flag_invalid, status);
4518 return 1 << 31;
4519 }
4520 aSig = extractFloatx80Frac( a );
4521 aExp = extractFloatx80Exp( a );
4522 aSign = extractFloatx80Sign( a );
4523 if ( 0x401E < aExp ) {
4524 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4525 goto invalid;
4526 }
4527 else if ( aExp < 0x3FFF ) {
4528 if (aExp || aSig) {
4529 status->float_exception_flags |= float_flag_inexact;
4530 }
4531 return 0;
4532 }
4533 shiftCount = 0x403E - aExp;
4534 savedASig = aSig;
4535 aSig >>= shiftCount;
4536 z = aSig;
4537 if ( aSign ) z = - z;
4538 if ( ( z < 0 ) ^ aSign ) {
4539 invalid:
4540 float_raise(float_flag_invalid, status);
4541 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4542 }
4543 if ( ( aSig<<shiftCount ) != savedASig ) {
4544 status->float_exception_flags |= float_flag_inexact;
4545 }
4546 return z;
4547
4548 }
4549
4550 /*----------------------------------------------------------------------------
4551 | Returns the result of converting the extended double-precision floating-
4552 | point value `a' to the 64-bit two's complement integer format. The
4553 | conversion is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic---which means in particular that the conversion
4555 | is rounded according to the current rounding mode. If `a' is a NaN,
4556 | the largest positive integer is returned. Otherwise, if the conversion
4557 | overflows, the largest integer with the same sign as `a' is returned.
4558 *----------------------------------------------------------------------------*/
4559
4560 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4561 {
4562 flag aSign;
4563 int32_t aExp, shiftCount;
4564 uint64_t aSig, aSigExtra;
4565
4566 if (floatx80_invalid_encoding(a)) {
4567 float_raise(float_flag_invalid, status);
4568 return 1ULL << 63;
4569 }
4570 aSig = extractFloatx80Frac( a );
4571 aExp = extractFloatx80Exp( a );
4572 aSign = extractFloatx80Sign( a );
4573 shiftCount = 0x403E - aExp;
4574 if ( shiftCount <= 0 ) {
4575 if ( shiftCount ) {
4576 float_raise(float_flag_invalid, status);
4577 if (!aSign || floatx80_is_any_nan(a)) {
4578 return LIT64( 0x7FFFFFFFFFFFFFFF );
4579 }
4580 return (int64_t) LIT64( 0x8000000000000000 );
4581 }
4582 aSigExtra = 0;
4583 }
4584 else {
4585 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4586 }
4587 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4588
4589 }
4590
4591 /*----------------------------------------------------------------------------
4592 | Returns the result of converting the extended double-precision floating-
4593 | point value `a' to the 64-bit two's complement integer format. The
4594 | conversion is performed according to the IEC/IEEE Standard for Binary
4595 | Floating-Point Arithmetic, except that the conversion is always rounded
4596 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4597 | Otherwise, if the conversion overflows, the largest integer with the same
4598 | sign as `a' is returned.
4599 *----------------------------------------------------------------------------*/
4600
4601 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4602 {
4603 flag aSign;
4604 int32_t aExp, shiftCount;
4605 uint64_t aSig;
4606 int64_t z;
4607
4608 if (floatx80_invalid_encoding(a)) {
4609 float_raise(float_flag_invalid, status);
4610 return 1ULL << 63;
4611 }
4612 aSig = extractFloatx80Frac( a );
4613 aExp = extractFloatx80Exp( a );
4614 aSign = extractFloatx80Sign( a );
4615 shiftCount = aExp - 0x403E;
4616 if ( 0 <= shiftCount ) {
4617 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4618 if ( ( a.high != 0xC03E ) || aSig ) {
4619 float_raise(float_flag_invalid, status);
4620 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4621 return LIT64( 0x7FFFFFFFFFFFFFFF );
4622 }
4623 }
4624 return (int64_t) LIT64( 0x8000000000000000 );
4625 }
4626 else if ( aExp < 0x3FFF ) {
4627 if (aExp | aSig) {
4628 status->float_exception_flags |= float_flag_inexact;
4629 }
4630 return 0;
4631 }
4632 z = aSig>>( - shiftCount );
4633 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4634 status->float_exception_flags |= float_flag_inexact;
4635 }
4636 if ( aSign ) z = - z;
4637 return z;
4638
4639 }
4640
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the extended double-precision floating-
4643 | point value `a' to the single-precision floating-point format. The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4647
4648 float32 floatx80_to_float32(floatx80 a, float_status *status)
4649 {
4650 flag aSign;
4651 int32_t aExp;
4652 uint64_t aSig;
4653
4654 if (floatx80_invalid_encoding(a)) {
4655 float_raise(float_flag_invalid, status);
4656 return float32_default_nan(status);
4657 }
4658 aSig = extractFloatx80Frac( a );
4659 aExp = extractFloatx80Exp( a );
4660 aSign = extractFloatx80Sign( a );
4661 if ( aExp == 0x7FFF ) {
4662 if ( (uint64_t) ( aSig<<1 ) ) {
4663 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4664 }
4665 return packFloat32( aSign, 0xFF, 0 );
4666 }
4667 shift64RightJamming( aSig, 33, &aSig );
4668 if ( aExp || aSig ) aExp -= 0x3F81;
4669 return roundAndPackFloat32(aSign, aExp, aSig, status);
4670
4671 }
4672
4673 /*----------------------------------------------------------------------------
4674 | Returns the result of converting the extended double-precision floating-
4675 | point value `a' to the double-precision floating-point format. The
4676 | conversion is performed according to the IEC/IEEE Standard for Binary
4677 | Floating-Point Arithmetic.
4678 *----------------------------------------------------------------------------*/
4679
4680 float64 floatx80_to_float64(floatx80 a, float_status *status)
4681 {
4682 flag aSign;
4683 int32_t aExp;
4684 uint64_t aSig, zSig;
4685
4686 if (floatx80_invalid_encoding(a)) {
4687 float_raise(float_flag_invalid, status);
4688 return float64_default_nan(status);
4689 }
4690 aSig = extractFloatx80Frac( a );
4691 aExp = extractFloatx80Exp( a );
4692 aSign = extractFloatx80Sign( a );
4693 if ( aExp == 0x7FFF ) {
4694 if ( (uint64_t) ( aSig<<1 ) ) {
4695 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4696 }
4697 return packFloat64( aSign, 0x7FF, 0 );
4698 }
4699 shift64RightJamming( aSig, 1, &zSig );
4700 if ( aExp || aSig ) aExp -= 0x3C01;
4701 return roundAndPackFloat64(aSign, aExp, zSig, status);
4702
4703 }
4704
4705 /*----------------------------------------------------------------------------
4706 | Returns the result of converting the extended double-precision floating-
4707 | point value `a' to the quadruple-precision floating-point format. The
4708 | conversion is performed according to the IEC/IEEE Standard for Binary
4709 | Floating-Point Arithmetic.
4710 *----------------------------------------------------------------------------*/
4711
4712 float128 floatx80_to_float128(floatx80 a, float_status *status)
4713 {
4714 flag aSign;
4715 int aExp;
4716 uint64_t aSig, zSig0, zSig1;
4717
4718 if (floatx80_invalid_encoding(a)) {
4719 float_raise(float_flag_invalid, status);
4720 return float128_default_nan(status);
4721 }
4722 aSig = extractFloatx80Frac( a );
4723 aExp = extractFloatx80Exp( a );
4724 aSign = extractFloatx80Sign( a );
4725 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4726 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4727 }
4728 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4729 return packFloat128( aSign, aExp, zSig0, zSig1 );
4730
4731 }
4732
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a'
4735 | to the precision provided by floatx80_rounding_precision and returns the
4736 | result as an extended double-precision floating-point value.
4737 | The operation is performed according to the IEC/IEEE Standard for Binary
4738 | Floating-Point Arithmetic.
4739 *----------------------------------------------------------------------------*/
4740
4741 floatx80 floatx80_round(floatx80 a, float_status *status)
4742 {
4743 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4744 extractFloatx80Sign(a),
4745 extractFloatx80Exp(a),
4746 extractFloatx80Frac(a), 0, status);
4747 }
4748
4749 /*----------------------------------------------------------------------------
4750 | Rounds the extended double-precision floating-point value `a' to an integer,
4751 | and returns the result as an extended quadruple-precision floating-point
4752 | value. The operation is performed according to the IEC/IEEE Standard for
4753 | Binary Floating-Point Arithmetic.
4754 *----------------------------------------------------------------------------*/
4755
4756 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4757 {
4758 flag aSign;
4759 int32_t aExp;
4760 uint64_t lastBitMask, roundBitsMask;
4761 floatx80 z;
4762
4763 if (floatx80_invalid_encoding(a)) {
4764 float_raise(float_flag_invalid, status);
4765 return floatx80_default_nan(status);
4766 }
4767 aExp = extractFloatx80Exp( a );
4768 if ( 0x403E <= aExp ) {
4769 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4770 return propagateFloatx80NaN(a, a, status);
4771 }
4772 return a;
4773 }
4774 if ( aExp < 0x3FFF ) {
4775 if ( ( aExp == 0 )
4776 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4777 return a;
4778 }
4779 status->float_exception_flags |= float_flag_inexact;
4780 aSign = extractFloatx80Sign( a );
4781 switch (status->float_rounding_mode) {
4782 case float_round_nearest_even:
4783 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4784 ) {
4785 return
4786 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4787 }
4788 break;
4789 case float_round_ties_away:
4790 if (aExp == 0x3FFE) {
4791 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4792 }
4793 break;
4794 case float_round_down:
4795 return
4796 aSign ?
4797 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4798 : packFloatx80( 0, 0, 0 );
4799 case float_round_up:
4800 return
4801 aSign ? packFloatx80( 1, 0, 0 )
4802 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4803 }
4804 return packFloatx80( aSign, 0, 0 );
4805 }
4806 lastBitMask = 1;
4807 lastBitMask <<= 0x403E - aExp;
4808 roundBitsMask = lastBitMask - 1;
4809 z = a;
4810 switch (status->float_rounding_mode) {
4811 case float_round_nearest_even:
4812 z.low += lastBitMask>>1;
4813 if ((z.low & roundBitsMask) == 0) {
4814 z.low &= ~lastBitMask;
4815 }
4816 break;
4817 case float_round_ties_away:
4818 z.low += lastBitMask >> 1;
4819 break;
4820 case float_round_to_zero:
4821 break;
4822 case float_round_up:
4823 if (!extractFloatx80Sign(z)) {
4824 z.low += roundBitsMask;
4825 }
4826 break;
4827 case float_round_down:
4828 if (extractFloatx80Sign(z)) {
4829 z.low += roundBitsMask;
4830 }
4831 break;
4832 default:
4833 abort();
4834 }
4835 z.low &= ~ roundBitsMask;
4836 if ( z.low == 0 ) {
4837 ++z.high;
4838 z.low = LIT64( 0x8000000000000000 );
4839 }
4840 if (z.low != a.low) {
4841 status->float_exception_flags |= float_flag_inexact;
4842 }
4843 return z;
4844
4845 }
4846
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of adding the absolute values of the extended double-
4849 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4850 | negated before being returned. `zSign' is ignored if the result is a NaN.
4851 | The addition is performed according to the IEC/IEEE Standard for Binary
4852 | Floating-Point Arithmetic.
4853 *----------------------------------------------------------------------------*/
4854
4855 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4856 float_status *status)
4857 {
4858 int32_t aExp, bExp, zExp;
4859 uint64_t aSig, bSig, zSig0, zSig1;
4860 int32_t expDiff;
4861
4862 aSig = extractFloatx80Frac( a );
4863 aExp = extractFloatx80Exp( a );
4864 bSig = extractFloatx80Frac( b );
4865 bExp = extractFloatx80Exp( b );
4866 expDiff = aExp - bExp;
4867 if ( 0 < expDiff ) {
4868 if ( aExp == 0x7FFF ) {
4869 if ((uint64_t)(aSig << 1)) {
4870 return propagateFloatx80NaN(a, b, status);
4871 }
4872 return a;
4873 }
4874 if ( bExp == 0 ) --expDiff;
4875 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4876 zExp = aExp;
4877 }
4878 else if ( expDiff < 0 ) {
4879 if ( bExp == 0x7FFF ) {
4880 if ((uint64_t)(bSig << 1)) {
4881 return propagateFloatx80NaN(a, b, status);
4882 }
4883 return packFloatx80(zSign,
4884 floatx80_infinity_high,
4885 floatx80_infinity_low);
4886 }
4887 if ( aExp == 0 ) ++expDiff;
4888 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4889 zExp = bExp;
4890 }
4891 else {
4892 if ( aExp == 0x7FFF ) {
4893 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4894 return propagateFloatx80NaN(a, b, status);
4895 }
4896 return a;
4897 }
4898 zSig1 = 0;
4899 zSig0 = aSig + bSig;
4900 if ( aExp == 0 ) {
4901 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4902 goto roundAndPack;
4903 }
4904 zExp = aExp;
4905 goto shiftRight1;
4906 }
4907 zSig0 = aSig + bSig;
4908 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4909 shiftRight1:
4910 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4911 zSig0 |= LIT64( 0x8000000000000000 );
4912 ++zExp;
4913 roundAndPack:
4914 return roundAndPackFloatx80(status->floatx80_rounding_precision,
4915 zSign, zExp, zSig0, zSig1, status);
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the absolute values of the extended
4920 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4921 | difference is negated before being returned. `zSign' is ignored if the
4922 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4923 | Standard for Binary Floating-Point Arithmetic.
4924 *----------------------------------------------------------------------------*/
4925
4926 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4927 float_status *status)
4928 {
4929 int32_t aExp, bExp, zExp;
4930 uint64_t aSig, bSig, zSig0, zSig1;
4931 int32_t expDiff;
4932
4933 aSig = extractFloatx80Frac( a );
4934 aExp = extractFloatx80Exp( a );
4935 bSig = extractFloatx80Frac( b );
4936 bExp = extractFloatx80Exp( b );
4937 expDiff = aExp - bExp;
4938 if ( 0 < expDiff ) goto aExpBigger;
4939 if ( expDiff < 0 ) goto bExpBigger;
4940 if ( aExp == 0x7FFF ) {
4941 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4942 return propagateFloatx80NaN(a, b, status);
4943 }
4944 float_raise(float_flag_invalid, status);
4945 return floatx80_default_nan(status);
4946 }
4947 if ( aExp == 0 ) {
4948 aExp = 1;
4949 bExp = 1;
4950 }
4951 zSig1 = 0;
4952 if ( bSig < aSig ) goto aBigger;
4953 if ( aSig < bSig ) goto bBigger;
4954 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4955 bExpBigger:
4956 if ( bExp == 0x7FFF ) {
4957 if ((uint64_t)(bSig << 1)) {
4958 return propagateFloatx80NaN(a, b, status);
4959 }
4960 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4961 floatx80_infinity_low);
4962 }
4963 if ( aExp == 0 ) ++expDiff;
4964 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4965 bBigger:
4966 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4967 zExp = bExp;
4968 zSign ^= 1;
4969 goto normalizeRoundAndPack;
4970 aExpBigger:
4971 if ( aExp == 0x7FFF ) {
4972 if ((uint64_t)(aSig << 1)) {
4973 return propagateFloatx80NaN(a, b, status);
4974 }
4975 return a;
4976 }
4977 if ( bExp == 0 ) --expDiff;
4978 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4979 aBigger:
4980 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4981 zExp = aExp;
4982 normalizeRoundAndPack:
4983 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4984 zSign, zExp, zSig0, zSig1, status);
4985 }
4986
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of adding the extended double-precision floating-point
4989 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4990 | Standard for Binary Floating-Point Arithmetic.
4991 *----------------------------------------------------------------------------*/
4992
4993 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4994 {
4995 flag aSign, bSign;
4996
4997 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4998 float_raise(float_flag_invalid, status);
4999 return floatx80_default_nan(status);
5000 }
5001 aSign = extractFloatx80Sign( a );
5002 bSign = extractFloatx80Sign( b );
5003 if ( aSign == bSign ) {
5004 return addFloatx80Sigs(a, b, aSign, status);
5005 }
5006 else {
5007 return subFloatx80Sigs(a, b, aSign, status);
5008 }
5009
5010 }
5011
5012 /*----------------------------------------------------------------------------
5013 | Returns the result of subtracting the extended double-precision floating-
5014 | point values `a' and `b'. The operation is performed according to the
5015 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5016 *----------------------------------------------------------------------------*/
5017
5018 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5019 {
5020 flag aSign, bSign;
5021
5022 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5023 float_raise(float_flag_invalid, status);
5024 return floatx80_default_nan(status);
5025 }
5026 aSign = extractFloatx80Sign( a );
5027 bSign = extractFloatx80Sign( b );
5028 if ( aSign == bSign ) {
5029 return subFloatx80Sigs(a, b, aSign, status);
5030 }
5031 else {
5032 return addFloatx80Sigs(a, b, aSign, status);
5033 }
5034
5035 }
5036
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of multiplying the extended double-precision floating-
5039 | point values `a' and `b'. The operation is performed according to the
5040 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5041 *----------------------------------------------------------------------------*/
5042
5043 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5044 {
5045 flag aSign, bSign, zSign;
5046 int32_t aExp, bExp, zExp;
5047 uint64_t aSig, bSig, zSig0, zSig1;
5048
5049 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5050 float_raise(float_flag_invalid, status);
5051 return floatx80_default_nan(status);
5052 }
5053 aSig = extractFloatx80Frac( a );
5054 aExp = extractFloatx80Exp( a );
5055 aSign = extractFloatx80Sign( a );
5056 bSig = extractFloatx80Frac( b );
5057 bExp = extractFloatx80Exp( b );
5058 bSign = extractFloatx80Sign( b );
5059 zSign = aSign ^ bSign;
5060 if ( aExp == 0x7FFF ) {
5061 if ( (uint64_t) ( aSig<<1 )
5062 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5063 return propagateFloatx80NaN(a, b, status);
5064 }
5065 if ( ( bExp | bSig ) == 0 ) goto invalid;
5066 return packFloatx80(zSign, floatx80_infinity_high,
5067 floatx80_infinity_low);
5068 }
5069 if ( bExp == 0x7FFF ) {
5070 if ((uint64_t)(bSig << 1)) {
5071 return propagateFloatx80NaN(a, b, status);
5072 }
5073 if ( ( aExp | aSig ) == 0 ) {
5074 invalid:
5075 float_raise(float_flag_invalid, status);
5076 return floatx80_default_nan(status);
5077 }
5078 return packFloatx80(zSign, floatx80_infinity_high,
5079 floatx80_infinity_low);
5080 }
5081 if ( aExp == 0 ) {
5082 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5083 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5084 }
5085 if ( bExp == 0 ) {
5086 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5087 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5088 }
5089 zExp = aExp + bExp - 0x3FFE;
5090 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5091 if ( 0 < (int64_t) zSig0 ) {
5092 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5093 --zExp;
5094 }
5095 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5096 zSign, zExp, zSig0, zSig1, status);
5097 }
5098
5099 /*----------------------------------------------------------------------------
5100 | Returns the result of dividing the extended double-precision floating-point
5101 | value `a' by the corresponding value `b'. The operation is performed
5102 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5103 *----------------------------------------------------------------------------*/
5104
5105 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5106 {
5107 flag aSign, bSign, zSign;
5108 int32_t aExp, bExp, zExp;
5109 uint64_t aSig, bSig, zSig0, zSig1;
5110 uint64_t rem0, rem1, rem2, term0, term1, term2;
5111
5112 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5113 float_raise(float_flag_invalid, status);
5114 return floatx80_default_nan(status);
5115 }
5116 aSig = extractFloatx80Frac( a );
5117 aExp = extractFloatx80Exp( a );
5118 aSign = extractFloatx80Sign( a );
5119 bSig = extractFloatx80Frac( b );
5120 bExp = extractFloatx80Exp( b );
5121 bSign = extractFloatx80Sign( b );
5122 zSign = aSign ^ bSign;
5123 if ( aExp == 0x7FFF ) {
5124 if ((uint64_t)(aSig << 1)) {
5125 return propagateFloatx80NaN(a, b, status);
5126 }
5127 if ( bExp == 0x7FFF ) {
5128 if ((uint64_t)(bSig << 1)) {
5129 return propagateFloatx80NaN(a, b, status);
5130 }
5131 goto invalid;
5132 }
5133 return packFloatx80(zSign, floatx80_infinity_high,
5134 floatx80_infinity_low);
5135 }
5136 if ( bExp == 0x7FFF ) {
5137 if ((uint64_t)(bSig << 1)) {
5138 return propagateFloatx80NaN(a, b, status);
5139 }
5140 return packFloatx80( zSign, 0, 0 );
5141 }
5142 if ( bExp == 0 ) {
5143 if ( bSig == 0 ) {
5144 if ( ( aExp | aSig ) == 0 ) {
5145 invalid:
5146 float_raise(float_flag_invalid, status);
5147 return floatx80_default_nan(status);
5148 }
5149 float_raise(float_flag_divbyzero, status);
5150 return packFloatx80(zSign, floatx80_infinity_high,
5151 floatx80_infinity_low);
5152 }
5153 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5154 }
5155 if ( aExp == 0 ) {
5156 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5157 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5158 }
5159 zExp = aExp - bExp + 0x3FFE;
5160 rem1 = 0;
5161 if ( bSig <= aSig ) {
5162 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5163 ++zExp;
5164 }
5165 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5166 mul64To128( bSig, zSig0, &term0, &term1 );
5167 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5168 while ( (int64_t) rem0 < 0 ) {
5169 --zSig0;
5170 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5171 }
5172 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5173 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5174 mul64To128( bSig, zSig1, &term1, &term2 );
5175 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5176 while ( (int64_t) rem1 < 0 ) {
5177 --zSig1;
5178 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5179 }
5180 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5181 }
5182 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183 zSign, zExp, zSig0, zSig1, status);
5184 }
5185
5186 /*----------------------------------------------------------------------------
5187 | Returns the remainder of the extended double-precision floating-point value
5188 | `a' with respect to the corresponding value `b'. The operation is performed
5189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5190 *----------------------------------------------------------------------------*/
5191
5192 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5193 {
5194 flag aSign, zSign;
5195 int32_t aExp, bExp, expDiff;
5196 uint64_t aSig0, aSig1, bSig;
5197 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5198
5199 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5200 float_raise(float_flag_invalid, status);
5201 return floatx80_default_nan(status);
5202 }
5203 aSig0 = extractFloatx80Frac( a );
5204 aExp = extractFloatx80Exp( a );
5205 aSign = extractFloatx80Sign( a );
5206 bSig = extractFloatx80Frac( b );
5207 bExp = extractFloatx80Exp( b );
5208 if ( aExp == 0x7FFF ) {
5209 if ( (uint64_t) ( aSig0<<1 )
5210 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5211 return propagateFloatx80NaN(a, b, status);
5212 }
5213 goto invalid;
5214 }
5215 if ( bExp == 0x7FFF ) {
5216 if ((uint64_t)(bSig << 1)) {
5217 return propagateFloatx80NaN(a, b, status);
5218 }
5219 return a;
5220 }
5221 if ( bExp == 0 ) {
5222 if ( bSig == 0 ) {
5223 invalid:
5224 float_raise(float_flag_invalid, status);
5225 return floatx80_default_nan(status);
5226 }
5227 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5228 }
5229 if ( aExp == 0 ) {
5230 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5231 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5232 }
5233 bSig |= LIT64( 0x8000000000000000 );
5234 zSign = aSign;
5235 expDiff = aExp - bExp;
5236 aSig1 = 0;
5237 if ( expDiff < 0 ) {
5238 if ( expDiff < -1 ) return a;
5239 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5240 expDiff = 0;
5241 }
5242 q = ( bSig <= aSig0 );
5243 if ( q ) aSig0 -= bSig;
5244 expDiff -= 64;
5245 while ( 0 < expDiff ) {
5246 q = estimateDiv128To64( aSig0, aSig1, bSig );
5247 q = ( 2 < q ) ? q - 2 : 0;
5248 mul64To128( bSig, q, &term0, &term1 );
5249 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5250 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5251 expDiff -= 62;
5252 }
5253 expDiff += 64;
5254 if ( 0 < expDiff ) {
5255 q = estimateDiv128To64( aSig0, aSig1, bSig );
5256 q = ( 2 < q ) ? q - 2 : 0;
5257 q >>= 64 - expDiff;
5258 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5259 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5260 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5261 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5262 ++q;
5263 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5264 }
5265 }
5266 else {
5267 term1 = 0;
5268 term0 = bSig;
5269 }
5270 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5271 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5272 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5273 && ( q & 1 ) )
5274 ) {
5275 aSig0 = alternateASig0;
5276 aSig1 = alternateASig1;
5277 zSign = ! zSign;
5278 }
5279 return
5280 normalizeRoundAndPackFloatx80(
5281 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5282
5283 }
5284
5285 /*----------------------------------------------------------------------------
5286 | Returns the square root of the extended double-precision floating-point
5287 | value `a'. The operation is performed according to the IEC/IEEE Standard
5288 | for Binary Floating-Point Arithmetic.
5289 *----------------------------------------------------------------------------*/
5290
5291 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5292 {
5293 flag aSign;
5294 int32_t aExp, zExp;
5295 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5296 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5297
5298 if (floatx80_invalid_encoding(a)) {
5299 float_raise(float_flag_invalid, status);
5300 return floatx80_default_nan(status);
5301 }
5302 aSig0 = extractFloatx80Frac( a );
5303 aExp = extractFloatx80Exp( a );
5304 aSign = extractFloatx80Sign( a );
5305 if ( aExp == 0x7FFF ) {
5306 if ((uint64_t)(aSig0 << 1)) {
5307 return propagateFloatx80NaN(a, a, status);
5308 }
5309 if ( ! aSign ) return a;
5310 goto invalid;
5311 }
5312 if ( aSign ) {
5313 if ( ( aExp | aSig0 ) == 0 ) return a;
5314 invalid:
5315 float_raise(float_flag_invalid, status);
5316 return floatx80_default_nan(status);
5317 }
5318 if ( aExp == 0 ) {
5319 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5320 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5321 }
5322 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5323 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5324 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5325 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5326 doubleZSig0 = zSig0<<1;
5327 mul64To128( zSig0, zSig0, &term0, &term1 );
5328 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5329 while ( (int64_t) rem0 < 0 ) {
5330 --zSig0;
5331 doubleZSig0 -= 2;
5332 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5333 }
5334 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5335 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5336 if ( zSig1 == 0 ) zSig1 = 1;
5337 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5338 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5339 mul64To128( zSig1, zSig1, &term2, &term3 );
5340 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5341 while ( (int64_t) rem1 < 0 ) {
5342 --zSig1;
5343 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5344 term3 |= 1;
5345 term2 |= doubleZSig0;
5346 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5347 }
5348 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5349 }
5350 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5351 zSig0 |= doubleZSig0;
5352 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5353 0, zExp, zSig0, zSig1, status);
5354 }
5355
5356 /*----------------------------------------------------------------------------
5357 | Returns 1 if the extended double-precision floating-point value `a' is equal
5358 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5359 | raised if either operand is a NaN. Otherwise, the comparison is performed
5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5361 *----------------------------------------------------------------------------*/
5362
5363 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5364 {
5365
5366 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5367 || (extractFloatx80Exp(a) == 0x7FFF
5368 && (uint64_t) (extractFloatx80Frac(a) << 1))
5369 || (extractFloatx80Exp(b) == 0x7FFF
5370 && (uint64_t) (extractFloatx80Frac(b) << 1))
5371 ) {
5372 float_raise(float_flag_invalid, status);
5373 return 0;
5374 }
5375 return
5376 ( a.low == b.low )
5377 && ( ( a.high == b.high )
5378 || ( ( a.low == 0 )
5379 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5380 );
5381
5382 }
5383
5384 /*----------------------------------------------------------------------------
5385 | Returns 1 if the extended double-precision floating-point value `a' is
5386 | less than or equal to the corresponding value `b', and 0 otherwise. The
5387 | invalid exception is raised if either operand is a NaN. The comparison is
5388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5389 | Arithmetic.
5390 *----------------------------------------------------------------------------*/
5391
5392 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5393 {
5394 flag aSign, bSign;
5395
5396 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5397 || (extractFloatx80Exp(a) == 0x7FFF
5398 && (uint64_t) (extractFloatx80Frac(a) << 1))
5399 || (extractFloatx80Exp(b) == 0x7FFF
5400 && (uint64_t) (extractFloatx80Frac(b) << 1))
5401 ) {
5402 float_raise(float_flag_invalid, status);
5403 return 0;
5404 }
5405 aSign = extractFloatx80Sign( a );
5406 bSign = extractFloatx80Sign( b );
5407 if ( aSign != bSign ) {
5408 return
5409 aSign
5410 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5411 == 0 );
5412 }
5413 return
5414 aSign ? le128( b.high, b.low, a.high, a.low )
5415 : le128( a.high, a.low, b.high, b.low );
5416
5417 }
5418
5419 /*----------------------------------------------------------------------------
5420 | Returns 1 if the extended double-precision floating-point value `a' is
5421 | less than the corresponding value `b', and 0 otherwise. The invalid
5422 | exception is raised if either operand is a NaN. The comparison is performed
5423 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5424 *----------------------------------------------------------------------------*/
5425
5426 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5427 {
5428 flag aSign, bSign;
5429
5430 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5431 || (extractFloatx80Exp(a) == 0x7FFF
5432 && (uint64_t) (extractFloatx80Frac(a) << 1))
5433 || (extractFloatx80Exp(b) == 0x7FFF
5434 && (uint64_t) (extractFloatx80Frac(b) << 1))
5435 ) {
5436 float_raise(float_flag_invalid, status);
5437 return 0;
5438 }
5439 aSign = extractFloatx80Sign( a );
5440 bSign = extractFloatx80Sign( b );
5441 if ( aSign != bSign ) {
5442 return
5443 aSign
5444 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5445 != 0 );
5446 }
5447 return
5448 aSign ? lt128( b.high, b.low, a.high, a.low )
5449 : lt128( a.high, a.low, b.high, b.low );
5450
5451 }
5452
5453 /*----------------------------------------------------------------------------
5454 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5455 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5456 | either operand is a NaN. The comparison is performed according to the
5457 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5458 *----------------------------------------------------------------------------*/
5459 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5460 {
5461 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5462 || (extractFloatx80Exp(a) == 0x7FFF
5463 && (uint64_t) (extractFloatx80Frac(a) << 1))
5464 || (extractFloatx80Exp(b) == 0x7FFF
5465 && (uint64_t) (extractFloatx80Frac(b) << 1))
5466 ) {
5467 float_raise(float_flag_invalid, status);
5468 return 1;
5469 }
5470 return 0;
5471 }
5472
5473 /*----------------------------------------------------------------------------
5474 | Returns 1 if the extended double-precision floating-point value `a' is
5475 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5476 | cause an exception. The comparison is performed according to the IEC/IEEE
5477 | Standard for Binary Floating-Point Arithmetic.
5478 *----------------------------------------------------------------------------*/
5479
5480 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5481 {
5482
5483 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5484 float_raise(float_flag_invalid, status);
5485 return 0;
5486 }
5487 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5488 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5489 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5490 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5491 ) {
5492 if (floatx80_is_signaling_nan(a, status)
5493 || floatx80_is_signaling_nan(b, status)) {
5494 float_raise(float_flag_invalid, status);
5495 }
5496 return 0;
5497 }
5498 return
5499 ( a.low == b.low )
5500 && ( ( a.high == b.high )
5501 || ( ( a.low == 0 )
5502 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5503 );
5504
5505 }
5506
5507 /*----------------------------------------------------------------------------
5508 | Returns 1 if the extended double-precision floating-point value `a' is less
5509 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5510 | do not cause an exception. Otherwise, the comparison is performed according
5511 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5512 *----------------------------------------------------------------------------*/
5513
5514 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5515 {
5516 flag aSign, bSign;
5517
5518 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5519 float_raise(float_flag_invalid, status);
5520 return 0;
5521 }
5522 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5523 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5524 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5525 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5526 ) {
5527 if (floatx80_is_signaling_nan(a, status)
5528 || floatx80_is_signaling_nan(b, status)) {
5529 float_raise(float_flag_invalid, status);
5530 }
5531 return 0;
5532 }
5533 aSign = extractFloatx80Sign( a );
5534 bSign = extractFloatx80Sign( b );
5535 if ( aSign != bSign ) {
5536 return
5537 aSign
5538 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5539 == 0 );
5540 }
5541 return
5542 aSign ? le128( b.high, b.low, a.high, a.low )
5543 : le128( a.high, a.low, b.high, b.low );
5544
5545 }
5546
5547 /*----------------------------------------------------------------------------
5548 | Returns 1 if the extended double-precision floating-point value `a' is less
5549 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5550 | an exception. Otherwise, the comparison is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553
5554 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5555 {
5556 flag aSign, bSign;
5557
5558 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5559 float_raise(float_flag_invalid, status);
5560 return 0;
5561 }
5562 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5563 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5564 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5565 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5566 ) {
5567 if (floatx80_is_signaling_nan(a, status)
5568 || floatx80_is_signaling_nan(b, status)) {
5569 float_raise(float_flag_invalid, status);
5570 }
5571 return 0;
5572 }
5573 aSign = extractFloatx80Sign( a );
5574 bSign = extractFloatx80Sign( b );
5575 if ( aSign != bSign ) {
5576 return
5577 aSign
5578 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5579 != 0 );
5580 }
5581 return
5582 aSign ? lt128( b.high, b.low, a.high, a.low )
5583 : lt128( a.high, a.low, b.high, b.low );
5584
5585 }
5586
5587 /*----------------------------------------------------------------------------
5588 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5589 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5590 | The comparison is performed according to the IEC/IEEE Standard for Binary
5591 | Floating-Point Arithmetic.
5592 *----------------------------------------------------------------------------*/
5593 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5594 {
5595 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5596 float_raise(float_flag_invalid, status);
5597 return 1;
5598 }
5599 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5600 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5601 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5602 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5603 ) {
5604 if (floatx80_is_signaling_nan(a, status)
5605 || floatx80_is_signaling_nan(b, status)) {
5606 float_raise(float_flag_invalid, status);
5607 }
5608 return 1;
5609 }
5610 return 0;
5611 }
5612
5613 /*----------------------------------------------------------------------------
5614 | Returns the result of converting the quadruple-precision floating-point
5615 | value `a' to the 32-bit two's complement integer format. The conversion
5616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5617 | Arithmetic---which means in particular that the conversion is rounded
5618 | according to the current rounding mode. If `a' is a NaN, the largest
5619 | positive integer is returned. Otherwise, if the conversion overflows, the
5620 | largest integer with the same sign as `a' is returned.
5621 *----------------------------------------------------------------------------*/
5622
5623 int32_t float128_to_int32(float128 a, float_status *status)
5624 {
5625 flag aSign;
5626 int32_t aExp, shiftCount;
5627 uint64_t aSig0, aSig1;
5628
5629 aSig1 = extractFloat128Frac1( a );
5630 aSig0 = extractFloat128Frac0( a );
5631 aExp = extractFloat128Exp( a );
5632 aSign = extractFloat128Sign( a );
5633 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5634 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5635 aSig0 |= ( aSig1 != 0 );
5636 shiftCount = 0x4028 - aExp;
5637 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5638 return roundAndPackInt32(aSign, aSig0, status);
5639
5640 }
5641
5642 /*----------------------------------------------------------------------------
5643 | Returns the result of converting the quadruple-precision floating-point
5644 | value `a' to the 32-bit two's complement integer format. The conversion
5645 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5646 | Arithmetic, except that the conversion is always rounded toward zero. If
5647 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5648 | conversion overflows, the largest integer with the same sign as `a' is
5649 | returned.
5650 *----------------------------------------------------------------------------*/
5651
5652 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5653 {
5654 flag aSign;
5655 int32_t aExp, shiftCount;
5656 uint64_t aSig0, aSig1, savedASig;
5657 int32_t z;
5658
5659 aSig1 = extractFloat128Frac1( a );
5660 aSig0 = extractFloat128Frac0( a );
5661 aExp = extractFloat128Exp( a );
5662 aSign = extractFloat128Sign( a );
5663 aSig0 |= ( aSig1 != 0 );
5664 if ( 0x401E < aExp ) {
5665 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5666 goto invalid;
5667 }
5668 else if ( aExp < 0x3FFF ) {
5669 if (aExp || aSig0) {
5670 status->float_exception_flags |= float_flag_inexact;
5671 }
5672 return 0;
5673 }
5674 aSig0 |= LIT64( 0x0001000000000000 );
5675 shiftCount = 0x402F - aExp;
5676 savedASig = aSig0;
5677 aSig0 >>= shiftCount;
5678 z = aSig0;
5679 if ( aSign ) z = - z;
5680 if ( ( z < 0 ) ^ aSign ) {
5681 invalid:
5682 float_raise(float_flag_invalid, status);
5683 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5684 }
5685 if ( ( aSig0<<shiftCount ) != savedASig ) {
5686 status->float_exception_flags |= float_flag_inexact;
5687 }
5688 return z;
5689
5690 }
5691
5692 /*----------------------------------------------------------------------------
5693 | Returns the result of converting the quadruple-precision floating-point
5694 | value `a' to the 64-bit two's complement integer format. The conversion
5695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696 | Arithmetic---which means in particular that the conversion is rounded
5697 | according to the current rounding mode. If `a' is a NaN, the largest
5698 | positive integer is returned. Otherwise, if the conversion overflows, the
5699 | largest integer with the same sign as `a' is returned.
5700 *----------------------------------------------------------------------------*/
5701
5702 int64_t float128_to_int64(float128 a, float_status *status)
5703 {
5704 flag aSign;
5705 int32_t aExp, shiftCount;
5706 uint64_t aSig0, aSig1;
5707
5708 aSig1 = extractFloat128Frac1( a );
5709 aSig0 = extractFloat128Frac0( a );
5710 aExp = extractFloat128Exp( a );
5711 aSign = extractFloat128Sign( a );
5712 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5713 shiftCount = 0x402F - aExp;
5714 if ( shiftCount <= 0 ) {
5715 if ( 0x403E < aExp ) {
5716 float_raise(float_flag_invalid, status);
5717 if ( ! aSign
5718 || ( ( aExp == 0x7FFF )
5719 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5720 )
5721 ) {
5722 return LIT64( 0x7FFFFFFFFFFFFFFF );
5723 }
5724 return (int64_t) LIT64( 0x8000000000000000 );
5725 }
5726 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5727 }
5728 else {
5729 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5730 }
5731 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5732
5733 }
5734
5735 /*----------------------------------------------------------------------------
5736 | Returns the result of converting the quadruple-precision floating-point
5737 | value `a' to the 64-bit two's complement integer format. The conversion
5738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5739 | Arithmetic, except that the conversion is always rounded toward zero.
5740 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5741 | the conversion overflows, the largest integer with the same sign as `a' is
5742 | returned.
5743 *----------------------------------------------------------------------------*/
5744
5745 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5746 {
5747 flag aSign;
5748 int32_t aExp, shiftCount;
5749 uint64_t aSig0, aSig1;
5750 int64_t z;
5751
5752 aSig1 = extractFloat128Frac1( a );
5753 aSig0 = extractFloat128Frac0( a );
5754 aExp = extractFloat128Exp( a );
5755 aSign = extractFloat128Sign( a );
5756 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5757 shiftCount = aExp - 0x402F;
5758 if ( 0 < shiftCount ) {
5759 if ( 0x403E <= aExp ) {
5760 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5761 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5762 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5763 if (aSig1) {
5764 status->float_exception_flags |= float_flag_inexact;
5765 }
5766 }
5767 else {
5768 float_raise(float_flag_invalid, status);
5769 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5770 return LIT64( 0x7FFFFFFFFFFFFFFF );
5771 }
5772 }
5773 return (int64_t) LIT64( 0x8000000000000000 );
5774 }
5775 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5776 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5777 status->float_exception_flags |= float_flag_inexact;
5778 }
5779 }
5780 else {
5781 if ( aExp < 0x3FFF ) {
5782 if ( aExp | aSig0 | aSig1 ) {
5783 status->float_exception_flags |= float_flag_inexact;
5784 }
5785 return 0;
5786 }
5787 z = aSig0>>( - shiftCount );
5788 if ( aSig1
5789 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5790 status->float_exception_flags |= float_flag_inexact;
5791 }
5792 }
5793 if ( aSign ) z = - z;
5794 return z;
5795
5796 }
5797
5798 /*----------------------------------------------------------------------------
5799 | Returns the result of converting the quadruple-precision floating-point value
5800 | `a' to the 64-bit unsigned integer format. The conversion is
5801 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5802 | Arithmetic---which means in particular that the conversion is rounded
5803 | according to the current rounding mode. If `a' is a NaN, the largest
5804 | positive integer is returned. If the conversion overflows, the
5805 | largest unsigned integer is returned. If 'a' is negative, the value is
5806 | rounded and zero is returned; negative values that do not round to zero
5807 | will raise the inexact exception.
5808 *----------------------------------------------------------------------------*/
5809
5810 uint64_t float128_to_uint64(float128 a, float_status *status)
5811 {
5812 flag aSign;
5813 int aExp;
5814 int shiftCount;
5815 uint64_t aSig0, aSig1;
5816
5817 aSig0 = extractFloat128Frac0(a);
5818 aSig1 = extractFloat128Frac1(a);
5819 aExp = extractFloat128Exp(a);
5820 aSign = extractFloat128Sign(a);
5821 if (aSign && (aExp > 0x3FFE)) {
5822 float_raise(float_flag_invalid, status);
5823 if (float128_is_any_nan(a)) {
5824 return LIT64(0xFFFFFFFFFFFFFFFF);
5825 } else {
5826 return 0;
5827 }
5828 }
5829 if (aExp) {
5830 aSig0 |= LIT64(0x0001000000000000);
5831 }
5832 shiftCount = 0x402F - aExp;
5833 if (shiftCount <= 0) {
5834 if (0x403E < aExp) {
5835 float_raise(float_flag_invalid, status);
5836 return LIT64(0xFFFFFFFFFFFFFFFF);
5837 }
5838 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5839 } else {
5840 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5841 }
5842 return roundAndPackUint64(aSign, aSig0, aSig1, status);
5843 }
5844
5845 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5846 {
5847 uint64_t v;
5848 signed char current_rounding_mode = status->float_rounding_mode;
5849
5850 set_float_rounding_mode(float_round_to_zero, status);
5851 v = float128_to_uint64(a, status);
5852 set_float_rounding_mode(current_rounding_mode, status);
5853
5854 return v;
5855 }
5856
5857 /*----------------------------------------------------------------------------
5858 | Returns the result of converting the quadruple-precision floating-point
5859 | value `a' to the 32-bit unsigned integer format. The conversion
5860 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5861 | Arithmetic except that the conversion is always rounded toward zero.
5862 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
5863 | if the conversion overflows, the largest unsigned integer is returned.
5864 | If 'a' is negative, the value is rounded and zero is returned; negative
5865 | values that do not round to zero will raise the inexact exception.
5866 *----------------------------------------------------------------------------*/
5867
5868 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5869 {
5870 uint64_t v;
5871 uint32_t res;
5872 int old_exc_flags = get_float_exception_flags(status);
5873
5874 v = float128_to_uint64_round_to_zero(a, status);
5875 if (v > 0xffffffff) {
5876 res = 0xffffffff;
5877 } else {
5878 return v;
5879 }
5880 set_float_exception_flags(old_exc_flags, status);
5881 float_raise(float_flag_invalid, status);
5882 return res;
5883 }
5884
5885 /*----------------------------------------------------------------------------
5886 | Returns the result of converting the quadruple-precision floating-point
5887 | value `a' to the single-precision floating-point format. The conversion
5888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5889 | Arithmetic.
5890 *----------------------------------------------------------------------------*/
5891
5892 float32 float128_to_float32(float128 a, float_status *status)
5893 {
5894 flag aSign;
5895 int32_t aExp;
5896 uint64_t aSig0, aSig1;
5897 uint32_t zSig;
5898
5899 aSig1 = extractFloat128Frac1( a );
5900 aSig0 = extractFloat128Frac0( a );
5901 aExp = extractFloat128Exp( a );
5902 aSign = extractFloat128Sign( a );
5903 if ( aExp == 0x7FFF ) {
5904 if ( aSig0 | aSig1 ) {
5905 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5906 }
5907 return packFloat32( aSign, 0xFF, 0 );
5908 }
5909 aSig0 |= ( aSig1 != 0 );
5910 shift64RightJamming( aSig0, 18, &aSig0 );
5911 zSig = aSig0;
5912 if ( aExp || zSig ) {
5913 zSig |= 0x40000000;
5914 aExp -= 0x3F81;
5915 }
5916 return roundAndPackFloat32(aSign, aExp, zSig, status);
5917
5918 }
5919
5920 /*----------------------------------------------------------------------------
5921 | Returns the result of converting the quadruple-precision floating-point
5922 | value `a' to the double-precision floating-point format. The conversion
5923 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5924 | Arithmetic.
5925 *----------------------------------------------------------------------------*/
5926
5927 float64 float128_to_float64(float128 a, float_status *status)
5928 {
5929 flag aSign;
5930 int32_t aExp;
5931 uint64_t aSig0, aSig1;
5932
5933 aSig1 = extractFloat128Frac1( a );
5934 aSig0 = extractFloat128Frac0( a );
5935 aExp = extractFloat128Exp( a );
5936 aSign = extractFloat128Sign( a );
5937 if ( aExp == 0x7FFF ) {
5938 if ( aSig0 | aSig1 ) {
5939 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5940 }
5941 return packFloat64( aSign, 0x7FF, 0 );
5942 }
5943 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5944 aSig0 |= ( aSig1 != 0 );
5945 if ( aExp || aSig0 ) {
5946 aSig0 |= LIT64( 0x4000000000000000 );
5947 aExp -= 0x3C01;
5948 }
5949 return roundAndPackFloat64(aSign, aExp, aSig0, status);
5950
5951 }
5952
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of converting the quadruple-precision floating-point
5955 | value `a' to the extended double-precision floating-point format. The
5956 | conversion is performed according to the IEC/IEEE Standard for Binary
5957 | Floating-Point Arithmetic.
5958 *----------------------------------------------------------------------------*/
5959
5960 floatx80 float128_to_floatx80(float128 a, float_status *status)
5961 {
5962 flag aSign;
5963 int32_t aExp;
5964 uint64_t aSig0, aSig1;
5965
5966 aSig1 = extractFloat128Frac1( a );
5967 aSig0 = extractFloat128Frac0( a );
5968 aExp = extractFloat128Exp( a );
5969 aSign = extractFloat128Sign( a );
5970 if ( aExp == 0x7FFF ) {
5971 if ( aSig0 | aSig1 ) {
5972 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5973 }
5974 return packFloatx80(aSign, floatx80_infinity_high,
5975 floatx80_infinity_low);
5976 }
5977 if ( aExp == 0 ) {
5978 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5979 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5980 }
5981 else {
5982 aSig0 |= LIT64( 0x0001000000000000 );
5983 }
5984 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5985 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5986
5987 }
5988
5989 /*----------------------------------------------------------------------------
5990 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5991 | returns the result as a quadruple-precision floating-point value. The
5992 | operation is performed according to the IEC/IEEE Standard for Binary
5993 | Floating-Point Arithmetic.
5994 *----------------------------------------------------------------------------*/
5995
5996 float128 float128_round_to_int(float128 a, float_status *status)
5997 {
5998 flag aSign;
5999 int32_t aExp;
6000 uint64_t lastBitMask, roundBitsMask;
6001 float128 z;
6002
6003 aExp = extractFloat128Exp( a );
6004 if ( 0x402F <= aExp ) {
6005 if ( 0x406F <= aExp ) {
6006 if ( ( aExp == 0x7FFF )
6007 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6008 ) {
6009 return propagateFloat128NaN(a, a, status);
6010 }
6011 return a;
6012 }
6013 lastBitMask = 1;
6014 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6015 roundBitsMask = lastBitMask - 1;
6016 z = a;
6017 switch (status->float_rounding_mode) {
6018 case float_round_nearest_even:
6019 if ( lastBitMask ) {
6020 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6021 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6022 }
6023 else {
6024 if ( (int64_t) z.low < 0 ) {
6025 ++z.high;
6026 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6027 }
6028 }
6029 break;
6030 case float_round_ties_away:
6031 if (lastBitMask) {
6032 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6033 } else {
6034 if ((int64_t) z.low < 0) {
6035 ++z.high;
6036 }
6037 }
6038 break;
6039 case float_round_to_zero:
6040 break;
6041 case float_round_up:
6042 if (!extractFloat128Sign(z)) {
6043 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6044 }
6045 break;
6046 case float_round_down:
6047 if (extractFloat128Sign(z)) {
6048 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6049 }
6050 break;
6051 default:
6052 abort();
6053 }
6054 z.low &= ~ roundBitsMask;
6055 }
6056 else {
6057 if ( aExp < 0x3FFF ) {
6058 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6059 status->float_exception_flags |= float_flag_inexact;
6060 aSign = extractFloat128Sign( a );
6061 switch (status->float_rounding_mode) {
6062 case float_round_nearest_even:
6063 if ( ( aExp == 0x3FFE )
6064 && ( extractFloat128Frac0( a )
6065 | extractFloat128Frac1( a ) )
6066 ) {
6067 return packFloat128( aSign, 0x3FFF, 0, 0 );
6068 }
6069 break;
6070 case float_round_ties_away:
6071 if (aExp == 0x3FFE) {
6072 return packFloat128(aSign, 0x3FFF, 0, 0);
6073 }
6074 break;
6075 case float_round_down:
6076 return
6077 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6078 : packFloat128( 0, 0, 0, 0 );
6079 case float_round_up:
6080 return
6081 aSign ? packFloat128( 1, 0, 0, 0 )
6082 : packFloat128( 0, 0x3FFF, 0, 0 );
6083 }
6084 return packFloat128( aSign, 0, 0, 0 );
6085 }
6086 lastBitMask = 1;
6087 lastBitMask <<= 0x402F - aExp;
6088 roundBitsMask = lastBitMask - 1;
6089 z.low = 0;
6090 z.high = a.high;
6091 switch (status->float_rounding_mode) {
6092 case float_round_nearest_even:
6093 z.high += lastBitMask>>1;
6094 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6095 z.high &= ~ lastBitMask;
6096 }
6097 break;
6098 case float_round_ties_away:
6099 z.high += lastBitMask>>1;
6100 break;
6101 case float_round_to_zero:
6102 break;
6103 case float_round_up:
6104 if (!extractFloat128Sign(z)) {
6105 z.high |= ( a.low != 0 );
6106 z.high += roundBitsMask;
6107 }
6108 break;
6109 case float_round_down:
6110 if (extractFloat128Sign(z)) {
6111 z.high |= (a.low != 0);
6112 z.high += roundBitsMask;
6113 }
6114 break;
6115 default:
6116 abort();
6117 }
6118 z.high &= ~ roundBitsMask;
6119 }
6120 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6121 status->float_exception_flags |= float_flag_inexact;
6122 }
6123 return z;
6124
6125 }
6126
6127 /*----------------------------------------------------------------------------
6128 | Returns the result of adding the absolute values of the quadruple-precision
6129 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6130 | before being returned. `zSign' is ignored if the result is a NaN.
6131 | The addition is performed according to the IEC/IEEE Standard for Binary
6132 | Floating-Point Arithmetic.
6133 *----------------------------------------------------------------------------*/
6134
6135 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6136 float_status *status)
6137 {
6138 int32_t aExp, bExp, zExp;
6139 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6140 int32_t expDiff;
6141
6142 aSig1 = extractFloat128Frac1( a );
6143 aSig0 = extractFloat128Frac0( a );
6144 aExp = extractFloat128Exp( a );
6145 bSig1 = extractFloat128Frac1( b );
6146 bSig0 = extractFloat128Frac0( b );
6147 bExp = extractFloat128Exp( b );
6148 expDiff = aExp - bExp;
6149 if ( 0 < expDiff ) {
6150 if ( aExp == 0x7FFF ) {
6151 if (aSig0 | aSig1) {
6152 return propagateFloat128NaN(a, b, status);
6153 }
6154 return a;
6155 }
6156 if ( bExp == 0 ) {
6157 --expDiff;
6158 }
6159 else {
6160 bSig0 |= LIT64( 0x0001000000000000 );
6161 }
6162 shift128ExtraRightJamming(
6163 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6164 zExp = aExp;
6165 }
6166 else if ( expDiff < 0 ) {
6167 if ( bExp == 0x7FFF ) {
6168 if (bSig0 | bSig1) {
6169 return propagateFloat128NaN(a, b, status);
6170 }
6171 return packFloat128( zSign, 0x7FFF, 0, 0 );
6172 }
6173 if ( aExp == 0 ) {
6174 ++expDiff;
6175 }
6176 else {
6177 aSig0 |= LIT64( 0x0001000000000000 );
6178 }
6179 shift128ExtraRightJamming(
6180 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6181 zExp = bExp;
6182 }
6183 else {
6184 if ( aExp == 0x7FFF ) {
6185 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6186 return propagateFloat128NaN(a, b, status);
6187 }
6188 return a;
6189 }
6190 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6191 if ( aExp == 0 ) {
6192 if (status->flush_to_zero) {
6193 if (zSig0 | zSig1) {
6194 float_raise(float_flag_output_denormal, status);
6195 }
6196 return packFloat128(zSign, 0, 0, 0);
6197 }
6198 return packFloat128( zSign, 0, zSig0, zSig1 );
6199 }
6200 zSig2 = 0;
6201 zSig0 |= LIT64( 0x0002000000000000 );
6202 zExp = aExp;
6203 goto shiftRight1;
6204 }
6205 aSig0 |= LIT64( 0x0001000000000000 );
6206 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6207 --zExp;
6208 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6209 ++zExp;
6210 shiftRight1:
6211 shift128ExtraRightJamming(
6212 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6213 roundAndPack:
6214 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6215
6216 }
6217
6218 /*----------------------------------------------------------------------------
6219 | Returns the result of subtracting the absolute values of the quadruple-
6220 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6221 | difference is negated before being returned. `zSign' is ignored if the
6222 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6223 | Standard for Binary Floating-Point Arithmetic.
6224 *----------------------------------------------------------------------------*/
6225
6226 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6227 float_status *status)
6228 {
6229 int32_t aExp, bExp, zExp;
6230 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6231 int32_t expDiff;
6232
6233 aSig1 = extractFloat128Frac1( a );
6234 aSig0 = extractFloat128Frac0( a );
6235 aExp = extractFloat128Exp( a );
6236 bSig1 = extractFloat128Frac1( b );
6237 bSig0 = extractFloat128Frac0( b );
6238 bExp = extractFloat128Exp( b );
6239 expDiff = aExp - bExp;
6240 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6241 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6242 if ( 0 < expDiff ) goto aExpBigger;
6243 if ( expDiff < 0 ) goto bExpBigger;
6244 if ( aExp == 0x7FFF ) {
6245 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6246 return propagateFloat128NaN(a, b, status);
6247 }
6248 float_raise(float_flag_invalid, status);
6249 return float128_default_nan(status);
6250 }
6251 if ( aExp == 0 ) {
6252 aExp = 1;
6253 bExp = 1;
6254 }
6255 if ( bSig0 < aSig0 ) goto aBigger;
6256 if ( aSig0 < bSig0 ) goto bBigger;
6257 if ( bSig1 < aSig1 ) goto aBigger;
6258 if ( aSig1 < bSig1 ) goto bBigger;
6259 return packFloat128(status->float_rounding_mode == float_round_down,
6260 0, 0, 0);
6261 bExpBigger:
6262 if ( bExp == 0x7FFF ) {
6263 if (bSig0 | bSig1) {
6264 return propagateFloat128NaN(a, b, status);
6265 }
6266 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6267 }
6268 if ( aExp == 0 ) {
6269 ++expDiff;
6270 }
6271 else {
6272 aSig0 |= LIT64( 0x4000000000000000 );
6273 }
6274 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6275 bSig0 |= LIT64( 0x4000000000000000 );
6276 bBigger:
6277 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6278 zExp = bExp;
6279 zSign ^= 1;
6280 goto normalizeRoundAndPack;
6281 aExpBigger:
6282 if ( aExp == 0x7FFF ) {
6283 if (aSig0 | aSig1) {
6284 return propagateFloat128NaN(a, b, status);
6285 }
6286 return a;
6287 }
6288 if ( bExp == 0 ) {
6289 --expDiff;
6290 }
6291 else {
6292 bSig0 |= LIT64( 0x4000000000000000 );
6293 }
6294 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6295 aSig0 |= LIT64( 0x4000000000000000 );
6296 aBigger:
6297 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6298 zExp = aExp;
6299 normalizeRoundAndPack:
6300 --zExp;
6301 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6302 status);
6303
6304 }
6305
6306 /*----------------------------------------------------------------------------
6307 | Returns the result of adding the quadruple-precision floating-point values
6308 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6309 | for Binary Floating-Point Arithmetic.
6310 *----------------------------------------------------------------------------*/
6311
6312 float128 float128_add(float128 a, float128 b, float_status *status)
6313 {
6314 flag aSign, bSign;
6315
6316 aSign = extractFloat128Sign( a );
6317 bSign = extractFloat128Sign( b );
6318 if ( aSign == bSign ) {
6319 return addFloat128Sigs(a, b, aSign, status);
6320 }
6321 else {
6322 return subFloat128Sigs(a, b, aSign, status);
6323 }
6324
6325 }
6326
6327 /*----------------------------------------------------------------------------
6328 | Returns the result of subtracting the quadruple-precision floating-point
6329 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6330 | Standard for Binary Floating-Point Arithmetic.
6331 *----------------------------------------------------------------------------*/
6332
6333 float128 float128_sub(float128 a, float128 b, float_status *status)
6334 {
6335 flag aSign, bSign;
6336
6337 aSign = extractFloat128Sign( a );
6338 bSign = extractFloat128Sign( b );
6339 if ( aSign == bSign ) {
6340 return subFloat128Sigs(a, b, aSign, status);
6341 }
6342 else {
6343 return addFloat128Sigs(a, b, aSign, status);
6344 }
6345
6346 }
6347
6348 /*----------------------------------------------------------------------------
6349 | Returns the result of multiplying the quadruple-precision floating-point
6350 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6351 | Standard for Binary Floating-Point Arithmetic.
6352 *----------------------------------------------------------------------------*/
6353
6354 float128 float128_mul(float128 a, float128 b, float_status *status)
6355 {
6356 flag aSign, bSign, zSign;
6357 int32_t aExp, bExp, zExp;
6358 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6359
6360 aSig1 = extractFloat128Frac1( a );
6361 aSig0 = extractFloat128Frac0( a );
6362 aExp = extractFloat128Exp( a );
6363 aSign = extractFloat128Sign( a );
6364 bSig1 = extractFloat128Frac1( b );
6365 bSig0 = extractFloat128Frac0( b );
6366 bExp = extractFloat128Exp( b );
6367 bSign = extractFloat128Sign( b );
6368 zSign = aSign ^ bSign;
6369 if ( aExp == 0x7FFF ) {
6370 if ( ( aSig0 | aSig1 )
6371 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6372 return propagateFloat128NaN(a, b, status);
6373 }
6374 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6375 return packFloat128( zSign, 0x7FFF, 0, 0 );
6376 }
6377 if ( bExp == 0x7FFF ) {
6378 if (bSig0 | bSig1) {
6379 return propagateFloat128NaN(a, b, status);
6380 }
6381 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6382 invalid:
6383 float_raise(float_flag_invalid, status);
6384 return float128_default_nan(status);
6385 }
6386 return packFloat128( zSign, 0x7FFF, 0, 0 );
6387 }
6388 if ( aExp == 0 ) {
6389 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6390 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6391 }
6392 if ( bExp == 0 ) {
6393 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6394 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6395 }
6396 zExp = aExp + bExp - 0x4000;
6397 aSig0 |= LIT64( 0x0001000000000000 );
6398 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6399 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6400 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6401 zSig2 |= ( zSig3 != 0 );
6402 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6403 shift128ExtraRightJamming(
6404 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6405 ++zExp;
6406 }
6407 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6408
6409 }
6410
6411 /*----------------------------------------------------------------------------
6412 | Returns the result of dividing the quadruple-precision floating-point value
6413 | `a' by the corresponding value `b'. The operation is performed according to
6414 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415 *----------------------------------------------------------------------------*/
6416
6417 float128 float128_div(float128 a, float128 b, float_status *status)
6418 {
6419 flag aSign, bSign, zSign;
6420 int32_t aExp, bExp, zExp;
6421 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6422 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6423
6424 aSig1 = extractFloat128Frac1( a );
6425 aSig0 = extractFloat128Frac0( a );
6426 aExp = extractFloat128Exp( a );
6427 aSign = extractFloat128Sign( a );
6428 bSig1 = extractFloat128Frac1( b );
6429 bSig0 = extractFloat128Frac0( b );
6430 bExp = extractFloat128Exp( b );
6431 bSign = extractFloat128Sign( b );
6432 zSign = aSign ^ bSign;
6433 if ( aExp == 0x7FFF ) {
6434 if (aSig0 | aSig1) {
6435 return propagateFloat128NaN(a, b, status);
6436 }
6437 if ( bExp == 0x7FFF ) {
6438 if (bSig0 | bSig1) {
6439 return propagateFloat128NaN(a, b, status);
6440 }
6441 goto invalid;
6442 }
6443 return packFloat128( zSign, 0x7FFF, 0, 0 );
6444 }
6445 if ( bExp == 0x7FFF ) {
6446 if (bSig0 | bSig1) {
6447 return propagateFloat128NaN(a, b, status);
6448 }
6449 return packFloat128( zSign, 0, 0, 0 );
6450 }
6451 if ( bExp == 0 ) {
6452 if ( ( bSig0 | bSig1 ) == 0 ) {
6453 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6454 invalid:
6455 float_raise(float_flag_invalid, status);
6456 return float128_default_nan(status);
6457 }
6458 float_raise(float_flag_divbyzero, status);
6459 return packFloat128( zSign, 0x7FFF, 0, 0 );
6460 }
6461 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6462 }
6463 if ( aExp == 0 ) {
6464 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6465 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6466 }
6467 zExp = aExp - bExp + 0x3FFD;
6468 shortShift128Left(
6469 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6470 shortShift128Left(
6471 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6472 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6473 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6474 ++zExp;
6475 }
6476 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6477 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6478 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6479 while ( (int64_t) rem0 < 0 ) {
6480 --zSig0;
6481 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6482 }
6483 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6484 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6485 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6486 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6487 while ( (int64_t) rem1 < 0 ) {
6488 --zSig1;
6489 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6490 }
6491 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6492 }
6493 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6494 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6495
6496 }
6497
6498 /*----------------------------------------------------------------------------
6499 | Returns the remainder of the quadruple-precision floating-point value `a'
6500 | with respect to the corresponding value `b'. The operation is performed
6501 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503
6504 float128 float128_rem(float128 a, float128 b, float_status *status)
6505 {
6506 flag aSign, zSign;
6507 int32_t aExp, bExp, expDiff;
6508 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6509 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6510 int64_t sigMean0;
6511
6512 aSig1 = extractFloat128Frac1( a );
6513 aSig0 = extractFloat128Frac0( a );
6514 aExp = extractFloat128Exp( a );
6515 aSign = extractFloat128Sign( a );
6516 bSig1 = extractFloat128Frac1( b );
6517 bSig0 = extractFloat128Frac0( b );
6518 bExp = extractFloat128Exp( b );
6519 if ( aExp == 0x7FFF ) {
6520 if ( ( aSig0 | aSig1 )
6521 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522 return propagateFloat128NaN(a, b, status);
6523 }
6524 goto invalid;
6525 }
6526 if ( bExp == 0x7FFF ) {
6527 if (bSig0 | bSig1) {
6528 return propagateFloat128NaN(a, b, status);
6529 }
6530 return a;
6531 }
6532 if ( bExp == 0 ) {
6533 if ( ( bSig0 | bSig1 ) == 0 ) {
6534 invalid:
6535 float_raise(float_flag_invalid, status);
6536 return float128_default_nan(status);
6537 }
6538 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6539 }
6540 if ( aExp == 0 ) {
6541 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6542 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6543 }
6544 expDiff = aExp - bExp;
6545 if ( expDiff < -1 ) return a;
6546 shortShift128Left(
6547 aSig0 | LIT64( 0x0001000000000000 ),
6548 aSig1,
6549 15 - ( expDiff < 0 ),
6550 &aSig0,
6551 &aSig1
6552 );
6553 shortShift128Left(
6554 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6555 q = le128( bSig0, bSig1, aSig0, aSig1 );
6556 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6557 expDiff -= 64;
6558 while ( 0 < expDiff ) {
6559 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6560 q = ( 4 < q ) ? q - 4 : 0;
6561 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6562 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6563 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6564 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6565 expDiff -= 61;
6566 }
6567 if ( -64 < expDiff ) {
6568 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6569 q = ( 4 < q ) ? q - 4 : 0;
6570 q >>= - expDiff;
6571 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6572 expDiff += 52;
6573 if ( expDiff < 0 ) {
6574 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6575 }
6576 else {
6577 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6578 }
6579 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6580 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6581 }
6582 else {
6583 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6584 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6585 }
6586 do {
6587 alternateASig0 = aSig0;
6588 alternateASig1 = aSig1;
6589 ++q;
6590 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6591 } while ( 0 <= (int64_t) aSig0 );
6592 add128(
6593 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6594 if ( ( sigMean0 < 0 )
6595 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6596 aSig0 = alternateASig0;
6597 aSig1 = alternateASig1;
6598 }
6599 zSign = ( (int64_t) aSig0 < 0 );
6600 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6601 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6602 status);
6603 }
6604
6605 /*----------------------------------------------------------------------------
6606 | Returns the square root of the quadruple-precision floating-point value `a'.
6607 | The operation is performed according to the IEC/IEEE Standard for Binary
6608 | Floating-Point Arithmetic.
6609 *----------------------------------------------------------------------------*/
6610
6611 float128 float128_sqrt(float128 a, float_status *status)
6612 {
6613 flag aSign;
6614 int32_t aExp, zExp;
6615 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6616 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6617
6618 aSig1 = extractFloat128Frac1( a );
6619 aSig0 = extractFloat128Frac0( a );
6620 aExp = extractFloat128Exp( a );
6621 aSign = extractFloat128Sign( a );
6622 if ( aExp == 0x7FFF ) {
6623 if (aSig0 | aSig1) {
6624 return propagateFloat128NaN(a, a, status);
6625 }
6626 if ( ! aSign ) return a;
6627 goto invalid;
6628 }
6629 if ( aSign ) {
6630 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6631 invalid:
6632 float_raise(float_flag_invalid, status);
6633 return float128_default_nan(status);
6634 }
6635 if ( aExp == 0 ) {
6636 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6637 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6638 }
6639 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6640 aSig0 |= LIT64( 0x0001000000000000 );
6641 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6642 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6643 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6644 doubleZSig0 = zSig0<<1;
6645 mul64To128( zSig0, zSig0, &term0, &term1 );
6646 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6647 while ( (int64_t) rem0 < 0 ) {
6648 --zSig0;
6649 doubleZSig0 -= 2;
6650 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6651 }
6652 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6653 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6654 if ( zSig1 == 0 ) zSig1 = 1;
6655 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6656 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6657 mul64To128( zSig1, zSig1, &term2, &term3 );
6658 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6659 while ( (int64_t) rem1 < 0 ) {
6660 --zSig1;
6661 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6662 term3 |= 1;
6663 term2 |= doubleZSig0;
6664 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6665 }
6666 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6667 }
6668 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6669 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6670
6671 }
6672
6673 /*----------------------------------------------------------------------------
6674 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6675 | the corresponding value `b', and 0 otherwise. The invalid exception is
6676 | raised if either operand is a NaN. Otherwise, the comparison is performed
6677 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6678 *----------------------------------------------------------------------------*/
6679
6680 int float128_eq(float128 a, float128 b, float_status *status)
6681 {
6682
6683 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6684 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6685 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6686 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6687 ) {
6688 float_raise(float_flag_invalid, status);
6689 return 0;
6690 }
6691 return
6692 ( a.low == b.low )
6693 && ( ( a.high == b.high )
6694 || ( ( a.low == 0 )
6695 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6696 );
6697
6698 }
6699
6700 /*----------------------------------------------------------------------------
6701 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6702 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6703 | exception is raised if either operand is a NaN. The comparison is performed
6704 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6705 *----------------------------------------------------------------------------*/
6706
6707 int float128_le(float128 a, float128 b, float_status *status)
6708 {
6709 flag aSign, bSign;
6710
6711 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6712 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6713 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6714 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6715 ) {
6716 float_raise(float_flag_invalid, status);
6717 return 0;
6718 }
6719 aSign = extractFloat128Sign( a );
6720 bSign = extractFloat128Sign( b );
6721 if ( aSign != bSign ) {
6722 return
6723 aSign
6724 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6725 == 0 );
6726 }
6727 return
6728 aSign ? le128( b.high, b.low, a.high, a.low )
6729 : le128( a.high, a.low, b.high, b.low );
6730
6731 }
6732
6733 /*----------------------------------------------------------------------------
6734 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6735 | the corresponding value `b', and 0 otherwise. The invalid exception is
6736 | raised if either operand is a NaN. The comparison is performed according
6737 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6738 *----------------------------------------------------------------------------*/
6739
6740 int float128_lt(float128 a, float128 b, float_status *status)
6741 {
6742 flag aSign, bSign;
6743
6744 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6745 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6746 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6747 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6748 ) {
6749 float_raise(float_flag_invalid, status);
6750 return 0;
6751 }
6752 aSign = extractFloat128Sign( a );
6753 bSign = extractFloat128Sign( b );
6754 if ( aSign != bSign ) {
6755 return
6756 aSign
6757 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6758 != 0 );
6759 }
6760 return
6761 aSign ? lt128( b.high, b.low, a.high, a.low )
6762 : lt128( a.high, a.low, b.high, b.low );
6763
6764 }
6765
6766 /*----------------------------------------------------------------------------
6767 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6768 | be compared, and 0 otherwise. The invalid exception is raised if either
6769 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6770 | Standard for Binary Floating-Point Arithmetic.
6771 *----------------------------------------------------------------------------*/
6772
6773 int float128_unordered(float128 a, float128 b, float_status *status)
6774 {
6775 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6776 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6777 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6778 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6779 ) {
6780 float_raise(float_flag_invalid, status);
6781 return 1;
6782 }
6783 return 0;
6784 }
6785
6786 /*----------------------------------------------------------------------------
6787 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6788 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6789 | exception. The comparison is performed according to the IEC/IEEE Standard
6790 | for Binary Floating-Point Arithmetic.
6791 *----------------------------------------------------------------------------*/
6792
6793 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6794 {
6795
6796 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6797 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6798 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6799 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6800 ) {
6801 if (float128_is_signaling_nan(a, status)
6802 || float128_is_signaling_nan(b, status)) {
6803 float_raise(float_flag_invalid, status);
6804 }
6805 return 0;
6806 }
6807 return
6808 ( a.low == b.low )
6809 && ( ( a.high == b.high )
6810 || ( ( a.low == 0 )
6811 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6812 );
6813
6814 }
6815
6816 /*----------------------------------------------------------------------------
6817 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6818 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6819 | cause an exception. Otherwise, the comparison is performed according to the
6820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6821 *----------------------------------------------------------------------------*/
6822
6823 int float128_le_quiet(float128 a, float128 b, float_status *status)
6824 {
6825 flag aSign, bSign;
6826
6827 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6828 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6829 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6830 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6831 ) {
6832 if (float128_is_signaling_nan(a, status)
6833 || float128_is_signaling_nan(b, status)) {
6834 float_raise(float_flag_invalid, status);
6835 }
6836 return 0;
6837 }
6838 aSign = extractFloat128Sign( a );
6839 bSign = extractFloat128Sign( b );
6840 if ( aSign != bSign ) {
6841 return
6842 aSign
6843 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6844 == 0 );
6845 }
6846 return
6847 aSign ? le128( b.high, b.low, a.high, a.low )
6848 : le128( a.high, a.low, b.high, b.low );
6849
6850 }
6851
6852 /*----------------------------------------------------------------------------
6853 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6854 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6855 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6856 | Standard for Binary Floating-Point Arithmetic.
6857 *----------------------------------------------------------------------------*/
6858
6859 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6860 {
6861 flag aSign, bSign;
6862
6863 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6864 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6865 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6866 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6867 ) {
6868 if (float128_is_signaling_nan(a, status)
6869 || float128_is_signaling_nan(b, status)) {
6870 float_raise(float_flag_invalid, status);
6871 }
6872 return 0;
6873 }
6874 aSign = extractFloat128Sign( a );
6875 bSign = extractFloat128Sign( b );
6876 if ( aSign != bSign ) {
6877 return
6878 aSign
6879 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6880 != 0 );
6881 }
6882 return
6883 aSign ? lt128( b.high, b.low, a.high, a.low )
6884 : lt128( a.high, a.low, b.high, b.low );
6885
6886 }
6887
6888 /*----------------------------------------------------------------------------
6889 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6890 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6891 | comparison is performed according to the IEC/IEEE Standard for Binary
6892 | Floating-Point Arithmetic.
6893 *----------------------------------------------------------------------------*/
6894
6895 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6896 {
6897 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6898 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6899 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6900 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6901 ) {
6902 if (float128_is_signaling_nan(a, status)
6903 || float128_is_signaling_nan(b, status)) {
6904 float_raise(float_flag_invalid, status);
6905 }
6906 return 1;
6907 }
6908 return 0;
6909 }
6910
6911 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6912 int is_quiet, float_status *status)
6913 {
6914 flag aSign, bSign;
6915
6916 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6917 float_raise(float_flag_invalid, status);
6918 return float_relation_unordered;
6919 }
6920 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6921 ( extractFloatx80Frac( a )<<1 ) ) ||
6922 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6923 ( extractFloatx80Frac( b )<<1 ) )) {
6924 if (!is_quiet ||
6925 floatx80_is_signaling_nan(a, status) ||
6926 floatx80_is_signaling_nan(b, status)) {
6927 float_raise(float_flag_invalid, status);
6928 }
6929 return float_relation_unordered;
6930 }
6931 aSign = extractFloatx80Sign( a );
6932 bSign = extractFloatx80Sign( b );
6933 if ( aSign != bSign ) {
6934
6935 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6936 ( ( a.low | b.low ) == 0 ) ) {
6937 /* zero case */
6938 return float_relation_equal;
6939 } else {
6940 return 1 - (2 * aSign);
6941 }
6942 } else {
6943 if (a.low == b.low && a.high == b.high) {
6944 return float_relation_equal;
6945 } else {
6946 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6947 }
6948 }
6949 }
6950
6951 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6952 {
6953 return floatx80_compare_internal(a, b, 0, status);
6954 }
6955
6956 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6957 {
6958 return floatx80_compare_internal(a, b, 1, status);
6959 }
6960
6961 static inline int float128_compare_internal(float128 a, float128 b,
6962 int is_quiet, float_status *status)
6963 {
6964 flag aSign, bSign;
6965
6966 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6967 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6968 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6969 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6970 if (!is_quiet ||
6971 float128_is_signaling_nan(a, status) ||
6972 float128_is_signaling_nan(b, status)) {
6973 float_raise(float_flag_invalid, status);
6974 }
6975 return float_relation_unordered;
6976 }
6977 aSign = extractFloat128Sign( a );
6978 bSign = extractFloat128Sign( b );
6979 if ( aSign != bSign ) {
6980 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6981 /* zero case */
6982 return float_relation_equal;
6983 } else {
6984 return 1 - (2 * aSign);
6985 }
6986 } else {
6987 if (a.low == b.low && a.high == b.high) {
6988 return float_relation_equal;
6989 } else {
6990 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6991 }
6992 }
6993 }
6994
6995 int float128_compare(float128 a, float128 b, float_status *status)
6996 {
6997 return float128_compare_internal(a, b, 0, status);
6998 }
6999
7000 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7001 {
7002 return float128_compare_internal(a, b, 1, status);
7003 }
7004
7005 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7006 {
7007 flag aSign;
7008 int32_t aExp;
7009 uint64_t aSig;
7010
7011 if (floatx80_invalid_encoding(a)) {
7012 float_raise(float_flag_invalid, status);
7013 return floatx80_default_nan(status);
7014 }
7015 aSig = extractFloatx80Frac( a );
7016 aExp = extractFloatx80Exp( a );
7017 aSign = extractFloatx80Sign( a );
7018
7019 if ( aExp == 0x7FFF ) {
7020 if ( aSig<<1 ) {
7021 return propagateFloatx80NaN(a, a, status);
7022 }
7023 return a;
7024 }
7025
7026 if (aExp == 0) {
7027 if (aSig == 0) {
7028 return a;
7029 }
7030 aExp++;
7031 }
7032
7033 if (n > 0x10000) {
7034 n = 0x10000;
7035 } else if (n < -0x10000) {
7036 n = -0x10000;
7037 }
7038
7039 aExp += n;
7040 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7041 aSign, aExp, aSig, 0, status);
7042 }
7043
7044 float128 float128_scalbn(float128 a, int n, float_status *status)
7045 {
7046 flag aSign;
7047 int32_t aExp;
7048 uint64_t aSig0, aSig1;
7049
7050 aSig1 = extractFloat128Frac1( a );
7051 aSig0 = extractFloat128Frac0( a );
7052 aExp = extractFloat128Exp( a );
7053 aSign = extractFloat128Sign( a );
7054 if ( aExp == 0x7FFF ) {
7055 if ( aSig0 | aSig1 ) {
7056 return propagateFloat128NaN(a, a, status);
7057 }
7058 return a;
7059 }
7060 if (aExp != 0) {
7061 aSig0 |= LIT64( 0x0001000000000000 );
7062 } else if (aSig0 == 0 && aSig1 == 0) {
7063 return a;
7064 } else {
7065 aExp++;
7066 }
7067
7068 if (n > 0x10000) {
7069 n = 0x10000;
7070 } else if (n < -0x10000) {
7071 n = -0x10000;
7072 }
7073
7074 aExp += n - 1;
7075 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7076 , status);
7077
7078 }