]> git.ipfire.org Git - thirdparty/qemu.git/blob - fpu/softfloat.c
Merge remote-tracking branch 'remotes/stsquad/tags/pull-fpu-next-230119-2' into staging
[thirdparty/qemu.git] / fpu / softfloat.c
1 /*
2 * QEMU float support
3 *
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
16 */
17
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43
44 ===============================================================================
45 */
46
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89
90 /* We only need stdlib for abort() */
91
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98
99 /*
100 * Hardfloat
101 *
102 * Fast emulation of guest FP instructions is challenging for two reasons.
103 * First, FP instruction semantics are similar but not identical, particularly
104 * when handling NaNs. Second, emulating at reasonable speed the guest FP
105 * exception flags is not trivial: reading the host's flags register with a
106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107 * and trapping on every FP exception is not fast nor pleasant to work with.
108 *
109 * We address these challenges by leveraging the host FPU for a subset of the
110 * operations. To do this we expand on the idea presented in this paper:
111 *
112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114 *
115 * The idea is thus to leverage the host FPU to (1) compute FP operations
116 * and (2) identify whether FP exceptions occurred while avoiding
117 * expensive exception flag register accesses.
118 *
119 * An important optimization shown in the paper is that given that exception
120 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121 * This is particularly useful for the inexact flag, which is very frequently
122 * raised in floating-point workloads.
123 *
124 * We optimize the code further by deferring to soft-fp whenever FP exception
125 * detection might get hairy. Two examples: (1) when at least one operand is
126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127 * and the result is < the minimum normal.
128 */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \
130 static inline void name(soft_t *a, float_status *s) \
131 { \
132 if (unlikely(soft_t ## _is_denormal(*a))) { \
133 *a = soft_t ## _set_sign(soft_t ## _zero, \
134 soft_t ## _is_neg(*a)); \
135 s->float_exception_flags |= float_flag_input_denormal; \
136 } \
137 }
138
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142
143 #define GEN_INPUT_FLUSH1(name, soft_t) \
144 static inline void name(soft_t *a, float_status *s) \
145 { \
146 if (likely(!s->flush_inputs_to_zero)) { \
147 return; \
148 } \
149 soft_t ## _input_flush__nocheck(a, s); \
150 }
151
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155
156 #define GEN_INPUT_FLUSH2(name, soft_t) \
157 static inline void name(soft_t *a, soft_t *b, float_status *s) \
158 { \
159 if (likely(!s->flush_inputs_to_zero)) { \
160 return; \
161 } \
162 soft_t ## _input_flush__nocheck(a, s); \
163 soft_t ## _input_flush__nocheck(b, s); \
164 }
165
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169
170 #define GEN_INPUT_FLUSH3(name, soft_t) \
171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172 { \
173 if (likely(!s->flush_inputs_to_zero)) { \
174 return; \
175 } \
176 soft_t ## _input_flush__nocheck(a, s); \
177 soft_t ## _input_flush__nocheck(b, s); \
178 soft_t ## _input_flush__nocheck(c, s); \
179 }
180
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184
/*
 * Select, per combination of operand count (1/2/3) and operand width
 * (F32/F64), whether the generated hardfloat functions classify their
 * inputs with fpclassify() (value 1) or with the float32/64_* softfloat
 * primitives (value 0).
 *
 * The single-precision variants use the softfloat primitives on every
 * host; only the double-precision variants differ, and only on x86_64.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) or
 * float{32,64}_is_infinity (0) for infinity checks when !USE_FP.
 *
 * isinf() is enabled only on x86_64 and aarch64 hosts, where it can yield
 * a ~6% speedup. On power64, by contrast, isinf() reduces fp-bench
 * performance by up to 50%, so every other host keeps the softfloat
 * primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 0
#else
# define QEMU_HARDFLOAT_USE_ISINF 1
#endif
217
/*
 * Hardfloat relies on the inexact flag being already set. Some targets
 * clear the FP flags before most FP operations, which prevents its use,
 * so it is compiled out for them. It is also compiled out (with a
 * warning) under -ffast-math, because it requires an exact IEEE
 * implementation on the host.
 *
 * QEMU_SOFTFLOAT_ATTR is the attribute set for the softfloat entry points;
 * it additionally carries noinline only when hardfloat is enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 IEEE implementation
#endif

#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234
235 static inline bool can_use_fpu(const float_status *s)
236 {
237 if (QEMU_NO_HARDFLOAT) {
238 return false;
239 }
240 return likely(s->float_exception_flags & float_flag_inexact &&
241 s->float_rounding_mode == float_round_nearest_even);
242 }
243
244 /*
245 * Hardfloat generation functions. Each operation can have two flavors:
246 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247 * most condition checks, or native ones (e.g. fpclassify).
248 *
249 * The flavor is chosen by the callers. Instead of using macros, we rely on the
250 * compiler to propagate constants and inline everything into the callers.
251 *
252 * We only generate functions for operations with two inputs, since only
253 * these are common enough to justify consolidating them into common code.
254 */
255
/*
 * View of a single-precision value both as the softfloat type (s) and as
 * the host's native float (h), so one object can be handed to either the
 * softfloat or the host implementation.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

/*
 * View of a double-precision value both as the softfloat type (s) and as
 * the host's native double (h).
 */
typedef union {
    float64 s;
    double h;
} union_float64;
265
/* Predicates over a pair of operands, used as pre/post/fast-path checks. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand softfloat implementations; these receive the float_status. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Two-operand host (native float/double) implementations. */
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
273
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277 if (QEMU_HARDFLOAT_2F32_USE_FP) {
278 /*
279 * Not using a temp variable for consecutive fpclassify calls ends up
280 * generating faster code.
281 */
282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284 }
285 return float32_is_zero_or_normal(a.s) &&
286 float32_is_zero_or_normal(b.s);
287 }
288
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291 if (QEMU_HARDFLOAT_2F64_USE_FP) {
292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294 }
295 return float64_is_zero_or_normal(a.s) &&
296 float64_is_zero_or_normal(b.s);
297 }
298
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303 if (QEMU_HARDFLOAT_3F32_USE_FP) {
304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307 }
308 return float32_is_zero_or_normal(a.s) &&
309 float32_is_zero_or_normal(b.s) &&
310 float32_is_zero_or_normal(c.s);
311 }
312
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316 if (QEMU_HARDFLOAT_3F64_USE_FP) {
317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320 }
321 return float64_is_zero_or_normal(a.s) &&
322 float64_is_zero_or_normal(b.s) &&
323 float64_is_zero_or_normal(c.s);
324 }
325
326 static inline bool f32_is_inf(union_float32 a)
327 {
328 if (QEMU_HARDFLOAT_USE_ISINF) {
329 return isinf(a.h);
330 }
331 return float32_is_infinity(a.s);
332 }
333
334 static inline bool f64_is_inf(union_float64 a)
335 {
336 if (QEMU_HARDFLOAT_USE_ISINF) {
337 return isinf(a.h);
338 }
339 return float64_is_infinity(a.s);
340 }
341
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345 hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346 f32_check_fn pre, f32_check_fn post,
347 f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349 union_float32 ua, ub, ur;
350
351 ua.s = xa;
352 ub.s = xb;
353
354 if (unlikely(!can_use_fpu(s))) {
355 goto soft;
356 }
357
358 float32_input_flush2(&ua.s, &ub.s, s);
359 if (unlikely(!pre(ua, ub))) {
360 goto soft;
361 }
362 if (fast_test && fast_test(ua, ub)) {
363 return fast_op(ua.s, ub.s, s);
364 }
365
366 ur.h = hard(ua.h, ub.h);
367 if (unlikely(f32_is_inf(ur))) {
368 s->float_exception_flags |= float_flag_overflow;
369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370 if (post == NULL || post(ua, ub)) {
371 goto soft;
372 }
373 }
374 return ur.s;
375
376 soft:
377 return soft(ua.s, ub.s, s);
378 }
379
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382 hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383 f64_check_fn pre, f64_check_fn post,
384 f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386 union_float64 ua, ub, ur;
387
388 ua.s = xa;
389 ub.s = xb;
390
391 if (unlikely(!can_use_fpu(s))) {
392 goto soft;
393 }
394
395 float64_input_flush2(&ua.s, &ub.s, s);
396 if (unlikely(!pre(ua, ub))) {
397 goto soft;
398 }
399 if (fast_test && fast_test(ua, ub)) {
400 return fast_op(ua.s, ub.s, s);
401 }
402
403 ur.h = hard(ua.h, ub.h);
404 if (unlikely(f64_is_inf(ur))) {
405 s->float_exception_flags |= float_flag_overflow;
406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407 if (post == NULL || post(ua, ub)) {
408 goto soft;
409 }
410 }
411 return ur.s;
412
413 soft:
414 return soft(ua.s, ub.s, s);
415 }
416
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423 return float16_val(a) & 0x3ff;
424 }
425
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429
430 static inline int extractFloat16Exp(float16 a)
431 {
432 return (float16_val(a) >> 10) & 0x1f;
433 }
434
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441 return float32_val(a) & 0x007FFFFF;
442 }
443
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447
448 static inline int extractFloat32Exp(float32 a)
449 {
450 return (float32_val(a) >> 23) & 0xFF;
451 }
452
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456
457 static inline flag extractFloat32Sign(float32 a)
458 {
459 return float32_val(a) >> 31;
460 }
461
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474
475 static inline int extractFloat64Exp(float64 a)
476 {
477 return (float64_val(a) >> 52) & 0x7FF;
478 }
479
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483
484 static inline flag extractFloat64Sign(float64 a)
485 {
486 return float64_val(a) >> 63;
487 }
488
/*
 * Classify a floating point number. float_class_qnan and everything
 * after it is a NaN, so cls >= float_class_qnan tests for any NaN.
 * The ordering of the enumerators is therefore significant.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* raw, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
502
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506 return unlikely(c >= float_class_qnan);
507 }
508
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511 return c == float_class_snan;
512 }
513
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516 return c == float_class_qnan;
517 }
518
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;      /* fraction bits */
    int32_t  exp;       /* exponent */
    FloatClass cls;     /* classification (zero, normal, inf, NaN, ...) */
    bool sign;          /* sign bit */
} FloatParts;
536
/*
 * Layout of the 64-bit decomposed fraction: the binary point sits at
 * bit 62, so the implicit (integer) bit is bit 62 and bit 63 is the
 * overflow bit, one position above the implicit bit.
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)
540
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;        /* all bits below frac_lsb */
    uint64_t roundeven_mask;    /* round_mask plus frac_lsb itself */
    bool arm_althp;
} FloatFmt;
566
/*
 * Expand the FloatFmt fields that derive from the size of the exponent
 * field (E) and of the fraction field (F), both in bits. Intended for
 * use inside a FloatFmt designated initializer.
 *
 * Every use of E and F is parenthesized so that an expression argument
 * (e.g. FLOAT_PARAMS(8, 20 + 3)) expands with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Half precision with the ARM Alternative Half Precision modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595
596 /* Unpack a float to parts, but do not canonicalize. */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599 const int sign_pos = fmt.frac_size + fmt.exp_size;
600
601 return (FloatParts) {
602 .cls = float_class_unclassified,
603 .sign = extract64(raw, sign_pos, 1),
604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605 .frac = extract64(raw, 0, fmt.frac_size),
606 };
607 }
608
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611 return unpack_raw(float16_params, f);
612 }
613
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616 return unpack_raw(float32_params, f);
617 }
618
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621 return unpack_raw(float64_params, f);
622 }
623
624 /* Pack a float from parts, but do not canonicalize. */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627 const int sign_pos = fmt.frac_size + fmt.exp_size;
628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629 return deposit64(ret, sign_pos, 1, p.sign);
630 }
631
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634 return make_float16(pack_raw(float16_params, p));
635 }
636
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639 return make_float32(pack_raw(float32_params, p));
640 }
641
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644 return make_float64(pack_raw(float64_params, p));
645 }
646
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine: (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output. These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656
657 /* Canonicalize EXP and FRAC, setting CLS. */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659 float_status *status)
660 {
661 if (part.exp == parm->exp_max && !parm->arm_althp) {
662 if (part.frac == 0) {
663 part.cls = float_class_inf;
664 } else {
665 part.frac <<= parm->frac_shift;
666 part.cls = (parts_is_snan_frac(part.frac, status)
667 ? float_class_snan : float_class_qnan);
668 }
669 } else if (part.exp == 0) {
670 if (likely(part.frac == 0)) {
671 part.cls = float_class_zero;
672 } else if (status->flush_inputs_to_zero) {
673 float_raise(float_flag_input_denormal, status);
674 part.cls = float_class_zero;
675 part.frac = 0;
676 } else {
677 int shift = clz64(part.frac) - 1;
678 part.cls = float_class_normal;
679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680 part.frac <<= shift;
681 }
682 } else {
683 part.cls = float_class_normal;
684 part.exp -= parm->exp_bias;
685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686 }
687 return part;
688 }
689
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * p:    canonical parts to round
 * s:    status supplying the rounding mode, flush_to_zero and tininess
 *       detection settings; accumulated exception flags are raised on it
 * parm: target format
 *
 * Returns parts whose exp/frac hold the encoded field values.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick:
         *   inc           - the increment added to frac before the low
         *                   bits are shifted out;
         *   overflow_norm - whether an overflow yields the largest
         *                   finite value (true) or infinity (false).
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* An exact tie with an even lsb must not be incremented. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is not denormal. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: max exponent - 1, all-ones frac. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Would be denormal, but outputs are flushed to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. Tininess is decided before the fraction
             * is shifted, using the pre-shift frac and inc.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried into the implicit bit position. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The alternative half-precision format has no infinity. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* The alternative half-precision format has no NaNs. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833 const FloatFmt *params)
834 {
835 return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840 return float16a_unpack_canonical(f, s, &float16_params);
841 }
842
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844 const FloatFmt *params)
845 {
846 return float16_pack_raw(round_canonical(p, s, params));
847 }
848
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851 return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861 return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871 return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876 switch (a.cls) {
877 case float_class_snan:
878 s->float_exception_flags |= float_flag_invalid;
879 a = parts_silence_nan(a, s);
880 /* fall through */
881 case float_class_qnan:
882 if (s->default_nan_mode) {
883 return parts_default_nan(s);
884 }
885 break;
886
887 default:
888 g_assert_not_reached();
889 }
890 return a;
891 }
892
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895 if (is_snan(a.cls) || is_snan(b.cls)) {
896 s->float_exception_flags |= float_flag_invalid;
897 }
898
899 if (s->default_nan_mode) {
900 return parts_default_nan(s);
901 } else {
902 if (pickNaN(a.cls, b.cls,
903 a.frac > b.frac ||
904 (a.frac == b.frac && a.sign < b.sign))) {
905 a = b;
906 }
907 if (is_snan(a.cls)) {
908 return parts_silence_nan(a, s);
909 }
910 }
911 return a;
912 }
913
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915 bool inf_zero, float_status *s)
916 {
917 int which;
918
919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920 s->float_exception_flags |= float_flag_invalid;
921 }
922
923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924
925 if (s->default_nan_mode) {
926 /* Note that this check is after pickNaNMulAdd so that function
927 * has an opportunity to set the Invalid flag.
928 */
929 which = 3;
930 }
931
932 switch (which) {
933 case 0:
934 break;
935 case 1:
936 a = b;
937 break;
938 case 2:
939 a = c;
940 break;
941 case 3:
942 return parts_default_nan(s);
943 default:
944 g_assert_not_reached();
945 }
946
947 if (is_snan(a.cls)) {
948 return parts_silence_nan(a, s);
949 }
950 return a;
951 }
952
/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * a, b:     canonical (decomposed) operands
 * subtract: when true, the sign of b is inverted before the operation
 * s:        status; supplies the rounding mode and receives the invalid
 *           flag for inf - inf
 *
 * The result is returned in canonical form and is not rounded here.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the leading set bit is at bit 62. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* inf - inf is invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective (opposite of a) sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            /* Carry into the overflow bit: shift back and bump exponent. */
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every combination of operand classes is handled above. */
    g_assert_not_reached();
}
1041
1042 /*
1043 * Returns the result of adding or subtracting the floating-point
1044 * values `a' and `b'. The operation is performed according to the
1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046 */
1047
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050 FloatParts pa = float16_unpack_canonical(a, status);
1051 FloatParts pb = float16_unpack_canonical(b, status);
1052 FloatParts pr = addsub_floats(pa, pb, false, status);
1053
1054 return float16_round_pack_canonical(pr, status);
1055 }
1056
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059 FloatParts pa = float16_unpack_canonical(a, status);
1060 FloatParts pb = float16_unpack_canonical(b, status);
1061 FloatParts pr = addsub_floats(pa, pb, true, status);
1062
1063 return float16_round_pack_canonical(pr, status);
1064 }
1065
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069 FloatParts pa = float32_unpack_canonical(a, status);
1070 FloatParts pb = float32_unpack_canonical(b, status);
1071 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072
1073 return float32_round_pack_canonical(pr, status);
1074 }
1075
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078 return soft_f32_addsub(a, b, false, status);
1079 }
1080
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083 return soft_f32_addsub(a, b, true, status);
1084 }
1085
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089 FloatParts pa = float64_unpack_canonical(a, status);
1090 FloatParts pb = float64_unpack_canonical(b, status);
1091 FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092
1093 return float64_round_pack_canonical(pr, status);
1094 }
1095
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098 return soft_f64_addsub(a, b, false, status);
1099 }
1100
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103 return soft_f64_addsub(a, b, true, status);
1104 }
1105
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1110
/* Host-FPU single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1115
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1120
/* Host-FPU double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1125
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130 }
1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138 } else {
1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140 }
1141 }
1142
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146 return float32_gen2(a, b, s, hard, soft,
1147 f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153 return float64_gen2(a, b, s, hard, soft,
1154 f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180
1181 /*
1182 * Returns the result of multiplying the floating-point values `a' and
1183 * `b'. The operation is performed according to the IEC/IEEE Standard
1184 * for Binary Floating-Point Arithmetic.
1185 */
1186
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189 bool sign = a.sign ^ b.sign;
1190
1191 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192 uint64_t hi, lo;
1193 int exp = a.exp + b.exp;
1194
1195 mul64To128(a.frac, b.frac, &hi, &lo);
1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198 shift64RightJamming(lo, 1, &lo);
1199 exp += 1;
1200 }
1201
1202 /* Re-use a */
1203 a.exp = exp;
1204 a.sign = sign;
1205 a.frac = lo;
1206 return a;
1207 }
1208 /* handle all the NaN cases */
1209 if (is_nan(a.cls) || is_nan(b.cls)) {
1210 return pick_nan(a, b, s);
1211 }
1212 /* Inf * Zero == NaN */
1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214 (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215 s->float_exception_flags |= float_flag_invalid;
1216 return parts_default_nan(s);
1217 }
1218 /* Multiply by 0 or Inf */
1219 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220 a.sign = sign;
1221 return a;
1222 }
1223 if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224 b.sign = sign;
1225 return b;
1226 }
1227 g_assert_not_reached();
1228 }
1229
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232 FloatParts pa = float16_unpack_canonical(a, status);
1233 FloatParts pb = float16_unpack_canonical(b, status);
1234 FloatParts pr = mul_floats(pa, pb, status);
1235
1236 return float16_round_pack_canonical(pr, status);
1237 }
1238
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242 FloatParts pa = float32_unpack_canonical(a, status);
1243 FloatParts pb = float32_unpack_canonical(b, status);
1244 FloatParts pr = mul_floats(pa, pb, status);
1245
1246 return float32_round_pack_canonical(pr, status);
1247 }
1248
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252 FloatParts pa = float64_unpack_canonical(a, status);
1253 FloatParts pb = float64_unpack_canonical(b, status);
1254 FloatParts pr = mul_floats(pa, pb, status);
1255
1256 return float64_round_pack_canonical(pr, status);
1257 }
1258
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float prod = a * b;

    return prod;
}
1263
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double prod = a * b;

    return prod;
}
1268
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271 return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276 return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282
1283 return float32_set_sign(float32_zero, signbit);
1284 }
1285
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289
1290 return float64_set_sign(float64_zero, signbit);
1291 }
1292
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306
1307 /*
1308 * Returns the result of multiplying the floating-point values `a' and
1309 * `b' then adding 'c', with no intermediate rounding step after the
1310 * multiplication. The operation is performed according to the
1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312 * The flags argument allows the caller to select negation of the
1313 * addend, the intermediate product, or the final result. (The
1314 * difference between this and having the caller do a separate
1315 * negation is that negating externally will flip the sign bit on
1316 * NaNs.)
1317 */
1318
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320 int flags, float_status *s)
1321 {
1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323 ((1 << float_class_inf) | (1 << float_class_zero));
1324 bool p_sign;
1325 bool sign_flip = flags & float_muladd_negate_result;
1326 FloatClass p_class;
1327 uint64_t hi, lo;
1328 int p_exp;
1329
1330 /* It is implementation-defined whether the cases of (0,inf,qnan)
1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332 * they return if they do), so we have to hand this information
1333 * off to the target-specific pick-a-NaN routine.
1334 */
1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336 return pick_nan_muladd(a, b, c, inf_zero, s);
1337 }
1338
1339 if (inf_zero) {
1340 s->float_exception_flags |= float_flag_invalid;
1341 return parts_default_nan(s);
1342 }
1343
1344 if (flags & float_muladd_negate_c) {
1345 c.sign ^= 1;
1346 }
1347
1348 p_sign = a.sign ^ b.sign;
1349
1350 if (flags & float_muladd_negate_product) {
1351 p_sign ^= 1;
1352 }
1353
1354 if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355 p_class = float_class_inf;
1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357 p_class = float_class_zero;
1358 } else {
1359 p_class = float_class_normal;
1360 }
1361
1362 if (c.cls == float_class_inf) {
1363 if (p_class == float_class_inf && p_sign != c.sign) {
1364 s->float_exception_flags |= float_flag_invalid;
1365 return parts_default_nan(s);
1366 } else {
1367 a.cls = float_class_inf;
1368 a.sign = c.sign ^ sign_flip;
1369 return a;
1370 }
1371 }
1372
1373 if (p_class == float_class_inf) {
1374 a.cls = float_class_inf;
1375 a.sign = p_sign ^ sign_flip;
1376 return a;
1377 }
1378
1379 if (p_class == float_class_zero) {
1380 if (c.cls == float_class_zero) {
1381 if (p_sign != c.sign) {
1382 p_sign = s->float_rounding_mode == float_round_down;
1383 }
1384 c.sign = p_sign;
1385 } else if (flags & float_muladd_halve_result) {
1386 c.exp -= 1;
1387 }
1388 c.sign ^= sign_flip;
1389 return c;
1390 }
1391
1392 /* a & b should be normals now... */
1393 assert(a.cls == float_class_normal &&
1394 b.cls == float_class_normal);
1395
1396 p_exp = a.exp + b.exp;
1397
1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399 * result.
1400 */
1401 mul64To128(a.frac, b.frac, &hi, &lo);
1402 /* binary point now at bit 124 */
1403
1404 /* check for overflow */
1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406 shift128RightJamming(hi, lo, 1, &hi, &lo);
1407 p_exp += 1;
1408 }
1409
1410 /* + add/sub */
1411 if (c.cls == float_class_zero) {
1412 /* move binary point back to 62 */
1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414 } else {
1415 int exp_diff = p_exp - c.exp;
1416 if (p_sign == c.sign) {
1417 /* Addition */
1418 if (exp_diff <= 0) {
1419 shift128RightJamming(hi, lo,
1420 DECOMPOSED_BINARY_POINT - exp_diff,
1421 &hi, &lo);
1422 lo += c.frac;
1423 p_exp = c.exp;
1424 } else {
1425 uint64_t c_hi, c_lo;
1426 /* shift c to the same binary point as the product (124) */
1427 c_hi = c.frac >> 2;
1428 c_lo = 0;
1429 shift128RightJamming(c_hi, c_lo,
1430 exp_diff,
1431 &c_hi, &c_lo);
1432 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433 /* move binary point back to 62 */
1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1435 }
1436
1437 if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438 shift64RightJamming(lo, 1, &lo);
1439 p_exp += 1;
1440 }
1441
1442 } else {
1443 /* Subtraction */
1444 uint64_t c_hi, c_lo;
1445 /* make C binary point match product at bit 124 */
1446 c_hi = c.frac >> 2;
1447 c_lo = 0;
1448
1449 if (exp_diff <= 0) {
1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451 if (exp_diff == 0
1452 &&
1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455 } else {
1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457 p_sign ^= 1;
1458 p_exp = c.exp;
1459 }
1460 } else {
1461 shift128RightJamming(c_hi, c_lo,
1462 exp_diff,
1463 &c_hi, &c_lo);
1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1465 }
1466
1467 if (hi == 0 && lo == 0) {
1468 a.cls = float_class_zero;
1469 a.sign = s->float_rounding_mode == float_round_down;
1470 a.sign ^= sign_flip;
1471 return a;
1472 } else {
1473 int shift;
1474 if (hi != 0) {
1475 shift = clz64(hi);
1476 } else {
1477 shift = clz64(lo) + 64;
1478 }
1479 /* Normalizing to a binary point of 124 is the
1480 correct adjust for the exponent. However since we're
1481 shifting, we might as well put the binary point back
1482 at 62 where we really want it. Therefore shift as
1483 if we're leaving 1 bit at the top of the word, but
1484 adjust the exponent as if we're leaving 3 bits. */
1485 shift -= 1;
1486 if (shift >= 64) {
1487 lo = lo << (shift - 64);
1488 } else {
1489 hi = (hi << shift) | (lo >> (64 - shift));
1490 lo = hi | ((lo << shift) != 0);
1491 }
1492 p_exp -= shift - 2;
1493 }
1494 }
1495 }
1496
1497 if (flags & float_muladd_halve_result) {
1498 p_exp -= 1;
1499 }
1500
1501 /* finally prepare our result */
1502 a.cls = float_class_normal;
1503 a.sign = p_sign ^ sign_flip;
1504 a.exp = p_exp;
1505 a.frac = lo;
1506
1507 return a;
1508 }
1509
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511 int flags, float_status *status)
1512 {
1513 FloatParts pa = float16_unpack_canonical(a, status);
1514 FloatParts pb = float16_unpack_canonical(b, status);
1515 FloatParts pc = float16_unpack_canonical(c, status);
1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517
1518 return float16_round_pack_canonical(pr, status);
1519 }
1520
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523 float_status *status)
1524 {
1525 FloatParts pa = float32_unpack_canonical(a, status);
1526 FloatParts pb = float32_unpack_canonical(b, status);
1527 FloatParts pc = float32_unpack_canonical(c, status);
1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1529
1530 return float32_round_pack_canonical(pr, status);
1531 }
1532
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535 float_status *status)
1536 {
1537 FloatParts pa = float64_unpack_canonical(a, status);
1538 FloatParts pb = float64_unpack_canonical(b, status);
1539 FloatParts pc = float64_unpack_canonical(c, status);
1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1541
1542 return float64_round_pack_canonical(pr, status);
1543 }
1544
/*
 * When set, float32_muladd() and float64_muladd() bypass the host
 * fmaf()/fma() call and use the software implementation instead.
 * NOTE(review): it is not assigned anywhere in this part of the file;
 * presumably set during initialisation -- confirm where.
 */
static bool force_soft_fma;
1546
1547 float32 QEMU_FLATTEN
1548 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1549 {
1550 union_float32 ua, ub, uc, ur;
1551
1552 ua.s = xa;
1553 ub.s = xb;
1554 uc.s = xc;
1555
1556 if (unlikely(!can_use_fpu(s))) {
1557 goto soft;
1558 }
1559 if (unlikely(flags & float_muladd_halve_result)) {
1560 goto soft;
1561 }
1562
1563 float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1564 if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1565 goto soft;
1566 }
1567
1568 if (unlikely(force_soft_fma)) {
1569 goto soft;
1570 }
1571
1572 /*
1573 * When (a || b) == 0, there's no need to check for under/over flow,
1574 * since we know the addend is (normal || 0) and the product is 0.
1575 */
1576 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1577 union_float32 up;
1578 bool prod_sign;
1579
1580 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1581 prod_sign ^= !!(flags & float_muladd_negate_product);
1582 up.s = float32_set_sign(float32_zero, prod_sign);
1583
1584 if (flags & float_muladd_negate_c) {
1585 uc.h = -uc.h;
1586 }
1587 ur.h = up.h + uc.h;
1588 } else {
1589 if (flags & float_muladd_negate_product) {
1590 ua.h = -ua.h;
1591 }
1592 if (flags & float_muladd_negate_c) {
1593 uc.h = -uc.h;
1594 }
1595
1596 ur.h = fmaf(ua.h, ub.h, uc.h);
1597
1598 if (unlikely(f32_is_inf(ur))) {
1599 s->float_exception_flags |= float_flag_overflow;
1600 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1601 goto soft;
1602 }
1603 }
1604 if (flags & float_muladd_negate_result) {
1605 return float32_chs(ur.s);
1606 }
1607 return ur.s;
1608
1609 soft:
1610 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1611 }
1612
1613 float64 QEMU_FLATTEN
1614 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1615 {
1616 union_float64 ua, ub, uc, ur;
1617
1618 ua.s = xa;
1619 ub.s = xb;
1620 uc.s = xc;
1621
1622 if (unlikely(!can_use_fpu(s))) {
1623 goto soft;
1624 }
1625 if (unlikely(flags & float_muladd_halve_result)) {
1626 goto soft;
1627 }
1628
1629 float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1630 if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1631 goto soft;
1632 }
1633
1634 if (unlikely(force_soft_fma)) {
1635 goto soft;
1636 }
1637
1638 /*
1639 * When (a || b) == 0, there's no need to check for under/over flow,
1640 * since we know the addend is (normal || 0) and the product is 0.
1641 */
1642 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1643 union_float64 up;
1644 bool prod_sign;
1645
1646 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1647 prod_sign ^= !!(flags & float_muladd_negate_product);
1648 up.s = float64_set_sign(float64_zero, prod_sign);
1649
1650 if (flags & float_muladd_negate_c) {
1651 uc.h = -uc.h;
1652 }
1653 ur.h = up.h + uc.h;
1654 } else {
1655 if (flags & float_muladd_negate_product) {
1656 ua.h = -ua.h;
1657 }
1658 if (flags & float_muladd_negate_c) {
1659 uc.h = -uc.h;
1660 }
1661
1662 ur.h = fma(ua.h, ub.h, uc.h);
1663
1664 if (unlikely(f64_is_inf(ur))) {
1665 s->float_exception_flags |= float_flag_overflow;
1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667 goto soft;
1668 }
1669 }
1670 if (flags & float_muladd_negate_result) {
1671 return float64_chs(ur.s);
1672 }
1673 return ur.s;
1674
1675 soft:
1676 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1677 }
1678
1679 /*
1680 * Returns the result of dividing the floating-point value `a' by the
1681 * corresponding value `b'. The operation is performed according to
1682 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1683 */
1684
1685 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1686 {
1687 bool sign = a.sign ^ b.sign;
1688
1689 if (a.cls == float_class_normal && b.cls == float_class_normal) {
1690 uint64_t n0, n1, q, r;
1691 int exp = a.exp - b.exp;
1692
1693 /*
1694 * We want a 2*N / N-bit division to produce exactly an N-bit
1695 * result, so that we do not lose any precision and so that we
1696 * do not have to renormalize afterward. If A.frac < B.frac,
1697 * then division would produce an (N-1)-bit result; shift A left
1698 * by one to produce the an N-bit result, and decrement the
1699 * exponent to match.
1700 *
1701 * The udiv_qrnnd algorithm that we're using requires normalization,
1702 * i.e. the msb of the denominator must be set. Since we know that
1703 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1704 * by one (more), and the remainder must be shifted right by one.
1705 */
1706 if (a.frac < b.frac) {
1707 exp -= 1;
1708 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1709 } else {
1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1711 }
1712 q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1713
1714 /*
1715 * Set lsb if there is a remainder, to set inexact.
1716 * As mentioned above, to find the actual value of the remainder we
1717 * would need to shift right, but (1) we are only concerned about
1718 * non-zero-ness, and (2) the remainder will always be even because
1719 * both inputs to the division primitive are even.
1720 */
1721 a.frac = q | (r != 0);
1722 a.sign = sign;
1723 a.exp = exp;
1724 return a;
1725 }
1726 /* handle all the NaN cases */
1727 if (is_nan(a.cls) || is_nan(b.cls)) {
1728 return pick_nan(a, b, s);
1729 }
1730 /* 0/0 or Inf/Inf */
1731 if (a.cls == b.cls
1732 &&
1733 (a.cls == float_class_inf || a.cls == float_class_zero)) {
1734 s->float_exception_flags |= float_flag_invalid;
1735 return parts_default_nan(s);
1736 }
1737 /* Inf / x or 0 / x */
1738 if (a.cls == float_class_inf || a.cls == float_class_zero) {
1739 a.sign = sign;
1740 return a;
1741 }
1742 /* Div 0 => Inf */
1743 if (b.cls == float_class_zero) {
1744 s->float_exception_flags |= float_flag_divbyzero;
1745 a.cls = float_class_inf;
1746 a.sign = sign;
1747 return a;
1748 }
1749 /* Div by Inf */
1750 if (b.cls == float_class_inf) {
1751 a.cls = float_class_zero;
1752 a.sign = sign;
1753 return a;
1754 }
1755 g_assert_not_reached();
1756 }
1757
1758 float16 float16_div(float16 a, float16 b, float_status *status)
1759 {
1760 FloatParts pa = float16_unpack_canonical(a, status);
1761 FloatParts pb = float16_unpack_canonical(b, status);
1762 FloatParts pr = div_floats(pa, pb, status);
1763
1764 return float16_round_pack_canonical(pr, status);
1765 }
1766
1767 static float32 QEMU_SOFTFLOAT_ATTR
1768 soft_f32_div(float32 a, float32 b, float_status *status)
1769 {
1770 FloatParts pa = float32_unpack_canonical(a, status);
1771 FloatParts pb = float32_unpack_canonical(b, status);
1772 FloatParts pr = div_floats(pa, pb, status);
1773
1774 return float32_round_pack_canonical(pr, status);
1775 }
1776
1777 static float64 QEMU_SOFTFLOAT_ATTR
1778 soft_f64_div(float64 a, float64 b, float_status *status)
1779 {
1780 FloatParts pa = float64_unpack_canonical(a, status);
1781 FloatParts pb = float64_unpack_canonical(b, status);
1782 FloatParts pr = div_floats(pa, pb, status);
1783
1784 return float64_round_pack_canonical(pr, status);
1785 }
1786
/* Host-FPU single-precision division. */
static float hard_f32_div(float a, float b)
{
    const float quot = a / b;

    return quot;
}
1791
/* Host-FPU double-precision division. */
static double hard_f64_div(double a, double b)
{
    const double quot = a / b;

    return quot;
}
1796
1797 static bool f32_div_pre(union_float32 a, union_float32 b)
1798 {
1799 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1800 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1801 fpclassify(b.h) == FP_NORMAL;
1802 }
1803 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1804 }
1805
1806 static bool f64_div_pre(union_float64 a, union_float64 b)
1807 {
1808 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1809 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1810 fpclassify(b.h) == FP_NORMAL;
1811 }
1812 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1813 }
1814
1815 static bool f32_div_post(union_float32 a, union_float32 b)
1816 {
1817 if (QEMU_HARDFLOAT_2F32_USE_FP) {
1818 return fpclassify(a.h) != FP_ZERO;
1819 }
1820 return !float32_is_zero(a.s);
1821 }
1822
1823 static bool f64_div_post(union_float64 a, union_float64 b)
1824 {
1825 if (QEMU_HARDFLOAT_2F64_USE_FP) {
1826 return fpclassify(a.h) != FP_ZERO;
1827 }
1828 return !float64_is_zero(a.s);
1829 }
1830
1831 float32 QEMU_FLATTEN
1832 float32_div(float32 a, float32 b, float_status *s)
1833 {
1834 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1835 f32_div_pre, f32_div_post, NULL, NULL);
1836 }
1837
1838 float64 QEMU_FLATTEN
1839 float64_div(float64 a, float64 b, float_status *s)
1840 {
1841 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1842 f64_div_pre, f64_div_post, NULL, NULL);
1843 }
1844
1845 /*
1846 * Float to Float conversions
1847 *
1848 * Returns the result of converting one float format to another. The
1849 * conversion is performed according to the IEC/IEEE Standard for
1850 * Binary Floating-Point Arithmetic.
1851 *
1852 * The float_to_float helper only needs to take care of raising
1853 * invalid exceptions and handling the conversion on NaNs.
1854 */
1855
1856 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1857 float_status *s)
1858 {
1859 if (dstf->arm_althp) {
1860 switch (a.cls) {
1861 case float_class_qnan:
1862 case float_class_snan:
1863 /* There is no NaN in the destination format. Raise Invalid
1864 * and return a zero with the sign of the input NaN.
1865 */
1866 s->float_exception_flags |= float_flag_invalid;
1867 a.cls = float_class_zero;
1868 a.frac = 0;
1869 a.exp = 0;
1870 break;
1871
1872 case float_class_inf:
1873 /* There is no Inf in the destination format. Raise Invalid
1874 * and return the maximum normal with the correct sign.
1875 */
1876 s->float_exception_flags |= float_flag_invalid;
1877 a.cls = float_class_normal;
1878 a.exp = dstf->exp_max;
1879 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1880 break;
1881
1882 default:
1883 break;
1884 }
1885 } else if (is_nan(a.cls)) {
1886 if (is_snan(a.cls)) {
1887 s->float_exception_flags |= float_flag_invalid;
1888 a = parts_silence_nan(a, s);
1889 }
1890 if (s->default_nan_mode) {
1891 return parts_default_nan(s);
1892 }
1893 }
1894 return a;
1895 }
1896
1897 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1898 {
1899 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1900 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1901 FloatParts pr = float_to_float(p, &float32_params, s);
1902 return float32_round_pack_canonical(pr, s);
1903 }
1904
1905 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1906 {
1907 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1908 FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1909 FloatParts pr = float_to_float(p, &float64_params, s);
1910 return float64_round_pack_canonical(pr, s);
1911 }
1912
1913 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1914 {
1915 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1916 FloatParts p = float32_unpack_canonical(a, s);
1917 FloatParts pr = float_to_float(p, fmt16, s);
1918 return float16a_round_pack_canonical(pr, s, fmt16);
1919 }
1920
1921 float64 float32_to_float64(float32 a, float_status *s)
1922 {
1923 FloatParts p = float32_unpack_canonical(a, s);
1924 FloatParts pr = float_to_float(p, &float64_params, s);
1925 return float64_round_pack_canonical(pr, s);
1926 }
1927
1928 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1929 {
1930 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1931 FloatParts p = float64_unpack_canonical(a, s);
1932 FloatParts pr = float_to_float(p, fmt16, s);
1933 return float16a_round_pack_canonical(pr, s, fmt16);
1934 }
1935
1936 float32 float64_to_float32(float64 a, float_status *s)
1937 {
1938 FloatParts p = float64_unpack_canonical(a, s);
1939 FloatParts pr = float_to_float(p, &float32_params, s);
1940 return float32_round_pack_canonical(pr, s);
1941 }
1942
1943 /*
1944 * Rounds the floating-point value `a' to an integer, and returns the
1945 * result as a floating-point value. The operation is performed
1946 * according to the IEC/IEEE Standard for Binary Floating-Point
1947 * Arithmetic.
1948 */
1949
1950 static FloatParts round_to_int(FloatParts a, int rmode,
1951 int scale, float_status *s)
1952 {
1953 switch (a.cls) {
1954 case float_class_qnan:
1955 case float_class_snan:
1956 return return_nan(a, s);
1957
1958 case float_class_zero:
1959 case float_class_inf:
1960 /* already "integral" */
1961 break;
1962
1963 case float_class_normal:
1964 scale = MIN(MAX(scale, -0x10000), 0x10000);
1965 a.exp += scale;
1966
1967 if (a.exp >= DECOMPOSED_BINARY_POINT) {
1968 /* already integral */
1969 break;
1970 }
1971 if (a.exp < 0) {
1972 bool one;
1973 /* all fractional */
1974 s->float_exception_flags |= float_flag_inexact;
1975 switch (rmode) {
1976 case float_round_nearest_even:
1977 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1978 break;
1979 case float_round_ties_away:
1980 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1981 break;
1982 case float_round_to_zero:
1983 one = false;
1984 break;
1985 case float_round_up:
1986 one = !a.sign;
1987 break;
1988 case float_round_down:
1989 one = a.sign;
1990 break;
1991 default:
1992 g_assert_not_reached();
1993 }
1994
1995 if (one) {
1996 a.frac = DECOMPOSED_IMPLICIT_BIT;
1997 a.exp = 0;
1998 } else {
1999 a.cls = float_class_zero;
2000 }
2001 } else {
2002 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2003 uint64_t frac_lsbm1 = frac_lsb >> 1;
2004 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2005 uint64_t rnd_mask = rnd_even_mask >> 1;
2006 uint64_t inc;
2007
2008 switch (rmode) {
2009 case float_round_nearest_even:
2010 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2011 break;
2012 case float_round_ties_away:
2013 inc = frac_lsbm1;
2014 break;
2015 case float_round_to_zero:
2016 inc = 0;
2017 break;
2018 case float_round_up:
2019 inc = a.sign ? 0 : rnd_mask;
2020 break;
2021 case float_round_down:
2022 inc = a.sign ? rnd_mask : 0;
2023 break;
2024 default:
2025 g_assert_not_reached();
2026 }
2027
2028 if (a.frac & rnd_mask) {
2029 s->float_exception_flags |= float_flag_inexact;
2030 a.frac += inc;
2031 a.frac &= ~rnd_mask;
2032 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2033 a.frac >>= 1;
2034 a.exp++;
2035 }
2036 }
2037 }
2038 break;
2039 default:
2040 g_assert_not_reached();
2041 }
2042 return a;
2043 }
2044
2045 float16 float16_round_to_int(float16 a, float_status *s)
2046 {
2047 FloatParts pa = float16_unpack_canonical(a, s);
2048 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2049 return float16_round_pack_canonical(pr, s);
2050 }
2051
2052 float32 float32_round_to_int(float32 a, float_status *s)
2053 {
2054 FloatParts pa = float32_unpack_canonical(a, s);
2055 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2056 return float32_round_pack_canonical(pr, s);
2057 }
2058
2059 float64 float64_round_to_int(float64 a, float_status *s)
2060 {
2061 FloatParts pa = float64_unpack_canonical(a, s);
2062 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2063 return float64_round_pack_canonical(pr, s);
2064 }
2065
2066 /*
2067 * Returns the result of converting the floating-point value `a' to
2068 * the two's complement integer format. The conversion is performed
2069 * according to the IEC/IEEE Standard for Binary Floating-Point
2070 * Arithmetic---which means in particular that the conversion is
2071 * rounded according to the current rounding mode. If `a' is a NaN,
2072 * the largest positive integer is returned. Otherwise, if the
2073 * conversion overflows, the largest integer with the same sign as `a'
2074 * is returned.
2075 */
2076
2077 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2078 int64_t min, int64_t max,
2079 float_status *s)
2080 {
2081 uint64_t r;
2082 int orig_flags = get_float_exception_flags(s);
2083 FloatParts p = round_to_int(in, rmode, scale, s);
2084
2085 switch (p.cls) {
2086 case float_class_snan:
2087 case float_class_qnan:
2088 s->float_exception_flags = orig_flags | float_flag_invalid;
2089 return max;
2090 case float_class_inf:
2091 s->float_exception_flags = orig_flags | float_flag_invalid;
2092 return p.sign ? min : max;
2093 case float_class_zero:
2094 return 0;
2095 case float_class_normal:
2096 if (p.exp < DECOMPOSED_BINARY_POINT) {
2097 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2098 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2099 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2100 } else {
2101 r = UINT64_MAX;
2102 }
2103 if (p.sign) {
2104 if (r <= -(uint64_t) min) {
2105 return -r;
2106 } else {
2107 s->float_exception_flags = orig_flags | float_flag_invalid;
2108 return min;
2109 }
2110 } else {
2111 if (r <= max) {
2112 return r;
2113 } else {
2114 s->float_exception_flags = orig_flags | float_flag_invalid;
2115 return max;
2116 }
2117 }
2118 default:
2119 g_assert_not_reached();
2120 }
2121 }
2122
2123 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2124 float_status *s)
2125 {
2126 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2127 rmode, scale, INT16_MIN, INT16_MAX, s);
2128 }
2129
2130 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2131 float_status *s)
2132 {
2133 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2134 rmode, scale, INT32_MIN, INT32_MAX, s);
2135 }
2136
2137 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2138 float_status *s)
2139 {
2140 return round_to_int_and_pack(float16_unpack_canonical(a, s),
2141 rmode, scale, INT64_MIN, INT64_MAX, s);
2142 }
2143
2144 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2145 float_status *s)
2146 {
2147 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2148 rmode, scale, INT16_MIN, INT16_MAX, s);
2149 }
2150
2151 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2152 float_status *s)
2153 {
2154 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2155 rmode, scale, INT32_MIN, INT32_MAX, s);
2156 }
2157
2158 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2159 float_status *s)
2160 {
2161 return round_to_int_and_pack(float32_unpack_canonical(a, s),
2162 rmode, scale, INT64_MIN, INT64_MAX, s);
2163 }
2164
2165 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2166 float_status *s)
2167 {
2168 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2169 rmode, scale, INT16_MIN, INT16_MAX, s);
2170 }
2171
2172 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2173 float_status *s)
2174 {
2175 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2176 rmode, scale, INT32_MIN, INT32_MAX, s);
2177 }
2178
2179 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2180 float_status *s)
2181 {
2182 return round_to_int_and_pack(float64_unpack_canonical(a, s),
2183 rmode, scale, INT64_MIN, INT64_MAX, s);
2184 }
2185
2186 int16_t float16_to_int16(float16 a, float_status *s)
2187 {
2188 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2189 }
2190
2191 int32_t float16_to_int32(float16 a, float_status *s)
2192 {
2193 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2194 }
2195
2196 int64_t float16_to_int64(float16 a, float_status *s)
2197 {
2198 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2199 }
2200
2201 int16_t float32_to_int16(float32 a, float_status *s)
2202 {
2203 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2204 }
2205
2206 int32_t float32_to_int32(float32 a, float_status *s)
2207 {
2208 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2209 }
2210
2211 int64_t float32_to_int64(float32 a, float_status *s)
2212 {
2213 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2214 }
2215
2216 int16_t float64_to_int16(float64 a, float_status *s)
2217 {
2218 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2219 }
2220
2221 int32_t float64_to_int32(float64 a, float_status *s)
2222 {
2223 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2224 }
2225
2226 int64_t float64_to_int64(float64 a, float_status *s)
2227 {
2228 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2229 }
2230
2231 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2232 {
2233 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2234 }
2235
2236 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2237 {
2238 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2239 }
2240
2241 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2242 {
2243 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2244 }
2245
2246 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2247 {
2248 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2249 }
2250
2251 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2252 {
2253 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2254 }
2255
2256 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2257 {
2258 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2259 }
2260
2261 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2262 {
2263 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2264 }
2265
2266 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2267 {
2268 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2269 }
2270
2271 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2272 {
2273 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2274 }
2275
2276 /*
2277 * Returns the result of converting the floating-point value `a' to
2278 * the unsigned integer format. The conversion is performed according
2279 * to the IEC/IEEE Standard for Binary Floating-Point
2280 * Arithmetic---which means in particular that the conversion is
2281 * rounded according to the current rounding mode. If `a' is a NaN,
2282 * the largest unsigned integer is returned. Otherwise, if the
2283 * conversion overflows, the largest unsigned integer is returned. If
2284 * `a' is negative, the result is rounded and zero is returned;
2285 * values that do not round to zero will raise the invalid exception
2286 * flag.
2287 */
2288
2289 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2290 uint64_t max, float_status *s)
2291 {
2292 int orig_flags = get_float_exception_flags(s);
2293 FloatParts p = round_to_int(in, rmode, scale, s);
2294 uint64_t r;
2295
2296 switch (p.cls) {
2297 case float_class_snan:
2298 case float_class_qnan:
2299 s->float_exception_flags = orig_flags | float_flag_invalid;
2300 return max;
2301 case float_class_inf:
2302 s->float_exception_flags = orig_flags | float_flag_invalid;
2303 return p.sign ? 0 : max;
2304 case float_class_zero:
2305 return 0;
2306 case float_class_normal:
2307 if (p.sign) {
2308 s->float_exception_flags = orig_flags | float_flag_invalid;
2309 return 0;
2310 }
2311
2312 if (p.exp < DECOMPOSED_BINARY_POINT) {
2313 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2314 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2315 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2316 } else {
2317 s->float_exception_flags = orig_flags | float_flag_invalid;
2318 return max;
2319 }
2320
2321 /* For uint64 this will never trip, but if p.exp is too large
2322 * to shift a decomposed fraction we shall have exited via the
2323 * 3rd leg above.
2324 */
2325 if (r > max) {
2326 s->float_exception_flags = orig_flags | float_flag_invalid;
2327 return max;
2328 }
2329 return r;
2330 default:
2331 g_assert_not_reached();
2332 }
2333 }
2334
2335 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2336 float_status *s)
2337 {
2338 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2339 rmode, scale, UINT16_MAX, s);
2340 }
2341
2342 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2343 float_status *s)
2344 {
2345 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2346 rmode, scale, UINT32_MAX, s);
2347 }
2348
2349 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2350 float_status *s)
2351 {
2352 return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2353 rmode, scale, UINT64_MAX, s);
2354 }
2355
2356 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2357 float_status *s)
2358 {
2359 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2360 rmode, scale, UINT16_MAX, s);
2361 }
2362
2363 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2364 float_status *s)
2365 {
2366 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2367 rmode, scale, UINT32_MAX, s);
2368 }
2369
2370 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2371 float_status *s)
2372 {
2373 return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2374 rmode, scale, UINT64_MAX, s);
2375 }
2376
2377 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2378 float_status *s)
2379 {
2380 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2381 rmode, scale, UINT16_MAX, s);
2382 }
2383
2384 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2385 float_status *s)
2386 {
2387 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2388 rmode, scale, UINT32_MAX, s);
2389 }
2390
2391 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2392 float_status *s)
2393 {
2394 return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2395 rmode, scale, UINT64_MAX, s);
2396 }
2397
2398 uint16_t float16_to_uint16(float16 a, float_status *s)
2399 {
2400 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2401 }
2402
2403 uint32_t float16_to_uint32(float16 a, float_status *s)
2404 {
2405 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2406 }
2407
2408 uint64_t float16_to_uint64(float16 a, float_status *s)
2409 {
2410 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2411 }
2412
2413 uint16_t float32_to_uint16(float32 a, float_status *s)
2414 {
2415 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2416 }
2417
2418 uint32_t float32_to_uint32(float32 a, float_status *s)
2419 {
2420 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2421 }
2422
2423 uint64_t float32_to_uint64(float32 a, float_status *s)
2424 {
2425 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427
2428 uint16_t float64_to_uint16(float64 a, float_status *s)
2429 {
2430 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432
2433 uint32_t float64_to_uint32(float64 a, float_status *s)
2434 {
2435 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2436 }
2437
2438 uint64_t float64_to_uint64(float64 a, float_status *s)
2439 {
2440 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2441 }
2442
2443 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2444 {
2445 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2446 }
2447
2448 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2449 {
2450 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2451 }
2452
2453 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2454 {
2455 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2456 }
2457
2458 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2459 {
2460 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2461 }
2462
2463 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2464 {
2465 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2466 }
2467
2468 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2469 {
2470 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472
2473 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2474 {
2475 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477
2478 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2479 {
2480 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2481 }
2482
2483 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2484 {
2485 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2486 }
2487
2488 /*
2489 * Integer to float conversions
2490 *
2491 * Returns the result of converting the two's complement integer `a'
2492 * to the floating-point format. The conversion is performed according
2493 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2494 */
2495
2496 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2497 {
2498 FloatParts r = { .sign = false };
2499
2500 if (a == 0) {
2501 r.cls = float_class_zero;
2502 } else {
2503 uint64_t f = a;
2504 int shift;
2505
2506 r.cls = float_class_normal;
2507 if (a < 0) {
2508 f = -f;
2509 r.sign = true;
2510 }
2511 shift = clz64(f) - 1;
2512 scale = MIN(MAX(scale, -0x10000), 0x10000);
2513
2514 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2515 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2516 }
2517
2518 return r;
2519 }
2520
2521 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2522 {
2523 FloatParts pa = int_to_float(a, scale, status);
2524 return float16_round_pack_canonical(pa, status);
2525 }
2526
2527 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2528 {
2529 return int64_to_float16_scalbn(a, scale, status);
2530 }
2531
2532 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2533 {
2534 return int64_to_float16_scalbn(a, scale, status);
2535 }
2536
2537 float16 int64_to_float16(int64_t a, float_status *status)
2538 {
2539 return int64_to_float16_scalbn(a, 0, status);
2540 }
2541
2542 float16 int32_to_float16(int32_t a, float_status *status)
2543 {
2544 return int64_to_float16_scalbn(a, 0, status);
2545 }
2546
2547 float16 int16_to_float16(int16_t a, float_status *status)
2548 {
2549 return int64_to_float16_scalbn(a, 0, status);
2550 }
2551
2552 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2553 {
2554 FloatParts pa = int_to_float(a, scale, status);
2555 return float32_round_pack_canonical(pa, status);
2556 }
2557
2558 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2559 {
2560 return int64_to_float32_scalbn(a, scale, status);
2561 }
2562
2563 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2564 {
2565 return int64_to_float32_scalbn(a, scale, status);
2566 }
2567
2568 float32 int64_to_float32(int64_t a, float_status *status)
2569 {
2570 return int64_to_float32_scalbn(a, 0, status);
2571 }
2572
2573 float32 int32_to_float32(int32_t a, float_status *status)
2574 {
2575 return int64_to_float32_scalbn(a, 0, status);
2576 }
2577
2578 float32 int16_to_float32(int16_t a, float_status *status)
2579 {
2580 return int64_to_float32_scalbn(a, 0, status);
2581 }
2582
2583 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2584 {
2585 FloatParts pa = int_to_float(a, scale, status);
2586 return float64_round_pack_canonical(pa, status);
2587 }
2588
2589 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2590 {
2591 return int64_to_float64_scalbn(a, scale, status);
2592 }
2593
2594 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2595 {
2596 return int64_to_float64_scalbn(a, scale, status);
2597 }
2598
2599 float64 int64_to_float64(int64_t a, float_status *status)
2600 {
2601 return int64_to_float64_scalbn(a, 0, status);
2602 }
2603
2604 float64 int32_to_float64(int32_t a, float_status *status)
2605 {
2606 return int64_to_float64_scalbn(a, 0, status);
2607 }
2608
2609 float64 int16_to_float64(int16_t a, float_status *status)
2610 {
2611 return int64_to_float64_scalbn(a, 0, status);
2612 }
2613
2614
2615 /*
2616 * Unsigned Integer to float conversions
2617 *
2618 * Returns the result of converting the unsigned integer `a' to the
2619 * floating-point format. The conversion is performed according to the
2620 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2621 */
2622
2623 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2624 {
2625 FloatParts r = { .sign = false };
2626
2627 if (a == 0) {
2628 r.cls = float_class_zero;
2629 } else {
2630 scale = MIN(MAX(scale, -0x10000), 0x10000);
2631 r.cls = float_class_normal;
2632 if ((int64_t)a < 0) {
2633 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2634 shift64RightJamming(a, 1, &a);
2635 r.frac = a;
2636 } else {
2637 int shift = clz64(a) - 1;
2638 r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2639 r.frac = a << shift;
2640 }
2641 }
2642
2643 return r;
2644 }
2645
2646 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2647 {
2648 FloatParts pa = uint_to_float(a, scale, status);
2649 return float16_round_pack_canonical(pa, status);
2650 }
2651
2652 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2653 {
2654 return uint64_to_float16_scalbn(a, scale, status);
2655 }
2656
2657 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2658 {
2659 return uint64_to_float16_scalbn(a, scale, status);
2660 }
2661
2662 float16 uint64_to_float16(uint64_t a, float_status *status)
2663 {
2664 return uint64_to_float16_scalbn(a, 0, status);
2665 }
2666
2667 float16 uint32_to_float16(uint32_t a, float_status *status)
2668 {
2669 return uint64_to_float16_scalbn(a, 0, status);
2670 }
2671
2672 float16 uint16_to_float16(uint16_t a, float_status *status)
2673 {
2674 return uint64_to_float16_scalbn(a, 0, status);
2675 }
2676
2677 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2678 {
2679 FloatParts pa = uint_to_float(a, scale, status);
2680 return float32_round_pack_canonical(pa, status);
2681 }
2682
2683 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2684 {
2685 return uint64_to_float32_scalbn(a, scale, status);
2686 }
2687
2688 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2689 {
2690 return uint64_to_float32_scalbn(a, scale, status);
2691 }
2692
2693 float32 uint64_to_float32(uint64_t a, float_status *status)
2694 {
2695 return uint64_to_float32_scalbn(a, 0, status);
2696 }
2697
2698 float32 uint32_to_float32(uint32_t a, float_status *status)
2699 {
2700 return uint64_to_float32_scalbn(a, 0, status);
2701 }
2702
2703 float32 uint16_to_float32(uint16_t a, float_status *status)
2704 {
2705 return uint64_to_float32_scalbn(a, 0, status);
2706 }
2707
2708 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2709 {
2710 FloatParts pa = uint_to_float(a, scale, status);
2711 return float64_round_pack_canonical(pa, status);
2712 }
2713
2714 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2715 {
2716 return uint64_to_float64_scalbn(a, scale, status);
2717 }
2718
2719 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2720 {
2721 return uint64_to_float64_scalbn(a, scale, status);
2722 }
2723
2724 float64 uint64_to_float64(uint64_t a, float_status *status)
2725 {
2726 return uint64_to_float64_scalbn(a, 0, status);
2727 }
2728
2729 float64 uint32_to_float64(uint32_t a, float_status *status)
2730 {
2731 return uint64_to_float64_scalbn(a, 0, status);
2732 }
2733
2734 float64 uint16_to_float64(uint16_t a, float_status *status)
2735 {
2736 return uint64_to_float64_scalbn(a, 0, status);
2737 }
2738
2739 /* Float Min/Max */
2740 /* min() and max() functions. These can't be implemented as
2741 * 'compare and pick one input' because that would mishandle
2742 * NaNs and +0 vs -0.
2743 *
2744 * minnum() and maxnum() functions. These are similar to the min()
2745 * and max() functions but if one of the arguments is a QNaN and
2746 * the other is numerical then the numerical argument is returned.
2747 * SNaNs will get quietened before being returned.
2748 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
2749 * and maxNum() operations. min() and max() are the typical min/max
2750 * semantics provided by many CPUs which predate that specification.
2751 *
2752 * minnummag() and maxnummag() functions correspond to minNumMag()
2753 * and maxNumMag() from the IEEE-754 2008.
2754 */
2755 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2756 bool ieee, bool ismag, float_status *s)
2757 {
2758 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2759 if (ieee) {
2760 /* Takes two floating-point values `a' and `b', one of
2761 * which is a NaN, and returns the appropriate NaN
2762 * result. If either `a' or `b' is a signaling NaN,
2763 * the invalid exception is raised.
2764 */
2765 if (is_snan(a.cls) || is_snan(b.cls)) {
2766 return pick_nan(a, b, s);
2767 } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2768 return b;
2769 } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2770 return a;
2771 }
2772 }
2773 return pick_nan(a, b, s);
2774 } else {
2775 int a_exp, b_exp;
2776
2777 switch (a.cls) {
2778 case float_class_normal:
2779 a_exp = a.exp;
2780 break;
2781 case float_class_inf:
2782 a_exp = INT_MAX;
2783 break;
2784 case float_class_zero:
2785 a_exp = INT_MIN;
2786 break;
2787 default:
2788 g_assert_not_reached();
2789 break;
2790 }
2791 switch (b.cls) {
2792 case float_class_normal:
2793 b_exp = b.exp;
2794 break;
2795 case float_class_inf:
2796 b_exp = INT_MAX;
2797 break;
2798 case float_class_zero:
2799 b_exp = INT_MIN;
2800 break;
2801 default:
2802 g_assert_not_reached();
2803 break;
2804 }
2805
2806 if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2807 bool a_less = a_exp < b_exp;
2808 if (a_exp == b_exp) {
2809 a_less = a.frac < b.frac;
2810 }
2811 return a_less ^ ismin ? b : a;
2812 }
2813
2814 if (a.sign == b.sign) {
2815 bool a_less = a_exp < b_exp;
2816 if (a_exp == b_exp) {
2817 a_less = a.frac < b.frac;
2818 }
2819 return a.sign ^ a_less ^ ismin ? b : a;
2820 } else {
2821 return a.sign ^ ismin ? b : a;
2822 }
2823 }
2824 }
2825
2826 #define MINMAX(sz, name, ismin, isiee, ismag) \
2827 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \
2828 float_status *s) \
2829 { \
2830 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2831 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2832 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \
2833 \
2834 return float ## sz ## _round_pack_canonical(pr, s); \
2835 }
2836
2837 MINMAX(16, min, true, false, false)
2838 MINMAX(16, minnum, true, true, false)
2839 MINMAX(16, minnummag, true, true, true)
2840 MINMAX(16, max, false, false, false)
2841 MINMAX(16, maxnum, false, true, false)
2842 MINMAX(16, maxnummag, false, true, true)
2843
2844 MINMAX(32, min, true, false, false)
2845 MINMAX(32, minnum, true, true, false)
2846 MINMAX(32, minnummag, true, true, true)
2847 MINMAX(32, max, false, false, false)
2848 MINMAX(32, maxnum, false, true, false)
2849 MINMAX(32, maxnummag, false, true, true)
2850
2851 MINMAX(64, min, true, false, false)
2852 MINMAX(64, minnum, true, true, false)
2853 MINMAX(64, minnummag, true, true, true)
2854 MINMAX(64, max, false, false, false)
2855 MINMAX(64, maxnum, false, true, false)
2856 MINMAX(64, maxnummag, false, true, true)
2857
2858 #undef MINMAX
2859
2860 /* Floating point compare */
2861 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2862 float_status *s)
2863 {
2864 if (is_nan(a.cls) || is_nan(b.cls)) {
2865 if (!is_quiet ||
2866 a.cls == float_class_snan ||
2867 b.cls == float_class_snan) {
2868 s->float_exception_flags |= float_flag_invalid;
2869 }
2870 return float_relation_unordered;
2871 }
2872
2873 if (a.cls == float_class_zero) {
2874 if (b.cls == float_class_zero) {
2875 return float_relation_equal;
2876 }
2877 return b.sign ? float_relation_greater : float_relation_less;
2878 } else if (b.cls == float_class_zero) {
2879 return a.sign ? float_relation_less : float_relation_greater;
2880 }
2881
2882 /* The only really important thing about infinity is its sign. If
2883 * both are infinities the sign marks the smallest of the two.
2884 */
2885 if (a.cls == float_class_inf) {
2886 if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2887 return float_relation_equal;
2888 }
2889 return a.sign ? float_relation_less : float_relation_greater;
2890 } else if (b.cls == float_class_inf) {
2891 return b.sign ? float_relation_greater : float_relation_less;
2892 }
2893
2894 if (a.sign != b.sign) {
2895 return a.sign ? float_relation_less : float_relation_greater;
2896 }
2897
2898 if (a.exp == b.exp) {
2899 if (a.frac == b.frac) {
2900 return float_relation_equal;
2901 }
2902 if (a.sign) {
2903 return a.frac > b.frac ?
2904 float_relation_less : float_relation_greater;
2905 } else {
2906 return a.frac > b.frac ?
2907 float_relation_greater : float_relation_less;
2908 }
2909 } else {
2910 if (a.sign) {
2911 return a.exp > b.exp ? float_relation_less : float_relation_greater;
2912 } else {
2913 return a.exp > b.exp ? float_relation_greater : float_relation_less;
2914 }
2915 }
2916 }
2917
2918 #define COMPARE(name, attr, sz) \
2919 static int attr \
2920 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \
2921 { \
2922 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \
2923 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \
2924 return compare_floats(pa, pb, is_quiet, s); \
2925 }
2926
2927 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2928 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2929 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2930
2931 #undef COMPARE
2932
2933 int float16_compare(float16 a, float16 b, float_status *s)
2934 {
2935 return soft_f16_compare(a, b, false, s);
2936 }
2937
2938 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2939 {
2940 return soft_f16_compare(a, b, true, s);
2941 }
2942
2943 static int QEMU_FLATTEN
2944 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2945 {
2946 union_float32 ua, ub;
2947
2948 ua.s = xa;
2949 ub.s = xb;
2950
2951 if (QEMU_NO_HARDFLOAT) {
2952 goto soft;
2953 }
2954
2955 float32_input_flush2(&ua.s, &ub.s, s);
2956 if (isgreaterequal(ua.h, ub.h)) {
2957 if (isgreater(ua.h, ub.h)) {
2958 return float_relation_greater;
2959 }
2960 return float_relation_equal;
2961 }
2962 if (likely(isless(ua.h, ub.h))) {
2963 return float_relation_less;
2964 }
2965 /* The only condition remaining is unordered.
2966 * Fall through to set flags.
2967 */
2968 soft:
2969 return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2970 }
2971
2972 int float32_compare(float32 a, float32 b, float_status *s)
2973 {
2974 return f32_compare(a, b, false, s);
2975 }
2976
2977 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2978 {
2979 return f32_compare(a, b, true, s);
2980 }
2981
2982 static int QEMU_FLATTEN
2983 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2984 {
2985 union_float64 ua, ub;
2986
2987 ua.s = xa;
2988 ub.s = xb;
2989
2990 if (QEMU_NO_HARDFLOAT) {
2991 goto soft;
2992 }
2993
2994 float64_input_flush2(&ua.s, &ub.s, s);
2995 if (isgreaterequal(ua.h, ub.h)) {
2996 if (isgreater(ua.h, ub.h)) {
2997 return float_relation_greater;
2998 }
2999 return float_relation_equal;
3000 }
3001 if (likely(isless(ua.h, ub.h))) {
3002 return float_relation_less;
3003 }
3004 /* The only condition remaining is unordered.
3005 * Fall through to set flags.
3006 */
3007 soft:
3008 return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3009 }
3010
3011 int float64_compare(float64 a, float64 b, float_status *s)
3012 {
3013 return f64_compare(a, b, false, s);
3014 }
3015
3016 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3017 {
3018 return f64_compare(a, b, true, s);
3019 }
3020
3021 /* Multiply A by 2 raised to the power N. */
3022 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3023 {
3024 if (unlikely(is_nan(a.cls))) {
3025 return return_nan(a, s);
3026 }
3027 if (a.cls == float_class_normal) {
3028 /* The largest float type (even though not supported by FloatParts)
3029 * is float128, which has a 15 bit exponent. Bounding N to 16 bits
3030 * still allows rounding to infinity, without allowing overflow
3031 * within the int32_t that backs FloatParts.exp.
3032 */
3033 n = MIN(MAX(n, -0x10000), 0x10000);
3034 a.exp += n;
3035 }
3036 return a;
3037 }
3038
3039 float16 float16_scalbn(float16 a, int n, float_status *status)
3040 {
3041 FloatParts pa = float16_unpack_canonical(a, status);
3042 FloatParts pr = scalbn_decomposed(pa, n, status);
3043 return float16_round_pack_canonical(pr, status);
3044 }
3045
3046 float32 float32_scalbn(float32 a, int n, float_status *status)
3047 {
3048 FloatParts pa = float32_unpack_canonical(a, status);
3049 FloatParts pr = scalbn_decomposed(pa, n, status);
3050 return float32_round_pack_canonical(pr, status);
3051 }
3052
3053 float64 float64_scalbn(float64 a, int n, float_status *status)
3054 {
3055 FloatParts pa = float64_unpack_canonical(a, status);
3056 FloatParts pr = scalbn_decomposed(pa, n, status);
3057 return float64_round_pack_canonical(pr, status);
3058 }
3059
3060 /*
3061 * Square Root
3062 *
3063 * The old softfloat code did an approximation step before zeroing in
3064 * on the final result. However, for simplicity we just compute the
3065 * square root by iterating down from the implicit bit to enough extra
3066 * bits to ensure we get a correctly rounded result.
3067 *
3068 * This does mean however the calculation is slower than before,
3069 * especially for 64 bit floats.
3070 */
3071
3072 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3073 {
3074 uint64_t a_frac, r_frac, s_frac;
3075 int bit, last_bit;
3076
3077 if (is_nan(a.cls)) {
3078 return return_nan(a, s);
3079 }
3080 if (a.cls == float_class_zero) {
3081 return a; /* sqrt(+-0) = +-0 */
3082 }
3083 if (a.sign) {
3084 s->float_exception_flags |= float_flag_invalid;
3085 return parts_default_nan(s);
3086 }
3087 if (a.cls == float_class_inf) {
3088 return a; /* sqrt(+inf) = +inf */
3089 }
3090
3091 assert(a.cls == float_class_normal);
3092
3093 /* We need two overflow bits at the top. Adding room for that is a
3094 * right shift. If the exponent is odd, we can discard the low bit
3095 * by multiplying the fraction by 2; that's a left shift. Combine
3096 * those and we shift right if the exponent is even.
3097 */
3098 a_frac = a.frac;
3099 if (!(a.exp & 1)) {
3100 a_frac >>= 1;
3101 }
3102 a.exp >>= 1;
3103
3104 /* Bit-by-bit computation of sqrt. */
3105 r_frac = 0;
3106 s_frac = 0;
3107
3108 /* Iterate from implicit bit down to the 3 extra bits to compute a
3109 * properly rounded result. Remember we've inserted one more bit
3110 * at the top, so these positions are one less.
3111 */
3112 bit = DECOMPOSED_BINARY_POINT - 1;
3113 last_bit = MAX(p->frac_shift - 4, 0);
3114 do {
3115 uint64_t q = 1ULL << bit;
3116 uint64_t t_frac = s_frac + q;
3117 if (t_frac <= a_frac) {
3118 s_frac = t_frac + q;
3119 a_frac -= t_frac;
3120 r_frac += q;
3121 }
3122 a_frac <<= 1;
3123 } while (--bit >= last_bit);
3124
3125 /* Undo the right shift done above. If there is any remaining
3126 * fraction, the result is inexact. Set the sticky bit.
3127 */
3128 a.frac = (r_frac << 1) + (a_frac != 0);
3129
3130 return a;
3131 }
3132
3133 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3134 {
3135 FloatParts pa = float16_unpack_canonical(a, status);
3136 FloatParts pr = sqrt_float(pa, status, &float16_params);
3137 return float16_round_pack_canonical(pr, status);
3138 }
3139
3140 static float32 QEMU_SOFTFLOAT_ATTR
3141 soft_f32_sqrt(float32 a, float_status *status)
3142 {
3143 FloatParts pa = float32_unpack_canonical(a, status);
3144 FloatParts pr = sqrt_float(pa, status, &float32_params);
3145 return float32_round_pack_canonical(pr, status);
3146 }
3147
3148 static float64 QEMU_SOFTFLOAT_ATTR
3149 soft_f64_sqrt(float64 a, float_status *status)
3150 {
3151 FloatParts pa = float64_unpack_canonical(a, status);
3152 FloatParts pr = sqrt_float(pa, status, &float64_params);
3153 return float64_round_pack_canonical(pr, status);
3154 }
3155
3156 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3157 {
3158 union_float32 ua, ur;
3159
3160 ua.s = xa;
3161 if (unlikely(!can_use_fpu(s))) {
3162 goto soft;
3163 }
3164
3165 float32_input_flush1(&ua.s, s);
3166 if (QEMU_HARDFLOAT_1F32_USE_FP) {
3167 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3168 fpclassify(ua.h) == FP_ZERO) ||
3169 signbit(ua.h))) {
3170 goto soft;
3171 }
3172 } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3173 float32_is_neg(ua.s))) {
3174 goto soft;
3175 }
3176 ur.h = sqrtf(ua.h);
3177 return ur.s;
3178
3179 soft:
3180 return soft_f32_sqrt(ua.s, s);
3181 }
3182
3183 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3184 {
3185 union_float64 ua, ur;
3186
3187 ua.s = xa;
3188 if (unlikely(!can_use_fpu(s))) {
3189 goto soft;
3190 }
3191
3192 float64_input_flush1(&ua.s, s);
3193 if (QEMU_HARDFLOAT_1F64_USE_FP) {
3194 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3195 fpclassify(ua.h) == FP_ZERO) ||
3196 signbit(ua.h))) {
3197 goto soft;
3198 }
3199 } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3200 float64_is_neg(ua.s))) {
3201 goto soft;
3202 }
3203 ur.h = sqrt(ua.h);
3204 return ur.s;
3205
3206 soft:
3207 return soft_f64_sqrt(ua.s, s);
3208 }
3209
3210 /*----------------------------------------------------------------------------
3211 | The pattern for a default generated NaN.
3212 *----------------------------------------------------------------------------*/
3213
3214 float16 float16_default_nan(float_status *status)
3215 {
3216 FloatParts p = parts_default_nan(status);
3217 p.frac >>= float16_params.frac_shift;
3218 return float16_pack_raw(p);
3219 }
3220
3221 float32 float32_default_nan(float_status *status)
3222 {
3223 FloatParts p = parts_default_nan(status);
3224 p.frac >>= float32_params.frac_shift;
3225 return float32_pack_raw(p);
3226 }
3227
3228 float64 float64_default_nan(float_status *status)
3229 {
3230 FloatParts p = parts_default_nan(status);
3231 p.frac >>= float64_params.frac_shift;
3232 return float64_pack_raw(p);
3233 }
3234
3235 float128 float128_default_nan(float_status *status)
3236 {
3237 FloatParts p = parts_default_nan(status);
3238 float128 r;
3239
3240 /* Extrapolate from the choices made by parts_default_nan to fill
3241 * in the quad-floating format. If the low bit is set, assume we
3242 * want to set all non-snan bits.
3243 */
3244 r.low = -(p.frac & 1);
3245 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3246 r.high |= LIT64(0x7FFF000000000000);
3247 r.high |= (uint64_t)p.sign << 63;
3248
3249 return r;
3250 }
3251
3252 /*----------------------------------------------------------------------------
3253 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3254 *----------------------------------------------------------------------------*/
3255
3256 float16 float16_silence_nan(float16 a, float_status *status)
3257 {
3258 FloatParts p = float16_unpack_raw(a);
3259 p.frac <<= float16_params.frac_shift;
3260 p = parts_silence_nan(p, status);
3261 p.frac >>= float16_params.frac_shift;
3262 return float16_pack_raw(p);
3263 }
3264
3265 float32 float32_silence_nan(float32 a, float_status *status)
3266 {
3267 FloatParts p = float32_unpack_raw(a);
3268 p.frac <<= float32_params.frac_shift;
3269 p = parts_silence_nan(p, status);
3270 p.frac >>= float32_params.frac_shift;
3271 return float32_pack_raw(p);
3272 }
3273
3274 float64 float64_silence_nan(float64 a, float_status *status)
3275 {
3276 FloatParts p = float64_unpack_raw(a);
3277 p.frac <<= float64_params.frac_shift;
3278 p = parts_silence_nan(p, status);
3279 p.frac >>= float64_params.frac_shift;
3280 return float64_pack_raw(p);
3281 }
3282
3283 /*----------------------------------------------------------------------------
3284 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3285 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3286 | input. If `zSign' is 1, the input is negated before being converted to an
3287 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
3288 | is simply rounded to an integer, with the inexact exception raised if the
3289 | input cannot be represented exactly as an integer. However, if the fixed-
3290 | point input is too large, the invalid exception is raised and the largest
3291 | positive or negative integer is returned.
3292 *----------------------------------------------------------------------------*/
3293
3294 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3295 {
3296 int8_t roundingMode;
3297 flag roundNearestEven;
3298 int8_t roundIncrement, roundBits;
3299 int32_t z;
3300
3301 roundingMode = status->float_rounding_mode;
3302 roundNearestEven = ( roundingMode == float_round_nearest_even );
3303 switch (roundingMode) {
3304 case float_round_nearest_even:
3305 case float_round_ties_away:
3306 roundIncrement = 0x40;
3307 break;
3308 case float_round_to_zero:
3309 roundIncrement = 0;
3310 break;
3311 case float_round_up:
3312 roundIncrement = zSign ? 0 : 0x7f;
3313 break;
3314 case float_round_down:
3315 roundIncrement = zSign ? 0x7f : 0;
3316 break;
3317 default:
3318 abort();
3319 }
3320 roundBits = absZ & 0x7F;
3321 absZ = ( absZ + roundIncrement )>>7;
3322 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3323 z = absZ;
3324 if ( zSign ) z = - z;
3325 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3326 float_raise(float_flag_invalid, status);
3327 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3328 }
3329 if (roundBits) {
3330 status->float_exception_flags |= float_flag_inexact;
3331 }
3332 return z;
3333
3334 }
3335
3336 /*----------------------------------------------------------------------------
3337 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3338 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3339 | and returns the properly rounded 64-bit integer corresponding to the input.
3340 | If `zSign' is 1, the input is negated before being converted to an integer.
3341 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3342 | the inexact exception raised if the input cannot be represented exactly as
3343 | an integer. However, if the fixed-point input is too large, the invalid
3344 | exception is raised and the largest positive or negative integer is
3345 | returned.
3346 *----------------------------------------------------------------------------*/
3347
3348 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3349 float_status *status)
3350 {
3351 int8_t roundingMode;
3352 flag roundNearestEven, increment;
3353 int64_t z;
3354
3355 roundingMode = status->float_rounding_mode;
3356 roundNearestEven = ( roundingMode == float_round_nearest_even );
3357 switch (roundingMode) {
3358 case float_round_nearest_even:
3359 case float_round_ties_away:
3360 increment = ((int64_t) absZ1 < 0);
3361 break;
3362 case float_round_to_zero:
3363 increment = 0;
3364 break;
3365 case float_round_up:
3366 increment = !zSign && absZ1;
3367 break;
3368 case float_round_down:
3369 increment = zSign && absZ1;
3370 break;
3371 default:
3372 abort();
3373 }
3374 if ( increment ) {
3375 ++absZ0;
3376 if ( absZ0 == 0 ) goto overflow;
3377 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3378 }
3379 z = absZ0;
3380 if ( zSign ) z = - z;
3381 if ( z && ( ( z < 0 ) ^ zSign ) ) {
3382 overflow:
3383 float_raise(float_flag_invalid, status);
3384 return
3385 zSign ? (int64_t) LIT64( 0x8000000000000000 )
3386 : LIT64( 0x7FFFFFFFFFFFFFFF );
3387 }
3388 if (absZ1) {
3389 status->float_exception_flags |= float_flag_inexact;
3390 }
3391 return z;
3392
3393 }
3394
3395 /*----------------------------------------------------------------------------
3396 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3397 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3398 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3399 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
3400 | with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned. If `zSign'
| is 1 and the rounded value is nonzero, the invalid exception is raised and
| zero is returned.
3403 *----------------------------------------------------------------------------*/
3404
3405 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3406 uint64_t absZ1, float_status *status)
3407 {
3408 int8_t roundingMode;
3409 flag roundNearestEven, increment;
3410
3411 roundingMode = status->float_rounding_mode;
3412 roundNearestEven = (roundingMode == float_round_nearest_even);
3413 switch (roundingMode) {
3414 case float_round_nearest_even:
3415 case float_round_ties_away:
3416 increment = ((int64_t)absZ1 < 0);
3417 break;
3418 case float_round_to_zero:
3419 increment = 0;
3420 break;
3421 case float_round_up:
3422 increment = !zSign && absZ1;
3423 break;
3424 case float_round_down:
3425 increment = zSign && absZ1;
3426 break;
3427 default:
3428 abort();
3429 }
3430 if (increment) {
3431 ++absZ0;
3432 if (absZ0 == 0) {
3433 float_raise(float_flag_invalid, status);
3434 return LIT64(0xFFFFFFFFFFFFFFFF);
3435 }
3436 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3437 }
3438
3439 if (zSign && absZ0) {
3440 float_raise(float_flag_invalid, status);
3441 return 0;
3442 }
3443
3444 if (absZ1) {
3445 status->float_exception_flags |= float_flag_inexact;
3446 }
3447 return absZ0;
3448 }
3449
3450 /*----------------------------------------------------------------------------
3451 | If `a' is denormal and we are in flush-to-zero mode then set the
3452 | input-denormal exception and return zero. Otherwise just return the value.
3453 *----------------------------------------------------------------------------*/
3454 float32 float32_squash_input_denormal(float32 a, float_status *status)
3455 {
3456 if (status->flush_inputs_to_zero) {
3457 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3458 float_raise(float_flag_input_denormal, status);
3459 return make_float32(float32_val(a) & 0x80000000);
3460 }
3461 }
3462 return a;
3463 }
3464
3465 /*----------------------------------------------------------------------------
3466 | Normalizes the subnormal single-precision floating-point value represented
3467 | by the denormalized significand `aSig'. The normalized exponent and
3468 | significand are stored at the locations pointed to by `zExpPtr' and
3469 | `zSigPtr', respectively.
3470 *----------------------------------------------------------------------------*/
3471
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 23. */
    const int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3482
3483 /*----------------------------------------------------------------------------
3484 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3485 | and significand `zSig', and returns the proper single-precision floating-
3486 | point value corresponding to the abstract input. Ordinarily, the abstract
3487 | value is simply rounded and packed into the single-precision format, with
3488 | the inexact exception raised if the abstract input cannot be represented
3489 | exactly. However, if the abstract value is too large, the overflow and
3490 | inexact exceptions are raised and an infinity or maximal finite value is
3491 | returned. If the abstract value is too small, the input value is rounded to
3492 | a subnormal number, and the underflow and inexact exceptions are raised if
3493 | the abstract input cannot be represented exactly as a subnormal single-
3494 | precision floating-point number.
3495 | The input significand `zSig' has its binary point between bits 30
3496 | and 29, which is 7 bits to the left of the usual location. This shifted
3497 | significand must be normalized or smaller. If `zSig' is not normalized,
3498 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3499 | and it must not require rounding. In the usual case that `zSig' is
3500 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3501 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3502 | Binary Floating-Point Arithmetic.
3503 *----------------------------------------------------------------------------*/
3504
static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    /*
     * Select the value added to the 7 low (guard) bits of zSig before they
     * are shifted out. Unknown rounding modes abort.
     */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /*
     * The uint16_t cast makes (small) negative exponents compare as large
     * values, so this single test admits both the overflow candidates
     * (zExp >= 0xFD) and the underflow candidates (zExp < 0).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0xFD with the rounding
         * increment carrying into bit 31 of the significand.
         */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Significand argument is 0 when roundIncrement is non-zero and
             * all-ones when it is zero; per the header comment this selects
             * between infinity and the maximal finite value.
             */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            /* Subnormal result; flush-to-zero mode returns a signed zero. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the exponent
             * is below -1, or rounding does not carry into bit 31.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            /* Denormalize; roundBits must be recomputed from the new zSig. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    /* Round, drop the 7 guard bits, and clear bit 0 on an exact tie in
     * nearest-even mode. */
    zSig = ( zSig + roundIncrement )>>7;
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
3569
3570 /*----------------------------------------------------------------------------
3571 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3572 | and significand `zSig', and returns the proper single-precision floating-
3573 | point value corresponding to the abstract input. This routine is just like
3574 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3575 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3576 | floating-point exponent.
3577 *----------------------------------------------------------------------------*/
3578
3579 static float32
3580 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3581 float_status *status)
3582 {
3583 int8_t shiftCount;
3584
3585 shiftCount = clz32(zSig) - 1;
3586 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3587 status);
3588
3589 }
3590
3591 /*----------------------------------------------------------------------------
3592 | If `a' is denormal and we are in flush-to-zero mode then set the
3593 | input-denormal exception and return zero. Otherwise just return the value.
3594 *----------------------------------------------------------------------------*/
3595 float64 float64_squash_input_denormal(float64 a, float_status *status)
3596 {
3597 if (status->flush_inputs_to_zero) {
3598 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3599 float_raise(float_flag_input_denormal, status);
3600 return make_float64(float64_val(a) & (1ULL << 63));
3601 }
3602 }
3603 return a;
3604 }
3605
3606 /*----------------------------------------------------------------------------
3607 | Normalizes the subnormal double-precision floating-point value represented
3608 | by the denormalized significand `aSig'. The normalized exponent and
3609 | significand are stored at the locations pointed to by `zExpPtr' and
3610 | `zSigPtr', respectively.
3611 *----------------------------------------------------------------------------*/
3612
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 52. */
    const int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3623
3624 /*----------------------------------------------------------------------------
3625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3626 | double-precision floating-point value, returning the result. After being
3627 | shifted into the proper positions, the three fields are simply added
3628 | together to form the result. This means that any integer portion of `zSig'
3629 | will be added into the exponent. Since a properly normalized significand
3630 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3631 | than the desired result exponent whenever `zSig' is a complete, normalized
3632 | significand.
3633 *----------------------------------------------------------------------------*/
3634
3635 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3636 {
3637
3638 return make_float64(
3639 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3640
3641 }
3642
3643 /*----------------------------------------------------------------------------
3644 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3645 | and significand `zSig', and returns the proper double-precision floating-
3646 | point value corresponding to the abstract input. Ordinarily, the abstract
3647 | value is simply rounded and packed into the double-precision format, with
3648 | the inexact exception raised if the abstract input cannot be represented
3649 | exactly. However, if the abstract value is too large, the overflow and
3650 | inexact exceptions are raised and an infinity or maximal finite value is
3651 | returned. If the abstract value is too small, the input value is rounded to
3652 | a subnormal number, and the underflow and inexact exceptions are raised if
3653 | the abstract input cannot be represented exactly as a subnormal double-
3654 | precision floating-point number.
3655 | The input significand `zSig' has its binary point between bits 62
3656 | and 61, which is 10 bits to the left of the usual location. This shifted
3657 | significand must be normalized or smaller. If `zSig' is not normalized,
3658 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3659 | and it must not require rounding. In the usual case that `zSig' is
3660 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3661 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3662 | Binary Floating-Point Arithmetic.
3663 *----------------------------------------------------------------------------*/
3664
static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    /*
     * Select the value added to the 10 low (guard) bits of zSig before they
     * are shifted out. Unknown rounding modes abort.
     */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Only round up when the lowest kept bit (0x400) is currently 0. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * The uint16_t cast makes (small) negative exponents compare as large
     * values, so this single test admits both the overflow candidates
     * (zExp >= 0x7FD) and the underflow candidates (zExp < 0).
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FD with the rounding
         * increment carrying into bit 63 of the significand.
         */
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Round-to-odd and modes with a zero increment pass an all-ones
             * significand argument instead of 0; per the header comment
             * this selects the maximal finite value rather than infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            /* Subnormal result; flush-to-zero mode returns a signed zero. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the exponent
             * is below -1, or rounding does not carry into bit 63.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            /* Denormalize; roundBits must be recomputed from the new zSig. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    /* Round, drop the 10 guard bits, and clear bit 0 on an exact tie in
     * nearest-even mode. */
    zSig = ( zSig + roundIncrement )>>10;
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
3740
3741 /*----------------------------------------------------------------------------
3742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3743 | and significand `zSig', and returns the proper double-precision floating-
3744 | point value corresponding to the abstract input. This routine is just like
3745 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3746 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3747 | floating-point exponent.
3748 *----------------------------------------------------------------------------*/
3749
3750 static float64
3751 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3752 float_status *status)
3753 {
3754 int8_t shiftCount;
3755
3756 shiftCount = clz64(zSig) - 1;
3757 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3758 status);
3759
3760 }
3761
3762 /*----------------------------------------------------------------------------
3763 | Normalizes the subnormal extended double-precision floating-point value
3764 | represented by the denormalized significand `aSig'. The normalized exponent
3765 | and significand are stored at the locations pointed to by `zExpPtr' and
3766 | `zSigPtr', respectively.
3767 *----------------------------------------------------------------------------*/
3768
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 63. */
    const int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3778
3779 /*----------------------------------------------------------------------------
3780 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3781 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3782 | and returns the proper extended double-precision floating-point value
3783 | corresponding to the abstract input. Ordinarily, the abstract value is
3784 | rounded and packed into the extended double-precision format, with the
3785 | inexact exception raised if the abstract input cannot be represented
3786 | exactly. However, if the abstract value is too large, the overflow and
3787 | inexact exceptions are raised and an infinity or maximal finite value is
3788 | returned. If the abstract value is too small, the input value is rounded to
3789 | a subnormal number, and the underflow and inexact exceptions are raised if
3790 | the abstract input cannot be represented exactly as a subnormal extended
3791 | double-precision floating-point number.
3792 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
3793 | number of bits as single or double precision, respectively. Otherwise, the
3794 | result is rounded to the full precision of the extended double-precision
3795 | format.
3796 | The input significand must be normalized or smaller. If the input
3797 | significand is not normalized, `zExp' must be 0; in that case, the result
3798 | returned is a subnormal number, and it must not require rounding. The
3799 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3802
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Reduced-precision rounding is only done for roundingPrecision 64 and
     * 32; 80 and every other value use the full-precision path below.
     * roundMask covers the low bits of zSig0 that get rounded away and
     * roundIncrement is half of (roundMask + 1).
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold any non-zero bits of zSig1 into the lowest bit of zSig0 (sticky). */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-way increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * With the unsigned cast, zExp <= 0 wraps to a large value, so this
     * test is true for zExp >= 0x7FFE and for zExp <= 0.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow if the exponent is too large, or is 0x7FFE and adding
         * the increment wraps zSig0. The shared overflow handler lives in
         * the precision80 section and uses the roundMask set above.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* Subnormal result; flush-to-zero mode returns a signed zero. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or zExp is
             * negative, or adding the increment does not wrap zSig0.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /* Denormalize; roundBits must be recomputed from the new zSig0. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding carried into bit 63: the biased exponent becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement is reused as the lowest kept bit; on an exact
             * tie in nearest-even mode that bit is cleared as well.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Normal-range result at reduced precision. */
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: bump the exponent and reset the significand. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even masking as in the subnormal case above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full 64-bit significand: zSig1 holds the bits below the kept
     * significand and decides whether zSig0 is incremented.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: true for zExp >= 0x7FFE and zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* Full precision: ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /*
             * Also reached by goto from the reduced-precision path, where
             * roundMask still holds that precision's mask. Modes that round
             * toward zero for this sign return exponent 0x7FFE with
             * significand ~roundMask; all others return infinity.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * Subnormal result. Note that, unlike the reduced-precision
             * path, flush_to_zero is not consulted here.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so re-derive the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: force the result even. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding carried into bit 63: the biased exponent becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Normal-range result at full precision. */
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: bump the exponent and reset it. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: force the result even. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
3991
3992 /*----------------------------------------------------------------------------
3993 | Takes an abstract floating-point value having sign `zSign', exponent
3994 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3995 | and returns the proper extended double-precision floating-point value
3996 | corresponding to the abstract input. This routine is just like
3997 | `roundAndPackFloatx80' except that the input significand does not have to be
3998 | normalized.
3999 *----------------------------------------------------------------------------*/
4000
4001 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4002 flag zSign, int32_t zExp,
4003 uint64_t zSig0, uint64_t zSig1,
4004 float_status *status)
4005 {
4006 int8_t shiftCount;
4007
4008 if ( zSig0 == 0 ) {
4009 zSig0 = zSig1;
4010 zSig1 = 0;
4011 zExp -= 64;
4012 }
4013 shiftCount = clz64(zSig0);
4014 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4015 zExp -= shiftCount;
4016 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4017 zSig0, zSig1, status);
4018
4019 }
4020
4021 /*----------------------------------------------------------------------------
4022 | Returns the least-significant 64 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4025
4026 static inline uint64_t extractFloat128Frac1( float128 a )
4027 {
4028
4029 return a.low;
4030
4031 }
4032
4033 /*----------------------------------------------------------------------------
4034 | Returns the most-significant 48 fraction bits of the quadruple-precision
4035 | floating-point value `a'.
4036 *----------------------------------------------------------------------------*/
4037
4038 static inline uint64_t extractFloat128Frac0( float128 a )
4039 {
4040
4041 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4042
4043 }
4044
4045 /*----------------------------------------------------------------------------
4046 | Returns the exponent bits of the quadruple-precision floating-point value
4047 | `a'.
4048 *----------------------------------------------------------------------------*/
4049
4050 static inline int32_t extractFloat128Exp( float128 a )
4051 {
4052
4053 return ( a.high>>48 ) & 0x7FFF;
4054
4055 }
4056
4057 /*----------------------------------------------------------------------------
4058 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4059 *----------------------------------------------------------------------------*/
4060
4061 static inline flag extractFloat128Sign( float128 a )
4062 {
4063
4064 return a.high>>63;
4065
4066 }
4067
4068 /*----------------------------------------------------------------------------
4069 | Normalizes the subnormal quadruple-precision floating-point value
4070 | represented by the denormalized significand formed by the concatenation of
4071 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
4072 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
4073 | significand are stored at the location pointed to by `zSig0Ptr', and the
4074 | least significant 64 bits of the normalized significand are stored at the
4075 | location pointed to by `zSig1Ptr'.
4076 *----------------------------------------------------------------------------*/
4077
static void
normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                           int32_t *zExpPtr,
                           uint64_t *zSig0Ptr, uint64_t *zSig1Ptr)
{
    int8_t shift;

    /* Common case: the high word already holds the leading set bit. */
    if (aSig0 != 0) {
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /*
     * Only the low word is populated.  Its leading bit has to end up at
     * bit 48 of the high word, which is a left shift when the low word has
     * at least 15 leading zeros and a right shift (spilling the remainder
     * back into the low result word) otherwise.
     */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4108
4109 /*----------------------------------------------------------------------------
4110 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4111 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4112 | floating-point value, returning the result. After being shifted into the
4113 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4114 | added together to form the most significant 64 bits of the result. This
4115 | means that any integer portion of `zSig0' will be added into the exponent.
4116 | Since a properly normalized significand will have an integer portion equal
4117 | to 1, the `zExp' input should be 1 less than the desired result exponent
4118 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4119 | significand.
4120 *----------------------------------------------------------------------------*/
4121
4122 static inline float128
4123 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4124 {
4125 float128 z;
4126
4127 z.low = zSig1;
4128 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4129 return z;
4130
4131 }
4132
4133 /*----------------------------------------------------------------------------
4134 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4135 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4136 | and `zSig2', and returns the proper quadruple-precision floating-point value
4137 | corresponding to the abstract input. Ordinarily, the abstract value is
4138 | simply rounded and packed into the quadruple-precision format, with the
4139 | inexact exception raised if the abstract input cannot be represented
4140 | exactly. However, if the abstract value is too large, the overflow and
4141 | inexact exceptions are raised and an infinity or maximal finite value is
4142 | returned. If the abstract value is too small, the input value is rounded to
4143 | a subnormal number, and the underflow and inexact exceptions are raised if
4144 | the abstract input cannot be represented exactly as a subnormal quadruple-
4145 | precision floating-point number.
4146 | The input significand must be normalized or smaller. If the input
4147 | significand is not normalized, `zExp' must be 0; in that case, the result
4148 | returned is a subnormal number, and it must not require rounding. In the
4149 | usual case that the input significand is normalized, `zExp' must be 1 less
4150 | than the ``true'' floating-point exponent. The handling of underflow and
4151 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4152 *----------------------------------------------------------------------------*/
4153
static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Decide from the discarded bits (zSig2) whether the kept significand
     * zSig0:zSig1 must be bumped by one unit in the last place.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig2 set: the discarded part is at least one half. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Inexact and currently even: adding one makes the result odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * The unsigned compare catches both ends at once: exponents at or above
     * 0x7FFD and negative exponents (which become huge once cast).
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /*
         * Overflow: exponent too large, or exactly 0x7FFD with an all-ones
         * significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that never round away from zero for this sign (and
             * round-to-odd) get the largest finite value, not infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* Flush-to-zero: flag the denormal output, return signed zero. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /*
             * With after-rounding detection the value is not tiny only if
             * zExp is -1 and the pending increment carries out of an
             * all-ones significand.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Denormalize: shift right by -zExp, shifted-out bits land in zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            /* Underflow is raised only when the tiny result is also inexact. */
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so redo the rounding decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    /* Any non-zero discarded bits make the result inexact. */
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /*
         * Nearest-even tie (zSig2 is exactly the half-way pattern, so
         * doubling it wraps to zero): clear the low bit to land on even.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand must be packed with a zero exponent. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4268
4269 /*----------------------------------------------------------------------------
4270 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4271 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4272 | returns the proper quadruple-precision floating-point value corresponding
4273 | to the abstract input. This routine is just like `roundAndPackFloat128'
4274 | except that the input significand has fewer bits and does not have to be
4275 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
4276 | point exponent.
4277 *----------------------------------------------------------------------------*/
4278
4279 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4280 uint64_t zSig0, uint64_t zSig1,
4281 float_status *status)
4282 {
4283 int8_t shiftCount;
4284 uint64_t zSig2;
4285
4286 if ( zSig0 == 0 ) {
4287 zSig0 = zSig1;
4288 zSig1 = 0;
4289 zExp -= 64;
4290 }
4291 shiftCount = clz64(zSig0) - 15;
4292 if ( 0 <= shiftCount ) {
4293 zSig2 = 0;
4294 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4295 }
4296 else {
4297 shift128ExtraRightJamming(
4298 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4299 }
4300 zExp -= shiftCount;
4301 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4302
4303 }
4304
4305
4306 /*----------------------------------------------------------------------------
4307 | Returns the result of converting the 32-bit two's complement integer `a'
4308 | to the extended double-precision floating-point format. The conversion
4309 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4310 | Arithmetic.
4311 *----------------------------------------------------------------------------*/
4312
4313 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4314 {
4315 flag zSign;
4316 uint32_t absA;
4317 int8_t shiftCount;
4318 uint64_t zSig;
4319
4320 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4321 zSign = ( a < 0 );
4322 absA = zSign ? - a : a;
4323 shiftCount = clz32(absA) + 32;
4324 zSig = absA;
4325 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4326
4327 }
4328
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the 32-bit two's complement integer `a' to
4331 | the quadruple-precision floating-point format. The conversion is performed
4332 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4333 *----------------------------------------------------------------------------*/
4334
4335 float128 int32_to_float128(int32_t a, float_status *status)
4336 {
4337 flag zSign;
4338 uint32_t absA;
4339 int8_t shiftCount;
4340 uint64_t zSig0;
4341
4342 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4343 zSign = ( a < 0 );
4344 absA = zSign ? - a : a;
4345 shiftCount = clz32(absA) + 17;
4346 zSig0 = absA;
4347 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4348
4349 }
4350
4351 /*----------------------------------------------------------------------------
4352 | Returns the result of converting the 64-bit two's complement integer `a'
4353 | to the extended double-precision floating-point format. The conversion
4354 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4355 | Arithmetic.
4356 *----------------------------------------------------------------------------*/
4357
4358 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4359 {
4360 flag zSign;
4361 uint64_t absA;
4362 int8_t shiftCount;
4363
4364 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4365 zSign = ( a < 0 );
4366 absA = zSign ? - a : a;
4367 shiftCount = clz64(absA);
4368 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4369
4370 }
4371
4372 /*----------------------------------------------------------------------------
4373 | Returns the result of converting the 64-bit two's complement integer `a' to
4374 | the quadruple-precision floating-point format. The conversion is performed
4375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4376 *----------------------------------------------------------------------------*/
4377
4378 float128 int64_to_float128(int64_t a, float_status *status)
4379 {
4380 flag zSign;
4381 uint64_t absA;
4382 int8_t shiftCount;
4383 int32_t zExp;
4384 uint64_t zSig0, zSig1;
4385
4386 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4387 zSign = ( a < 0 );
4388 absA = zSign ? - a : a;
4389 shiftCount = clz64(absA) + 49;
4390 zExp = 0x406E - shiftCount;
4391 if ( 64 <= shiftCount ) {
4392 zSig1 = 0;
4393 zSig0 = absA;
4394 shiftCount -= 64;
4395 }
4396 else {
4397 zSig1 = absA;
4398 zSig0 = 0;
4399 }
4400 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4401 return packFloat128( zSign, zExp, zSig0, zSig1 );
4402
4403 }
4404
4405 /*----------------------------------------------------------------------------
4406 | Returns the result of converting the 64-bit unsigned integer `a'
4407 | to the quadruple-precision floating-point format. The conversion is performed
4408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4409 *----------------------------------------------------------------------------*/
4410
4411 float128 uint64_to_float128(uint64_t a, float_status *status)
4412 {
4413 if (a == 0) {
4414 return float128_zero;
4415 }
4416 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4417 }
4418
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the single-precision floating-point value
4421 | `a' to the extended double-precision floating-point format. The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic.
4424 *----------------------------------------------------------------------------*/
4425
4426 floatx80 float32_to_floatx80(float32 a, float_status *status)
4427 {
4428 flag aSign;
4429 int aExp;
4430 uint32_t aSig;
4431
4432 a = float32_squash_input_denormal(a, status);
4433 aSig = extractFloat32Frac( a );
4434 aExp = extractFloat32Exp( a );
4435 aSign = extractFloat32Sign( a );
4436 if ( aExp == 0xFF ) {
4437 if (aSig) {
4438 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4439 }
4440 return packFloatx80(aSign,
4441 floatx80_infinity_high,
4442 floatx80_infinity_low);
4443 }
4444 if ( aExp == 0 ) {
4445 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4446 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4447 }
4448 aSig |= 0x00800000;
4449 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4450
4451 }
4452
4453 /*----------------------------------------------------------------------------
4454 | Returns the result of converting the single-precision floating-point value
4455 | `a' to the quadruple-precision floating-point format. The conversion is
4456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4457 | Arithmetic.
4458 *----------------------------------------------------------------------------*/
4459
4460 float128 float32_to_float128(float32 a, float_status *status)
4461 {
4462 flag aSign;
4463 int aExp;
4464 uint32_t aSig;
4465
4466 a = float32_squash_input_denormal(a, status);
4467 aSig = extractFloat32Frac( a );
4468 aExp = extractFloat32Exp( a );
4469 aSign = extractFloat32Sign( a );
4470 if ( aExp == 0xFF ) {
4471 if (aSig) {
4472 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4473 }
4474 return packFloat128( aSign, 0x7FFF, 0, 0 );
4475 }
4476 if ( aExp == 0 ) {
4477 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4478 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4479 --aExp;
4480 }
4481 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4482
4483 }
4484
4485 /*----------------------------------------------------------------------------
4486 | Returns the remainder of the single-precision floating-point value `a'
4487 | with respect to the corresponding value `b'. The operation is performed
4488 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4489 *----------------------------------------------------------------------------*/
4490
float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* a is NaN or infinity. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* b is NaN or infinity; a finite value is returned unchanged for b = inf. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Divisor is zero: invalid operation. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned as-is (sign preserved). */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the integer bit of both significands explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient fits in 32 bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent gap below -1: a is returned unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce the dividend in a loop, using an
         * estimated 64-bit partial quotient per pass.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Step the estimate down by 2 (clamped at 0) before using it. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Keep subtracting the divisor until the remainder goes negative,
     * remembering the last non-negative remainder in alternateASig.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Choose between the two candidates: the sign of their sum tells which
     * has the smaller magnitude; on an exact tie the parity of q decides.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to a. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
4586
4587
4588
4589 /*----------------------------------------------------------------------------
4590 | Returns the binary exponential of the single-precision floating-point value
4591 | `a'. The operation is performed according to the IEC/IEEE Standard for
4592 | Binary Floating-Point Arithmetic.
4593 |
4594 | Uses the following identities:
4595 |
4596 | 1. -------------------------------------------------------------------------
4597 |      x    x*ln(2)
4598 |     2  = e
4599 |
4600 | 2. -------------------------------------------------------------------------
4601 |                      2     3     4     5           n
4602 |      x        x     x     x     x     x           x
4603 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4604 |               1!    2!    3!    4!    5!          n!
4605 *----------------------------------------------------------------------------*/
4606
/*
 * Coefficients 1/n! (n = 1..15) of the series for e^x, stored as raw
 * double-precision bit patterns.  float32_exp2 multiplies entry i by
 * x^(i+1) and sums the terms.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1/1!  */
    const_float64( 0x3fe0000000000000ll ), /* 1/2!  */
    const_float64( 0x3fc5555555555555ll ), /* 1/3!  */
    const_float64( 0x3fa5555555555555ll ), /* 1/4!  */
    const_float64( 0x3f81111111111111ll ), /* 1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /* 1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /* 1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /* 1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /* 1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /* 1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /* 1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /* 1/12! */
    const_float64( 0x3de6124613a86d09ll ), /* 1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /* 1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 1/15! */
};
4625
4626 float32 float32_exp2(float32 a, float_status *status)
4627 {
4628 flag aSign;
4629 int aExp;
4630 uint32_t aSig;
4631 float64 r, x, xn;
4632 int i;
4633 a = float32_squash_input_denormal(a, status);
4634
4635 aSig = extractFloat32Frac( a );
4636 aExp = extractFloat32Exp( a );
4637 aSign = extractFloat32Sign( a );
4638
4639 if ( aExp == 0xFF) {
4640 if (aSig) {
4641 return propagateFloat32NaN(a, float32_zero, status);
4642 }
4643 return (aSign) ? float32_zero : a;
4644 }
4645 if (aExp == 0) {
4646 if (aSig == 0) return float32_one;
4647 }
4648
4649 float_raise(float_flag_inexact, status);
4650
4651 /* ******************************* */
4652 /* using float64 for approximation */
4653 /* ******************************* */
4654 x = float32_to_float64(a, status);
4655 x = float64_mul(x, float64_ln2, status);
4656
4657 xn = x;
4658 r = float64_one;
4659 for (i = 0 ; i < 15 ; i++) {
4660 float64 f;
4661
4662 f = float64_mul(xn, float32_exp2_coefficients[i], status);
4663 r = float64_add(r, f, status);
4664
4665 xn = float64_mul(xn, x, status);
4666 }
4667
4668 return float64_to_float32(r, status);
4669 }
4670
4671 /*----------------------------------------------------------------------------
4672 | Returns the binary log of the single-precision floating-point value `a'.
4673 | The operation is performed according to the IEC/IEEE Standard for Binary
4674 | Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4676 float32 float32_log2(float32 a, float_status *status)
4677 {
4678 flag aSign, zSign;
4679 int aExp;
4680 uint32_t aSig, zSig, i;
4681
4682 a = float32_squash_input_denormal(a, status);
4683 aSig = extractFloat32Frac( a );
4684 aExp = extractFloat32Exp( a );
4685 aSign = extractFloat32Sign( a );
4686
4687 if ( aExp == 0 ) {
4688 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4689 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4690 }
4691 if ( aSign ) {
4692 float_raise(float_flag_invalid, status);
4693 return float32_default_nan(status);
4694 }
4695 if ( aExp == 0xFF ) {
4696 if (aSig) {
4697 return propagateFloat32NaN(a, float32_zero, status);
4698 }
4699 return a;
4700 }
4701
4702 aExp -= 0x7F;
4703 aSig |= 0x00800000;
4704 zSign = aExp < 0;
4705 zSig = aExp << 23;
4706
4707 for (i = 1 << 22; i > 0; i >>= 1) {
4708 aSig = ( (uint64_t)aSig * aSig ) >> 23;
4709 if ( aSig & 0x01000000 ) {
4710 aSig >>= 1;
4711 zSig |= i;
4712 }
4713 }
4714
4715 if ( zSign )
4716 zSig = -zSig;
4717
4718 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4719 }
4720
4721 /*----------------------------------------------------------------------------
4722 | Returns 1 if the single-precision floating-point value `a' is equal to
4723 | the corresponding value `b', and 0 otherwise. The invalid exception is
4724 | raised if either operand is a NaN. Otherwise, the comparison is performed
4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4726 *----------------------------------------------------------------------------*/
4727
4728 int float32_eq(float32 a, float32 b, float_status *status)
4729 {
4730 uint32_t av, bv;
4731 a = float32_squash_input_denormal(a, status);
4732 b = float32_squash_input_denormal(b, status);
4733
4734 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4735 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4736 ) {
4737 float_raise(float_flag_invalid, status);
4738 return 0;
4739 }
4740 av = float32_val(a);
4741 bv = float32_val(b);
4742 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4743 }
4744
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the single-precision floating-point value `a' is less than
4747 | or equal to the corresponding value `b', and 0 otherwise. The invalid
4748 | exception is raised if either operand is a NaN. The comparison is performed
4749 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4751
4752 int float32_le(float32 a, float32 b, float_status *status)
4753 {
4754 flag aSign, bSign;
4755 uint32_t av, bv;
4756 a = float32_squash_input_denormal(a, status);
4757 b = float32_squash_input_denormal(b, status);
4758
4759 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4760 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4761 ) {
4762 float_raise(float_flag_invalid, status);
4763 return 0;
4764 }
4765 aSign = extractFloat32Sign( a );
4766 bSign = extractFloat32Sign( b );
4767 av = float32_val(a);
4768 bv = float32_val(b);
4769 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4770 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4771
4772 }
4773
4774 /*----------------------------------------------------------------------------
4775 | Returns 1 if the single-precision floating-point value `a' is less than
4776 | the corresponding value `b', and 0 otherwise. The invalid exception is
4777 | raised if either operand is a NaN. The comparison is performed according
4778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4779 *----------------------------------------------------------------------------*/
4780
4781 int float32_lt(float32 a, float32 b, float_status *status)
4782 {
4783 flag aSign, bSign;
4784 uint32_t av, bv;
4785 a = float32_squash_input_denormal(a, status);
4786 b = float32_squash_input_denormal(b, status);
4787
4788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4790 ) {
4791 float_raise(float_flag_invalid, status);
4792 return 0;
4793 }
4794 aSign = extractFloat32Sign( a );
4795 bSign = extractFloat32Sign( b );
4796 av = float32_val(a);
4797 bv = float32_val(b);
4798 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4799 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4800
4801 }
4802
4803 /*----------------------------------------------------------------------------
4804 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4805 | be compared, and 0 otherwise. The invalid exception is raised if either
4806 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4807 | Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4809
4810 int float32_unordered(float32 a, float32 b, float_status *status)
4811 {
4812 a = float32_squash_input_denormal(a, status);
4813 b = float32_squash_input_denormal(b, status);
4814
4815 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4816 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4817 ) {
4818 float_raise(float_flag_invalid, status);
4819 return 1;
4820 }
4821 return 0;
4822 }
4823
4824 /*----------------------------------------------------------------------------
4825 | Returns 1 if the single-precision floating-point value `a' is equal to
4826 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4827 | exception. The comparison is performed according to the IEC/IEEE Standard
4828 | for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4830
4831 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4832 {
4833 a = float32_squash_input_denormal(a, status);
4834 b = float32_squash_input_denormal(b, status);
4835
4836 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4837 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4838 ) {
4839 if (float32_is_signaling_nan(a, status)
4840 || float32_is_signaling_nan(b, status)) {
4841 float_raise(float_flag_invalid, status);
4842 }
4843 return 0;
4844 }
4845 return ( float32_val(a) == float32_val(b) ) ||
4846 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4847 }
4848
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point value `a' is less than or
4851 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4852 | cause an exception. Otherwise, the comparison is performed according to the
4853 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4855
4856 int float32_le_quiet(float32 a, float32 b, float_status *status)
4857 {
4858 flag aSign, bSign;
4859 uint32_t av, bv;
4860 a = float32_squash_input_denormal(a, status);
4861 b = float32_squash_input_denormal(b, status);
4862
4863 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4864 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4865 ) {
4866 if (float32_is_signaling_nan(a, status)
4867 || float32_is_signaling_nan(b, status)) {
4868 float_raise(float_flag_invalid, status);
4869 }
4870 return 0;
4871 }
4872 aSign = extractFloat32Sign( a );
4873 bSign = extractFloat32Sign( b );
4874 av = float32_val(a);
4875 bv = float32_val(b);
4876 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4877 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4878
4879 }
4880
4881 /*----------------------------------------------------------------------------
4882 | Returns 1 if the single-precision floating-point value `a' is less than
4883 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4884 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4885 | Standard for Binary Floating-Point Arithmetic.
4886 *----------------------------------------------------------------------------*/
4887
4888 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4889 {
4890 flag aSign, bSign;
4891 uint32_t av, bv;
4892 a = float32_squash_input_denormal(a, status);
4893 b = float32_squash_input_denormal(b, status);
4894
4895 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4896 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4897 ) {
4898 if (float32_is_signaling_nan(a, status)
4899 || float32_is_signaling_nan(b, status)) {
4900 float_raise(float_flag_invalid, status);
4901 }
4902 return 0;
4903 }
4904 aSign = extractFloat32Sign( a );
4905 bSign = extractFloat32Sign( b );
4906 av = float32_val(a);
4907 bv = float32_val(b);
4908 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4909 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4910
4911 }
4912
4913 /*----------------------------------------------------------------------------
4914 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4915 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4916 | comparison is performed according to the IEC/IEEE Standard for Binary
4917 | Floating-Point Arithmetic.
4918 *----------------------------------------------------------------------------*/
4919
4920 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4921 {
4922 a = float32_squash_input_denormal(a, status);
4923 b = float32_squash_input_denormal(b, status);
4924
4925 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4926 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4927 ) {
4928 if (float32_is_signaling_nan(a, status)
4929 || float32_is_signaling_nan(b, status)) {
4930 float_raise(float_flag_invalid, status);
4931 }
4932 return 1;
4933 }
4934 return 0;
4935 }
4936
4937 /*----------------------------------------------------------------------------
4938 | If `a' is denormal and we are in flush-to-zero mode then set the
4939 | input-denormal exception and return zero. Otherwise just return the value.
4940 *----------------------------------------------------------------------------*/
4941 float16 float16_squash_input_denormal(float16 a, float_status *status)
4942 {
4943 if (status->flush_inputs_to_zero) {
4944 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4945 float_raise(float_flag_input_denormal, status);
4946 return make_float16(float16_val(a) & 0x8000);
4947 }
4948 }
4949 return a;
4950 }
4951
4952 /*----------------------------------------------------------------------------
4953 | Returns the result of converting the double-precision floating-point value
4954 | `a' to the extended double-precision floating-point format. The conversion
4955 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4956 | Arithmetic.
4957 *----------------------------------------------------------------------------*/
4958
4959 floatx80 float64_to_floatx80(float64 a, float_status *status)
4960 {
4961 flag aSign;
4962 int aExp;
4963 uint64_t aSig;
4964
4965 a = float64_squash_input_denormal(a, status);
4966 aSig = extractFloat64Frac( a );
4967 aExp = extractFloat64Exp( a );
4968 aSign = extractFloat64Sign( a );
4969 if ( aExp == 0x7FF ) {
4970 if (aSig) {
4971 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4972 }
4973 return packFloatx80(aSign,
4974 floatx80_infinity_high,
4975 floatx80_infinity_low);
4976 }
4977 if ( aExp == 0 ) {
4978 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4979 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4980 }
4981 return
4982 packFloatx80(
4983 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4984
4985 }
4986
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of converting the double-precision floating-point value
4989 | `a' to the quadruple-precision floating-point format. The conversion is
4990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4991 | Arithmetic.
4992 *----------------------------------------------------------------------------*/
4993
4994 float128 float64_to_float128(float64 a, float_status *status)
4995 {
4996 flag aSign;
4997 int aExp;
4998 uint64_t aSig, zSig0, zSig1;
4999
5000 a = float64_squash_input_denormal(a, status);
5001 aSig = extractFloat64Frac( a );
5002 aExp = extractFloat64Exp( a );
5003 aSign = extractFloat64Sign( a );
5004 if ( aExp == 0x7FF ) {
5005 if (aSig) {
5006 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5007 }
5008 return packFloat128( aSign, 0x7FFF, 0, 0 );
5009 }
5010 if ( aExp == 0 ) {
5011 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5012 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5013 --aExp;
5014 }
5015 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5016 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5017
5018 }
5019
5020
5021 /*----------------------------------------------------------------------------
5022 | Returns the remainder of the double-precision floating-point value `a'
5023 | with respect to the corresponding value `b'. The operation is performed
5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026
5027 float64 float64_rem(float64 a, float64 b, float_status *status)
5028 {
5029 flag aSign, zSign;
5030 int aExp, bExp, expDiff;
5031 uint64_t aSig, bSig;
5032 uint64_t q, alternateASig;
5033 int64_t sigMean;
5034
5035 a = float64_squash_input_denormal(a, status);
5036 b = float64_squash_input_denormal(b, status);
5037 aSig = extractFloat64Frac( a );
5038 aExp = extractFloat64Exp( a );
5039 aSign = extractFloat64Sign( a );
5040 bSig = extractFloat64Frac( b );
5041 bExp = extractFloat64Exp( b );
5042 if ( aExp == 0x7FF ) {
5043 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5044 return propagateFloat64NaN(a, b, status);
5045 }
5046 float_raise(float_flag_invalid, status);
5047 return float64_default_nan(status);
5048 }
5049 if ( bExp == 0x7FF ) {
5050 if (bSig) {
5051 return propagateFloat64NaN(a, b, status);
5052 }
5053 return a;
5054 }
5055 if ( bExp == 0 ) {
5056 if ( bSig == 0 ) {
5057 float_raise(float_flag_invalid, status);
5058 return float64_default_nan(status);
5059 }
5060 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5061 }
5062 if ( aExp == 0 ) {
5063 if ( aSig == 0 ) return a;
5064 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5065 }
5066 expDiff = aExp - bExp;
5067 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5068 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5069 if ( expDiff < 0 ) {
5070 if ( expDiff < -1 ) return a;
5071 aSig >>= 1;
5072 }
5073 q = ( bSig <= aSig );
5074 if ( q ) aSig -= bSig;
5075 expDiff -= 64;
5076 while ( 0 < expDiff ) {
5077 q = estimateDiv128To64( aSig, 0, bSig );
5078 q = ( 2 < q ) ? q - 2 : 0;
5079 aSig = - ( ( bSig>>2 ) * q );
5080 expDiff -= 62;
5081 }
5082 expDiff += 64;
5083 if ( 0 < expDiff ) {
5084 q = estimateDiv128To64( aSig, 0, bSig );
5085 q = ( 2 < q ) ? q - 2 : 0;
5086 q >>= 64 - expDiff;
5087 bSig >>= 2;
5088 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5089 }
5090 else {
5091 aSig >>= 2;
5092 bSig >>= 2;
5093 }
5094 do {
5095 alternateASig = aSig;
5096 ++q;
5097 aSig -= bSig;
5098 } while ( 0 <= (int64_t) aSig );
5099 sigMean = aSig + alternateASig;
5100 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5101 aSig = alternateASig;
5102 }
5103 zSign = ( (int64_t) aSig < 0 );
5104 if ( zSign ) aSig = - aSig;
5105 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5106
5107 }
5108
5109 /*----------------------------------------------------------------------------
5110 | Returns the binary log of the double-precision floating-point value `a'.
5111 | The operation is performed according to the IEC/IEEE Standard for Binary
5112 | Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 float64 float64_log2(float64 a, float_status *status)
5115 {
5116 flag aSign, zSign;
5117 int aExp;
5118 uint64_t aSig, aSig0, aSig1, zSig, i;
5119 a = float64_squash_input_denormal(a, status);
5120
5121 aSig = extractFloat64Frac( a );
5122 aExp = extractFloat64Exp( a );
5123 aSign = extractFloat64Sign( a );
5124
5125 if ( aExp == 0 ) {
5126 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5127 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5128 }
5129 if ( aSign ) {
5130 float_raise(float_flag_invalid, status);
5131 return float64_default_nan(status);
5132 }
5133 if ( aExp == 0x7FF ) {
5134 if (aSig) {
5135 return propagateFloat64NaN(a, float64_zero, status);
5136 }
5137 return a;
5138 }
5139
5140 aExp -= 0x3FF;
5141 aSig |= LIT64( 0x0010000000000000 );
5142 zSign = aExp < 0;
5143 zSig = (uint64_t)aExp << 52;
5144 for (i = 1LL << 51; i > 0; i >>= 1) {
5145 mul64To128( aSig, aSig, &aSig0, &aSig1 );
5146 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5147 if ( aSig & LIT64( 0x0020000000000000 ) ) {
5148 aSig >>= 1;
5149 zSig |= i;
5150 }
5151 }
5152
5153 if ( zSign )
5154 zSig = -zSig;
5155 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5156 }
5157
5158 /*----------------------------------------------------------------------------
5159 | Returns 1 if the double-precision floating-point value `a' is equal to the
5160 | corresponding value `b', and 0 otherwise. The invalid exception is raised
5161 | if either operand is a NaN. Otherwise, the comparison is performed
5162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163 *----------------------------------------------------------------------------*/
5164
5165 int float64_eq(float64 a, float64 b, float_status *status)
5166 {
5167 uint64_t av, bv;
5168 a = float64_squash_input_denormal(a, status);
5169 b = float64_squash_input_denormal(b, status);
5170
5171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5173 ) {
5174 float_raise(float_flag_invalid, status);
5175 return 0;
5176 }
5177 av = float64_val(a);
5178 bv = float64_val(b);
5179 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5180
5181 }
5182
5183 /*----------------------------------------------------------------------------
5184 | Returns 1 if the double-precision floating-point value `a' is less than or
5185 | equal to the corresponding value `b', and 0 otherwise. The invalid
5186 | exception is raised if either operand is a NaN. The comparison is performed
5187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5188 *----------------------------------------------------------------------------*/
5189
5190 int float64_le(float64 a, float64 b, float_status *status)
5191 {
5192 flag aSign, bSign;
5193 uint64_t av, bv;
5194 a = float64_squash_input_denormal(a, status);
5195 b = float64_squash_input_denormal(b, status);
5196
5197 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5198 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5199 ) {
5200 float_raise(float_flag_invalid, status);
5201 return 0;
5202 }
5203 aSign = extractFloat64Sign( a );
5204 bSign = extractFloat64Sign( b );
5205 av = float64_val(a);
5206 bv = float64_val(b);
5207 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5208 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5209
5210 }
5211
5212 /*----------------------------------------------------------------------------
5213 | Returns 1 if the double-precision floating-point value `a' is less than
5214 | the corresponding value `b', and 0 otherwise. The invalid exception is
5215 | raised if either operand is a NaN. The comparison is performed according
5216 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5217 *----------------------------------------------------------------------------*/
5218
5219 int float64_lt(float64 a, float64 b, float_status *status)
5220 {
5221 flag aSign, bSign;
5222 uint64_t av, bv;
5223
5224 a = float64_squash_input_denormal(a, status);
5225 b = float64_squash_input_denormal(b, status);
5226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5228 ) {
5229 float_raise(float_flag_invalid, status);
5230 return 0;
5231 }
5232 aSign = extractFloat64Sign( a );
5233 bSign = extractFloat64Sign( b );
5234 av = float64_val(a);
5235 bv = float64_val(b);
5236 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5237 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5238
5239 }
5240
5241 /*----------------------------------------------------------------------------
5242 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5243 | be compared, and 0 otherwise. The invalid exception is raised if either
5244 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5245 | Standard for Binary Floating-Point Arithmetic.
5246 *----------------------------------------------------------------------------*/
5247
5248 int float64_unordered(float64 a, float64 b, float_status *status)
5249 {
5250 a = float64_squash_input_denormal(a, status);
5251 b = float64_squash_input_denormal(b, status);
5252
5253 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5254 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5255 ) {
5256 float_raise(float_flag_invalid, status);
5257 return 1;
5258 }
5259 return 0;
5260 }
5261
5262 /*----------------------------------------------------------------------------
5263 | Returns 1 if the double-precision floating-point value `a' is equal to the
5264 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5265 | exception.The comparison is performed according to the IEC/IEEE Standard
5266 | for Binary Floating-Point Arithmetic.
5267 *----------------------------------------------------------------------------*/
5268
5269 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5270 {
5271 uint64_t av, bv;
5272 a = float64_squash_input_denormal(a, status);
5273 b = float64_squash_input_denormal(b, status);
5274
5275 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5276 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5277 ) {
5278 if (float64_is_signaling_nan(a, status)
5279 || float64_is_signaling_nan(b, status)) {
5280 float_raise(float_flag_invalid, status);
5281 }
5282 return 0;
5283 }
5284 av = float64_val(a);
5285 bv = float64_val(b);
5286 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5287
5288 }
5289
5290 /*----------------------------------------------------------------------------
5291 | Returns 1 if the double-precision floating-point value `a' is less than or
5292 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5293 | cause an exception. Otherwise, the comparison is performed according to the
5294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296
5297 int float64_le_quiet(float64 a, float64 b, float_status *status)
5298 {
5299 flag aSign, bSign;
5300 uint64_t av, bv;
5301 a = float64_squash_input_denormal(a, status);
5302 b = float64_squash_input_denormal(b, status);
5303
5304 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5305 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5306 ) {
5307 if (float64_is_signaling_nan(a, status)
5308 || float64_is_signaling_nan(b, status)) {
5309 float_raise(float_flag_invalid, status);
5310 }
5311 return 0;
5312 }
5313 aSign = extractFloat64Sign( a );
5314 bSign = extractFloat64Sign( b );
5315 av = float64_val(a);
5316 bv = float64_val(b);
5317 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5318 return ( av == bv ) || ( aSign ^ ( av < bv ) );
5319
5320 }
5321
5322 /*----------------------------------------------------------------------------
5323 | Returns 1 if the double-precision floating-point value `a' is less than
5324 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5325 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5326 | Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328
5329 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5330 {
5331 flag aSign, bSign;
5332 uint64_t av, bv;
5333 a = float64_squash_input_denormal(a, status);
5334 b = float64_squash_input_denormal(b, status);
5335
5336 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5337 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5338 ) {
5339 if (float64_is_signaling_nan(a, status)
5340 || float64_is_signaling_nan(b, status)) {
5341 float_raise(float_flag_invalid, status);
5342 }
5343 return 0;
5344 }
5345 aSign = extractFloat64Sign( a );
5346 bSign = extractFloat64Sign( b );
5347 av = float64_val(a);
5348 bv = float64_val(b);
5349 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5350 return ( av != bv ) && ( aSign ^ ( av < bv ) );
5351
5352 }
5353
5354 /*----------------------------------------------------------------------------
5355 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5356 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
5357 | comparison is performed according to the IEC/IEEE Standard for Binary
5358 | Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5360
5361 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5362 {
5363 a = float64_squash_input_denormal(a, status);
5364 b = float64_squash_input_denormal(b, status);
5365
5366 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5367 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5368 ) {
5369 if (float64_is_signaling_nan(a, status)
5370 || float64_is_signaling_nan(b, status)) {
5371 float_raise(float_flag_invalid, status);
5372 }
5373 return 1;
5374 }
5375 return 0;
5376 }
5377
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format. The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic---which means in particular that the conversion
5383 | is rounded according to the current rounding mode. If `a' is a NaN, the
5384 | largest positive integer is returned. Otherwise, if the conversion
5385 | overflows, the largest integer with the same sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5387
5388 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5389 {
5390 flag aSign;
5391 int32_t aExp, shiftCount;
5392 uint64_t aSig;
5393
5394 if (floatx80_invalid_encoding(a)) {
5395 float_raise(float_flag_invalid, status);
5396 return 1 << 31;
5397 }
5398 aSig = extractFloatx80Frac( a );
5399 aExp = extractFloatx80Exp( a );
5400 aSign = extractFloatx80Sign( a );
5401 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5402 shiftCount = 0x4037 - aExp;
5403 if ( shiftCount <= 0 ) shiftCount = 1;
5404 shift64RightJamming( aSig, shiftCount, &aSig );
5405 return roundAndPackInt32(aSign, aSig, status);
5406
5407 }
5408
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format. The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic, except that the conversion is always rounded
5414 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5415 | Otherwise, if the conversion overflows, the largest integer with the same
5416 | sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5418
5419 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5420 {
5421 flag aSign;
5422 int32_t aExp, shiftCount;
5423 uint64_t aSig, savedASig;
5424 int32_t z;
5425
5426 if (floatx80_invalid_encoding(a)) {
5427 float_raise(float_flag_invalid, status);
5428 return 1 << 31;
5429 }
5430 aSig = extractFloatx80Frac( a );
5431 aExp = extractFloatx80Exp( a );
5432 aSign = extractFloatx80Sign( a );
5433 if ( 0x401E < aExp ) {
5434 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5435 goto invalid;
5436 }
5437 else if ( aExp < 0x3FFF ) {
5438 if (aExp || aSig) {
5439 status->float_exception_flags |= float_flag_inexact;
5440 }
5441 return 0;
5442 }
5443 shiftCount = 0x403E - aExp;
5444 savedASig = aSig;
5445 aSig >>= shiftCount;
5446 z = aSig;
5447 if ( aSign ) z = - z;
5448 if ( ( z < 0 ) ^ aSign ) {
5449 invalid:
5450 float_raise(float_flag_invalid, status);
5451 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5452 }
5453 if ( ( aSig<<shiftCount ) != savedASig ) {
5454 status->float_exception_flags |= float_flag_inexact;
5455 }
5456 return z;
5457
5458 }
5459
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of converting the extended double-precision floating-
5462 | point value `a' to the 64-bit two's complement integer format. The
5463 | conversion is performed according to the IEC/IEEE Standard for Binary
5464 | Floating-Point Arithmetic---which means in particular that the conversion
5465 | is rounded according to the current rounding mode. If `a' is a NaN,
5466 | the largest positive integer is returned. Otherwise, if the conversion
5467 | overflows, the largest integer with the same sign as `a' is returned.
5468 *----------------------------------------------------------------------------*/
5469
5470 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5471 {
5472 flag aSign;
5473 int32_t aExp, shiftCount;
5474 uint64_t aSig, aSigExtra;
5475
5476 if (floatx80_invalid_encoding(a)) {
5477 float_raise(float_flag_invalid, status);
5478 return 1ULL << 63;
5479 }
5480 aSig = extractFloatx80Frac( a );
5481 aExp = extractFloatx80Exp( a );
5482 aSign = extractFloatx80Sign( a );
5483 shiftCount = 0x403E - aExp;
5484 if ( shiftCount <= 0 ) {
5485 if ( shiftCount ) {
5486 float_raise(float_flag_invalid, status);
5487 if (!aSign || floatx80_is_any_nan(a)) {
5488 return LIT64( 0x7FFFFFFFFFFFFFFF );
5489 }
5490 return (int64_t) LIT64( 0x8000000000000000 );
5491 }
5492 aSigExtra = 0;
5493 }
5494 else {
5495 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5496 }
5497 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5498
5499 }
5500
5501 /*----------------------------------------------------------------------------
5502 | Returns the result of converting the extended double-precision floating-
5503 | point value `a' to the 64-bit two's complement integer format. The
5504 | conversion is performed according to the IEC/IEEE Standard for Binary
5505 | Floating-Point Arithmetic, except that the conversion is always rounded
5506 | toward zero. If `a' is a NaN, the largest positive integer is returned.
5507 | Otherwise, if the conversion overflows, the largest integer with the same
5508 | sign as `a' is returned.
5509 *----------------------------------------------------------------------------*/
5510
5511 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5512 {
5513 flag aSign;
5514 int32_t aExp, shiftCount;
5515 uint64_t aSig;
5516 int64_t z;
5517
5518 if (floatx80_invalid_encoding(a)) {
5519 float_raise(float_flag_invalid, status);
5520 return 1ULL << 63;
5521 }
5522 aSig = extractFloatx80Frac( a );
5523 aExp = extractFloatx80Exp( a );
5524 aSign = extractFloatx80Sign( a );
5525 shiftCount = aExp - 0x403E;
5526 if ( 0 <= shiftCount ) {
5527 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5528 if ( ( a.high != 0xC03E ) || aSig ) {
5529 float_raise(float_flag_invalid, status);
5530 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5531 return LIT64( 0x7FFFFFFFFFFFFFFF );
5532 }
5533 }
5534 return (int64_t) LIT64( 0x8000000000000000 );
5535 }
5536 else if ( aExp < 0x3FFF ) {
5537 if (aExp | aSig) {
5538 status->float_exception_flags |= float_flag_inexact;
5539 }
5540 return 0;
5541 }
5542 z = aSig>>( - shiftCount );
5543 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5544 status->float_exception_flags |= float_flag_inexact;
5545 }
5546 if ( aSign ) z = - z;
5547 return z;
5548
5549 }
5550
5551 /*----------------------------------------------------------------------------
5552 | Returns the result of converting the extended double-precision floating-
5553 | point value `a' to the single-precision floating-point format. The
5554 | conversion is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5557
5558 float32 floatx80_to_float32(floatx80 a, float_status *status)
5559 {
5560 flag aSign;
5561 int32_t aExp;
5562 uint64_t aSig;
5563
5564 if (floatx80_invalid_encoding(a)) {
5565 float_raise(float_flag_invalid, status);
5566 return float32_default_nan(status);
5567 }
5568 aSig = extractFloatx80Frac( a );
5569 aExp = extractFloatx80Exp( a );
5570 aSign = extractFloatx80Sign( a );
5571 if ( aExp == 0x7FFF ) {
5572 if ( (uint64_t) ( aSig<<1 ) ) {
5573 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5574 }
5575 return packFloat32( aSign, 0xFF, 0 );
5576 }
5577 shift64RightJamming( aSig, 33, &aSig );
5578 if ( aExp || aSig ) aExp -= 0x3F81;
5579 return roundAndPackFloat32(aSign, aExp, aSig, status);
5580
5581 }
5582
5583 /*----------------------------------------------------------------------------
5584 | Returns the result of converting the extended double-precision floating-
5585 | point value `a' to the double-precision floating-point format. The
5586 | conversion is performed according to the IEC/IEEE Standard for Binary
5587 | Floating-Point Arithmetic.
5588 *----------------------------------------------------------------------------*/
5589
5590 float64 floatx80_to_float64(floatx80 a, float_status *status)
5591 {
5592 flag aSign;
5593 int32_t aExp;
5594 uint64_t aSig, zSig;
5595
5596 if (floatx80_invalid_encoding(a)) {
5597 float_raise(float_flag_invalid, status);
5598 return float64_default_nan(status);
5599 }
5600 aSig = extractFloatx80Frac( a );
5601 aExp = extractFloatx80Exp( a );
5602 aSign = extractFloatx80Sign( a );
5603 if ( aExp == 0x7FFF ) {
5604 if ( (uint64_t) ( aSig<<1 ) ) {
5605 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5606 }
5607 return packFloat64( aSign, 0x7FF, 0 );
5608 }
5609 shift64RightJamming( aSig, 1, &zSig );
5610 if ( aExp || aSig ) aExp -= 0x3C01;
5611 return roundAndPackFloat64(aSign, aExp, zSig, status);
5612
5613 }
5614
5615 /*----------------------------------------------------------------------------
5616 | Returns the result of converting the extended double-precision floating-
5617 | point value `a' to the quadruple-precision floating-point format. The
5618 | conversion is performed according to the IEC/IEEE Standard for Binary
5619 | Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5621
5622 float128 floatx80_to_float128(floatx80 a, float_status *status)
5623 {
5624 flag aSign;
5625 int aExp;
5626 uint64_t aSig, zSig0, zSig1;
5627
5628 if (floatx80_invalid_encoding(a)) {
5629 float_raise(float_flag_invalid, status);
5630 return float128_default_nan(status);
5631 }
5632 aSig = extractFloatx80Frac( a );
5633 aExp = extractFloatx80Exp( a );
5634 aSign = extractFloatx80Sign( a );
5635 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5636 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5637 }
5638 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5639 return packFloat128( aSign, aExp, zSig0, zSig1 );
5640
5641 }
5642
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5650
5651 floatx80 floatx80_round(floatx80 a, float_status *status)
5652 {
5653 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5654 extractFloatx80Sign(a),
5655 extractFloatx80Exp(a),
5656 extractFloatx80Frac(a), 0, status);
5657 }
5658
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended quadruple-precision floating-point
5662 | value. The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5665
5666 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5667 {
5668 flag aSign;
5669 int32_t aExp;
5670 uint64_t lastBitMask, roundBitsMask;
5671 floatx80 z;
5672
5673 if (floatx80_invalid_encoding(a)) {
5674 float_raise(float_flag_invalid, status);
5675 return floatx80_default_nan(status);
5676 }
5677 aExp = extractFloatx80Exp( a );
5678 if ( 0x403E <= aExp ) {
5679 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5680 return propagateFloatx80NaN(a, a, status);
5681 }
5682 return a;
5683 }
5684 if ( aExp < 0x3FFF ) {
5685 if ( ( aExp == 0 )
5686 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5687 return a;
5688 }
5689 status->float_exception_flags |= float_flag_inexact;
5690 aSign = extractFloatx80Sign( a );
5691 switch (status->float_rounding_mode) {
5692 case float_round_nearest_even:
5693 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5694 ) {
5695 return
5696 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5697 }
5698 break;
5699 case float_round_ties_away:
5700 if (aExp == 0x3FFE) {
5701 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5702 }
5703 break;
5704 case float_round_down:
5705 return
5706 aSign ?
5707 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5708 : packFloatx80( 0, 0, 0 );
5709 case float_round_up:
5710 return
5711 aSign ? packFloatx80( 1, 0, 0 )
5712 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5713 }
5714 return packFloatx80( aSign, 0, 0 );
5715 }
5716 lastBitMask = 1;
5717 lastBitMask <<= 0x403E - aExp;
5718 roundBitsMask = lastBitMask - 1;
5719 z = a;
5720 switch (status->float_rounding_mode) {
5721 case float_round_nearest_even:
5722 z.low += lastBitMask>>1;
5723 if ((z.low & roundBitsMask) == 0) {
5724 z.low &= ~lastBitMask;
5725 }
5726 break;
5727 case float_round_ties_away:
5728 z.low += lastBitMask >> 1;
5729 break;
5730 case float_round_to_zero:
5731 break;
5732 case float_round_up:
5733 if (!extractFloatx80Sign(z)) {
5734 z.low += roundBitsMask;
5735 }
5736 break;
5737 case float_round_down:
5738 if (extractFloatx80Sign(z)) {
5739 z.low += roundBitsMask;
5740 }
5741 break;
5742 default:
5743 abort();
5744 }
5745 z.low &= ~ roundBitsMask;
5746 if ( z.low == 0 ) {
5747 ++z.high;
5748 z.low = LIT64( 0x8000000000000000 );
5749 }
5750 if (z.low != a.low) {
5751 status->float_exception_flags |= float_flag_inexact;
5752 }
5753 return z;
5754
5755 }
5756
5757 /*----------------------------------------------------------------------------
5758 | Returns the result of adding the absolute values of the extended double-
5759 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5760 | negated before being returned. `zSign' is ignored if the result is a NaN.
5761 | The addition is performed according to the IEC/IEEE Standard for Binary
5762 | Floating-Point Arithmetic.
5763 *----------------------------------------------------------------------------*/
5764
5765 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5766 float_status *status)
5767 {
5768 int32_t aExp, bExp, zExp;
5769 uint64_t aSig, bSig, zSig0, zSig1;
5770 int32_t expDiff;
5771
5772 aSig = extractFloatx80Frac( a );
5773 aExp = extractFloatx80Exp( a );
5774 bSig = extractFloatx80Frac( b );
5775 bExp = extractFloatx80Exp( b );
5776 expDiff = aExp - bExp;
5777 if ( 0 < expDiff ) {
5778 if ( aExp == 0x7FFF ) {
5779 if ((uint64_t)(aSig << 1)) {
5780 return propagateFloatx80NaN(a, b, status);
5781 }
5782 return a;
5783 }
5784 if ( bExp == 0 ) --expDiff;
5785 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5786 zExp = aExp;
5787 }
5788 else if ( expDiff < 0 ) {
5789 if ( bExp == 0x7FFF ) {
5790 if ((uint64_t)(bSig << 1)) {
5791 return propagateFloatx80NaN(a, b, status);
5792 }
5793 return packFloatx80(zSign,
5794 floatx80_infinity_high,
5795 floatx80_infinity_low);
5796 }
5797 if ( aExp == 0 ) ++expDiff;
5798 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5799 zExp = bExp;
5800 }
5801 else {
5802 if ( aExp == 0x7FFF ) {
5803 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5804 return propagateFloatx80NaN(a, b, status);
5805 }
5806 return a;
5807 }
5808 zSig1 = 0;
5809 zSig0 = aSig + bSig;
5810 if ( aExp == 0 ) {
5811 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5812 goto roundAndPack;
5813 }
5814 zExp = aExp;
5815 goto shiftRight1;
5816 }
5817 zSig0 = aSig + bSig;
5818 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5819 shiftRight1:
5820 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5821 zSig0 |= LIT64( 0x8000000000000000 );
5822 ++zExp;
5823 roundAndPack:
5824 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5825 zSign, zExp, zSig0, zSig1, status);
5826 }
5827
5828 /*----------------------------------------------------------------------------
5829 | Returns the result of subtracting the absolute values of the extended
5830 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5831 | difference is negated before being returned. `zSign' is ignored if the
5832 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5833 | Standard for Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5835
5836 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5837 float_status *status)
5838 {
5839 int32_t aExp, bExp, zExp;
5840 uint64_t aSig, bSig, zSig0, zSig1;
5841 int32_t expDiff;
5842
5843 aSig = extractFloatx80Frac( a );
5844 aExp = extractFloatx80Exp( a );
5845 bSig = extractFloatx80Frac( b );
5846 bExp = extractFloatx80Exp( b );
5847 expDiff = aExp - bExp;
5848 if ( 0 < expDiff ) goto aExpBigger;
5849 if ( expDiff < 0 ) goto bExpBigger;
5850 if ( aExp == 0x7FFF ) {
5851 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5852 return propagateFloatx80NaN(a, b, status);
5853 }
5854 float_raise(float_flag_invalid, status);
5855 return floatx80_default_nan(status);
5856 }
5857 if ( aExp == 0 ) {
5858 aExp = 1;
5859 bExp = 1;
5860 }
5861 zSig1 = 0;
5862 if ( bSig < aSig ) goto aBigger;
5863 if ( aSig < bSig ) goto bBigger;
5864 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5865 bExpBigger:
5866 if ( bExp == 0x7FFF ) {
5867 if ((uint64_t)(bSig << 1)) {
5868 return propagateFloatx80NaN(a, b, status);
5869 }
5870 return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5871 floatx80_infinity_low);
5872 }
5873 if ( aExp == 0 ) ++expDiff;
5874 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5875 bBigger:
5876 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5877 zExp = bExp;
5878 zSign ^= 1;
5879 goto normalizeRoundAndPack;
5880 aExpBigger:
5881 if ( aExp == 0x7FFF ) {
5882 if ((uint64_t)(aSig << 1)) {
5883 return propagateFloatx80NaN(a, b, status);
5884 }
5885 return a;
5886 }
5887 if ( bExp == 0 ) --expDiff;
5888 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5889 aBigger:
5890 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5891 zExp = aExp;
5892 normalizeRoundAndPack:
5893 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5894 zSign, zExp, zSig0, zSig1, status);
5895 }
5896
5897 /*----------------------------------------------------------------------------
5898 | Returns the result of adding the extended double-precision floating-point
5899 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5900 | Standard for Binary Floating-Point Arithmetic.
5901 *----------------------------------------------------------------------------*/
5902
5903 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5904 {
5905 flag aSign, bSign;
5906
5907 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5908 float_raise(float_flag_invalid, status);
5909 return floatx80_default_nan(status);
5910 }
5911 aSign = extractFloatx80Sign( a );
5912 bSign = extractFloatx80Sign( b );
5913 if ( aSign == bSign ) {
5914 return addFloatx80Sigs(a, b, aSign, status);
5915 }
5916 else {
5917 return subFloatx80Sigs(a, b, aSign, status);
5918 }
5919
5920 }
5921
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of subtracting the extended double-precision floating-
5924 | point values `a' and `b'. The operation is performed according to the
5925 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5926 *----------------------------------------------------------------------------*/
5927
5928 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5929 {
5930 flag aSign, bSign;
5931
5932 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5933 float_raise(float_flag_invalid, status);
5934 return floatx80_default_nan(status);
5935 }
5936 aSign = extractFloatx80Sign( a );
5937 bSign = extractFloatx80Sign( b );
5938 if ( aSign == bSign ) {
5939 return subFloatx80Sigs(a, b, aSign, status);
5940 }
5941 else {
5942 return addFloatx80Sigs(a, b, aSign, status);
5943 }
5944
5945 }
5946
5947 /*----------------------------------------------------------------------------
5948 | Returns the result of multiplying the extended double-precision floating-
5949 | point values `a' and `b'. The operation is performed according to the
5950 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5952
5953 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5954 {
5955 flag aSign, bSign, zSign;
5956 int32_t aExp, bExp, zExp;
5957 uint64_t aSig, bSig, zSig0, zSig1;
5958
5959 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5960 float_raise(float_flag_invalid, status);
5961 return floatx80_default_nan(status);
5962 }
5963 aSig = extractFloatx80Frac( a );
5964 aExp = extractFloatx80Exp( a );
5965 aSign = extractFloatx80Sign( a );
5966 bSig = extractFloatx80Frac( b );
5967 bExp = extractFloatx80Exp( b );
5968 bSign = extractFloatx80Sign( b );
5969 zSign = aSign ^ bSign;
5970 if ( aExp == 0x7FFF ) {
5971 if ( (uint64_t) ( aSig<<1 )
5972 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5973 return propagateFloatx80NaN(a, b, status);
5974 }
5975 if ( ( bExp | bSig ) == 0 ) goto invalid;
5976 return packFloatx80(zSign, floatx80_infinity_high,
5977 floatx80_infinity_low);
5978 }
5979 if ( bExp == 0x7FFF ) {
5980 if ((uint64_t)(bSig << 1)) {
5981 return propagateFloatx80NaN(a, b, status);
5982 }
5983 if ( ( aExp | aSig ) == 0 ) {
5984 invalid:
5985 float_raise(float_flag_invalid, status);
5986 return floatx80_default_nan(status);
5987 }
5988 return packFloatx80(zSign, floatx80_infinity_high,
5989 floatx80_infinity_low);
5990 }
5991 if ( aExp == 0 ) {
5992 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5993 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5994 }
5995 if ( bExp == 0 ) {
5996 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5997 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5998 }
5999 zExp = aExp + bExp - 0x3FFE;
6000 mul64To128( aSig, bSig, &zSig0, &zSig1 );
6001 if ( 0 < (int64_t) zSig0 ) {
6002 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6003 --zExp;
6004 }
6005 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6006 zSign, zExp, zSig0, zSig1, status);
6007 }
6008
6009 /*----------------------------------------------------------------------------
6010 | Returns the result of dividing the extended double-precision floating-point
6011 | value `a' by the corresponding value `b'. The operation is performed
6012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6013 *----------------------------------------------------------------------------*/
6014
6015 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6016 {
6017 flag aSign, bSign, zSign;
6018 int32_t aExp, bExp, zExp;
6019 uint64_t aSig, bSig, zSig0, zSig1;
6020 uint64_t rem0, rem1, rem2, term0, term1, term2;
6021
6022 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6023 float_raise(float_flag_invalid, status);
6024 return floatx80_default_nan(status);
6025 }
6026 aSig = extractFloatx80Frac( a );
6027 aExp = extractFloatx80Exp( a );
6028 aSign = extractFloatx80Sign( a );
6029 bSig = extractFloatx80Frac( b );
6030 bExp = extractFloatx80Exp( b );
6031 bSign = extractFloatx80Sign( b );
6032 zSign = aSign ^ bSign;
6033 if ( aExp == 0x7FFF ) {
6034 if ((uint64_t)(aSig << 1)) {
6035 return propagateFloatx80NaN(a, b, status);
6036 }
6037 if ( bExp == 0x7FFF ) {
6038 if ((uint64_t)(bSig << 1)) {
6039 return propagateFloatx80NaN(a, b, status);
6040 }
6041 goto invalid;
6042 }
6043 return packFloatx80(zSign, floatx80_infinity_high,
6044 floatx80_infinity_low);
6045 }
6046 if ( bExp == 0x7FFF ) {
6047 if ((uint64_t)(bSig << 1)) {
6048 return propagateFloatx80NaN(a, b, status);
6049 }
6050 return packFloatx80( zSign, 0, 0 );
6051 }
6052 if ( bExp == 0 ) {
6053 if ( bSig == 0 ) {
6054 if ( ( aExp | aSig ) == 0 ) {
6055 invalid:
6056 float_raise(float_flag_invalid, status);
6057 return floatx80_default_nan(status);
6058 }
6059 float_raise(float_flag_divbyzero, status);
6060 return packFloatx80(zSign, floatx80_infinity_high,
6061 floatx80_infinity_low);
6062 }
6063 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6064 }
6065 if ( aExp == 0 ) {
6066 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6067 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6068 }
6069 zExp = aExp - bExp + 0x3FFE;
6070 rem1 = 0;
6071 if ( bSig <= aSig ) {
6072 shift128Right( aSig, 0, 1, &aSig, &rem1 );
6073 ++zExp;
6074 }
6075 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6076 mul64To128( bSig, zSig0, &term0, &term1 );
6077 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6078 while ( (int64_t) rem0 < 0 ) {
6079 --zSig0;
6080 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6081 }
6082 zSig1 = estimateDiv128To64( rem1, 0, bSig );
6083 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6084 mul64To128( bSig, zSig1, &term1, &term2 );
6085 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6086 while ( (int64_t) rem1 < 0 ) {
6087 --zSig1;
6088 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6089 }
6090 zSig1 |= ( ( rem1 | rem2 ) != 0 );
6091 }
6092 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6093 zSign, zExp, zSig0, zSig1, status);
6094 }
6095
6096 /*----------------------------------------------------------------------------
6097 | Returns the remainder of the extended double-precision floating-point value
6098 | `a' with respect to the corresponding value `b'. The operation is performed
6099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6100 *----------------------------------------------------------------------------*/
6101
6102 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6103 {
6104 flag aSign, zSign;
6105 int32_t aExp, bExp, expDiff;
6106 uint64_t aSig0, aSig1, bSig;
6107 uint64_t q, term0, term1, alternateASig0, alternateASig1;
6108
6109 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6110 float_raise(float_flag_invalid, status);
6111 return floatx80_default_nan(status);
6112 }
6113 aSig0 = extractFloatx80Frac( a );
6114 aExp = extractFloatx80Exp( a );
6115 aSign = extractFloatx80Sign( a );
6116 bSig = extractFloatx80Frac( b );
6117 bExp = extractFloatx80Exp( b );
6118 if ( aExp == 0x7FFF ) {
6119 if ( (uint64_t) ( aSig0<<1 )
6120 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6121 return propagateFloatx80NaN(a, b, status);
6122 }
6123 goto invalid;
6124 }
6125 if ( bExp == 0x7FFF ) {
6126 if ((uint64_t)(bSig << 1)) {
6127 return propagateFloatx80NaN(a, b, status);
6128 }
6129 return a;
6130 }
6131 if ( bExp == 0 ) {
6132 if ( bSig == 0 ) {
6133 invalid:
6134 float_raise(float_flag_invalid, status);
6135 return floatx80_default_nan(status);
6136 }
6137 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6138 }
6139 if ( aExp == 0 ) {
6140 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6141 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6142 }
6143 bSig |= LIT64( 0x8000000000000000 );
6144 zSign = aSign;
6145 expDiff = aExp - bExp;
6146 aSig1 = 0;
6147 if ( expDiff < 0 ) {
6148 if ( expDiff < -1 ) return a;
6149 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6150 expDiff = 0;
6151 }
6152 q = ( bSig <= aSig0 );
6153 if ( q ) aSig0 -= bSig;
6154 expDiff -= 64;
6155 while ( 0 < expDiff ) {
6156 q = estimateDiv128To64( aSig0, aSig1, bSig );
6157 q = ( 2 < q ) ? q - 2 : 0;
6158 mul64To128( bSig, q, &term0, &term1 );
6159 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6160 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6161 expDiff -= 62;
6162 }
6163 expDiff += 64;
6164 if ( 0 < expDiff ) {
6165 q = estimateDiv128To64( aSig0, aSig1, bSig );
6166 q = ( 2 < q ) ? q - 2 : 0;
6167 q >>= 64 - expDiff;
6168 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6169 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6170 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6171 while ( le128( term0, term1, aSig0, aSig1 ) ) {
6172 ++q;
6173 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6174 }
6175 }
6176 else {
6177 term1 = 0;
6178 term0 = bSig;
6179 }
6180 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6181 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6182 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6183 && ( q & 1 ) )
6184 ) {
6185 aSig0 = alternateASig0;
6186 aSig1 = alternateASig1;
6187 zSign = ! zSign;
6188 }
6189 return
6190 normalizeRoundAndPackFloatx80(
6191 80, zSign, bExp + expDiff, aSig0, aSig1, status);
6192
6193 }
6194
6195 /*----------------------------------------------------------------------------
6196 | Returns the square root of the extended double-precision floating-point
6197 | value `a'. The operation is performed according to the IEC/IEEE Standard
6198 | for Binary Floating-Point Arithmetic.
6199 *----------------------------------------------------------------------------*/
6200
6201 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6202 {
6203 flag aSign;
6204 int32_t aExp, zExp;
6205 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6206 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6207
6208 if (floatx80_invalid_encoding(a)) {
6209 float_raise(float_flag_invalid, status);
6210 return floatx80_default_nan(status);
6211 }
6212 aSig0 = extractFloatx80Frac( a );
6213 aExp = extractFloatx80Exp( a );
6214 aSign = extractFloatx80Sign( a );
6215 if ( aExp == 0x7FFF ) {
6216 if ((uint64_t)(aSig0 << 1)) {
6217 return propagateFloatx80NaN(a, a, status);
6218 }
6219 if ( ! aSign ) return a;
6220 goto invalid;
6221 }
6222 if ( aSign ) {
6223 if ( ( aExp | aSig0 ) == 0 ) return a;
6224 invalid:
6225 float_raise(float_flag_invalid, status);
6226 return floatx80_default_nan(status);
6227 }
6228 if ( aExp == 0 ) {
6229 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6230 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6231 }
6232 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6233 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6234 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6235 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6236 doubleZSig0 = zSig0<<1;
6237 mul64To128( zSig0, zSig0, &term0, &term1 );
6238 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6239 while ( (int64_t) rem0 < 0 ) {
6240 --zSig0;
6241 doubleZSig0 -= 2;
6242 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6243 }
6244 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6245 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6246 if ( zSig1 == 0 ) zSig1 = 1;
6247 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6248 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6249 mul64To128( zSig1, zSig1, &term2, &term3 );
6250 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6251 while ( (int64_t) rem1 < 0 ) {
6252 --zSig1;
6253 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6254 term3 |= 1;
6255 term2 |= doubleZSig0;
6256 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6257 }
6258 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6259 }
6260 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6261 zSig0 |= doubleZSig0;
6262 return roundAndPackFloatx80(status->floatx80_rounding_precision,
6263 0, zExp, zSig0, zSig1, status);
6264 }
6265
6266 /*----------------------------------------------------------------------------
6267 | Returns 1 if the extended double-precision floating-point value `a' is equal
6268 | to the corresponding value `b', and 0 otherwise. The invalid exception is
6269 | raised if either operand is a NaN. Otherwise, the comparison is performed
6270 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6272
6273 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6274 {
6275
6276 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6277 || (extractFloatx80Exp(a) == 0x7FFF
6278 && (uint64_t) (extractFloatx80Frac(a) << 1))
6279 || (extractFloatx80Exp(b) == 0x7FFF
6280 && (uint64_t) (extractFloatx80Frac(b) << 1))
6281 ) {
6282 float_raise(float_flag_invalid, status);
6283 return 0;
6284 }
6285 return
6286 ( a.low == b.low )
6287 && ( ( a.high == b.high )
6288 || ( ( a.low == 0 )
6289 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6290 );
6291
6292 }
6293
6294 /*----------------------------------------------------------------------------
6295 | Returns 1 if the extended double-precision floating-point value `a' is
6296 | less than or equal to the corresponding value `b', and 0 otherwise. The
6297 | invalid exception is raised if either operand is a NaN. The comparison is
6298 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6299 | Arithmetic.
6300 *----------------------------------------------------------------------------*/
6301
6302 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6303 {
6304 flag aSign, bSign;
6305
6306 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6307 || (extractFloatx80Exp(a) == 0x7FFF
6308 && (uint64_t) (extractFloatx80Frac(a) << 1))
6309 || (extractFloatx80Exp(b) == 0x7FFF
6310 && (uint64_t) (extractFloatx80Frac(b) << 1))
6311 ) {
6312 float_raise(float_flag_invalid, status);
6313 return 0;
6314 }
6315 aSign = extractFloatx80Sign( a );
6316 bSign = extractFloatx80Sign( b );
6317 if ( aSign != bSign ) {
6318 return
6319 aSign
6320 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6321 == 0 );
6322 }
6323 return
6324 aSign ? le128( b.high, b.low, a.high, a.low )
6325 : le128( a.high, a.low, b.high, b.low );
6326
6327 }
6328
6329 /*----------------------------------------------------------------------------
6330 | Returns 1 if the extended double-precision floating-point value `a' is
6331 | less than the corresponding value `b', and 0 otherwise. The invalid
6332 | exception is raised if either operand is a NaN. The comparison is performed
6333 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6335
6336 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6337 {
6338 flag aSign, bSign;
6339
6340 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6341 || (extractFloatx80Exp(a) == 0x7FFF
6342 && (uint64_t) (extractFloatx80Frac(a) << 1))
6343 || (extractFloatx80Exp(b) == 0x7FFF
6344 && (uint64_t) (extractFloatx80Frac(b) << 1))
6345 ) {
6346 float_raise(float_flag_invalid, status);
6347 return 0;
6348 }
6349 aSign = extractFloatx80Sign( a );
6350 bSign = extractFloatx80Sign( b );
6351 if ( aSign != bSign ) {
6352 return
6353 aSign
6354 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6355 != 0 );
6356 }
6357 return
6358 aSign ? lt128( b.high, b.low, a.high, a.low )
6359 : lt128( a.high, a.low, b.high, b.low );
6360
6361 }
6362
6363 /*----------------------------------------------------------------------------
6364 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6365 | cannot be compared, and 0 otherwise. The invalid exception is raised if
6366 | either operand is a NaN. The comparison is performed according to the
6367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6368 *----------------------------------------------------------------------------*/
6369 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6370 {
6371 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6372 || (extractFloatx80Exp(a) == 0x7FFF
6373 && (uint64_t) (extractFloatx80Frac(a) << 1))
6374 || (extractFloatx80Exp(b) == 0x7FFF
6375 && (uint64_t) (extractFloatx80Frac(b) << 1))
6376 ) {
6377 float_raise(float_flag_invalid, status);
6378 return 1;
6379 }
6380 return 0;
6381 }
6382
6383 /*----------------------------------------------------------------------------
6384 | Returns 1 if the extended double-precision floating-point value `a' is
6385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6386 | cause an exception. The comparison is performed according to the IEC/IEEE
6387 | Standard for Binary Floating-Point Arithmetic.
6388 *----------------------------------------------------------------------------*/
6389
6390 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6391 {
6392
6393 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6394 float_raise(float_flag_invalid, status);
6395 return 0;
6396 }
6397 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6398 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6399 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6400 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6401 ) {
6402 if (floatx80_is_signaling_nan(a, status)
6403 || floatx80_is_signaling_nan(b, status)) {
6404 float_raise(float_flag_invalid, status);
6405 }
6406 return 0;
6407 }
6408 return
6409 ( a.low == b.low )
6410 && ( ( a.high == b.high )
6411 || ( ( a.low == 0 )
6412 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6413 );
6414
6415 }
6416
6417 /*----------------------------------------------------------------------------
6418 | Returns 1 if the extended double-precision floating-point value `a' is less
6419 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6420 | do not cause an exception. Otherwise, the comparison is performed according
6421 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6422 *----------------------------------------------------------------------------*/
6423
6424 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6425 {
6426 flag aSign, bSign;
6427
6428 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6429 float_raise(float_flag_invalid, status);
6430 return 0;
6431 }
6432 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6433 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6434 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6435 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6436 ) {
6437 if (floatx80_is_signaling_nan(a, status)
6438 || floatx80_is_signaling_nan(b, status)) {
6439 float_raise(float_flag_invalid, status);
6440 }
6441 return 0;
6442 }
6443 aSign = extractFloatx80Sign( a );
6444 bSign = extractFloatx80Sign( b );
6445 if ( aSign != bSign ) {
6446 return
6447 aSign
6448 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6449 == 0 );
6450 }
6451 return
6452 aSign ? le128( b.high, b.low, a.high, a.low )
6453 : le128( a.high, a.low, b.high, b.low );
6454
6455 }
6456
6457 /*----------------------------------------------------------------------------
6458 | Returns 1 if the extended double-precision floating-point value `a' is less
6459 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6460 | an exception. Otherwise, the comparison is performed according to the
6461 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6462 *----------------------------------------------------------------------------*/
6463
6464 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6465 {
6466 flag aSign, bSign;
6467
6468 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6469 float_raise(float_flag_invalid, status);
6470 return 0;
6471 }
6472 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6473 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6474 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6475 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6476 ) {
6477 if (floatx80_is_signaling_nan(a, status)
6478 || floatx80_is_signaling_nan(b, status)) {
6479 float_raise(float_flag_invalid, status);
6480 }
6481 return 0;
6482 }
6483 aSign = extractFloatx80Sign( a );
6484 bSign = extractFloatx80Sign( b );
6485 if ( aSign != bSign ) {
6486 return
6487 aSign
6488 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6489 != 0 );
6490 }
6491 return
6492 aSign ? lt128( b.high, b.low, a.high, a.low )
6493 : lt128( a.high, a.low, b.high, b.low );
6494
6495 }
6496
6497 /*----------------------------------------------------------------------------
6498 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6499 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
6500 | The comparison is performed according to the IEC/IEEE Standard for Binary
6501 | Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6504 {
6505 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6506 float_raise(float_flag_invalid, status);
6507 return 1;
6508 }
6509 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
6510 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6511 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
6512 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6513 ) {
6514 if (floatx80_is_signaling_nan(a, status)
6515 || floatx80_is_signaling_nan(b, status)) {
6516 float_raise(float_flag_invalid, status);
6517 }
6518 return 1;
6519 }
6520 return 0;
6521 }
6522
6523 /*----------------------------------------------------------------------------
6524 | Returns the result of converting the quadruple-precision floating-point
6525 | value `a' to the 32-bit two's complement integer format. The conversion
6526 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6527 | Arithmetic---which means in particular that the conversion is rounded
6528 | according to the current rounding mode. If `a' is a NaN, the largest
6529 | positive integer is returned. Otherwise, if the conversion overflows, the
6530 | largest integer with the same sign as `a' is returned.
6531 *----------------------------------------------------------------------------*/
6532
6533 int32_t float128_to_int32(float128 a, float_status *status)
6534 {
6535 flag aSign;
6536 int32_t aExp, shiftCount;
6537 uint64_t aSig0, aSig1;
6538
6539 aSig1 = extractFloat128Frac1( a );
6540 aSig0 = extractFloat128Frac0( a );
6541 aExp = extractFloat128Exp( a );
6542 aSign = extractFloat128Sign( a );
6543 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6544 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6545 aSig0 |= ( aSig1 != 0 );
6546 shiftCount = 0x4028 - aExp;
6547 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6548 return roundAndPackInt32(aSign, aSig0, status);
6549
6550 }
6551
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of converting the quadruple-precision floating-point
6554 | value `a' to the 32-bit two's complement integer format. The conversion
6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6556 | Arithmetic, except that the conversion is always rounded toward zero. If
6557 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6558 | conversion overflows, the largest integer with the same sign as `a' is
6559 | returned.
6560 *----------------------------------------------------------------------------*/
6561
6562 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6563 {
6564 flag aSign;
6565 int32_t aExp, shiftCount;
6566 uint64_t aSig0, aSig1, savedASig;
6567 int32_t z;
6568
6569 aSig1 = extractFloat128Frac1( a );
6570 aSig0 = extractFloat128Frac0( a );
6571 aExp = extractFloat128Exp( a );
6572 aSign = extractFloat128Sign( a );
6573 aSig0 |= ( aSig1 != 0 );
6574 if ( 0x401E < aExp ) {
6575 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6576 goto invalid;
6577 }
6578 else if ( aExp < 0x3FFF ) {
6579 if (aExp || aSig0) {
6580 status->float_exception_flags |= float_flag_inexact;
6581 }
6582 return 0;
6583 }
6584 aSig0 |= LIT64( 0x0001000000000000 );
6585 shiftCount = 0x402F - aExp;
6586 savedASig = aSig0;
6587 aSig0 >>= shiftCount;
6588 z = aSig0;
6589 if ( aSign ) z = - z;
6590 if ( ( z < 0 ) ^ aSign ) {
6591 invalid:
6592 float_raise(float_flag_invalid, status);
6593 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6594 }
6595 if ( ( aSig0<<shiftCount ) != savedASig ) {
6596 status->float_exception_flags |= float_flag_inexact;
6597 }
6598 return z;
6599
6600 }
6601
6602 /*----------------------------------------------------------------------------
6603 | Returns the result of converting the quadruple-precision floating-point
6604 | value `a' to the 64-bit two's complement integer format. The conversion
6605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6606 | Arithmetic---which means in particular that the conversion is rounded
6607 | according to the current rounding mode. If `a' is a NaN, the largest
6608 | positive integer is returned. Otherwise, if the conversion overflows, the
6609 | largest integer with the same sign as `a' is returned.
6610 *----------------------------------------------------------------------------*/
6611
6612 int64_t float128_to_int64(float128 a, float_status *status)
6613 {
6614 flag aSign;
6615 int32_t aExp, shiftCount;
6616 uint64_t aSig0, aSig1;
6617
6618 aSig1 = extractFloat128Frac1( a );
6619 aSig0 = extractFloat128Frac0( a );
6620 aExp = extractFloat128Exp( a );
6621 aSign = extractFloat128Sign( a );
6622 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6623 shiftCount = 0x402F - aExp;
6624 if ( shiftCount <= 0 ) {
6625 if ( 0x403E < aExp ) {
6626 float_raise(float_flag_invalid, status);
6627 if ( ! aSign
6628 || ( ( aExp == 0x7FFF )
6629 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6630 )
6631 ) {
6632 return LIT64( 0x7FFFFFFFFFFFFFFF );
6633 }
6634 return (int64_t) LIT64( 0x8000000000000000 );
6635 }
6636 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6637 }
6638 else {
6639 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6640 }
6641 return roundAndPackInt64(aSign, aSig0, aSig1, status);
6642
6643 }
6644
6645 /*----------------------------------------------------------------------------
6646 | Returns the result of converting the quadruple-precision floating-point
6647 | value `a' to the 64-bit two's complement integer format. The conversion
6648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6649 | Arithmetic, except that the conversion is always rounded toward zero.
6650 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6651 | the conversion overflows, the largest integer with the same sign as `a' is
6652 | returned.
6653 *----------------------------------------------------------------------------*/
6654
6655 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6656 {
6657 flag aSign;
6658 int32_t aExp, shiftCount;
6659 uint64_t aSig0, aSig1;
6660 int64_t z;
6661
6662 aSig1 = extractFloat128Frac1( a );
6663 aSig0 = extractFloat128Frac0( a );
6664 aExp = extractFloat128Exp( a );
6665 aSign = extractFloat128Sign( a );
6666 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6667 shiftCount = aExp - 0x402F;
6668 if ( 0 < shiftCount ) {
6669 if ( 0x403E <= aExp ) {
6670 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6671 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
6672 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6673 if (aSig1) {
6674 status->float_exception_flags |= float_flag_inexact;
6675 }
6676 }
6677 else {
6678 float_raise(float_flag_invalid, status);
6679 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6680 return LIT64( 0x7FFFFFFFFFFFFFFF );
6681 }
6682 }
6683 return (int64_t) LIT64( 0x8000000000000000 );
6684 }
6685 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6686 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6687 status->float_exception_flags |= float_flag_inexact;
6688 }
6689 }
6690 else {
6691 if ( aExp < 0x3FFF ) {
6692 if ( aExp | aSig0 | aSig1 ) {
6693 status->float_exception_flags |= float_flag_inexact;
6694 }
6695 return 0;
6696 }
6697 z = aSig0>>( - shiftCount );
6698 if ( aSig1
6699 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6700 status->float_exception_flags |= float_flag_inexact;
6701 }
6702 }
6703 if ( aSign ) z = - z;
6704 return z;
6705
6706 }
6707
6708 /*----------------------------------------------------------------------------
6709 | Returns the result of converting the quadruple-precision floating-point value
6710 | `a' to the 64-bit unsigned integer format. The conversion is
6711 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6712 | Arithmetic---which means in particular that the conversion is rounded
6713 | according to the current rounding mode. If `a' is a NaN, the largest
6714 | positive integer is returned. If the conversion overflows, the
6715 | largest unsigned integer is returned. If 'a' is negative, the value is
6716 | rounded and zero is returned; negative values that do not round to zero
6717 | will raise the inexact exception.
6718 *----------------------------------------------------------------------------*/
6719
6720 uint64_t float128_to_uint64(float128 a, float_status *status)
6721 {
6722 flag aSign;
6723 int aExp;
6724 int shiftCount;
6725 uint64_t aSig0, aSig1;
6726
6727 aSig0 = extractFloat128Frac0(a);
6728 aSig1 = extractFloat128Frac1(a);
6729 aExp = extractFloat128Exp(a);
6730 aSign = extractFloat128Sign(a);
6731 if (aSign && (aExp > 0x3FFE)) {
6732 float_raise(float_flag_invalid, status);
6733 if (float128_is_any_nan(a)) {
6734 return LIT64(0xFFFFFFFFFFFFFFFF);
6735 } else {
6736 return 0;
6737 }
6738 }
6739 if (aExp) {
6740 aSig0 |= LIT64(0x0001000000000000);
6741 }
6742 shiftCount = 0x402F - aExp;
6743 if (shiftCount <= 0) {
6744 if (0x403E < aExp) {
6745 float_raise(float_flag_invalid, status);
6746 return LIT64(0xFFFFFFFFFFFFFFFF);
6747 }
6748 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6749 } else {
6750 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6751 }
6752 return roundAndPackUint64(aSign, aSig0, aSig1, status);
6753 }
6754
6755 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6756 {
6757 uint64_t v;
6758 signed char current_rounding_mode = status->float_rounding_mode;
6759
6760 set_float_rounding_mode(float_round_to_zero, status);
6761 v = float128_to_uint64(a, status);
6762 set_float_rounding_mode(current_rounding_mode, status);
6763
6764 return v;
6765 }
6766
6767 /*----------------------------------------------------------------------------
6768 | Returns the result of converting the quadruple-precision floating-point
6769 | value `a' to the 32-bit unsigned integer format. The conversion
6770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6771 | Arithmetic except that the conversion is always rounded toward zero.
6772 | If `a' is a NaN, the largest positive integer is returned. Otherwise,
6773 | if the conversion overflows, the largest unsigned integer is returned.
6774 | If 'a' is negative, the value is rounded and zero is returned; negative
6775 | values that do not round to zero will raise the inexact exception.
6776 *----------------------------------------------------------------------------*/
6777
6778 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6779 {
6780 uint64_t v;
6781 uint32_t res;
6782 int old_exc_flags = get_float_exception_flags(status);
6783
6784 v = float128_to_uint64_round_to_zero(a, status);
6785 if (v > 0xffffffff) {
6786 res = 0xffffffff;
6787 } else {
6788 return v;
6789 }
6790 set_float_exception_flags(old_exc_flags, status);
6791 float_raise(float_flag_invalid, status);
6792 return res;
6793 }
6794
6795 /*----------------------------------------------------------------------------
6796 | Returns the result of converting the quadruple-precision floating-point
6797 | value `a' to the single-precision floating-point format. The conversion
6798 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6799 | Arithmetic.
6800 *----------------------------------------------------------------------------*/
6801
6802 float32 float128_to_float32(float128 a, float_status *status)
6803 {
6804 flag aSign;
6805 int32_t aExp;
6806 uint64_t aSig0, aSig1;
6807 uint32_t zSig;
6808
6809 aSig1 = extractFloat128Frac1( a );
6810 aSig0 = extractFloat128Frac0( a );
6811 aExp = extractFloat128Exp( a );
6812 aSign = extractFloat128Sign( a );
6813 if ( aExp == 0x7FFF ) {
6814 if ( aSig0 | aSig1 ) {
6815 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6816 }
6817 return packFloat32( aSign, 0xFF, 0 );
6818 }
6819 aSig0 |= ( aSig1 != 0 );
6820 shift64RightJamming( aSig0, 18, &aSig0 );
6821 zSig = aSig0;
6822 if ( aExp || zSig ) {
6823 zSig |= 0x40000000;
6824 aExp -= 0x3F81;
6825 }
6826 return roundAndPackFloat32(aSign, aExp, zSig, status);
6827
6828 }
6829
6830 /*----------------------------------------------------------------------------
6831 | Returns the result of converting the quadruple-precision floating-point
6832 | value `a' to the double-precision floating-point format. The conversion
6833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6834 | Arithmetic.
6835 *----------------------------------------------------------------------------*/
6836
6837 float64 float128_to_float64(float128 a, float_status *status)
6838 {
6839 flag aSign;
6840 int32_t aExp;
6841 uint64_t aSig0, aSig1;
6842
6843 aSig1 = extractFloat128Frac1( a );
6844 aSig0 = extractFloat128Frac0( a );
6845 aExp = extractFloat128Exp( a );
6846 aSign = extractFloat128Sign( a );
6847 if ( aExp == 0x7FFF ) {
6848 if ( aSig0 | aSig1 ) {
6849 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6850 }
6851 return packFloat64( aSign, 0x7FF, 0 );
6852 }
6853 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6854 aSig0 |= ( aSig1 != 0 );
6855 if ( aExp || aSig0 ) {
6856 aSig0 |= LIT64( 0x4000000000000000 );
6857 aExp -= 0x3C01;
6858 }
6859 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6860
6861 }
6862
6863 /*----------------------------------------------------------------------------
6864 | Returns the result of converting the quadruple-precision floating-point
6865 | value `a' to the extended double-precision floating-point format. The
6866 | conversion is performed according to the IEC/IEEE Standard for Binary
6867 | Floating-Point Arithmetic.
6868 *----------------------------------------------------------------------------*/
6869
6870 floatx80 float128_to_floatx80(float128 a, float_status *status)
6871 {
6872 flag aSign;
6873 int32_t aExp;
6874 uint64_t aSig0, aSig1;
6875
6876 aSig1 = extractFloat128Frac1( a );
6877 aSig0 = extractFloat128Frac0( a );
6878 aExp = extractFloat128Exp( a );
6879 aSign = extractFloat128Sign( a );
6880 if ( aExp == 0x7FFF ) {
6881 if ( aSig0 | aSig1 ) {
6882 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6883 }
6884 return packFloatx80(aSign, floatx80_infinity_high,
6885 floatx80_infinity_low);
6886 }
6887 if ( aExp == 0 ) {
6888 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6889 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6890 }
6891 else {
6892 aSig0 |= LIT64( 0x0001000000000000 );
6893 }
6894 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6895 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6896
6897 }
6898
6899 /*----------------------------------------------------------------------------
6900 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6901 | returns the result as a quadruple-precision floating-point value. The
6902 | operation is performed according to the IEC/IEEE Standard for Binary
6903 | Floating-Point Arithmetic.
6904 *----------------------------------------------------------------------------*/
6905
6906 float128 float128_round_to_int(float128 a, float_status *status)
6907 {
/*
 * Rounds `a' to an integral value using status->float_rounding_mode and
 * returns it as a float128.  float_flag_inexact is set when the returned
 * value differs from `a'.
 */
6908 flag aSign;
6909 int32_t aExp;
6910 uint64_t lastBitMask, roundBitsMask;
6911 float128 z;
6912
6913 aExp = extractFloat128Exp( a );
/* Exponent field >= 0x402F: only the low word is masked by the rounding. */
6914 if ( 0x402F <= aExp ) {
/*
 * Exponent field >= 0x406F: `a' is returned unchanged, except that NaNs
 * (exponent 0x7FFF with a non-zero fraction) go through
 * propagateFloat128NaN().
 */
6915 if ( 0x406F <= aExp ) {
6916 if ( ( aExp == 0x7FFF )
6917 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6918 ) {
6919 return propagateFloat128NaN(a, a, status);
6920 }
6921 return a;
6922 }
/*
 * The shift is done in two steps so that aExp == 0x402F (a total shift of
 * 64) produces lastBitMask == 0 instead of an out-of-range shift count.
 * lastBitMask == 0 is handled separately in the rounding cases below.
 */
6923 lastBitMask = 1;
6924 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6925 roundBitsMask = lastBitMask - 1;
6926 z = a;
6927 switch (status->float_rounding_mode) {
6928 case float_round_nearest_even:
/* Add half of the last kept bit; on an exact tie clear that bit. */
6929 if ( lastBitMask ) {
6930 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6931 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6932 }
6933 else {
/*
 * lastBitMask == 0: the top bit of z.low acts as the half-way bit and the
 * increment is applied to z.high directly; on a tie (no other low bits
 * set) bit 0 of z.high is cleared.
 */
6934 if ( (int64_t) z.low < 0 ) {
6935 ++z.high;
6936 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6937 }
6938 }
6939 break;
6940 case float_round_ties_away:
/* Same half-bit increment as above, without the tie correction. */
6941 if (lastBitMask) {
6942 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6943 } else {
6944 if ((int64_t) z.low < 0) {
6945 ++z.high;
6946 }
6947 }
6948 break;
6949 case float_round_to_zero:
6950 break;
6951 case float_round_up:
/* Only non-negative values are bumped before the masking below. */
6952 if (!extractFloat128Sign(z)) {
6953 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6954 }
6955 break;
6956 case float_round_down:
/* Only negative values are bumped before the masking below. */
6957 if (extractFloat128Sign(z)) {
6958 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6959 }
6960 break;
6961 default:
6962 abort();
6963 }
/* Drop every bit below the last kept bit. */
6964 z.low &= ~ roundBitsMask;
6965 }
6966 else {
/*
 * Exponent field < 0x3FFF: the result is one of the fixed values built
 * with packFloat128() below (exponent 0 or 0x3FFF with a zero fraction).
 */
6967 if ( aExp < 0x3FFF ) {
/* Every bit except the top bit of a.high is zero: return `a' as is. */
6968 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6969 status->float_exception_flags |= float_flag_inexact;
6970 aSign = extractFloat128Sign( a );
6971 switch (status->float_rounding_mode) {
6972 case float_round_nearest_even:
/* Exponent 0x3FFE with a non-zero fraction rounds to exponent 0x3FFF. */
6973 if ( ( aExp == 0x3FFE )
6974 && ( extractFloat128Frac0( a )
6975 | extractFloat128Frac1( a ) )
6976 ) {
6977 return packFloat128( aSign, 0x3FFF, 0, 0 );
6978 }
6979 break;
6980 case float_round_ties_away:
/* Here exponent 0x3FFE alone is enough, whatever the fraction. */
6981 if (aExp == 0x3FFE) {
6982 return packFloat128(aSign, 0x3FFF, 0, 0);
6983 }
6984 break;
6985 case float_round_down:
6986 return
6987 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6988 : packFloat128( 0, 0, 0, 0 );
6989 case float_round_up:
6990 return
6991 aSign ? packFloat128( 1, 0, 0, 0 )
6992 : packFloat128( 0, 0x3FFF, 0, 0 );
6993 }
/* Modes not handled by the switch, and its `break' paths, give a signed zero. */
6994 return packFloat128( aSign, 0, 0, 0 );
6995 }
/*
 * 0x3FFF <= exponent field < 0x402F: the last kept bit lies in the high
 * word.  z.low is zeroed outright; a.low only contributes to the tie test
 * and, for the directed modes, as a sticky bit ORed into z.high.
 */
6996 lastBitMask = 1;
6997 lastBitMask <<= 0x402F - aExp;
6998 roundBitsMask = lastBitMask - 1;
6999 z.low = 0;
7000 z.high = a.high;
7001 switch (status->float_rounding_mode) {
7002 case float_round_nearest_even:
7003 z.high += lastBitMask>>1;
7004 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7005 z.high &= ~ lastBitMask;
7006 }
7007 break;
7008 case float_round_ties_away:
7009 z.high += lastBitMask>>1;
7010 break;
7011 case float_round_to_zero:
7012 break;
7013 case float_round_up:
7014 if (!extractFloat128Sign(z)) {
7015 z.high |= ( a.low != 0 );
7016 z.high += roundBitsMask;
7017 }
7018 break;
7019 case float_round_down:
7020 if (extractFloat128Sign(z)) {
7021 z.high |= (a.low != 0);
7022 z.high += roundBitsMask;
7023 }
7024 break;
7025 default:
7026 abort();
7027 }
7028 z.high &= ~ roundBitsMask;
7029 }
/* Any change in either word means the rounding was inexact. */
7030 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7031 status->float_exception_flags |= float_flag_inexact;
7032 }
7033 return z;
7034
7035 }
7036
7037 /*----------------------------------------------------------------------------
7038 | Returns the result of adding the absolute values of the quadruple-precision
7039 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7040 | before being returned. `zSign' is ignored if the result is a NaN.
7041 | The addition is performed according to the IEC/IEEE Standard for Binary
7042 | Floating-Point Arithmetic.
7043 *----------------------------------------------------------------------------*/
7044
7045 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7046 float_status *status)
7047 {
/*
 * Adds the magnitudes of `a' and `b' and gives the result the sign `zSign'.
 * The three branches below handle a's exponent field being larger than,
 * smaller than, or equal to b's.
 */
7048 int32_t aExp, bExp, zExp;
7049 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7050 int32_t expDiff;
7051
7052 aSig1 = extractFloat128Frac1( a );
7053 aSig0 = extractFloat128Frac0( a );
7054 aExp = extractFloat128Exp( a );
7055 bSig1 = extractFloat128Frac1( b );
7056 bSig0 = extractFloat128Frac0( b );
7057 bExp = extractFloat128Exp( b );
7058 expDiff = aExp - bExp;
7059 if ( 0 < expDiff ) {
/* a has the larger exponent field; 0x7FFF means a is infinity or NaN. */
7060 if ( aExp == 0x7FFF ) {
7061 if (aSig0 | aSig1) {
7062 return propagateFloat128NaN(a, b, status);
7063 }
7064 return a;
7065 }
/*
 * With a zero exponent field b gets no implicit leading bit and is shifted
 * by one position less; otherwise the implicit bit is made explicit.
 */
7066 if ( bExp == 0 ) {
7067 --expDiff;
7068 }
7069 else {
7070 bSig0 |= LIT64( 0x0001000000000000 );
7071 }
/* Align b with a; the bits shifted out are collected in zSig2. */
7072 shift128ExtraRightJamming(
7073 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7074 zExp = aExp;
7075 }
7076 else if ( expDiff < 0 ) {
/* Mirror image of the branch above, with b as the larger operand. */
7077 if ( bExp == 0x7FFF ) {
7078 if (bSig0 | bSig1) {
7079 return propagateFloat128NaN(a, b, status);
7080 }
7081 return packFloat128( zSign, 0x7FFF, 0, 0 );
7082 }
7083 if ( aExp == 0 ) {
7084 ++expDiff;
7085 }
7086 else {
7087 aSig0 |= LIT64( 0x0001000000000000 );
7088 }
7089 shift128ExtraRightJamming(
7090 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7091 zExp = bExp;
7092 }
7093 else {
/* Equal exponent fields. */
7094 if ( aExp == 0x7FFF ) {
7095 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7096 return propagateFloat128NaN(a, b, status);
7097 }
7098 return a;
7099 }
7100 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/*
 * Both exponent fields are zero: the fraction sum is packed directly with
 * exponent field 0.  With flush_to_zero set a zero is returned instead and
 * float_flag_output_denormal is raised if the sum was non-zero.
 */
7101 if ( aExp == 0 ) {
7102 if (status->flush_to_zero) {
7103 if (zSig0 | zSig1) {
7104 float_raise(float_flag_output_denormal, status);
7105 }
7106 return packFloat128(zSign, 0, 0, 0);
7107 }
7108 return packFloat128( zSign, 0, zSig0, zSig1 );
7109 }
/*
 * Bit 49 is set here, presumably standing for the sum of the two implicit
 * leading bits (each at bit 48); the result is then shifted right by one.
 */
7110 zSig2 = 0;
7111 zSig0 |= LIT64( 0x0002000000000000 );
7112 zExp = aExp;
7113 goto shiftRight1;
7114 }
/*
 * Reached from both unequal-exponent branches.  Bit 48 of aSig0 is set
 * before the addition.  NOTE(review): in the b-larger branch aSig0 holds
 * the shifted smaller significand, so this presumably supplies the larger
 * operand's implicit leading bit; confirm bit 48 is always clear there.
 */
7115 aSig0 |= LIT64( 0x0001000000000000 );
7116 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
/* If the sum stayed below bit 49 no shift is needed and zExp - 1 is used. */
7117 --zExp;
7118 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7119 ++zExp;
7120 shiftRight1:
7121 shift128ExtraRightJamming(
7122 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7123 roundAndPack:
7124 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7125
7126 }
7127
7128 /*----------------------------------------------------------------------------
7129 | Returns the result of subtracting the absolute values of the quadruple-
7130 | precision floating-point values `a' and `b'. If `zSign' is 1, the
7131 | difference is negated before being returned. `zSign' is ignored if the
7132 | result is a NaN. The subtraction is performed according to the IEC/IEEE
7133 | Standard for Binary Floating-Point Arithmetic.
7134 *----------------------------------------------------------------------------*/
7135
7136 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7137 float_status *status)
7138 {
/*
 * Subtracts the magnitude of `b' from the magnitude of `a'.  The result
 * carries sign `zSign' when a's magnitude is the larger one and the
 * opposite sign when b's is.
 */
7139 int32_t aExp, bExp, zExp;
7140 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7141 int32_t expDiff;
7142
7143 aSig1 = extractFloat128Frac1( a );
7144 aSig0 = extractFloat128Frac0( a );
7145 aExp = extractFloat128Exp( a );
7146 bSig1 = extractFloat128Frac1( b );
7147 bSig0 = extractFloat128Frac0( b );
7148 bExp = extractFloat128Exp( b );
7149 expDiff = aExp - bExp;
/*
 * Both fractions are shifted left by 14 up front; from here on the
 * implicit leading bit sits at 0x4000000000000000 in the high word.
 */
7150 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7151 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7152 if ( 0 < expDiff ) goto aExpBigger;
7153 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponent fields from here to the bExpBigger label. */
7154 if ( aExp == 0x7FFF ) {
/* NaN operands propagate; infinity minus infinity is invalid. */
7155 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7156 return propagateFloat128NaN(a, b, status);
7157 }
7158 float_raise(float_flag_invalid, status);
7159 return float128_default_nan(status);
7160 }
/* Both exponent fields are zero: use 1 as the working exponent. */
7161 if ( aExp == 0 ) {
7162 aExp = 1;
7163 bExp = 1;
7164 }
/* Compare the significands word by word to find the larger magnitude. */
7165 if ( bSig0 < aSig0 ) goto aBigger;
7166 if ( aSig0 < bSig0 ) goto bBigger;
7167 if ( bSig1 < aSig1 ) goto aBigger;
7168 if ( aSig1 < bSig1 ) goto bBigger;
/* Equal magnitudes: zero, negative only in float_round_down mode. */
7169 return packFloat128(status->float_rounding_mode == float_round_down,
7170 0, 0, 0);
7171 bExpBigger:
7172 if ( bExp == 0x7FFF ) {
7173 if (bSig0 | bSig1) {
7174 return propagateFloat128NaN(a, b, status);
7175 }
/* b is infinite: the result is an infinity with the sign flipped. */
7176 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7177 }
/*
 * With a zero exponent field a gets no implicit bit and is shifted by one
 * position less; otherwise the implicit bit is made explicit.
 */
7178 if ( aExp == 0 ) {
7179 ++expDiff;
7180 }
7181 else {
7182 aSig0 |= LIT64( 0x4000000000000000 );
7183 }
7184 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7185 bSig0 |= LIT64( 0x4000000000000000 );
7186 bBigger:
/* b's magnitude is the larger one: compute b - a and invert the sign. */
7187 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7188 zExp = bExp;
7189 zSign ^= 1;
7190 goto normalizeRoundAndPack;
7191 aExpBigger:
7192 if ( aExp == 0x7FFF ) {
7193 if (aSig0 | aSig1) {
7194 return propagateFloat128NaN(a, b, status);
7195 }
7196 return a;
7197 }
7198 if ( bExp == 0 ) {
7199 --expDiff;
7200 }
7201 else {
7202 bSig0 |= LIT64( 0x4000000000000000 );
7203 }
7204 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7205 aSig0 |= LIT64( 0x4000000000000000 );
7206 aBigger:
/* a's magnitude is the larger one: compute a - b and keep zSign. */
7207 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7208 zExp = aExp;
7209 normalizeRoundAndPack:
/*
 * The exponent handed on is zExp - 1 - 14.  NOTE(review): the 14
 * presumably compensates for the 14-bit left shift applied at the top.
 */
7210 --zExp;
7211 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7212 status);
7213
7214 }
7215
7216 /*----------------------------------------------------------------------------
7217 | Returns the result of adding the quadruple-precision floating-point values
7218 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7219 | for Binary Floating-Point Arithmetic.
7220 *----------------------------------------------------------------------------*/
7221
7222 float128 float128_add(float128 a, float128 b, float_status *status)
7223 {
7224 flag aSign, bSign;
7225
7226 aSign = extractFloat128Sign( a );
7227 bSign = extractFloat128Sign( b );
7228 if ( aSign == bSign ) {
7229 return addFloat128Sigs(a, b, aSign, status);
7230 }
7231 else {
7232 return subFloat128Sigs(a, b, aSign, status);
7233 }
7234
7235 }
7236
7237 /*----------------------------------------------------------------------------
7238 | Returns the result of subtracting the quadruple-precision floating-point
7239 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7240 | Standard for Binary Floating-Point Arithmetic.
7241 *----------------------------------------------------------------------------*/
7242
7243 float128 float128_sub(float128 a, float128 b, float_status *status)
7244 {
7245 flag aSign, bSign;
7246
7247 aSign = extractFloat128Sign( a );
7248 bSign = extractFloat128Sign( b );
7249 if ( aSign == bSign ) {
7250 return subFloat128Sigs(a, b, aSign, status);
7251 }
7252 else {
7253 return addFloat128Sigs(a, b, aSign, status);
7254 }
7255
7256 }
7257
7258 /*----------------------------------------------------------------------------
7259 | Returns the result of multiplying the quadruple-precision floating-point
7260 | values `a' and `b'. The operation is performed according to the IEC/IEEE
7261 | Standard for Binary Floating-Point Arithmetic.
7262 *----------------------------------------------------------------------------*/
7263
7264 float128 float128_mul(float128 a, float128 b, float_status *status)
7265 {
7266 flag aSign, bSign, zSign;
7267 int32_t aExp, bExp, zExp;
7268 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7269
7270 aSig1 = extractFloat128Frac1( a );
7271 aSig0 = extractFloat128Frac0( a );
7272 aExp = extractFloat128Exp( a );
7273 aSign = extractFloat128Sign( a );
7274 bSig1 = extractFloat128Frac1( b );
7275 bSig0 = extractFloat128Frac0( b );
7276 bExp = extractFloat128Exp( b );
7277 bSign = extractFloat128Sign( b );
7278 zSign = aSign ^ bSign;
7279 if ( aExp == 0x7FFF ) {
7280 if ( ( aSig0 | aSig1 )
7281 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7282 return propagateFloat128NaN(a, b, status);
7283 }
7284 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7285 return packFloat128( zSign, 0x7FFF, 0, 0 );
7286 }
7287 if ( bExp == 0x7FFF ) {
7288 if (bSig0 | bSig1) {
7289 return propagateFloat128NaN(a, b, status);
7290 }
7291 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7292 invalid:
7293 float_raise(float_flag_invalid, status);
7294 return float128_default_nan(status);
7295 }
7296 return packFloat128( zSign, 0x7FFF, 0, 0 );
7297 }
7298 if ( aExp == 0 ) {
7299 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7300 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7301 }
7302 if ( bExp == 0 ) {
7303 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7304 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7305 }
7306 zExp = aExp + bExp - 0x4000;
7307 aSig0 |= LIT64( 0x0001000000000000 );
7308 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7309 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7310 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7311 zSig2 |= ( zSig3 != 0 );
7312 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7313 shift128ExtraRightJamming(
7314 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7315 ++zExp;
7316 }
7317 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7318
7319 }
7320
7321 /*----------------------------------------------------------------------------
7322 | Returns the result of dividing the quadruple-precision floating-point value
7323 | `a' by the corresponding value `b'. The operation is performed according to
7324 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7325 *----------------------------------------------------------------------------*/
7326
7327 float128 float128_div(float128 a, float128 b, float_status *status)
7328 {
/*
 * Divides `a' by `b'.  The result sign is the exclusive-or of the operand
 * signs.  Special operands are handled first; the quotient itself is built
 * from two 64-bit estimates that are corrected against the remainder.
 */
7329 flag aSign, bSign, zSign;
7330 int32_t aExp, bExp, zExp;
7331 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7332 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7333
7334 aSig1 = extractFloat128Frac1( a );
7335 aSig0 = extractFloat128Frac0( a );
7336 aExp = extractFloat128Exp( a );
7337 aSign = extractFloat128Sign( a );
7338 bSig1 = extractFloat128Frac1( b );
7339 bSig0 = extractFloat128Frac0( b );
7340 bExp = extractFloat128Exp( b );
7341 bSign = extractFloat128Sign( b );
7342 zSign = aSign ^ bSign;
/*
 * a is infinity or NaN: NaNs propagate, infinity / infinity is invalid,
 * infinity / anything else is an infinity.
 */
7343 if ( aExp == 0x7FFF ) {
7344 if (aSig0 | aSig1) {
7345 return propagateFloat128NaN(a, b, status);
7346 }
7347 if ( bExp == 0x7FFF ) {
7348 if (bSig0 | bSig1) {
7349 return propagateFloat128NaN(a, b, status);
7350 }
7351 goto invalid;
7352 }
7353 return packFloat128( zSign, 0x7FFF, 0, 0 );
7354 }
/* b is infinity or NaN (a is neither): NaN propagates, else a zero. */
7355 if ( bExp == 0x7FFF ) {
7356 if (bSig0 | bSig1) {
7357 return propagateFloat128NaN(a, b, status);
7358 }
7359 return packFloat128( zSign, 0, 0, 0 );
7360 }
7361 if ( bExp == 0 ) {
/*
 * b is zero: 0 / 0 is invalid (default NaN); any other dividend raises
 * divbyzero and returns an infinity.
 */
7362 if ( ( bSig0 | bSig1 ) == 0 ) {
7363 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7364 invalid:
7365 float_raise(float_flag_invalid, status);
7366 return float128_default_nan(status);
7367 }
7368 float_raise(float_flag_divbyzero, status);
7369 return packFloat128( zSign, 0x7FFF, 0, 0 );
7370 }
7371 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7372 }
7373 if ( aExp == 0 ) {
/* A zero dividend gives a zero quotient. */
7374 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7375 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7376 }
7377 zExp = aExp - bExp + 0x3FFD;
/* Make the implicit bits explicit and shift both significands left by 15. */
7378 shortShift128Left(
7379 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7380 shortShift128Left(
7381 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* If the dividend is not smaller than the divisor, halve it and bump zExp. */
7382 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7383 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7384 ++zExp;
7385 }
/*
 * Upper quotient word: estimate, form the remainder, then decrement the
 * estimate while the remainder is negative.
 */
7386 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7387 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7388 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7389 while ( (int64_t) rem0 < 0 ) {
7390 --zSig0;
7391 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7392 }
/*
 * Lower quotient word.  The exact remainder is only computed when the low
 * 14 bits of the estimate are <= 4; a non-zero final remainder is then
 * recorded as a sticky bit in bit 0 of zSig1.
 */
7393 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7394 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7395 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7396 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7397 while ( (int64_t) rem1 < 0 ) {
7398 --zSig1;
7399 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7400 }
7401 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7402 }
/* Shift the 128-bit quotient right by 15; shifted-out bits go to zSig2. */
7403 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7404 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7405
7406 }
7407
7408 /*----------------------------------------------------------------------------
7409 | Returns the remainder of the quadruple-precision floating-point value `a'
7410 | with respect to the corresponding value `b'. The operation is performed
7411 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7412 *----------------------------------------------------------------------------*/
7413
7414 float128 float128_rem(float128 a, float128 b, float_status *status)
7415 {
/*
 * Computes the remainder of `a' with respect to `b'.  Special operands are
 * handled first; the quotient is then consumed in chunks and the final
 * remainder is chosen between two adjacent candidates.
 */
7416 flag aSign, zSign;
7417 int32_t aExp, bExp, expDiff;
7418 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7419 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7420 int64_t sigMean0;
7421
7422 aSig1 = extractFloat128Frac1( a );
7423 aSig0 = extractFloat128Frac0( a );
7424 aExp = extractFloat128Exp( a );
7425 aSign = extractFloat128Sign( a );
7426 bSig1 = extractFloat128Frac1( b );
7427 bSig0 = extractFloat128Frac0( b );
7428 bExp = extractFloat128Exp( b );
/* a is infinity or NaN: NaNs (in either operand) propagate, else invalid. */
7429 if ( aExp == 0x7FFF ) {
7430 if ( ( aSig0 | aSig1 )
7431 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7432 return propagateFloat128NaN(a, b, status);
7433 }
7434 goto invalid;
7435 }
/* b is infinity or NaN: NaN propagates; with infinite b, a is returned. */
7436 if ( bExp == 0x7FFF ) {
7437 if (bSig0 | bSig1) {
7438 return propagateFloat128NaN(a, b, status);
7439 }
7440 return a;
7441 }
7442 if ( bExp == 0 ) {
/* b is zero: invalid, default NaN. */
7443 if ( ( bSig0 | bSig1 ) == 0 ) {
7444 invalid:
7445 float_raise(float_flag_invalid, status);
7446 return float128_default_nan(status);
7447 }
7448 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7449 }
7450 if ( aExp == 0 ) {
/* a is zero: returned unchanged. */
7451 if ( ( aSig0 | aSig1 ) == 0 ) return a;
7452 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7453 }
7454 expDiff = aExp - bExp;
/* a's exponent is more than one below b's: a is returned unchanged. */
7455 if ( expDiff < -1 ) return a;
/*
 * Make the implicit bits explicit and shift both significands left by 15;
 * a is shifted by one less when its exponent is below b's.
 */
7456 shortShift128Left(
7457 aSig0 | LIT64( 0x0001000000000000 ),
7458 aSig1,
7459 15 - ( expDiff < 0 ),
7460 &aSig0,
7461 &aSig1
7462 );
7463 shortShift128Left(
7464 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
/* First quotient bit: 1 when b <= a, in which case b is subtracted once. */
7465 q = le128( bSig0, bSig1, aSig0, aSig1 );
7466 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7467 expDiff -= 64;
/*
 * Each pass reduces expDiff by 61.  The estimate is lowered by 4 (clamped
 * at 0) before use; NOTE(review): presumably so that it never exceeds the
 * true quotient -- confirm against estimateDiv128To64().
 */
7468 while ( 0 < expDiff ) {
7469 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7470 q = ( 4 < q ) ? q - 4 : 0;
7471 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7472 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7473 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7474 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7475 expDiff -= 61;
7476 }
/* Final partial chunk: the estimate is scaled down by -expDiff bits. */
7477 if ( -64 < expDiff ) {
7478 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7479 q = ( 4 < q ) ? q - 4 : 0;
7480 q >>= - expDiff;
7481 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7482 expDiff += 52;
7483 if ( expDiff < 0 ) {
7484 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7485 }
7486 else {
7487 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7488 }
7489 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7490 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7491 }
7492 else {
/* No further quotient bits: just shift both operands right by 12. */
7493 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7494 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7495 }
/*
 * Keep subtracting b (counting in q) until the remainder turns negative;
 * the alternate pair holds the value just before the last subtraction.
 */
7496 do {
7497 alternateASig0 = aSig0;
7498 alternateASig1 = aSig1;
7499 ++q;
7500 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7501 } while ( 0 <= (int64_t) aSig0 );
/*
 * sigMean is the sum of the two candidates.  The alternate (non-negative)
 * candidate is chosen when that sum is negative, or when it is zero and q
 * is odd.
 */
7502 add128(
7503 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7504 if ( ( sigMean0 < 0 )
7505 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7506 aSig0 = alternateASig0;
7507 aSig1 = alternateASig1;
7508 }
/* A negative remainder is negated and flips the sign of the result. */
7509 zSign = ( (int64_t) aSig0 < 0 );
7510 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7511 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7512 status);
7513 }
7514
7515 /*----------------------------------------------------------------------------
7516 | Returns the square root of the quadruple-precision floating-point value `a'.
7517 | The operation is performed according to the IEC/IEEE Standard for Binary
7518 | Floating-Point Arithmetic.
7519 *----------------------------------------------------------------------------*/
7520
7521 float128 float128_sqrt(float128 a, float_status *status)
7522 {
/*
 * Computes the square root of `a'.  Special operands are handled first; the
 * root is then built from a 64-bit upper estimate and a 64-bit lower
 * estimate, each corrected against the remainder.
 */
7523 flag aSign;
7524 int32_t aExp, zExp;
7525 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7526 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7527
7528 aSig1 = extractFloat128Frac1( a );
7529 aSig0 = extractFloat128Frac0( a );
7530 aExp = extractFloat128Exp( a );
7531 aSign = extractFloat128Sign( a );
/* NaN propagates; positive infinity is returned; negative is invalid. */
7532 if ( aExp == 0x7FFF ) {
7533 if (aSig0 | aSig1) {
7534 return propagateFloat128NaN(a, a, status);
7535 }
7536 if ( ! aSign ) return a;
7537 goto invalid;
7538 }
/* Negative input: a negative zero is returned as is, anything else is invalid. */
7539 if ( aSign ) {
7540 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7541 invalid:
7542 float_raise(float_flag_invalid, status);
7543 return float128_default_nan(status);
7544 }
7545 if ( aExp == 0 ) {
/* Positive zero gives positive zero. */
7546 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7547 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7548 }
/* Result exponent: half of (aExp - 0x3FFF), re-biased with 0x3FFE. */
7549 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7550 aSig0 |= LIT64( 0x0001000000000000 );
/* Seed with a 32-bit estimate, then refine it to 64 bits by a division. */
7551 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
/* The left shift is 13, or 12 when aExp is odd. */
7552 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7553 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7554 doubleZSig0 = zSig0<<1;
/* Remainder a - zSig0^2; lower zSig0 while that remainder is negative. */
7555 mul64To128( zSig0, zSig0, &term0, &term1 );
7556 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7557 while ( (int64_t) rem0 < 0 ) {
7558 --zSig0;
7559 doubleZSig0 -= 2;
7560 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7561 }
/*
 * Lower root word.  The exact remainder is only computed when the low 13
 * bits of the estimate are <= 5; a non-zero final remainder is then
 * recorded as a sticky bit in bit 0 of zSig1.
 */
7562 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7563 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7564 if ( zSig1 == 0 ) zSig1 = 1;
7565 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7566 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7567 mul64To128( zSig1, zSig1, &term2, &term3 );
7568 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7569 while ( (int64_t) rem1 < 0 ) {
7570 --zSig1;
7571 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7572 term3 |= 1;
7573 term2 |= doubleZSig0;
7574 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7575 }
7576 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7577 }
/* Shift the 128-bit root right by 14; shifted-out bits go to zSig2. */
7578 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7579 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7580
7581 }
7582
7583 /*----------------------------------------------------------------------------
7584 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7585 | the corresponding value `b', and 0 otherwise. The invalid exception is
7586 | raised if either operand is a NaN. Otherwise, the comparison is performed
7587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7588 *----------------------------------------------------------------------------*/
7589
7590 int float128_eq(float128 a, float128 b, float_status *status)
7591 {
7592
7593 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7594 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7595 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7596 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7597 ) {
7598 float_raise(float_flag_invalid, status);
7599 return 0;
7600 }
7601 return
7602 ( a.low == b.low )
7603 && ( ( a.high == b.high )
7604 || ( ( a.low == 0 )
7605 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7606 );
7607
7608 }
7609
7610 /*----------------------------------------------------------------------------
7611 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7612 | or equal to the corresponding value `b', and 0 otherwise. The invalid
7613 | exception is raised if either operand is a NaN. The comparison is performed
7614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7615 *----------------------------------------------------------------------------*/
7616
7617 int float128_le(float128 a, float128 b, float_status *status)
7618 {
7619 flag aSign, bSign;
7620
7621 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7622 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7623 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7624 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7625 ) {
7626 float_raise(float_flag_invalid, status);
7627 return 0;
7628 }
7629 aSign = extractFloat128Sign( a );
7630 bSign = extractFloat128Sign( b );
7631 if ( aSign != bSign ) {
7632 return
7633 aSign
7634 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7635 == 0 );
7636 }
7637 return
7638 aSign ? le128( b.high, b.low, a.high, a.low )
7639 : le128( a.high, a.low, b.high, b.low );
7640
7641 }
7642
7643 /*----------------------------------------------------------------------------
7644 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7645 | the corresponding value `b', and 0 otherwise. The invalid exception is
7646 | raised if either operand is a NaN. The comparison is performed according
7647 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7648 *----------------------------------------------------------------------------*/
7649
7650 int float128_lt(float128 a, float128 b, float_status *status)
7651 {
7652 flag aSign, bSign;
7653
7654 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7655 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7656 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7657 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7658 ) {
7659 float_raise(float_flag_invalid, status);
7660 return 0;
7661 }
7662 aSign = extractFloat128Sign( a );
7663 bSign = extractFloat128Sign( b );
7664 if ( aSign != bSign ) {
7665 return
7666 aSign
7667 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7668 != 0 );
7669 }
7670 return
7671 aSign ? lt128( b.high, b.low, a.high, a.low )
7672 : lt128( a.high, a.low, b.high, b.low );
7673
7674 }
7675
7676 /*----------------------------------------------------------------------------
7677 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7678 | be compared, and 0 otherwise. The invalid exception is raised if either
7679 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7680 | Standard for Binary Floating-Point Arithmetic.
7681 *----------------------------------------------------------------------------*/
7682
7683 int float128_unordered(float128 a, float128 b, float_status *status)
7684 {
7685 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7686 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7687 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7688 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7689 ) {
7690 float_raise(float_flag_invalid, status);
7691 return 1;
7692 }
7693 return 0;
7694 }
7695
7696 /*----------------------------------------------------------------------------
7697 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7698 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7699 | exception. The comparison is performed according to the IEC/IEEE Standard
7700 | for Binary Floating-Point Arithmetic.
7701 *----------------------------------------------------------------------------*/
7702
7703 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7704 {
7705
7706 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7707 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7708 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7709 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7710 ) {
7711 if (float128_is_signaling_nan(a, status)
7712 || float128_is_signaling_nan(b, status)) {
7713 float_raise(float_flag_invalid, status);
7714 }
7715 return 0;
7716 }
7717 return
7718 ( a.low == b.low )
7719 && ( ( a.high == b.high )
7720 || ( ( a.low == 0 )
7721 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7722 );
7723
7724 }
7725
7726 /*----------------------------------------------------------------------------
7727 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7728 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7729 | cause an exception. Otherwise, the comparison is performed according to the
7730 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7731 *----------------------------------------------------------------------------*/
7732
7733 int float128_le_quiet(float128 a, float128 b, float_status *status)
7734 {
7735 flag aSign, bSign;
7736
7737 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7738 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7739 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7740 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7741 ) {
7742 if (float128_is_signaling_nan(a, status)
7743 || float128_is_signaling_nan(b, status)) {
7744 float_raise(float_flag_invalid, status);
7745 }
7746 return 0;
7747 }
7748 aSign = extractFloat128Sign( a );
7749 bSign = extractFloat128Sign( b );
7750 if ( aSign != bSign ) {
7751 return
7752 aSign
7753 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7754 == 0 );
7755 }
7756 return
7757 aSign ? le128( b.high, b.low, a.high, a.low )
7758 : le128( a.high, a.low, b.high, b.low );
7759
7760 }
7761
7762 /*----------------------------------------------------------------------------
7763 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7764 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7765 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7766 | Standard for Binary Floating-Point Arithmetic.
7767 *----------------------------------------------------------------------------*/
7768
7769 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7770 {
7771 flag aSign, bSign;
7772
7773 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7774 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7775 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7776 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7777 ) {
7778 if (float128_is_signaling_nan(a, status)
7779 || float128_is_signaling_nan(b, status)) {
7780 float_raise(float_flag_invalid, status);
7781 }
7782 return 0;
7783 }
7784 aSign = extractFloat128Sign( a );
7785 bSign = extractFloat128Sign( b );
7786 if ( aSign != bSign ) {
7787 return
7788 aSign
7789 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7790 != 0 );
7791 }
7792 return
7793 aSign ? lt128( b.high, b.low, a.high, a.low )
7794 : lt128( a.high, a.low, b.high, b.low );
7795
7796 }
7797
7798 /*----------------------------------------------------------------------------
7799 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7800 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7801 | comparison is performed according to the IEC/IEEE Standard for Binary
7802 | Floating-Point Arithmetic.
7803 *----------------------------------------------------------------------------*/
7804
7805 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7806 {
7807 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7808 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7809 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7810 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7811 ) {
7812 if (float128_is_signaling_nan(a, status)
7813 || float128_is_signaling_nan(b, status)) {
7814 float_raise(float_flag_invalid, status);
7815 }
7816 return 1;
7817 }
7818 return 0;
7819 }
7820
7821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7822 int is_quiet, float_status *status)
7823 {
7824 flag aSign, bSign;
7825
7826 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7827 float_raise(float_flag_invalid, status);
7828 return float_relation_unordered;
7829 }
7830 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7831 ( extractFloatx80Frac( a )<<1 ) ) ||
7832 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7833 ( extractFloatx80Frac( b )<<1 ) )) {
7834 if (!is_quiet ||
7835 floatx80_is_signaling_nan(a, status) ||
7836 floatx80_is_signaling_nan(b, status)) {
7837 float_raise(float_flag_invalid, status);
7838 }
7839 return float_relation_unordered;
7840 }
7841 aSign = extractFloatx80Sign( a );
7842 bSign = extractFloatx80Sign( b );
7843 if ( aSign != bSign ) {
7844
7845 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7846 ( ( a.low | b.low ) == 0 ) ) {
7847 /* zero case */
7848 return float_relation_equal;
7849 } else {
7850 return 1 - (2 * aSign);
7851 }
7852 } else {
7853 if (a.low == b.low && a.high == b.high) {
7854 return float_relation_equal;
7855 } else {
7856 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7857 }
7858 }
7859 }
7860
7861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7862 {
7863 return floatx80_compare_internal(a, b, 0, status);
7864 }
7865
7866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7867 {
7868 return floatx80_compare_internal(a, b, 1, status);
7869 }
7870
7871 static inline int float128_compare_internal(float128 a, float128 b,
7872 int is_quiet, float_status *status)
7873 {
7874 flag aSign, bSign;
7875
7876 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7877 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7878 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7879 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7880 if (!is_quiet ||
7881 float128_is_signaling_nan(a, status) ||
7882 float128_is_signaling_nan(b, status)) {
7883 float_raise(float_flag_invalid, status);
7884 }
7885 return float_relation_unordered;
7886 }
7887 aSign = extractFloat128Sign( a );
7888 bSign = extractFloat128Sign( b );
7889 if ( aSign != bSign ) {
7890 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7891 /* zero case */
7892 return float_relation_equal;
7893 } else {
7894 return 1 - (2 * aSign);
7895 }
7896 } else {
7897 if (a.low == b.low && a.high == b.high) {
7898 return float_relation_equal;
7899 } else {
7900 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7901 }
7902 }
7903 }
7904
7905 int float128_compare(float128 a, float128 b, float_status *status)
7906 {
7907 return float128_compare_internal(a, b, 0, status);
7908 }
7909
7910 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7911 {
7912 return float128_compare_internal(a, b, 1, status);
7913 }
7914
7915 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7916 {
7917 flag aSign;
7918 int32_t aExp;
7919 uint64_t aSig;
7920
7921 if (floatx80_invalid_encoding(a)) {
7922 float_raise(float_flag_invalid, status);
7923 return floatx80_default_nan(status);
7924 }
7925 aSig = extractFloatx80Frac( a );
7926 aExp = extractFloatx80Exp( a );
7927 aSign = extractFloatx80Sign( a );
7928
7929 if ( aExp == 0x7FFF ) {
7930 if ( aSig<<1 ) {
7931 return propagateFloatx80NaN(a, a, status);
7932 }
7933 return a;
7934 }
7935
7936 if (aExp == 0) {
7937 if (aSig == 0) {
7938 return a;
7939 }
7940 aExp++;
7941 }
7942
7943 if (n > 0x10000) {
7944 n = 0x10000;
7945 } else if (n < -0x10000) {
7946 n = -0x10000;
7947 }
7948
7949 aExp += n;
7950 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7951 aSign, aExp, aSig, 0, status);
7952 }
7953
7954 float128 float128_scalbn(float128 a, int n, float_status *status)
7955 {
7956 flag aSign;
7957 int32_t aExp;
7958 uint64_t aSig0, aSig1;
7959
7960 aSig1 = extractFloat128Frac1( a );
7961 aSig0 = extractFloat128Frac0( a );
7962 aExp = extractFloat128Exp( a );
7963 aSign = extractFloat128Sign( a );
7964 if ( aExp == 0x7FFF ) {
7965 if ( aSig0 | aSig1 ) {
7966 return propagateFloat128NaN(a, a, status);
7967 }
7968 return a;
7969 }
7970 if (aExp != 0) {
7971 aSig0 |= LIT64( 0x0001000000000000 );
7972 } else if (aSig0 == 0 && aSig1 == 0) {
7973 return a;
7974 } else {
7975 aExp++;
7976 }
7977
7978 if (n > 0x10000) {
7979 n = 0x10000;
7980 } else if (n < -0x10000) {
7981 n = -0x10000;
7982 }
7983
7984 aExp += n - 1;
7985 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7986 , status);
7987
7988 }
7989
7990 static void __attribute__((constructor)) softfloat_init(void)
7991 {
7992 union_float64 ua, ub, uc, ur;
7993
7994 if (QEMU_NO_HARDFLOAT) {
7995 return;
7996 }
7997 /*
7998 * Test that the host's FMA is not obviously broken. For example,
7999 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8000 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8001 */
8002 ua.s = 0x0020000000000001ULL;
8003 ub.s = 0x3ca0000000000000ULL;
8004 uc.s = 0x0020000000000000ULL;
8005 ur.h = fma(ua.h, ub.h, uc.h);
8006 if (ur.s != 0x0020000000000001ULL) {
8007 force_soft_fma = true;
8008 }
8009 }