sysdeps/ia64/fpu/s_expm1.S

   1 .file "exp_m1.s"
   2
   3
   4 // Copyright (c) 2000 - 2005, Intel Corporation
   5 // All rights reserved.
   6 //
   7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
  39 //
  40 // History
  41 //==============================================================
  42 // 02/02/00 Initial Version
  43 // 04/04/00 Unwind support added
  44 // 08/15/00 Bundle added after call to __libm_error_support to properly
  45 //          set [the previously overwritten] GR_Parameter_RESULT.
  46 // 07/07/01 Improved speed of all paths
  47 // 05/20/02 Cleaned up namespace and sf0 syntax
  48 // 11/20/02 Improved speed, algorithm based on exp
  49 // 03/31/05 Reformatted delimiters between data tables
  50
  51 // API
  52 //==============================================================
  53 // double expm1(double)
  54
  55 // Overview of operation
  56 //==============================================================
  57 // 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths
  58 //
  59 // 2. |x| < 2^-60
  60 //    Result = x, computed by x + x*x to handle appropriate flags and rounding
  61 //
  62 // 3. 2^-60 <= |x| < 2^-2
  63 //    Result determined by 13th order Taylor series polynomial
  64 //    expm1(x) = x + Q2*x^2 + ... + Q13*x^13
  65 //
  66 // 4. x <= -48.0
  67 //    Here we know result is essentially -1 + eps, where eps only affects
  68 //    rounded result.  Set I.
  69 //
  70 // 5. x >= 709.7827
  71 //    Result overflows.  Set I, O, and call error support
  72 //
  73 // 6. 2^-2 <= x < 709.7827  or  -48.0 < x <= -2^-2
  74 //    This is the main path.  The algorithm is described below:
  75
  76 // Take the input x. w is "how many log2/128 in x?"
  77 //  w = x * 128/log2
  78 //  n = int(w)
  79 //  x = n log2/128 + r + delta
  80
  81 //  n = 128M + index_1 + 2^4 index_2
  82 //  x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
  83
  84 //  exp(x) = 2^M  2^(index_1/128)  2^(index_2/8) exp(r) exp(delta)
  85 //       Construct 2^M
  86 //       Get 2^(index_1/128) from table_1;
  87 //       Get 2^(index_2/8)   from table_2;
  88 //       Calculate exp(r) by a 5th order polynomial series
  89 //          r = x - n (log2/128)_high
  90 //          delta = - n (log2/128)_low
  91 //       Calculate exp(delta) as 1 + delta
  92
  93
  94 // Special values
  95 //==============================================================
  96 // expm1(+0)    = +0.0
  97 // expm1(-0)    = -0.0
  98
  99 // expm1(+qnan) = +qnan
 100 // expm1(-qnan) = -qnan
 101 // expm1(+snan) = +qnan
 102 // expm1(-snan) = -qnan
 103
 104 // expm1(-inf)  = -1.0
 105 // expm1(+inf)  = +inf
 106
 107 // Overflow and Underflow
 108 //=======================
 109 // expm1(x) = largest double normal when
 110 //     x = 709.7827 = 40862e42fefa39ef
 111 //
 112 // Underflow is handled as described in case 2 above.
 113
 114
 115 // Registers used
 116 //==============================================================
 117 // Floating Point registers used:
 118 // f8, input
 119 // f6, f7, f9 -> f15,  f32 -> f75
 120
 121 // General registers used:
 122 // r14 -> r40
 123
 124 // Predicate registers used:
 125 // p6 -> p15
 126
 127 // Assembly macros
 128 //==============================================================
 129
 130 rRshf                  = r14 // Bit pattern of 1.1000..0 * 2^63 (source for fRSHF)
 131 rAD_TB1                = r15 // Address within exp_Table_1 (advanced by post-increment loads)
 132 rAD_T1                 = r15 // Same register as rAD_TB1: address of the selected Table 1 entry
 133 rAD_TB2                = r16 // Address of exp_Table_2
 134 rAD_T2                 = r16 // Same register as rAD_TB2: address of the selected Table 2 entry
 135 rAD_Ln2_lo             = r17 // Address of the ln2/128 lo constant
 136 rAD_P                  = r17 // Same register as rAD_Ln2_lo: address within exp_p_table
 137
 138 rN                     = r18 // Integer N taken from the significand of fW_2TO56_RSH
 139 rIndex_1               = r19 // N & 0x0f
 140 rIndex_2_16            = r20 // N & 0x70, i.e. index_2 * 16
 141
 142 rM                     = r21 // N >> 7
 143 rBiased_M              = r21 // Same register as rM: M plus the exponent bias
 144 rIndex_1_16            = r22 // Defined but not used by any instruction below
 145 rSignexp_x             = r23 // Sign and exponent of x from getf.exp
 146 rExp_x                 = r24 // Exponent of x (biased, then unbiased)
 147 rSig_inv_ln2           = r25 // Significand of 1/ln2
 148
 149 rAD_Q1                 = r26 // Address within exp_Q1_table
 150 rAD_Q2                 = r27 // Address within exp_Q2_table
 151 rTmp                   = r27 // Same register as rAD_Q2: scratch in the overflow/minus-one paths
 152 rExp_bias              = r28 // 0xffff
 153 rExp_mask              = r29 // 0x1ffff
 154 rRshf_2to56            = r30 // Bit pattern of 1.1000..0 * 2^(63+56) (source for fRSHF_2TO56)
 155
 156 rGt_ln                 = r31 // Exponent used to build fGt_pln
 157 rExp_2tom56            = r31 // Same register as rGt_ln: biased exponent of 2^-56
 158
 159
 160 GR_SAVE_B0             = r33 // Local registers used by __libm_error_region
 161 GR_SAVE_PFS            = r34
 162 GR_SAVE_GP             = r35
 163 GR_SAVE_SP             = r36 // Defined but not used by any instruction below
 164
 165 GR_Parameter_X         = r37 // Output registers passed to __libm_error_support
 166 GR_Parameter_Y         = r38
 167 GR_Parameter_RESULT    = r39
 168 GR_Parameter_TAG       = r40
 169
 170
 171 FR_X                   = f10 // Copy of x stored on the stack for __libm_error_support
 172 FR_Y                   = f1  // Value stored in the second parameter slot
 173 FR_RESULT              = f8  // Result register, also stored in the third parameter slot
 174
 175 fRSHF_2TO56            = f6  // 1.1000..0 * 2^(63+56)
 176 fINV_LN2_2TO63         = f7  // 1/ln2 * 2^63
 177 fW_2TO56_RSH           = f9  // x * fINV_LN2_2TO63 + fRSHF_2TO56
 178 f2TOM56                = f11 // 2^-56
 179 fP5                    = f12
 180 fP54                   = f50 // P4 + P5*r
 181 fP5432                 = f50 // Same register as fP54
 182 fP4                    = f13
 183 fP3                    = f14
 184 fP32                   = f14 // Same register as fP3: P2 + P3*r
 185 fP2                    = f15
 186
 187 fLn2_by_128_hi         = f33
 188 fLn2_by_128_lo         = f34
 189
 190 fRSHF                  = f35 // 1.1000..0 * 2^63
 191 fNfloat                = f36 // N as a floating-point number
 192 fW                     = f37
 193 fR                     = f38 // r = x - N * ln2_by_128_hi
 194 fF                     = f39 // f = 1 - N * ln2_by_128_lo
 195
 196 fRsq                   = f40 // r * r
 197 fRcube                 = f41
 198
 199 f2M                    = f42 // 2^M
 200 fS1                    = f43 // 2^M * T1
 201 fT1                    = f44 // Table 1 entry
 202
 203 fMIN_DBL_OFLOW_ARG     = f45
 204 fMAX_DBL_MINUS_1_ARG   = f46
 205 fMAX_DBL_NORM_ARG      = f47
 206 fP_lo                  = f51
 207 fP_hi                  = f52
 208 fP                     = f53 // Polynomial result for the main path
 209 fS                     = f54 // S1 * S2
 210
 211 fNormX                 = f56 // Normalized copy of x
 212
 213 fWre_urm_f8            = f57 // Result recomputed in sf2 with wre set
 214
 215 fGt_pln                = f58 // Overflow threshold in the possible-overflow path
 216 fTmp                   = f58 // Same register as fGt_pln: scratch
 217
 218 fS2                    = f59 // f * T2
 219 fT2                    = f60 // Table 2 entry
 220 fSm1                   = f61 // S1 * S2 - 1
 221
 222 fXsq                   = f62 // x * x
 223 fX6                    = f63 // x^6
 224 fX4                    = f63 // Same register as fX6: x^4
 225 fQ7                    = f64
 226 fQ76                   = f64 // fQ7, fQ76, fQ7654, fQ765432 all share f64
 227 fQ7654                 = f64
 228 fQ765432               = f64
 229 fQ6                    = f65
 230 fQ5                    = f66
 231 fQ54                   = f66 // Same register as fQ5
 232 fQ4                    = f67
 233 fQ3                    = f68
 234 fQ32                   = f68 // Same register as fQ3
 235 fQ2                    = f69
 236 fQD                    = f70
 237 fQDC                   = f70 // fQD through fQDCBA98765432 all share f70
 238 fQDCBA                 = f70
 239 fQDCBA98               = f70
 240 fQDCBA98765432         = f70
 241 fQC                    = f71
 242 fQB                    = f72
 243 fQBA                   = f72 // Same register as fQB
 244 fQA                    = f73
 245 fQ9                    = f74
 246 fQ98                   = f74 // Same register as fQ9
 247 fQ8                    = f75
 248
 249 // Data tables
 250 //==============================================================
 251
 252 RODATA
 253 .align 16
 254
 255 // ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
 256
 257 // double-extended 1/ln(2)
 258 // 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
 259 // 3fff b8aa 3b29 5c17 f0bc
 260 // For speed the significand will be loaded directly with a movl and setf.sig
 261 //   and the exponent will be bias+63 instead of bias+0.  Thus subsequent
 262 //   computations need to scale appropriately.
 263 // The constant 128/ln(2) is needed for the computation of w.  This is also
 264 //   obtained by scaling the computations.
 265 //
 266 // Two shifting constants are loaded directly with movl and setf.d.
 267 //   1. fRSHF_2TO56 = 1.1000..00 * 2^(63+56)
 268 //        This constant is added to x * (1/ln2 * 2^63) to shift the integer
 269 //        part of x*128/ln2 into the rightmost bits of the significand.
 270 //        The result of this fma is fW_2TO56_RSH.
 271 //   2. fRSHF       = 1.1000..00 * 2^(63)
 272 //        This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
 273 //        the integer part of w, n, as a floating-point number.
 274 //        The result of this fms is fNfloat.
 275
 276
 277 LOCAL_OBJECT_START(exp_Table_1)
 278 data8 0x40862e42fefa39f0 // offset 0x00: smallest dbl overflow arg
 279 data8 0xc048000000000000 // offset 0x08: -48.0, approx largest arg for minus one result
 280 data8 0x40862e42fefa39ef // offset 0x10: largest dbl arg to give normal dbl result
 281 data8 0x0                // offset 0x18: pad
 282 data8 0xb17217f7d1cf79ab , 0x00003ff7 // offset 0x20: ln2/128 hi (significand, then exponent)
 283 data8 0xc9e3b39803f2f6af , 0x00003fb7 // offset 0x30: ln2/128 lo (significand, then exponent)
 284 //
 285 // Table 1 is 2^(index_1/128) where
 286 // index_1 goes from 0 to 15
 287 // The table starts at offset 0x40; each entry occupies 16 bytes
 288 data8 0x8000000000000000 , 0x00003FFF // index_1 = 0
 289 data8 0x80B1ED4FD999AB6C , 0x00003FFF // index_1 = 1
 290 data8 0x8164D1F3BC030773 , 0x00003FFF // index_1 = 2
 291 data8 0x8218AF4373FC25EC , 0x00003FFF // index_1 = 3
 292 data8 0x82CD8698AC2BA1D7 , 0x00003FFF // index_1 = 4
 293 data8 0x8383594EEFB6EE37 , 0x00003FFF // index_1 = 5
 294 data8 0x843A28C3ACDE4046 , 0x00003FFF // index_1 = 6
 295 data8 0x84F1F656379C1A29 , 0x00003FFF // index_1 = 7
 296 data8 0x85AAC367CC487B15 , 0x00003FFF // index_1 = 8
 297 data8 0x8664915B923FBA04 , 0x00003FFF // index_1 = 9
 298 data8 0x871F61969E8D1010 , 0x00003FFF // index_1 = 10
 299 data8 0x87DB357FF698D792 , 0x00003FFF // index_1 = 11
 300 data8 0x88980E8092DA8527 , 0x00003FFF // index_1 = 12
 301 data8 0x8955EE03618E5FDD , 0x00003FFF // index_1 = 13
 302 data8 0x8A14D575496EFD9A , 0x00003FFF // index_1 = 14
 303 data8 0x8AD4C6452C728924 , 0x00003FFF // index_1 = 15
 304 LOCAL_OBJECT_END(exp_Table_1)
 305
 306 // Table 2 is 2^(index_2/8) where
 307 // index_2 goes from 0 to 7 (16 bytes per entry: significand, then exponent)
 308 LOCAL_OBJECT_START(exp_Table_2)
 309 data8 0x8000000000000000 , 0x00003FFF // index_2 = 0
 310 data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF // index_2 = 1
 311 data8 0x9837F0518DB8A96F , 0x00003FFF // index_2 = 2
 312 data8 0xA5FED6A9B15138EA , 0x00003FFF // index_2 = 3
 313 data8 0xB504F333F9DE6484 , 0x00003FFF // index_2 = 4
 314 data8 0xC5672A115506DADD , 0x00003FFF // index_2 = 5
 315 data8 0xD744FCCAD69D6AF4 , 0x00003FFF // index_2 = 6
 316 data8 0xEAC0C6E7DD24392F , 0x00003FFF // index_2 = 7
 317 LOCAL_OBJECT_END(exp_Table_2)
 318
 319
 320 LOCAL_OBJECT_START(exp_p_table)
 321 data8 0x3f8111116da21757 // P5, close to 1/5!; multiplies r^5 in the main path
 322 data8 0x3fa55555d787761c // P4, close to 1/4!; multiplies r^4
 323 data8 0x3fc5555555555414 // P3, close to 1/3!; multiplies r^3
 324 data8 0x3fdffffffffffd6a // P2, close to 1/2!; multiplies r^2
 325 LOCAL_OBJECT_END(exp_p_table)
 326
 327 LOCAL_OBJECT_START(exp_Q1_table)
 328 data8 0x3de6124613a86d09 // QD = 1/13! (double, loaded as a pair with QC)
 329 data8 0x3e21eed8eff8d898 // QC = 1/12!
 330 data8 0x3ec71de3a556c734 // Q9 = 1/9! (double, loaded as a pair with Q8)
 331 data8 0x3efa01a01a01a01a // Q8 = 1/8!
 332 data8 0x8888888888888889,0x3ff8 // Q5 = 1/5! (significand, then exponent)
 333 data8 0xaaaaaaaaaaaaaaab,0x3ffc // Q3 = 1/3! (significand, then exponent)
 334 data8 0x0,0x0            // Pad to avoid bank conflicts; makes this table 0x50 bytes
 335 LOCAL_OBJECT_END(exp_Q1_table)
 336
 337 LOCAL_OBJECT_START(exp_Q2_table)
 338 data8 0x3e5ae64567f544e4 // QB = 1/11! (double, loaded as a pair with QA)
 339 data8 0x3e927e4fb7789f5c // QA = 1/10!
 340 data8 0x3f2a01a01a01a01a // Q7 = 1/7! (double, loaded as a pair with Q6)
 341 data8 0x3f56c16c16c16c17 // Q6 = 1/6!
 342 data8 0xaaaaaaaaaaaaaaab,0x3ffa // Q4 = 1/4! (significand, then exponent)
 343 data8 0x8000000000000000,0x3ffe // Q2 = 1/2! (significand, then exponent)
 344 LOCAL_OBJECT_END(exp_Q2_table)
 345
 346
 347 .section .text
 348 GLOBAL_IEEE754_ENTRY(expm1)
 349 // double expm1(double x): x is read from f8 and the result is written to f8
 350 { .mlx
 351       getf.exp        rSignexp_x = f8  // Must recompute if x unorm
 352       movl            rSig_inv_ln2 = 0xb8aa3b295c17f0bc  // signif of 1/ln2
 353 }
 354 { .mlx
 355       addl            rAD_TB1    = @ltoff(exp_Table_1), gp // Linkage-table slot for exp_Table_1
 356       movl            rRshf_2to56 = 0x4768000000000000   // 1.10000 2^(63+56)
 357 }
 358 ;;
 359
 360 // We do this fnorm right at the beginning to normalize
 361 // any input unnormals so that SWA is not taken.
 362 { .mfi
 363       ld8             rAD_TB1    = [rAD_TB1] // Load the address of exp_Table_1
 364       fclass.m        p6,p0 = f8,0x0b  // Test for x=unorm
 365       mov             rExp_mask = 0x1ffff // Mask used to extract the exponent of x
 366 }
 367 { .mfi
 368       mov             rExp_bias = 0xffff // Exponent bias
 369       fnorm.s1        fNormX   = f8 // Normalized copy of x used by the computation
 370       mov             rExp_2tom56 = 0xffff-56 // Biased exponent of 2^-56
 371 }
 372 ;;
 373
 374 // Form two constants we need
 375 //  1/ln2 * 2^63  to compute  w = x * 1/ln2 * 128
 376 //  1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
 377
 378 { .mfi
 379       setf.sig        fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
 380       fclass.m        p8,p0 = f8,0x07  // Test for x=0
 381       nop.i           0
 382 }
 383 { .mlx
 384       setf.d          fRSHF_2TO56 = rRshf_2to56 // Form 1.100 * 2^(63+56)
 385       movl            rRshf = 0x43e8000000000000   // 1.10000 2^63 for rshift
 386 }
 387 ;;
 388
 389 { .mfi
 390       setf.exp        f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
 391       fclass.m        p9,p0 = f8,0x22  // Test for x=-inf
 392       add             rAD_TB2 = 0x140, rAD_TB1 // Point to Table 2 (exp_Table_1 is 0x140 bytes)
 393 }
 394 { .mib
 395       add             rAD_Q1 = 0x1e0, rAD_TB1 // Point to exp_Q1_table for small path
 396       add             rAD_Ln2_lo = 0x30, rAD_TB1 // Point to ln2_by_128_lo
 397 (p6)  br.cond.spnt    EXPM1_UNORM // Branch if x unorm; that path returns to EXPM1_COMMON
 398 }
 399 ;;
 400
 401 EXPM1_COMMON: // Common path; unnormal inputs rejoin here from EXPM1_UNORM
 402 { .mfi
 403       ldfpd           fMIN_DBL_OFLOW_ARG, fMAX_DBL_MINUS_1_ARG = [rAD_TB1],16
 404       fclass.m        p10,p0 = f8,0x1e1  // Test for x=+inf, NaN, NaT
 405       add             rAD_Q2 = 0x50, rAD_Q1   // Point to exp_Q2_table for small path
 406 }
 407 { .mfb
 408       nop.m           0
 409       nop.f           0
 410 (p8)  br.ret.spnt     b0                        // Exit for x=0, return x
 411 }
 412 ;;
 413
 414 { .mfi
 415       ldfd            fMAX_DBL_NORM_ARG = [rAD_TB1],16
 416       nop.f           0
 417       and             rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
 418 }
 419 { .mfb
 420       setf.d          fRSHF = rRshf // Form right shift const 1.100 * 2^63
 421 (p9)  fms.d.s0        f8 = f0,f0,f1            // quick exit for x=-inf, result 0*0 - 1 = -1
 422 (p9)  br.ret.spnt     b0
 423 }
 424 ;;
 425
 426 { .mfi
 427       ldfpd           fQD, fQC = [rAD_Q1], 16  // Load coeff for small path
 428       nop.f           0
 429       sub             rExp_x = rExp_x, rExp_bias // True exponent of x
 430 }
 431 { .mfb
 432       ldfpd           fQB, fQA = [rAD_Q2], 16  // Load coeff for small path
 433 (p10) fma.d.s0        f8 = f8, f1, f0          // For x=+inf, NaN, NaT
 434 (p10) br.ret.spnt     b0                       // Exit for x=+inf, NaN, NaT
 435 }
 436 ;;
 437
 438 { .mfi
 439       ldfpd           fQ9, fQ8 = [rAD_Q1], 16  // Load coeff for small path
 440       fma.s1          fXsq = fNormX, fNormX, f0  // x*x for small path
 441       cmp.gt          p7, p8 = -2, rExp_x      // Test |x| < 2^(-2)
 442 }
 443 { .mfi
 444       ldfpd           fQ7, fQ6 = [rAD_Q2], 16  // Load coeff for small path
 445       nop.f           0
 446       nop.i           0
 447 }
 448 ;;
 449
 450 { .mfi
 451       ldfe            fQ5 = [rAD_Q1], 16       // Load coeff for small path
 452       nop.f           0
 453       nop.i           0
 454 }
 455 { .mib
 456       ldfe            fQ4 = [rAD_Q2], 16       // Load coeff for small path
 457 (p7)  cmp.gt.unc      p6, p7 = -60, rExp_x     // Test |x| < 2^(-60)
 458 (p7)  br.cond.spnt    EXPM1_SMALL              // Branch if 2^-60 <= |x| < 2^-2
 459 }
 460 ;;
 461
 462 // W = X * Inv_log2_by_128
 463 // By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
 464 // We actually add 1.10...0*2^(63+56) to X * Inv_log2 * 2^63 to do the same thing.
 465
 466 { .mfi
 467       ldfe            fLn2_by_128_hi  = [rAD_TB1],32 // Leaves rAD_TB1 at the start of Table 1
 468       fma.s1          fW_2TO56_RSH  = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
 469       nop.i           0
 470 }
 471 { .mfb
 472       ldfe            fLn2_by_128_lo  = [rAD_Ln2_lo]
 473 (p6)  fma.d.s0        f8 = f8, f8, f8 // If |x| < 2^-60, result=x+x*x
 474 (p6)  br.ret.spnt     b0              // Exit if |x| < 2^-60
 475 }
 476 ;;
 477
 478 // Divide arguments into the following categories:
 479 //  Certain minus one       p11 - -inf < x <= MAX_DBL_MINUS_1_ARG
 480 //  Possible Overflow       p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG
 481 //  Certain Overflow        p15 - MIN_DBL_OFLOW_ARG <= x < +inf
 482 //
 483 // If the input is really a double arg, then there will never be "Possible
 484 // Overflow" arguments.
 485 //
 486
 487 // After that last load, rAD_TB1 points to the beginning of table 1
 488
 489 { .mfi
 490       nop.m           0
 491       fcmp.ge.s1      p15,p14 = fNormX,fMIN_DBL_OFLOW_ARG // p15: certain overflow
 492       nop.i           0
 493 }
 494 ;;
 495
 496 { .mfi
 497       add             rAD_P = 0x80, rAD_TB2 // Point to exp_p_table (Table 2 is 0x80 bytes)
 498       fcmp.le.s1      p11,p0 = fNormX,fMAX_DBL_MINUS_1_ARG // p11: certain minus one
 499       nop.i           0
 500 }
 501 ;;
 502
 503 { .mfb
 504       ldfpd           fP5, fP4  = [rAD_P] ,16
 505 (p14) fcmp.gt.unc.s1  p14,p0 = fNormX,fMAX_DBL_NORM_ARG // p14: possible overflow
 506 (p15) br.cond.spnt    EXPM1_CERTAIN_OVERFLOW
 507 }
 508 ;;
 509
 510 // Nfloat = round_int(W)
 511 // The significand of fW_2TO56_RSH contains the rounded integer part of W,
 512 // as a twos complement number in the lower bits (that is, it may be negative).
 513 // That twos complement number (called N) is put into rN.
 514
 515 // Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
 516 // before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
 517 // Thus, fNfloat contains the floating point version of N
 518
 519 { .mfb
 520       ldfpd           fP3, fP2  = [rAD_P]
 521       fms.s1          fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
 522 (p11) br.cond.spnt    EXPM1_CERTAIN_MINUS_ONE
 523 }
 524 ;;
 525
 526 { .mfi
 527       getf.sig        rN = fW_2TO56_RSH
 528       nop.f           0
 529       nop.i           0
 530 }
 531 ;;
 532
 533 // rIndex_1 has index_1
 534 // rIndex_2_16 has index_2 * 16
 535 // rBiased_M has M until the exponent bias is added to it below
 536 // index_1 * 16 is formed by the shladd that computes rAD_T1
 537
 538 // r = x - Nfloat * ln2_by_128_hi
 539 // f = 1 - Nfloat * ln2_by_128_lo
 540 { .mfi
 541       and             rIndex_1 = 0x0f, rN // index_1 = low 4 bits of N
 542       fnma.s1         fR   = fNfloat, fLn2_by_128_hi, fNormX
 543       shr             rM = rN,  0x7 // M = N >> 7
 544 }
 545 { .mfi
 546       and             rIndex_2_16 = 0x70, rN // index_2 * 16 = bits 4..6 of N
 547       fnma.s1         fF   = fNfloat, fLn2_by_128_lo, f1
 548       nop.i           0
 549 }
 550 ;;
 551
 552 // rAD_T1 has address of T1
 553 // rAD_T2 has address of T2
 554
 555 { .mmi
 556       add             rBiased_M = rExp_bias, rM // M + exponent bias
 557       add             rAD_T2 = rAD_TB2, rIndex_2_16 // Table 2 base + index_2 * 16
 558       shladd          rAD_T1 = rIndex_1, 4, rAD_TB1 // Table 1 base + index_1 * 16
 559 }
 560 ;;
 561
 562 // Create Scale = 2^M
 563 // Load T1 and T2
 564 { .mmi
 565       setf.exp        f2M = rBiased_M
 566       ldfe            fT2  = [rAD_T2]
 567       nop.i           0
 568 }
 569 ;;
 570
 571 { .mfi
 572       ldfe            fT1  = [rAD_T1]
 573       fmpy.s0         fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
 574       nop.i           0
 575 }
 576 ;;
 577
 578 { .mfi
 579       nop.m           0
 580       fma.s1          fP54 = fR, fP5, fP4 // P4 + P5*r
 581       nop.i           0
 582 }
 583 { .mfi
 584       nop.m           0
 585       fma.s1          fP32 = fR, fP3, fP2 // P2 + P3*r
 586       nop.i           0
 587 }
 588 ;;
 589
 590 { .mfi
 591       nop.m           0
 592       fma.s1          fRsq = fR, fR, f0 // r^2
 593       nop.i           0
 594 }
 595 ;;
 596
 597 { .mfi
 598       nop.m           0
 599       fma.s1          fP5432  = fRsq, fP54, fP32 // P2 + P3*r + P4*r^2 + P5*r^3
 600       nop.i           0
 601 }
 602 ;;
 603
 604 { .mfi
 605       nop.m           0
 606       fma.s1          fS2  = fF,fT2,f0 // S2 = f * T2
 607       nop.i           0
 608 }
 609 { .mfi
 610       nop.m           0
 611       fma.s1          fS1  = f2M,fT1,f0 // S1 = 2^M * T1
 612       nop.i           0
 613 }
 614 ;;
 615
 616 { .mfi
 617       nop.m           0
 618       fma.s1          fP = fRsq, fP5432, fR // P = r + r^2 * fP5432
 619       nop.i           0
 620 }
 621 ;;
 622
 623 { .mfi
 624       nop.m           0
 625       fms.s1          fSm1 = fS1,fS2,f1    // S - 1.0
 626       nop.i           0
 627 }
 628 { .mfb
 629       nop.m           0
 630       fma.s1          fS   = fS1,fS2,f0 // S = S1 * S2
 631 (p14) br.cond.spnt    EXPM1_POSSIBLE_OVERFLOW
 632 }
 633 ;;
 634
 635 { .mfb
 636       nop.m           0
 637       fma.d.s0        f8 = fS, fP, fSm1 // Result = S*P + (S - 1)
 638       br.ret.sptk     b0                // Normal path exit
 639 }
 640 ;;
 641
 642 // Here if 2^-60 <= |x| < 2^-2
 643 // Compute 13th order polynomial x + Q2*x^2 + ... + QD*x^13
 644 EXPM1_SMALL:
 645 { .mmf
 646       ldfe            fQ3 = [rAD_Q1], 16 // Last two coefficients; the rest were loaded earlier
 647       ldfe            fQ2 = [rAD_Q2], 16
 648       fma.s1          fX4 = fXsq, fXsq, f0 // x^4
 649 }
 650 ;;
 651
 652 { .mfi
 653       nop.m           0
 654       fma.s1          fQDC = fQD, fNormX, fQC // QC + QD*x
 655       nop.i           0
 656 }
 657 { .mfi
 658       nop.m           0
 659       fma.s1          fQBA = fQB, fNormX, fQA // QA + QB*x
 660       nop.i           0
 661 }
 662 ;;
 663
 664 { .mfi
 665       nop.m           0
 666       fma.s1          fQ98 = fQ9, fNormX, fQ8 // Q8 + Q9*x
 667       nop.i           0
 668 }
 669 { .mfi
 670       nop.m           0
 671       fma.s1          fQ76= fQ7, fNormX, fQ6 // Q6 + Q7*x
 672       nop.i           0
 673 }
 674 ;;
 675
 676 { .mfi
 677       nop.m           0
 678       fma.s1          fQ54 = fQ5, fNormX, fQ4 // Q4 + Q5*x
 679       nop.i           0
 680 }
 681 ;;
 682
 683 { .mfi
 684       nop.m           0
 685       fma.s1          fX6 = fX4, fXsq, f0 // x^6 (overwrites x^4, same register)
 686       nop.i           0
 687 }
 688 { .mfi
 689       nop.m           0
 690       fma.s1          fQ32= fQ3, fNormX, fQ2 // Q2 + Q3*x
 691       nop.i           0
 692 }
 693 ;;
 694
 695 { .mfi
 696       nop.m           0
 697       fma.s1          fQDCBA = fQDC, fXsq, fQBA // QA + QB*x + QC*x^2 + QD*x^3
 698       nop.i           0
 699 }
 700 { .mfi
 701       nop.m           0
 702       fma.s1          fQ7654 = fQ76, fXsq, fQ54 // Q4 + Q5*x + Q6*x^2 + Q7*x^3
 703       nop.i           0
 704 }
 705 ;;
 706
 707 { .mfi
 708       nop.m           0
 709       fma.s1          fQDCBA98 = fQDCBA, fXsq, fQ98 // Q8 + Q9*x + ... + QD*x^5
 710       nop.i           0
 711 }
 712 { .mfi
 713       nop.m           0
 714       fma.s1          fQ765432 = fQ7654, fXsq, fQ32 // Q2 + Q3*x + ... + Q7*x^5
 715       nop.i           0
 716 }
 717 ;;
 718
 719 { .mfi
 720       nop.m           0
 721       fma.s1          fQDCBA98765432 = fQDCBA98, fX6, fQ765432 // Q2 + Q3*x + ... + QD*x^11
 722       nop.i           0
 723 }
 724 ;;
 725
 726 { .mfb
 727       nop.m           0
 728       fma.d.s0        f8 = fQDCBA98765432, fXsq, fNormX // Result = x + x^2 * (Q2 + ... + QD*x^11)
 729       br.ret.sptk     b0                   // Exit small branch
 730 }
 731 ;;
 732
 733
 734 EXPM1_POSSIBLE_OVERFLOW:
 735
 736 // Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG
 737 // This cannot happen if input is a double, only if input higher precision.
 738 // Overflow is a possibility, not a certainty.
 739
 740 // Recompute result using status field 2 with user's rounding mode,
 741 // and wre set.  If result is larger than largest double, then we have
 742 // overflow
 743 // On entry fS, fP and fSm1 already hold the main-path intermediates.
 744 { .mfi
 745       mov             rGt_ln  = 0x103ff // Exponent for largest dbl + 1 ulp
 746       fsetc.s2        0x7F,0x42         // Get user's round mode, set wre
 747       nop.i           0
 748 }
 749 ;;
 750
 751 { .mfi
 752       setf.exp        fGt_pln = rGt_ln  // Create largest double + 1 ulp
 753       fma.d.s2        fWre_urm_f8 = fS, fP, fSm1  // Result with wre set
 754       nop.i           0
 755 }
 756 ;;
 757
 758 { .mfi
 759       nop.m           0
 760       fsetc.s2        0x7F,0x40                   // Turn off wre in sf2
 761       nop.i           0
 762 }
 763 ;;
 764
 765 { .mfi
 766       nop.m           0
 767       fcmp.ge.s1      p6, p0 =  fWre_urm_f8, fGt_pln // Test for overflow
 768       nop.i           0
 769 }
 770 ;;
 771
 772 { .mfb
 773       nop.m           0
 774       nop.f           0
 775 (p6)  br.cond.spnt    EXPM1_CERTAIN_OVERFLOW // Branch if overflow
 776 }
 777 ;;
 778
 779 { .mfb
 780       nop.m           0
 781       fma.d.s0        f8 = fS, fP, fSm1      // Same expression as the normal path exit
 782       br.ret.sptk     b0                     // Exit if really no overflow
 783 }
 784 ;;
 785
 786 EXPM1_CERTAIN_OVERFLOW: // Reached for x >= fMIN_DBL_OFLOW_ARG or from EXPM1_POSSIBLE_OVERFLOW
 787 { .mmi
 788       sub             rTmp = rExp_mask, r0, 1 // rTmp = rExp_mask - 1 = 0x1fffe
 789 ;;
 790       setf.exp        fTmp = rTmp // Very large value whose square overflows a double
 791       nop.i           0
 792 }
 793 ;;
 794
 795 { .mfi
 796       alloc           r32=ar.pfs,1,4,4,0 // Frame: 1 input, 4 locals, 4 outputs
 797       fmerge.s        FR_X = f8,f8 // Keep x for the error handler before f8 is overwritten
 798       nop.i           0
 799 }
 800 { .mfb
 801       mov             GR_Parameter_TAG = 41 // Error tag; presumably expm1 overflow - confirm in libm_error_codes.h
 802       fma.d.s0        FR_RESULT = fTmp, fTmp, f0    // Set I,O and +INF result
 803       br.cond.sptk    __libm_error_region
 804 }
 805 ;;
 806
 807 // Here if x unorm: refresh the exponent from the normalized copy, then rejoin
 808 EXPM1_UNORM:
 809 { .mfb
 810       getf.exp        rSignexp_x = fNormX    // Must recompute if x unorm
 811       fcmp.eq.s0      p6, p0 = f8, f0        // Set D flag
 812       br.cond.sptk    EXPM1_COMMON
 813 }
 814 ;;
 815
 816 // Here if result will be -1 and inexact, x <= -48.0
 817 EXPM1_CERTAIN_MINUS_ONE:
 818 { .mmi
 819       mov             rTmp = 1 // Smallest nonzero exponent field
 820 ;;
 821       setf.exp        fTmp = rTmp // Tiny positive value
 822       nop.i           0
 823 }
 824 ;;
 825
 826 { .mfb
 827       nop.m           0
 828       fms.d.s0        FR_RESULT = fTmp, fTmp, f1 // Set I, rounded -1+eps result
 829       br.ret.sptk     b0
 830 }
 831 ;;
 832
 833 GLOBAL_IEEE754_END(expm1)
 834
 835
 836 LOCAL_LIBM_ENTRY(__libm_error_region)
 837 .prologue
 838 { .mfi
 839         add   GR_Parameter_Y=-32,sp             // Parameter 2 slot: old sp - 32 (new sp + 32)
 840         nop.f 0
 841 .save   ar.pfs,GR_SAVE_PFS
 842         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
 843 }
 844 { .mfi
 845 .fframe 64
 846         add sp=-64,sp                           // Create new 64-byte stack frame
 847         nop.f 0
 848         mov GR_SAVE_GP=gp                       // Save gp
 849 };;
 850 { .mmi
 851         stfd [GR_Parameter_Y] = FR_Y,16         // STORE Parameter 2 at sp+32, advance to sp+48
 852         add GR_Parameter_X = 16,sp              // Parameter 1 address (sp+16)
 853 .save   b0, GR_SAVE_B0
 854         mov GR_SAVE_B0=b0                       // Save b0
 855 };;
 856 .body
 857 { .mib
 858         stfd [GR_Parameter_X] = FR_X            // STORE Parameter 1 on stack
 859         add   GR_Parameter_RESULT = 0,GR_Parameter_Y  // Parameter 3 address (sp+48)
 860         nop.b 0
 861 }
 862 { .mib
 863         stfd [GR_Parameter_Y] = FR_RESULT       // STORE Parameter 3 on stack
 864         add   GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address (sp+32)
 865         br.call.sptk b0=__libm_error_support#   // Call error handling function
 866 };;
 867 { .mmi
 868         add   GR_Parameter_RESULT = 48,sp      // Recompute result address after the call
 869         nop.m 0
 870         nop.i 0
 871 };;
 872 { .mmi
 873         ldfd  f8 = [GR_Parameter_RESULT]       // Get return result off stack
 874 .restore sp
 875         add   sp = 64,sp                       // Restore stack pointer
 876         mov   b0 = GR_SAVE_B0                  // Restore return address
 877 };;
 878 { .mib
 879         mov   gp = GR_SAVE_GP                  // Restore gp
 880         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
 881         br.ret.sptk     b0                     // Return
 882 };;
 883
 884 LOCAL_LIBM_END(__libm_error_region)
 885 .type   __libm_error_support#,@function
 886 .global __libm_error_support#