/* IEEE-754 single-precision functions for Xtensa
   Copyright (C) 2006-2024 Free Software Foundation, Inc.
   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifdef __XTENSA_EB__
#define xh a2
#define xl a3
#define yh a4
#define yl a5
#else
#define xh a3
#define xl a2
#define yh a5
#define yl a4
#endif

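/* Throughout this file, float values are handled directly in their
   IEEE-754 binary32 bit representation.  As a reference, a minimal C
   sketch of that layout (hypothetical helper names, not used by this
   code):

	#include <stdint.h>

	// binary32: 1 sign bit, 8 exponent bits (bias 127), 23 mantissa
	// bits with an implicit leading "1.0" when the exponent field
	// is nonzero.
	static inline uint32_t sf_sign (uint32_t x) { return x >> 31; }
	static inline uint32_t sf_exp (uint32_t x)  { return (x >> 23) & 0xff; }
	static inline uint32_t sf_mant (uint32_t x) { return x & 0x7fffff; }

	// NaN or Infinity: exponent field all ones (mask 0x7f800000).
	static inline int sf_nan_or_inf (uint32_t x)
	{
	  return (x & 0x7f800000) == 0x7f800000;
	}
 */
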
/* Warning!  The branch displacements for some Xtensa branch instructions
   are quite small, and this code has been carefully laid out to keep
   branch targets in range.  If you change anything, be sure to check that
   the assembler is not relaxing anything to branch over a jump.  */

#ifdef L_negsf2

	.align	4
	.global	__negsf2
	.type	__negsf2, @function
__negsf2:
	leaf_entry sp, 16
	movi	a4, 0x80000000
	xor	a2, a2, a4
	leaf_return

#endif /* L_negsf2 */

#ifdef L_addsubsf3

	.literal_position
/* Addition */
__addsf3_aux:

	/* Handle NaNs and Infinities.  (This code is placed before the
	   start of the function just to keep it in range of the limited
	   branch displacements.)  */

.Ladd_xnan_or_inf:
	/* If y is neither Infinity nor NaN, return x.  */
	bnall	a3, a6, .Ladd_return_nan_or_inf
	/* If x is a NaN, return it.  Otherwise, return y.  */
	slli	a7, a2, 9
	bnez	a7, .Ladd_return_nan

.Ladd_ynan_or_inf:
	/* Return y.  */
	mov	a2, a3

.Ladd_return_nan_or_inf:
	slli	a7, a2, 9
	bnez	a7, .Ladd_return_nan
	leaf_return

.Ladd_return_nan:
	movi	a6, 0x400000	/* make it a quiet NaN */
	or	a2, a2, a6
	leaf_return

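/* The 0x400000 constant above sets the most-significant mantissa bit,
   which is what distinguishes a quiet NaN from a signaling one in the
   usual encoding.  In C terms (illustrative sketch only):

	#include <stdint.h>

	// Quiet a NaN by setting the top mantissa bit, as the
	// movi/or pair above does.
	static inline uint32_t sf_quiet_nan (uint32_t nan_bits)
	{
	  return nan_bits | 0x400000;
	}
 */
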
.Ladd_opposite_signs:
	/* Operand signs differ.  Do a subtraction.  */
	slli	a7, a6, 8
	xor	a3, a3, a7
	j	.Lsub_same_sign

	.align	4
	.global	__addsf3
	.type	__addsf3, @function
__addsf3:
	leaf_entry sp, 16
	movi	a6, 0x7f800000

	/* Check if the two operands have the same sign.  */
	xor	a7, a2, a3
	bltz	a7, .Ladd_opposite_signs

.Ladd_same_sign:
	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
	ball	a2, a6, .Ladd_xnan_or_inf
	ball	a3, a6, .Ladd_ynan_or_inf

	/* Compare the exponents.  The smaller operand will be shifted
	   right by the exponent difference and added to the larger
	   one.  */
	extui	a7, a2, 23, 9
	extui	a8, a3, 23, 9
	bltu	a7, a8, .Ladd_shiftx

.Ladd_shifty:
	/* Check if the smaller (or equal) exponent is zero.  */
	bnone	a3, a6, .Ladd_yexpzero

	/* Replace y sign/exponent with 0x008.  */
	or	a3, a3, a6
	slli	a3, a3, 8
	srli	a3, a3, 8

.Ladd_yexpdiff:
	/* Compute the exponent difference.  */
	sub	a10, a7, a8

	/* Exponent difference > 32 -- just return the bigger value.  */
	bgeui	a10, 32, 1f

	/* Shift y right by the exponent difference.  Any bits that are
	   shifted out of y are saved in a9 for rounding the result.  */
	ssr	a10
	movi	a9, 0
	src	a9, a3, a9
	srl	a3, a3

	/* Do the addition.  */
	add	a2, a2, a3

	/* Check if the add overflowed into the exponent.  */
	extui	a10, a2, 23, 9
	beq	a10, a7, .Ladd_round
	mov	a8, a7
	j	.Ladd_carry

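/* The alignment arithmetic above, restated as a C sketch (hypothetical
   names; "yfrac" has its sign/exponent replaced by the explicit "1.0"
   at bit 23, as just done for a3, and "rest" collects the bits shifted
   out of y, like a9):

	#include <stdint.h>

	// Align y to x's exponent and add the mantissas.  Bits shifted
	// out of y are kept in *rest for the rounding step.
	static uint32_t align_and_add (uint32_t xfrac, uint32_t yfrac,
				       unsigned expdiff, uint32_t *rest)
	{
	  if (expdiff >= 32)	// y is too small to affect the sum
	    return xfrac;
	  *rest = expdiff ? yfrac << (32 - expdiff) : 0;
	  return xfrac + (yfrac >> expdiff);
	}
 */
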
.Ladd_yexpzero:
	/* y is a subnormal value.  Replace its sign/exponent with zero,
	   i.e., no implicit "1.0", and increment the apparent exponent
	   because subnormals behave as if they had the minimum (nonzero)
	   exponent.  Test for the case when both exponents are zero.  */
	slli	a3, a3, 9
	srli	a3, a3, 9
	bnone	a2, a6, .Ladd_bothexpzero
	addi	a8, a8, 1
	j	.Ladd_yexpdiff

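/* For a subnormal, the stored exponent field is zero but the effective
   exponent is the minimum normal exponent, 1; the value represented is
   mantissa * 2^(1 - 127 - 23).  That is why the apparent exponent is
   incremented by one above rather than an implicit "1.0" being
   inserted.  */
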
.Ladd_bothexpzero:
	/* Both exponents are zero.  Handle this as a special case.  There
	   is no need to shift or round, and the normal code for handling
	   a carry into the exponent field will not work because it
	   assumes there is an implicit "1.0" that needs to be added.  */
	add	a2, a2, a3
1:	leaf_return

.Ladd_xexpzero:
	/* Same as "yexpzero" except skip handling the case when both
	   exponents are zero.  */
	slli	a2, a2, 9
	srli	a2, a2, 9
	addi	a7, a7, 1
	j	.Ladd_xexpdiff

.Ladd_shiftx:
	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
	   because the exponent difference is always nonzero in this version,
	   the shift sequence can use SLL and skip loading a constant zero.  */
	bnone	a2, a6, .Ladd_xexpzero

	or	a2, a2, a6
	slli	a2, a2, 8
	srli	a2, a2, 8

.Ladd_xexpdiff:
	sub	a10, a8, a7
	bgeui	a10, 32, .Ladd_returny

	ssr	a10
	sll	a9, a2
	srl	a2, a2

	add	a2, a2, a3

	/* Check if the add overflowed into the exponent.  */
	extui	a10, a2, 23, 9
	bne	a10, a8, .Ladd_carry

.Ladd_round:
	/* Round up if the leftover fraction is >= 1/2.  */
	bgez	a9, 1f
	addi	a2, a2, 1

	/* Check if the leftover fraction is exactly 1/2.  */
	slli	a9, a9, 1
	beqz	a9, .Ladd_exactlyhalf
1:	leaf_return

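/* The rounding step above is IEEE round-to-nearest-even.  The same
   decision as a C sketch ("mant" is the preliminary result and "rest"
   holds the shifted-out fraction, msb first, like a9):

	#include <stdint.h>

	static uint32_t round_nearest_even (uint32_t mant, uint32_t rest)
	{
	  if (rest & 0x80000000u)	// leftover fraction >= 1/2
	    {
	      mant += 1;
	      if ((rest << 1) == 0)	// exactly 1/2: round to even
		mant &= ~1u;
	    }
	  return mant;
	}
 */
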
.Ladd_returny:
	mov	a2, a3
	leaf_return

.Ladd_carry:
	/* The addition has overflowed into the exponent field, so the
	   value needs to be renormalized.  The mantissa of the result
	   can be recovered by subtracting the original exponent and
	   adding 0x800000 (which is the explicit "1.0" for the
	   mantissa of the non-shifted operand -- the "1.0" for the
	   shifted operand was already added).  The mantissa can then
	   be shifted right by one bit.  The explicit "1.0" of the
	   shifted mantissa then needs to be replaced by the exponent,
	   incremented by one to account for the normalizing shift.
	   It is faster to combine these operations: do the shift first
	   and combine the additions and subtractions.  If x is the
	   original exponent, the result is:
	       shifted mantissa - (x << 22) + (1 << 22) + (x << 23)
	   or:
	       shifted mantissa + ((x + 1) << 22)
	   Note that the exponent is incremented here by leaving the
	   explicit "1.0" of the mantissa in the exponent field.  */

	/* Shift x right by one bit.  Save the lsb.  */
	mov	a10, a2
	srli	a2, a2, 1

	/* See explanation above.  The original exponent is in a8.  */
	addi	a8, a8, 1
	slli	a8, a8, 22
	add	a2, a2, a8

	/* Return an Infinity if the exponent overflowed.  */
	ball	a2, a6, .Ladd_infinity

	/* Same thing as the "round" code except the msb of the leftover
	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
	bbci.l	a10, 0, 1f
	addi	a2, a2, 1
	beqz	a9, .Ladd_exactlyhalf
1:	leaf_return

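/* Checking the arithmetic in the comment above: (x << 23) - (x << 22)
   equals x << 22, so
       - (x << 22) + (1 << 22) + (x << 23) = (x + 1) << 22,
   which is exactly the combined constant added by the addi/slli/add
   sequence.  */
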
.Ladd_infinity:
	/* Clear the mantissa.  */
	srli	a2, a2, 23
	slli	a2, a2, 23

	/* The sign bit may have been lost in a carry-out.  Put it back.  */
	slli	a8, a8, 1
	or	a2, a2, a8
	leaf_return

.Ladd_exactlyhalf:
	/* Round down to the nearest even value.  */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return


/* Subtraction */
__subsf3_aux:

	/* Handle NaNs and Infinities.  (This code is placed before the
	   start of the function just to keep it in range of the limited
	   branch displacements.)  */

.Lsub_xnan_or_inf:
	/* If y is neither Infinity nor NaN, return x.  */
	bnall	a3, a6, .Lsub_return_nan_or_inf
	/* Both x and y are either NaN or Inf, so the result is NaN.  */

.Lsub_return_nan:
	movi	a4, 0x400000	/* make it a quiet NaN */
	or	a2, a2, a4
	leaf_return

.Lsub_ynan_or_inf:
	/* Negate y and return it.  */
	slli	a7, a6, 8
	xor	a2, a3, a7

.Lsub_return_nan_or_inf:
	slli	a7, a2, 9
	bnez	a7, .Lsub_return_nan
	leaf_return

.Lsub_opposite_signs:
	/* Operand signs differ.  Do an addition.  */
	slli	a7, a6, 8
	xor	a3, a3, a7
	j	.Ladd_same_sign

	.align	4
	.global	__subsf3
	.type	__subsf3, @function
__subsf3:
	leaf_entry sp, 16
	movi	a6, 0x7f800000

	/* Check if the two operands have the same sign.  */
	xor	a7, a2, a3
	bltz	a7, .Lsub_opposite_signs

.Lsub_same_sign:
	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
	ball	a2, a6, .Lsub_xnan_or_inf
	ball	a3, a6, .Lsub_ynan_or_inf

	/* Compare the operands.  In contrast to addition, the entire
	   value matters here.  */
	extui	a7, a2, 23, 8
	extui	a8, a3, 23, 8
	bltu	a2, a3, .Lsub_xsmaller

.Lsub_ysmaller:
	/* Check if the smaller (or equal) exponent is zero.  */
	bnone	a3, a6, .Lsub_yexpzero

	/* Replace y sign/exponent with 0x008.  */
	or	a3, a3, a6
	slli	a3, a3, 8
	srli	a3, a3, 8

.Lsub_yexpdiff:
	/* Compute the exponent difference.  */
	sub	a10, a7, a8

	/* Exponent difference > 32 -- just return the bigger value.  */
	bgeui	a10, 32, 1f

	/* Shift y right by the exponent difference.  Any bits that are
	   shifted out of y are saved in a9 for rounding the result.  */
	ssr	a10
	movi	a9, 0
	src	a9, a3, a9
	srl	a3, a3

	sub	a2, a2, a3

	/* Subtract the leftover bits in a9 from zero and propagate any
	   borrow from a2.  */
	neg	a9, a9
	addi	a10, a2, -1
	movnez	a2, a10, a9

	/* Check if the subtract underflowed into the exponent.  */
	extui	a10, a2, 23, 8
	beq	a10, a7, .Lsub_round
	j	.Lsub_borrow

.Lsub_yexpzero:
	/* Return zero if the inputs are equal.  (For the non-subnormal
	   case, subtracting the "1.0" will cause a borrow from the exponent
	   and this case can be detected when handling the borrow.)  */
	beq	a2, a3, .Lsub_return_zero

	/* y is a subnormal value.  Replace its sign/exponent with zero,
	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
	   y's apparent exponent because subnormals behave as if they had
	   the minimum (nonzero) exponent.  */
	slli	a3, a3, 9
	srli	a3, a3, 9
	bnone	a2, a6, .Lsub_yexpdiff
	addi	a8, a8, 1
	j	.Lsub_yexpdiff

.Lsub_returny:
	/* Negate and return y.  */
	slli	a7, a6, 8
	xor	a2, a3, a7
1:	leaf_return

.Lsub_xsmaller:
	/* Same thing as the "ysmaller" code, but with x and y swapped and
	   with y negated.  */
	bnone	a2, a6, .Lsub_xexpzero

	or	a2, a2, a6
	slli	a2, a2, 8
	srli	a2, a2, 8

.Lsub_xexpdiff:
	sub	a10, a8, a7
	bgeui	a10, 32, .Lsub_returny

	ssr	a10
	movi	a9, 0
	src	a9, a2, a9
	srl	a2, a2

	/* Negate y.  */
	slli	a11, a6, 8
	xor	a3, a3, a11

	sub	a2, a3, a2

	neg	a9, a9
	addi	a10, a2, -1
	movnez	a2, a10, a9

	/* Check if the subtract underflowed into the exponent.  */
	extui	a10, a2, 23, 8
	bne	a10, a8, .Lsub_borrow

.Lsub_round:
	/* Round up if the leftover fraction is >= 1/2.  */
	bgez	a9, 1f
	addi	a2, a2, 1

	/* Check if the leftover fraction is exactly 1/2.  */
	slli	a9, a9, 1
	beqz	a9, .Lsub_exactlyhalf
1:	leaf_return

.Lsub_xexpzero:
	/* Same as "yexpzero".  */
	beq	a2, a3, .Lsub_return_zero
	slli	a2, a2, 9
	srli	a2, a2, 9
	bnone	a3, a6, .Lsub_xexpdiff
	addi	a7, a7, 1
	j	.Lsub_xexpdiff

.Lsub_return_zero:
	movi	a2, 0
	leaf_return

.Lsub_borrow:
	/* The subtraction has underflowed into the exponent field, so the
	   value needs to be renormalized.  Shift the mantissa left as
	   needed to remove any leading zeros and adjust the exponent
	   accordingly.  If the exponent is not large enough to remove
	   all the leading zeros, the result will be a subnormal value.  */

	slli	a8, a2, 9
	beqz	a8, .Lsub_xzero
	do_nsau	a6, a8, a7, a11
	srli	a8, a8, 9
	bge	a6, a10, .Lsub_subnormal
	addi	a6, a6, 1

.Lsub_normalize_shift:
	/* Shift the mantissa (a8/a9) left by a6.  */
	ssl	a6
	src	a8, a8, a9
	sll	a9, a9

	/* Combine the shifted mantissa with the sign and exponent,
	   decrementing the exponent by a6.  (The exponent has already
	   been decremented by one due to the borrow from the subtraction,
	   but adding the mantissa will increment the exponent by one.)  */
	srli	a2, a2, 23
	sub	a2, a2, a6
	slli	a2, a2, 23
	add	a2, a2, a8
	j	.Lsub_round

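/* In C terms, the renormalization above amounts to counting leading
   zeros (do_nsau maps to Xtensa's NSAU instruction) and shifting left
   by that amount while decrementing the exponent.  A sketch, with the
   guard bits and the subnormal clamp omitted:

	#include <stdint.h>

	// Shift the mantissa left until its leading 1 is back at
	// bit 23, adjusting the exponent to match.
	static uint32_t renormalize (uint32_t mant, int *exp)
	{
	  while (mant != 0 && (mant & 0x800000) == 0)
	    {
	      mant <<= 1;
	      (*exp)--;
	    }
	  return mant;
	}
 */
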
.Lsub_exactlyhalf:
	/* Round down to the nearest even value.  */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return

.Lsub_xzero:
	/* If there was a borrow from the exponent, and the mantissa and
	   guard digits are all zero, then the inputs were equal and the
	   result should be zero.  */
	beqz	a9, .Lsub_return_zero

	/* Only the guard digit is nonzero.  Shift by min(24, a10).  */
	addi	a11, a10, -24
	movi	a6, 24
	movltz	a6, a10, a11
	j	.Lsub_normalize_shift

.Lsub_subnormal:
	/* The exponent is too small to shift away all the leading zeros.
	   Set a6 to the current exponent (which has already been
	   decremented by the borrow) so that the exponent of the result
	   will be zero.  Do not add 1 to a6 in this case, because: (1)
	   adding the mantissa will not increment the exponent, so there is
	   no need to subtract anything extra from the exponent to
	   compensate, and (2) the effective exponent of a subnormal is 1
	   not 0 so the shift amount must be 1 smaller than normal.  */
	mov	a6, a10
	j	.Lsub_normalize_shift

#endif /* L_addsubsf3 */

#ifdef L_mulsf3

/* Multiplication */
#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 1
#endif

	.literal_position
__mulsf3_aux:

	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
	   (This code is placed before the start of the function just to
	   keep it in range of the limited branch displacements.)  */

.Lmul_xexpzero:
	/* Clear the sign bit of x.  */
	slli	a2, a2, 1
	srli	a2, a2, 1

	/* If x is zero, return zero.  */
	beqz	a2, .Lmul_return_zero

	/* Normalize x.  Adjust the exponent in a8.  */
	do_nsau	a10, a2, a11, a12
	addi	a10, a10, -8
	ssl	a10
	sll	a2, a2
	movi	a8, 1
	sub	a8, a8, a10
	j	.Lmul_xnormalized

.Lmul_yexpzero:
	/* Clear the sign bit of y.  */
	slli	a3, a3, 1
	srli	a3, a3, 1

	/* If y is zero, return zero.  */
	beqz	a3, .Lmul_return_zero

	/* Normalize y.  Adjust the exponent in a9.  */
	do_nsau	a10, a3, a11, a12
	addi	a10, a10, -8
	ssl	a10
	sll	a3, a3
	movi	a9, 1
	sub	a9, a9, a10
	j	.Lmul_ynormalized

.Lmul_return_zero:
	/* Return zero with the appropriate sign bit.  */
	srli	a2, a7, 31
	slli	a2, a2, 31
	j	.Lmul_done

.Lmul_xnan_or_inf:
	/* If y is zero, return NaN.  */
	slli	a8, a3, 1
	beqz	a8, .Lmul_return_nan
	/* If y is NaN, return y.  */
	bnall	a3, a6, .Lmul_returnx
	slli	a8, a3, 9
	beqz	a8, .Lmul_returnx

.Lmul_returny:
	mov	a2, a3

.Lmul_returnx:
	slli	a8, a2, 9
	bnez	a8, .Lmul_return_nan
	/* Set the sign bit and return.  */
	extui	a7, a7, 31, 1
	slli	a2, a2, 1
	ssai	1
	src	a2, a7, a2
	j	.Lmul_done

.Lmul_ynan_or_inf:
	/* If x is zero, return NaN.  */
	slli	a8, a2, 1
	bnez	a8, .Lmul_returny
	mov	a2, a3

.Lmul_return_nan:
	movi	a4, 0x400000	/* make it a quiet NaN */
	or	a2, a2, a4
	j	.Lmul_done

	.align	4
	.global	__mulsf3
	.type	__mulsf3, @function
__mulsf3:
#if __XTENSA_CALL0_ABI__
	leaf_entry sp, 32
	addi	sp, sp, -32
	s32i	a12, sp, 16
	s32i	a13, sp, 20
	s32i	a14, sp, 24
	s32i	a15, sp, 28
#elif XCHAL_NO_MUL
	/* This is not really a leaf function; allocate enough stack space
	   to allow CALL12s to a helper function.  */
	leaf_entry sp, 64
#else
	leaf_entry sp, 32
#endif
	movi	a6, 0x7f800000

	/* Get the sign of the result.  */
	xor	a7, a2, a3

	/* Check for NaN and infinity.  */
	ball	a2, a6, .Lmul_xnan_or_inf
	ball	a3, a6, .Lmul_ynan_or_inf

	/* Extract the exponents.  */
	extui	a8, a2, 23, 8
	extui	a9, a3, 23, 8

	beqz	a8, .Lmul_xexpzero
.Lmul_xnormalized:
	beqz	a9, .Lmul_yexpzero
.Lmul_ynormalized:

	/* Add the exponents.  */
	add	a8, a8, a9

	/* Replace sign/exponent fields with explicit "1.0".  */
	movi	a10, 0xffffff
	or	a2, a2, a6
	and	a2, a2, a10
	or	a3, a3, a6
	and	a3, a3, a10

	/* Multiply 32x32 to 64 bits.  The result ends up in a2/a6.  */

#if XCHAL_HAVE_MUL32_HIGH

	mull	a6, a2, a3
	muluh	a2, a2, a3

#else

	/* Break the inputs into 16-bit chunks and compute 4 32-bit partial
	   products.  These partial products are:

		0 xl * yl

		1 xl * yh
		2 xh * yl

		3 xh * yh

	   If using the Mul16 or Mul32 multiplier options, these input
	   chunks must be stored in separate registers.  For Mac16, the
	   UMUL.AA.* opcodes can specify that the inputs come from either
	   half of the registers, so there is no need to shift them out
	   ahead of time.  If there is no multiply hardware, the 16-bit
	   chunks can be extracted when setting up the arguments to the
	   separate multiply function.  */

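/* The partial-product scheme just described, written out in C (a
   sketch of the arithmetic only; the assembly below also tracks the
   carry out of pp1 + pp2 explicitly because it works in 32-bit
   registers):

	#include <stdint.h>

	// 32x32 -> 64 bit multiply built from four 16x16 -> 32 bit
	// partial products.
	static uint64_t mul32x32 (uint32_t x, uint32_t y)
	{
	  uint32_t xl = x & 0xffff, xh = x >> 16;
	  uint32_t yl = y & 0xffff, yh = y >> 16;
	  uint64_t pp0 = (uint64_t) xl * yl;
	  uint64_t pp1 = (uint64_t) xl * yh;
	  uint64_t pp2 = (uint64_t) xh * yl;
	  uint64_t pp3 = (uint64_t) xh * yh;
	  return pp0 + ((pp1 + pp2) << 16) + (pp3 << 32);
	}
 */
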
#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* Calling a separate multiply function will clobber a0 and requires
	   use of a8 as a temporary, so save those values now.  (The function
	   uses a custom ABI so nothing else needs to be saved.)  */
	s32i	a0, sp, 0
	s32i	a8, sp, 4
#endif

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32

#define a2h a4
#define a3h a5

	/* Get the high halves of the inputs into registers.  */
	srli	a2h, a2, 16
	srli	a3h, a3, 16

#define a2l a2
#define a3l a3

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/* Clear the high halves of the inputs.  This does not matter
	   for MUL16 because the high bits are ignored.  */
	extui	a2, a2, 0, 16
	extui	a3, a3, 0, 16
#endif
#endif /* MUL16 || MUL32 */


#if XCHAL_HAVE_MUL16

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/* The preprocessor insists on inserting a space when concatenating after
   a period in the definition of do_mul below.  These macros are a workaround
   using underscores instead of periods when doing the concatenation.  */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
	rsr	dst, ACCLO

#else /* no multiply hardware */

#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \
	srli	dst, src, 16

#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \
	mov	dst, a12
#else
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \
	mov	dst, a14
#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
	do_mul(a6, a2, l, a3, h)	/* pp 1 */
	do_mul(a11, a2, h, a3, l)	/* pp 2 */
	movi	a9, 0
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Shift the high half of a9/a6 into position in a9.  Note that
	   this value can be safely incremented without any carry-outs.  */
	ssai	16
	src	a9, a9, a6

	/* Compute the low word into a6.  */
	do_mul(a11, a2, l, a3, l)	/* pp 0 */
	sll	a6, a6
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Compute the high word into a2.  */
	do_mul(a2, a2, h, a3, h)	/* pp 3 */
	add	a2, a2, a9

#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* Restore values saved on the stack during the multiplication.  */
	l32i	a0, sp, 0
	l32i	a8, sp, 4
#endif
#endif /* ! XCHAL_HAVE_MUL32_HIGH */

	/* Shift left by 9 bits, unless there was a carry-out from the
	   multiply, in which case, shift by 8 bits and increment the
	   exponent.  */
	movi	a4, 9
	srli	a5, a2, 24 - 9
	beqz	a5, 1f
	addi	a4, a4, -1
	addi	a8, a8, 1
1:	ssl	a4
	src	a2, a2, a6
	sll	a6, a6

	/* Subtract the extra bias from the exponent sum (plus one to account
	   for the explicit "1.0" of the mantissa that will be added to the
	   exponent in the final result).  */
	movi	a4, 0x80
	sub	a8, a8, a4

	/* Check for over/underflow.  The value in a8 is one less than the
	   final exponent, so values in the range 0..fd are OK here.  */
	movi	a4, 0xfe
	bgeu	a8, a4, .Lmul_overflow

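/* In terms of biased exponents ea and eb, the product's biased exponent
   is ea + eb - 127.  After the subtraction above, a8 = ea + eb - 0x80,
   i.e., exactly one less than the final exponent field; the difference
   is made up when the mantissa's explicit "1.0" is added into the
   exponent field at .Lmul_rounded below.  */
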
.Lmul_round:
	/* Round.  */
	bgez	a6, .Lmul_rounded
	addi	a2, a2, 1
	slli	a6, a6, 1
	beqz	a6, .Lmul_exactlyhalf

.Lmul_rounded:
	/* Add the exponent to the mantissa.  */
	slli	a8, a8, 23
	add	a2, a2, a8

.Lmul_addsign:
	/* Add the sign bit.  */
	srli	a7, a7, 31
	slli	a7, a7, 31
	or	a2, a2, a7

.Lmul_done:
#if __XTENSA_CALL0_ABI__
	l32i	a12, sp, 16
	l32i	a13, sp, 20
	l32i	a14, sp, 24
	l32i	a15, sp, 28
	addi	sp, sp, 32
#endif
	leaf_return

.Lmul_exactlyhalf:
	/* Round down to the nearest even value.  */
	srli	a2, a2, 1
	slli	a2, a2, 1
	j	.Lmul_rounded

.Lmul_overflow:
	bltz	a8, .Lmul_underflow
	/* Return +/- Infinity.  */
	movi	a8, 0xff
	slli	a2, a8, 23
	j	.Lmul_addsign

.Lmul_underflow:
	/* Create a subnormal value, where the exponent field contains zero,
	   but the effective exponent is 1.  The value of a8 is one less than
	   the actual exponent, so just negate it to get the shift amount.  */
	neg	a8, a8
	mov	a9, a6
	ssr	a8
	bgeui	a8, 32, .Lmul_flush_to_zero

	/* Shift a2 right.  Any bits that are shifted out of a2 are saved
	   in a6 (combined with the shifted-out bits currently in a6) for
	   rounding the result.  */
	sll	a6, a2
	srl	a2, a2

	/* Set the exponent to zero.  */
	movi	a8, 0

	/* Pack any nonzero bits shifted out into a6.  */
	beqz	a9, .Lmul_round
	movi	a9, 1
	or	a6, a6, a9
	j	.Lmul_round

.Lmul_flush_to_zero:
	/* Return zero with the appropriate sign bit.  */
	srli	a2, a7, 31
	slli	a2, a2, 31
	j	.Lmul_done

#if XCHAL_NO_MUL

	/* For Xtensa processors with no multiply hardware, this simplified
	   version of _mulsi3 is used for multiplying 16-bit chunks of
	   the floating-point mantissas.  When using CALL0, this function
	   uses a custom ABI: the inputs are passed in a13 and a14, the
	   result is returned in a12, and a8 and a15 are clobbered.  */
	.align	4
.Lmul_mulsi3:
	leaf_entry sp, 16
	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	movi	\dst, 0
1:	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

	srli	\src1, \src1, 4
	slli	\src2, \src2, 4
	bnez	\src1, 1b
	.endm
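
/* The macro above is a shift-and-add multiplier that retires four bits
   of src1 per iteration, conditionally accumulating src2, 2*src2,
   4*src2 and 8*src2.  An equivalent C sketch:

	#include <stdint.h>

	static uint32_t mulsi3 (uint32_t src1, uint32_t src2)
	{
	  uint32_t dst = 0;
	  while (src1 != 0)
	    {
	      if (src1 & 1) dst += src2;
	      if (src1 & 2) dst += src2 << 1;
	      if (src1 & 4) dst += src2 << 2;
	      if (src1 & 8) dst += src2 << 3;
	      src1 >>= 4;	// next four bits of the multiplier
	      src2 <<= 4;	// scale the multiplicand to match
	    }
	  return dst;
	}
 */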
#if __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
#else
	/* The result will be written into a2, so save that argument in a4.  */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif
	leaf_return
#endif /* XCHAL_NO_MUL */
#endif /* L_mulsf3 */

#ifdef L_divsf3

/* Division */

#if XCHAL_HAVE_FP_DIV

	.align	4
	.global	__divsf3
	.type	__divsf3, @function
__divsf3:
	leaf_entry sp, 16

	wfr	f1, a2	/* dividend */
	wfr	f2, a3	/* divisor */

	div0.s	f3, f2
	nexp01.s	f4, f2
	const.s	f5, 1
	maddn.s	f5, f4, f3
	mov.s	f6, f3
	mov.s	f7, f2
	nexp01.s	f2, f1
	maddn.s	f6, f5, f6
	const.s	f5, 1
	const.s	f0, 0
	neg.s	f8, f2
	maddn.s	f5, f4, f6
	maddn.s	f0, f8, f3
	mkdadj.s	f7, f1
	maddn.s	f6, f5, f6
	maddn.s	f8, f4, f0
	const.s	f3, 1
	maddn.s	f3, f4, f6
	maddn.s	f0, f8, f6
	neg.s	f2, f2
	maddn.s	f6, f3, f6
	maddn.s	f2, f4, f0
	addexpm.s	f0, f7
	addexp.s	f6, f7
	divn.s	f0, f2, f6

	rfr	a2, f0

	leaf_return

#else

	.literal_position
__divsf3_aux:

	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
	   (This code is placed before the start of the function just to
	   keep it in range of the limited branch displacements.)  */

.Ldiv_yexpzero:
	/* Clear the sign bit of y.  */
	slli	a3, a3, 1
	srli	a3, a3, 1

	/* Check for division by zero.  */
	beqz	a3, .Ldiv_yzero

	/* Normalize y.  Adjust the exponent in a9.  */
	do_nsau	a10, a3, a4, a5
	addi	a10, a10, -8
	ssl	a10
	sll	a3, a3
	movi	a9, 1
	sub	a9, a9, a10
	j	.Ldiv_ynormalized

.Ldiv_yzero:
	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
	slli	a4, a2, 1
	srli	a4, a4, 1
	srli	a2, a7, 31
	slli	a2, a2, 31
	or	a2, a2, a6
	bnez	a4, 1f
	movi	a4, 0x400000	/* make it a quiet NaN */
	or	a2, a2, a4
1:	leaf_return

.Ldiv_xexpzero:
	/* Clear the sign bit of x.  */
	slli	a2, a2, 1
	srli	a2, a2, 1

	/* If x is zero, return zero.  */
	beqz	a2, .Ldiv_return_zero

	/* Normalize x.  Adjust the exponent in a8.  */
	do_nsau	a10, a2, a4, a5
	addi	a10, a10, -8
	ssl	a10
	sll	a2, a2
	movi	a8, 1
	sub	a8, a8, a10
	j	.Ldiv_xnormalized

.Ldiv_return_zero:
	/* Return zero with the appropriate sign bit.  */
	srli	a2, a7, 31
	slli	a2, a2, 31
	leaf_return

.Ldiv_xnan_or_inf:
	/* Set the sign bit of the result.  */
	srli	a7, a3, 31
	slli	a7, a7, 31
	xor	a2, a2, a7
	/* If y is NaN or Inf, return NaN.  */
	ball	a3, a6, .Ldiv_return_nan
	slli	a7, a2, 9
	bnez	a7, .Ldiv_return_nan
	leaf_return

.Ldiv_ynan_or_inf:
	/* If y is Infinity, return zero.  */
	slli	a8, a3, 9
	beqz	a8, .Ldiv_return_zero
	/* y is NaN; return it.  */
	mov	a2, a3

.Ldiv_return_nan:
	movi	a4, 0x400000	/* make it a quiet NaN */
	or	a2, a2, a4
	leaf_return

	.align	4
	.global	__divsf3
	.type	__divsf3, @function
__divsf3:
	leaf_entry sp, 16
	movi	a6, 0x7f800000

	/* Get the sign of the result.  */
	xor	a7, a2, a3

	/* Check for NaN and infinity.  */
	ball	a2, a6, .Ldiv_xnan_or_inf
	ball	a3, a6, .Ldiv_ynan_or_inf

	/* Extract the exponents.  */
	extui	a8, a2, 23, 8
	extui	a9, a3, 23, 8

	beqz	a9, .Ldiv_yexpzero
.Ldiv_ynormalized:
	beqz	a8, .Ldiv_xexpzero
.Ldiv_xnormalized:

	/* Subtract the exponents.  */
	sub	a8, a8, a9

	/* Replace sign/exponent fields with explicit "1.0".  */
	movi	a10, 0xffffff
	or	a2, a2, a6
	and	a2, a2, a10
	or	a3, a3, a6
	and	a3, a3, a10

	/* The first digit of the mantissa division must be a one.
	   Shift x (and adjust the exponent) as needed to make this true.  */
	bltu	a3, a2, 1f
	slli	a2, a2, 1
	addi	a8, a8, -1
1:
	/* Do the first subtraction and shift.  */
	sub	a2, a2, a3
	slli	a2, a2, 1

	/* Put the quotient into a10.  */
	movi	a10, 1

	/* Divide one bit at a time for 23 bits.  */
	movi	a9, 23
#if XCHAL_HAVE_LOOPS
	loop	a9, .Ldiv_loopend
#endif
.Ldiv_loop:
	/* Shift the quotient << 1.  */
	slli	a10, a10, 1

	/* Is this digit a 0 or 1?  */
	bltu	a2, a3, 1f

	/* Output a 1 and subtract.  */
	addi	a10, a10, 1
	sub	a2, a2, a3

	/* Shift the dividend << 1.  */
1:	slli	a2, a2, 1

#if !XCHAL_HAVE_LOOPS
	addi	a9, a9, -1
	bnez	a9, .Ldiv_loop
#endif
.Ldiv_loopend:

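/* The loop above is classic restoring division, one quotient bit per
   iteration.  As a C sketch (inputs pre-aligned as above so the first
   quotient bit is known to be 1):

	#include <stdint.h>

	// Returns the 24-bit quotient; *rem2 receives the remainder
	// shifted left by one, as used by .Ldiv_round below.
	static uint32_t divide_mantissa (uint32_t x, uint32_t y,
					 uint32_t *rem2)
	{
	  uint32_t q = 1;	// first digit is known to be 1
	  x = (x - y) << 1;	// first subtraction and shift
	  for (int i = 0; i < 23; i++)
	    {
	      q <<= 1;
	      if (x >= y)	// output a 1 and subtract
		{
		  q += 1;
		  x -= y;
		}
	      x <<= 1;
	    }
	  *rem2 = x;
	  return q;
	}
 */
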
	/* Add the exponent bias (less one to account for the explicit "1.0"
	   of the mantissa that will be added to the exponent in the final
	   result).  */
	addi	a8, a8, 0x7e

	/* Check for over/underflow.  The value in a8 is one less than the
	   final exponent, so values in the range 0..fd are OK here.  */
	movi	a4, 0xfe
	bgeu	a8, a4, .Ldiv_overflow

.Ldiv_round:
	/* Round.  The remainder (<< 1) is in a2.  */
	bltu	a2, a3, .Ldiv_rounded
	addi	a10, a10, 1
	beq	a2, a3, .Ldiv_exactlyhalf

.Ldiv_rounded:
	/* Add the exponent to the mantissa.  */
	slli	a8, a8, 23
	add	a2, a10, a8

.Ldiv_addsign:
	/* Add the sign bit.  */
	srli	a7, a7, 31
	slli	a7, a7, 31
	or	a2, a2, a7
	leaf_return

.Ldiv_overflow:
	bltz	a8, .Ldiv_underflow
	/* Return +/- Infinity.  */
	addi	a8, a4, 1	/* 0xff */
	slli	a2, a8, 23
	j	.Ldiv_addsign

.Ldiv_exactlyhalf:
	/* Remainder is exactly half the divisor.  Round even.  */
	srli	a10, a10, 1
	slli	a10, a10, 1
	j	.Ldiv_rounded

.Ldiv_underflow:
	/* Create a subnormal value, where the exponent field contains zero,
	   but the effective exponent is 1.  The value of a8 is one less than
	   the actual exponent, so just negate it to get the shift amount.  */
	neg	a8, a8
	ssr	a8
	bgeui	a8, 32, .Ldiv_flush_to_zero

	/* Shift a10 right.  Any bits that are shifted out of a10 are
	   saved in a6 for rounding the result.  */
	sll	a6, a10
	srl	a10, a10

	/* Set the exponent to zero.  */
	movi	a8, 0

	/* Pack any nonzero remainder (in a2) into a6.  */
	beqz	a2, 1f
	movi	a9, 1
	or	a6, a6, a9

	/* Round a10 based on the bits shifted out into a6.  */
1:	bgez	a6, .Ldiv_rounded
	addi	a10, a10, 1
	slli	a6, a6, 1
	bnez	a6, .Ldiv_rounded
	srli	a10, a10, 1
	slli	a10, a10, 1
	j	.Ldiv_rounded

.Ldiv_flush_to_zero:
	/* Return zero with the appropriate sign bit.  */
	srli	a2, a7, 31
	slli	a2, a2, 31
	leaf_return

#endif /* XCHAL_HAVE_FP_DIV */

#endif /* L_divsf3 */

#ifdef L_cmpsf2

/* Equal and Not Equal */

	.align	4
	.global	__eqsf2
	.global	__nesf2
	.set	__nesf2, __eqsf2
	.type	__eqsf2, @function
__eqsf2:
	leaf_entry sp, 16
	bne	a2, a3, 4f

	/* The values are equal but NaN != NaN.  Check the exponent.  */
	movi	a6, 0x7f800000
	ball	a2, a6, 3f

	/* Equal.  */
	movi	a2, 0
	leaf_return

	/* Not equal.  */
2:	movi	a2, 1
	leaf_return

	/* Check if the mantissas are nonzero.  */
3:	slli	a7, a2, 9
	j	5f

	/* Check if x and y are zero with different signs.  */
4:	or	a7, a2, a3
	slli	a7, a7, 1
	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
	   of x when exponent(x) = 0x7f8 and x == y.  */
5:	movi	a2, 0
	movi	a3, 1
	movnez	a2, a3, a7
	leaf_return

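/* These routines follow the usual libgcc soft-float comparison
   conventions: __eqsf2 returns zero iff the operands compare equal,
   and the ordered comparisons return a three-way result that is
   compared against zero.  Illustrative C usage (not part of this
   file):

	extern int __eqsf2 (float, float);	// a == b  <=>  result == 0
	extern int __ltsf2 (float, float);	// a <  b  <=>  result <  0
	extern int __lesf2 (float, float);	// a <= b  <=>  result <= 0
	extern int __gtsf2 (float, float);	// a >  b  <=>  result >  0
	extern int __gesf2 (float, float);	// a >= b  <=>  result >= 0

	static int flt_lt (float a, float b) { return __ltsf2 (a, b) < 0; }

   The NaN return values below are chosen so that every ordered
   comparison involving a NaN comes out false.  */
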

/* Greater Than */

	.align	4
	.global	__gtsf2
	.type	__gtsf2, @function
__gtsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f
1:	bnall	a3, a6, .Lle_cmp

	/* Check if y is a NaN.  */
	slli	a7, a3, 9
	beqz	a7, .Lle_cmp
	movi	a2, 0
	leaf_return

	/* Check if x is a NaN.  */
2:	slli	a7, a2, 9
	beqz	a7, 1b
	movi	a2, 0
	leaf_return


/* Less Than or Equal */

	.align	4
	.global	__lesf2
	.type	__lesf2, @function
__lesf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f
1:	bnall	a3, a6, .Lle_cmp

	/* Check if y is a NaN.  */
	slli	a7, a3, 9
	beqz	a7, .Lle_cmp
	movi	a2, 1
	leaf_return

	/* Check if x is a NaN.  */
2:	slli	a7, a2, 9
	beqz	a7, 1b
	movi	a2, 1
	leaf_return

.Lle_cmp:
	/* Check if x and y have different signs.  */
	xor	a7, a2, a3
	bltz	a7, .Lle_diff_signs

	/* Check if x is negative.  */
	bltz	a2, .Lle_xneg

	/* Check if x <= y.  */
	bltu	a3, a2, 5f
4:	movi	a2, 0
	leaf_return

.Lle_xneg:
	/* Check if y <= x.  */
	bgeu	a2, a3, 4b
5:	movi	a2, 1
	leaf_return

.Lle_diff_signs:
	bltz	a2, 4b

	/* Check if both x and y are zero.  */
	or	a7, a2, a3
	slli	a7, a7, 1
	movi	a2, 1
	movi	a3, 0
	moveqz	a2, a3, a7
	leaf_return


/* Greater Than or Equal */

	.align	4
	.global	__gesf2
	.type	__gesf2, @function
__gesf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f
1:	bnall	a3, a6, .Llt_cmp

	/* Check if y is a NaN.  */
	slli	a7, a3, 9
	beqz	a7, .Llt_cmp
	movi	a2, -1
	leaf_return

	/* Check if x is a NaN.  */
2:	slli	a7, a2, 9
	beqz	a7, 1b
	movi	a2, -1
	leaf_return


/* Less Than */

	.align	4
	.global	__ltsf2
	.type	__ltsf2, @function
__ltsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 2f
1:	bnall	a3, a6, .Llt_cmp

	/* Check if y is a NaN.  */
	slli	a7, a3, 9
	beqz	a7, .Llt_cmp
	movi	a2, 0
	leaf_return

	/* Check if x is a NaN.  */
2:	slli	a7, a2, 9
	beqz	a7, 1b
	movi	a2, 0
	leaf_return

.Llt_cmp:
	/* Check if x and y have different signs.  */
	xor	a7, a2, a3
	bltz	a7, .Llt_diff_signs

	/* Check if x is negative.  */
	bltz	a2, .Llt_xneg

	/* Check if x < y.  */
	bgeu	a2, a3, 5f
4:	movi	a2, -1
	leaf_return

.Llt_xneg:
	/* Check if y < x.  */
	bltu	a3, a2, 4b
5:	movi	a2, 0
	leaf_return

.Llt_diff_signs:
	bgez	a2, 5b

	/* x is negative and y is positive, so x < y unless both
	   are zero; check whether either is nonzero.  */
	or	a7, a2, a3
	slli	a7, a7, 1
	movi	a2, 0
	movi	a3, -1
	movnez	a2, a3, a7
	leaf_return


/* Unordered */

	.align	4
	.global	__unordsf2
	.type	__unordsf2, @function
__unordsf2:
	leaf_entry sp, 16
	movi	a6, 0x7f800000
	ball	a2, a6, 3f
1:	ball	a3, a6, 4f
2:	movi	a2, 0
	leaf_return

3:	slli	a7, a2, 9
	beqz	a7, 1b
	movi	a2, 1
	leaf_return

4:	slli	a7, a3, 9
	beqz	a7, 2b
	movi	a2, 1
	leaf_return

#endif /* L_cmpsf2 */

#ifdef L_fixsfsi

	.align	4
	.global	__fixsfsi
	.type	__fixsfsi, @function
__fixsfsi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity.  */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixsfsi_nan_or_inf

	/* Extract the exponent and check if 0 < (exp - 0x7e) < 32.  */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7e
	bgei	a4, 32, .Lfixsfsi_maxint
	blti	a4, 1, .Lfixsfsi_zero

	/* Add explicit "1.0" and shift << 8.  */
	or	a7, a2, a6
	slli	a5, a7, 8

	/* Shift back to the right, based on the exponent.  */
	ssl	a4		/* shift by 32 - a4 */
	srl	a5, a5

	/* Negate the result if sign != 0.  */
	neg	a2, a5
	movgez	a2, a5, a7
	leaf_return

.Lfixsfsi_nan_or_inf:
	/* Handle Infinity and NaN.  */
	slli	a4, a2, 9
	beqz	a4, .Lfixsfsi_maxint

	/* Translate NaN to +maxint.  */
	movi	a2, 0

.Lfixsfsi_maxint:
	slli	a4, a6, 8	/* 0x80000000 */
	addi	a5, a4, -1	/* 0x7fffffff */
	movgez	a4, a5, a2
	mov	a2, a4
	leaf_return

.Lfixsfsi_zero:
	movi	a2, 0
	leaf_return

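/* The conversion above as a C sketch (hypothetical helper; like the
   assembly, it truncates toward zero, saturates out-of-range values,
   and maps NaN to +maxint):

	#include <stdint.h>

	static int32_t fixsfsi (uint32_t bits)
	{
	  int exp = (int) ((bits >> 23) & 0xff) - 0x7e;
	  if ((bits & 0x7f800000) == 0x7f800000 && (bits << 9) != 0)
	    bits = 0;			// NaN: treat as positive
	  if (exp >= 32)		// too big (or Inf): saturate
	    return (int32_t) bits < 0 ? INT32_MIN : INT32_MAX;
	  if (exp < 1)			// |x| < 1 truncates to zero
	    return 0;
	  uint32_t frac = (bits | 0x800000) << 8;   // explicit "1.0"
	  uint32_t mag = frac >> (32 - exp);
	  return (int32_t) bits < 0 ? -(int32_t) mag : (int32_t) mag;
	}
 */
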
#endif /* L_fixsfsi */

#ifdef L_fixsfdi

	.align	4
	.global	__fixsfdi
	.type	__fixsfdi, @function
__fixsfdi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity.  */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixsfdi_nan_or_inf

	/* Extract the exponent and check if 0 < (exp - 0x7e) < 64.  */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7e
	bgei	a4, 64, .Lfixsfdi_maxint
	blti	a4, 1, .Lfixsfdi_zero

	/* Add explicit "1.0" and shift << 8.  */
	or	a7, a2, a6
	slli	xh, a7, 8

	/* Shift back to the right, based on the exponent.  */
	ssl	a4		/* shift by 64 - a4 */
	bgei	a4, 32, .Lfixsfdi_smallshift
	srl	xl, xh
	movi	xh, 0

.Lfixsfdi_shifted:
	/* Negate the result if sign != 0.  */
	bgez	a7, 1f
	neg	xl, xl
	neg	xh, xh
	beqz	xl, 1f
	addi	xh, xh, -1
1:	leaf_return

.Lfixsfdi_smallshift:
	movi	xl, 0
	sll	xl, xh
	srl	xh, xh
	j	.Lfixsfdi_shifted

.Lfixsfdi_nan_or_inf:
	/* Handle Infinity and NaN.  */
	slli	a4, a2, 9
	beqz	a4, .Lfixsfdi_maxint

	/* Translate NaN to +maxint.  */
	movi	a2, 0

.Lfixsfdi_maxint:
	slli	a7, a6, 8	/* 0x80000000 */
	bgez	a2, 1f
	mov	xh, a7
	movi	xl, 0
	leaf_return

1:	addi	xh, a7, -1	/* 0x7fffffff */
	movi	xl, -1
	leaf_return

.Lfixsfdi_zero:
	movi	xh, 0
	movi	xl, 0
	leaf_return

#endif /* L_fixsfdi */

#ifdef L_fixunssfsi

	.align	4
	.global	__fixunssfsi
	.type	__fixunssfsi, @function
__fixunssfsi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity.  */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixunssfsi_nan_or_inf

	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 32.  */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7f
	bgei	a4, 32, .Lfixunssfsi_maxint
	bltz	a4, .Lfixunssfsi_zero

	/* Add explicit "1.0" and shift << 8.  */
	or	a7, a2, a6
	slli	a5, a7, 8

	/* Shift back to the right, based on the exponent.  */
	addi	a4, a4, 1
	beqi	a4, 32, .Lfixunssfsi_bigexp
	ssl	a4		/* shift by 32 - a4 */
	srl	a5, a5

	/* Negate the result if sign != 0.  */
	neg	a2, a5
	movgez	a2, a5, a7
	leaf_return

.Lfixunssfsi_nan_or_inf:
	/* Handle Infinity and NaN.  */
	slli	a4, a2, 9
	beqz	a4, .Lfixunssfsi_maxint

	/* Translate NaN to 0xffffffff.  */
	movi	a2, -1
	leaf_return

.Lfixunssfsi_maxint:
	slli	a4, a6, 8	/* 0x80000000 */
	movi	a5, -1		/* 0xffffffff */
	movgez	a4, a5, a2
	mov	a2, a4
	leaf_return

.Lfixunssfsi_zero:
	movi	a2, 0
	leaf_return

.Lfixunssfsi_bigexp:
	/* Handle unsigned maximum exponent case.  */
	bltz	a2, 1f
	mov	a2, a5		/* no shift needed */
	leaf_return

	/* Return 0x80000000 if negative.  */
1:	slli	a2, a6, 8
	leaf_return

#endif /* L_fixunssfsi */

#ifdef L_fixunssfdi

	.align	4
	.global	__fixunssfdi
	.type	__fixunssfdi, @function
__fixunssfdi:
	leaf_entry sp, 16

	/* Check for NaN and Infinity.  */
	movi	a6, 0x7f800000
	ball	a2, a6, .Lfixunssfdi_nan_or_inf

	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 64.  */
	extui	a4, a2, 23, 8
	addi	a4, a4, -0x7f
	bgei	a4, 64, .Lfixunssfdi_maxint
	bltz	a4, .Lfixunssfdi_zero

	/* Add explicit "1.0" and shift << 8.  */
	or	a7, a2, a6
	slli	xh, a7, 8

	/* Shift back to the right, based on the exponent.  */
	addi	a4, a4, 1
	beqi	a4, 64, .Lfixunssfdi_bigexp
	ssl	a4		/* shift by 64 - a4 */
	bgei	a4, 32, .Lfixunssfdi_smallshift
	srl	xl, xh
	movi	xh, 0

.Lfixunssfdi_shifted:
	/* Negate the result if sign != 0.  */
	bgez	a7, 1f
	neg	xl, xl
	neg	xh, xh
	beqz	xl, 1f
	addi	xh, xh, -1
1:	leaf_return

.Lfixunssfdi_smallshift:
	movi	xl, 0
	src	xl, xh, xl
	srl	xh, xh
	j	.Lfixunssfdi_shifted

.Lfixunssfdi_nan_or_inf:
	/* Handle Infinity and NaN.  */
	slli	a4, a2, 9
	beqz	a4, .Lfixunssfdi_maxint

	/* Translate NaN to 0xffffffff....  */
1:	movi	xh, -1
	movi	xl, -1
	leaf_return

.Lfixunssfdi_maxint:
	bgez	a2, 1b
2:	slli	xh, a6, 8	/* 0x80000000 */
	movi	xl, 0
	leaf_return

.Lfixunssfdi_zero:
	movi	xh, 0
	movi	xl, 0
	leaf_return

.Lfixunssfdi_bigexp:
	/* Handle unsigned maximum exponent case.  */
	bltz	a7, 2b
	movi	xl, 0
	leaf_return		/* no shift needed */

#endif /* L_fixunssfdi */

#ifdef L_floatsisf

	.align	4
	.global	__floatunsisf
	.type	__floatunsisf, @function
__floatunsisf:
	leaf_entry sp, 16
	beqz	a2, .Lfloatsisf_return

	/* Set the sign to zero and jump to the floatsisf code.  */
	movi	a7, 0
	j	.Lfloatsisf_normalize

	.align	4
	.global	__floatsisf
	.type	__floatsisf, @function
__floatsisf:
	leaf_entry sp, 16

	/* Check for zero.  */
	beqz	a2, .Lfloatsisf_return

	/* Save the sign.  */
	extui	a7, a2, 31, 1

	/* Get the absolute value.  */
#if XCHAL_HAVE_ABS
	abs	a2, a2
#else
	neg	a4, a2
	movltz	a2, a4, a2
#endif

.Lfloatsisf_normalize:
	/* Normalize with the first 1 bit in the msb.  */
	do_nsau	a4, a2, a5, a6
	ssl	a4
	sll	a5, a2

	/* Shift the mantissa into position, with rounding bits in a6.  */
	srli	a2, a5, 8
	slli	a6, a5, (32 - 8)

	/* Set the exponent.  */
	movi	a5, 0x9d	/* 0x7e + 31 */
	sub	a5, a5, a4
	slli	a5, a5, 23
	add	a2, a2, a5

	/* Add the sign.  */
	slli	a7, a7, 31
	or	a2, a2, a7

	/* Round up if the leftover fraction is >= 1/2.  */
	bgez	a6, .Lfloatsisf_return
	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */

	/* Check if the leftover fraction is exactly 1/2.  */
	slli	a6, a6, 1
	beqz	a6, .Lfloatsisf_exactlyhalf

.Lfloatsisf_return:
	leaf_return

.Lfloatsisf_exactlyhalf:
	/* Round down to the nearest even value.  */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return

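/* The same int-to-float algorithm as a C sketch (hypothetical helper;
   __builtin_clz plays the role of do_nsau):

	#include <stdint.h>

	static uint32_t floatsisf (int32_t i)
	{
	  if (i == 0)
	    return 0;
	  uint32_t sign = (uint32_t) i & 0x80000000u;
	  uint32_t mag = i < 0 ? 0u - (uint32_t) i : (uint32_t) i;
	  int shift = __builtin_clz (mag);
	  uint32_t norm = mag << shift;		// leading 1 at bit 31
	  uint32_t rest = norm << 24;		// shifted-out fraction
	  uint32_t r = sign | (((uint32_t) (0x9d - shift) << 23)
			       + (norm >> 8));
	  if (rest & 0x80000000u)		// >= 1/2: round up
	    {
	      r += 1;				// may carry into exponent
	      if ((rest << 1) == 0)		// exactly 1/2: to even
		r &= ~1u;
	    }
	  return r;
	}
 */
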
#endif /* L_floatsisf */

#ifdef L_floatdisf

	.align	4
	.global	__floatundisf
	.type	__floatundisf, @function
__floatundisf:
	leaf_entry sp, 16

	/* Check for zero.  */
	or	a4, xh, xl
	beqz	a4, 2f

	/* Set the sign to zero and jump to the floatdisf code.  */
	movi	a7, 0
	j	.Lfloatdisf_normalize

	.align	4
	.global	__floatdisf
	.type	__floatdisf, @function
__floatdisf:
	leaf_entry sp, 16

	/* Check for zero.  */
	or	a4, xh, xl
	beqz	a4, 2f

	/* Save the sign.  */
	extui	a7, xh, 31, 1

	/* Get the absolute value.  */
	bgez	xh, .Lfloatdisf_normalize
	neg	xl, xl
	neg	xh, xh
	beqz	xl, .Lfloatdisf_normalize
	addi	xh, xh, -1

.Lfloatdisf_normalize:
	/* Normalize with the first 1 bit in the msb of xh.  */
	beqz	xh, .Lfloatdisf_bigshift
	do_nsau	a4, xh, a5, a6
	ssl	a4
	src	xh, xh, xl
	sll	xl, xl

.Lfloatdisf_shifted:
	/* Shift the mantissa into position, with rounding bits in a6.  */
	ssai	8
	sll	a5, xl
	src	a6, xh, xl
	srl	xh, xh
	beqz	a5, 1f
	movi	a5, 1
	or	a6, a6, a5
1:
	/* Set the exponent.  */
	movi	a5, 0xbd	/* 0x7e + 63 */
	sub	a5, a5, a4
	slli	a5, a5, 23
	add	a2, xh, a5

	/* Add the sign.  */
	slli	a7, a7, 31
	or	a2, a2, a7

	/* Round up if the leftover fraction is >= 1/2.  */
	bgez	a6, 2f
	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */

	/* Check if the leftover fraction is exactly 1/2.  */
	slli	a6, a6, 1
	beqz	a6, .Lfloatdisf_exactlyhalf
2:	leaf_return

.Lfloatdisf_bigshift:
	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
	do_nsau	a4, xl, a5, a6
	ssl	a4
	sll	xh, xl
	movi	xl, 0
	addi	a4, a4, 32
	j	.Lfloatdisf_shifted

.Lfloatdisf_exactlyhalf:
	/* Round down to the nearest even value.  */
	srli	a2, a2, 1
	slli	a2, a2, 1
	leaf_return

#endif /* L_floatdisf */

#if XCHAL_HAVE_FP_SQRT
#ifdef L_sqrtf
/* Square root */

	.align	4
	.global	__ieee754_sqrtf
	.type	__ieee754_sqrtf, @function
__ieee754_sqrtf:
	leaf_entry sp, 16

	wfr	f1, a2

	sqrt0.s	f2, f1
	const.s	f3, 0
	maddn.s	f3, f2, f2
	nexp01.s	f4, f1
	const.s	f0, 3
	addexp.s	f4, f0
	maddn.s	f0, f3, f4
	nexp01.s	f3, f1
	neg.s	f5, f3
	maddn.s	f2, f0, f2
	const.s	f0, 0
	const.s	f6, 0
	const.s	f7, 0
	maddn.s	f0, f5, f2
	maddn.s	f6, f2, f4
	const.s	f4, 3
	maddn.s	f7, f4, f2
	maddn.s	f3, f0, f0
	maddn.s	f4, f6, f2
	neg.s	f2, f7
	maddn.s	f0, f3, f2
	maddn.s	f7, f4, f7
	mksadj.s	f2, f1
	nexp01.s	f1, f1
	maddn.s	f1, f0, f0
	neg.s	f3, f7
	addexpm.s	f0, f2
	addexp.s	f3, f2
	divn.s	f0, f1, f3

	rfr	a2, f0

	leaf_return

#endif /* L_sqrtf */
#endif /* XCHAL_HAVE_FP_SQRT */

#if XCHAL_HAVE_FP_RECIP
#ifdef L_recipsf2
/* Reciprocal */

	.align	4
	.global	__recipsf2
	.type	__recipsf2, @function
__recipsf2:
	leaf_entry sp, 16

	wfr	f1, a2

	recip0.s	f0, f1
	const.s	f2, 1
	msub.s	f2, f1, f0
	maddn.s	f0, f0, f2
	const.s	f2, 1
	msub.s	f2, f1, f0
	maddn.s	f0, f0, f2

	rfr	a2, f0

	leaf_return

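/* recip0.s produces a low-precision initial estimate; each const.s /
   msub.s / maddn.s triple then performs one Newton-Raphson step,
   x' = x + x*(1 - a*x), roughly doubling the number of accurate bits.
   The same iteration as a C sketch of the math (not of the exact
   instruction semantics):

	// Refine an initial reciprocal estimate x0 of 1/a.
	static float refine_recip (float a, float x0)
	{
	  float x = x0;
	  for (int i = 0; i < 2; i++)	// two steps, as above
	    {
	      float e = 1.0f - a * x;	// current error term
	      x = x + x * e;		// Newton-Raphson update
	    }
	  return x;
	}
 */
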
#endif /* L_recipsf2 */
#endif /* XCHAL_HAVE_FP_RECIP */

#if XCHAL_HAVE_FP_RSQRT
#ifdef L_rsqrtsf2
/* Reciprocal square root */

	.align	4
	.global	__rsqrtsf2
	.type	__rsqrtsf2, @function
__rsqrtsf2:
	leaf_entry sp, 16

	wfr	f1, a2

	rsqrt0.s	f0, f1
	mul.s	f2, f1, f0
	const.s	f3, 3
	mul.s	f4, f3, f0
	const.s	f5, 1
	msub.s	f5, f2, f0
	maddn.s	f0, f4, f5
	mul.s	f2, f1, f0
	mul.s	f1, f3, f0
	const.s	f3, 1
	msub.s	f3, f2, f0
	maddn.s	f0, f1, f3

	rfr	a2, f0

	leaf_return

#endif /* L_rsqrtsf2 */
#endif /* XCHAL_HAVE_FP_RSQRT */