libgcc/config/arc/ieee-754/adddf3.S

   1 /* Copyright (C) 2008-2024 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 #include "arc-ieee-754.h"
  27 #if 0 /* DEBUG: compiled out; wrappers cross-check the _c and _asm results */
  28         .global __adddf3
  29         .balign 4
  30 __adddf3: ; checking wrapper: run __adddf3_c and __adddf3_asm, abort on mismatch
  31         push_s blink ; save the return address; the calls below overwrite blink
  32         push_s r2 ; save the second operand (r2/r3) ...
  33         push_s r3
  34         push_s r0 ; ... and the first operand (r0/r1)
  35         bl.d __adddf3_c
  36         push_s r1 ; (delay slot)
  37         ld_s r2,[sp,12] ; reload the second operand from its save slots
  38         ld_s r3,[sp,8]
  39         st_s r0,[sp,12] ; park the __adddf3_c result in those slots
  40         st_s r1,[sp,8]
  41         pop_s r1 ; restore the first operand
  42         bl.d __adddf3_asm
  43         pop_s r0 ; (delay slot)
  44         pop_s r3 ; r2/r3 = parked __adddf3_c result
  45         pop_s r2
  46         pop_s blink
  47         cmp r0,r2
  48         cmp.eq r1,r3 ; compare r1/r3 only if r0/r2 matched
  49         jeq_s [blink] ; both results identical: return the __adddf3_asm result
  50         bl abort
  51         .global __subdf3
  52         .balign 4
  53 __subdf3: ; checking wrapper: run __subdf3_c and __subdf3_asm, abort on mismatch
  54         push_s blink ; save the return address; the calls below overwrite blink
  55         push_s r2 ; save the second operand (r2/r3) ...
  56         push_s r3
  57         push_s r0 ; ... and the first operand (r0/r1)
  58         bl.d __subdf3_c
  59         push_s r1 ; (delay slot)
  60         ld_s r2,[sp,12] ; reload the second operand from its save slots
  61         ld_s r3,[sp,8]
  62         st_s r0,[sp,12] ; park the __subdf3_c result in those slots
  63         st_s r1,[sp,8]
  64         pop_s r1 ; restore the first operand
  65         bl.d __subdf3_asm
  66         pop_s r0 ; (delay slot)
  67         pop_s r3 ; r2/r3 = parked __subdf3_c result
  68         pop_s r2
  69         pop_s blink
  70         cmp r0,r2
  71         cmp.eq r1,r3 ; compare r1/r3 only if r0/r2 matched
  72         jeq_s [blink] ; both results identical: return the __subdf3_asm result
  73         bl abort
  74 #define __adddf3 __adddf3_asm
  75 #define __subdf3 __subdf3_asm
  76 #endif /* DEBUG */
  77 /* N.B. This is optimized for ARC700.
  78   ARC600 has very different scheduling / instruction selection criteria.  */
  79
  80 /* inputs: DBL0, DBL1 (r0-r3)
  81    output: DBL0 (r0, r1)
  82    clobber: r2-r10, r12, flags
  83    All NaN highword bits must be 1.  NaN low word is random.  */
  84
  85         .balign 4
  86         .global __adddf3
  87         .global __subdf3
  88         .long 0x7ff00000 ; exponent mask; __adddf3 loads it into r9 via [pcl,-8]
  89         FUNC(__adddf3)
  90         FUNC(__subdf3)
  91 __subdf3: ; DBL0 - DBL1: negate DBL1, then fall through into the addition
  92         bxor_l DBL1H,DBL1H,31 ; toggle the sign bit of DBL1
  93 __adddf3: ; DBL0 + DBL1; the result is returned in DBL0
  94         ld r9,[pcl,-8] ; r9 = 0x7ff00000
  95         bmsk r4,DBL0H,30 ; r4 = DBL0H & 0x7fffffff
  96         xor r10,DBL0H,DBL1H ; r10 < 0 iff the operand signs differ
  97         and r6,DBL1H,r9 ; r6 = DBL1H & 0x7ff00000
  98         sub.f r12,r4,r6 ; sets the flags tested by blo below
  99         asr_s r12,r12,20 ; r12 = shift count (negative when r4 < r6)
 100         blo .Ldbl1_gt ; r4 < r6: DBL1 has the larger exponent
 101         brhs r4,r9,.Linf_nan ; DBL0 is inf or NaN
 102         brhs r12,32,.Large_shift ; shift count of 32 or more
 103         brne r12,0,.Lsmall_shift ; shift count 1..31
 104         brge r10,0,.Ladd_same_exp ; r12 == 0
 105
 106 /* After subtracting, we need to normalize; when shifting to place the
 107   leading 1 into position for the implicit 1 and adding that to DBL0H,
 108   we increment the exponent.  Thus, we have to subtract one more than
 109   the shift count from the exponent beforehand.  Iff the exponent drops thus
 110   below zero (before adding in the fraction with the leading one), we have
 111   generated a denormal number.  Denormal handling is basically reducing the
 112   shift count so that we produce a zero exponent instead; however, this way
 113   the shift count can become zero (if we started out with exponent 1).
 114   Therefore, a simple min operation is not good enough, since we don't
 115   want to handle a zero normalizing shift in the main path.
 116   On the plus side, we don't need to check for denorm input, the result
 117   of subtracting these looks just the same as denormals generated during
 118   subtraction.  */
 119         bmsk r7,DBL1H,30 ; r7 = DBL1H & 0x7fffffff
 120         cmp r4,r7
 121         cmp.eq DBL0L,DBL1L ; high words equal: let the low words decide
 122         blo .L_rsub_same_exp ; abs(DBL0) < abs(DBL1): subtract the other way round
 123         sub.f DBL0L,DBL0L,DBL1L ; low word difference; borrow feeds the sbc below
 124         bmsk r12,DBL0H,19 ; r12 = low 20 bits (fraction) of DBL0H
 125         bic DBL1H,DBL0H,r12 ; DBL1H = sign & exponent of DBL0
 126         sbc.f r4,r4,r7 ; r4 = high word of the difference
 127         beq_l .Large_cancel ; high word cancelled completely
 128         norm DBL1L,r4
 129         b.d .Lsub_done_same_exp
 130         sub r12,DBL1L,9 ; (delay slot) one more than the normalizing shift count
 131
 132         .balign 4
 133 .Linf_nan: ; reached from the entry code when r4 >= r9 (DBL0 is inf or NaN)
 134         ; If both inputs are inf, but with different signs, the result is NaN.
 135         asr r12,r10,31 ; r12 = all ones if the signs differ, else 0
 136         or_s DBL1H,DBL1H,r12
 137         j_s.d [blink]
 138         or.eq DBL0H,DBL0H,DBL1H ; (delay slot) NOTE(review): .eq presumably still tests the entry sub.f (r4 == r6) - confirm
 139
 140         .balign 4
 141 .L_rsub_same_exp: ; same exponent, abs(DBL0) < abs(DBL1): compute DBL1 - DBL0
 142         rsub.f DBL0L,DBL0L,DBL1L ; DBL0L = DBL1L - DBL0L; borrow feeds the sbc below
 143         bmsk r12,DBL1H,19 ; r12 = low 20 bits (fraction) of DBL1H
 144         bic_s DBL1H,DBL1H,r12 ; DBL1H = sign & exponent of DBL1
 145         sbc.f r4,r7,r4 ; r4 = high word of the difference
 146         beq_l .Large_cancel ; high word cancelled completely
 147         norm DBL1L,r4
 148
 149         sub r12,DBL1L,9 ; one more than the normalizing shift count
 150 .Lsub_done_same_exp: ; r4:DBL0L = difference, DBL1H = sign & exponent, DBL1L = norm result
 151         asl_s r12,r12,20 ; exponent decrement, aligned with the exponent field
 152         sub_s DBL1L,DBL1L,10 ; DBL1L = normalizing shift count
 153         sub DBL0H,DBL1H,r12
 154         xor.f 0,DBL0H,DBL1H ; N set iff the subtraction changed the sign bit
 155         bmi .Ldenorm ; exponent dropped below zero: denormal result
 156 .Lpast_denorm: ; shift r4:DBL0L left by DBL1L and add it to DBL0H
 157         neg_s r12,DBL1L ; NOTE(review): used as a right-shift count, presumably taken modulo 32 - confirm
 158         lsr r7,DBL0L,r12 ; bits of DBL0L that move into the high word
 159         asl r12,r4,DBL1L
 160         asl_s DBL0L,DBL0L,DBL1L
 161         add_s r12,r12,r7 ; shifted high word of the fraction
 162         j_s.d [blink]
 163         add_l DBL0H,DBL0H,r12 ; (delay slot) merge the fraction into sign & exponent
 164         .balign 4
 165 .Ladd_same_exp: ; r12 == 0 and r10 >= 0: same exponent, same sign
 166         /* This is a special case because we can't test for need to shift
 167            down by checking if bit 20 of DBL0H changes.  OTOH, here we know
 168            that we always need to shift down.  */
 169         ; The implicit 1 of DBL0 is not shifted together with the
 170         ;  fraction, thus effectively doubled, compensating for not setting
 171         ;  implicit1 for DBL1
 172         add_s r12,DBL0L,DBL1L ; r12 = low word sum (also used by .Ldenorm_add)
 173         lsr.f 0,r12,2 ; round to even
 174         breq r6,0,.Ldenorm_add ; r6 == 0: exponent field is zero, no shift down
 175         adc.f DBL0L,DBL0L,DBL1L ; low word sum plus the rounding carry from lsr.f
 176         sub r7,DBL1H,DBL0H
 177         sub1 r7,r7,r9 ; boost exponent by 2/2
 178         rrc DBL0L,DBL0L ; shift the low word down, carry goes into bit 31
 179         asr.f r7,r7 ; DBL1.fraction/2 - DBL0.fraction/2 ; exp++
 180         add.cs.f DBL0L,DBL0L,0x80000000 ; bit shifted out of r7 goes into the low word
 181         add_l DBL0H,DBL0H,r7 ; DBL0.implicit1 not shifted for DBL1.implicit1
 182         add.cs DBL0H,DBL0H,1 ; propagate the low-word carry
 183         bic.f 0,r9,DBL0H ; check for overflow -> infinity.
 184         jne_l [blink] ; some exponent bit is clear: finite result, return
 185         and DBL0H,DBL0H,0xfff00000 ; keep sign and exponent, clear the fraction
 186         j_s.d [blink]
 187         mov_s DBL0L,0 ; (delay slot)
 188         .balign 4
 189 .Large_shift: ; DBL0 has the larger exponent, shift count r12 >= 32
 190         brhs r12,55,.Lret_dbl0 ; shift count 55 or more: return DBL0 unchanged
 191         bmsk_s DBL1H,DBL1H,19 ; keep only the low 20 bits (fraction) of DBL1H
 192         brne r6,0,.Lno_denorm_large_shift ; DBL1 exponent field is non-zero
 193         brhi.d r12,33,.Lfixed_denorm_large_shift
 194         sub_s r12,r12,1 ; (delay slot) DBL1 exponent field is zero: shift one less
 195         breq r12,31, .Lfixed_denorm_small_shift ; reduced count fits the small-shift code
 196 .Lshift32: ; shift count is exactly 32: move whole words
 197         mov_s r12,DBL1L ; old low word becomes the guard bits
 198         mov_s DBL1L,DBL1H
 199         brlt.d r10,0,.Lsub ; signs differ: subtract
 200         mov_s DBL1H,0 ; (delay slot)
 201         b_s .Ladd
 202 .Ldenorm_add: ; from .Ladd_same_exp with r6 == 0; r12 = DBL0L + DBL1L
 203         cmp_s r12,DBL1L ; carry set iff the low word sum wrapped around
 204         mov_s DBL0L,r12
 205         j_s.d [blink]
 206         adc DBL0H,r4,DBL1H ; (delay slot) high words plus carry; r4 is DBL0H without sign
 207
 208 .Lret_dbl0: ; result is DBL0 as passed in
 209         j_s [blink]
 210         .balign 4
 211 .Lsmall_shift: ; DBL0 has the larger exponent, shift count r12 is 1..31
 212         breq.d r6,0,.Ldenorm_small_shift ; DBL1 exponent field is zero
 213         bmsk_s DBL1H,DBL1H,19 ; (delay slot) keep only the low 20 bits (fraction)
 214         bset_s DBL1H,DBL1H,20 ; make the leading 1 explicit
 215 .Lfixed_denorm_small_shift: ; shift DBL1H:DBL1L right by r12, guard bits to r12
 216         neg r8,r12 ; NOTE(review): used as a left-shift count, presumably taken modulo 32 - confirm
 217         asl r4,DBL1H,r8 ; bits of DBL1H that move into the low word
 218         lsr_l DBL1H,DBL1H,r12
 219         lsr r5,DBL1L,r12
 220         asl r12,DBL1L,r8 ; r12 = guard bits shifted out of the low word
 221         brge.d r10,0,.Ladd ; same signs: add
 222         or DBL1L,r4,r5 ; (delay slot) shifted low word
 223 /* subtract, abs(DBL0) > abs(DBL1) */
 224 /* DBL0H, DBL0L: original values
 225    DBL1H, DBL1L: fraction with explicit leading 1, shifted into place
 226    r4:  orig. DBL0H & 0x7fffffff
 227    r6:  orig. DBL1H & 0x7ff00000
 228    r9:  0x7ff00000
 229    r10: orig. DBL0H ^ DBL1H
 230    r12: guard bits */
 231         .balign 4
 232 .Lsub:
 233         neg.f r12,r12 ; negate the guard bits; the borrow feeds the sbc below
 234         mov_s r7,DBL1H ; r7 = shifted high fraction of DBL1
 235         bmsk r5,DBL0H,19 ; r5 = low 20 bits (fraction) of DBL0H
 236         sbc.f DBL0L,DBL0L,DBL1L ; low word difference
 237         bic DBL1H,DBL0H,r5 ; DBL1H = sign & exponent of DBL0
 238         bset r5,r5,20 ; make the leading 1 explicit
 239         sbc.f r4,r5,r7 ; r4 = high word of the difference
 240         beq_l .Large_cancel_sub ; high word cancelled completely
 241         norm DBL1L,r4
 242         bmsk r6,DBL1H,30 ; r6 = exponent field without the sign
 243 .Lsub_done: ; also entered from .Lrsub
 244         sub_s DBL1L,DBL1L,9 ; one more than the normalizing shift count
 245         breq DBL1L,1,.Lsub_done_noshift ; no normalizing shift needed
 246         asl r5,DBL1L,20 ; exponent decrement, aligned with the exponent field
 247         sub_s DBL1L,DBL1L,1 ; DBL1L = normalizing shift count
 248         brlo r6,r5,.Ldenorm_sub ; exponent smaller than the decrement
 249         sub DBL0H,DBL1H,r5
 250 .Lpast_denorm_sub: ; shift r4:DBL0L:r12 left by DBL1L, round, add to DBL0H
 251         neg_s DBL1H,DBL1L ; complementary shift count
 252         lsr r6,r12,DBL1H ; guard bits that move up into the low word
 253         asl_s r12,r12,DBL1L ; guard bits that remain below the result
 254         and r8,r6,1 ; r8 = lowest bit of the shifted result
 255         add1.f 0,r8,r12 ; Z iff that bit and all guard bits below the top one are 0
 256         add.ne.f r12,r12,r12 ; otherwise carry = top remaining guard bit
 257         asl r8,DBL0L,DBL1L
 258         lsr r12,DBL0L,DBL1H ; bits of DBL0L that move into the high word
 259         adc.f DBL0L,r8,r6 ; low word plus the carry from the flag setup above
 260         asl r5,r4,DBL1L
 261         add_s DBL0H,DBL0H,r12
 262         j_s.d [blink]
 263         adc DBL0H,DBL0H,r5 ; (delay slot) add the high fraction and the low-word carry
 264
 265         .balign 4
 266 .Lno_denorm_large_shift: ; DBL1 exponent field is non-zero, shift count r12 >= 32
 267         breq.d r12,32,.Lshift32
 268         bset_l DBL1H,DBL1H,20 ; (delay slot) make the leading 1 explicit
 269 .Lfixed_denorm_large_shift: ; shift count r12 above 32
 270         neg r8,r12 ; NOTE(review): shift counts presumably taken modulo 32 - confirm
 271         asl r4,DBL1H,r8
 272         lsr r5,DBL1L,r12
 273         asl.f 0,DBL1L,r8 ; Z iff the bits of DBL1L dropped below the guard word are 0
 274         lsr DBL1L,DBL1H,r12 ; new low word comes from the old high word
 275         or r12,r4,r5 ; r12 = guard bits
 276         tst.eq r12,1
 277         or.ne r12,r12,2 ; dropped bits (or bit 0) non-zero: record that in bit 1
 278         brlt.d r10,0,.Lsub ; signs differ: subtract
 279         mov_s DBL1H,0 ; (delay slot)
 280         b_l .Ladd
 281
 282         ; If a denorm is produced without shifting, we have an exact result -
 283         ; no need for rounding.
 284         .balign 4
 285 .Ldenorm_sub: ; from .Lsub_done when r6 < r5
 286         lsr DBL1L,r6,20 ; DBL1L = exponent field value
 287         xor DBL0H,r6,DBL1H ; DBL0H = DBL1H with the exponent bits of r6 removed
 288         brne.d DBL1L,1,.Lpast_denorm_sub
 289         sub_s DBL1L,DBL1L,1 ; (delay slot) reduced shift count
 290 .Lsub_done_noshift: ; NOTE(review): on fall-through from above, DBL1H still holds the exponent while DBL0H (just computed) is not used - confirm this is intended
 291         add.f 0,r12,r12 ; carry = top guard bit, Z iff the other guard bits are 0
 292         btst.eq DBL0L,0 ; ... then Z iff the result LSB is 0
 293         cmp.eq r12,r12 ; ... then clear carry
 294         add.cs.f DBL0L,DBL0L,1 ; increment the low word if carry is still set
 295         bclr r4,r4,20 ; drop bit 20 of the high fraction
 296         j_s.d [blink]
 297         adc DBL0H,DBL1H,r4 ; (delay slot) sign & exponent + fraction + carry
 298
 299         .balign 4
 300 .Ldenorm_small_shift: ; DBL1 exponent field is zero: use a shift count one less
 301         brne.d r12,1,.Lfixed_denorm_small_shift
 302         sub_l r12,r12,1 ; (delay slot)
 303         brlt r10,0,.Lsub ; reduced shift count is 0 (r12 = 0); signs differ: subtract
 304 .Ladd: ; bit 20 of DBL1H is clear and bit 0 of r12 does not matter
 305         add.f DBL0L,DBL0L,DBL1L ; low word sum
 306         add_s DBL1H,DBL1H,DBL0H
 307         add.cs DBL1H,DBL1H,1 ; propagate the low-word carry
 308         xor_l DBL0H,DBL0H,DBL1H ; bits of the high word changed by the addition
 309         bbit0 DBL0H,20,.Lno_shiftdown ; bit 20 unchanged: no shift down needed
 310         lsr.f DBL0H,DBL1H ; carry = bit 0 of the high word sum
 311         and r4,DBL0L,2 ; bit 1 of the low word, the LSB after shifting down
 312         bmsk DBL0H,DBL0H,18
 313         sbc DBL0H,DBL1H,DBL0H
 314         rrc.f DBL0L,DBL0L ; shift the low word down through carry
 315         or.f r12,r12,r4 ; Z iff the guard bits and the new LSB are all 0
 316         cmp.eq r12,r12 ; ... then clear carry
 317         add.cs.f DBL0L,DBL0L,1 ; increment the low word if carry is still set
 318         bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
 319         jne.d [blink]    ; ... non-zero fraction
 320         add.cs DBL0H,DBL0H,1 ; (delay slot) propagate the carry of the low-word increment
 321         mov_s DBL0L,0 ; exponent is all ones: clear the fraction
 322         bmsk DBL1H,DBL0H,19
 323         j_s.d [blink]
 324         bic_s DBL0H,DBL0H,DBL1H ; (delay slot)
 325 .Lno_shiftdown: ; sum needs no shift down; round using the guard bits in r12
 326         mov_s DBL0H,DBL1H
 327         add.f 0,r12,r12 ; carry = top guard bit, Z iff the other guard bits are 0
 328         btst.eq DBL0L,0 ; ... then Z iff the result LSB is 0
 329         cmp.eq r12,r12 ; ... then clear carry
 330         add.cs.f DBL0L,DBL0L,1 ; increment the low word if carry is still set
 331         j_s.d [blink]
 332         add.cs DBL0H,DBL0H,1 ; (delay slot) propagate the carry of the increment
 333         .balign 4
 334 .Ldenorm: ; from .Lsub_done_same_exp: the exponent dropped below zero
 335         bmsk DBL0H,DBL1H,30 ; exponent field without the sign
 336         lsr r12,DBL0H,20 ; r12 = exponent field value
 337         xor_s DBL0H,DBL0H,DBL1H ; DBL0H = sign only (zero exponent)
 338         sub.f DBL1L,r12,1 ; reduced shift count; must set the flags that bgt tests
 339         bgt .Lpast_denorm ; shift count > 0: shift by the reduced count
 340         j_s.d [blink]
 341         add_l DBL0H,DBL0H,r4 ; (delay slot) no shift: add the high fraction as is
 342
 343         .balign 4
 344 .Large_cancel: ; same-exponent subtraction, high word of the difference is zero
 345         ;DBL0L: mantissa DBL1H: sign & exponent
 346         norm.f DBL1L,DBL0L ; flags reflect DBL0L (tested by mov.mi and beq_s below)
 347         bmsk DBL0H,DBL1H,30 ; exponent field without the sign
 348         add_s DBL1L,DBL1L,22 ; normalizing shift count
 349         mov.mi DBL1L,21 ; bit 31 of DBL0L is set: shift count 21
 350         add_s r12,DBL1L,1 ; one more than the shift count
 351         asl_s r12,r12,20 ; exponent decrement, aligned with the exponent field
 352         beq_s .Lret0 ; DBL0L is zero as well: the result is zero
 353         brhs.d DBL0H,r12,.Lpast_denorm_large_cancel ; exponent covers the decrement
 354         sub DBL0H,DBL1H,r12 ; (delay slot)
 355         bmsk DBL0H,DBL1H,30 ; denormal result: start again from the exponent field
 356         lsr r12,DBL0H,20 ; r12 = exponent field value
 357         xor_s DBL0H,DBL0H,DBL1H ; DBL0H = sign only (zero exponent)
 358         sub.f DBL1L,r12,1 ; reduced shift count
 359         jle [blink] ; no shift left to do: return sign with DBL0L as is
 360 .Lpast_denorm_large_cancel: ; shift DBL0L left by DBL1L into DBL0H:DBL0L
 361         rsub.f r7,DBL1L,32 ; r7 = 32 - shift count; flags select the cases below
 362         lsr r7,DBL0L,r7 ; bits of DBL0L that move into the high word
 363         asl_s DBL0L,DBL0L,DBL1L
 364         mov.ls r7,DBL0L ; NOTE(review): presumably the shift >= 32 case, shifted word goes to the high word - confirm
 365         add_s DBL0H,DBL0H,r7
 366         j_s.d [blink]
 367         mov.ls DBL0L,0 ; (delay slot)
 368 .Lret0: ; DBL0L is already zero here
 369         j_s.d   [blink]
 370         mov_l   DBL0H,0 ; (delay slot)
 371
 372 /* r4:DBL0L:r12 : unnormalized result fraction
 373    DBL1H: result sign and exponent         */
 374 /* When seeing large cancellation, only the topmost guard bit might be set.  */
 375         .balign 4
 376 .Large_cancel_sub: ; r4 is zero here (reached via beq_l after sbc.f r4,...)
 377         norm.f DBL1L,DBL0L ; flags reflect DBL0L
 378         bpnz.d 0f ; DBL0L is non-zero with bit 31 clear
 379         bmsk DBL0H,DBL1H,30 ; (delay slot) exponent field without the sign
 380         mov r5,22<<20 ; exponent decrement for a shift count of 21
 381         bne.d 1f ; DBL0L is non-zero (bit 31 set)
 382         mov_s DBL1L,21 ; (delay slot)
 383         bset r5,r5,5+20 ; DBL0L is zero: 32 more on the exponent decrement ...
 384         add_s DBL1L,DBL1L,32 ; ... and on the shift count
 385         brne r12,0,1f ; guard bits non-zero: still something to normalize
 386         j_s.d   [blink] ; everything cancelled: return zero
 387         mov_l   DBL0H,0 ; (delay slot)
 388         .balign 4
 389 0:      add r5,DBL1L,23 ; one more than the shift count ...
 390         asl r5,r5,20 ; ... aligned with the exponent field
 391         add_s DBL1L,DBL1L,22 ; normalizing shift count
 392 1:      brlo DBL0H,r5,.Ldenorm_large_cancel_sub ; exponent smaller than the decrement
 393         sub DBL0H,DBL1H,r5
 394 .Lpast_denorm_large_cancel_sub: ; shift DBL0L:r12 left by DBL1L into DBL0H:DBL0L
 395         rsub.f r7,DBL1L,32 ; r7 = 32 - shift count; flags select the cases below
 396         lsr r12,r12,r7
 397         lsr r7,DBL0L,r7
 398         asl_s DBL0L,DBL0L,DBL1L
 399         add.ge DBL0H,DBL0H,r7 ; shift count <= 32
 400         add_s DBL0L,DBL0L,r12
 401         add.lt DBL0H,DBL0H,DBL0L ; shift count > 32
 402         mov.eq DBL0L,r12 ; shift count == 32
 403         j_s.d [blink]
 404         mov.lt DBL0L,0 ; (delay slot)
 405         .balign 4
 406 .Ldenorm_large_cancel_sub:
 407         lsr r5,DBL0H,20 ; r5 = exponent field value
 408         xor_s DBL0H,DBL0H,DBL1H ; DBL0H = sign only (zero exponent)
 409         brgt.d r5,1,.Lpast_denorm_large_cancel_sub
 410         sub DBL1L,r5,1 ; (delay slot) reduced shift count
 411         j_l [blink] ; denorm, no shift -> no rounding needed.
 412
 413 /* r4: DBL0H & 0x7fffffff
 414    r6: DBL1H & 0x7ff00000
 415    r9: 0x7ff00000
 416    r10: sign difference
 417    r12: shift count (negative) */
 418         .balign 4
 419 .Ldbl1_gt: ; reached from the entry code when r4 < r6
 420         brhs r6,r9,.Lret_dbl1 ; inf or NaN
 421         neg r8,r12 ; r8 = positive shift count
 422         brhs r8,32,.Large_shift_dbl0
 423 .Lsmall_shift_dbl0: ; shift count r8 is below 32; DBL0 is the operand that gets shifted
 424         breq.d r6,0,.Ldenorm_small_shift_dbl0 ; NOTE(review): r6 is the DBL1 exponent and r6 > r4 >= 0 on this path, so this cannot be taken; confirm whether the DBL0 exponent was meant
 425         bmsk_s DBL0H,DBL0H,19 ; (delay slot) keep only the low 20 bits (fraction)
 426         bset_s DBL0H,DBL0H,20 ; make the leading 1 explicit
 427 .Lfixed_denorm_small_shift_dbl0: ; shift DBL0H:DBL0L right by r8, guard bits to r12
 428         asl r4,DBL0H,r12 ; NOTE(review): negative r12 as left-shift count, presumably taken modulo 32 - confirm
 429         lsr DBL0H,DBL0H,r8
 430         lsr r5,DBL0L,r8
 431         asl r12,DBL0L,r12 ; r12 = guard bits shifted out of the low word
 432         brge.d r10,0,.Ladd_dbl1_gt ; same signs: add
 433         or DBL0L,r4,r5 ; (delay slot) shifted low word
 434 /* subtract, abs(DBL0) < abs(DBL1) */
 435 /* DBL0H, DBL0L: fraction with explicit leading 1, shifted into place
 436    DBL1H, DBL1L: original values
 437    r6:  orig. DBL1H & 0x7ff00000
 438    r9:  0x7ff00000
 439    r12: guard bits */
 440         .balign 4
 441 .Lrsub:
 442         neg.f r12,r12 ; negate the guard bits; the borrow feeds the sbc below
 443         bmsk r7,DBL1H,19 ; r7 = low 20 bits (fraction) of DBL1H
 444         mov_s r5,DBL0H ; r5 = shifted high fraction of DBL0
 445         sbc.f DBL0L,DBL1L,DBL0L ; low word difference
 446         bic DBL1H,DBL1H,r7 ; DBL1H = sign & exponent of DBL1
 447         bset r7,r7,20 ; make the leading 1 explicit
 448         sbc.f r4,r7,r5 ; r4 = high word of the difference
 449         beq_l .Large_cancel_sub ; high word cancelled completely
 450         norm DBL1L,r4
 451         b_l .Lsub_done ; note: r6 is already set up.
 452
 453 .Lret_dbl1: ; return DBL1: copy it into the result registers
 454         mov_s DBL0H,DBL1H
 455         j_s.d [blink]
 456         mov_l DBL0L,DBL1L ; (delay slot)
 457         .balign 4
 458 .Ldenorm_small_shift_dbl0: ; zero exponent field: use a shift count one less
 459         sub.f r8,r8,1
 460         bne.d .Lfixed_denorm_small_shift_dbl0
 461         add_s r12,r12,1 ; (delay slot) keep r12 == -r8
 462         brlt r10,0,.Lrsub ; reduced shift count is 0 (r12 = 0); signs differ: subtract
 463 .Ladd_dbl1_gt: ; bit 20 of DBL0H is clear and bit 0 of r12 does not matter
 464         add.f DBL0L,DBL0L,DBL1L ; low word sum
 465         add_s DBL0H,DBL0H,DBL1H
 466         add.cs DBL0H,DBL0H,1 ; propagate the low-word carry
 467         xor DBL1H,DBL0H,DBL1H ; bits of the high word changed by the addition
 468         bbit0 DBL1H,20,.Lno_shiftdown_dbl1_gt ; bit 20 unchanged: no shift down needed
 469         lsr.f DBL1H,DBL0H ; carry = bit 0 of the high word sum
 470         and r4,DBL0L,2 ; bit 1 of the low word, the LSB after shifting down
 471         bmsk DBL1H,DBL1H,18
 472         sbc DBL0H,DBL0H,DBL1H
 473         rrc.f DBL0L,DBL0L ; shift the low word down through carry
 474         or.f r12,r12,r4 ; Z iff the guard bits and the new LSB are all 0
 475         cmp.eq r12,r12 ; ... then clear carry
 476         add.cs.f DBL0L,DBL0L,1 ; increment the low word if carry is still set
 477         bic.f 0,r9,DBL0H ; check for generating infinity with possible ...
 478         jne.d [blink]    ; ... non-zero fraction
 479         add.cs DBL0H,DBL0H,1 ; (delay slot) propagate the carry of the low-word increment
 480         mov_s DBL0L,0 ; exponent is all ones: clear the fraction
 481         bmsk DBL1H,DBL0H,19
 482         j_s.d [blink]
 483         bic_s DBL0H,DBL0H,DBL1H ; (delay slot)
 484 .Lno_shiftdown_dbl1_gt: ; sum needs no shift down; round using the guard bits in r12
 485         add.f 0,r12,r12 ; carry = top guard bit, Z iff the other guard bits are 0
 486         btst.eq DBL0L,0 ; ... then Z iff the result LSB is 0
 487         cmp.eq r12,r12 ; ... then clear carry
 488         add.cs.f DBL0L,DBL0L,1 ; increment the low word if carry is still set
 489         j_s.d [blink]
 490         add.cs DBL0H,DBL0H,1 ; (delay slot) propagate the carry of the increment
 491
 492         .balign 4
 493 .Large_shift_dbl0: ; DBL1 has the larger exponent, shift count r8 >= 32
 494         brhs r8,55,.Lret_dbl1 ; shift count 55 or more: return DBL1 unchanged
 495         bmsk_s DBL0H,DBL0H,19 ; keep only the low 20 bits (fraction) of DBL0H
 496         brne r6,0,.Lno_denorm_large_shift_dbl0 ; NOTE(review): r6 holds the DBL1 exponent (see the register comment at .Ldbl1_gt); confirm it is the intended denormal test for DBL0
 497         add_s r12,r12,1 ; shift one less; keep r12 == -r8
 498         brne.d r8,33,.Lfixed_denorm_large_shift_dbl0
 499         sub r8,r8,1 ; (delay slot)
 500         bset_s DBL0H,DBL0H,20
 501 .Lshift32_dbl0: ; shift count is exactly 32: move whole words
 502         mov_s r12,DBL0L ; old low word becomes the guard bits
 503         mov_s DBL0L,DBL0H
 504         brlt.d r10,0,.Lrsub ; signs differ: subtract
 505         mov_s DBL0H,0 ; (delay slot)
 506         b_s .Ladd_dbl1_gt
 507
 508         .balign 4
 509 .Lno_denorm_large_shift_dbl0: ; shift count r8 >= 32, leading 1 to be made explicit
 510         breq.d r8,32,.Lshift32_dbl0
 511         bset_l DBL0H,DBL0H,20 ; (delay slot) make the leading 1 explicit
 512 .Lfixed_denorm_large_shift_dbl0: ; r12 == -r8
 513         asl r4,DBL0H,r12 ; NOTE(review): shift counts presumably taken modulo 32 - confirm
 514         lsr r5,DBL0L,r8
 515         asl.f 0,DBL0L,r12 ; Z iff the bits of DBL0L dropped below the guard word are 0
 516         lsr DBL0L,DBL0H,r8 ; new low word comes from the old high word
 517         or r12,r4,r5 ; r12 = guard bits
 518         tst.eq r12,1
 519         or.ne r12,r12,2 ; dropped bits (or bit 0) non-zero: record that in bit 1
 520         brlt.d r10,0,.Lrsub ; signs differ: subtract
 521         mov_s DBL0H,0 ; (delay slot)
 522         b_l .Ladd_dbl1_gt
 523         ENDFUNC(__adddf3)
 524         ENDFUNC(__subdf3)