libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S

   1 /* Copyright (C) 2008-2020 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 /*
  27    to calculate a := b/x as b*y, with y := 1/x:
  28    - x is in the range [1..2)
  29    - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
  30      Precision is higher for polynoms used to evaluate input with larger
  31      value.
  32    - Do one newton-raphson iteration step to double the precision,
  33      then multiply this with the divisor
  34         -> more time to decide if dividend is subnormal
  35      - the worst error propagation is on the side of the value range
  36        with the least initial defect, thus giving us about 30 bits precision.
  37       The truncation error for the either is less than 1 + x/2 ulp.
  38       A 31 bit inverse can be simply calculated by using x with implicit 1
  39       and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
  40       with the bare fraction part of x, then add in y0^2 for the implicit
  41       1 of x.
  42     - If calculating a 31 bit inverse, the systematic error is less than
  43       -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
  44     - If we calculate our seed with a 32 bit fraction, we can archive a
  45       tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
  46       only need to take the step to calculate the 2nd stage rest and
  47       rounding adjust 1/32th of the time.  However, if we use a 20 bit
  48       fraction for the seed, the negative error can exceed -2 ulp/128, (2)
  49       thus for a simple add / tst check, we need to do the 2nd stage
  50       rest calculation/ rounding adjust 1/16th of the time.
  51       (1): The inexactness of the 32 bit inverse contributes an error in the
  52       range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
  53       rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
  54       x/2 + 1/x <= 1.5 .
  55       (2): Unless proven otherwise.  I have not actually looked for an
  56       example where -2 ulp/128 is exceeded, and my calculations indicate
  57       that the excess, if existent, is less than -1/512 ulp.
  58     ??? The algorithm is still based on the ARC700 optimized code.
  59     Maybe we could make better use of 64 bit multiply results and/or mmed .
  60  */
  61 #include "../arc-ieee-754.h"
  62
  63 /* N.B. fp-bit.c does double rounding on denormal numbers.  */
  64 #if 0 /* DEBUG */
  65         .global __divdf3
  66         FUNC(__divdf3)
  67         .balign 4
  68 __divdf3:
  69         push_s blink
  70         push_s r2
  71         push_s r3
  72         push_s r0
  73         bl.d __divdf3_c
  74         push_s r1
  75         ld_s r2,[sp,12]
  76         ld_s r3,[sp,8]
  77         st_s r0,[sp,12]
  78         st_s r1,[sp,8]
  79         pop_s r1
  80         bl.d __divdf3_asm
  81         pop_s r0
  82         pop_s r3
  83         pop_s r2
  84         pop_s blink
  85         cmp r0,r2
  86         cmp.eq r1,r3
  87         jeq_s [blink]
  88         and r12,DBL0H,DBL1H
  89         bic.f 0,0x7ff80000,r12 ; both NaN -> OK
  90         jeq_s [blink]
  91         bl abort
  92         ENDFUNC(__divdf3)
  93 #define __divdf3 __divdf3_asm
  94 #endif /* DEBUG */
  95
  96         FUNC(__divdf3)
  97         .balign 4
  98 .L7ff00000:
  99         .long 0x7ff00000
 100 .Ldivtab:
 101         .long 0xfc0fffe1
 102         .long 0xf46ffdfb
 103         .long 0xed1ffa54
 104         .long 0xe61ff515
 105         .long 0xdf7fee75
 106         .long 0xd91fe680
 107         .long 0xd2ffdd52
 108         .long 0xcd1fd30c
 109         .long 0xc77fc7cd
 110         .long 0xc21fbbb6
 111         .long 0xbcefaec0
 112         .long 0xb7efa100
 113         .long 0xb32f92bf
 114         .long 0xae8f83b7
 115         .long 0xaa2f7467
 116         .long 0xa5ef6479
 117         .long 0xa1cf53fa
 118         .long 0x9ddf433e
 119         .long 0x9a0f3216
 120         .long 0x965f2091
 121         .long 0x92df0f11
 122         .long 0x8f6efd05
 123         .long 0x8c1eeacc
 124         .long 0x88eed876
 125         .long 0x85dec615
 126         .long 0x82eeb3b9
 127         .long 0x800ea10b
 128         .long 0x7d3e8e0f
 129         .long 0x7a8e7b3f
 130         .long 0x77ee6836
 131         .long 0x756e5576
 132         .long 0x72fe4293
 133         .long 0x709e2f93
 134         .long 0x6e4e1c7f
 135         .long 0x6c0e095e
 136         .long 0x69edf6c5
 137         .long 0x67cde3a5
 138         .long 0x65cdd125
 139         .long 0x63cdbe25
 140         .long 0x61ddab3f
 141         .long 0x600d991f
 142         .long 0x5e3d868c
 143         .long 0x5c6d7384
 144         .long 0x5abd615f
 145         .long 0x590d4ecd
 146         .long 0x576d3c83
 147         .long 0x55dd2a89
 148         .long 0x545d18e9
 149         .long 0x52dd06e9
 150         .long 0x516cf54e
 151         .long 0x4ffce356
 152         .long 0x4e9cd1ce
 153         .long 0x4d3cbfec
 154         .long 0x4becae86
 155         .long 0x4aac9da4
 156         .long 0x496c8c73
 157         .long 0x483c7bd3
 158         .long 0x470c6ae8
 159         .long 0x45dc59af
 160         .long 0x44bc4915
 161         .long 0x43ac3924
 162         .long 0x428c27fb
 163         .long 0x418c187a
 164         .long 0x407c07bd
 165
 166 __divdf3_support: /* This label makes debugger output saner.  */
 167         .balign 4
 168 .Ldenorm_dbl1:
 169         brge r6, \
 170                 0x43500000,.Linf_NaN ; large number / denorm -> Inf
 171         bmsk.f r12,DBL1H,19
 172         mov.eq r12,DBL1L
 173         mov.eq DBL1L,0
 174         sub.eq r7,r7,32
 175         norm.f r11,r12 ; flag for x/0 -> Inf check
 176         beq_s .Linf_NaN
 177         mov.mi r11,0
 178         add.pl r11,r11,1
 179         add_s r12,r12,r12
 180         asl r8,r12,r11
 181         rsub r12,r11,31
 182         lsr r12,DBL1L,r12
 183         tst_s DBL1H,DBL1H
 184         or r8,r8,r12
 185         lsr r4,r8,26
 186         lsr DBL1H,r8,12
 187         ld.as r4,[r10,r4]
 188         bxor.mi DBL1H,DBL1H,31
 189         sub r11,r11,11
 190         asl DBL1L,DBL1L,r11
 191         sub r11,r11,1
 192         mulu64 r4,r8
 193         sub r7,r7,r11
 194         b.d .Lpast_denorm_dbl1
 195         asl r7,r7,20
 196
 197         .balign 4
 198 .Ldenorm_dbl0:
 199         bmsk.f r12,DBL0H,19
 200         ; wb stall
 201         mov.eq r12,DBL0L
 202         sub.eq r6,r6,32
 203         norm.f r11,r12 ; flag for 0/x -> 0 check
 204         brge r7, \
 205                 0x43500000, .Lret0_2 ; denorm/large number -> 0
 206         beq_s .Lret0_2
 207         mov.mi r11,0
 208         add.pl r11,r11,1
 209         asl r12,r12,r11
 210         sub r6,r6,r11
 211         add.f 0,r6,31
 212         lsr r10,DBL0L,r6
 213         mov.mi r10,0
 214         add r6,r6,11+32
 215         neg.f r11,r6
 216         asl DBL0L,DBL0L,r11
 217         mov.pl DBL0L,0
 218         sub r6,r6,32-1
 219         b.d .Lpast_denorm_dbl0
 220         asl r6,r6,20
 221
 222 .Linf_NaN:
 223         tst_s DBL0L,DBL0L ; 0/0 -> NaN
 224         xor_s DBL1H,DBL1H,DBL0H
 225         bclr.eq.f DBL0H,DBL0H,31
 226         bmsk DBL0H,DBL1H,30
 227         xor_s DBL0H,DBL0H,DBL1H
 228         sub.eq DBL0H,DBL0H,1
 229         mov_s DBL0L,0
 230         j_s.d [blink]
 231         or DBL0H,DBL0H,r9
 232         .balign 4
 233 .Lret0_2:
 234         xor_s DBL1H,DBL1H,DBL0H
 235         mov_s DBL0L,0
 236         bmsk DBL0H,DBL1H,30
 237         j_s.d [blink]
 238         xor_s DBL0H,DBL0H,DBL1H
 239         .balign 4
 240         .global __divdf3
 241 /* N.B. the spacing between divtab and the sub3 to get its address must
 242    be a multiple of 8.  */
 243 __divdf3:
 244         asl r8,DBL1H,12
 245         lsr r4,r8,26
 246         sub3 r10,pcl,61; (.-.Ldivtab) >> 3
 247         ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 248         ld.as r4,[r10,r4]
 249         lsr r12,DBL1L,20
 250         and.f r7,DBL1H,r9
 251         or r8,r8,r12
 252         mulu64 r4,r8
 253         beq.d .Ldenorm_dbl1
 254 .Lpast_denorm_dbl1:
 255         and.f r6,DBL0H,r9
 256         breq.d r7,r9,.Linf_nan_dbl1
 257         asl r4,r4,12
 258         sub r4,r4,mhi
 259         mulu64 r4,r4
 260         beq.d .Ldenorm_dbl0
 261         lsr r8,r8,1
 262         breq.d r6,r9,.Linf_nan_dbl0
 263         asl r12,DBL0H,11
 264         lsr r10,DBL0L,21
 265 .Lpast_denorm_dbl0:
 266         bset r8,r8,31
 267         mulu64 mhi,r8
 268         add_s r12,r12,r10
 269         bset r5,r12,31
 270         cmp r5,r8
 271         cmp.eq DBL0L,DBL1L
 272         lsr.cc r5,r5,1
 273         sub r4,r4,mhi ; u1.31 inverse, about 30 bit
 274         mulu64 r5,r4 ; result fraction highpart
 275         lsr r8,r8,2 ; u3.29
 276         add r5,r6, /* wait for immediate */ \
 277                 0x3fe00000
 278         mov r11,mhi ; result fraction highpart
 279         mulu64 r11,r8 ; u-28.31
 280         asl_s DBL1L,DBL1L,9 ; u-29.23:9
 281         sbc r6,r5,r7
 282         mov r12,mlo ; u-28.31
 283         mulu64 r11,DBL1L ; mhi: u-28.23:9
 284         add.cs DBL0L,DBL0L,DBL0L
 285         asl_s DBL0L,DBL0L,6 ; u-26.25:7
 286         asl r10,r11,23
 287         sub_l DBL0L,DBL0L,r12
 288         lsr r7,r11,9
 289         sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
 290         mul64 r5,r4 ; mhi: result fraction lowpart
 291         xor.f 0,DBL0H,DBL1H
 292         and DBL0H,r6,r9
 293         add_s DBL0H,DBL0H,r7
 294         bclr r12,r9,20 ; 0x7fe00000
 295         brhs.d r6,r12,.Linf_denorm
 296         bxor.mi DBL0H,DBL0H,31
 297         add.f r12,mhi,0x11
 298         asr r9,r12,5
 299         sub.mi DBL0H,DBL0H,1
 300         add.f DBL0L,r9,r10
 301         tst r12,0x1c
 302         jne.d [blink]
 303         add.cs DBL0H,DBL0H,1
 304         /* work out exact rounding if we fall through here.  */
 305         /* We know that the exact result cannot be represented in double
 306            precision.  Find the mid-point between the two nearest
 307            representable values, multiply with the divisor, and check if
 308            the result is larger than the dividend.  Since we want to know
 309            only the sign bit, it is sufficient to calculate only the
 310            highpart of the lower 64 bits.  */
 311         mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
 312         sub.f DBL0L,DBL0L,1
 313         asl r12,r9,2 ; u-22.30:2
 314         sub.cs DBL0H,DBL0H,1
 315         sub.f r12,r12,2
 316         mov r10,mlo ; rest before considering r12 in r5 : -r10
 317         mulu64 r12,DBL1L ; mhi: u-51.32
 318         asl r5,r5,25 ; s-51.7:25
 319         lsr r10,r10,7 ; u-51.30:2
 320         mov r7,mhi ; u-51.32
 321         mulu64 r12,r8 ; mlo: u-51.31:1
 322         sub r5,r5,r10
 323         add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
 324         bset r7,r7,0 ; make sure that the result is not zero, and that
 325         sub r5,r5,r7 ; a highpart zero appears negative
 326         sub.f r5,r5,mlo ; rest msw
 327         add.pl.f DBL0L,DBL0L,1
 328         j_s.d [blink]
 329         add.eq DBL0H,DBL0H,1
 330
 331 .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
 332         or.f 0,r6,DBL0L
 333         cmp.ne r6,r9
 334         not_s DBL0L,DBL1H
 335         sub_s.ne DBL0L,DBL0L,DBL0L
 336         tst_s DBL0H,DBL0H
 337         add_s DBL0H,DBL1H,DBL0L
 338         j_s.d [blink]
 339         bxor.mi DBL0H,DBL0H,31
 340 .Linf_nan_dbl0:
 341         tst_s DBL1H,DBL1H
 342         j_s.d [blink]
 343         bxor.mi DBL0H,DBL0H,31
 344         .balign 4
 345 .Linf_denorm:
 346         lsr r12,r6,28
 347         brlo.d r12,0xc,.Linf
 348 .Ldenorm:
 349         asr r6,r6,20
 350         neg r9,r6
 351         mov_s DBL0H,0
 352         brhs.d r9,54,.Lret0
 353         bxor.mi DBL0H,DBL0H,31
 354         add r12,mhi,1
 355         and r12,r12,-4
 356         rsub r7,r6,5
 357         asr r10,r12,28
 358         bmsk r4,r12,27
 359         min r7,r7,31
 360         asr DBL0L,r4,r7
 361         add DBL1H,r11,r10
 362         abs.f r10,r4
 363         sub.mi r10,r10,1
 364         add.f r7,r6,32-5
 365         asl r4,r4,r7
 366         mov.mi r4,r10
 367         add.f r10,r6,23
 368         rsub r7,r6,9
 369         lsr r7,DBL1H,r7
 370         asl r10,DBL1H,r10
 371         or.pnz DBL0H,DBL0H,r7
 372         or.mi r4,r4,r10
 373         mov.mi r10,r7
 374         add.f DBL0L,r10,DBL0L
 375         add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
 376         bxor.f 0,r4,31
 377         add.pnz.f DBL0L,DBL0L,1
 378         add.cs.f DBL0H,DBL0H,1
 379         jne_s [blink]
 380         /* Calculation so far was not conclusive; calculate further rest.  */
 381         mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
 382         asr.f r12,r12,3
 383         asl r5,r5,25 ; s-51.7:25
 384         mov r11,mlo ; rest before considering r12 in r5 : -r11
 385         mulu64 r12,r8 ; u-51.31:1
 386         and r9,DBL0L,1 ; tie-breaker: round to even
 387         lsr r11,r11,7 ; u-51.30:2
 388         mov DBL1H,mlo ; u-51.31:1
 389         mulu64 r12,DBL1L ; u-51.62:2
 390         sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
 391         add_s DBL1H,DBL1H,r11
 392         sub DBL1H,DBL1H,r5 ; -rest msw
 393         add_s DBL1H,DBL1H,mhi ; -rest msw
 394         add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
 395         tst_s DBL1H,DBL1H
 396         cmp.eq mlo,r9
 397         add.cs.f DBL0L,DBL0L,1
 398         j_s.d [blink]
 399         add.cs DBL0H,DBL0H,1
 400
 401 .Lret0:
 402         /* return +- 0 */
 403         j_s.d [blink]
 404         mov_s DBL0L,0
 405 .Linf:
 406         mov_s DBL0H,r9
 407         mov_s DBL0L,0
 408         j_s.d [blink]
 409         bxor.mi DBL0H,DBL0H,31
 410         ENDFUNC(__divdf3)