libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S

   1 /* Copyright (C) 2008-2016 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 /*
  27    to calculate a := b/x as b*y, with y := 1/x:
  28    - x is in the range [1..2)
  29    - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
  30      Precision is higher for polynoms used to evaluate input with larger
  31      value.
  32    - Do one newton-raphson iteration step to double the precision,
  33      then multiply this with the divisor
  34         -> more time to decide if dividend is subnormal
  35      - the worst error propagation is on the side of the value range
  36        with the least initial defect, thus giving us about 30 bits precision.
  37       The truncation error for the either is less than 1 + x/2 ulp.
  38       A 31 bit inverse can be simply calculated by using x with implicit 1
  39       and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
  40       with the bare fraction part of x, then add in y0^2 for the implicit
  41       1 of x.
  42     - If calculating a 31 bit inverse, the systematic error is less than
  43       -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
  44     - If we calculate our seed with a 32 bit fraction, we can archive a
  45       tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
  46       only need to take the step to calculate the 2nd stage rest and
  47       rounding adjust 1/32th of the time.  However, if we use a 20 bit
  48       fraction for the seed, the negative error can exceed -2 ulp/128, (2)
  49       thus for a simple add / tst check, we need to do the 2nd stage
  50       rest calculation/ rounding adjust 1/16th of the time.
  51       (1): The inexactness of the 32 bit inverse contributes an error in the
  52       range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
  53       rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
  54       x/2 + 1/x <= 1.5 .
  55       (2): Unless proven otherwise.  I have not actually looked for an
  56       example where -2 ulp/128 is exceeded, and my calculations indicate
  57       that the excess, if existent, is less than -1/512 ulp.
  58     ??? The algorithm is still based on the ARC700 optimized code.
  59     Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
  60     results.
  61  */
  62 #include "../arc-ieee-754.h"
  63 #define mlo acc2
  64 #define mhi acc1
  65 #define mul64(b,c) mullw 0,b,c` machlw 0,b,c
  66 #define mulu64(b,c) mululw 0,b,c` machulw 0,b,c
  67
  68 /* N.B. fp-bit.c does double rounding on denormal numbers.  */
  69 #if 0 /* DEBUG */
  70         .global __divdf3
  71         FUNC(__divdf3)
  72         .balign 4
  73 __divdf3:
  74         push_s blink
  75         push_s r2
  76         push_s r3
  77         push_s r0
  78         bl.d __divdf3_c
  79         push_s r1
  80         ld_s r2,[sp,12]
  81         ld_s r3,[sp,8]
  82         st_s r0,[sp,12]
  83         st_s r1,[sp,8]
  84         pop_s r1
  85         bl.d __divdf3_asm
  86         pop_s r0
  87         pop_s r3
  88         pop_s r2
  89         pop_s blink
  90         cmp r0,r2
  91         cmp.eq r1,r3
  92         jeq_s [blink]
  93         and r12,DBL0H,DBL1H
  94         bic.f 0,0x7ff80000,r12 ; both NaN -> OK
  95         jeq_s [blink]
  96         bl abort
  97         ENDFUNC(__divdf3)
  98 #define __divdf3 __divdf3_asm
  99 #endif /* DEBUG */
 100
 101         FUNC(__divdf3)
 102         .balign 4
 103 .L7ff00000:
 104         .long 0x7ff00000
 105 .Ldivtab:
 106         .long 0xfc0fffe1
 107         .long 0xf46ffdfb
 108         .long 0xed1ffa54
 109         .long 0xe61ff515
 110         .long 0xdf7fee75
 111         .long 0xd91fe680
 112         .long 0xd2ffdd52
 113         .long 0xcd1fd30c
 114         .long 0xc77fc7cd
 115         .long 0xc21fbbb6
 116         .long 0xbcefaec0
 117         .long 0xb7efa100
 118         .long 0xb32f92bf
 119         .long 0xae8f83b7
 120         .long 0xaa2f7467
 121         .long 0xa5ef6479
 122         .long 0xa1cf53fa
 123         .long 0x9ddf433e
 124         .long 0x9a0f3216
 125         .long 0x965f2091
 126         .long 0x92df0f11
 127         .long 0x8f6efd05
 128         .long 0x8c1eeacc
 129         .long 0x88eed876
 130         .long 0x85dec615
 131         .long 0x82eeb3b9
 132         .long 0x800ea10b
 133         .long 0x7d3e8e0f
 134         .long 0x7a8e7b3f
 135         .long 0x77ee6836
 136         .long 0x756e5576
 137         .long 0x72fe4293
 138         .long 0x709e2f93
 139         .long 0x6e4e1c7f
 140         .long 0x6c0e095e
 141         .long 0x69edf6c5
 142         .long 0x67cde3a5
 143         .long 0x65cdd125
 144         .long 0x63cdbe25
 145         .long 0x61ddab3f
 146         .long 0x600d991f
 147         .long 0x5e3d868c
 148         .long 0x5c6d7384
 149         .long 0x5abd615f
 150         .long 0x590d4ecd
 151         .long 0x576d3c83
 152         .long 0x55dd2a89
 153         .long 0x545d18e9
 154         .long 0x52dd06e9
 155         .long 0x516cf54e
 156         .long 0x4ffce356
 157         .long 0x4e9cd1ce
 158         .long 0x4d3cbfec
 159         .long 0x4becae86
 160         .long 0x4aac9da4
 161         .long 0x496c8c73
 162         .long 0x483c7bd3
 163         .long 0x470c6ae8
 164         .long 0x45dc59af
 165         .long 0x44bc4915
 166         .long 0x43ac3924
 167         .long 0x428c27fb
 168         .long 0x418c187a
 169         .long 0x407c07bd
 170
 171 __divdf3_support: /* This label makes debugger output saner.  */
 172         .balign 4
 173 .Ldenorm_dbl1:
 174         brge r6, \
 175                 0x43500000,.Linf_NaN ; large number / denorm -> Inf
 176         bmsk.f r12,DBL1H,19
 177         mov.eq r12,DBL1L
 178         mov.eq DBL1L,0
 179         sub.eq r7,r7,32
 180         norm.f r11,r12 ; flag for x/0 -> Inf check
 181         beq_s .Linf_NaN
 182         mov.mi r11,0
 183         add.pl r11,r11,1
 184         add_s r12,r12,r12
 185         asl r8,r12,r11
 186         rsub r12,r11,31
 187         lsr r12,DBL1L,r12
 188         tst_s DBL1H,DBL1H
 189         or r8,r8,r12
 190         lsr r4,r8,26
 191         lsr DBL1H,r8,12
 192         ld.as r4,[r10,r4]
 193         bxor.mi DBL1H,DBL1H,31
 194         sub r11,r11,11
 195         asl DBL1L,DBL1L,r11
 196         sub r11,r11,1
 197         mulu64 (r4,r8)
 198         sub r7,r7,r11
 199         b.d .Lpast_denorm_dbl1
 200         asl r7,r7,20
 201
 202 .Linf_NaN:
 203         tst_s DBL0L,DBL0L ; 0/0 -> NaN
 204         xor_s DBL1H,DBL1H,DBL0H
 205         bclr.eq.f DBL0H,DBL0H,31
 206         bmsk DBL0H,DBL1H,30
 207         xor_s DBL0H,DBL0H,DBL1H
 208         sub.eq DBL0H,DBL0H,1
 209         mov_s DBL0L,0
 210         j_s.d [blink]
 211         or DBL0H,DBL0H,r9
 212         .balign 4
 213 .Lret0_2:
 214         xor_s DBL1H,DBL1H,DBL0H
 215         mov_s DBL0L,0
 216         bmsk DBL0H,DBL1H,30
 217         j_s.d [blink]
 218         xor_s DBL0H,DBL0H,DBL1H
 219         .balign 4
 220         .global __divdf3
 221 /* N.B. the spacing between divtab and the sub3 to get its address must
 222    be a multiple of 8.  */
 223 __divdf3:
 224         asl r8,DBL1H,12
 225         lsr r4,r8,26
 226         sub3 r10,pcl,51;(.-.Ldivtab) >> 3
 227         ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 228         ld.as r4,[r10,r4]
 229         lsr r12,DBL1L,20
 230         and.f r7,DBL1H,r9
 231         or r8,r8,r12
 232         mulu64 (r4,r8)
 233         beq.d .Ldenorm_dbl1
 234 .Lpast_denorm_dbl1:
 235         and.f r6,DBL0H,r9
 236         breq.d r7,r9,.Linf_nan_dbl1
 237         asl r4,r4,12
 238         sub r4,r4,mhi
 239         mululw 0,r4,r4
 240         machulw r5,r4,r4
 241         bne.d .Lnormal_dbl0
 242         lsr r8,r8,1
 243
 244         .balign 4
 245 .Ldenorm_dbl0:
 246         bmsk.f r12,DBL0H,19
 247         ; wb stall
 248         mov.eq r12,DBL0L
 249         sub.eq r6,r6,32
 250         norm.f r11,r12 ; flag for 0/x -> 0 check
 251         brge r7, \
 252                 0x43500000, .Lret0_2 ; denorm/large number -> 0
 253         beq_s .Lret0_2
 254         mov.mi r11,0
 255         add.pl r11,r11,1
 256         asl r12,r12,r11
 257         sub r6,r6,r11
 258         add.f 0,r6,31
 259         lsr r10,DBL0L,r6
 260         mov.mi r10,0
 261         add r6,r6,11+32
 262         neg.f r11,r6
 263         asl DBL0L,DBL0L,r11
 264         mov.pl DBL0L,0
 265         sub r6,r6,32-1
 266         b.d .Lpast_denorm_dbl0
 267         asl r6,r6,20
 268
 269         .balign 4
 270 .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
 271         or.f 0,r6,DBL0L
 272         cmp.ne r6,r9
 273         not_s DBL0L,DBL1H
 274         sub_s.ne DBL0L,DBL0L,DBL0L
 275         tst_s DBL0H,DBL0H
 276         add_s DBL0H,DBL1H,DBL0L
 277         j_s.d [blink]
 278         bxor.mi DBL0H,DBL0H,31
 279
 280         .balign 4
 281 .Lnormal_dbl0:
 282         breq.d r6,r9,.Linf_nan_dbl0
 283         asl r12,DBL0H,11
 284         lsr r10,DBL0L,21
 285 .Lpast_denorm_dbl0:
 286         bset r8,r8,31
 287         mulu64 (r5,r8)
 288         add_s r12,r12,r10
 289         bset r5,r12,31
 290         cmp r5,r8
 291         cmp.eq DBL0L,DBL1L
 292         lsr.cc r5,r5,1
 293         sub r4,r4,mhi ; u1.31 inverse, about 30 bit
 294         mululw 0,r5,r4
 295         machulw r11,r5,r4 ; result fraction highpart
 296         lsr r8,r8,2 ; u3.29
 297         add r5,r6, /* wait for immediate */ \
 298                 0x3fe00000
 299         mulu64 (r11,r8) ; u-28.31
 300         asl_s DBL1L,DBL1L,9 ; u-29.23:9
 301         sbc r6,r5,r7
 302         mov r12,mlo ; u-28.31
 303         mulu64 (r11,DBL1L) ; mhi: u-28.23:9
 304         add.cs DBL0L,DBL0L,DBL0L
 305         asl_s DBL0L,DBL0L,6 ; u-26.25:7
 306         asl r10,r11,23
 307         sub_l DBL0L,DBL0L,r12
 308         lsr r7,r11,9
 309         sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
 310         mul64 (r5,r4) ; mhi: result fraction lowpart
 311         xor.f 0,DBL0H,DBL1H
 312         and DBL0H,r6,r9
 313         add_s DBL0H,DBL0H,r7
 314         bclr r12,r9,20 ; 0x7fe00000
 315         brhs.d r6,r12,.Linf_denorm
 316         bxor.mi DBL0H,DBL0H,31
 317         add.f r12,mhi,0x11
 318         asr r9,r12,5
 319         sub.mi DBL0H,DBL0H,1
 320         add.f DBL0L,r9,r10
 321         tst r12,0x1c
 322         jne.d [blink]
 323         add.cs DBL0H,DBL0H,1
 324         /* work out exact rounding if we fall through here.  */
 325         /* We know that the exact result cannot be represented in double
 326            precision.  Find the mid-point between the two nearest
 327            representable values, multiply with the divisor, and check if
 328            the result is larger than the dividend.  Since we want to know
 329            only the sign bit, it is sufficient to calculate only the
 330            highpart of the lower 64 bits.  */
 331         mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
 332         sub.f DBL0L,DBL0L,1
 333         asl r12,r9,2 ; u-22.30:2
 334         sub.cs DBL0H,DBL0H,1
 335         sub.f r12,r12,2
 336         mov r10,mlo ; rest before considering r12 in r5 : -r10
 337         mululw 0,r12,DBL1L
 338         machulw r7,r12,DBL1L ; mhi: u-51.32
 339         asl r5,r5,25 ; s-51.7:25
 340         lsr r10,r10,7 ; u-51.30:2
 341         mulu64 (r12,r8) ; mlo: u-51.31:1
 342         sub r5,r5,r10
 343         add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
 344         bset r7,r7,0 ; make sure that the result is not zero, and that
 345         sub r5,r5,r7 ; a highpart zero appears negative
 346         sub.f r5,r5,mlo ; rest msw
 347         add.pl.f DBL0L,DBL0L,1
 348         j_s.d [blink]
 349         add.eq DBL0H,DBL0H,1
 350
 351 .Linf_nan_dbl0:
 352         tst_s DBL1H,DBL1H
 353         j_s.d [blink]
 354         bxor.mi DBL0H,DBL0H,31
 355         .balign 4
 356 .Linf_denorm:
 357         lsr r12,r6,28
 358         brlo.d r12,0xc,.Linf
 359 .Ldenorm:
 360         asr r6,r6,20
 361         neg r9,r6
 362         mov_s DBL0H,0
 363         brhs.d r9,54,.Lret0
 364         bxor.mi DBL0H,DBL0H,31
 365         add r12,mhi,1
 366         and r12,r12,-4
 367         rsub r7,r6,5
 368         asr r10,r12,28
 369         bmsk r4,r12,27
 370         min r7,r7,31
 371         asr DBL0L,r4,r7
 372         add DBL1H,r11,r10
 373         abs.f r10,r4
 374         sub.mi r10,r10,1
 375         add.f r7,r6,32-5
 376         asl r4,r4,r7
 377         mov.mi r4,r10
 378         add.f r10,r6,23
 379         rsub r7,r6,9
 380         lsr r7,DBL1H,r7
 381         asl r10,DBL1H,r10
 382         or.pnz DBL0H,DBL0H,r7
 383         or.mi r4,r4,r10
 384         mov.mi r10,r7
 385         add.f DBL0L,r10,DBL0L
 386         add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
 387         bxor.f 0,r4,31
 388         add.pnz.f DBL0L,DBL0L,1
 389         add.cs.f DBL0H,DBL0H,1
 390         jne_s [blink]
 391         /* Calculation so far was not conclusive; calculate further rest.  */
 392         mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
 393         asr.f r12,r12,3
 394         asl r5,r5,25 ; s-51.7:25
 395         mov r11,mlo ; rest before considering r12 in r5 : -r11
 396         mulu64 (r12,r8) ; u-51.31:1
 397         and r9,DBL0L,1 ; tie-breaker: round to even
 398         lsr r11,r11,7 ; u-51.30:2
 399         mov DBL1H,mlo ; u-51.31:1
 400         mulu64 (r12,DBL1L) ; u-51.62:2
 401         sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
 402         add_s DBL1H,DBL1H,r11
 403         sub DBL1H,DBL1H,r5 ; -rest msw
 404         add_s DBL1H,DBL1H,mhi ; -rest msw
 405         add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
 406         tst_s DBL1H,DBL1H
 407         cmp.eq mlo,r9
 408         add.cs.f DBL0L,DBL0L,1
 409         j_s.d [blink]
 410         add.cs DBL0H,DBL0H,1
 411
 412 .Lret0:
 413         /* return +- 0 */
 414         j_s.d [blink]
 415         mov_s DBL0L,0
 416 .Linf:
 417         mov_s DBL0H,r9
 418         mov_s DBL0L,0
 419         j_s.d [blink]
 420         bxor.mi DBL0H,DBL0H,31
 421         ENDFUNC(__divdf3)