libgcc/config/arc/ieee-754/divdf3.S

   1 /* Copyright (C) 2008-2012 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 /*
  27    to calculate a := b/x as b*y, with y := 1/x:
  28    - x is in the range [1..2)
  29    - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
  30      Precision is higher for polynoms used to evaluate input with larger
  31      value.
  32    - Do one newton-raphson iteration step to double the precision,
  33      then multiply this with the divisor
  34         -> more time to decide if dividend is subnormal
  35      - the worst error propagation is on the side of the value range
  36        with the least initial defect, thus giving us about 30 bits precision.
  37       The truncation error for the either is less than 1 + x/2 ulp.
  38       A 31 bit inverse can be simply calculated by using x with implicit 1
  39       and chaining the multiplies.  For a 32 bit inverse, we multiply y0^2
  40       with the bare fraction part of x, then add in y0^2 for the implicit
  41       1 of x.
  42     - If calculating a 31 bit inverse, the systematic error is less than
  43       -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
  44     - If we calculate our seed with a 32 bit fraction, we can archive a
  45       tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
  46       only need to take the step to calculate the 2nd stage rest and
  47       rounding adjust 1/32th of the time.  However, if we use a 20 bit
  48       fraction for the seed, the negative error can exceed -2 ulp/128, (2)
  49       thus for a simple add / tst check, we need to do the 2nd stage
  50       rest calculation/ rounding adjust 1/16th of the time.
  51       (1): The inexactness of the 32 bit inverse contributes an error in the
  52       range of (-1 .. +(1+x/2) ) ulp/128.  Leaving out the low word of the
  53       rest contributes an error < +1/x ulp/128 .  In the interval [1,2),
  54       x/2 + 1/x <= 1.5 .
  55       (2): Unless proven otherwise.  I have not actually looked for an
  56       example where -2 ulp/128 is exceeded, and my calculations indicate
  57       that the excess, if existent, is less than -1/512 ulp.
  58  */
  59 #include "arc-ieee-754.h"
  60
  61 /* N.B. fp-bit.c does double rounding on denormal numbers.  */
  62 #if 0 /* DEBUG */
  63         .global __divdf3
  64         FUNC(__divdf3)
  65         .balign 4
  66 __divdf3:
  67         push_s blink
  68         push_s r2
  69         push_s r3
  70         push_s r0
  71         bl.d __divdf3_c
  72         push_s r1
  73         ld_s r2,[sp,12]
  74         ld_s r3,[sp,8]
  75         st_s r0,[sp,12]
  76         st_s r1,[sp,8]
  77         pop_s r1
  78         bl.d __divdf3_asm
  79         pop_s r0
  80         pop_s r3
  81         pop_s r2
  82         pop_s blink
  83         cmp r0,r2
  84         cmp.eq r1,r3
  85         jeq_s [blink]
  86         and r12,DBL0H,DBL1H
  87         bic.f 0,0x7ff80000,r12 ; both NaN -> OK
  88         jeq_s [blink]
  89         bl abort
  90         ENDFUNC(__divdf3)
  91 #define __divdf3 __divdf3_asm
  92 #endif /* DEBUG */
  93
  94         FUNC(__divdf3)
  95 __divdf3_support: /* This label makes debugger output saner.  */
  96         .balign 4
  97 .Ldenorm_dbl1:
  98         brge r6, \
  99                 0x43500000,.Linf_NaN ; large number / denorm -> Inf
 100         bmsk.f r12,DBL1H,19
 101         mov.eq r12,DBL1L
 102         mov.eq DBL1L,0
 103         sub.eq r7,r7,32
 104         norm.f r11,r12 ; flag for x/0 -> Inf check
 105         beq_s .Linf_NaN
 106         mov.mi r11,0
 107         add.pl r11,r11,1
 108         add_s r12,r12,r12
 109         asl r8,r12,r11
 110         rsub r12,r11,31
 111         lsr r12,DBL1L,r12
 112         tst_s DBL1H,DBL1H
 113         or r8,r8,r12
 114         lsr r4,r8,26
 115         lsr DBL1H,r8,12
 116         ld.as r4,[r10,r4]
 117         bxor.mi DBL1H,DBL1H,31
 118         sub r11,r11,11
 119         asl DBL1L,DBL1L,r11
 120         sub r11,r11,1
 121         mpyhu r5,r4,r8
 122         sub r7,r7,r11
 123         asl r4,r4,12
 124         b.d .Lpast_denorm_dbl1
 125         asl r7,r7,20
 126         ; wb stall
 127
 128         .balign 4
 129 .Ldenorm_dbl0:
 130         bmsk.f r12,DBL0H,19
 131         ; wb stall
 132         mov.eq r12,DBL0L
 133         sub.eq r6,r6,32
 134         norm.f r11,r12 ; flag for 0/x -> 0 check
 135         brge r7, \
 136                 0x43500000, .Lret0_NaN ; denorm/large number -> 0
 137         beq_s .Lret0_NaN
 138         mov.mi r11,0
 139         add.pl r11,r11,1
 140         asl r12,r12,r11
 141         sub r6,r6,r11
 142         add.f 0,r6,31
 143         lsr r10,DBL0L,r6
 144         mov.mi r10,0
 145         add r6,r6,11+32
 146         neg.f r11,r6
 147         asl DBL0L,DBL0L,r11
 148         mov.pl DBL0L,0
 149         sub r6,r6,32-1
 150         b.d .Lpast_denorm_dbl0
 151         asl r6,r6,20
 152
 153 .Linf_NaN:
 154         tst_s DBL0L,DBL0L ; 0/0 -> NaN
 155         xor_s DBL1H,DBL1H,DBL0H
 156         bclr.eq.f DBL0H,DBL0H,31
 157         bmsk DBL0H,DBL1H,30
 158         xor_s DBL0H,DBL0H,DBL1H
 159         sub.eq DBL0H,DBL0H,1
 160         mov_s DBL0L,0
 161         j_s.d [blink]
 162         or DBL0H,DBL0H,r9
 163         .balign 4
 164 .Lret0_NaN:
 165         xor_s DBL1H,DBL1H,DBL0H
 166         cmp_s r12,r9
 167         mov_s DBL0L,0
 168         bmsk DBL0H,DBL1H,30
 169         xor_s DBL0H,DBL0H,DBL1H
 170         j_s.d [blink]
 171         sub.hi DBL0H,DBL0H,1
 172 .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
 173         not_s DBL0L,DBL1H
 174         cmp r6,r9
 175         sub_s.ne DBL0L,DBL0L,DBL0L
 176         tst_s DBL0H,DBL0H
 177         add_s DBL0H,DBL1H,DBL0L
 178         j_s.d [blink]
 179         bxor.mi DBL0H,DBL0H,31
 180 .Linf_nan_dbl0:
 181         tst_s DBL1H,DBL1H
 182         j_s.d [blink]
 183         bxor.mi DBL0H,DBL0H,31
 184         .balign 4
 185         .global __divdf3
 186 /* N.B. the spacing between divtab and the add3 to get its address must
 187    be a multiple of 8.  */
 188 __divdf3:
 189         asl r8,DBL1H,12
 190         lsr r12,DBL1L,20
 191         lsr r4,r8,26
 192         add3 r10,pcl,59 ; (.Ldivtab-.) >> 3
 193         ld.as r4,[r10,r4]
 194         ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
 195         or r8,r8,r12
 196         mpyhu r5,r4,r8
 197         and.f r7,DBL1H,r9
 198         asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline.
 199         beq.d .Ldenorm_dbl1
 200         and r6,DBL0H,r9
 201 .Lpast_denorm_dbl1: ; wb stall
 202         sub r4,r4,r5
 203         mpyhu r5,r4,r4
 204         breq.d r6,0,.Ldenorm_dbl0
 205         lsr r8,r8,1
 206         asl r12,DBL0H,11
 207         lsr r10,DBL0L,21
 208 .Lpast_denorm_dbl0: ; wb stall
 209         bset r8,r8,31
 210         mpyhu r11,r5,r8
 211         add_s r12,r12,r10
 212         bset r5,r12,31
 213         cmp r5,r8
 214         cmp.eq DBL0L,DBL1L
 215         ; wb stall
 216         lsr.cc r5,r5,1
 217         sub r4,r4,r11 ; u1.31 inverse, about 30 bit
 218         mpyhu r11,r5,r4 ; result fraction highpart
 219         breq r7,r9,.Linf_nan_dbl1
 220         lsr r8,r8,2 ; u3.29
 221         add r5,r6, /* wait for immediate /  XMAC wb stall */ \
 222                 0x3fe00000
 223         ; wb stall (not for XMAC)
 224         breq r6,r9,.Linf_nan_dbl0
 225         mpyu r12,r11,r8 ; u-28.31
 226         asl_s DBL1L,DBL1L,9 ; u-29.23:9
 227         sbc r6,r5,r7
 228         ; resource conflict (not for XMAC)
 229         mpyhu r5,r11,DBL1L ; u-28.23:9
 230         add.cs DBL0L,DBL0L,DBL0L
 231         asl_s DBL0L,DBL0L,6 ; u-26.25:7
 232         asl r10,r11,23
 233         sub_l DBL0L,DBL0L,r12
 234         ; wb stall (before 'and' for XMAC)
 235         lsr r7,r11,9
 236         sub r5,DBL0L,r5 ; rest msw ; u-26.31:0
 237         mpyh r12,r5,r4 ; result fraction lowpart
 238         xor.f 0,DBL0H,DBL1H
 239         and DBL0H,r6,r9
 240         add_s DBL0H,DBL0H,r7 ; (XMAC wb stall)
 241         bxor.mi DBL0H,DBL0H,31
 242         brhs r6, /*  wb stall / wait for immediate */ \
 243                 0x7fe00000,.Linf_denorm
 244         add.f r12,r12,0x11
 245         asr r9,r12,5
 246         sub.mi DBL0H,DBL0H,1
 247         add.f DBL0L,r9,r10
 248         tst r12,0x1c
 249         jne.d [blink]
 250         add.cs DBL0H,DBL0H,1
 251         /* work out exact rounding if we fall through here.  */
 252         /* We know that the exact result cannot be represented in double
 253            precision.  Find the mid-point between the two nearest
 254            representable values, multiply with the divisor, and check if
 255            the result is larger than the dividend.  Since we want to know
 256            only the sign bit, it is sufficient to calculate only the
 257            highpart of the lower 64 bits.  */
 258         sub.f DBL0L,DBL0L,1
 259         asl r12,r9,2 ; u-22.30:2
 260         mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10
 261         sub.cs DBL0H,DBL0H,1
 262         sub.f r12,r12,2
 263         ; resource conflict (not for XMAC)
 264         mpyhu r7,r12,DBL1L ; u-51.32
 265         asl r5,r5,25 ; s-51.7:25
 266         lsr r10,r10,7 ; u-51.30:2
 267         ; resource conflict (not for XMAC)
 268         ; resource conflict (not for XMAC)
 269         mpyu r9,r12,r8 ; u-51.31:1
 270         sub r5,r5,r10
 271         add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
 272         bset r7,r7,0 ; make sure that the result is not zero, and that
 273         ; wb stall (one earlier for XMAC)
 274         sub r5,r5,r7 ; a highpart zero appears negative
 275         sub.f r5,r5,r9 ; rest msw
 276         add.pl.f DBL0L,DBL0L,1
 277         j_s.d [blink]
 278         add.eq DBL0H,DBL0H,1
 279
 280         .balign 4
 281 .Linf_denorm:
 282         brlo r6,0xc0000000,.Linf
 283 .Ldenorm:
 284         asr r6,r6,20
 285         neg r9,r6
 286         mov_s DBL0H,0
 287         brhs.d r9,54,.Lret0
 288         bxor.mi DBL0H,DBL0H,31
 289         add_l r12,r12,1
 290         and r12,r12,-4
 291         rsub r7,r6,5
 292         asr r10,r12,28
 293         bmsk r4,r12,27
 294         asrs DBL0L,r4,r7
 295         add DBL1H,r11,r10
 296         add.f r7,r6,32-5
 297         abss r10,r4
 298         asl r4,r4,r7
 299         mov.mi r4,r10
 300         add.f r10,r6,23
 301         rsub r7,r6,9
 302         lsr r7,DBL1H,r7
 303         asl r10,DBL1H,r10
 304         or.pnz DBL0H,DBL0H,r7
 305         or.mi r4,r4,r10
 306         mov.mi r10,r7
 307         add.f DBL0L,r10,DBL0L
 308         add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
 309         bxor.f 0,r4,31
 310         add.pnz.f DBL0L,DBL0L,1
 311         add.cs.f DBL0H,DBL0H,1
 312         jne_l [blink]
 313         /* Calculation so far was not conclusive; calculate further rest.  */
 314         mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11
 315         asr.f r12,r12,3
 316         asl r5,r5,25 ; s-51.7:25
 317         ; resource conflict (not for XMAC)
 318         mpyu DBL1H,r12,r8 ; u-51.31:1
 319         and r9,DBL0L,1 ; tie-breaker: round to even
 320         lsr r11,r11,7 ; u-51.30:2
 321         ; resource conflict (not for XMAC)
 322         mpyhu r8,r12,DBL1L ; u-51.32
 323         sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
 324         add_s DBL1H,DBL1H,r11
 325         ; resource conflict (not for XMAC)
 326         ; resource conflict (not for XMAC)
 327         mpyu r12,r12,DBL1L ; u-83.30:2
 328         sub DBL1H,DBL1H,r5 ; -rest msw
 329         add_s DBL1H,DBL1H,r8 ; -rest msw
 330         add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
 331         ; wb stall (XMAC: Before add.f)
 332         tst_s DBL1H,DBL1H
 333         cmp.eq r12,r9
 334         add.cs.f DBL0L,DBL0L,1
 335         j_s.d [blink]
 336         add.cs DBL0H,DBL0H,1
 337
 338 .Lret0:
 339         /* return +- 0 */
 340         j_s.d [blink]
 341         mov_s DBL0L,0
 342 .Linf:
 343         mov_s DBL0H,r9
 344         mov_s DBL0L,0
 345         j_s.d [blink]
 346         bxor.mi DBL0H,DBL0H,31
 347
 348         .balign 4
 349 .Ldivtab:
 350         .long 0xfc0fffe1
 351         .long 0xf46ffdfb
 352         .long 0xed1ffa54
 353         .long 0xe61ff515
 354         .long 0xdf7fee75
 355         .long 0xd91fe680
 356         .long 0xd2ffdd52
 357         .long 0xcd1fd30c
 358         .long 0xc77fc7cd
 359         .long 0xc21fbbb6
 360         .long 0xbcefaec0
 361         .long 0xb7efa100
 362         .long 0xb32f92bf
 363         .long 0xae8f83b7
 364         .long 0xaa2f7467
 365         .long 0xa5ef6479
 366         .long 0xa1cf53fa
 367         .long 0x9ddf433e
 368         .long 0x9a0f3216
 369         .long 0x965f2091
 370         .long 0x92df0f11
 371         .long 0x8f6efd05
 372         .long 0x8c1eeacc
 373         .long 0x88eed876
 374         .long 0x85dec615
 375         .long 0x82eeb3b9
 376         .long 0x800ea10b
 377         .long 0x7d3e8e0f
 378         .long 0x7a8e7b3f
 379         .long 0x77ee6836
 380         .long 0x756e5576
 381         .long 0x72fe4293
 382         .long 0x709e2f93
 383         .long 0x6e4e1c7f
 384         .long 0x6c0e095e
 385         .long 0x69edf6c5
 386         .long 0x67cde3a5
 387         .long 0x65cdd125
 388         .long 0x63cdbe25
 389         .long 0x61ddab3f
 390         .long 0x600d991f
 391         .long 0x5e3d868c
 392         .long 0x5c6d7384
 393         .long 0x5abd615f
 394         .long 0x590d4ecd
 395         .long 0x576d3c83
 396         .long 0x55dd2a89
 397         .long 0x545d18e9
 398         .long 0x52dd06e9
 399         .long 0x516cf54e
 400         .long 0x4ffce356
 401         .long 0x4e9cd1ce
 402         .long 0x4d3cbfec
 403         .long 0x4becae86
 404         .long 0x4aac9da4
 405         .long 0x496c8c73
 406         .long 0x483c7bd3
 407         .long 0x470c6ae8
 408         .long 0x45dc59af
 409         .long 0x44bc4915
 410         .long 0x43ac3924
 411         .long 0x428c27fb
 412         .long 0x418c187a
 413         .long 0x407c07bd
 414 .L7ff00000:
 415         .long 0x7ff00000
 416         ENDFUNC(__divdf3)