libgcc/config/arc/ieee-754/divsf3-stdmul.S

   1 /* Copyright (C) 2008-2024 Free Software Foundation, Inc.
   2    Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
   3                 on behalf of Synopsys Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 Under Section 7 of GPL version 3, you are granted additional
  18 permissions described in the GCC Runtime Library Exception, version
  19 3.1, as published by the Free Software Foundation.
  20
  21 You should have received a copy of the GNU General Public License and
  22 a copy of the GCC Runtime Library Exception along with this program;
  23 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  24 <http://www.gnu.org/licenses/>.  */
  25
  26 /*
  27    - calculate 15..18 bit inverse using a table of approximating polynomials.
  28      Precision is higher for polynomials used to evaluate input with larger
  29      value.
  30    - do one Newton-Raphson iteration step to double the precision,
  31      then multiply this with the divisor
  32         -> more time to decide if dividend is subnormal
  33      - the worst error propagation is on the side of the value range
  34        with the least initial defect, thus giving us about 30 bits precision.
  35  */
  36 #include "arc-ieee-754.h"
  37
  38 #if 0 /* DEBUG: disabled self-check wrapper; runs __divsf3_c and __divsf3_asm on the same operands and aborts on a mismatch */
  39         .global __divsf3
  40         FUNC(__divsf3)
  41         .balign 4
  42 __divsf3:
  43         push_s blink ; keep the return address across the two calls
  44         push_s r1 ; save the second operand
  45         bl.d __divsf3_c ; first call (presumably a C reference implementation - confirm)
  46         push_s r0 ; delay slot: save the first operand
  47         ld_s r1,[sp,4] ; reload the saved second operand
  48         st_s r0,[sp,4] ; reuse that stack slot for the result of __divsf3_c
  49         bl.d __divsf3_asm ; second call: the assembly implementation below
  50         pop_s r0 ; delay slot: restore the first operand
  51         pop_s r1 ; r1 = result of __divsf3_c
  52         pop_s blink
  53         cmp r0,r1 ; the two results are compared bit for bit
  54 #if 1
  55         bne abort
  56         jeq_s [blink]
  57         b abort
  58 #else
  59         bne abort
  60         j_s [blink]
  61 #endif
  62         ENDFUNC(__divsf3)
  63 #define __divsf3 __divsf3_asm /* the code that follows is then assembled under the name __divsf3_asm */
  64 #endif /* DEBUG */
  65
  66         FUNC(__divsf3)
  67         .balign 4 ; data placed ahead of the code; __divsf3 reaches it with hard-coded pcl offsets
  68 .L7f800000: ; exponent-field mask; loaded into r9 by the ld.as at the start of __divsf3
  69         .long 0x7f800000
  70 .Ldivtab: ; 64 words, one per value of the 6-bit index taken from the divisor mantissa
  71         .long 0xfc0ffff0 ; each word is used both as a multiplier (MPYHU) and, shifted left by 13, as the minuend of the first inverse estimate
  72         .long 0xf46ffefd
  73         .long 0xed1ffd2a
  74         .long 0xe627fa8e
  75         .long 0xdf7ff73b
  76         .long 0xd917f33b
  77         .long 0xd2f7eea3
  78         .long 0xcd1fe986
  79         .long 0xc77fe3e7
  80         .long 0xc21fdddb
  81         .long 0xbcefd760
  82         .long 0xb7f7d08c
  83         .long 0xb32fc960
  84         .long 0xae97c1ea
  85         .long 0xaa27ba26
  86         .long 0xa5e7b22e
  87         .long 0xa1cfa9fe
  88         .long 0x9ddfa1a0
  89         .long 0x9a0f990c
  90         .long 0x9667905d
  91         .long 0x92df878a
  92         .long 0x8f6f7e84
  93         .long 0x8c27757e
  94         .long 0x88f76c54
  95         .long 0x85df630c
  96         .long 0x82e759c5
  97         .long 0x8007506d
  98         .long 0x7d3f470a
  99         .long 0x7a8f3da2
 100         .long 0x77ef341e
 101         .long 0x756f2abe
 102         .long 0x72f7212d
 103         .long 0x709717ad
 104         .long 0x6e4f0e44
 105         .long 0x6c1704d6
 106         .long 0x69e6fb44
 107         .long 0x67cef1d7
 108         .long 0x65c6e872
 109         .long 0x63cedf18
 110         .long 0x61e6d5cd
 111         .long 0x6006cc6d
 112         .long 0x5e36c323
 113         .long 0x5c76b9f3
 114         .long 0x5abeb0b7
 115         .long 0x5916a79b
 116         .long 0x57769e77
 117         .long 0x55de954d
 118         .long 0x54568c4e
 119         .long 0x52d6834d
 120         .long 0x51667a7f
 121         .long 0x4ffe71b5
 122         .long 0x4e9e68f1
 123         .long 0x4d466035
 124         .long 0x4bf65784
 125         .long 0x4aae4ede
 126         .long 0x496e4646
 127         .long 0x48363dbd
 128         .long 0x47063547
 129         .long 0x45de2ce5
 130         .long 0x44be2498
 131         .long 0x43a61c64
 132         .long 0x4296144a
 133         .long 0x41860c0e
 134         .long 0x407e03ee
 135 __divsf3_support: /* This label makes debugger output saner.  */
 136 .Ldenorm_fp1: ; divisor exponent field is zero: the divisor is 0 or denormal
 137         bclr r6,r6,31 ; remove the implicit one that __divsf3 set in r6
 138         norm.f r12,r6 ; flag for x/0 -> Inf check
 139         add r6,r6,r6
 140         rsub r5,r12,16 ; r5 = 16 - r12
 141         ror r5,r1,r5 ; rotate the divisor so that the six bits below its leading one land in bits 0..5 (given that norm counts redundant sign bits)
 142         asl r6,r6,r12 ; normalize the divisor mantissa (leading one moved to the top)
 143         bmsk r5,r5,5 ; keep the 6-bit table index
 144         ld.as r5,[r3,r5] ; r5 = table entry; r3 is the table base set up in __divsf3
 145         add r4,r6,r6 ; r4 = normalized mantissa shifted by one more bit (multiplier operand)
 146         ; load latency
 147         MPYHU r7,r5,r4
 148         bic.ne.f 0, \
 149                 0x60000000,r0 ; large number / denorm -> Inf
 150         beq_s .Linf_NaN ; taken for a zero divisor (Z left from norm.f) or a dividend with bits 29 and 30 both set
 151         asl r5,r5,13
 152         ; wb stall
 153         ; slow track
 154         sub r7,r5,r7 ; first inverse estimate, same computation as in __divsf3
 155         MPYHU r8,r7,r6
 156         asl_s r12,r12,23 ; move the normalization shift count to the exponent position
 157         and.f r2,r0,r9 ; r2 = dividend exponent field; Z if it is zero
 158         add r2,r2,r12 ; compensate the exponent for the divisor normalization
 159         asl r12,r0,8 ; r12 = dividend shifted so that its mantissa occupies bits 8..30
 160         ; wb stall
 161         bne.d .Lpast_denorm_fp1 ; dividend exponent non-zero: rejoin the main path
 162 .Ldenorm_fp0: ; dividend exponent field is zero: the dividend is 0 or denormal
 163         MPYHU r8,r8,r7 ; also serves as the delay slot of the bne.d above
 164         bclr r12,r12,31
 165         norm.f r3,r12 ; flag for 0/x -> 0 check
 166         bic.ne.f 0,0x60000000,r1 ; denorm/large number -> 0
 167         beq_s .Lret0 ; taken for a zero dividend (Z left from norm.f) or a divisor with bits 29 and 30 both set
 168         asl_s r12,r12,r3 ; normalize the dividend mantissa
 169         asl_s r3,r3,23 ; move the shift count to the exponent position
 170         add_s r12,r12,r12
 171         add r11,r11,r3 ; fold the shift into r11, which is subtracted from the result exponent later
 172         b.d .Lpast_denorm_fp0
 173         mov_s r3,r12 ; delay slot: r3 = normalized dividend mantissa
 174         .balign 4 ; special-case exits follow; each returns through blink with the result in r0
 175 .Linf_NaN: ; reached from .Ldenorm_fp1: zero divisor, or large dividend with a denormal divisor
 176         bclr.f 0,r0,31 ; 0/0 -> NaN
 177         xor_s r0,r0,r1
 178         bmsk r1,r0,30
 179         bic_s r0,r0,r1 ; r0 = sign bit of the result only
 180         sub.eq r0,r0,1 ; zero dividend: subtract one so that the mantissa bits become non-zero (NaN)
 181         j_s.d [blink]
 182         or r0,r0,r9 ; delay slot: or in the all-ones exponent -> +-Inf, or NaN after the sub.eq. NOTE(review): a NaN dividend also reaches this label (bits 29 and 30 set) and appears to return Inf - confirm whether that is intended
 183 .Lret0: ; reached from .Ldenorm_fp0: return a zero carrying the sign of the quotient
 184         xor_s r0,r0,r1
 185         bmsk r1,r0,30
 186         j_s.d [blink]
 187         bic_s r0,r0,r1 ; delay slot: keep only the sign bit
 188 .Linf_nan_fp1: ; divisor exponent field is all ones: the divisor is Inf or NaN
 189         lsr_s r0,r0,31
 190         bmsk.f 0,r1,22 ; Z if the divisor mantissa is zero (Inf), NE for NaN
 191         asl_s r0,r0,31 ; r0 = sign bit of the dividend only
 192         bne_s 0f ; divisor mantissa non-zero: x/nan -> nan
 193         brne r2,r9,.Lsigned0 ; finite x/inf -> 0; inf/inf and nan/inf fall through -> nan
 194 0:      j_s.d [blink]
 195         mov r0,-1 ; delay slot: all-ones bit pattern is returned as the NaN
 196 .Lsigned0:
 197 .Linf_nan_fp0: ; also entered from __divsf3 with r0 still holding an Inf or NaN dividend
 198         tst_s r1,r1 ; N = sign of the divisor
 199         j_s.d [blink]
 200         bxor.mi r0,r0,31 ; delay slot: flip the sign of r0 when the divisor is negative
 201         .balign 4
 202         .global __divsf3
 203 /* N.B. the spacing between divtab and the sub3 to get its address must
 204    be a multiple of 8.
        Single-precision divide entry point.  As the code below uses them:
        r0 holds the dividend bits and r1 the divisor bits on entry, and the
        quotient bits are left in r0 when returning through blink.
        r2-r9, r11 and r12 are written as scratch (the MPYHU macro comes from
        arc-ieee-754.h and may use more - confirm there).
        Zero, denormal, Inf and NaN operands branch out of line to the labels
        between the table and this entry point.  */
 205 __divsf3:
 206         lsr r2,r1,17
 207         sub3 r3,pcl,55;(.-.Ldivtab) >> 3
 208         bmsk_s r2,r2,5 ; r2 = divisor bits 17..22 (top six mantissa bits) = table index
 209         ld.as r5,[r3,r2] ; r5 = table entry for this mantissa range
 210         asl r4,r1,9 ; r4 = divisor mantissa fraction, left-aligned
 211         ld.as r9,[pcl,-114]; [pcl,(-((.-.L7f800000) >> 2))] ; 0x7f800000
 212         MPYHU r7,r5,r4
 213         asl r6,r1,8
 214         and.f r11,r1,r9 ; r11 = divisor exponent field; Z if it is zero
 215         bset r6,r6,31 ; r6 = divisor mantissa with the implicit one at bit 31
 216         asl r5,r5,13
 217         ; wb stall
 218         beq .Ldenorm_fp1 ; divisor is zero or denormal
 219         sub r7,r5,r7 ; r7 = initial estimate of the inverse
 220         MPYHU r8,r7,r6
 221         breq.d r11,r9,.Linf_nan_fp1 ; divisor is Inf or NaN
 222         and.f r2,r0,r9 ; delay slot: r2 = dividend exponent field; Z if it is zero
 223         beq.d .Ldenorm_fp0 ; dividend is zero or denormal
 224         asl r12,r0,8 ; delay slot: r12 = dividend shifted so that its mantissa occupies bits 8..30
 225         ; wb stall
 226         breq r2,r9,.Linf_nan_fp0 ; dividend is Inf or NaN
 227         MPYHU r8,r8,r7
 228 .Lpast_denorm_fp1:
 229         bset r3,r12,31 ; r3 = dividend mantissa with the implicit one at bit 31
 230 .Lpast_denorm_fp0:
 231         cmp_s r3,r6 ; carry set when dividend mantissa < divisor mantissa
 232         lsr.cc r3,r3,1 ; otherwise halve the dividend mantissa
 233         add_s r2,r2, /* wait for immediate */ \
 234         /* wb stall */ \
 235                 0x3f000000
 236         sub r7,r7,r8 ; u1.31 inverse, about 30 bit
 237         MPYHU r3,r3,r7 ; r3 = high word of dividend mantissa * inverse: the quotient mantissa
 238         sbc r2,r2,r11 ; result exponent: subtract the divisor exponent and the carry left by cmp_s
 239         xor.f 0,r0,r1 ; N = sign of the quotient
 240         and r0,r2,r9 ; r0 = exponent field of the result
 241         bxor.mi r0,r0,31 ; set the sign bit for a negative quotient
 242         brhs r2, /* wb stall / wait for immediate */ \
 243                 0x7f000000,.Linf_denorm
 244 .Lpast_denorm:
 245         add_s r3,r3,0x22 ; round to nearest or higher
 246         tst r3,0x3c ; check if rounding was unsafe
 247         lsr r3,r3,6 ; drop the low six bits of the quotient mantissa
 248         jne.d [blink] ; return if rounding was safe.
 249         add_s r0,r0,r3 ; delay slot: add the mantissa to sign and exponent
 250         /* work out exact rounding if we fall through here.  */
 251         /* We know that the exact result cannot be represented in single
 252            precision.  Find the mid-point between the two nearest
 253            representable values, multiply with the divisor, and check if
 254            the result is larger than the dividend.  */
 255         add_s r3,r3,r3
 256         sub_s r3,r3,1 ; r3 = 2*r3 - 1: the mid-point, one bit finer than the result
 257         mpyu r3,r3,r6 ; low word of mid-point * divisor mantissa
 258         asr.f 0,r0,1 ; for round-to-even in case this is a denorm. NOTE(review): no instruction reads flags before the sub.f below rewrites them - confirm how this is consumed
 259         rsub r2,r9,25 ; r2 = 25 - r9; r9 is 0x7f800000 here, or the shift count when coming from .Ldenorm
 260         asl_s r12,r12,r2 ; align the dividend mantissa with the product (presumably only the low bits of r2 count as shift amount - confirm)
 261         ; wb stall
 262         ; slow track
 263         sub.f 0,r12,r3 ; N set when dividend - mid-point * divisor is negative
 264         j_s.d [blink]
 265         sub.mi r0,r0,1 ; delay slot: the rounded-up result was too large, step down by one
 266 /* For denormal results, it is possible that an exact result needs
 267    rounding, and thus the round-to-even rule has to come into play.
        This tail is entered from __divsf3 when the result exponent word r2
        is >= 0x7f000000 as an unsigned value: either too large (-> Inf) or
        wrapped below zero (-> denormal or zero).  */
 268 .Linf_denorm:
 269         brlo r2,0xc0000000,.Linf ; below 0xc0000000: exponent overflow; otherwise treated as negative
 270 .Ldenorm:
 271         asr_s r2,r2,23 ; r2 = signed exponent value
 272         bic r0,r0,r9 ; clear the exponent field of the result, keep the sign
 273         neg r9,r2 ; r9 = right-shift count for the mantissa
 274         brlo.d r9,25,.Lpast_denorm ; shift count below 25: round the shifted mantissa in the common code
 275         lsr r3,r3,r9 ; delay slot: denormalize the quotient mantissa
 276         /* Fall through: return +- 0 */
 277         j_s [blink]
 278 .Linf:
 279         j_s.d [blink]
 280         or r0,r0,r9 ; delay slot: exponent field all ones -> +-Inf
 281         ENDFUNC(__divsf3)