1 /* Copyright (C) 2008-2012 Free Software Foundation, Inc.
2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3 on behalf of Synopsys Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
27 to calculate a := b/x as b*y, with y := 1/x:
28 - x is in the range [1..2)
29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30 Precision is higher for polynoms used to evaluate input with larger
32 - Do one newton-raphson iteration step to double the precision,
33 then multiply this with the divisor
34 -> more time to decide if dividend is subnormal
35 - the worst error propagation is on the side of the value range
36 with the least initial defect, thus giving us about 30 bits precision.
37 The truncation error for the either is less than 1 + x/2 ulp.
38 A 31 bit inverse can be simply calculated by using x with implicit 1
39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
40 with the bare fraction part of x, then add in y0^2 for the implicit
42 - If calculating a 31 bit inverse, the systematic error is less than
43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44 - If we calculate our seed with a 32 bit fraction, we can archive a
45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46 only need to take the step to calculate the 2nd stage rest and
47 rounding adjust 1/32th of the time. However, if we use a 20 bit
48 fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49 thus for a simple add / tst check, we need to do the 2nd stage
50 rest calculation/ rounding adjust 1/16th of the time.
51 (1): The inexactness of the 32 bit inverse contributes an error in the
52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
53 rest contributes an error < +1/x ulp/128 . In the interval [1,2),
55 (2): Unless proven otherwise. I have not actually looked for an
56 example where -2 ulp/128 is exceeded, and my calculations indicate
57 that the excess, if existent, is less than -1/512 ulp.
59 #include "arc-ieee-754.h"
61 /* N.B. fp-bit.c does double rounding on denormal numbers. */
87 bic.f 0,0x7ff80000,r12 ; both NaN -> OK
91 #define __divdf3 __divdf3_asm
95 __divdf3_support: /* This label makes debugger output saner. */
99 0x43500000,.Linf_NaN ; large number / denorm -> Inf
104 norm.f r11,r12 ; flag for x/0 -> Inf check
117 bxor.mi DBL1H,DBL1H,31
124 b.d .Lpast_denorm_dbl1
134 norm.f r11,r12 ; flag for 0/x -> 0 check
136 0x43500000, .Lret0_NaN ; denorm/large number -> 0
150 b.d .Lpast_denorm_dbl0
154 tst_s DBL0L,DBL0L ; 0/0 -> NaN
155 xor_s DBL1H,DBL1H,DBL0H
156 bclr.eq.f DBL0H,DBL0H,31
158 xor_s DBL0H,DBL0H,DBL1H
165 xor_s DBL1H,DBL1H,DBL0H
169 xor_s DBL0H,DBL0H,DBL1H
172 .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
175 sub_s.ne DBL0L,DBL0L,DBL0L
177 add_s DBL0H,DBL1H,DBL0L
179 bxor.mi DBL0H,DBL0H,31
183 bxor.mi DBL0H,DBL0H,31
186 /* N.B. the spacing between divtab and the add3 to get its address must
187 be a multiple of 8. */
192 add3 r10,pcl,59 ; (.Ldivtab-.) >> 3
194 ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
198 asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline.
201 .Lpast_denorm_dbl1: ; wb stall
204 breq.d r6,0,.Ldenorm_dbl0
208 .Lpast_denorm_dbl0: ; wb stall
217 sub r4,r4,r11 ; u1.31 inverse, about 30 bit
218 mpyhu r11,r5,r4 ; result fraction highpart
219 breq r7,r9,.Linf_nan_dbl1
221 add r5,r6, /* wait for immediate / XMAC wb stall */ \
223 ; wb stall (not for XMAC)
224 breq r6,r9,.Linf_nan_dbl0
225 mpyu r12,r11,r8 ; u-28.31
226 asl_s DBL1L,DBL1L,9 ; u-29.23:9
228 ; resource conflict (not for XMAC)
229 mpyhu r5,r11,DBL1L ; u-28.23:9
230 add.cs DBL0L,DBL0L,DBL0L
231 asl_s DBL0L,DBL0L,6 ; u-26.25:7
233 sub_l DBL0L,DBL0L,r12
234 ; wb stall (before 'and' for XMAC)
236 sub r5,DBL0L,r5 ; rest msw ; u-26.31:0
237 mpyh r12,r5,r4 ; result fraction lowpart
240 add_s DBL0H,DBL0H,r7 ; (XMAC wb stall)
241 bxor.mi DBL0H,DBL0H,31
242 brhs r6, /* wb stall / wait for immediate */ \
243 0x7fe00000,.Linf_denorm
251 /* work out exact rounding if we fall through here. */
252 /* We know that the exact result cannot be represented in double
253 precision. Find the mid-point between the two nearest
254 representable values, multiply with the divisor, and check if
255 the result is larger than the dividend. Since we want to know
256 only the sign bit, it is sufficient to calculate only the
257 highpart of the lower 64 bits. */
259 asl r12,r9,2 ; u-22.30:2
260 mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10
263 ; resource conflict (not for XMAC)
264 mpyhu r7,r12,DBL1L ; u-51.32
265 asl r5,r5,25 ; s-51.7:25
266 lsr r10,r10,7 ; u-51.30:2
267 ; resource conflict (not for XMAC)
268 ; resource conflict (not for XMAC)
269 mpyu r9,r12,r8 ; u-51.31:1
271 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
272 bset r7,r7,0 ; make sure that the result is not zero, and that
273 ; wb stall (one earlier for XMAC)
274 sub r5,r5,r7 ; a highpart zero appears negative
275 sub.f r5,r5,r9 ; rest msw
276 add.pl.f DBL0L,DBL0L,1
282 brlo r6,0xc0000000,.Linf
288 bxor.mi DBL0H,DBL0H,31
304 or.pnz DBL0H,DBL0H,r7
307 add.f DBL0L,r10,DBL0L
308 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
310 add.pnz.f DBL0L,DBL0L,1
311 add.cs.f DBL0H,DBL0H,1
313 /* Calculation so far was not conclusive; calculate further rest. */
314 mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11
316 asl r5,r5,25 ; s-51.7:25
317 ; resource conflict (not for XMAC)
318 mpyu DBL1H,r12,r8 ; u-51.31:1
319 and r9,DBL0L,1 ; tie-breaker: round to even
320 lsr r11,r11,7 ; u-51.30:2
321 ; resource conflict (not for XMAC)
322 mpyhu r8,r12,DBL1L ; u-51.32
323 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
324 add_s DBL1H,DBL1H,r11
325 ; resource conflict (not for XMAC)
326 ; resource conflict (not for XMAC)
327 mpyu r12,r12,DBL1L ; u-83.30:2
328 sub DBL1H,DBL1H,r5 ; -rest msw
329 add_s DBL1H,DBL1H,r8 ; -rest msw
330 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
331 ; wb stall (XMAC: Before add.f)
334 add.cs.f DBL0L,DBL0L,1
346 bxor.mi DBL0H,DBL0H,31