1 /* Copyright (C) 2008-2016 Free Software Foundation, Inc.
2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3 on behalf of Synopsys Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
27 to calculate a := b/x as b*y, with y := 1/x:
28 - x is in the range [1..2)
29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30 Precision is higher for polynoms used to evaluate input with larger
32 - Do one newton-raphson iteration step to double the precision,
33 then multiply this with the divisor
34 -> more time to decide if dividend is subnormal
35 - the worst error propagation is on the side of the value range
36 with the least initial defect, thus giving us about 30 bits precision.
37 The truncation error for the either is less than 1 + x/2 ulp.
38 A 31 bit inverse can be simply calculated by using x with implicit 1
39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
40 with the bare fraction part of x, then add in y0^2 for the implicit
42 - If calculating a 31 bit inverse, the systematic error is less than
43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44 - If we calculate our seed with a 32 bit fraction, we can archive a
45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46 only need to take the step to calculate the 2nd stage rest and
47 rounding adjust 1/32th of the time. However, if we use a 20 bit
48 fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49 thus for a simple add / tst check, we need to do the 2nd stage
50 rest calculation/ rounding adjust 1/16th of the time.
51 (1): The inexactness of the 32 bit inverse contributes an error in the
52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
53 rest contributes an error < +1/x ulp/128 . In the interval [1,2),
55 (2): Unless proven otherwise. I have not actually looked for an
56 example where -2 ulp/128 is exceeded, and my calculations indicate
57 that the excess, if existent, is less than -1/512 ulp.
59 #include "arc-ieee-754.h"
61 /* N.B. fp-bit.c does double rounding on denormal numbers. */
87 bic.f 0,0x7ff80000,r12 ; both NaN -> OK
91 #define __divdf3 __divdf3_asm
95 __divdf3_support: /* This label makes debugger output saner. */
99 0x43500000,.Linf_NaN ; large number / denorm -> Inf
104 norm.f r11,r12 ; flag for x/0 -> Inf check
117 bxor.mi DBL1H,DBL1H,31
124 b.d .Lpast_denorm_dbl1
134 norm.f r11,r12 ; flag for 0/x -> 0 check
136 0x43500000, .Lret0_NaN ; denorm/large number -> 0
150 b.d .Lpast_denorm_dbl0
154 tst_s DBL0L,DBL0L ; 0/0 -> NaN
155 xor_s DBL1H,DBL1H,DBL0H
156 bclr.eq.f DBL0H,DBL0H,31
158 xor_s DBL0H,DBL0H,DBL1H
165 xor_s DBL1H,DBL1H,DBL0H
169 xor_s DBL0H,DBL0H,DBL1H
172 .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
175 sub_s.ne DBL0L,DBL0L,DBL0L
177 add_s DBL0H,DBL1H,DBL0L
179 bxor.mi DBL0H,DBL0H,31
183 bxor.mi DBL0H,DBL0H,31
186 /* N.B. the spacing between divtab and the add3 to get its address must
187 be a multiple of 8. */
193 add3 r10,pcl,60 ; (.Ldivtab-.) >> 3
195 add3 r10,pcl,59 ; (.Ldivtab-.) >> 3
199 ld.as r9,[pcl,182]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
201 ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
206 asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline.
209 .Lpast_denorm_dbl1: ; wb stall
212 breq.d r6,0,.Ldenorm_dbl0
216 .Lpast_denorm_dbl0: ; wb stall
225 sub r4,r4,r11 ; u1.31 inverse, about 30 bit
226 MPYHU r11,r5,r4 ; result fraction highpart
227 breq r7,r9,.Linf_nan_dbl1
229 add r5,r6, /* wait for immediate / XMAC wb stall */ \
231 ; wb stall (not for XMAC)
232 breq r6,r9,.Linf_nan_dbl0
233 mpyu r12,r11,r8 ; u-28.31
234 asl_s DBL1L,DBL1L,9 ; u-29.23:9
236 ; resource conflict (not for XMAC)
237 MPYHU r5,r11,DBL1L ; u-28.23:9
238 add.cs DBL0L,DBL0L,DBL0L
239 asl_s DBL0L,DBL0L,6 ; u-26.25:7
241 sub_l DBL0L,DBL0L,r12
242 ; wb stall (before 'and' for XMAC)
244 sub r5,DBL0L,r5 ; rest msw ; u-26.31:0
245 MPYH r12,r5,r4 ; result fraction lowpart
248 add_s DBL0H,DBL0H,r7 ; (XMAC wb stall)
249 bxor.mi DBL0H,DBL0H,31
250 brhs r6, /* wb stall / wait for immediate */ \
251 0x7fe00000,.Linf_denorm
259 /* work out exact rounding if we fall through here. */
260 /* We know that the exact result cannot be represented in double
261 precision. Find the mid-point between the two nearest
262 representable values, multiply with the divisor, and check if
263 the result is larger than the dividend. Since we want to know
264 only the sign bit, it is sufficient to calculate only the
265 highpart of the lower 64 bits. */
267 asl r12,r9,2 ; u-22.30:2
268 mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10
271 ; resource conflict (not for XMAC)
272 MPYHU r7,r12,DBL1L ; u-51.32
273 asl r5,r5,25 ; s-51.7:25
274 lsr r10,r10,7 ; u-51.30:2
275 ; resource conflict (not for XMAC)
276 ; resource conflict (not for XMAC)
277 mpyu r9,r12,r8 ; u-51.31:1
279 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
280 bset r7,r7,0 ; make sure that the result is not zero, and that
281 ; wb stall (one earlier for XMAC)
282 sub r5,r5,r7 ; a highpart zero appears negative
283 sub.f r5,r5,r9 ; rest msw
284 add.pl.f DBL0L,DBL0L,1
290 brlo r6,0xc0000000,.Linf
296 bxor.mi DBL0H,DBL0H,31
323 or.pnz DBL0H,DBL0H,r7
326 add.f DBL0L,r10,DBL0L
327 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
329 add.pnz.f DBL0L,DBL0L,1
330 add.cs.f DBL0H,DBL0H,1
332 /* Calculation so far was not conclusive; calculate further rest. */
333 mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11
335 asl r5,r5,25 ; s-51.7:25
336 ; resource conflict (not for XMAC)
337 mpyu DBL1H,r12,r8 ; u-51.31:1
338 and r9,DBL0L,1 ; tie-breaker: round to even
339 lsr r11,r11,7 ; u-51.30:2
340 ; resource conflict (not for XMAC)
341 MPYHU r8,r12,DBL1L ; u-51.32
342 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
343 add_s DBL1H,DBL1H,r11
344 ; resource conflict (not for XMAC)
345 ; resource conflict (not for XMAC)
346 mpyu r12,r12,DBL1L ; u-83.30:2
347 sub DBL1H,DBL1H,r5 ; -rest msw
348 add_s DBL1H,DBL1H,r8 ; -rest msw
349 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
350 ; wb stall (XMAC: Before add.f)
353 add.cs.f DBL0L,DBL0L,1
365 bxor.mi DBL0H,DBL0H,31