Commit | Line | Data |
---|---|---|
fbd26352 | 1 | /* Copyright (C) 2008-2019 Free Software Foundation, Inc. |
8eaaaea3 | 2 | Contributor: Joern Rennecke <joern.rennecke@embecosm.com> |
3 | on behalf of Synopsys Inc. | |
4 | ||
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it under | |
8 | the terms of the GNU General Public License as published by the Free | |
9 | Software Foundation; either version 3, or (at your option) any later | |
10 | version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
13 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 | for more details. | |
16 | ||
17 | Under Section 7 of GPL version 3, you are granted additional | |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
25 | ||
26 | /* | |
27 | to calculate a := b/x as b*y, with y := 1/x: | |
28 | - x is in the range [1..2) | |
29 | - calculate 15..18 bit inverse y0 using a table of approximating polynomials. |
30 | Precision is higher for polynomials used to evaluate input with larger | |
31 | value. | |
32 | - Do one Newton-Raphson iteration step to double the precision, | |
33 | then multiply this with the divisor | |
34 | -> more time to decide if dividend is subnormal | |
35 | - the worst error propagation is on the side of the value range | |
36 | with the least initial defect, thus giving us about 30 bits precision. | |
37 | The truncation error for either is less than 1 + x/2 ulp. | |
38 | A 31 bit inverse can be simply calculated by using x with implicit 1 | |
39 | and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 | |
40 | with the bare fraction part of x, then add in y0^2 for the implicit | |
41 | 1 of x. | |
42 | - If calculating a 31 bit inverse, the systematic error is less than | |
43 | -1 ulp; likewise, for 32 bit, it is less than -2 ulp. | |
44 | - If we calculate our seed with a 32 bit fraction, we can achieve a | |
45 | tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we | |
46 | only need to take the step to calculate the 2nd stage rest and | |
47 | rounding adjust 1/32th of the time. However, if we use a 20 bit | |
48 | fraction for the seed, the negative error can exceed -2 ulp/128, (2) | |
49 | thus for a simple add / tst check, we need to do the 2nd stage | |
50 | rest calculation/ rounding adjust 1/16th of the time. | |
51 | (1): The inexactness of the 32 bit inverse contributes an error in the | |
52 | range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the | |
53 | rest contributes an error < +1/x ulp/128 . In the interval [1,2), | |
54 | x/2 + 1/x <= 1.5 . | |
55 | (2): Unless proven otherwise. I have not actually looked for an | |
56 | example where -2 ulp/128 is exceeded, and my calculations indicate | |
57 | that the excess, if existent, is less than -1/512 ulp. | |
58 | */ | |
59 | #include "arc-ieee-754.h" | |
60 | ||
61 | /* N.B. fp-bit.c does double rounding on denormal numbers. */ | |
62 | #if 0 /* DEBUG */ | |
63 | .global __divdf3 | |
64 | FUNC(__divdf3) | |
65 | .balign 4 | |
66 | __divdf3: | |
67 | push_s blink | |
68 | push_s r2 | |
69 | push_s r3 | |
70 | push_s r0 | |
71 | bl.d __divdf3_c | |
72 | push_s r1 | |
73 | ld_s r2,[sp,12] | |
74 | ld_s r3,[sp,8] | |
75 | st_s r0,[sp,12] | |
76 | st_s r1,[sp,8] | |
77 | pop_s r1 | |
78 | bl.d __divdf3_asm | |
79 | pop_s r0 | |
80 | pop_s r3 | |
81 | pop_s r2 | |
82 | pop_s blink | |
83 | cmp r0,r2 | |
84 | cmp.eq r1,r3 | |
85 | jeq_s [blink] | |
86 | and r12,DBL0H,DBL1H | |
87 | bic.f 0,0x7ff80000,r12 ; both NaN -> OK | |
88 | jeq_s [blink] | |
89 | bl abort | |
90 | ENDFUNC(__divdf3) | |
91 | #define __divdf3 __divdf3_asm | |
92 | #endif /* DEBUG */ | |
93 | ||
94 | FUNC(__divdf3) | |
95 | __divdf3_support: /* This label makes debugger output saner. */ | |
96 | .balign 4 | |
97 | .Ldenorm_dbl1: | |
98 | brge r6, \ | |
99 | 0x43500000,.Linf_NaN ; large number / denorm -> Inf | |
100 | bmsk.f r12,DBL1H,19 | |
101 | mov.eq r12,DBL1L | |
102 | mov.eq DBL1L,0 | |
103 | sub.eq r7,r7,32 | |
104 | norm.f r11,r12 ; flag for x/0 -> Inf check | |
105 | beq_s .Linf_NaN | |
106 | mov.mi r11,0 | |
107 | add.pl r11,r11,1 | |
108 | add_s r12,r12,r12 | |
109 | asl r8,r12,r11 | |
110 | rsub r12,r11,31 | |
111 | lsr r12,DBL1L,r12 | |
112 | tst_s DBL1H,DBL1H | |
113 | or r8,r8,r12 | |
114 | lsr r4,r8,26 | |
115 | lsr DBL1H,r8,12 | |
116 | ld.as r4,[r10,r4] | |
117 | bxor.mi DBL1H,DBL1H,31 | |
118 | sub r11,r11,11 | |
119 | asl DBL1L,DBL1L,r11 | |
120 | sub r11,r11,1 | |
0c4d7986 | 121 | MPYHU r5,r4,r8 |
8eaaaea3 | 122 | sub r7,r7,r11 |
123 | asl r4,r4,12 | |
124 | b.d .Lpast_denorm_dbl1 | |
125 | asl r7,r7,20 | |
126 | ; wb stall | |
127 | ||
128 | .balign 4 | |
129 | .Ldenorm_dbl0: | |
130 | bmsk.f r12,DBL0H,19 | |
131 | ; wb stall | |
132 | mov.eq r12,DBL0L | |
133 | sub.eq r6,r6,32 | |
134 | norm.f r11,r12 ; flag for 0/x -> 0 check | |
135 | brge r7, \ | |
136 | 0x43500000, .Lret0_NaN ; denorm/large number -> 0 | |
137 | beq_s .Lret0_NaN | |
138 | mov.mi r11,0 | |
139 | add.pl r11,r11,1 | |
140 | asl r12,r12,r11 | |
141 | sub r6,r6,r11 | |
142 | add.f 0,r6,31 | |
143 | lsr r10,DBL0L,r6 | |
144 | mov.mi r10,0 | |
145 | add r6,r6,11+32 | |
146 | neg.f r11,r6 | |
147 | asl DBL0L,DBL0L,r11 | |
148 | mov.pl DBL0L,0 | |
149 | sub r6,r6,32-1 | |
150 | b.d .Lpast_denorm_dbl0 | |
151 | asl r6,r6,20 | |
152 | ||
153 | .Linf_NaN: | |
154 | tst_s DBL0L,DBL0L ; 0/0 -> NaN | |
155 | xor_s DBL1H,DBL1H,DBL0H | |
156 | bclr.eq.f DBL0H,DBL0H,31 | |
157 | bmsk DBL0H,DBL1H,30 | |
158 | xor_s DBL0H,DBL0H,DBL1H | |
159 | sub.eq DBL0H,DBL0H,1 | |
160 | mov_s DBL0L,0 | |
161 | j_s.d [blink] | |
162 | or DBL0H,DBL0H,r9 | |
163 | .balign 4 | |
164 | .Lret0_NaN: | |
165 | xor_s DBL1H,DBL1H,DBL0H | |
166 | cmp_s r12,r9 | |
167 | mov_s DBL0L,0 | |
168 | bmsk DBL0H,DBL1H,30 | |
169 | xor_s DBL0H,DBL0H,DBL1H | |
170 | j_s.d [blink] | |
171 | sub.hi DBL0H,DBL0H,1 | |
172 | .Linf_nan_dbl1: ; Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN | |
173 | not_s DBL0L,DBL1H | |
174 | cmp r6,r9 | |
175 | sub_s.ne DBL0L,DBL0L,DBL0L | |
176 | tst_s DBL0H,DBL0H | |
177 | add_s DBL0H,DBL1H,DBL0L | |
178 | j_s.d [blink] | |
179 | bxor.mi DBL0H,DBL0H,31 | |
180 | .Linf_nan_dbl0: | |
181 | tst_s DBL1H,DBL1H | |
182 | j_s.d [blink] | |
183 | bxor.mi DBL0H,DBL0H,31 | |
184 | .balign 4 | |
185 | .global __divdf3 | |
186 | /* N.B. the spacing between divtab and the add3 to get its address must | |
187 | be a multiple of 8. */ | |
188 | __divdf3: | |
189 | asl r8,DBL1H,12 | |
190 | lsr r12,DBL1L,20 | |
191 | lsr r4,r8,26 | |
e69e67d4 | 192 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 193 | add3 r10,pcl,60 ; (.Ldivtab-.) >> 3 |
194 | #else | |
8eaaaea3 | 195 | add3 r10,pcl,59 ; (.Ldivtab-.) >> 3 |
0c4d7986 | 196 | #endif |
8eaaaea3 | 197 | ld.as r4,[r10,r4] |
e69e67d4 | 198 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 199 | ld.as r9,[pcl,182]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 |
200 | #else | |
8eaaaea3 | 201 | ld.as r9,[pcl,180]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 |
0c4d7986 | 202 | #endif |
8eaaaea3 | 203 | or r8,r8,r12 |
0c4d7986 | 204 | MPYHU r5,r4,r8 |
8eaaaea3 | 205 | and.f r7,DBL1H,r9 |
206 | asl r4,r4,12 ; having the asl here is a concession to the XMAC pipeline. | |
207 | beq.d .Ldenorm_dbl1 | |
208 | and r6,DBL0H,r9 | |
209 | .Lpast_denorm_dbl1: ; wb stall | |
210 | sub r4,r4,r5 | |
0c4d7986 | 211 | MPYHU r5,r4,r4 |
8eaaaea3 | 212 | breq.d r6,0,.Ldenorm_dbl0 |
213 | lsr r8,r8,1 | |
214 | asl r12,DBL0H,11 | |
215 | lsr r10,DBL0L,21 | |
216 | .Lpast_denorm_dbl0: ; wb stall | |
217 | bset r8,r8,31 | |
0c4d7986 | 218 | MPYHU r11,r5,r8 |
8eaaaea3 | 219 | add_s r12,r12,r10 |
220 | bset r5,r12,31 | |
221 | cmp r5,r8 | |
222 | cmp.eq DBL0L,DBL1L | |
223 | ; wb stall | |
224 | lsr.cc r5,r5,1 | |
225 | sub r4,r4,r11 ; u1.31 inverse, about 30 bit | |
0c4d7986 | 226 | MPYHU r11,r5,r4 ; result fraction highpart |
8eaaaea3 | 227 | breq r7,r9,.Linf_nan_dbl1 |
228 | lsr r8,r8,2 ; u3.29 | |
229 | add r5,r6, /* wait for immediate / XMAC wb stall */ \ | |
230 | 0x3fe00000 | |
231 | ; wb stall (not for XMAC) | |
232 | breq r6,r9,.Linf_nan_dbl0 | |
233 | mpyu r12,r11,r8 ; u-28.31 | |
234 | asl_s DBL1L,DBL1L,9 ; u-29.23:9 | |
235 | sbc r6,r5,r7 | |
236 | ; resource conflict (not for XMAC) | |
0c4d7986 | 237 | MPYHU r5,r11,DBL1L ; u-28.23:9 |
8eaaaea3 | 238 | add.cs DBL0L,DBL0L,DBL0L |
239 | asl_s DBL0L,DBL0L,6 ; u-26.25:7 | |
240 | asl r10,r11,23 | |
241 | sub_l DBL0L,DBL0L,r12 | |
242 | ; wb stall (before 'and' for XMAC) | |
243 | lsr r7,r11,9 | |
244 | sub r5,DBL0L,r5 ; rest msw ; u-26.31:0 | |
0c4d7986 | 245 | MPYH r12,r5,r4 ; result fraction lowpart |
8eaaaea3 | 246 | xor.f 0,DBL0H,DBL1H |
247 | and DBL0H,r6,r9 | |
248 | add_s DBL0H,DBL0H,r7 ; (XMAC wb stall) | |
249 | bxor.mi DBL0H,DBL0H,31 | |
250 | brhs r6, /* wb stall / wait for immediate */ \ | |
251 | 0x7fe00000,.Linf_denorm | |
252 | add.f r12,r12,0x11 | |
253 | asr r9,r12,5 | |
254 | sub.mi DBL0H,DBL0H,1 | |
255 | add.f DBL0L,r9,r10 | |
256 | tst r12,0x1c | |
257 | jne.d [blink] | |
258 | add.cs DBL0H,DBL0H,1 | |
259 | /* work out exact rounding if we fall through here. */ | |
260 | /* We know that the exact result cannot be represented in double | |
261 | precision. Find the mid-point between the two nearest | |
262 | representable values, multiply with the divisor, and check if | |
263 | the result is larger than the dividend. Since we want to know | |
264 | only the sign bit, it is sufficient to calculate only the | |
265 | highpart of the lower 64 bits. */ | |
266 | sub.f DBL0L,DBL0L,1 | |
267 | asl r12,r9,2 ; u-22.30:2 | |
268 | mpyu r10,r11,DBL1L ; rest before considering r12 in r5 : -r10 | |
269 | sub.cs DBL0H,DBL0H,1 | |
270 | sub.f r12,r12,2 | |
271 | ; resource conflict (not for XMAC) | |
0c4d7986 | 272 | MPYHU r7,r12,DBL1L ; u-51.32 |
8eaaaea3 | 273 | asl r5,r5,25 ; s-51.7:25 |
274 | lsr r10,r10,7 ; u-51.30:2 | |
275 | ; resource conflict (not for XMAC) | |
276 | ; resource conflict (not for XMAC) | |
277 | mpyu r9,r12,r8 ; u-51.31:1 | |
278 | sub r5,r5,r10 | |
279 | add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L | |
280 | bset r7,r7,0 ; make sure that the result is not zero, and that | |
281 | ; wb stall (one earlier for XMAC) | |
282 | sub r5,r5,r7 ; a highpart zero appears negative | |
283 | sub.f r5,r5,r9 ; rest msw | |
284 | add.pl.f DBL0L,DBL0L,1 | |
285 | j_s.d [blink] | |
286 | add.eq DBL0H,DBL0H,1 | |
287 | ||
288 | .balign 4 | |
289 | .Linf_denorm: | |
290 | brlo r6,0xc0000000,.Linf | |
291 | .Ldenorm: | |
292 | asr r6,r6,20 | |
293 | neg r9,r6 | |
294 | mov_s DBL0H,0 | |
295 | brhs.d r9,54,.Lret0 | |
296 | bxor.mi DBL0H,DBL0H,31 | |
297 | add_l r12,r12,1 | |
298 | and r12,r12,-4 | |
299 | rsub r7,r6,5 | |
300 | asr r10,r12,28 | |
301 | bmsk r4,r12,27 | |
e69e67d4 | 302 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 303 | min r7, r7, 31 |
304 | asr DBL0L, r4, r7 | |
305 | #else | |
8eaaaea3 | 306 | asrs DBL0L,r4,r7 |
0c4d7986 | 307 | #endif |
8eaaaea3 | 308 | add DBL1H,r11,r10 |
e69e67d4 | 309 | #if defined (__ARCHS__) || defined (__ARCEM__) |
0c4d7986 | 310 | abs.f r10, r4 |
311 | sub.mi r10, r10, 1 | |
312 | #endif | |
8eaaaea3 | 313 | add.f r7,r6,32-5 |
0c4d7986 | 314 | #ifdef __ARC700__ |
8eaaaea3 | 315 | abss r10,r4 |
0c4d7986 | 316 | #endif |
8eaaaea3 | 317 | asl r4,r4,r7 |
318 | mov.mi r4,r10 | |
319 | add.f r10,r6,23 | |
320 | rsub r7,r6,9 | |
321 | lsr r7,DBL1H,r7 | |
322 | asl r10,DBL1H,r10 | |
323 | or.pnz DBL0H,DBL0H,r7 | |
324 | or.mi r4,r4,r10 | |
325 | mov.mi r10,r7 | |
326 | add.f DBL0L,r10,DBL0L | |
327 | add.cs.f DBL0H,DBL0H,1 ; carry clear after this point | |
328 | bxor.f 0,r4,31 | |
329 | add.pnz.f DBL0L,DBL0L,1 | |
330 | add.cs.f DBL0H,DBL0H,1 | |
331 | jne_l [blink] | |
332 | /* Calculation so far was not conclusive; calculate further rest. */ | |
333 | mpyu r11,r11,DBL1L ; rest before considering r12 in r5 : -r11 | |
334 | asr.f r12,r12,3 | |
335 | asl r5,r5,25 ; s-51.7:25 | |
336 | ; resource conflict (not for XMAC) | |
337 | mpyu DBL1H,r12,r8 ; u-51.31:1 | |
338 | and r9,DBL0L,1 ; tie-breaker: round to even | |
339 | lsr r11,r11,7 ; u-51.30:2 | |
340 | ; resource conflict (not for XMAC) | |
0c4d7986 | 341 | MPYHU r8,r12,DBL1L ; u-51.32 |
8eaaaea3 | 342 | sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L |
343 | add_s DBL1H,DBL1H,r11 | |
344 | ; resource conflict (not for XMAC) | |
345 | ; resource conflict (not for XMAC) | |
346 | mpyu r12,r12,DBL1L ; u-83.30:2 | |
347 | sub DBL1H,DBL1H,r5 ; -rest msw | |
348 | add_s DBL1H,DBL1H,r8 ; -rest msw | |
349 | add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( | |
350 | ; wb stall (XMAC: Before add.f) | |
351 | tst_s DBL1H,DBL1H | |
352 | cmp.eq r12,r9 | |
353 | add.cs.f DBL0L,DBL0L,1 | |
354 | j_s.d [blink] | |
355 | add.cs DBL0H,DBL0H,1 | |
356 | ||
357 | .Lret0: | |
358 | /* return +- 0 */ | |
359 | j_s.d [blink] | |
360 | mov_s DBL0L,0 | |
361 | .Linf: | |
362 | mov_s DBL0H,r9 | |
363 | mov_s DBL0L,0 | |
364 | j_s.d [blink] | |
365 | bxor.mi DBL0H,DBL0H,31 | |
366 | ||
367 | .balign 4 | |
368 | .Ldivtab: | |
369 | .long 0xfc0fffe1 | |
370 | .long 0xf46ffdfb | |
371 | .long 0xed1ffa54 | |
372 | .long 0xe61ff515 | |
373 | .long 0xdf7fee75 | |
374 | .long 0xd91fe680 | |
375 | .long 0xd2ffdd52 | |
376 | .long 0xcd1fd30c | |
377 | .long 0xc77fc7cd | |
378 | .long 0xc21fbbb6 | |
379 | .long 0xbcefaec0 | |
380 | .long 0xb7efa100 | |
381 | .long 0xb32f92bf | |
382 | .long 0xae8f83b7 | |
383 | .long 0xaa2f7467 | |
384 | .long 0xa5ef6479 | |
385 | .long 0xa1cf53fa | |
386 | .long 0x9ddf433e | |
387 | .long 0x9a0f3216 | |
388 | .long 0x965f2091 | |
389 | .long 0x92df0f11 | |
390 | .long 0x8f6efd05 | |
391 | .long 0x8c1eeacc | |
392 | .long 0x88eed876 | |
393 | .long 0x85dec615 | |
394 | .long 0x82eeb3b9 | |
395 | .long 0x800ea10b | |
396 | .long 0x7d3e8e0f | |
397 | .long 0x7a8e7b3f | |
398 | .long 0x77ee6836 | |
399 | .long 0x756e5576 | |
400 | .long 0x72fe4293 | |
401 | .long 0x709e2f93 | |
402 | .long 0x6e4e1c7f | |
403 | .long 0x6c0e095e | |
404 | .long 0x69edf6c5 | |
405 | .long 0x67cde3a5 | |
406 | .long 0x65cdd125 | |
407 | .long 0x63cdbe25 | |
408 | .long 0x61ddab3f | |
409 | .long 0x600d991f | |
410 | .long 0x5e3d868c | |
411 | .long 0x5c6d7384 | |
412 | .long 0x5abd615f | |
413 | .long 0x590d4ecd | |
414 | .long 0x576d3c83 | |
415 | .long 0x55dd2a89 | |
416 | .long 0x545d18e9 | |
417 | .long 0x52dd06e9 | |
418 | .long 0x516cf54e | |
419 | .long 0x4ffce356 | |
420 | .long 0x4e9cd1ce | |
421 | .long 0x4d3cbfec | |
422 | .long 0x4becae86 | |
423 | .long 0x4aac9da4 | |
424 | .long 0x496c8c73 | |
425 | .long 0x483c7bd3 | |
426 | .long 0x470c6ae8 | |
427 | .long 0x45dc59af | |
428 | .long 0x44bc4915 | |
429 | .long 0x43ac3924 | |
430 | .long 0x428c27fb | |
431 | .long 0x418c187a | |
432 | .long 0x407c07bd | |
433 | .L7ff00000: | |
434 | .long 0x7ff00000 | |
435 | ENDFUNC(__divdf3) |