]>
Commit | Line | Data |
---|---|---|
83ffe9cd | 1 | /* Copyright (C) 2008-2023 Free Software Foundation, Inc. |
d38a64b4 JR |
2 | Contributor: Joern Rennecke <joern.rennecke@embecosm.com> |
3 | on behalf of Synopsys Inc. | |
4 | ||
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it under | |
8 | the terms of the GNU General Public License as published by the Free | |
9 | Software Foundation; either version 3, or (at your option) any later | |
10 | version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
13 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 | for more details. | |
16 | ||
17 | Under Section 7 of GPL version 3, you are granted additional | |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
25 | ||
26 | /* | |
27 | to calculate a := b/x as b*y, with y := 1/x: | |
28 | - x is in the range [1..2) | |
29 | - calculate 15..18 bit inverse y0 using a table of approximating polynoms. | |
30 | Precision is higher for polynoms used to evaluate input with larger | |
31 | value. | |
32 | - Do one newton-raphson iteration step to double the precision, | |
33 | then multiply this with the divisor | |
34 | -> more time to decide if dividend is subnormal | |
35 | - the worst error propagation is on the side of the value range | |
36 | with the least initial defect, thus giving us about 30 bits precision. | |
37 | The truncation error for the either is less than 1 + x/2 ulp. | |
38 | A 31 bit inverse can be simply calculated by using x with implicit 1 | |
39 | and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 | |
40 | with the bare fraction part of x, then add in y0^2 for the implicit | |
41 | 1 of x. | |
42 | - If calculating a 31 bit inverse, the systematic error is less than | |
43 | -1 ulp; likewise, for 32 bit, it is less than -2 ulp. | |
44 | - If we calculate our seed with a 32 bit fraction, we can archive a | |
45 | tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we | |
46 | only need to take the step to calculate the 2nd stage rest and | |
47 | rounding adjust 1/32th of the time. However, if we use a 20 bit | |
48 | fraction for the seed, the negative error can exceed -2 ulp/128, (2) | |
49 | thus for a simple add / tst check, we need to do the 2nd stage | |
50 | rest calculation/ rounding adjust 1/16th of the time. | |
51 | (1): The inexactness of the 32 bit inverse contributes an error in the | |
52 | range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the | |
53 | rest contributes an error < +1/x ulp/128 . In the interval [1,2), | |
54 | x/2 + 1/x <= 1.5 . | |
55 | (2): Unless proven otherwise. I have not actually looked for an | |
56 | example where -2 ulp/128 is exceeded, and my calculations indicate | |
57 | that the excess, if existent, is less than -1/512 ulp. | |
58 | ??? The algorithm is still based on the ARC700 optimized code. | |
59 | Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply | |
60 | results. | |
61 | */ | |
62 | #include "../arc-ieee-754.h" | |
63 | #define mlo acc2 | |
64 | #define mhi acc1 | |
65 | #define mul64(b,c) mullw 0,b,c` machlw 0,b,c | |
66 | #define mulu64(b,c) mululw 0,b,c` machulw 0,b,c | |
67 | ||
68 | /* N.B. fp-bit.c does double rounding on denormal numbers. */ | |
69 | #if 0 /* DEBUG */ | |
70 | .global __divdf3 | |
71 | FUNC(__divdf3) | |
72 | .balign 4 | |
73 | __divdf3: | |
74 | push_s blink | |
75 | push_s r2 | |
76 | push_s r3 | |
77 | push_s r0 | |
78 | bl.d __divdf3_c | |
79 | push_s r1 | |
80 | ld_s r2,[sp,12] | |
81 | ld_s r3,[sp,8] | |
82 | st_s r0,[sp,12] | |
83 | st_s r1,[sp,8] | |
84 | pop_s r1 | |
85 | bl.d __divdf3_asm | |
86 | pop_s r0 | |
87 | pop_s r3 | |
88 | pop_s r2 | |
89 | pop_s blink | |
90 | cmp r0,r2 | |
91 | cmp.eq r1,r3 | |
92 | jeq_s [blink] | |
93 | and r12,DBL0H,DBL1H | |
94 | bic.f 0,0x7ff80000,r12 ; both NaN -> OK | |
95 | jeq_s [blink] | |
96 | bl abort | |
97 | ENDFUNC(__divdf3) | |
98 | #define __divdf3 __divdf3_asm | |
99 | #endif /* DEBUG */ | |
100 | ||
101 | FUNC(__divdf3) | |
102 | .balign 4 | |
103 | .L7ff00000: | |
104 | .long 0x7ff00000 | |
105 | .Ldivtab: | |
106 | .long 0xfc0fffe1 | |
107 | .long 0xf46ffdfb | |
108 | .long 0xed1ffa54 | |
109 | .long 0xe61ff515 | |
110 | .long 0xdf7fee75 | |
111 | .long 0xd91fe680 | |
112 | .long 0xd2ffdd52 | |
113 | .long 0xcd1fd30c | |
114 | .long 0xc77fc7cd | |
115 | .long 0xc21fbbb6 | |
116 | .long 0xbcefaec0 | |
117 | .long 0xb7efa100 | |
118 | .long 0xb32f92bf | |
119 | .long 0xae8f83b7 | |
120 | .long 0xaa2f7467 | |
121 | .long 0xa5ef6479 | |
122 | .long 0xa1cf53fa | |
123 | .long 0x9ddf433e | |
124 | .long 0x9a0f3216 | |
125 | .long 0x965f2091 | |
126 | .long 0x92df0f11 | |
127 | .long 0x8f6efd05 | |
128 | .long 0x8c1eeacc | |
129 | .long 0x88eed876 | |
130 | .long 0x85dec615 | |
131 | .long 0x82eeb3b9 | |
132 | .long 0x800ea10b | |
133 | .long 0x7d3e8e0f | |
134 | .long 0x7a8e7b3f | |
135 | .long 0x77ee6836 | |
136 | .long 0x756e5576 | |
137 | .long 0x72fe4293 | |
138 | .long 0x709e2f93 | |
139 | .long 0x6e4e1c7f | |
140 | .long 0x6c0e095e | |
141 | .long 0x69edf6c5 | |
142 | .long 0x67cde3a5 | |
143 | .long 0x65cdd125 | |
144 | .long 0x63cdbe25 | |
145 | .long 0x61ddab3f | |
146 | .long 0x600d991f | |
147 | .long 0x5e3d868c | |
148 | .long 0x5c6d7384 | |
149 | .long 0x5abd615f | |
150 | .long 0x590d4ecd | |
151 | .long 0x576d3c83 | |
152 | .long 0x55dd2a89 | |
153 | .long 0x545d18e9 | |
154 | .long 0x52dd06e9 | |
155 | .long 0x516cf54e | |
156 | .long 0x4ffce356 | |
157 | .long 0x4e9cd1ce | |
158 | .long 0x4d3cbfec | |
159 | .long 0x4becae86 | |
160 | .long 0x4aac9da4 | |
161 | .long 0x496c8c73 | |
162 | .long 0x483c7bd3 | |
163 | .long 0x470c6ae8 | |
164 | .long 0x45dc59af | |
165 | .long 0x44bc4915 | |
166 | .long 0x43ac3924 | |
167 | .long 0x428c27fb | |
168 | .long 0x418c187a | |
169 | .long 0x407c07bd | |
170 | ||
171 | __divdf3_support: /* This label makes debugger output saner. */ | |
172 | .balign 4 | |
173 | .Ldenorm_dbl1: | |
174 | brge r6, \ | |
175 | 0x43500000,.Linf_NaN ; large number / denorm -> Inf | |
176 | bmsk.f r12,DBL1H,19 | |
177 | mov.eq r12,DBL1L | |
178 | mov.eq DBL1L,0 | |
179 | sub.eq r7,r7,32 | |
180 | norm.f r11,r12 ; flag for x/0 -> Inf check | |
181 | beq_s .Linf_NaN | |
182 | mov.mi r11,0 | |
183 | add.pl r11,r11,1 | |
184 | add_s r12,r12,r12 | |
185 | asl r8,r12,r11 | |
186 | rsub r12,r11,31 | |
187 | lsr r12,DBL1L,r12 | |
188 | tst_s DBL1H,DBL1H | |
189 | or r8,r8,r12 | |
190 | lsr r4,r8,26 | |
191 | lsr DBL1H,r8,12 | |
192 | ld.as r4,[r10,r4] | |
193 | bxor.mi DBL1H,DBL1H,31 | |
194 | sub r11,r11,11 | |
195 | asl DBL1L,DBL1L,r11 | |
196 | sub r11,r11,1 | |
197 | mulu64 (r4,r8) | |
198 | sub r7,r7,r11 | |
199 | b.d .Lpast_denorm_dbl1 | |
200 | asl r7,r7,20 | |
201 | ||
202 | .Linf_NaN: | |
203 | tst_s DBL0L,DBL0L ; 0/0 -> NaN | |
204 | xor_s DBL1H,DBL1H,DBL0H | |
205 | bclr.eq.f DBL0H,DBL0H,31 | |
206 | bmsk DBL0H,DBL1H,30 | |
207 | xor_s DBL0H,DBL0H,DBL1H | |
208 | sub.eq DBL0H,DBL0H,1 | |
209 | mov_s DBL0L,0 | |
210 | j_s.d [blink] | |
211 | or DBL0H,DBL0H,r9 | |
212 | .balign 4 | |
213 | .Lret0_2: | |
214 | xor_s DBL1H,DBL1H,DBL0H | |
215 | mov_s DBL0L,0 | |
216 | bmsk DBL0H,DBL1H,30 | |
217 | j_s.d [blink] | |
218 | xor_s DBL0H,DBL0H,DBL1H | |
219 | .balign 4 | |
220 | .global __divdf3 | |
221 | /* N.B. the spacing between divtab and the sub3 to get its address must | |
222 | be a multiple of 8. */ | |
223 | __divdf3: | |
224 | asl r8,DBL1H,12 | |
225 | lsr r4,r8,26 | |
226 | sub3 r10,pcl,51;(.-.Ldivtab) >> 3 | |
227 | ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 | |
228 | ld.as r4,[r10,r4] | |
229 | lsr r12,DBL1L,20 | |
230 | and.f r7,DBL1H,r9 | |
231 | or r8,r8,r12 | |
232 | mulu64 (r4,r8) | |
233 | beq.d .Ldenorm_dbl1 | |
234 | .Lpast_denorm_dbl1: | |
235 | and.f r6,DBL0H,r9 | |
236 | breq.d r7,r9,.Linf_nan_dbl1 | |
237 | asl r4,r4,12 | |
238 | sub r4,r4,mhi | |
239 | mululw 0,r4,r4 | |
240 | machulw r5,r4,r4 | |
241 | bne.d .Lnormal_dbl0 | |
242 | lsr r8,r8,1 | |
243 | ||
244 | .balign 4 | |
245 | .Ldenorm_dbl0: | |
246 | bmsk.f r12,DBL0H,19 | |
247 | ; wb stall | |
248 | mov.eq r12,DBL0L | |
249 | sub.eq r6,r6,32 | |
250 | norm.f r11,r12 ; flag for 0/x -> 0 check | |
251 | brge r7, \ | |
252 | 0x43500000, .Lret0_2 ; denorm/large number -> 0 | |
253 | beq_s .Lret0_2 | |
254 | mov.mi r11,0 | |
255 | add.pl r11,r11,1 | |
256 | asl r12,r12,r11 | |
257 | sub r6,r6,r11 | |
258 | add.f 0,r6,31 | |
259 | lsr r10,DBL0L,r6 | |
260 | mov.mi r10,0 | |
261 | add r6,r6,11+32 | |
262 | neg.f r11,r6 | |
263 | asl DBL0L,DBL0L,r11 | |
264 | mov.pl DBL0L,0 | |
265 | sub r6,r6,32-1 | |
266 | b.d .Lpast_denorm_dbl0 | |
267 | asl r6,r6,20 | |
268 | ||
269 | .balign 4 | |
270 | .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN | |
271 | or.f 0,r6,DBL0L | |
272 | cmp.ne r6,r9 | |
273 | not_s DBL0L,DBL1H | |
274 | sub_s.ne DBL0L,DBL0L,DBL0L | |
275 | tst_s DBL0H,DBL0H | |
276 | add_s DBL0H,DBL1H,DBL0L | |
277 | j_s.d [blink] | |
278 | bxor.mi DBL0H,DBL0H,31 | |
279 | ||
280 | .balign 4 | |
281 | .Lnormal_dbl0: | |
282 | breq.d r6,r9,.Linf_nan_dbl0 | |
283 | asl r12,DBL0H,11 | |
284 | lsr r10,DBL0L,21 | |
285 | .Lpast_denorm_dbl0: | |
286 | bset r8,r8,31 | |
287 | mulu64 (r5,r8) | |
288 | add_s r12,r12,r10 | |
289 | bset r5,r12,31 | |
290 | cmp r5,r8 | |
291 | cmp.eq DBL0L,DBL1L | |
292 | lsr.cc r5,r5,1 | |
293 | sub r4,r4,mhi ; u1.31 inverse, about 30 bit | |
294 | mululw 0,r5,r4 | |
295 | machulw r11,r5,r4 ; result fraction highpart | |
296 | lsr r8,r8,2 ; u3.29 | |
297 | add r5,r6, /* wait for immediate */ \ | |
298 | 0x3fe00000 | |
299 | mulu64 (r11,r8) ; u-28.31 | |
300 | asl_s DBL1L,DBL1L,9 ; u-29.23:9 | |
301 | sbc r6,r5,r7 | |
302 | mov r12,mlo ; u-28.31 | |
303 | mulu64 (r11,DBL1L) ; mhi: u-28.23:9 | |
304 | add.cs DBL0L,DBL0L,DBL0L | |
305 | asl_s DBL0L,DBL0L,6 ; u-26.25:7 | |
306 | asl r10,r11,23 | |
307 | sub_l DBL0L,DBL0L,r12 | |
308 | lsr r7,r11,9 | |
309 | sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 | |
310 | mul64 (r5,r4) ; mhi: result fraction lowpart | |
311 | xor.f 0,DBL0H,DBL1H | |
312 | and DBL0H,r6,r9 | |
313 | add_s DBL0H,DBL0H,r7 | |
314 | bclr r12,r9,20 ; 0x7fe00000 | |
315 | brhs.d r6,r12,.Linf_denorm | |
316 | bxor.mi DBL0H,DBL0H,31 | |
317 | add.f r12,mhi,0x11 | |
318 | asr r9,r12,5 | |
319 | sub.mi DBL0H,DBL0H,1 | |
320 | add.f DBL0L,r9,r10 | |
321 | tst r12,0x1c | |
322 | jne.d [blink] | |
323 | add.cs DBL0H,DBL0H,1 | |
324 | /* work out exact rounding if we fall through here. */ | |
325 | /* We know that the exact result cannot be represented in double | |
326 | precision. Find the mid-point between the two nearest | |
327 | representable values, multiply with the divisor, and check if | |
328 | the result is larger than the dividend. Since we want to know | |
329 | only the sign bit, it is sufficient to calculate only the | |
330 | highpart of the lower 64 bits. */ | |
331 | mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo | |
332 | sub.f DBL0L,DBL0L,1 | |
333 | asl r12,r9,2 ; u-22.30:2 | |
334 | sub.cs DBL0H,DBL0H,1 | |
335 | sub.f r12,r12,2 | |
336 | mov r10,mlo ; rest before considering r12 in r5 : -r10 | |
337 | mululw 0,r12,DBL1L | |
338 | machulw r7,r12,DBL1L ; mhi: u-51.32 | |
339 | asl r5,r5,25 ; s-51.7:25 | |
340 | lsr r10,r10,7 ; u-51.30:2 | |
341 | mulu64 (r12,r8) ; mlo: u-51.31:1 | |
342 | sub r5,r5,r10 | |
343 | add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L | |
344 | bset r7,r7,0 ; make sure that the result is not zero, and that | |
345 | sub r5,r5,r7 ; a highpart zero appears negative | |
346 | sub.f r5,r5,mlo ; rest msw | |
347 | add.pl.f DBL0L,DBL0L,1 | |
348 | j_s.d [blink] | |
349 | add.eq DBL0H,DBL0H,1 | |
350 | ||
351 | .Linf_nan_dbl0: | |
352 | tst_s DBL1H,DBL1H | |
353 | j_s.d [blink] | |
354 | bxor.mi DBL0H,DBL0H,31 | |
355 | .balign 4 | |
356 | .Linf_denorm: | |
357 | lsr r12,r6,28 | |
358 | brlo.d r12,0xc,.Linf | |
359 | .Ldenorm: | |
360 | asr r6,r6,20 | |
361 | neg r9,r6 | |
362 | mov_s DBL0H,0 | |
363 | brhs.d r9,54,.Lret0 | |
364 | bxor.mi DBL0H,DBL0H,31 | |
365 | add r12,mhi,1 | |
366 | and r12,r12,-4 | |
367 | rsub r7,r6,5 | |
368 | asr r10,r12,28 | |
369 | bmsk r4,r12,27 | |
370 | min r7,r7,31 | |
371 | asr DBL0L,r4,r7 | |
372 | add DBL1H,r11,r10 | |
373 | abs.f r10,r4 | |
374 | sub.mi r10,r10,1 | |
375 | add.f r7,r6,32-5 | |
376 | asl r4,r4,r7 | |
377 | mov.mi r4,r10 | |
378 | add.f r10,r6,23 | |
379 | rsub r7,r6,9 | |
380 | lsr r7,DBL1H,r7 | |
381 | asl r10,DBL1H,r10 | |
382 | or.pnz DBL0H,DBL0H,r7 | |
383 | or.mi r4,r4,r10 | |
384 | mov.mi r10,r7 | |
385 | add.f DBL0L,r10,DBL0L | |
386 | add.cs.f DBL0H,DBL0H,1 ; carry clear after this point | |
387 | bxor.f 0,r4,31 | |
388 | add.pnz.f DBL0L,DBL0L,1 | |
389 | add.cs.f DBL0H,DBL0H,1 | |
390 | jne_s [blink] | |
391 | /* Calculation so far was not conclusive; calculate further rest. */ | |
392 | mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo | |
393 | asr.f r12,r12,3 | |
394 | asl r5,r5,25 ; s-51.7:25 | |
395 | mov r11,mlo ; rest before considering r12 in r5 : -r11 | |
396 | mulu64 (r12,r8) ; u-51.31:1 | |
397 | and r9,DBL0L,1 ; tie-breaker: round to even | |
398 | lsr r11,r11,7 ; u-51.30:2 | |
399 | mov DBL1H,mlo ; u-51.31:1 | |
400 | mulu64 (r12,DBL1L) ; u-51.62:2 | |
401 | sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L | |
402 | add_s DBL1H,DBL1H,r11 | |
403 | sub DBL1H,DBL1H,r5 ; -rest msw | |
404 | add_s DBL1H,DBL1H,mhi ; -rest msw | |
405 | add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( | |
406 | tst_s DBL1H,DBL1H | |
407 | cmp.eq mlo,r9 | |
408 | add.cs.f DBL0L,DBL0L,1 | |
409 | j_s.d [blink] | |
410 | add.cs DBL0H,DBL0H,1 | |
411 | ||
412 | .Lret0: | |
413 | /* return +- 0 */ | |
414 | j_s.d [blink] | |
415 | mov_s DBL0L,0 | |
416 | .Linf: | |
417 | mov_s DBL0H,r9 | |
418 | mov_s DBL0L,0 | |
419 | j_s.d [blink] | |
420 | bxor.mi DBL0H,DBL0H,31 | |
421 | ENDFUNC(__divdf3) |