]>
Commit | Line | Data |
---|---|---|
83ffe9cd | 1 | /* Copyright (C) 2008-2023 Free Software Foundation, Inc. |
d38a64b4 JR |
2 | Contributor: Joern Rennecke <joern.rennecke@embecosm.com> |
3 | on behalf of Synopsys Inc. | |
4 | ||
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it under | |
8 | the terms of the GNU General Public License as published by the Free | |
9 | Software Foundation; either version 3, or (at your option) any later | |
10 | version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
13 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
14 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 | for more details. | |
16 | ||
17 | Under Section 7 of GPL version 3, you are granted additional | |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
25 | ||
26 | /* | |
27 | to calculate a := b/x as b*y, with y := 1/x: | |
28 | - x is in the range [1..2) | |
29 | - calculate 15..18 bit inverse y0 using a table of approximating polynoms. | |
30 | Precision is higher for polynoms used to evaluate input with larger | |
31 | value. | |
32 | - Do one newton-raphson iteration step to double the precision, | |
33 | then multiply this with the divisor | |
34 | -> more time to decide if dividend is subnormal | |
35 | - the worst error propagation is on the side of the value range | |
36 | with the least initial defect, thus giving us about 30 bits precision. | |
37 | The truncation error for the either is less than 1 + x/2 ulp. | |
38 | A 31 bit inverse can be simply calculated by using x with implicit 1 | |
39 | and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 | |
40 | with the bare fraction part of x, then add in y0^2 for the implicit | |
41 | 1 of x. | |
42 | - If calculating a 31 bit inverse, the systematic error is less than | |
43 | -1 ulp; likewise, for 32 bit, it is less than -2 ulp. | |
44 | - If we calculate our seed with a 32 bit fraction, we can archive a | |
45 | tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we | |
46 | only need to take the step to calculate the 2nd stage rest and | |
47 | rounding adjust 1/32th of the time. However, if we use a 20 bit | |
48 | fraction for the seed, the negative error can exceed -2 ulp/128, (2) | |
49 | thus for a simple add / tst check, we need to do the 2nd stage | |
50 | rest calculation/ rounding adjust 1/16th of the time. | |
51 | (1): The inexactness of the 32 bit inverse contributes an error in the | |
52 | range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the | |
53 | rest contributes an error < +1/x ulp/128 . In the interval [1,2), | |
54 | x/2 + 1/x <= 1.5 . | |
55 | (2): Unless proven otherwise. I have not actually looked for an | |
56 | example where -2 ulp/128 is exceeded, and my calculations indicate | |
57 | that the excess, if existent, is less than -1/512 ulp. | |
58 | ??? The algorithm is still based on the ARC700 optimized code. | |
59 | Maybe we could make better use of 64 bit multiply results and/or mmed . | |
60 | */ | |
61 | #include "../arc-ieee-754.h" | |
62 | ||
63 | /* N.B. fp-bit.c does double rounding on denormal numbers. */ | |
64 | #if 0 /* DEBUG */ | |
65 | .global __divdf3 | |
66 | FUNC(__divdf3) | |
67 | .balign 4 | |
68 | __divdf3: | |
69 | push_s blink | |
70 | push_s r2 | |
71 | push_s r3 | |
72 | push_s r0 | |
73 | bl.d __divdf3_c | |
74 | push_s r1 | |
75 | ld_s r2,[sp,12] | |
76 | ld_s r3,[sp,8] | |
77 | st_s r0,[sp,12] | |
78 | st_s r1,[sp,8] | |
79 | pop_s r1 | |
80 | bl.d __divdf3_asm | |
81 | pop_s r0 | |
82 | pop_s r3 | |
83 | pop_s r2 | |
84 | pop_s blink | |
85 | cmp r0,r2 | |
86 | cmp.eq r1,r3 | |
87 | jeq_s [blink] | |
88 | and r12,DBL0H,DBL1H | |
89 | bic.f 0,0x7ff80000,r12 ; both NaN -> OK | |
90 | jeq_s [blink] | |
91 | bl abort | |
92 | ENDFUNC(__divdf3) | |
93 | #define __divdf3 __divdf3_asm | |
94 | #endif /* DEBUG */ | |
95 | ||
96 | FUNC(__divdf3) | |
97 | .balign 4 | |
98 | .L7ff00000: | |
99 | .long 0x7ff00000 | |
100 | .Ldivtab: | |
101 | .long 0xfc0fffe1 | |
102 | .long 0xf46ffdfb | |
103 | .long 0xed1ffa54 | |
104 | .long 0xe61ff515 | |
105 | .long 0xdf7fee75 | |
106 | .long 0xd91fe680 | |
107 | .long 0xd2ffdd52 | |
108 | .long 0xcd1fd30c | |
109 | .long 0xc77fc7cd | |
110 | .long 0xc21fbbb6 | |
111 | .long 0xbcefaec0 | |
112 | .long 0xb7efa100 | |
113 | .long 0xb32f92bf | |
114 | .long 0xae8f83b7 | |
115 | .long 0xaa2f7467 | |
116 | .long 0xa5ef6479 | |
117 | .long 0xa1cf53fa | |
118 | .long 0x9ddf433e | |
119 | .long 0x9a0f3216 | |
120 | .long 0x965f2091 | |
121 | .long 0x92df0f11 | |
122 | .long 0x8f6efd05 | |
123 | .long 0x8c1eeacc | |
124 | .long 0x88eed876 | |
125 | .long 0x85dec615 | |
126 | .long 0x82eeb3b9 | |
127 | .long 0x800ea10b | |
128 | .long 0x7d3e8e0f | |
129 | .long 0x7a8e7b3f | |
130 | .long 0x77ee6836 | |
131 | .long 0x756e5576 | |
132 | .long 0x72fe4293 | |
133 | .long 0x709e2f93 | |
134 | .long 0x6e4e1c7f | |
135 | .long 0x6c0e095e | |
136 | .long 0x69edf6c5 | |
137 | .long 0x67cde3a5 | |
138 | .long 0x65cdd125 | |
139 | .long 0x63cdbe25 | |
140 | .long 0x61ddab3f | |
141 | .long 0x600d991f | |
142 | .long 0x5e3d868c | |
143 | .long 0x5c6d7384 | |
144 | .long 0x5abd615f | |
145 | .long 0x590d4ecd | |
146 | .long 0x576d3c83 | |
147 | .long 0x55dd2a89 | |
148 | .long 0x545d18e9 | |
149 | .long 0x52dd06e9 | |
150 | .long 0x516cf54e | |
151 | .long 0x4ffce356 | |
152 | .long 0x4e9cd1ce | |
153 | .long 0x4d3cbfec | |
154 | .long 0x4becae86 | |
155 | .long 0x4aac9da4 | |
156 | .long 0x496c8c73 | |
157 | .long 0x483c7bd3 | |
158 | .long 0x470c6ae8 | |
159 | .long 0x45dc59af | |
160 | .long 0x44bc4915 | |
161 | .long 0x43ac3924 | |
162 | .long 0x428c27fb | |
163 | .long 0x418c187a | |
164 | .long 0x407c07bd | |
165 | ||
166 | __divdf3_support: /* This label makes debugger output saner. */ | |
167 | .balign 4 | |
168 | .Ldenorm_dbl1: | |
169 | brge r6, \ | |
170 | 0x43500000,.Linf_NaN ; large number / denorm -> Inf | |
171 | bmsk.f r12,DBL1H,19 | |
172 | mov.eq r12,DBL1L | |
173 | mov.eq DBL1L,0 | |
174 | sub.eq r7,r7,32 | |
175 | norm.f r11,r12 ; flag for x/0 -> Inf check | |
176 | beq_s .Linf_NaN | |
177 | mov.mi r11,0 | |
178 | add.pl r11,r11,1 | |
179 | add_s r12,r12,r12 | |
180 | asl r8,r12,r11 | |
181 | rsub r12,r11,31 | |
182 | lsr r12,DBL1L,r12 | |
183 | tst_s DBL1H,DBL1H | |
184 | or r8,r8,r12 | |
185 | lsr r4,r8,26 | |
186 | lsr DBL1H,r8,12 | |
187 | ld.as r4,[r10,r4] | |
188 | bxor.mi DBL1H,DBL1H,31 | |
189 | sub r11,r11,11 | |
190 | asl DBL1L,DBL1L,r11 | |
191 | sub r11,r11,1 | |
192 | mulu64 r4,r8 | |
193 | sub r7,r7,r11 | |
194 | b.d .Lpast_denorm_dbl1 | |
195 | asl r7,r7,20 | |
196 | ||
197 | .balign 4 | |
198 | .Ldenorm_dbl0: | |
199 | bmsk.f r12,DBL0H,19 | |
200 | ; wb stall | |
201 | mov.eq r12,DBL0L | |
202 | sub.eq r6,r6,32 | |
203 | norm.f r11,r12 ; flag for 0/x -> 0 check | |
204 | brge r7, \ | |
205 | 0x43500000, .Lret0_2 ; denorm/large number -> 0 | |
206 | beq_s .Lret0_2 | |
207 | mov.mi r11,0 | |
208 | add.pl r11,r11,1 | |
209 | asl r12,r12,r11 | |
210 | sub r6,r6,r11 | |
211 | add.f 0,r6,31 | |
212 | lsr r10,DBL0L,r6 | |
213 | mov.mi r10,0 | |
214 | add r6,r6,11+32 | |
215 | neg.f r11,r6 | |
216 | asl DBL0L,DBL0L,r11 | |
217 | mov.pl DBL0L,0 | |
218 | sub r6,r6,32-1 | |
219 | b.d .Lpast_denorm_dbl0 | |
220 | asl r6,r6,20 | |
221 | ||
222 | .Linf_NaN: | |
223 | tst_s DBL0L,DBL0L ; 0/0 -> NaN | |
224 | xor_s DBL1H,DBL1H,DBL0H | |
225 | bclr.eq.f DBL0H,DBL0H,31 | |
226 | bmsk DBL0H,DBL1H,30 | |
227 | xor_s DBL0H,DBL0H,DBL1H | |
228 | sub.eq DBL0H,DBL0H,1 | |
229 | mov_s DBL0L,0 | |
230 | j_s.d [blink] | |
231 | or DBL0H,DBL0H,r9 | |
232 | .balign 4 | |
233 | .Lret0_2: | |
234 | xor_s DBL1H,DBL1H,DBL0H | |
235 | mov_s DBL0L,0 | |
236 | bmsk DBL0H,DBL1H,30 | |
237 | j_s.d [blink] | |
238 | xor_s DBL0H,DBL0H,DBL1H | |
239 | .balign 4 | |
240 | .global __divdf3 | |
241 | /* N.B. the spacing between divtab and the sub3 to get its address must | |
242 | be a multiple of 8. */ | |
243 | __divdf3: | |
244 | asl r8,DBL1H,12 | |
245 | lsr r4,r8,26 | |
246 | sub3 r10,pcl,61; (.-.Ldivtab) >> 3 | |
247 | ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 | |
248 | ld.as r4,[r10,r4] | |
249 | lsr r12,DBL1L,20 | |
250 | and.f r7,DBL1H,r9 | |
251 | or r8,r8,r12 | |
252 | mulu64 r4,r8 | |
253 | beq.d .Ldenorm_dbl1 | |
254 | .Lpast_denorm_dbl1: | |
255 | and.f r6,DBL0H,r9 | |
256 | breq.d r7,r9,.Linf_nan_dbl1 | |
257 | asl r4,r4,12 | |
258 | sub r4,r4,mhi | |
259 | mulu64 r4,r4 | |
260 | beq.d .Ldenorm_dbl0 | |
261 | lsr r8,r8,1 | |
262 | breq.d r6,r9,.Linf_nan_dbl0 | |
263 | asl r12,DBL0H,11 | |
264 | lsr r10,DBL0L,21 | |
265 | .Lpast_denorm_dbl0: | |
266 | bset r8,r8,31 | |
267 | mulu64 mhi,r8 | |
268 | add_s r12,r12,r10 | |
269 | bset r5,r12,31 | |
270 | cmp r5,r8 | |
271 | cmp.eq DBL0L,DBL1L | |
272 | lsr.cc r5,r5,1 | |
273 | sub r4,r4,mhi ; u1.31 inverse, about 30 bit | |
274 | mulu64 r5,r4 ; result fraction highpart | |
275 | lsr r8,r8,2 ; u3.29 | |
276 | add r5,r6, /* wait for immediate */ \ | |
277 | 0x3fe00000 | |
278 | mov r11,mhi ; result fraction highpart | |
279 | mulu64 r11,r8 ; u-28.31 | |
280 | asl_s DBL1L,DBL1L,9 ; u-29.23:9 | |
281 | sbc r6,r5,r7 | |
282 | mov r12,mlo ; u-28.31 | |
283 | mulu64 r11,DBL1L ; mhi: u-28.23:9 | |
284 | add.cs DBL0L,DBL0L,DBL0L | |
285 | asl_s DBL0L,DBL0L,6 ; u-26.25:7 | |
286 | asl r10,r11,23 | |
287 | sub_l DBL0L,DBL0L,r12 | |
288 | lsr r7,r11,9 | |
289 | sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 | |
290 | mul64 r5,r4 ; mhi: result fraction lowpart | |
291 | xor.f 0,DBL0H,DBL1H | |
292 | and DBL0H,r6,r9 | |
293 | add_s DBL0H,DBL0H,r7 | |
294 | bclr r12,r9,20 ; 0x7fe00000 | |
295 | brhs.d r6,r12,.Linf_denorm | |
296 | bxor.mi DBL0H,DBL0H,31 | |
297 | add.f r12,mhi,0x11 | |
298 | asr r9,r12,5 | |
299 | sub.mi DBL0H,DBL0H,1 | |
300 | add.f DBL0L,r9,r10 | |
301 | tst r12,0x1c | |
302 | jne.d [blink] | |
303 | add.cs DBL0H,DBL0H,1 | |
304 | /* work out exact rounding if we fall through here. */ | |
305 | /* We know that the exact result cannot be represented in double | |
306 | precision. Find the mid-point between the two nearest | |
307 | representable values, multiply with the divisor, and check if | |
308 | the result is larger than the dividend. Since we want to know | |
309 | only the sign bit, it is sufficient to calculate only the | |
310 | highpart of the lower 64 bits. */ | |
311 | mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo | |
312 | sub.f DBL0L,DBL0L,1 | |
313 | asl r12,r9,2 ; u-22.30:2 | |
314 | sub.cs DBL0H,DBL0H,1 | |
315 | sub.f r12,r12,2 | |
316 | mov r10,mlo ; rest before considering r12 in r5 : -r10 | |
317 | mulu64 r12,DBL1L ; mhi: u-51.32 | |
318 | asl r5,r5,25 ; s-51.7:25 | |
319 | lsr r10,r10,7 ; u-51.30:2 | |
320 | mov r7,mhi ; u-51.32 | |
321 | mulu64 r12,r8 ; mlo: u-51.31:1 | |
322 | sub r5,r5,r10 | |
323 | add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L | |
324 | bset r7,r7,0 ; make sure that the result is not zero, and that | |
325 | sub r5,r5,r7 ; a highpart zero appears negative | |
326 | sub.f r5,r5,mlo ; rest msw | |
327 | add.pl.f DBL0L,DBL0L,1 | |
328 | j_s.d [blink] | |
329 | add.eq DBL0H,DBL0H,1 | |
330 | ||
331 | .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN | |
332 | or.f 0,r6,DBL0L | |
333 | cmp.ne r6,r9 | |
334 | not_s DBL0L,DBL1H | |
335 | sub_s.ne DBL0L,DBL0L,DBL0L | |
336 | tst_s DBL0H,DBL0H | |
337 | add_s DBL0H,DBL1H,DBL0L | |
338 | j_s.d [blink] | |
339 | bxor.mi DBL0H,DBL0H,31 | |
340 | .Linf_nan_dbl0: | |
341 | tst_s DBL1H,DBL1H | |
342 | j_s.d [blink] | |
343 | bxor.mi DBL0H,DBL0H,31 | |
344 | .balign 4 | |
345 | .Linf_denorm: | |
346 | lsr r12,r6,28 | |
347 | brlo.d r12,0xc,.Linf | |
348 | .Ldenorm: | |
349 | asr r6,r6,20 | |
350 | neg r9,r6 | |
351 | mov_s DBL0H,0 | |
352 | brhs.d r9,54,.Lret0 | |
353 | bxor.mi DBL0H,DBL0H,31 | |
354 | add r12,mhi,1 | |
355 | and r12,r12,-4 | |
356 | rsub r7,r6,5 | |
357 | asr r10,r12,28 | |
358 | bmsk r4,r12,27 | |
359 | min r7,r7,31 | |
360 | asr DBL0L,r4,r7 | |
361 | add DBL1H,r11,r10 | |
362 | abs.f r10,r4 | |
363 | sub.mi r10,r10,1 | |
364 | add.f r7,r6,32-5 | |
365 | asl r4,r4,r7 | |
366 | mov.mi r4,r10 | |
367 | add.f r10,r6,23 | |
368 | rsub r7,r6,9 | |
369 | lsr r7,DBL1H,r7 | |
370 | asl r10,DBL1H,r10 | |
371 | or.pnz DBL0H,DBL0H,r7 | |
372 | or.mi r4,r4,r10 | |
373 | mov.mi r10,r7 | |
374 | add.f DBL0L,r10,DBL0L | |
375 | add.cs.f DBL0H,DBL0H,1 ; carry clear after this point | |
376 | bxor.f 0,r4,31 | |
377 | add.pnz.f DBL0L,DBL0L,1 | |
378 | add.cs.f DBL0H,DBL0H,1 | |
379 | jne_s [blink] | |
380 | /* Calculation so far was not conclusive; calculate further rest. */ | |
381 | mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo | |
382 | asr.f r12,r12,3 | |
383 | asl r5,r5,25 ; s-51.7:25 | |
384 | mov r11,mlo ; rest before considering r12 in r5 : -r11 | |
385 | mulu64 r12,r8 ; u-51.31:1 | |
386 | and r9,DBL0L,1 ; tie-breaker: round to even | |
387 | lsr r11,r11,7 ; u-51.30:2 | |
388 | mov DBL1H,mlo ; u-51.31:1 | |
389 | mulu64 r12,DBL1L ; u-51.62:2 | |
390 | sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L | |
391 | add_s DBL1H,DBL1H,r11 | |
392 | sub DBL1H,DBL1H,r5 ; -rest msw | |
393 | add_s DBL1H,DBL1H,mhi ; -rest msw | |
394 | add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( | |
395 | tst_s DBL1H,DBL1H | |
396 | cmp.eq mlo,r9 | |
397 | add.cs.f DBL0L,DBL0L,1 | |
398 | j_s.d [blink] | |
399 | add.cs DBL0H,DBL0H,1 | |
400 | ||
401 | .Lret0: | |
402 | /* return +- 0 */ | |
403 | j_s.d [blink] | |
404 | mov_s DBL0L,0 | |
405 | .Linf: | |
406 | mov_s DBL0H,r9 | |
407 | mov_s DBL0L,0 | |
408 | j_s.d [blink] | |
409 | bxor.mi DBL0H,DBL0H,31 | |
410 | ENDFUNC(__divdf3) |