]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgcc/config/arc/ieee-754/arc600-mul64/divdf3.S
Update copyright years.
[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / arc600-mul64 / divdf3.S
1 /* Copyright (C) 2008-2020 Free Software Foundation, Inc.
2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3 on behalf of Synopsys Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 /*
27 to calculate a := b/x as b*y, with y := 1/x:
28 - x is in the range [1..2)
29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30 Precision is higher for polynoms used to evaluate input with larger
31 value.
32 - Do one newton-raphson iteration step to double the precision,
33 then multiply this with the divisor
34 -> more time to decide if dividend is subnormal
35 - the worst error propagation is on the side of the value range
36 with the least initial defect, thus giving us about 30 bits precision.
37 The truncation error for the either is less than 1 + x/2 ulp.
38 A 31 bit inverse can be simply calculated by using x with implicit 1
39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
40 with the bare fraction part of x, then add in y0^2 for the implicit
41 1 of x.
42 - If calculating a 31 bit inverse, the systematic error is less than
43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44 - If we calculate our seed with a 32 bit fraction, we can archive a
45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46 only need to take the step to calculate the 2nd stage rest and
47 rounding adjust 1/32th of the time. However, if we use a 20 bit
48 fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49 thus for a simple add / tst check, we need to do the 2nd stage
50 rest calculation/ rounding adjust 1/16th of the time.
51 (1): The inexactness of the 32 bit inverse contributes an error in the
52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
53 rest contributes an error < +1/x ulp/128 . In the interval [1,2),
54 x/2 + 1/x <= 1.5 .
55 (2): Unless proven otherwise. I have not actually looked for an
56 example where -2 ulp/128 is exceeded, and my calculations indicate
57 that the excess, if existent, is less than -1/512 ulp.
58 ??? The algorithm is still based on the ARC700 optimized code.
59 Maybe we could make better use of 64 bit multiply results and/or mmed .
60 */
61 #include "../arc-ieee-754.h"
62
63 /* N.B. fp-bit.c does double rounding on denormal numbers. */
64 #if 0 /* DEBUG */
65 .global __divdf3
66 FUNC(__divdf3)
67 .balign 4
68 __divdf3:
69 push_s blink
70 push_s r2
71 push_s r3
72 push_s r0
73 bl.d __divdf3_c
74 push_s r1
75 ld_s r2,[sp,12]
76 ld_s r3,[sp,8]
77 st_s r0,[sp,12]
78 st_s r1,[sp,8]
79 pop_s r1
80 bl.d __divdf3_asm
81 pop_s r0
82 pop_s r3
83 pop_s r2
84 pop_s blink
85 cmp r0,r2
86 cmp.eq r1,r3
87 jeq_s [blink]
88 and r12,DBL0H,DBL1H
89 bic.f 0,0x7ff80000,r12 ; both NaN -> OK
90 jeq_s [blink]
91 bl abort
92 ENDFUNC(__divdf3)
93 #define __divdf3 __divdf3_asm
94 #endif /* DEBUG */
95
96 FUNC(__divdf3)
97 .balign 4
98 .L7ff00000:
99 .long 0x7ff00000
100 .Ldivtab:
101 .long 0xfc0fffe1
102 .long 0xf46ffdfb
103 .long 0xed1ffa54
104 .long 0xe61ff515
105 .long 0xdf7fee75
106 .long 0xd91fe680
107 .long 0xd2ffdd52
108 .long 0xcd1fd30c
109 .long 0xc77fc7cd
110 .long 0xc21fbbb6
111 .long 0xbcefaec0
112 .long 0xb7efa100
113 .long 0xb32f92bf
114 .long 0xae8f83b7
115 .long 0xaa2f7467
116 .long 0xa5ef6479
117 .long 0xa1cf53fa
118 .long 0x9ddf433e
119 .long 0x9a0f3216
120 .long 0x965f2091
121 .long 0x92df0f11
122 .long 0x8f6efd05
123 .long 0x8c1eeacc
124 .long 0x88eed876
125 .long 0x85dec615
126 .long 0x82eeb3b9
127 .long 0x800ea10b
128 .long 0x7d3e8e0f
129 .long 0x7a8e7b3f
130 .long 0x77ee6836
131 .long 0x756e5576
132 .long 0x72fe4293
133 .long 0x709e2f93
134 .long 0x6e4e1c7f
135 .long 0x6c0e095e
136 .long 0x69edf6c5
137 .long 0x67cde3a5
138 .long 0x65cdd125
139 .long 0x63cdbe25
140 .long 0x61ddab3f
141 .long 0x600d991f
142 .long 0x5e3d868c
143 .long 0x5c6d7384
144 .long 0x5abd615f
145 .long 0x590d4ecd
146 .long 0x576d3c83
147 .long 0x55dd2a89
148 .long 0x545d18e9
149 .long 0x52dd06e9
150 .long 0x516cf54e
151 .long 0x4ffce356
152 .long 0x4e9cd1ce
153 .long 0x4d3cbfec
154 .long 0x4becae86
155 .long 0x4aac9da4
156 .long 0x496c8c73
157 .long 0x483c7bd3
158 .long 0x470c6ae8
159 .long 0x45dc59af
160 .long 0x44bc4915
161 .long 0x43ac3924
162 .long 0x428c27fb
163 .long 0x418c187a
164 .long 0x407c07bd
165
166 __divdf3_support: /* This label makes debugger output saner. */
167 .balign 4
168 .Ldenorm_dbl1:
169 brge r6, \
170 0x43500000,.Linf_NaN ; large number / denorm -> Inf
171 bmsk.f r12,DBL1H,19
172 mov.eq r12,DBL1L
173 mov.eq DBL1L,0
174 sub.eq r7,r7,32
175 norm.f r11,r12 ; flag for x/0 -> Inf check
176 beq_s .Linf_NaN
177 mov.mi r11,0
178 add.pl r11,r11,1
179 add_s r12,r12,r12
180 asl r8,r12,r11
181 rsub r12,r11,31
182 lsr r12,DBL1L,r12
183 tst_s DBL1H,DBL1H
184 or r8,r8,r12
185 lsr r4,r8,26
186 lsr DBL1H,r8,12
187 ld.as r4,[r10,r4]
188 bxor.mi DBL1H,DBL1H,31
189 sub r11,r11,11
190 asl DBL1L,DBL1L,r11
191 sub r11,r11,1
192 mulu64 r4,r8
193 sub r7,r7,r11
194 b.d .Lpast_denorm_dbl1
195 asl r7,r7,20
196
197 .balign 4
198 .Ldenorm_dbl0:
199 bmsk.f r12,DBL0H,19
200 ; wb stall
201 mov.eq r12,DBL0L
202 sub.eq r6,r6,32
203 norm.f r11,r12 ; flag for 0/x -> 0 check
204 brge r7, \
205 0x43500000, .Lret0_2 ; denorm/large number -> 0
206 beq_s .Lret0_2
207 mov.mi r11,0
208 add.pl r11,r11,1
209 asl r12,r12,r11
210 sub r6,r6,r11
211 add.f 0,r6,31
212 lsr r10,DBL0L,r6
213 mov.mi r10,0
214 add r6,r6,11+32
215 neg.f r11,r6
216 asl DBL0L,DBL0L,r11
217 mov.pl DBL0L,0
218 sub r6,r6,32-1
219 b.d .Lpast_denorm_dbl0
220 asl r6,r6,20
221
222 .Linf_NaN:
223 tst_s DBL0L,DBL0L ; 0/0 -> NaN
224 xor_s DBL1H,DBL1H,DBL0H
225 bclr.eq.f DBL0H,DBL0H,31
226 bmsk DBL0H,DBL1H,30
227 xor_s DBL0H,DBL0H,DBL1H
228 sub.eq DBL0H,DBL0H,1
229 mov_s DBL0L,0
230 j_s.d [blink]
231 or DBL0H,DBL0H,r9
232 .balign 4
233 .Lret0_2:
234 xor_s DBL1H,DBL1H,DBL0H
235 mov_s DBL0L,0
236 bmsk DBL0H,DBL1H,30
237 j_s.d [blink]
238 xor_s DBL0H,DBL0H,DBL1H
239 .balign 4
240 .global __divdf3
241 /* N.B. the spacing between divtab and the sub3 to get its address must
242 be a multiple of 8. */
243 __divdf3:
244 asl r8,DBL1H,12
245 lsr r4,r8,26
246 sub3 r10,pcl,61; (.-.Ldivtab) >> 3
247 ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
248 ld.as r4,[r10,r4]
249 lsr r12,DBL1L,20
250 and.f r7,DBL1H,r9
251 or r8,r8,r12
252 mulu64 r4,r8
253 beq.d .Ldenorm_dbl1
254 .Lpast_denorm_dbl1:
255 and.f r6,DBL0H,r9
256 breq.d r7,r9,.Linf_nan_dbl1
257 asl r4,r4,12
258 sub r4,r4,mhi
259 mulu64 r4,r4
260 beq.d .Ldenorm_dbl0
261 lsr r8,r8,1
262 breq.d r6,r9,.Linf_nan_dbl0
263 asl r12,DBL0H,11
264 lsr r10,DBL0L,21
265 .Lpast_denorm_dbl0:
266 bset r8,r8,31
267 mulu64 mhi,r8
268 add_s r12,r12,r10
269 bset r5,r12,31
270 cmp r5,r8
271 cmp.eq DBL0L,DBL1L
272 lsr.cc r5,r5,1
273 sub r4,r4,mhi ; u1.31 inverse, about 30 bit
274 mulu64 r5,r4 ; result fraction highpart
275 lsr r8,r8,2 ; u3.29
276 add r5,r6, /* wait for immediate */ \
277 0x3fe00000
278 mov r11,mhi ; result fraction highpart
279 mulu64 r11,r8 ; u-28.31
280 asl_s DBL1L,DBL1L,9 ; u-29.23:9
281 sbc r6,r5,r7
282 mov r12,mlo ; u-28.31
283 mulu64 r11,DBL1L ; mhi: u-28.23:9
284 add.cs DBL0L,DBL0L,DBL0L
285 asl_s DBL0L,DBL0L,6 ; u-26.25:7
286 asl r10,r11,23
287 sub_l DBL0L,DBL0L,r12
288 lsr r7,r11,9
289 sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
290 mul64 r5,r4 ; mhi: result fraction lowpart
291 xor.f 0,DBL0H,DBL1H
292 and DBL0H,r6,r9
293 add_s DBL0H,DBL0H,r7
294 bclr r12,r9,20 ; 0x7fe00000
295 brhs.d r6,r12,.Linf_denorm
296 bxor.mi DBL0H,DBL0H,31
297 add.f r12,mhi,0x11
298 asr r9,r12,5
299 sub.mi DBL0H,DBL0H,1
300 add.f DBL0L,r9,r10
301 tst r12,0x1c
302 jne.d [blink]
303 add.cs DBL0H,DBL0H,1
304 /* work out exact rounding if we fall through here. */
305 /* We know that the exact result cannot be represented in double
306 precision. Find the mid-point between the two nearest
307 representable values, multiply with the divisor, and check if
308 the result is larger than the dividend. Since we want to know
309 only the sign bit, it is sufficient to calculate only the
310 highpart of the lower 64 bits. */
311 mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
312 sub.f DBL0L,DBL0L,1
313 asl r12,r9,2 ; u-22.30:2
314 sub.cs DBL0H,DBL0H,1
315 sub.f r12,r12,2
316 mov r10,mlo ; rest before considering r12 in r5 : -r10
317 mulu64 r12,DBL1L ; mhi: u-51.32
318 asl r5,r5,25 ; s-51.7:25
319 lsr r10,r10,7 ; u-51.30:2
320 mov r7,mhi ; u-51.32
321 mulu64 r12,r8 ; mlo: u-51.31:1
322 sub r5,r5,r10
323 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
324 bset r7,r7,0 ; make sure that the result is not zero, and that
325 sub r5,r5,r7 ; a highpart zero appears negative
326 sub.f r5,r5,mlo ; rest msw
327 add.pl.f DBL0L,DBL0L,1
328 j_s.d [blink]
329 add.eq DBL0H,DBL0H,1
330
331 .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
332 or.f 0,r6,DBL0L
333 cmp.ne r6,r9
334 not_s DBL0L,DBL1H
335 sub_s.ne DBL0L,DBL0L,DBL0L
336 tst_s DBL0H,DBL0H
337 add_s DBL0H,DBL1H,DBL0L
338 j_s.d [blink]
339 bxor.mi DBL0H,DBL0H,31
340 .Linf_nan_dbl0:
341 tst_s DBL1H,DBL1H
342 j_s.d [blink]
343 bxor.mi DBL0H,DBL0H,31
344 .balign 4
345 .Linf_denorm:
346 lsr r12,r6,28
347 brlo.d r12,0xc,.Linf
348 .Ldenorm:
349 asr r6,r6,20
350 neg r9,r6
351 mov_s DBL0H,0
352 brhs.d r9,54,.Lret0
353 bxor.mi DBL0H,DBL0H,31
354 add r12,mhi,1
355 and r12,r12,-4
356 rsub r7,r6,5
357 asr r10,r12,28
358 bmsk r4,r12,27
359 min r7,r7,31
360 asr DBL0L,r4,r7
361 add DBL1H,r11,r10
362 abs.f r10,r4
363 sub.mi r10,r10,1
364 add.f r7,r6,32-5
365 asl r4,r4,r7
366 mov.mi r4,r10
367 add.f r10,r6,23
368 rsub r7,r6,9
369 lsr r7,DBL1H,r7
370 asl r10,DBL1H,r10
371 or.pnz DBL0H,DBL0H,r7
372 or.mi r4,r4,r10
373 mov.mi r10,r7
374 add.f DBL0L,r10,DBL0L
375 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
376 bxor.f 0,r4,31
377 add.pnz.f DBL0L,DBL0L,1
378 add.cs.f DBL0H,DBL0H,1
379 jne_s [blink]
380 /* Calculation so far was not conclusive; calculate further rest. */
381 mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo
382 asr.f r12,r12,3
383 asl r5,r5,25 ; s-51.7:25
384 mov r11,mlo ; rest before considering r12 in r5 : -r11
385 mulu64 r12,r8 ; u-51.31:1
386 and r9,DBL0L,1 ; tie-breaker: round to even
387 lsr r11,r11,7 ; u-51.30:2
388 mov DBL1H,mlo ; u-51.31:1
389 mulu64 r12,DBL1L ; u-51.62:2
390 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
391 add_s DBL1H,DBL1H,r11
392 sub DBL1H,DBL1H,r5 ; -rest msw
393 add_s DBL1H,DBL1H,mhi ; -rest msw
394 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
395 tst_s DBL1H,DBL1H
396 cmp.eq mlo,r9
397 add.cs.f DBL0L,DBL0L,1
398 j_s.d [blink]
399 add.cs DBL0H,DBL0H,1
400
401 .Lret0:
402 /* return +- 0 */
403 j_s.d [blink]
404 mov_s DBL0L,0
405 .Linf:
406 mov_s DBL0H,r9
407 mov_s DBL0L,0
408 j_s.d [blink]
409 bxor.mi DBL0H,DBL0H,31
410 ENDFUNC(__divdf3)