]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgcc/config/arc/ieee-754/arc600-dsp/divdf3.S
Update copyright years.
[thirdparty/gcc.git] / libgcc / config / arc / ieee-754 / arc600-dsp / divdf3.S
1 /* Copyright (C) 2008-2019 Free Software Foundation, Inc.
2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com>
3 on behalf of Synopsys Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
16
17 Under Section 7 of GPL version 3, you are granted additional
18 permissions described in the GCC Runtime Library Exception, version
19 3.1, as published by the Free Software Foundation.
20
21 You should have received a copy of the GNU General Public License and
22 a copy of the GCC Runtime Library Exception along with this program;
23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 <http://www.gnu.org/licenses/>. */
25
26 /*
27 to calculate a := b/x as b*y, with y := 1/x:
28 - x is in the range [1..2)
29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms.
30 Precision is higher for polynoms used to evaluate input with larger
31 value.
32 - Do one newton-raphson iteration step to double the precision,
33 then multiply this with the divisor
34 -> more time to decide if dividend is subnormal
35 - the worst error propagation is on the side of the value range
36 with the least initial defect, thus giving us about 30 bits precision.
37 The truncation error for the either is less than 1 + x/2 ulp.
38 A 31 bit inverse can be simply calculated by using x with implicit 1
39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2
40 with the bare fraction part of x, then add in y0^2 for the implicit
41 1 of x.
42 - If calculating a 31 bit inverse, the systematic error is less than
43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp.
44 - If we calculate our seed with a 32 bit fraction, we can archive a
45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we
46 only need to take the step to calculate the 2nd stage rest and
47 rounding adjust 1/32th of the time. However, if we use a 20 bit
48 fraction for the seed, the negative error can exceed -2 ulp/128, (2)
49 thus for a simple add / tst check, we need to do the 2nd stage
50 rest calculation/ rounding adjust 1/16th of the time.
51 (1): The inexactness of the 32 bit inverse contributes an error in the
52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the
53 rest contributes an error < +1/x ulp/128 . In the interval [1,2),
54 x/2 + 1/x <= 1.5 .
55 (2): Unless proven otherwise. I have not actually looked for an
56 example where -2 ulp/128 is exceeded, and my calculations indicate
57 that the excess, if existent, is less than -1/512 ulp.
58 ??? The algorithm is still based on the ARC700 optimized code.
59 Maybe we could make better use of 32x16 bit multiply, or 64 bit multiply
60 results.
61 */
62 #include "../arc-ieee-754.h"
63 #define mlo acc2
64 #define mhi acc1
65 #define mul64(b,c) mullw 0,b,c` machlw 0,b,c
66 #define mulu64(b,c) mululw 0,b,c` machulw 0,b,c
67
68 /* N.B. fp-bit.c does double rounding on denormal numbers. */
69 #if 0 /* DEBUG */
70 .global __divdf3
71 FUNC(__divdf3)
72 .balign 4
73 __divdf3:
74 push_s blink
75 push_s r2
76 push_s r3
77 push_s r0
78 bl.d __divdf3_c
79 push_s r1
80 ld_s r2,[sp,12]
81 ld_s r3,[sp,8]
82 st_s r0,[sp,12]
83 st_s r1,[sp,8]
84 pop_s r1
85 bl.d __divdf3_asm
86 pop_s r0
87 pop_s r3
88 pop_s r2
89 pop_s blink
90 cmp r0,r2
91 cmp.eq r1,r3
92 jeq_s [blink]
93 and r12,DBL0H,DBL1H
94 bic.f 0,0x7ff80000,r12 ; both NaN -> OK
95 jeq_s [blink]
96 bl abort
97 ENDFUNC(__divdf3)
98 #define __divdf3 __divdf3_asm
99 #endif /* DEBUG */
100
101 FUNC(__divdf3)
102 .balign 4
103 .L7ff00000:
104 .long 0x7ff00000
105 .Ldivtab:
106 .long 0xfc0fffe1
107 .long 0xf46ffdfb
108 .long 0xed1ffa54
109 .long 0xe61ff515
110 .long 0xdf7fee75
111 .long 0xd91fe680
112 .long 0xd2ffdd52
113 .long 0xcd1fd30c
114 .long 0xc77fc7cd
115 .long 0xc21fbbb6
116 .long 0xbcefaec0
117 .long 0xb7efa100
118 .long 0xb32f92bf
119 .long 0xae8f83b7
120 .long 0xaa2f7467
121 .long 0xa5ef6479
122 .long 0xa1cf53fa
123 .long 0x9ddf433e
124 .long 0x9a0f3216
125 .long 0x965f2091
126 .long 0x92df0f11
127 .long 0x8f6efd05
128 .long 0x8c1eeacc
129 .long 0x88eed876
130 .long 0x85dec615
131 .long 0x82eeb3b9
132 .long 0x800ea10b
133 .long 0x7d3e8e0f
134 .long 0x7a8e7b3f
135 .long 0x77ee6836
136 .long 0x756e5576
137 .long 0x72fe4293
138 .long 0x709e2f93
139 .long 0x6e4e1c7f
140 .long 0x6c0e095e
141 .long 0x69edf6c5
142 .long 0x67cde3a5
143 .long 0x65cdd125
144 .long 0x63cdbe25
145 .long 0x61ddab3f
146 .long 0x600d991f
147 .long 0x5e3d868c
148 .long 0x5c6d7384
149 .long 0x5abd615f
150 .long 0x590d4ecd
151 .long 0x576d3c83
152 .long 0x55dd2a89
153 .long 0x545d18e9
154 .long 0x52dd06e9
155 .long 0x516cf54e
156 .long 0x4ffce356
157 .long 0x4e9cd1ce
158 .long 0x4d3cbfec
159 .long 0x4becae86
160 .long 0x4aac9da4
161 .long 0x496c8c73
162 .long 0x483c7bd3
163 .long 0x470c6ae8
164 .long 0x45dc59af
165 .long 0x44bc4915
166 .long 0x43ac3924
167 .long 0x428c27fb
168 .long 0x418c187a
169 .long 0x407c07bd
170
171 __divdf3_support: /* This label makes debugger output saner. */
172 .balign 4
173 .Ldenorm_dbl1:
174 brge r6, \
175 0x43500000,.Linf_NaN ; large number / denorm -> Inf
176 bmsk.f r12,DBL1H,19
177 mov.eq r12,DBL1L
178 mov.eq DBL1L,0
179 sub.eq r7,r7,32
180 norm.f r11,r12 ; flag for x/0 -> Inf check
181 beq_s .Linf_NaN
182 mov.mi r11,0
183 add.pl r11,r11,1
184 add_s r12,r12,r12
185 asl r8,r12,r11
186 rsub r12,r11,31
187 lsr r12,DBL1L,r12
188 tst_s DBL1H,DBL1H
189 or r8,r8,r12
190 lsr r4,r8,26
191 lsr DBL1H,r8,12
192 ld.as r4,[r10,r4]
193 bxor.mi DBL1H,DBL1H,31
194 sub r11,r11,11
195 asl DBL1L,DBL1L,r11
196 sub r11,r11,1
197 mulu64 (r4,r8)
198 sub r7,r7,r11
199 b.d .Lpast_denorm_dbl1
200 asl r7,r7,20
201
202 .Linf_NaN:
203 tst_s DBL0L,DBL0L ; 0/0 -> NaN
204 xor_s DBL1H,DBL1H,DBL0H
205 bclr.eq.f DBL0H,DBL0H,31
206 bmsk DBL0H,DBL1H,30
207 xor_s DBL0H,DBL0H,DBL1H
208 sub.eq DBL0H,DBL0H,1
209 mov_s DBL0L,0
210 j_s.d [blink]
211 or DBL0H,DBL0H,r9
212 .balign 4
213 .Lret0_2:
214 xor_s DBL1H,DBL1H,DBL0H
215 mov_s DBL0L,0
216 bmsk DBL0H,DBL1H,30
217 j_s.d [blink]
218 xor_s DBL0H,DBL0H,DBL1H
219 .balign 4
220 .global __divdf3
221 /* N.B. the spacing between divtab and the sub3 to get its address must
222 be a multiple of 8. */
223 __divdf3:
224 asl r8,DBL1H,12
225 lsr r4,r8,26
226 sub3 r10,pcl,51;(.-.Ldivtab) >> 3
227 ld.as r9,[pcl,-104]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000
228 ld.as r4,[r10,r4]
229 lsr r12,DBL1L,20
230 and.f r7,DBL1H,r9
231 or r8,r8,r12
232 mulu64 (r4,r8)
233 beq.d .Ldenorm_dbl1
234 .Lpast_denorm_dbl1:
235 and.f r6,DBL0H,r9
236 breq.d r7,r9,.Linf_nan_dbl1
237 asl r4,r4,12
238 sub r4,r4,mhi
239 mululw 0,r4,r4
240 machulw r5,r4,r4
241 bne.d .Lnormal_dbl0
242 lsr r8,r8,1
243
244 .balign 4
245 .Ldenorm_dbl0:
246 bmsk.f r12,DBL0H,19
247 ; wb stall
248 mov.eq r12,DBL0L
249 sub.eq r6,r6,32
250 norm.f r11,r12 ; flag for 0/x -> 0 check
251 brge r7, \
252 0x43500000, .Lret0_2 ; denorm/large number -> 0
253 beq_s .Lret0_2
254 mov.mi r11,0
255 add.pl r11,r11,1
256 asl r12,r12,r11
257 sub r6,r6,r11
258 add.f 0,r6,31
259 lsr r10,DBL0L,r6
260 mov.mi r10,0
261 add r6,r6,11+32
262 neg.f r11,r6
263 asl DBL0L,DBL0L,r11
264 mov.pl DBL0L,0
265 sub r6,r6,32-1
266 b.d .Lpast_denorm_dbl0
267 asl r6,r6,20
268
269 .balign 4
270 .Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN
271 or.f 0,r6,DBL0L
272 cmp.ne r6,r9
273 not_s DBL0L,DBL1H
274 sub_s.ne DBL0L,DBL0L,DBL0L
275 tst_s DBL0H,DBL0H
276 add_s DBL0H,DBL1H,DBL0L
277 j_s.d [blink]
278 bxor.mi DBL0H,DBL0H,31
279
280 .balign 4
281 .Lnormal_dbl0:
282 breq.d r6,r9,.Linf_nan_dbl0
283 asl r12,DBL0H,11
284 lsr r10,DBL0L,21
285 .Lpast_denorm_dbl0:
286 bset r8,r8,31
287 mulu64 (r5,r8)
288 add_s r12,r12,r10
289 bset r5,r12,31
290 cmp r5,r8
291 cmp.eq DBL0L,DBL1L
292 lsr.cc r5,r5,1
293 sub r4,r4,mhi ; u1.31 inverse, about 30 bit
294 mululw 0,r5,r4
295 machulw r11,r5,r4 ; result fraction highpart
296 lsr r8,r8,2 ; u3.29
297 add r5,r6, /* wait for immediate */ \
298 0x3fe00000
299 mulu64 (r11,r8) ; u-28.31
300 asl_s DBL1L,DBL1L,9 ; u-29.23:9
301 sbc r6,r5,r7
302 mov r12,mlo ; u-28.31
303 mulu64 (r11,DBL1L) ; mhi: u-28.23:9
304 add.cs DBL0L,DBL0L,DBL0L
305 asl_s DBL0L,DBL0L,6 ; u-26.25:7
306 asl r10,r11,23
307 sub_l DBL0L,DBL0L,r12
308 lsr r7,r11,9
309 sub r5,DBL0L,mhi ; rest msw ; u-26.31:0
310 mul64 (r5,r4) ; mhi: result fraction lowpart
311 xor.f 0,DBL0H,DBL1H
312 and DBL0H,r6,r9
313 add_s DBL0H,DBL0H,r7
314 bclr r12,r9,20 ; 0x7fe00000
315 brhs.d r6,r12,.Linf_denorm
316 bxor.mi DBL0H,DBL0H,31
317 add.f r12,mhi,0x11
318 asr r9,r12,5
319 sub.mi DBL0H,DBL0H,1
320 add.f DBL0L,r9,r10
321 tst r12,0x1c
322 jne.d [blink]
323 add.cs DBL0H,DBL0H,1
324 /* work out exact rounding if we fall through here. */
325 /* We know that the exact result cannot be represented in double
326 precision. Find the mid-point between the two nearest
327 representable values, multiply with the divisor, and check if
328 the result is larger than the dividend. Since we want to know
329 only the sign bit, it is sufficient to calculate only the
330 highpart of the lower 64 bits. */
331 mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
332 sub.f DBL0L,DBL0L,1
333 asl r12,r9,2 ; u-22.30:2
334 sub.cs DBL0H,DBL0H,1
335 sub.f r12,r12,2
336 mov r10,mlo ; rest before considering r12 in r5 : -r10
337 mululw 0,r12,DBL1L
338 machulw r7,r12,DBL1L ; mhi: u-51.32
339 asl r5,r5,25 ; s-51.7:25
340 lsr r10,r10,7 ; u-51.30:2
341 mulu64 (r12,r8) ; mlo: u-51.31:1
342 sub r5,r5,r10
343 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L
344 bset r7,r7,0 ; make sure that the result is not zero, and that
345 sub r5,r5,r7 ; a highpart zero appears negative
346 sub.f r5,r5,mlo ; rest msw
347 add.pl.f DBL0L,DBL0L,1
348 j_s.d [blink]
349 add.eq DBL0H,DBL0H,1
350
351 .Linf_nan_dbl0:
352 tst_s DBL1H,DBL1H
353 j_s.d [blink]
354 bxor.mi DBL0H,DBL0H,31
355 .balign 4
356 .Linf_denorm:
357 lsr r12,r6,28
358 brlo.d r12,0xc,.Linf
359 .Ldenorm:
360 asr r6,r6,20
361 neg r9,r6
362 mov_s DBL0H,0
363 brhs.d r9,54,.Lret0
364 bxor.mi DBL0H,DBL0H,31
365 add r12,mhi,1
366 and r12,r12,-4
367 rsub r7,r6,5
368 asr r10,r12,28
369 bmsk r4,r12,27
370 min r7,r7,31
371 asr DBL0L,r4,r7
372 add DBL1H,r11,r10
373 abs.f r10,r4
374 sub.mi r10,r10,1
375 add.f r7,r6,32-5
376 asl r4,r4,r7
377 mov.mi r4,r10
378 add.f r10,r6,23
379 rsub r7,r6,9
380 lsr r7,DBL1H,r7
381 asl r10,DBL1H,r10
382 or.pnz DBL0H,DBL0H,r7
383 or.mi r4,r4,r10
384 mov.mi r10,r7
385 add.f DBL0L,r10,DBL0L
386 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point
387 bxor.f 0,r4,31
388 add.pnz.f DBL0L,DBL0L,1
389 add.cs.f DBL0H,DBL0H,1
390 jne_s [blink]
391 /* Calculation so far was not conclusive; calculate further rest. */
392 mulu64 (r11,DBL1L) ; rest before considering r12 in r5 : -mlo
393 asr.f r12,r12,3
394 asl r5,r5,25 ; s-51.7:25
395 mov r11,mlo ; rest before considering r12 in r5 : -r11
396 mulu64 (r12,r8) ; u-51.31:1
397 and r9,DBL0L,1 ; tie-breaker: round to even
398 lsr r11,r11,7 ; u-51.30:2
399 mov DBL1H,mlo ; u-51.31:1
400 mulu64 (r12,DBL1L) ; u-51.62:2
401 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L
402 add_s DBL1H,DBL1H,r11
403 sub DBL1H,DBL1H,r5 ; -rest msw
404 add_s DBL1H,DBL1H,mhi ; -rest msw
405 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-(
406 tst_s DBL1H,DBL1H
407 cmp.eq mlo,r9
408 add.cs.f DBL0L,DBL0L,1
409 j_s.d [blink]
410 add.cs DBL0H,DBL0H,1
411
412 .Lret0:
413 /* return +- 0 */
414 j_s.d [blink]
415 mov_s DBL0L,0
416 .Linf:
417 mov_s DBL0H,r9
418 mov_s DBL0L,0
419 j_s.d [blink]
420 bxor.mi DBL0H,DBL0H,31
421 ENDFUNC(__divdf3)